基础异步爬虫

代码如下:
import asyncio
import logging
import aiohttp
from motor.motor_asyncio import AsyncIOMotorClient

# Emit records at INFO and above, prefixed with timestamp and level.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')
# MongoDB configuration.
# NOTE(review): the connection string embeds a username and password in
# source; consider loading it from the environment or a secrets store.
MONGO_CONNECTION_STRING = 'mongodb://root:yilvs2021@192.168.0.158:27017'
client = AsyncIOMotorClient(MONGO_CONNECTION_STRING)
db = client.test
# Collection that get_all_page_details() inserts detail documents into.
collection = db.asyncmovies

# URL template for one listing page; `offset` is filled in by get_all_page().
INDEX_URL = 'https://spa5.scrape.center/api/book/?limit=18&offset={offset}'
# URL template for one item; `id` is filled in by get_all_page_details().
DETAIL_URL = 'https://spa5.scrape.center/api/book/{id}'

# Caps how many requests url_request() has in flight at once (10).
# NOTE(review): this is constructed at import time, before any event loop is
# running; on older Python versions asyncio primitives bind to the loop that
# is current at construction — confirm before changing how main() is run.
semaphore = asyncio.Semaphore(10)
# Shared HTTP session; stays None until main() assigns a ClientSession.
session = None # type: aiohttp.client.ClientSession

async def get_all_page(page_num):
    """Fetch one listing page and return whatever ``url_request`` yields.

    ``page_num`` is converted into a record offset of ``18 * page_num``
    (the listing URL asks for ``limit=18`` records per page), substituted
    into ``INDEX_URL``, logged, and then requested.
    """
    page_url = INDEX_URL.format(offset=page_num * 18)
    logging.info('url %s', page_url)
    payload = await url_request(page_url)
    return payload


async def get_all_page_details(_id):
    """Fetch the detail record for ``_id`` and insert it into MongoDB.

    The detail URL is built from ``DETAIL_URL``, logged, and requested via
    ``url_request``. When the request produced no usable document (for
    example ``url_request`` returned ``None`` after a failed request), the
    item is skipped and an error naming the URL is logged instead of
    attempting the insert.

    Args:
        _id: Identifier substituted into ``DETAIL_URL``.

    Returns:
        None. The fetched document is written to ``collection``.
    """
    url_format = DETAIL_URL.format(id=_id)
    logging.info('url %s', url_format)
    response = await url_request(url_format)

    # Check the response explicitly rather than catching AttributeError: a
    # try/except around the insert would also hide unrelated AttributeErrors
    # raised while saving, and gave no hint which URL had failed.
    if not isinstance(response, dict):
        logging.error('no detail data returned for %s, skipping save', url_format)
        return

    logging.info('saving data %s', response.get('id'))
    await collection.insert_one(response)


async def url_request(url, retries=3, backoff=0.5):
    """GET ``url`` through the shared session and return the decoded JSON.

    Requests are throttled by the module-level ``semaphore``. Transient
    failures (such as the server dropping the connection) are retried a
    bounded number of times with a linearly growing pause between attempts.

    Args:
        url: Absolute URL to request.
        retries: Total number of attempts; values below 1 are treated as 1.
        backoff: Base pause in seconds; attempt ``n`` is followed by a pause
            of ``backoff * n`` before the next attempt.

    Returns:
        The decoded JSON body, or ``None`` when every attempt failed.
    """
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            # Hold a semaphore slot only while the request is in flight, so
            # a sleeping retry does not block other requests.
            async with semaphore:
                async with session.get(url) as response:
                    return await response.json()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            if attempt < attempts:
                logging.warning('attempt %d/%d failed for %s, retrying',
                                attempt, attempts, url, exc_info=True)
            else:
                logging.error('error occurred while scraping %s', url, exc_info=True)
        if attempt < attempts:
            await asyncio.sleep(backoff * attempt)
    return None


async def main():
    """Crawl the listing pages, then fetch and store every listed item.

    Creates the shared ``aiohttp.ClientSession``, requests 50 listing pages
    concurrently, collects the ``id`` of each entry under ``results``, and
    then runs ``get_all_page_details`` for every collected id. Listing pages
    that failed to download or carry no ``results`` are skipped with a
    warning. The session is closed even when a step raises.
    """
    global session
    session = aiohttp.ClientSession()
    try:
        # Schedule one coroutine per listing page (50 pages in total).
        tasks = [asyncio.ensure_future(get_all_page(page_num)) for page_num in range(50)]
        pages = await asyncio.gather(*tasks)

        ids = []
        for page in pages:
            # A failed request yields None instead of a dict; skip it rather
            # than aborting the whole crawl.
            if not isinstance(page, dict):
                logging.warning('skipping listing page with no data')
                continue
            for item in page.get('results') or []:
                ids.append(item.get('id'))

        # Fetch and store the detail page of every collected id.
        detail_tasks = [asyncio.ensure_future(get_all_page_details(_id)) for _id in ids]
        await asyncio.gather(*detail_tasks)
    finally:
        # Always release the connection pool, including on errors.
        await session.close()


if __name__ == '__main__':
    # Run main() to completion on the loop returned by get_event_loop().
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(main())

请求偶尔会失败:抓取 900 条数据时,有 4 条报错。

报错日志如下:
2022-03-07 14:45:30,663 - ERROR: error occurred while scraping https://spa5.scrape.center/api/book/4903440
Traceback (most recent call last):
File "/Users/keeep/Downloads/py-project/pythonSpider/ScrapeSpa5/myspider.py", line 42, in url_request
async with session.get(url) as response:
File "/Users/keeep/Downloads/py-project/pythonSpider/venv/lib/python3.8/site-packages/aiohttp/client.py", line 1138, in __aenter__
self._resp = await self._coro
File "/Users/keeep/Downloads/py-project/pythonSpider/venv/lib/python3.8/site-packages/aiohttp/client.py", line 559, in _request
await resp.start(conn)
File "/Users/keeep/Downloads/py-project/pythonSpider/venv/lib/python3.8/site-packages/aiohttp/client_reqrep.py", line 898, in start
message, payload = await protocol.read() # type: ignore[union-attr]
File "/Users/keeep/Downloads/py-project/pythonSpider/venv/lib/python3.8/site-packages/aiohttp/streams.py", line 616, in read
await self._waiter
aiohttp.client_exceptions.ServerDisconnectedError: Server disconnected
赏个🍗吧
0%