Scrapy Framework

I wrote up some notes on the Scrapy framework a few years ago; this time I am going through it again systematically and filling in a few gaps.

Using Scrapy with Elasticsearch (ES)

  • Step 1: define the ES document class (the "bean")
# pip install elasticsearch_dsl
from elasticsearch_dsl import Document, Text, Keyword
from elasticsearch_dsl.connections import connections

# Create the connection
connections.create_connection(hosts="http://192.168.0.158:9200")

class Domain(Document):
    # Document fields
    text = Text(analyzer="ik_max_word")
    author = Keyword()
    tags = Text(analyzer="ik_max_word")

    class Index:
        # Index name (the ES equivalent of a database/table name)
        name = "scrapy"

# This step is required: without init() the mapping is not created and the ik_max_word analyzer does not take effect
if __name__ == '__main__':
    Domain.init()
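
Once the mapping has been initialised, the same Domain class can also be used to query the index through elasticsearch_dsl. A minimal sketch, assuming the import path used in the pipeline below; the query text is just an illustration:

from scrapytoscrape.scrapyto.bean import Domain

# Full-text search against the analysed `text` field
results = Domain.search().query("match", text="life").execute()
for hit in results:
    print(hit.author, hit.text)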
  • Step 2: assign the values in a pipeline and enable that pipeline in settings.py (see the settings sketch after the code)
from scrapytoscrape.scrapyto.bean import Domain

class ScrapytoscrapePipeline:
    def process_item(self, item, spider):
        domain = Domain()
        domain['text'] = item['text']
        domain['author'] = item['author']
        domain['tags'] = item['tags']
        domain.save()
        return item
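
Enabling the pipeline is a one-line change in settings.py. A minimal sketch; the module path is an assumption based on the project name above, and the number is the execution priority (lower runs first):

# settings.py
ITEM_PIPELINES = {
    # Path assumed from the project layout above; adjust to the real module
    'scrapytoscrape.pipelines.ScrapytoscrapePipeline': 300,
}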

Using Middleware

Downloader Middleware

# Typical uses: modify request headers, modify responses, manage cookies,
# drop non-200 responses, drop requests outside the allowed domains, etc.
# Each method must return None, a Response or a Request, or raise IgnoreRequest

# Called before the Engine sends a Request to the Downloader;
# used to modify request headers, configure proxies, etc.
def process_request(self, request, spider)
# Called after the Downloader returns a Response, before it reaches the Spider
def process_response(self, request, response, spider)
# Called when the download handler or a process_request method raises an exception
def process_exception(self, request, exception, spider)

Pay attention to what process_request returns. If it returns nothing (None), the remaining enabled Downloader Middlewares further down the chain continue to run and the request proceeds to the Downloader. If it returns a Request, that request is handed back to the Engine and re-queued in the Scheduler; at that point, if the project has only that one request, it will loop forever.
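
To make the return-value rules concrete, here is a minimal sketch before the full examples below. The use_cached_stub meta flag is purely illustrative, not a Scrapy built-in: returning a Response from process_request skips the Downloader and the remaining process_request calls, while returning None lets processing continue normally.

from scrapy.http import HtmlResponse

class ShortCircuitMiddleware:
    def process_request(self, request, spider):
        # Illustrative flag: serve a stub response without hitting the network
        if request.meta.get('use_cached_stub'):
            return HtmlResponse(url=request.url, body=b'<html></html>', encoding='utf-8')
        # None: let the remaining middlewares and the Downloader handle it
        return None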

import random
from scrapy.http import HtmlResponse

class RandomUserAgentMiddleware(object):
    def __init__(self):
        self.user_agents = [
            'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2',
            'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1'
        ]

    # Pick a random User-Agent for every request
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)


# Set a proxy
class ProxyMiddleware(object):
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://186.96.50.39:999'


class ChangeResponseMiddleware(object):
    # Modify the response before it reaches the Spider (here: the status code)
    def process_response(self, request, response, spider):
        response.status = 201
        return response
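
None of these classes take effect until they are registered in settings.py. A minimal sketch, assuming the middlewares live in a middlewares.py module inside the scrapytoscrape package; the numbers are priorities, where lower values sit closer to the Engine and their process_request runs earlier:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # Module paths assumed; adjust to the real project layout
    'scrapytoscrape.middlewares.RandomUserAgentMiddleware': 543,
    'scrapytoscrape.middlewares.ProxyMiddleware': 544,
    'scrapytoscrape.middlewares.ChangeResponseMiddleware': 545,
}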

Spider Middleware

# Typical uses: track crawl depth, drop non-200 responses,
# drop requests outside the allowed domains, etc.

# Process responses on their way into the Spider
def process_spider_input(self, response, spider)
# Process the Requests and Items the Spider yields
def process_spider_output(self, response, result, spider)
# Called when the Spider or process_spider_input raises an exception
def process_spider_exception(self, response, exception, spider)
# Similar to process_spider_output, but for the spider's start requests
def process_start_requests(self, start_requests, spider)
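
As an example of the output hook, here is a minimal sketch of a Spider Middleware that annotates every outgoing Request with its crawl depth, roughly what the built-in DepthMiddleware does. The class name is illustrative, and it still has to be enabled via SPIDER_MIDDLEWARES in settings.py:

from scrapy.http import Request

class DepthAnnotateMiddleware:
    def process_spider_input(self, response, spider):
        # Returning None passes the response on to the Spider unchanged
        return None

    def process_spider_output(self, response, result, spider):
        parent_depth = response.meta.get('depth', 0)
        for obj in result:
            # Requests yielded by the Spider inherit the parent's depth + 1
            if isinstance(obj, Request):
                obj.meta['depth'] = parent_depth + 1
            yield obj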

ItemPipeline

  • Clean HTML data
  • Validate scraped data
  • Check for duplicates
  • Store the data
# Return the item to keep processing it; raise DropItem to discard it
def process_item(self, item, spider)
# Initialisation, e.g. opening a database connection
def open_spider(self, spider)
# Cleanup, e.g. closing the database connection at the end
def close_spider(self, spider)
# Read the global settings from the crawler
@classmethod
def from_crawler(cls, crawler)

# Example: a MongoDB pipeline configured from the global settings
import pymongo

class MongoDBPipeline(object):
    def __init__(self, connection_string, database):
        self.connection_string = connection_string
        self.database = database

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            # In settings.py: MONGODB_CONNECTION_STRING = 'localhost'
            # This reads the global settings from the configuration
            connection_string=crawler.settings.get('MONGODB_CONNECTION_STRING'),
            database=crawler.settings.get('MONGODB_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.connection_string)
        self.db = self.client[self.database]

    def process_item(self, item, spider):
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()

# Image Pipeline
# 1. Set IMAGES_STORE in settings.py
IMAGES_STORE = './images'

# 2. Define MyImagePipeline, subclassing ImagesPipeline
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

class MyImagePipeline(ImagesPipeline):
    # (1) Extract the information we need and pass it along in the request meta
    def get_media_requests(self, item, info):
        for director in item['directors']:
            director_name = director['name']
            # Image URL
            director_image = director['image']
            yield Request(director_image, meta={
                'name': director_name,
                'type': 'director',
                'movie': item['name']
            })

    # (2) Pull the meta information back out and build the image save path
    def file_path(self, request, response=None, info=None):
        movie = request.meta['movie']
        type = request.meta['type']
        name = request.meta['name']
        file_name = f'{movie}/{type}/{name}.jpg'
        return file_name

    # (3) If no image was downloaded successfully, drop the item; otherwise return it
    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item
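
Both pipelines then have to be enabled in settings.py. A minimal sketch, assuming they live in the project's pipelines.py; the image pipeline is given a lower number so images are downloaded before the item is written to MongoDB, and the MongoDB database name is purely illustrative:

# settings.py
IMAGES_STORE = './images'
ITEM_PIPELINES = {
    # Module paths assumed; adjust to the real project layout
    'scrapytoscrape.pipelines.MyImagePipeline': 300,
    'scrapytoscrape.pipelines.MongoDBPipeline': 400,
}
MONGODB_CONNECTION_STRING = 'localhost'
MONGODB_DATABASE = 'scrapy'  # illustrative database name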

Rule-Based Crawling with Scrapy
