爬取matplotlib源码

FilesPipeline

作用

用于下载文件的特殊下载器

用法

1
2
3
4
5
6
7
# Enable Scrapy's built-in FilesPipeline in settings.py
ITEM_PIPELINES = {
    # Must be added by hand; the value 1 is the pipeline's priority
    'scrapy.pipelines.files.FilesPipeline': 1,
}
# Directory where the downloaded files will be stored
FILES_STORE = '/Users/mintaoyu/Desktop/files'

总代码

items.py代码

1
2
3
4
5
class GetFileItem(scrapy.Item):
    """Item for Scrapy's FilesPipeline.

    ``file_urls`` holds the list of URLs the pipeline should download;
    ``files`` is filled in by the pipeline with the download results.
    (These two field names are the ones FilesPipeline expects.)
    """

    file_urls = scrapy.Field()
    files = scrapy.Field()

files.py代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import scrapy
from scrapy.linkextractors import LinkExtractor
from ..items import GetFileItem

class FilesSpider(scrapy.Spider):
    """Crawl the matplotlib examples index and emit items whose
    ``file_urls`` point at each example's source file, for FilesPipeline
    to download."""

    name = 'files'
    # NOTE: allowed_domains accepts only domains, not URLs —
    # ['https://matplotlib.org'] would trigger
    # "allowed_domains accepts only domains, not URLs."
    allowed_domains = ['matplotlib.org']
    start_urls = ['https://matplotlib.org/examples/index.html']

    def parse(self, response):
        """Follow every second-level toctree link to an example page."""
        extractor = LinkExtractor(restrict_xpaths='//li[@class="toctree-l2"]')
        for link in extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_example)

    def parse_example(self, response):
        """Return an item pointing at the example's external source link."""
        href = response.xpath(
            "//a[@class='reference external']/@href").extract_first()
        file_item = GetFileItem()
        # urljoin resolves relative hrefs against the page URL
        file_item['file_urls'] = [response.urljoin(href)]
        return file_item
赏个🍗吧
0%