爬取matplotlib源码

FilesPipeline

作用

用于下载文件的特殊下载器

用法

1
2
3
4
5
6
7
# Enable Scrapy's built-in FilesPipeline in settings.py
ITEM_PIPELINES = {
    # Must be added by hand; the value 1 is the pipeline's priority
    'scrapy.pipelines.files.FilesPipeline': 1,
}
# Directory where the downloaded files will be stored
FILES_STORE = '/Users/mintaoyu/Desktop/files'

总代码

items.py代码

1
2
3
4
5
class GetFileItem(scrapy.Item):
    """Item for Scrapy's FilesPipeline.

    ``file_urls`` holds the list of URLs the pipeline should download;
    ``files`` is filled in by the pipeline with the download results.
    (These two field names are the ones FilesPipeline expects.)
    """

    file_urls = scrapy.Field()
    files = scrapy.Field()

files.py代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import scrapy
from scrapy.linkextractors import LinkExtractor
from ..items import GetFileItem

class FilesSpider(scrapy.Spider):
    """Crawl the matplotlib examples index and emit items whose
    ``file_urls`` point at each example's source file, for FilesPipeline
    to download."""

    name = 'files'
    # NOTE: allowed_domains accepts only domains, not URLs —
    # ['https://matplotlib.org'] would trigger
    # "allowed_domains accepts only domains, not URLs."
    allowed_domains = ['matplotlib.org']
    start_urls = ['https://matplotlib.org/examples/index.html']

    def parse(self, response):
        """Follow every second-level toctree link to an example page."""
        extractor = LinkExtractor(restrict_xpaths='//li[@class="toctree-l2"]')
        for link in extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_example)

    def parse_example(self, response):
        """Return an item pointing at the example's external source link."""
        href = response.xpath(
            "//a[@class='reference external']/@href").extract_first()
        file_item = GetFileItem()
        # urljoin resolves relative hrefs against the page URL
        file_item['file_urls'] = [response.urljoin(href)]
        return file_item
赏个🍗吧
0%