爬取图库 | 俞塘

目标

使用 Scrapy 爬取 moe.005.tv 上的 cosplay 图片，并通过内置的 ImagesPipeline 下载到本地

实现

items.py代码

class GetImagesItem(scrapy.Item):
    """Item holding the image URLs scraped from one page."""

    # The two field names below must not be renamed (original author's
    # note); presumably because Scrapy's image pipeline looks them up by
    # these exact names -- confirm against the pipeline documentation.
    image_urls = scrapy.Field()
    images = scrapy.Field()

settings.py代码

# Enable Scrapy's built-in image pipeline (order value 1).
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}

# Directory configured as the image store for the pipeline.
IMAGES_STORE = '/Users/mintaoyu/Desktop/images'

# Minimum image dimensions, in pixels, configured for the pipeline.
IMAGES_MIN_HEIGHT = 400
IMAGES_MIN_WIDTH = 600

images.py代码

import scrapy
from scrapy.linkextractors import LinkExtractor
from ..items import GetImagesItem

class ImagesSpider(scrapy.Spider):
    """Spider that walks the cosplay listing and collects image URLs."""

    name = 'images'
    allowed_domains = ['moe.005.tv']
    start_urls = ['http://moe.005.tv/cosplay/']

    def parse(self, response):
        """Handle a listing page.

        Yields one request per link found inside the ``zhuti_w_list``
        container (each handled by ``parse_images``), followed by a request
        for the next listing page when a link with class ``n`` is present.
        """
        # Links to the individual gallery pages.
        gallery_links = LinkExtractor(
            restrict_xpaths='//div[@class="zhuti_w_list"]'
        ).extract_links(response)
        for link in gallery_links:
            yield scrapy.Request(link.url, callback=self.parse_images)

        # Pagination: follow only the first "next page" link, if any.
        next_links = LinkExtractor(
            restrict_xpaths='//a[@class="n"]'
        ).extract_links(response)
        if next_links:
            yield scrapy.Request(next_links[0].url, callback=self.parse)

    def parse_images(self, response):
        """Build an item with the image URLs found on a gallery page.

        Reads the ``src`` of every ``<img>`` under the ``content_nr``
        container and returns a ``GetImagesItem`` whose ``image_urls`` is
        the list of those values as absolute URLs.
        """
        raw_sources = response.xpath(
            '//div[@class="content_nr"]//img/@src'
        ).extract()

        item = GetImagesItem()
        # Resolve against the page URL so relative or protocol-relative
        # ``src`` values become downloadable absolute URLs; blank values
        # are skipped because they would resolve to the page itself.
        item['image_urls'] = [
            response.urljoin(src.strip())
            for src in raw_sources
            if src.strip()
        ]
        return item