爬取图库

目标

爬取cosplay图片

实现

items.py代码

1
2
3
4
5
class GetImagesItem(scrapy.Item):
    """Item handed to Scrapy's built-in ``ImagesPipeline``.

    The two field names below must not be renamed: they are the names the
    pipeline looks up by default (they can only be changed together with the
    ``IMAGES_URLS_FIELD`` / ``IMAGES_RESULT_FIELD`` settings).
    """

    # Input side: list of image URLs the pipeline should download.
    image_urls = scrapy.Field()
    # Output side: populated by the pipeline with the download results.
    images = scrapy.Field()

settings.py代码

1
2
3
4
5
6
7
# Enable Scrapy's built-in image pipeline; 1 is its order value.
ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}

# Directory the image pipeline stores downloaded files in.
IMAGES_STORE = "/Users/mintaoyu/Desktop/images"

# Minimum image dimensions in pixels; per the Scrapy docs, smaller images
# are dropped by the pipeline.
IMAGES_MIN_HEIGHT = 400
IMAGES_MIN_WIDTH = 600

spiders/images.py代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import scrapy
from scrapy.linkextractors import LinkExtractor
from ..items import GetImagesItem

class ImagesSpider(scrapy.Spider):
    """Crawl the cosplay listing on moe.005.tv and collect image URLs.

    ``parse`` walks the paginated listing and schedules one request per
    gallery page; ``parse_images`` turns each gallery page into a single
    ``GetImagesItem`` whose ``image_urls`` field lists the images found.
    """

    name = 'images'
    allowed_domains = ['moe.005.tv']
    start_urls = ['http://moe.005.tv/cosplay/']

    def parse(self, response):
        """Yield a request for every gallery link, then for the next page.

        Args:
            response: Listing-page response.

        Yields:
            ``scrapy.Request`` objects: gallery pages are routed to
            ``parse_images``, the next listing page back to ``parse``.
        """
        # Gallery links inside the listing container.
        le = LinkExtractor(restrict_xpaths='//div[@class="zhuti_w_list"]')
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_images)

        # Pagination: follow the first link matched by the "n" anchor class.
        le = LinkExtractor(restrict_xpaths='//a[@class="n"]')
        links = le.extract_links(response)
        if links:
            next_url = links[0].url
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_images(self, response):
        """Build the item for one gallery page.

        Args:
            response: Gallery-page response.

        Returns:
            A ``GetImagesItem`` whose ``image_urls`` holds absolute URLs of
            the ``<img>`` elements inside the ``content_nr`` container.
        """
        example = GetImagesItem()
        sources = response.xpath('//div[@class="content_nr"]//img/@src').extract()
        # The image pipeline builds a Request per URL, so each one must be
        # absolute; resolve relative/protocol-relative ``src`` values against
        # the page URL and skip empty attributes instead of failing the item.
        example['image_urls'] = [response.urljoin(src) for src in sources if src]
        return example
赏个🍗吧
0%