Scraping BMW Car Images
Create the project and generate a CrawlSpider template:

scrapy startproject baoma
cd baoma
scrapy genspider -t crawl baomawx "car.autohome.com.cn"
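After these commands the generated project should look roughly like this (the standard Scrapy layout):

baoma/
├── scrapy.cfg
└── baoma/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── baomawx.py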
With the project created, next write the launcher file start.py:
from scrapy import cmdline

cmdline.execute("scrapy crawl baomawx".split())
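Running python start.py from the project root is equivalent to typing scrapy crawl baomawx there; it simply saves re-entering the command while debugging.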
The baomawx.py file
The spider defines two parse callbacks, one for each download approach. The callback active below, paired with the custom images pipeline later on, stores images in per-category directories; the alternative callback is left commented out.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from baoma.items import BaomaItem


class BaomawxSpider(CrawlSpider):
    name = 'baomawx'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html']

    rules = (
        Rule(LinkExtractor(allow=r'https://car.autohome.com.cn/pic/series/65.+'),
             callback="parse_page", follow=True),
    )

    # Active approach: one item per page, with the page-level title as the category
    def parse_page(self, response):
        category = response.xpath('//div[@class="uibox"]/div[@class="uibox-title"]/text()').get()
        srcs = response.xpath('//div[contains(@class,"uibox-con")]/ul/li//img/@src').getall()
        # Strip the thumbnail size prefix to get the full-size image,
        # then resolve the protocol-relative src against the page URL
        srcs = list(map(lambda x: response.urljoin(x.replace("240x180_0_q95_c42_", "")), srcs))
        yield BaomaItem(body=category, image_urls=srcs)

    # Alternative approach: one item per category block (uibox) on the page
    # def one_page(self, response):
    #     uiboxs = response.xpath('//div[@class="uibox"]')[1:]
    #     for ui in uiboxs:
    #         body = ui.xpath('./div[@class="uibox-title"]/a/text()').get()
    #         urls = ui.xpath('.//ul//li/a//img/@src').getall()
    #         # The src values are relative, so join them against the page URL
    #         urls = list(map(lambda url: response.urljoin(url), urls))
    #         item = BaomaItem(body=body, image_urls=urls)
    #         yield item
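The single map line does two jobs at once: autohome thumbnail filenames carry a size prefix, so removing "240x180_0_q95_c42_" points at the full-size original, and response.urljoin (which resolves against response.url) turns the protocol-relative src into an absolute URL. A standalone sketch with a made-up src value (real ones are scraped from the img tags):

from urllib.parse import urljoin

page_url = "https://car.autohome.com.cn/pic/series/65.html"
# Hypothetical thumbnail src for illustration only
src = "//example.autoimg.cn/cardfs/product/240x180_0_q95_c42_autohomecar__abc.jpg"

full = urljoin(page_url, src.replace("240x180_0_q95_c42_", ""))
print(full)  # https://example.autoimg.cn/cardfs/product/autohomecar__abc.jpg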
The items.py file
# -*- coding: utf-8 -*-
import scrapy
# Fields used by the two storage approaches
class BaomaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    body = scrapy.Field()
    # urls = scrapy.Field()  # uncomment for the plain-urllib BaomaPipeline
    image_urls = scrapy.Field()
    images = scrapy.Field()
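The image_urls and images names are not arbitrary: they are the fields Scrapy's built-in ImagesPipeline reads and writes by default (configurable via the IMAGES_URLS_FIELD and IMAGES_RESULT_FIELD settings). A quick sketch of the item's dict-like behavior, with placeholder values:

from baoma.items import BaomaItem

item = BaomaItem(body="车身外观", image_urls=["https://example.com/full/sample.jpg"])
print(item["body"])  # scrapy.Item fields are accessed dict-style
print(dict(item))    # {'body': '车身外观', 'image_urls': ['https://example.com/full/sample.jpg']}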
The pipelines.py file
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os
from urllib import request
from scrapy.pipelines.images import ImagesPipeline
from baoma import settings
# Approach 1: download manually with urllib (no ImagesPipeline)
class BaomaPipeline(object):
    def __init__(self):
        # Create an "images" directory at the project root
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
        if not os.path.exists(self.path):
            os.mkdir(self.path)

    def process_item(self, item, spider):
        body = item['body']
        urls = item['urls']  # requires the "urls" field in items.py
        body_path = os.path.join(self.path, body)
        if not os.path.exists(body_path):
            os.mkdir(body_path)
        for url in urls:
            image_name = url.split('_')[-1]
            request.urlretrieve(url, os.path.join(body_path, image_name))
        return item
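Note that BaomaPipeline reads item['urls'], so using it requires uncommenting the urls field in items.py, having the spider yield urls instead of image_urls, and enabling it in ITEM_PIPELINES. As configured below, only the ImagesPipeline route is active.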
# Approach 2: per-category directory storage by overriding ImagesPipeline methods
class BaoMaImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Called before the download requests are sent; the parent
        # implementation is what actually builds those requests.
        request_objs = super(BaoMaImagesPipeline, self).get_media_requests(item, info)
        for request_obj in request_objs:
            # Attach the item so file_path() can read the category later
            request_obj.item = item
        return request_objs

    # Override the parent's method
    def file_path(self, request, response=None, info=None):
        # Called when an image is about to be stored, to decide its path
        path = super(BaoMaImagesPipeline, self).file_path(request, response, info)
        body = request.item.get("body")
        images_store = settings.IMAGES_STORE
        body_path = os.path.join(images_store, body)
        if not os.path.exists(body_path):
            os.mkdir(body_path)
        image_name = path.replace("full/", "")
        image_path = os.path.join(body_path, image_name)
        return image_path
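To see what the override accomplishes, trace one image through file_path. The parent implementation returns a path of the form "full/<sha1-of-the-URL>.jpg"; the override strips the "full/" prefix and re-roots the file under a per-category folder inside IMAGES_STORE. A minimal sketch with made-up values:

import os

IMAGES_STORE = "images"                     # matches the settings.py value below
default_path = "full/0a1b2c3d4e5f6789.jpg"  # shape of the parent's return value (hash is hypothetical)
body = "车身外观"                            # category carried on request.item

image_name = default_path.replace("full/", "")
print(os.path.join(IMAGES_STORE, body, image_name))
# -> images/车身外观/0a1b2c3d4e5f6789.jpg on POSIX systems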
The settings.py file
Uncomment the lines below and add IMAGES_STORE at the end; note that settings.py also needs import os for the path join to work.
# -*- coding: utf-8 -*-
import os
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
}
ITEM_PIPELINES = {
    # 'baoma.pipelines.BaomaPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    'baoma.pipelines.BaoMaImagesPipeline': 1,
}
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
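With everything in place, run python start.py from the project root. If the crawl succeeds, the downloads should land in a layout roughly like this (the category names come from the page titles, so yours will differ):

images/
├── 车身外观/
│   ├── autohomecar__abc.jpg
│   └── ...
└── 中控方向盘/
    └── ...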