Scraping BMW Car Images

  1. scrapy startproject baoma to create the project
  2. cd baoma
  3. scrapy genspider -t crawl baomawx "car.autohome.com.cn". The project is now set up; next, write the launcher file start.py:

    from scrapy import cmdline

    # Equivalent to running "scrapy crawl baomawx" from the project root
    cmdline.execute("scrapy crawl baomawx".split())
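
    After these steps (with start.py added by hand at the project root), the layout should look roughly like this:

        baoma/
            scrapy.cfg
            start.py
            baoma/
                __init__.py
                items.py
                middlewares.py
                pipelines.py
                settings.py
                spiders/
                    __init__.py
                    baomawx.py

    Running python start.py from the project root then kicks off the crawl without typing the scrapy command each time.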
    

    baomawx.py

    The spider has two parse callbacks, corresponding to two different ways of grouping the downloads: the active parse_page treats each gallery page as a single category, while the alternative, which splits each page into its sub-sections, is left commented out below.

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor
    from baoma.items import BaomaItem

    class BaomawxSpider(CrawlSpider):
        name = 'baomawx'
        allowed_domains = ['car.autohome.com.cn']
        start_urls = ['https://car.autohome.com.cn/pic/series/65.html']
        rules = (
            Rule(LinkExtractor(allow=r'https://car.autohome.com.cn/pic/series/65.+'),
                 callback="parse_page", follow=True),
        )

        # One category per gallery page
        def parse_page(self, response):
            category = response.xpath('//div[@class="uibox"]/div[@class="uibox-title"]/text()').get()
            srcs = response.xpath('//div[contains(@class,"uibox-con")]/ul/li//img/@src').getall()
            # Strip the thumbnail size prefix to get the full-size image,
            # then resolve the protocol-relative URL against the page URL
            srcs = list(map(lambda x: response.urljoin(x.replace("240x180_0_q95_c42_", "")), srcs))
            yield BaomaItem(body=category, image_urls=srcs)

        # One category per uibox section on the page (alternative, commented out)
        # def one_page(self, response):
        #     uiboxs = response.xpath('//div[@class="uibox"]')[1:]
        #     for ui in uiboxs:
        #         body = ui.xpath('./div[@class="uibox-title"]/a/text()').get()
        #         urls = ui.xpath('.//ul//li/a//img/@src').getall()
        #         # The src values are protocol-relative, so resolve them
        #         # against the page URL with response.urljoin
        #         urls = list(map(lambda url: response.urljoin(url), urls))
        #         item = BaomaItem(body=body, image_urls=urls)
        #         yield item
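
    The key detail in parse_page is the URL rewrite: the gallery serves protocol-relative thumbnail URLs whose filenames carry a size prefix, and stripping "240x180_0_q95_c42_" points at the full-size original. A standalone sketch of the transformation (the sample URL is made up to show the pattern):

        from urllib.parse import urljoin

        # Hypothetical thumbnail src as scraped from the page
        thumb = "//car3.autoimg.cn/cardfs/product/g1/240x180_0_q95_c42_autohomecar__sample.jpg"
        base = "https://car.autohome.com.cn/pic/series/65.html"

        # Drop the size prefix, then resolve the protocol-relative URL,
        # which is exactly what response.urljoin does inside the spider
        full = urljoin(base, thumb.replace("240x180_0_q95_c42_", ""))
        print(full)  # https://car3.autoimg.cn/cardfs/product/g1/autohomecar__sample.jpg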
    
    

items.py

# -*- coding: utf-8 -*-
import scrapy

# body carries the category name used for the per-category directory;
# image_urls and images are the field names Scrapy's ImagesPipeline expects
class BaomaItem(scrapy.Item):
    body = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
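
For reference, once the ImagesPipeline has run, it fills the images field with one result dict per downloaded file; the shape below follows Scrapy's media-pipeline convention, with illustrative values:

    # One entry of item['images'] after a successful download (values illustrative)
    {
        'url': 'https://car3.autoimg.cn/cardfs/product/g1/autohomecar__sample.jpg',
        'path': 'full/0a1b2c3d4e5f.jpg',  # whatever file_path() returned
        'checksum': '0a1b2c3d4e5f67890a1b2c3d4e5f6789',
    }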

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os
from urllib import request
from scrapy.pipelines.images import ImagesPipeline
from baoma import settings

# Approach 1: download the images yourself with urllib.request (blocking)
class BaomaPipeline(object):
    def __init__(self):
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
        if not os.path.exists(self.path):
            os.mkdir(self.path)

    def process_item(self, item, spider):
        body = item['body']
        urls = item['image_urls']
        # One subdirectory per category
        body_path = os.path.join(self.path, body)
        if not os.path.exists(body_path):
            os.mkdir(body_path)
        for url in urls:
            image_name = url.split('_')[-1]
            request.urlretrieve(url, os.path.join(body_path, image_name))
        return item

# Approach 2: subclass Scrapy's ImagesPipeline so downloads land in
# per-category directories instead of the default full/ directory
class BaoMaImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Called before the download requests are sent; attach the item
        # to each request so file_path can read the category later
        request_objs = super(BaoMaImagesPipeline, self).get_media_requests(item, info)
        for request_obj in request_objs:
            request_obj.item = item
        return request_objs

    def file_path(self, request, response=None, info=None):
        # Called when an image is about to be stored; returns the storage path
        path = super(BaoMaImagesPipeline, self).file_path(request, response, info)
        body = request.item.get("body")
        images_store = settings.IMAGES_STORE
        body_path = os.path.join(images_store, body)
        if not os.path.exists(body_path):
            os.mkdir(body_path)
        image_name = path.replace("full/", "")
        return os.path.join(body_path, image_name)
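
If your Scrapy version is 2.4 or newer, file_path also receives the item directly, so the get_media_requests trick of attaching the item to each request can be dropped. A minimal sketch under that assumption (the class name is mine, not from the original project):

    import os
    from scrapy.pipelines.images import ImagesPipeline

    # Sketch for Scrapy >= 2.4, where file_path gets the item as a keyword argument
    class BaoMaImagesPipelineModern(ImagesPipeline):
        def file_path(self, request, response=None, info=None, *, item=None):
            path = super().file_path(request, response, info, item=item)
            # Return a path relative to IMAGES_STORE; the file store
            # creates the per-category directory on its own
            return os.path.join(item["body"], path.replace("full/", ""))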

settings.py

Uncomment the following settings and add IMAGES_STORE (with the os import it needs) at the end:

# -*- coding: utf-8 -*-
import os

ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
}

ITEM_PIPELINES = {
    # 'baoma.pipelines.BaomaPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    'baoma.pipelines.BaoMaImagesPipeline': 1,
}
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
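
With everything in place, start the crawl from the project root. Each gallery category ends up in its own subdirectory under images (names depend on the site's section titles):

    python start.py
    # images/
    #     <category from uibox-title>/
    #         <downloaded .jpg files>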