Scraping Qiushibaike (糗事百科) with Scrapy

  1. scrapy startproject qsbk — create the project
  2. cd qsbk
  3. scrapy genspider qsbk_spider "qiushibaike.com" — generate the spider. With the project created, open PyCharm and add a launcher file start.py (see the layout sketch and the two-line file below).
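
For reference, a project generated this way should have roughly the following layout (a sketch of Scrapy's standard project template; middlewares.py is generated but not used in this walkthrough, and start.py is the launcher we add by hand next to scrapy.cfg):

    qsbk/
        scrapy.cfg
        start.py
        qsbk/
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py
                qsbk_spider.py

start.py only needs two lines: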

    from scrapy import cmdline
    cmdline.execute("scrapy crawl qsbk_spider".split())
    

    qsbk_spider.py

    # -*- coding: utf-8 -*-
    import scrapy
    from qsbk.items import QsbkItem

    class QsbkSpiderSpider(scrapy.Spider):
        name = 'qsbk_spider'
        allowed_domains = ['qiushibaike.com']
        start_urls = ['https://www.qiushibaike.com/text/page/2/']

        def parse(self, response):
            # each joke sits in its own div under the content-left container
            content = response.xpath('//div[@id="content-left"]/div')
            for con in content:
                author = con.xpath('.//h2/text()').get().strip()
                fill = con.xpath('.//div[@class="content"]/span/text()').getall()
                fill = "".join(fill).strip()
                item = QsbkItem(author=author, fill=fill)
                yield item
            # follow the "next page" link until there is none
            next_url = response.xpath('//ul[@class="pagination"]/li[last()]/a/@href').get()
            if not next_url:
                return
            else:
                next_url = 'https://www.qiushibaike.com' + next_url
                yield scrapy.Request(next_url, callback=self.parse)
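
Before running the whole spider, the XPath expressions can be sanity-checked interactively with scrapy shell (a quick sketch; the selectors are the same ones used in parse above):

    scrapy shell "https://www.qiushibaike.com/text/page/2/"
    # then, inside the shell:
    >>> divs = response.xpath('//div[@id="content-left"]/div')
    >>> divs[0].xpath('.//h2/text()').get().strip()
    >>> "".join(divs[0].xpath('.//div[@class="content"]/span/text()').getall()).strip()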
    

items.py

# -*- coding: utf-8 -*-
import scrapy

class QsbkItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = scrapy.Field()
    fill = scrapy.Field()
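
As a side note, a Scrapy Item behaves much like a dict, which is why the pipelines below can call dict(item) or hand the item straight to an exporter. A quick sketch (the values are made up, just to show field access):

    from qsbk.items import QsbkItem

    item = QsbkItem(author='some user', fill='some joke text')
    print(item['author'])        # fields are accessed like dict keys
    print(dict(item))            # {'author': 'some user', 'fill': 'some joke text'}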

pipelines.py

Two versions are shown below: one writes the JSON file by hand with the json module, the other uses Scrapy's built-in JSON exporter classes.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json
# Version 1: write the JSON file by hand with the json module
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('G:/爬取内容/duanzi.json','w',encoding='utf8')
#     def open_spider(self,spider):
#         print("spider started")
#     def process_item(self, item, spider):
#         item_json = json.dumps(dict(item),ensure_ascii=False)
#         self.fp.write(item_json + '\n')
#         return item
#     def close_spider(self,spider):
#         self.fp.close()
#         print("spider finished")

# Version 2: Scrapy's built-in JSON exporters

from scrapy.exporters import JsonItemExporter,JsonLinesItemExporter

# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('G:/爬取内容/duanzi.json','wb')    # opened in binary mode, because the exporter writes bytes
#         self.exporter = JsonItemExporter(self.fp,ensure_ascii=False,encoding='utf8')
#         self.exporter.start_exporting()    # this exporter keeps the items in memory and only writes them to the file at the end
#     def open_spider(self,spider):
#         print("spider started")
#     def process_item(self, item, spider):
#         self.exporter.export_item(item)
#         return item
#     def close_spider(self,spider):
#         self.exporter.finish_exporting()  # with a large amount of data this approach uses a lot of memory
#         self.fp.close()
#         print("spider finished")

# This version writes the items line by line, one dict per line, so the file as a whole is no longer a single JSON string
class QsbkPipeline(object):
    def __init__(self):
        self.fp = open('G:/爬取内容/duanzi.json', 'wb') 
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf8') 

    def open_spider(self, spider):
        print("爬虫开始")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("爬虫结束")

settings.py

Uncomment (and adjust) the following lines: turn off robots.txt compliance, add a one-second download delay, set default request headers with a browser User-Agent, and register the item pipeline.

# -*- coding: utf-8 -*-
ROBOTSTXT_OBEY = False


DOWNLOAD_DELAY = 1

DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}

ITEM_PIPELINES = {
   'qsbk.pipelines.QsbkPipeline': 300,
}
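
The value 300 is the pipeline's order: when several pipelines are enabled, lower numbers run first (values are conventionally kept between 0 and 1000). For example (SecondPipeline is a hypothetical name, only there to show the ordering):

    ITEM_PIPELINES = {
       'qsbk.pipelines.QsbkPipeline': 300,      # runs first
       'qsbk.pipelines.SecondPipeline': 800,    # hypothetical second pipeline, runs later
    }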