Scraping Qiushibaike
Create the project:
scrapy startproject qsbk
Change into the project directory:
cd qsbk
Then generate the spider (the domain argument restricts crawling to qiushibaike.com):
scrapy genspider qsbk_spider "qiushibaike.com"
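After these two commands the project tree should look roughly like this (all the inner files are generated by Scrapy):
qsbk/
    scrapy.cfg
    qsbk/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            qsbk_spider.py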
With the project created, open PyCharm and write a launcher file, start.py:
from scrapy import cmdline

cmdline.execute("scrapy crawl qsbk_spider".split())
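Run start.py from the project root (the directory containing scrapy.cfg); it is equivalent to typing scrapy crawl qsbk_spider in a terminal, just runnable from inside PyCharm.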
The qsbk_spider.py file:
# -*- coding: utf-8 -*-
import scrapy
from qsbk.items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/2/']

    def parse(self, response):
        # each joke lives in its own div under #content-left
        content = response.xpath('//div[@id="content-left"]/div')
        for con in content:
            author = con.xpath('.//h2/text()').get().strip()
            # a joke can span several <span> text nodes, so join them
            fill = con.xpath('.//div[@class="content"]/span/text()').getall()
            fill = "".join(fill).strip()
            item = QsbkItem(author=author, fill=fill)
            yield item
        # follow the "next page" link; on the last page it is absent
        next_url = response.xpath('//ul[@class="pagination"]/li[last()]/a/@href').get()
        if not next_url:
            return
        else:
            next_url = 'https://www.qiushibaike.com' + next_url
            yield scrapy.Request(next_url, callback=self.parse)
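Before running the full crawl, the XPaths can be sanity-checked in scrapy shell; a quick session might look like this (assuming the live page still matches the selectors above, so adjust as needed):
scrapy shell "https://www.qiushibaike.com/text/page/2/"
>>> divs = response.xpath('//div[@id="content-left"]/div')
>>> divs[0].xpath('.//h2/text()').get()
>>> response.xpath('//ul[@class="pagination"]/li[last()]/a/@href').get()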
The items.py file:
# -*- coding: utf-8 -*-
import scrapy
class QsbkItem(scrapy.Item):
    author = scrapy.Field()  # author of the joke
    fill = scrapy.Field()    # text of the joke
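A scrapy.Item behaves like a dict, which is what the pipelines below rely on when they call dict(item) or hand the item to an exporter. A minimal sketch with made-up values:
from qsbk.items import QsbkItem

item = QsbkItem(author='someone', fill='a short joke')
print(item['author'])  # field access works like a dict
print(dict(item))      # {'author': 'someone', 'fill': 'a short joke'}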
The pipelines.py file:
Two approaches are shown: exporting the file by hand with the json module, and using Scrapy's built-in JSON exporter classes. The first two versions are left commented out; the active pipeline uses JsonLinesItemExporter.
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json

# Approach 1: dump each item to a JSON string by hand
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('G:/爬取内容/duanzi.json', 'w', encoding='utf8')
#     def open_spider(self, spider):
#         print("spider started")
#     def process_item(self, item, spider):
#         item_json = json.dumps(dict(item), ensure_ascii=False)
#         self.fp.write(item_json + '\n')
#         return item
#     def close_spider(self, spider):
#         self.fp.close()
#         print("spider finished")
# Approach 2: Scrapy's built-in JSON exporter
from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter
# class QsbkPipeline(object):
#     def __init__(self):
#         # binary mode: the exporter writes bytes
#         self.fp = open('G:/爬取内容/duanzi.json', 'wb')
#         self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf8')
#         # JsonItemExporter buffers every item in memory and only writes
#         # the complete JSON array to the file when exporting finishes
#         self.exporter.start_exporting()
#     def open_spider(self, spider):
#         print("spider started")
#     def process_item(self, item, spider):
#         self.exporter.export_item(item)
#         return item
#     def close_spider(self, spider):
#         # on a large crawl this buffering costs a lot of memory
#         self.exporter.finish_exporting()
#         self.fp.close()
#         print("spider finished")
# Approach 3: JsonLinesItemExporter writes one JSON dict per line as items
# arrive, so nothing accumulates in memory; the trade-off is that the whole
# file is no longer a single JSON value.
class QsbkPipeline(object):
    def __init__(self):
        self.fp = open('G:/爬取内容/duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf8')

    def open_spider(self, spider):
        print("spider started")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("spider finished")
The settings.py file:
Uncomment (and where needed adjust) the following settings:
# -*- coding: utf-8 -*-
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}
ITEM_PIPELINES = {
    'qsbk.pipelines.QsbkPipeline': 300,
}
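The 300 in ITEM_PIPELINES is the pipeline's order (0 to 1000, lower runs first); with a single pipeline any value works. As an aside, on Scrapy 2.1 or newer the custom pipeline could be replaced entirely by the built-in feed exports; a sketch, assuming that version:
FEEDS = {
    'G:/爬取内容/duanzi.jl': {
        'format': 'jsonlines',
        'encoding': 'utf8',
    },
}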