Crawling All Articles on Jianshu (简书)

  1. scrapy startproject jianshu to create the project
  2. cd jianshu
  3. scrapy genspider -t crawl js "jianshu.com" to create a CrawlSpider named js. With the project created, open it in PyCharm and add a launcher file start.py

    from scrapy import cmdline
    cmdline.execute("scrapy crawl js".split())
    

    js.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    from jianshu.items import JianshuItem

    class JsSpider(CrawlSpider):
        name = 'js'
        allowed_domains = ['jianshu.com']
        start_urls = ['http://jianshu.com/']

        rules = (
            Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            avatar = response.xpath(r'//div[@class="article"]/div[@class="author"]/a[@class="avatar"]/img/@src').get()
            title = response.xpath(r'//div[@class="article"]/h1/text()').get()
            author = response.xpath(r'//div[@class="article"]//div[@class="info"]/span/a/text()').get()
            pub_time = response.xpath(r'//span[@class="publish-time"]/text()').get()
            url = response.url
            url1 = url.split("?")[0]
            article_id = url1.split('/')[-1]
            content = response.xpath(r'//div[@class="show-content"]').get()
            word_count = int(response.xpath(r'//span[@class="wordage"]/text()').get().split()[-1])
            comment_count = int(response.xpath(r'//span[@class="comments-count"]/text()').get().split()[-1])
            like_count = int(response.xpath(r'//span[@class="likes-count"]/text()').get().split()[-1])
            read_count = int(response.xpath(r'//span[@class="views-count"]/text()').get().split()[-1])
            subjects = ",".join(response.xpath(r'//div[@class="include-collection"]//a/div/text()').getall())

            item = JianshuItem(avatar=avatar,
                               title=title,
                               author=author,
                               pub_time=pub_time,
                               origin_url=response.url,
                               article_id=article_id,
                               content=content,
                               read_count=read_count,
                               like_count=like_count,
                               word_count=word_count,
                               comment_count=comment_count,
                               subjects=subjects)
            yield item
    

items.py

One item class is used for storing to a JSON file, the other for storing to the database.

# -*- coding: utf-8 -*-
import scrapy
class JianshuItem(scrapy.Item):
    avatar = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    pub_time = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    content = scrapy.Field()
    read_count = scrapy.Field()
    like_count = scrapy.Field()
    word_count = scrapy.Field()
    comment_count = scrapy.Field()
    subjects = scrapy.Field()

class ShuPipeItem(scrapy.Item):
    avatar = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    pub_time = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    content = scrapy.Field()
    read_count = scrapy.Field()
    like_count = scrapy.Field()
    word_count = scrapy.Field()
    comment_count = scrapy.Field()
    subjects = scrapy.Field()

pipelines.py

Two different ways of writing to the database are used here: one stores synchronously, the other asynchronously.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
# Synchronous storage
class JianshuPipeline(object):
    def __init__(self):
        data = {
            'host':'localhost',
            'port':3306,
            'user':'root',
            'password':'*******',
            'database':'jianshu',
            'charset':'utf8mb4'
        }
        self.conn = pymysql.connect(**data)
        self.cursor = self.conn.cursor()
        if self.conn:
            print('='*30)
            print('connected to MySQL')
            print('='*30)
        self._sql = None
    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (item['avatar'], item['title'], item['author'], item['pub_time'],
                                       item['origin_url'], item['article_id'], item['content'],
                                       item['read_count'], item['like_count'], item['word_count'],
                                       item['comment_count'], item['subjects']))
        self.conn.commit()
        return item

    @property
    def sql(self):
        if not self._sql:
            self._sql = '''insert into article2(avatar, title, author, pub_time, origin_url, article_id, content,
                           read_count, like_count, word_count, comment_count, subjects)
                           values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
        return self._sql
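
The INSERT above assumes a table named article2 already exists in the jianshu database. The column names come from the SQL statement; the column types below are assumptions, so adjust them to match your data. A minimal sketch for creating the table with pymysql:

# create_table.py -- a sketch only; column types are assumptions
import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='*******', database='jianshu', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute('''
        create table if not exists article2(
            id int primary key auto_increment,
            avatar varchar(255),
            title varchar(255),
            author varchar(64),
            pub_time varchar(32),
            origin_url varchar(255),
            article_id varchar(32),
            content longtext,
            read_count int,
            like_count int,
            word_count int,
            comment_count int,
            subjects text
        ) default charset=utf8mb4
    ''')
conn.commit()
conn.close()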

# Asynchronous storage
from twisted.enterprise import adbapi
from pymysql import cursors
class JianShuTwistedPipeline(object):   # asynchronous, using a connection pool
    def __init__(self):
        data = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': '*******',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        # pass the DB-API module name plus the connection parameters to the pool
        self.dbpool = adbapi.ConnectionPool('pymysql', **data)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:
            self._sql = '''
                insert into article(id, avatar, title, author,
                pub_time, origin_url, article_id, content) values(null,%s,%s,%s,%s,%s,%s,%s)
                '''
        return self._sql

    def process_item(self, item, spider):
        # run the insert through the pool so it does not block the crawl
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)   # called if an error occurs
        return item

    def insert_item(self, cursor, item):  # the actual insert
        cursor.execute(self.sql,
                       (item['avatar'], item['title'], item['author'], item['pub_time'],
                        item['origin_url'], item['article_id'], item['content']))

    def handle_error(self, error, item, spider):  # just print the error
        print('='*10 + 'error' + '='*10)
        print(error)
        print('='*10 + 'error' + '='*10)

# Don't forget to enable the pipeline you want in ITEM_PIPELINES in settings.py

from scrapy.exporters import JsonLinesItemExporter
import os
os.chdir('G:/爬取内容')
class ShuPipeline(object):
    def __init__(self):
        self.jianshu = open('jianshu.json','wb')
        self.jianshu_exporter = JsonLinesItemExporter(self.jianshu,ensure_ascii=False)
    def process_item(self, item, spider):
        self.jianshu_exporter.export_item(item)

        return item

    def close_spider(self,spider):
        self.jianshu.close()

settings.py

# -*- coding: utf-8 -*-
BOT_NAME = 'jianshu'

SPIDER_MODULES = ['jianshu.spiders']
NEWSPIDER_MODULE = 'jianshu.spiders'


ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 2

DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
}

DOWNLOADER_MIDDLEWARES = {
   # 'jianshu.middlewares.JianshuDownloaderMiddleware': 543,
   'jianshu.middlewares.SeleniumDownLoadMiddleware': 543,
}

ITEM_PIPELINES = {
   'jianshu.pipelines.JianshuPipeline': 300,
   # 'jianshu.pipelines.JianShuTwistedPipeline': 300,
   # 'jianshu.pipelines.ShuPipeline': 300,
}
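
The DOWNLOADER_MIDDLEWARES setting above enables jianshu.middlewares.SeleniumDownLoadMiddleware, but middlewares.py is not shown in this post. Below is a minimal sketch of what such a middleware could look like, assuming Chrome/chromedriver is used to render the JavaScript-loaded parts of the page; the chromedriver path is a placeholder:

# middlewares.py -- a sketch only, not the original implementation
from selenium import webdriver
from scrapy.http import HtmlResponse
import time

class SeleniumDownLoadMiddleware(object):
    def __init__(self):
        # placeholder path: point this at your local chromedriver
        self.driver = webdriver.Chrome(executable_path=r'D:/chromedriver/chromedriver.exe')

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(1)  # give the page's JavaScript time to render
        # returning a Response here makes Scrapy skip its own downloader for this request
        return HtmlResponse(url=self.driver.current_url, body=self.driver.page_source,
                            request=request, encoding='utf-8')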