Crawling All Jianshu Articles with Scrapy
Create the project and generate a CrawlSpider (the original command had a stray token and the wrong domain; the spider crawls jianshu.com):

scrapy startproject jianshu
cd jianshu
scrapy genspider -t crawl js "jianshu.com"
With the project created, open it in PyCharm and add a launcher script, start.py:
from scrapy import cmdline

cmdline.execute("scrapy crawl js".split())
js.py:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu.items import JianshuItem


class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']

    # Article URLs look like /p/<12 characters from [0-9a-z]>
    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        avatar = response.xpath(r'//div[@class="article"]/div[@class="author"]/a[@class="avatar"]/img/@src').get()
        title = response.xpath(r'//div[@class="article"]/h1/text()').get()
        author = response.xpath(r'//div[@class="article"]//div[@class="info"]/span/a/text()').get()
        pub_time = response.xpath(r'//span[@class="publish-time"]/text()').get()
        # Strip any query string, then take the last path segment as the article id
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split('/')[-1]
        content = response.xpath(r'//div[@class="show-content"]').get()
        word_count = int(response.xpath(r'//span[@class="wordage"]/text()').get().split()[-1])
        comment_count = int(response.xpath(r'//span[@class="comments-count"]/text()').get().split()[-1])
        like_count = int(response.xpath(r'//span[@class="likes-count"]/text()').get().split()[-1])
        read_count = int(response.xpath(r'//span[@class="views-count"]/text()').get().split()[-1])
        subjects = ",".join(response.xpath(r'//div[@class="include-collection"]//a/div/text()').getall())
        item = JianshuItem(avatar=avatar, title=title, author=author, pub_time=pub_time,
                           origin_url=response.url, article_id=article_id, content=content,
                           read_count=read_count, like_count=like_count, word_count=word_count,
                           comment_count=comment_count, subjects=subjects)
        yield item
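As a quick sanity check on the Rule above: the allow pattern matches article URLs of the form /p/<12 characters from [0-9a-z]>. The article id below is made up purely for illustration:

import re

pattern = re.compile(r'.*/p/[0-9a-z]{12}.*')

# A made-up article id, only to show the URL shape the spider follows
print(bool(pattern.match('https://www.jianshu.com/p/0af0d38090f5')))  # True
# Profile pages use /u/, so they are followed for links but not parsed as articles
print(bool(pattern.match('https://www.jianshu.com/u/abcdef123456')))  # False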
items.py:
Two item classes are defined: one is saved to a JSON file, the other to the database.
# -*- coding: utf-8 -*-
import scrapy
class JianshuItem(scrapy.Item):
    avatar = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    pub_time = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    content = scrapy.Field()
    read_count = scrapy.Field()
    like_count = scrapy.Field()
    word_count = scrapy.Field()
    comment_count = scrapy.Field()
    subjects = scrapy.Field()


class ShuPipeItem(scrapy.Item):
    avatar = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    pub_time = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    content = scrapy.Field()
    read_count = scrapy.Field()
    like_count = scrapy.Field()
    word_count = scrapy.Field()
    comment_count = scrapy.Field()
    subjects = scrapy.Field()
pipelines.py:
Two different ways of writing to the database are shown here: one synchronous, one asynchronous.
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
# Synchronous storage
class JianshuPipeline(object):
    def __init__(self):
        data = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': '*******',
            'database': 'jianshu',
            'charset': 'utf8mb4'
        }
        self.conn = pymysql.connect(**data)
        self.cursor = self.conn.cursor()
        if self.conn:
            print('=' * 30)
            print('connected to MySQL')
            print('=' * 30)
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (item['avatar'], item['title'], item['author'], item['pub_time'],
                                       item['origin_url'], item['article_id'], item['content'],
                                       item['read_count'], item['like_count'], item['word_count'],
                                       item['comment_count'], item['subjects']))
        self.conn.commit()
        return item

    @property
    def sql(self):
        # Build the INSERT statement once and cache it
        if not self._sql:
            self._sql = '''insert into article2(avatar, title, author, pub_time, origin_url, article_id,
                content, read_count, like_count, word_count, comment_count, subjects)
                values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
        return self._sql
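The article2 table has to exist before JianshuPipeline runs. The original post doesn't show the schema, so the following is only a sketch with assumed column types:

import pymysql

# Assumed schema for article2; adjust types and lengths to your data
DDL = '''
create table if not exists article2 (
    id int primary key auto_increment,
    avatar varchar(255),
    title varchar(255),
    author varchar(64),
    pub_time varchar(32),
    origin_url varchar(255),
    article_id varchar(32),
    content longtext,
    read_count int,
    like_count int,
    word_count int,
    comment_count int,
    subjects varchar(255)
) default charset=utf8mb4
'''

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='*******', database='jianshu', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute(DDL)
conn.commit()
conn.close()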
# Asynchronous storage
from twisted.enterprise import adbapi
from pymysql import cursors


class JianShuTwistedPipeline(object):
    # Uses a connection pool so inserts run asynchronously
    def __init__(self):
        data = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': '*******',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        # First argument is the DB-API module name (match your database driver);
        # the connection parameters must be passed along or the pool can't connect
        self.dbpool = adbapi.ConnectionPool('pymysql', **data)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:
            self._sql = '''
                insert into article(id, avatar, title, author,
                pub_time, origin_url, article_id, content) values(null, %s, %s, %s, %s, %s, %s, %s)
            '''
        return self._sql

    def process_item(self, item, spider):
        # Run the insert through the pool in a function; executing it
        # directly here would make the pipeline synchronous again
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)  # called if the insert fails
        return item

    def insert_item(self, cursor, item):
        # The actual insert, executed on a pool thread
        cursor.execute(self.sql,
                       (item['avatar'], item['title'], item['author'], item['pub_time'],
                        item['origin_url'], item['article_id'], item['content']))

    def handle_error(self, error, item, spider):
        # Just print the failure so it is visible in the console
        print('=' * 10 + 'error' + '=' * 10)
        print(error)
        print('=' * 10 + 'error' + '=' * 10)
# Don't forget to enable the pipeline in ITEM_PIPELINES in settings.py
from scrapy.exporters import JsonLinesItemExporter
import os

os.chdir('G:/爬取内容')


class ShuPipeline(object):
    def __init__(self):
        self.jianshu = open('jianshu.json', 'wb')
        self.jianshu_exporter = JsonLinesItemExporter(self.jianshu, ensure_ascii=False)

    def process_item(self, item, spider):
        self.jianshu_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.jianshu.close()
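JsonLinesItemExporter writes one JSON object per line, so the output file can be read back line by line:

import json

with open('jianshu.json', encoding='utf-8') as f:
    for line in f:
        article = json.loads(line)
        print(article['title'], article['read_count'])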
settings.py:
# -*- coding: utf-8 -*-
BOT_NAME = 'jianshu'
SPIDER_MODULES = ['jianshu.spiders']
NEWSPIDER_MODULE = 'jianshu.spiders'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 2
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
}
DOWNLOADER_MIDDLEWARES = {
    # 'jianshu.middlewares.JianshuDownloaderMiddleware': 543,
    'jianshu.middlewares.SeleniumDownLoadMiddleware': 543,
}
ITEM_PIPELINES = {
    'jianshu.pipelines.JianshuPipeline': 300,
    # 'jianshu.pipelines.JianShuTwistedPipeline': 300,
    # 'jianshu.pipelines.ShuPipeline': 300,
}
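The DOWNLOADER_MIDDLEWARES setting enables jianshu.middlewares.SeleniumDownLoadMiddleware, whose source isn't included in this post. Jianshu renders parts of each article page with JavaScript, which is presumably why a browser-backed download is used. A minimal sketch, assuming Chrome and the standard Scrapy response API:

# middlewares.py (sketch; the original file isn't shown)
from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumDownLoadMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        # Let the real browser fetch and render the page
        self.driver.get(request.url)
        source = self.driver.page_source
        # Returning a response here short-circuits Scrapy's own downloader,
        # so every request goes through the browser instead
        return HtmlResponse(url=self.driver.current_url, body=source,
                            request=request, encoding='utf-8')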