Scraping books from the China Memory Training Network (中国记忆力训练网)
The goal is to crawl every book on the site and save it to a .docx file organized by book title and chapter. The books are read chapter by chapter and the URL structure is highly regular, so the site is well suited to structured crawling with the Scrapy framework and its CrawlSpider.
scrapy startproject memoty                           # create the project
cd memoty                                            # switch into the project directory
scrapy genspider -t crawl spider book.jiyili.net     # generate a CrawlSpider named spider

This creates a crawl-template spider file spider.py with book.jiyili.net as the allowed domain and http://book.jiyili.net/ as the start URL. Next, open spider.py and work out how the site is structured.
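For reference, the resulting project layout looks roughly like this (file names may vary slightly between Scrapy versions; spider.py is the file created by the genspider step):

memoty/
    scrapy.cfg
    memoty/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            spider.py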
Looking at the site, the books fall into seven broad categories, and every category listing page has the form http://book.jiyili.net/modules/article/articlelist.php?class=... with only the value of class changing. Clicking a book leads to its introduction page, clicking 在线阅读 (read online) there leads to the book's table of contents, and clicking a chapter title opens that chapter's text. Getting from the home page to a chapter therefore takes four jumps, and the four kinds of URLs map onto four Rule entries (the literal . and ? in the URLs must be escaped in the regular expressions):

rules = (
    # Match the category listing pages (articlelist.php?class=...) and follow them
    Rule(LinkExtractor(allow=r'.+articlelist\.php\?class=[4-7]'), follow=True),
    # Follow links to the individual book pages (php?id=...)
    Rule(LinkExtractor(allow=r'.*php\?id=.+'), follow=True),
    # Follow links to a book's table of contents (index.html)
    Rule(LinkExtractor(allow=r'.+index\.html'), follow=True),
    # Chapter pages (<number>.html): parse with parse_item, do not follow further
    Rule(LinkExtractor(allow=r'\d+\.html'), callback="parse_item", follow=False),
)
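Since LinkExtractor applies each allow pattern with re.search against the full URL, the patterns can be sanity-checked offline before running the crawl. A minimal sketch, where only the category URL comes from the analysis above; the book, contents and chapter URLs are made-up examples that merely follow the described structure:

import re

# Pattern -> sample URL; all but the first URL are hypothetical illustrations
samples = {
    r'.+articlelist\.php\?class=[4-7]': 'http://book.jiyili.net/modules/article/articlelist.php?class=4',
    r'.*php\?id=.+': 'http://book.jiyili.net/modules/article/articleinfo.php?id=123',
    r'.+index\.html': 'http://book.jiyili.net/book/123/index.html',
    r'\d+\.html': 'http://book.jiyili.net/book/123/456.html',
}

for pattern, url in samples.items():
    # LinkExtractor's allow argument is applied with re.search on the URL
    print(pattern, '->', bool(re.search(pattern, url)))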
Parse the chapter content page:
def parse_item(self, response):
    # Book the chapter belongs to (last link in the breadcrumb)
    column = response.xpath(r'//div[@class="z"]/h1//a//text()').getall()[-1]
    # Chapter title
    title = "".join(response.xpath(r'//div[@class="novel_head"]/h1/text()').get())
    # Chapter body text
    content = response.xpath(r'//div[@class="novel_content"]//text()').getall()
    # Turn the special characters at the start of each paragraph into a space
    content = list(map(lambda x: re.sub('\r\n\xa0\xa0\xa0\xa0', ' ', x), content))
    # Turn the special characters at the end of each paragraph into a newline and join into one string
    content = "".join(list(map(lambda x: re.sub('\r\n', '\n', x), content)))
    item = MemotyItem(column=column, title=title, content=content)
    yield item
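To make the two re.sub cleanup steps concrete, here is a small standalone sketch. The sample strings are made up, assuming each raw paragraph starts with '\r\n' plus four non-breaking spaces and ends with '\r\n', which is what the code above expects:

import re

# Made-up fragments like those returned by the novel_content XPath
paragraphs = ['\r\n\xa0\xa0\xa0\xa0第一段内容\r\n', '\r\n\xa0\xa0\xa0\xa0第二段内容\r\n']

cleaned = [re.sub('\r\n\xa0\xa0\xa0\xa0', ' ', p) for p in paragraphs]   # leading markers -> space
text = "".join(re.sub('\r\n', '\n', p) for p in cleaned)                 # remaining '\r\n' -> newline

print(repr(text))   # ' 第一段内容\n 第二段内容\n'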
Next comes the storage code in pipelines.py. Without overriding Scrapy's FilesPipeline, the only way to save each chapter under the name of its book is to record the book title together with the chapter while crawling, so that every item carries it; that is exactly what the column field from the parsing step is for. Creating a separate folder for each book relies entirely on column:

class MemotyPipeline(object):
    def __init__(self):
        # Get the project directory and create a folder there to hold the output
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), '记忆力')
        if not os.path.exists(self.path):
            os.mkdir(self.path)

    def process_item(self, item, spider):
        # Create a folder for the book if it does not exist yet, then save the chapter under it
        body = item['column']
        body_path = os.path.join(self.path, body)
        if not os.path.exists(body_path):
            os.mkdir(body_path)
        os.chdir(body_path)
        with open('{}.docx'.format(item['title']), 'a', encoding='utf8') as fp:
            fp.write(item['title'])      # chapter title
            fp.write('\n')
            fp.write(item['content'])    # chapter content
        return item
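One design note: os.chdir switches the working directory for the entire Scrapy process, so a common variant is to build the full file path instead of changing directories. A minimal sketch of that alternative, with save_chapter as a hypothetical helper that is not part of the original pipeline:

import os

def save_chapter(base_path, item):
    # Same file handling as process_item above, but without os.chdir
    book_dir = os.path.join(base_path, item['column'])
    os.makedirs(book_dir, exist_ok=True)            # create the book folder if needed
    file_path = os.path.join(book_dir, '{}.docx'.format(item['title']))
    # Note: open() writes plain text; the .docx extension follows the original code
    with open(file_path, 'a', encoding='utf8') as fp:
        fp.write(item['title'])
        fp.write('\n')
        fp.write(item['content'])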
- Finally, update the configuration in settings.py by uncommenting (and adjusting) the following entries:
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
}
ITEM_PIPELINES = {
    'memoty.pipelines.MemotyPipeline': 300,
}
Complete code:
# spider.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
from memoty.items import MemotyItem
class MemorySpider(CrawlSpider):
    name = 'memory'
    allowed_domains = ['book.jiyili.net']
    start_urls = ['http://book.jiyili.net/']

    rules = (
        Rule(LinkExtractor(allow=r'.+articlelist\.php\?class=[4-7]'), follow=True),
        Rule(LinkExtractor(allow=r'.*php\?id=.+'), follow=True),
        Rule(LinkExtractor(allow=r'.+index\.html'), follow=True),
        Rule(LinkExtractor(allow=r'\d+\.html'), callback="parse_item", follow=False),
    )

    def parse_item(self, response):
        column = response.xpath(r'//div[@class="z"]/h1//a//text()').getall()[-1]
        title = "".join(response.xpath(r'//div[@class="novel_head"]/h1/text()').get())
        content = response.xpath(r'//div[@class="novel_content"]//text()').getall()
        content = list(map(lambda x: re.sub('\r\n\xa0\xa0\xa0\xa0', ' ', x), content))
        content = "".join(list(map(lambda x: re.sub('\r\n', '\n', x), content)))
        item = MemotyItem(column=column, title=title, content=content)
        yield item
# items.py
import scrapy
class MemotyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    column = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
# pipelines.py
import os
class MemotyPipeline(object):
    def __init__(self):
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), '记忆力')
        if not os.path.exists(self.path):
            os.mkdir(self.path)

    def process_item(self, item, spider):
        body = item['column']
        body_path = os.path.join(self.path, body)
        if not os.path.exists(body_path):
            os.mkdir(body_path)
        os.chdir(body_path)
        with open('{}.docx'.format(item['title']), 'a', encoding='utf8') as fp:
            fp.write(item['title'])
            fp.write('\n')
            fp.write(item['content'])
        return item
# middlewares.py does not need to be modified for this crawl.
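With everything in place, the crawl can be started from the project directory (the spider name 'memory' comes from spider.py above):

scrapy crawl memory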