Scraping Books from 中国记忆力训练网 (China Memory Training Network, book.jiyili.net)

The goal is to crawl every book on the site and save it to .docx files organized by book title and chapter. Because each book is read chapter by chapter online and the URL layout is very regular, the site is a good fit for structured crawling with Scrapy's CrawlSpider:

• scrapy startproject memoty creates the project (the package name memoty matches the imports used in the code below).
• cd memoty switches into the project directory.
• scrapy genspider -t crawl spider "book.jiyili.net" generates a CrawlSpider file named spider.py; the start URL is then set to http://book.jiyili.net/. Next, open spider.py and work out how the site is organized.
• The site has seven top-level book categories, and their listing pages all take the form http://book.jiyili.net/modules/article/articlelist.php?class= with only the value of class changing. Clicking a book leads to its introduction page, clicking "在线阅读" (read online) there leads to the book's table of contents, and clicking a chapter title opens that chapter's text. Getting from the home page to a chapter therefore takes four jumps, and the four different URL patterns translate into four Rule entries:

rules = (
    # match the book category listing URLs (only the class value differs) and follow them
    Rule(LinkExtractor(allow=r'.+articlelist\.php\?class=[4-7]'), follow=True),
    # follow through to each book's introduction page, identified by its id
    Rule(LinkExtractor(allow=r'.*php\?id=.+'), follow=True),
    # follow through to the book's table of contents
    Rule(LinkExtractor(allow=r'.+index\.html'), follow=True),
    # chapter reading pages: hand off to parse_item, do not follow further
    Rule(LinkExtractor(allow=r'\d+\.html'), callback="parse_item", follow=False),
)
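Before running the crawl, the four allow patterns can be sanity-checked with plain re; LinkExtractor applies them to each extracted URL with search semantics. Only the articlelist.php?class= form comes from the site analysis above; the other sample URLs in this sketch are invented for illustration:

import re

samples = {
    r'.+articlelist\.php\?class=[4-7]': 'http://book.jiyili.net/modules/article/articlelist.php?class=4',
    r'.*php\?id=.+': 'http://book.jiyili.net/modules/article/readbook.php?id=123',    # hypothetical URL
    r'.+index\.html': 'http://book.jiyili.net/files/article/html/4/4123/index.html',  # hypothetical URL
    r'\d+\.html': 'http://book.jiyili.net/files/article/html/4/4123/456789.html',     # hypothetical URL
}

for pattern, url in samples.items():
    # every pair should print True; a False here means the Rule would never fire
    print(pattern, '->', bool(re.search(pattern, url)))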

Parsing the chapter content page:

def parse_item(self, response):
    # book the chapter belongs to (the last link in the breadcrumb is the book title)
    column = response.xpath(r'//div[@class="z"]/h1//a//text()').getall()[-1]
    # chapter title
    title = response.xpath(r'//div[@class="novel_head"]/h1/text()').get()
    # chapter body text nodes
    content = response.xpath(r'//div[@class="novel_content"]//text()').getall()
    # turn the special characters at the start of each paragraph into a plain-space indent
    content = list(map(lambda x: re.sub('\r\n\xa0\xa0\xa0\xa0', '      ', x), content))
    # turn the special characters at the end of each paragraph into a newline, then join into one string
    content = "".join(list(map(lambda x: re.sub('\r\n', '\n', x), content)))
    item = MemotyItem(column=column, title=title, content=content)
    yield item
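The two pieces of the extraction that are easiest to get wrong are the breadcrumb index and the whitespace cleanup. The short sketch below replays both on invented sample data (the breadcrumb HTML and the raw text nodes are made up for illustration; only the XPath and the re.sub patterns come from the code above):

import re
from parsel import Selector  # the selector library Scrapy uses internally

# invented breadcrumb HTML that only mirrors the structure targeted by the XPath above
breadcrumb = '<div class="z"><h1><a href="/">首页</a> &gt; <a href="#">分类</a> &gt; <a href="#">某本书</a></h1></div>'
links = Selector(text=breadcrumb).xpath('//div[@class="z"]/h1//a//text()').getall()
print(links[-1])  # 某本书 -- the last breadcrumb link is the book title, hence getall()[-1]

# invented raw text nodes as they might come back from the novel_content div
raw_nodes = ['\r\n\xa0\xa0\xa0\xa0第一段', '\r\n\xa0\xa0\xa0\xa0第二段\r\n']
# pass 1: the leading "\r\n" plus non-breaking spaces become a plain-space indent
cleaned = [re.sub('\r\n\xa0\xa0\xa0\xa0', '    ', x) for x in raw_nodes]
# pass 2: remaining "\r\n" become "\n", then everything is joined into one string
text = "".join(re.sub('\r\n', '\n', x) for x in cleaned)
print(repr(text))  # '    第一段    第二段\n'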
  • Next comes storage in pipelines.py. Without overriding FilesPipeline, the only way to store chapter content under the right book is to record, while crawling each chapter, the name of the book it belongs to, and to make sure every item carries it. That is exactly what the column field from the parsing step is for: a separate folder is created for each book based on column, as the pipeline below shows.

class MemotyPipeline(object):
    def __init__(self):
        # build a "记忆力" output folder one level above this file (the project root)
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), '记忆力')
        if not os.path.exists(self.path):
            os.mkdir(self.path)

    def process_item(self, item, spider):
        # create a folder for the book if it does not exist yet, then store the chapter inside it
        body = item['column']
        body_path = os.path.join(self.path, body)
        if not os.path.exists(body_path):
            os.mkdir(body_path)
        os.chdir(body_path)
        with open('{}.docx'.format(item['title']), 'a', encoding='utf8') as fp:
            fp.write(item['title'])    # chapter title
            fp.write('\n')
            fp.write(item['content'])  # chapter body
        return item
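Note that the files written above are plain text that merely carry a .docx extension. If real Word documents are wanted, the write step could be swapped for the python-docx package; a minimal sketch under that assumption (write_chapter is a hypothetical helper, not part of the original pipeline):

import os
from docx import Document  # requires: pip install python-docx

def write_chapter(body_path, title, content):
    # hypothetical helper: one genuine .docx per chapter, a heading plus the body paragraph
    doc = Document()
    doc.add_heading(title, level=1)
    doc.add_paragraph(content)
    doc.save(os.path.join(body_path, '{}.docx'.format(title)))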
    
  • Finally, all that is left is to adjust settings.py: uncomment and set the following options.

ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
}

ITEM_PIPELINES = {
    'memoty.pipelines.MemotyPipeline': 300,
}

Complete code:

# spider.py
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from memoty.items import MemotyItem


class MemorySpider(CrawlSpider):
    name = 'memory'
    allowed_domains = ['book.jiyili.net']
    start_urls = ['http://book.jiyili.net/']

    rules = (
        # category listing pages, follow
        Rule(LinkExtractor(allow=r'.+articlelist\.php\?class=[4-7]'), follow=True),
        # book introduction pages (identified by id), follow
        Rule(LinkExtractor(allow=r'.*php\?id=.+'), follow=True),
        # book table-of-contents pages, follow
        Rule(LinkExtractor(allow=r'.+index\.html'), follow=True),
        # chapter reading pages: parse, do not follow
        Rule(LinkExtractor(allow=r'\d+\.html'), callback="parse_item", follow=False),
    )

    def parse_item(self, response):
        # book title (last link in the breadcrumb)
        column = response.xpath(r'//div[@class="z"]/h1//a//text()').getall()[-1]
        # chapter title
        title = response.xpath(r'//div[@class="novel_head"]/h1/text()').get()
        # chapter body text nodes
        content = response.xpath(r'//div[@class="novel_content"]//text()').getall()
        # turn the paragraph-leading special characters into a plain-space indent
        content = list(map(lambda x: re.sub('\r\n\xa0\xa0\xa0\xa0', '    ', x), content))
        # turn the remaining line endings into "\n" and join everything into one string
        content = "".join(list(map(lambda x: re.sub('\r\n', '\n', x), content)))
        item = MemotyItem(column=column, title=title, content=content)
        yield item

# items.py
import scrapy
class MemotyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    column = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()

# pipelines.py
import os


class MemotyPipeline(object):
    def __init__(self):
        # create the "记忆力" output folder one level above this file (the project root)
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), '记忆力')
        if not os.path.exists(self.path):
            os.mkdir(self.path)

    def process_item(self, item, spider):
        # one sub-folder per book, named after item['column']
        body = item['column']
        body_path = os.path.join(self.path, body)
        if not os.path.exists(body_path):
            os.mkdir(body_path)
        os.chdir(body_path)
        with open('{}.docx'.format(item['title']), 'a', encoding='utf8') as fp:
            fp.write(item['title'])    # chapter title
            fp.write('\n')
            fp.write(item['content'])  # chapter body
        return item

# middlewares.py needs no changes for this crawl to work.
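Finally, instead of typing the crawl command in a terminal each time, the spider can be started from a small helper script in the project root; a minimal sketch (start.py is just a suggested name; 'memory' is the spider's name attribute from spider.py):

# start.py -- convenience runner, assumed to live in the project root
from scrapy import cmdline

cmdline.execute("scrapy crawl memory".split())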