Scraping Books from 中国记忆力训练网 (China Memory Training Network, book.jiyili.net)

The goal is to crawl every book on the site and save it to .docx files organized by book title and chapter. Because each book is read chapter by chapter online and the URL layout is very regular, the site is a good fit for structured crawling with Scrapy's CrawlSpider:

• scrapy startproject memoty creates the project (the package name memoty matches the imports used in the code below).
• cd memoty switches into the project directory.
• scrapy genspider -t crawl spider "book.jiyili.net" generates a CrawlSpider file named spider.py; the start URL is then set to http://book.jiyili.net/. Next, open spider.py and work out how the site is organized.
• The site has seven top-level book categories, and their listing pages all take the form http://book.jiyili.net/modules/article/articlelist.php?class= with only the value of class changing. Clicking a book leads to its introduction page, clicking "在线阅读" (read online) there leads to the book's table of contents, and clicking a chapter title opens that chapter's text. Getting from the home page to a chapter therefore takes four jumps, and the four different URL patterns translate into four Rule entries:

rules = (
    # match the book category listing URLs (only the class value differs) and follow them
    Rule(LinkExtractor(allow=r'.+articlelist\.php\?class=[4-7]'), follow=True),
    # follow through to each book's introduction page, identified by its id
    Rule(LinkExtractor(allow=r'.*php\?id=.+'), follow=True),
    # follow through to the book's table of contents
    Rule(LinkExtractor(allow=r'.+index\.html'), follow=True),
    # chapter reading pages: hand off to parse_item, do not follow further
    Rule(LinkExtractor(allow=r'\d+\.html'), callback="parse_item", follow=False),
)
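Before running the crawl, the four allow patterns can be sanity-checked with plain re; LinkExtractor applies them to each extracted URL with search semantics. Only the articlelist.php?class= form comes from the site analysis above; the other sample URLs in this sketch are invented for illustration:

import re

samples = {
    r'.+articlelist\.php\?class=[4-7]': 'http://book.jiyili.net/modules/article/articlelist.php?class=4',
    r'.*php\?id=.+': 'http://book.jiyili.net/modules/article/readbook.php?id=123',    # hypothetical URL
    r'.+index\.html': 'http://book.jiyili.net/files/article/html/4/4123/index.html',  # hypothetical URL
    r'\d+\.html': 'http://book.jiyili.net/files/article/html/4/4123/456789.html',     # hypothetical URL
}

for pattern, url in samples.items():
    # every pair should print True; a False here means the Rule would never fire
    print(pattern, '->', bool(re.search(pattern, url)))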

Parsing the chapter content page:

def parse_item(self, response):
    # book the chapter belongs to (the last link in the breadcrumb is the book title)
    column = response.xpath(r'//div[@class="z"]/h1//a//text()').getall()[-1]
    # chapter title
    title = response.xpath(r'//div[@class="novel_head"]/h1/text()').get()
    # chapter body text nodes
    content = response.xpath(r'//div[@class="novel_content"]//text()').getall()
    # turn the special characters at the start of each paragraph into a plain-space indent
    content = list(map(lambda x: re.sub('\r\n\xa0\xa0\xa0\xa0', '      ', x), content))
    # turn the special characters at the end of each paragraph into a newline, then join into one string
    content = "".join(list(map(lambda x: re.sub('\r\n', '\n', x), content)))
    item = MemotyItem(column=column, title=title, content=content)
    yield item
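The two pieces of the extraction that are easiest to get wrong are the breadcrumb index and the whitespace cleanup. The short sketch below replays both on invented sample data (the breadcrumb HTML and the raw text nodes are made up for illustration; only the XPath and the re.sub patterns come from the code above):

import re
from parsel import Selector  # the selector library Scrapy uses internally

# invented breadcrumb HTML that only mirrors the structure targeted by the XPath above
breadcrumb = '<div class="z"><h1><a href="/">首页</a> &gt; <a href="#">分类</a> &gt; <a href="#">某本书</a></h1></div>'
links = Selector(text=breadcrumb).xpath('//div[@class="z"]/h1//a//text()').getall()
print(links[-1])  # 某本书 -- the last breadcrumb link is the book title, hence getall()[-1]

# invented raw text nodes as they might come back from the novel_content div
raw_nodes = ['\r\n\xa0\xa0\xa0\xa0第一段', '\r\n\xa0\xa0\xa0\xa0第二段\r\n']
# pass 1: the leading "\r\n" plus non-breaking spaces become a plain-space indent
cleaned = [re.sub('\r\n\xa0\xa0\xa0\xa0', '    ', x) for x in raw_nodes]
# pass 2: remaining "\r\n" become "\n", then everything is joined into one string
text = "".join(re.sub('\r\n', '\n', x) for x in cleaned)
print(repr(text))  # '    第一段    第二段\n'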
  • Next comes storage in pipelines.py. Without overriding FilesPipeline, the only way to store chapter content under the right book is to record, while crawling each chapter, the name of the book it belongs to, and to make sure every item carries it. That is exactly what the column field from the parsing step is for: a separate folder is created for each book based on column, as the pipeline below shows.

class MemotyPipeline(object):
    def __init__(self):
        # build a "记忆力" output folder one level above this file (the project root)
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), '记忆力')
        if not os.path.exists(self.path):
            os.mkdir(self.path)

    def process_item(self, item, spider):
        # create a folder for the book if it does not exist yet, then store the chapter inside it
        body = item['column']
        body_path = os.path.join(self.path, body)
        if not os.path.exists(body_path):
            os.mkdir(body_path)
        os.chdir(body_path)
        with open('{}.docx'.format(item['title']), 'a', encoding='utf8') as fp:
            fp.write(item['title'])    # chapter title
            fp.write('\n')
            fp.write(item['content'])  # chapter body
        return item
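Note that the files written above are plain text that merely carry a .docx extension. If real Word documents are wanted, the write step could be swapped for the python-docx package; a minimal sketch under that assumption (write_chapter is a hypothetical helper, not part of the original pipeline):

import os
from docx import Document  # requires: pip install python-docx

def write_chapter(body_path, title, content):
    # hypothetical helper: one genuine .docx per chapter, a heading plus the body paragraph
    doc = Document()
    doc.add_heading(title, level=1)
    doc.add_paragraph(content)
    doc.save(os.path.join(body_path, '{}.docx'.format(title)))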
    
  • Finally, all that is left is to adjust settings.py: uncomment and set the following options.

ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
}

ITEM_PIPELINES = {
    'memoty.pipelines.MemotyPipeline': 300,
}

Complete code:

# spider.py
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from memoty.items import MemotyItem


class MemorySpider(CrawlSpider):
    name = 'memory'
    allowed_domains = ['book.jiyili.net']
    start_urls = ['http://book.jiyili.net/']

    rules = (
        # category listing pages, follow
        Rule(LinkExtractor(allow=r'.+articlelist\.php\?class=[4-7]'), follow=True),
        # book introduction pages (identified by id), follow
        Rule(LinkExtractor(allow=r'.*php\?id=.+'), follow=True),
        # book table-of-contents pages, follow
        Rule(LinkExtractor(allow=r'.+index\.html'), follow=True),
        # chapter reading pages: parse, do not follow
        Rule(LinkExtractor(allow=r'\d+\.html'), callback="parse_item", follow=False),
    )

    def parse_item(self, response):
        # book title (last link in the breadcrumb)
        column = response.xpath(r'//div[@class="z"]/h1//a//text()').getall()[-1]
        # chapter title
        title = response.xpath(r'//div[@class="novel_head"]/h1/text()').get()
        # chapter body text nodes
        content = response.xpath(r'//div[@class="novel_content"]//text()').getall()
        # turn the paragraph-leading special characters into a plain-space indent
        content = list(map(lambda x: re.sub('\r\n\xa0\xa0\xa0\xa0', '    ', x), content))
        # turn the remaining line endings into "\n" and join everything into one string
        content = "".join(list(map(lambda x: re.sub('\r\n', '\n', x), content)))
        item = MemotyItem(column=column, title=title, content=content)
        yield item

# items.py
import scrapy
class MemotyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    column = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()

# pipelines.py
import os


class MemotyPipeline(object):
    def __init__(self):
        # create the "记忆力" output folder one level above this file (the project root)
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), '记忆力')
        if not os.path.exists(self.path):
            os.mkdir(self.path)

    def process_item(self, item, spider):
        # one sub-folder per book, named after item['column']
        body = item['column']
        body_path = os.path.join(self.path, body)
        if not os.path.exists(body_path):
            os.mkdir(body_path)
        os.chdir(body_path)
        with open('{}.docx'.format(item['title']), 'a', encoding='utf8') as fp:
            fp.write(item['title'])    # chapter title
            fp.write('\n')
            fp.write(item['content'])  # chapter body
        return item

# middlewares.py needs no changes for this crawl to work.
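Finally, instead of typing the crawl command in a terminal each time, the spider can be started from a small helper script in the project root; a minimal sketch (start.py is just a suggested name; 'memory' is the spider's name attribute from spider.py):

# start.py -- convenience runner, assumed to live in the project root
from scrapy import cmdline

cmdline.execute("scrapy crawl memory".split())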