微信小程序社区文章爬取

搜房网信息爬取

scrapy startproject wxapp 建立项目
cd wxapp

scrapy genspider -t crawl wxapp_spider "wxapp-union.com" 项目创建完成，接下来写启动文件 start.py：

# start.py: launch the crawl from an IDE instead of typing the command in a shell.
from scrapy import cmdline

# The argument after "crawl" must equal the spider's `name` attribute
# ('wxapp_spider'); any other value makes Scrapy abort with "Spider not found".
cmdline.execute("scrapy crawl wxapp_spider".split())

爬虫文件如下所示：

items.py文件

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy
class WxappItem(scrapy.Item):
    """Fields collected for one scraped article."""

    title = scrapy.Field()     # article headline
    author = scrapy.Field()    # author name taken from the byline
    pub_time = scrapy.Field()  # publication time text taken from the byline
    article = scrapy.Field()   # full article text joined into one string

pipelines.py文件

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter #不写入内存，读一行存一行

class WxappPipeline(object):
    """Item pipeline that streams every scraped item to a JSON Lines file.

    The output file is opened when the pipeline is constructed and closed
    when the spider finishes.
    """

    def __init__(self):
        # Raw string: the Windows path contains backslashes that must not be
        # interpreted as escape sequences (the original relied on invalid
        # escapes being passed through, which newer Pythons warn about).
        self.fp = open(r'G:\爬取内容\wxjc.json', 'wb')
        # ensure_ascii=False keeps non-ASCII text human-readable in the
        # output instead of escaping it.
        self.exporter = JsonLinesItemExporter(
            self.fp, ensure_ascii=False, encoding='utf8'
        )
        # The exporter contract is start_exporting -> export_item* ->
        # finish_exporting.
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        """Write ``item`` to the output file and return it unchanged."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the output file."""
        try:
            self.exporter.finish_exporting()
        finally:
            # Close the file even if finishing the export fails.
            self.fp.close()

wxapp_spider.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from wxapp.items import WxappItem

class WxappSpiderSpider(CrawlSpider):
    """Crawl the article listing pages and scrape every linked article page."""

    name = 'wxapp_spider'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    # First rule: keep following listing (pagination) links, no callback.
    # Second rule: hand article pages to parse_detail and do not follow
    # links found on them.
    rules = (
        Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True),
        Rule(
            LinkExtractor(allow=r'.+?article.*?\.html'),
            callback="parse_detail",
            follow=False,
        ),
    )

    def parse_detail(self, response):
        """Yield one WxappItem built from an article detail page.

        ``.get()`` returns the first match as a string (or None when nothing
        matches); ``.getall()`` returns every match as a list.
        """
        byline = response.xpath('//p[@class="authors"]')
        text_nodes = response.xpath('//td[@id="article_content"]//text()').getall()
        yield WxappItem(
            title=response.xpath('//h1[@class="ph"]/text()').get(),
            author=byline.xpath('.//a/text()').get(),
            pub_time=byline.xpath('.//span/text()').get(),
            # Collapse all text nodes of the article body into one string.
            article="".join(text_nodes),
        )

微信小程序社区文章爬取

搜房网信息爬取

items.py文件

pipelines.py文件

wxapp_spider.py

Ferry