wxapp-union.com 文章信息爬取

  1. scrapy startproject wxapp建立项目
  2. cd wxapp
  3. scrapy genspider -t crawl wxapp_spider "wxapp-union.com" 项目创建完成,接下来写启动文件start.py

    from scrapy import cmdline

    # Run the crawl from Python so it can be started from an IDE.
    # The spider name must equal the `name` attribute of the spider class
    # ('wxapp_spider'); the previous value 'sfw' does not exist in this project.
    cmdline.execute(["scrapy", "crawl", "wxapp_spider"])
    

    爬虫文件如下所示:

    items.py文件

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    class WxappItem(scrapy.Item):
        """Container for one scraped article page.

        The field names match the keyword arguments the spider passes when it
        builds the item in ``parse_detail``.
        """

        title = scrapy.Field()     # text of the page's <h1 class="ph"> heading
        author = scrapy.Field()    # link text inside the "authors" paragraph
        pub_time = scrapy.Field()  # span text inside the "authors" paragraph
        article = scrapy.Field()   # concatenated text of the article body cell
    

pipelines.py文件

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter #不写入内存,读一行存一行

class WxappPipeline(object):
    """Item pipeline that appends every scraped item to a JSON Lines file.

    Each item is serialized as soon as it arrives, so the full result set is
    never held in memory.
    """

    def __init__(self, output_path=r'G:\爬取内容\wxjc.json'):
        """Open the output file and attach a JSON Lines exporter to it.

        Args:
            output_path: Destination file. The default is the same location
                as before, written as a raw string so the backslashes are
                not interpreted as (invalid) escape sequences.
        """
        # Binary mode: the exporter encodes the text itself.
        self.fp = open(output_path, 'wb')
        # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX.
        self.exporter = JsonLinesItemExporter(
            self.fp, ensure_ascii=False, encoding='utf8'
        )
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        """Write ``item`` as one line and return it unchanged."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the file when the spider closes."""
        try:
            self.exporter.finish_exporting()
        finally:
            # Always release the file handle, even if finalizing fails.
            self.fp.close()

wxapp_spider.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from wxapp.items import WxappItem

class WxappSpiderSpider(CrawlSpider):
    """Crawl the catid=2 listing pages of wxapp-union.com and scrape articles."""

    name = 'wxapp_spider'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        # Listing pages: keep following pagination links; nothing is parsed.
        Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True),
        # Article pages: hand off to parse_detail and do not follow further.
        Rule(
            LinkExtractor(allow=r'.+?article.*?\.html'),
            callback="parse_detail",
            follow=False,
        ),
    )

    def parse_detail(self, response):
        """Yield one WxappItem built from an article page.

        ``get()`` returns the first match as a string (or None when nothing
        matches); ``getall()`` returns every match as a list.
        """
        byline = response.xpath('//p[@class="authors"]')
        body_parts = response.xpath('//td[@id="article_content"]//text()').getall()
        yield WxappItem(
            title=response.xpath('//h1[@class="ph"]/text()').get(),
            author=byline.xpath('.//a/text()').get(),
            pub_time=byline.xpath('.//span/text()').get(),
            article="".join(body_parts),
        )