Scraping Articles from the WeChat Mini Program Community (wxapp-union.com)
Create the project and change into its directory:

scrapy startproject wxapp
cd wxapp

Then generate a spider from the CrawlSpider template (genspider takes the spider name followed by the domain):

scrapy genspider -t crawl wxapp_spider "wxapp-union.com"
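For reference, after these two commands the generated layout looks roughly like this (exact files may vary slightly across Scrapy versions):

wxapp/
    scrapy.cfg
    wxapp/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            wxapp_spider.py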
With the project created, next write a launcher file start.py at the project root, so the spider can be run from an IDE; running it is equivalent to running scrapy crawl wxapp_spider on the command line:

from scrapy import cmdline

cmdline.execute("scrapy crawl wxapp_spider".split())
The project's files are shown below.

items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class WxappItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    pub_time = scrapy.Field()
    article = scrapy.Field()
pipelines.py:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter  # exports each item as it arrives instead of buffering everything in memory


class WxappPipeline(object):
    def __init__(self):
        # raw string so the backslashes in the Windows path are not treated as escapes
        self.fp = open(r'G:\爬取内容\wxjc.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf8')  # keeps the output human-readable

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
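As the template comment above says, the pipeline only runs once it is enabled in settings.py. A minimal sketch; the priority 300 is a conventional choice, and the ROBOTSTXT_OBEY / header values are assumptions (not from the original post) that you may want to adjust:

# settings.py
ITEM_PIPELINES = {
    'wxapp.pipelines.WxappPipeline': 300,
}

# commonly set for tutorial targets; assumptions, adjust as needed
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',  # example UA string
}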
wxapp_spider.py:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from wxapp.items import WxappItem


class WxappSpiderSpider(CrawlSpider):
    name = 'wxapp_spider'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        # follow the paginated list pages, but don't parse them
        Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True),
        # parse each article detail page; don't follow links from it
        Rule(LinkExtractor(allow=r'.+?article.*?\.html'), callback='parse_detail', follow=False),
    )
    def parse_detail(self, response):
        title = response.xpath('//h1[@class="ph"]/text()').get()  # get() returns a str, same as extract_first()
        author_p = response.xpath('//p[@class="authors"]')
        author = author_p.xpath('.//a/text()').get()
        pub_time = author_p.xpath('.//span/text()').get()
        article = response.xpath('//td[@id="article_content"]//text()').getall()  # getall() returns a list, same as extract()
        article = "".join(article)
        item = WxappItem(title=title, author=author, pub_time=pub_time, article=article)
        yield item
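Running start.py should then append one JSON object per article to wxjc.json, one per line, courtesy of JsonLinesItemExporter. The line below only illustrates the shape of the output, not real scraped data:

{"title": "...", "author": "...", "pub_time": "...", "article": "..."}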