Scraping Property Listings from Fang.com (SouFun)

  1. scrapy startproject fang (create the project)
  2. cd fang
  3. scrapy genspider sfw "https://www1.fang.com/" (generate the spider). With the project created, write the launcher script start.py:

    from scrapy import cmdline

    # equivalent to running `scrapy crawl sfw` on the command line
    cmdline.execute("scrapy crawl sfw".split())
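
    With this launcher in place, the crawl can be started (and debugged) straight from an IDE by running start.py, instead of typing scrapy crawl sfw in a terminal each time.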
    

    The spider file is shown below:

    sfw.py

    # -*- coding: utf-8 -*-
    import scrapy
    from fang.items import newFangItem, esFangItem
    import re

    class SfwSpider(scrapy.Spider):
        name = 'sfw'
        allowed_domains = ['fang.com']
        start_urls = ['https://www.fang.com/SoufunFamily.htm']

        def parse(self, response):
            trs = response.xpath(r'//table[@class="table01"]//tr')
            province = None
            for tr in trs:
                # the province cell is only filled on the first row of each province,
                # so remember the last non-empty value
                province_ready = tr.xpath(r'./td[not(@class)][1]//text()').get()
                province_ready = re.sub(r'\s', '', province_ready or '')
                if province_ready:
                    province = province_ready
                if province == '其它':   # skip the "other" (overseas) section
                    continue
                cities = tr.xpath(r'./td[not(@class)][2]/a/text()').getall()
                cities_url = tr.xpath(r'./td[not(@class)][2]/a/@href').getall()

                for city, city_url in zip(cities, cities_url):
                    if 'bj.' in city_url:
                        # Beijing does not follow the city-subdomain scheme
                        new_url = 'https://newhouse.fang.com/house/s/'
                        esf_url = 'https://esf.fang.com/'
                    else:
                        # e.g. https://cq.fang.com/ -> https://cq.newhouse.fang.com/house/s/
                        new_url = ".newhouse.".join(city_url.split('.', 1)) + 'house/s/'
                        # e.g. https://cq.fang.com/ -> https://cq.esf.fang.com/
                        esf_url = ".esf.".join(city_url.split('.', 1))
                    # pass the province and city along to the callbacks via meta;
                    # uncomment the first request to crawl new houses as well
                    # yield scrapy.Request(url=new_url, callback=self.parse_newhouse, meta={"info": (province, city)})
                    yield scrapy.Request(url=esf_url, callback=self.parse_esf, meta={"info": (province, city)})
        def parse_newhouse(self, response):
            province, city = response.meta.get('info')   # unpack the info passed via meta
            ls = response.xpath(r'//div[@id="newhouse_loupai_list"]/ul//li')
            for l in ls:
                name = l.xpath(r'.//div[@class="nlcd_name"]/a/text()').get()
                if not name:
                    continue
                name = name.strip()

                house_type = l.xpath(r'.//div[@class="house_type clearfix"]/a/text()').getall()
                rooms = "".join(re.sub(r'\s', '', x) for x in house_type)

                area = l.xpath(r'.//div[@class="house_type clearfix"]/text()').getall()
                area = "".join(re.sub(r'\s|/|-', '', x) for x in area)

                # note the leading dots: without them these XPaths would search the
                # whole document and return the same node for every listing
                sale = l.xpath(r'.//span[@class="inSale"]//text()').get()
                district = address = None
                addresses = l.xpath(r'.//div[@class="address"]/a/@title').get()
                if addresses:
                    district_ready = re.match(r'\[(.+?)\](.+)', addresses)
                    if district_ready:
                        district = district_ready.group(1)
                        address = district_ready.group(2)
                price = l.xpath(r'.//div[@class="nhouse_price"]/span/text()').get() or ''
                unit = l.xpath(r'.//div[@class="nhouse_price"]/em/text()').get() or ''
                # str.replace returns a new string, so reassign the result
                price = (price + unit).replace('广告', '')
                origin_url = l.xpath(r'.//div[@class="nlcd_name"]/a/@href').get()
                item = newFangItem(province=province, city=city, name=name, price=price,
                                   rooms=rooms, area=area, address=address,
                                   district=district, sale=sale, origin_url=origin_url)
                yield item
            # paginate: grab the href, not the serialized <a> element
            next_href = response.xpath(r'//div[@class="page"]//a[@class="next"]/@href').get()
            if next_href:
                yield scrapy.Request(url=response.urljoin(next_href), callback=self.parse_newhouse,
                                     meta={'info': (province, city)})
    
    
        def parse_esf(self, response):
            province, city = response.meta.get('info')   # unpack the info passed via meta
            ls = response.xpath(r'//div[@class="shop_list shop_list_4"]/dl')
            for l in ls:
                name = l.xpath(r'.//dd[not(@class)]/p[@class="add_shop"]/a/@title').get()
                address = l.xpath(r'.//dd[not(@class)]/p[@class="add_shop"]/span/text()').get()
                style = l.xpath(r'.//dd[not(@class)]/p[@class="tel_shop"]/text()').getall()
                style = [re.sub(r' |\r\n', '', x) for x in style]
                # the tel_shop text nodes are positional: rooms | area | floor | facing | year;
                # trailing fields may be missing, so default everything to None first
                rooms = area = height = direction = year = None
                if len(style) > 0:
                    rooms = style[0]
                if len(style) > 1:
                    area = style[1]
                if len(style) > 2:
                    height = style[2]
                if len(style) > 3:
                    direction = style[3]
                if len(style) > 4:
                    year = style[4]
                price = "".join(l.xpath(r'.//dd[@class="price_right"]/span[@class="red"]//text()').getall())
                unit_price = l.xpath(r'.//dd[@class="price_right"]/span[not(@class)]/text()').get()
                detail_href = l.xpath(r'./dt/a/@href').get()
                origin_url = response.urljoin(detail_href) if detail_href else None
                item = esFangItem(province=province, city=city, name=name, address=address,
                                  rooms=rooms, area=area, height=height, direction=direction,
                                  year=year, price=price, unit_price=unit_price, origin_url=origin_url)
                yield item
            # follow the "next page" link
            next_href = response.xpath(r'//div[@class="page_al"]/p[1]/a/@href').get()
            if next_href:
                yield scrapy.Request(url=response.urljoin(next_href), callback=self.parse_esf,
                                     meta={'info': (province, city)})
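
    In parse_newhouse, the administrative district is peeled off the front of the address title with a regex. Here is a standalone sketch of what that match does; the sample title is invented, but it follows the "[district]street" shape the code expects:

        import re

        title = '[朝阳]望京广顺北大街33号'   # hypothetical sample of the a/@title attribute
        m = re.match(r'\[(.+?)\](.+)', title)
        if m:
            district = m.group(1)   # '朝阳' -- the text inside the square brackets
            address = m.group(2)    # '望京广顺北大街33号' -- everything after them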
    

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class newFangItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # name of the development
    name = scrapy.Field()
    # price
    price = scrapy.Field()
    # available layouts (scraped as a list, joined into one string)
    rooms = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # street address
    address = scrapy.Field()
    # administrative district
    district = scrapy.Field()
    # sale status (on sale or not)
    sale = scrapy.Field()
    # URL of the detail page on Fang.com
    origin_url = scrapy.Field()


class esFangItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # name of the residential complex
    name = scrapy.Field()
    # address of the complex
    address = scrapy.Field()
    # layout, e.g. "3室2厅" (rooms and living rooms)
    rooms = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # floor / storey information
    height = scrapy.Field()
    # orientation the unit faces
    direction = scrapy.Field()
    # year built
    year = scrapy.Field()
    # total price
    price = scrapy.Field()
    # price per square metre
    unit_price = scrapy.Field()
    # detail page URL
    origin_url = scrapy.Field()

middlewares.py


# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import random


class UserAgentDownLoadMiddleware(object):
    # pool of User-Agent strings to rotate through on each request
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; U; ABrowse 0.6; Syllable) AppleWebKit/420+ (KHTML, like Gecko)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 1.1.4322; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Browzar)",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14931",
        "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.13; ko; rv:1.9.1b2) Gecko/20081201 Firefox/60.0",
        "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]

    def process_request(self, request, spider):
        # stamp every outgoing request with a randomly chosen User-Agent
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
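
To sanity-check that the middleware really rotates headers, one option is a throwaway spider against httpbin.org, which echoes back the User-Agent it received. This spider is purely illustrative and not part of the project:

    import scrapy

    class UACheckSpider(scrapy.Spider):
        # hypothetical one-off spider used only to verify the middleware
        name = 'ua_check'
        start_urls = ['https://httpbin.org/user-agent']

        def parse(self, response):
            # httpbin returns the received User-Agent header as JSON
            self.logger.info(response.text)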

pipelines.py

pipelines.py contains three classes: FangPipeline writes items straight to JSON files, newfangSqlPipeline stores new-house records in MySQL, and esfangSqlPipeline stores second-hand-house records in MySQL.


# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exporters import JsonLinesItemExporter
from fang.items import newFangItem, esFangItem
import pymysql
import os

os.chdir('G:/爬取内容')   # directory the JSON files are written into


class FangPipeline(object):
    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.eshouse_fp = open('eshouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.eshouse_exporter = JsonLinesItemExporter(self.eshouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # route each item to its own file instead of writing every item to both
        if isinstance(item, newFangItem):
            self.newhouse_exporter.export_item(item)
        else:
            self.eshouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.eshouse_fp.close()


class newfangSqlPipeline(object):
    def __init__(self):
        data = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': '124780',
            'database': 'jianshu',
            'charset': 'utf8mb4'
        }
        self.conn = pymysql.connect(**data)
        self.cursor = self.conn.cursor()
        if self.conn:
            print('=' * 30)
            print('MySQL connection established')
            print('=' * 30)
        self._sql = None

    def process_item(self, item, spider):
        # only handle new-house items; second-hand items lack these fields
        if isinstance(item, newFangItem):
            self.cursor.execute(self.sql, (item['province'], item['city'], item['name'],
                                           item['price'], item['rooms'], item['area'],
                                           item['address'], item['district'], item['sale'],
                                           item['origin_url']))
            self.conn.commit()
        return item

    @property
    def sql(self):
        # build the INSERT statement once, then reuse the cached copy
        if not self._sql:
            self._sql = '''
                insert into newfang(province,city,name,price,rooms,area,address,district,sale,origin_url)
                values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                '''
        return self._sql

class esfangSqlPipeline(object):
    def __init__(self):
        data = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': '124780',
            'database': 'jianshu',
            'charset': 'utf8mb4'
        }
        self.conn = pymysql.connect(**data)
        self.cursor = self.conn.cursor()
        if self.conn:
            print('=' * 30)
            print('MySQL connection established')
            print('=' * 30)
        self._sql = None

    def process_item(self, item, spider):
        # only handle second-hand-house items
        if isinstance(item, esFangItem):
            self.cursor.execute(self.sql, (item['province'], item['city'], item['name'],
                                           item['address'], item['rooms'], item['area'],
                                           item['height'], item['direction'], item['year'],
                                           item['price'], item['unit_price'], item['origin_url']))
            self.conn.commit()
        return item

    @property
    def sql(self):
        # build the INSERT statement once, then reuse the cached copy
        if not self._sql:
            self._sql = '''
                insert into esfang(province,city,name,address,rooms,area,height,direction,year,price,unit_price,origin_url)
                values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                '''
        return self._sql
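
Both SQL pipelines assume the newfang and esfang tables already exist in the jianshu database. Below is a minimal one-off setup sketch that creates them with pymysql; the column types and sizes are my assumptions, since the post does not show the schema:

    import pymysql

    conn = pymysql.connect(host='localhost', port=3306, user='root',
                           password='124780', database='jianshu', charset='utf8mb4')
    with conn.cursor() as cursor:
        # varchar everywhere because the spider stores raw scraped strings
        cursor.execute('''
            create table if not exists newfang(
                id int primary key auto_increment,
                province varchar(50), city varchar(50), name varchar(200),
                price varchar(100), rooms varchar(200), area varchar(200),
                address varchar(300), district varchar(100), sale varchar(50),
                origin_url varchar(500)
            ) default charset=utf8mb4''')
        cursor.execute('''
            create table if not exists esfang(
                id int primary key auto_increment,
                province varchar(50), city varchar(50), name varchar(200),
                address varchar(300), rooms varchar(100), area varchar(100),
                height varchar(100), direction varchar(100), year varchar(100),
                price varchar(100), unit_price varchar(100), origin_url varchar(500)
            ) default charset=utf8mb4''')
    conn.commit()
    conn.close()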

settings.py

Only the following settings need to be enabled (uncommented):

# -*- coding: utf-8 -*-

ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 2   # adjust the delay to suit your needs

DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
}


DOWNLOADER_MIDDLEWARES = {
   'fang.middlewares.UserAgentDownLoadMiddleware': 300,
}

# enable whichever pipelines you need
ITEM_PIPELINES = {
   # 'fang.pipelines.FangPipeline': 543,
   # 'fang.pipelines.newfangSqlPipeline': 543,
   'fang.pipelines.esfangSqlPipeline': 543,
}
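
To store everything in one crawl, enable all three pipelines; the numbers are priorities, and each item flows through the pipelines from the lowest number to the highest. Note that newfangSqlPipeline only receives items if the new-house request in sfw.py's parse method is uncommented. For example:

    ITEM_PIPELINES = {
        'fang.pipelines.FangPipeline': 300,
        'fang.pipelines.newfangSqlPipeline': 400,
        'fang.pipelines.esfangSqlPipeline': 500,
    }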