Scraping Property Listings from Fang.com (SouFun)

  1. scrapy startproject fang (create the project)
  2. cd fang
  3. scrapy genspider sfw "https://www1.fang.com/" (generate the spider). With the project created, write the launcher script start.py:

    from scrapy import cmdline

    # equivalent to running `scrapy crawl sfw` on the command line
    cmdline.execute("scrapy crawl sfw".split())
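
    With this launcher in place, the crawl can be started (and debugged) straight from an IDE by running start.py, instead of typing scrapy crawl sfw in a terminal each time.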
    

    The spider file is shown below:

    sfw.py

    # -*- coding: utf-8 -*-
    import scrapy
    from fang.items import newFangItem, esFangItem
    import re

    class SfwSpider(scrapy.Spider):
        name = 'sfw'
        allowed_domains = ['fang.com']
        start_urls = ['https://www.fang.com/SoufunFamily.htm']

        def parse(self, response):
            trs = response.xpath(r'//table[@class="table01"]//tr')
            province = None
            for tr in trs:
                # the province cell is only filled on the first row of each province,
                # so remember the last non-empty value
                province_ready = tr.xpath(r'./td[not(@class)][1]//text()').get()
                province_ready = re.sub(r'\s', '', province_ready or '')
                if province_ready:
                    province = province_ready
                if province == '其它':   # skip the "other" (overseas) section
                    continue
                cities = tr.xpath(r'./td[not(@class)][2]/a/text()').getall()
                cities_url = tr.xpath(r'./td[not(@class)][2]/a/@href').getall()

                for city, city_url in zip(cities, cities_url):
                    if 'bj.' in city_url:
                        # Beijing does not follow the city-subdomain scheme
                        new_url = 'https://newhouse.fang.com/house/s/'
                        esf_url = 'https://esf.fang.com/'
                    else:
                        # e.g. https://cq.fang.com/ -> https://cq.newhouse.fang.com/house/s/
                        new_url = ".newhouse.".join(city_url.split('.', 1)) + 'house/s/'
                        # e.g. https://cq.fang.com/ -> https://cq.esf.fang.com/
                        esf_url = ".esf.".join(city_url.split('.', 1))
                    # pass the province and city along to the callbacks via meta;
                    # uncomment the first request to crawl new houses as well
                    # yield scrapy.Request(url=new_url, callback=self.parse_newhouse, meta={"info": (province, city)})
                    yield scrapy.Request(url=esf_url, callback=self.parse_esf, meta={"info": (province, city)})
        def parse_newhouse(self, response):
            province, city = response.meta.get('info')   # unpack the info passed via meta
            ls = response.xpath(r'//div[@id="newhouse_loupai_list"]/ul//li')
            for l in ls:
                name = l.xpath(r'.//div[@class="nlcd_name"]/a/text()').get()
                if not name:
                    continue
                name = name.strip()

                house_type = l.xpath(r'.//div[@class="house_type clearfix"]/a/text()').getall()
                rooms = "".join(re.sub(r'\s', '', x) for x in house_type)

                area = l.xpath(r'.//div[@class="house_type clearfix"]/text()').getall()
                area = "".join(re.sub(r'\s|/|-', '', x) for x in area)

                # note the leading dots: without them these XPaths would search the
                # whole document and return the same node for every listing
                sale = l.xpath(r'.//span[@class="inSale"]//text()').get()
                district = address = None
                addresses = l.xpath(r'.//div[@class="address"]/a/@title').get()
                if addresses:
                    district_ready = re.match(r'\[(.+?)\](.+)', addresses)
                    if district_ready:
                        district = district_ready.group(1)
                        address = district_ready.group(2)
                price = l.xpath(r'.//div[@class="nhouse_price"]/span/text()').get() or ''
                unit = l.xpath(r'.//div[@class="nhouse_price"]/em/text()').get() or ''
                # str.replace returns a new string, so reassign the result
                price = (price + unit).replace('广告', '')
                origin_url = l.xpath(r'.//div[@class="nlcd_name"]/a/@href').get()
                item = newFangItem(province=province, city=city, name=name, price=price,
                                   rooms=rooms, area=area, address=address,
                                   district=district, sale=sale, origin_url=origin_url)
                yield item
            # paginate: grab the href, not the serialized <a> element
            next_href = response.xpath(r'//div[@class="page"]//a[@class="next"]/@href').get()
            if next_href:
                yield scrapy.Request(url=response.urljoin(next_href), callback=self.parse_newhouse,
                                     meta={'info': (province, city)})
    
    
        def parse_esf(self, response):
            province, city = response.meta.get('info')   # unpack the info passed via meta
            ls = response.xpath(r'//div[@class="shop_list shop_list_4"]/dl')
            for l in ls:
                name = l.xpath(r'.//dd[not(@class)]/p[@class="add_shop"]/a/@title').get()
                address = l.xpath(r'.//dd[not(@class)]/p[@class="add_shop"]/span/text()').get()
                style = l.xpath(r'.//dd[not(@class)]/p[@class="tel_shop"]/text()').getall()
                style = [re.sub(r' |\r\n', '', x) for x in style]
                # the tel_shop text nodes are positional: rooms | area | floor | facing | year;
                # trailing fields may be missing, so default everything to None first
                rooms = area = height = direction = year = None
                if len(style) > 0:
                    rooms = style[0]
                if len(style) > 1:
                    area = style[1]
                if len(style) > 2:
                    height = style[2]
                if len(style) > 3:
                    direction = style[3]
                if len(style) > 4:
                    year = style[4]
                price = "".join(l.xpath(r'.//dd[@class="price_right"]/span[@class="red"]//text()').getall())
                unit_price = l.xpath(r'.//dd[@class="price_right"]/span[not(@class)]/text()').get()
                detail_href = l.xpath(r'./dt/a/@href').get()
                origin_url = response.urljoin(detail_href) if detail_href else None
                item = esFangItem(province=province, city=city, name=name, address=address,
                                  rooms=rooms, area=area, height=height, direction=direction,
                                  year=year, price=price, unit_price=unit_price, origin_url=origin_url)
                yield item
            # follow the "next page" link
            next_href = response.xpath(r'//div[@class="page_al"]/p[1]/a/@href').get()
            if next_href:
                yield scrapy.Request(url=response.urljoin(next_href), callback=self.parse_esf,
                                     meta={'info': (province, city)})
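
    In parse_newhouse, the administrative district is peeled off the front of the address title with a regex. Here is a standalone sketch of what that match does; the sample title is invented, but it follows the "[district]street" shape the code expects:

        import re

        title = '[朝阳]望京广顺北大街33号'   # hypothetical sample of the a/@title attribute
        m = re.match(r'\[(.+?)\](.+)', title)
        if m:
            district = m.group(1)   # '朝阳' -- the text inside the square brackets
            address = m.group(2)    # '望京广顺北大街33号' -- everything after them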
    

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class newFangItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # name of the development
    name = scrapy.Field()
    # price
    price = scrapy.Field()
    # available layouts (scraped as a list, joined into one string)
    rooms = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # street address
    address = scrapy.Field()
    # administrative district
    district = scrapy.Field()
    # sale status (on sale or not)
    sale = scrapy.Field()
    # URL of the detail page on Fang.com
    origin_url = scrapy.Field()


class esFangItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # name of the residential complex
    name = scrapy.Field()
    # address of the complex
    address = scrapy.Field()
    # layout, e.g. "3室2厅" (rooms and living rooms)
    rooms = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # floor / storey information
    height = scrapy.Field()
    # orientation the unit faces
    direction = scrapy.Field()
    # year built
    year = scrapy.Field()
    # total price
    price = scrapy.Field()
    # price per square metre
    unit_price = scrapy.Field()
    # detail page URL
    origin_url = scrapy.Field()

middlewares.py


# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import random


class UserAgentDownLoadMiddleware(object):
    # pool of User-Agent strings to rotate through on each request
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; U; ABrowse 0.6; Syllable) AppleWebKit/420+ (KHTML, like Gecko)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 1.1.4322; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Browzar)",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14931",
        "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.13; ko; rv:1.9.1b2) Gecko/20081201 Firefox/60.0",
        "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]

    def process_request(self, request, spider):
        # stamp every outgoing request with a randomly chosen User-Agent
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
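
To sanity-check that the middleware really rotates headers, one option is a throwaway spider against httpbin.org, which echoes back the User-Agent it received. This spider is purely illustrative and not part of the project:

    import scrapy

    class UACheckSpider(scrapy.Spider):
        # hypothetical one-off spider used only to verify the middleware
        name = 'ua_check'
        start_urls = ['https://httpbin.org/user-agent']

        def parse(self, response):
            # httpbin returns the received User-Agent header as JSON
            self.logger.info(response.text)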

pipelines.py

pipelines.py contains three classes: FangPipeline writes items straight to JSON files, newfangSqlPipeline stores new-house records in MySQL, and esfangSqlPipeline stores second-hand-house records in MySQL.


# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exporters import JsonLinesItemExporter
from fang.items import newFangItem, esFangItem
import pymysql
import os

os.chdir('G:/爬取内容')   # directory the JSON files are written into


class FangPipeline(object):
    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.eshouse_fp = open('eshouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.eshouse_exporter = JsonLinesItemExporter(self.eshouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # route each item to its own file instead of writing every item to both
        if isinstance(item, newFangItem):
            self.newhouse_exporter.export_item(item)
        else:
            self.eshouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.eshouse_fp.close()


class newfangSqlPipeline(object):
    def __init__(self):
        data = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': '124780',
            'database': 'jianshu',
            'charset': 'utf8mb4'
        }
        self.conn = pymysql.connect(**data)
        self.cursor = self.conn.cursor()
        if self.conn:
            print('=' * 30)
            print('MySQL connection established')
            print('=' * 30)
        self._sql = None

    def process_item(self, item, spider):
        # only handle new-house items; second-hand items lack these fields
        if isinstance(item, newFangItem):
            self.cursor.execute(self.sql, (item['province'], item['city'], item['name'],
                                           item['price'], item['rooms'], item['area'],
                                           item['address'], item['district'], item['sale'],
                                           item['origin_url']))
            self.conn.commit()
        return item

    @property
    def sql(self):
        # build the INSERT statement once, then reuse the cached copy
        if not self._sql:
            self._sql = '''
                insert into newfang(province,city,name,price,rooms,area,address,district,sale,origin_url)
                values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                '''
        return self._sql

class esfangSqlPipeline(object):
    def __init__(self):
        data = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': '124780',
            'database': 'jianshu',
            'charset': 'utf8mb4'
        }
        self.conn = pymysql.connect(**data)
        self.cursor = self.conn.cursor()
        if self.conn:
            print('=' * 30)
            print('MySQL connection established')
            print('=' * 30)
        self._sql = None

    def process_item(self, item, spider):
        # only handle second-hand-house items
        if isinstance(item, esFangItem):
            self.cursor.execute(self.sql, (item['province'], item['city'], item['name'],
                                           item['address'], item['rooms'], item['area'],
                                           item['height'], item['direction'], item['year'],
                                           item['price'], item['unit_price'], item['origin_url']))
            self.conn.commit()
        return item

    @property
    def sql(self):
        # build the INSERT statement once, then reuse the cached copy
        if not self._sql:
            self._sql = '''
                insert into esfang(province,city,name,address,rooms,area,height,direction,year,price,unit_price,origin_url)
                values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                '''
        return self._sql
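
Both SQL pipelines assume the newfang and esfang tables already exist in the jianshu database. Below is a minimal one-off setup sketch that creates them with pymysql; the column types and sizes are my assumptions, since the post does not show the schema:

    import pymysql

    conn = pymysql.connect(host='localhost', port=3306, user='root',
                           password='124780', database='jianshu', charset='utf8mb4')
    with conn.cursor() as cursor:
        # varchar everywhere because the spider stores raw scraped strings
        cursor.execute('''
            create table if not exists newfang(
                id int primary key auto_increment,
                province varchar(50), city varchar(50), name varchar(200),
                price varchar(100), rooms varchar(200), area varchar(200),
                address varchar(300), district varchar(100), sale varchar(50),
                origin_url varchar(500)
            ) default charset=utf8mb4''')
        cursor.execute('''
            create table if not exists esfang(
                id int primary key auto_increment,
                province varchar(50), city varchar(50), name varchar(200),
                address varchar(300), rooms varchar(100), area varchar(100),
                height varchar(100), direction varchar(100), year varchar(100),
                price varchar(100), unit_price varchar(100), origin_url varchar(500)
            ) default charset=utf8mb4''')
    conn.commit()
    conn.close()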

settings.py

Only the following settings need to be enabled (uncommented):

# -*- coding: utf-8 -*-

ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 2   # adjust the delay to suit your needs

DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
}


DOWNLOADER_MIDDLEWARES = {
   'fang.middlewares.UserAgentDownLoadMiddleware': 300,
}

# enable whichever pipelines you need
ITEM_PIPELINES = {
   # 'fang.pipelines.FangPipeline': 543,
   # 'fang.pipelines.newfangSqlPipeline': 543,
   'fang.pipelines.esfangSqlPipeline': 543,
}
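
To store everything in one crawl, enable all three pipelines; the numbers are priorities, and each item flows through the pipelines from the lowest number to the highest. Note that newfangSqlPipeline only receives items if the new-house request in sfw.py's parse method is uncommented. For example:

    ITEM_PIPELINES = {
        'fang.pipelines.FangPipeline': 300,
        'fang.pipelines.newfangSqlPipeline': 400,
        'fang.pipelines.esfangSqlPipeline': 500,
    }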