Scraping Housing Data from Fang.com (Soufun)
Create the project, enter its directory, and generate the spider:

scrapy startproject fang
cd fang
scrapy genspider sfw "https://www1.fang.com/"

With the project created, next write the launcher script start.py:
from scrapy import cmdline

cmdline.execute("scrapy crawl sfw".split())
The spider file is shown below:
sfw.py
# -*- coding: utf-8 -*-
import re

import scrapy

from fang.items import newFangItem, esFangItem


class SfwSpider(scrapy.Spider):
    name = 'sfw'
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.htm']

    def parse(self, response):
        trs = response.xpath(r'//table[@class="table01"]//tr')
        for tr in trs:
            # The province cell spans several rows, so it is often empty;
            # keep the last non-empty value.
            province_ready = tr.xpath(r'./td[not(@class)][1]//text()').get()
            province_ready = re.sub(r'\s', '', province_ready or '')
            if province_ready:
                province = province_ready
            if province_ready == '其它':
                continue
            cities = tr.xpath(r'./td[not(@class)][2]/a/text()').getall()
            cities_url = tr.xpath(r'./td[not(@class)][2]/a/@href').getall()
            for city, city_url in zip(cities, cities_url):
                # Beijing uses its own subdomains; every other city URL is
                # rewritten by splicing "newhouse"/"esf" into the domain.
                if 'bj.' in city_url:
                    new_url = 'https://newhouse.fang.com/house/s/'
                    esf_url = 'https://esf.fang.com/'
                else:
                    new_url = ".newhouse.".join(city_url.split('.', 1)) + 'house/s/'
                    esf_url = ".esf.".join(city_url.split('.', 1))
                # Pass the province and city along through meta.
                # yield scrapy.Request(url=new_url, callback=self.parse_newhouse,
                #                      meta={"info": (province, city)})
                yield scrapy.Request(url=esf_url, callback=self.parse_esf,
                                     meta={"info": (province, city)})

    def parse_newhouse(self, response):
        province, city = response.meta.get('info')  # unpack province and city
        ls = response.xpath(r'//div[@id="newhouse_loupai_list"]/ul//li')
        for l in ls:
            name = l.xpath(r'.//div[@class="nlcd_name"]/a/text()').get()
            if name:
                name = name.strip()
            house_type = l.xpath(r'.//div[@class="house_type clearfix"]/a/text()').getall()
            rooms = "".join(re.sub(r'\s', '', x) for x in house_type)
            area = l.xpath(r'.//div[@class="house_type clearfix"]/text()').getall()
            area = "".join(re.sub(r'\s|/|-', '', x) for x in area)
            sale = l.xpath(r'.//span[@class="inSale"]//text()').get()
            addresses = l.xpath(r'.//div[@class="address"]/a/@title').get() or ''
            # The title looks like "[district]address"; split it with a regex.
            district = address = None
            district_ready = re.match(r'\[(.+?)\](.+)', addresses)
            if district_ready:
                district = district_ready.group(1)
                address = district_ready.group(2)
            price = l.xpath(r'.//div[@class="nhouse_price"]/span/text()').get() or ''
            unit = l.xpath(r'.//div[@class="nhouse_price"]/em/text()').get() or ''
            price = (price + unit).replace('广告', '')
            origin_url = l.xpath(r'.//div[@class="nlcd_name"]/a/@href').get()
            item = newFangItem(province=province, city=city, name=name, price=price,
                               rooms=rooms, area=area, address=address,
                               district=district, sale=sale, origin_url=origin_url)
            yield item
        next_href = response.xpath(r'//div[@class="page"]//a[@class="next"]/@href').get()
        if next_href:
            yield scrapy.Request(url=response.urljoin(next_href),
                                 callback=self.parse_newhouse,
                                 meta={'info': (province, city)})

    def parse_esf(self, response):
        province, city = response.meta.get('info')  # unpack province and city
        ls = response.xpath(r'//div[@class="shop_list shop_list_4"]/dl')
        for l in ls:
            name = l.xpath(r'.//dd[not(@class)]/p[@class="add_shop"]/a/@title').get()
            address = l.xpath(r'.//dd[not(@class)]/p[@class="add_shop"]/span/text()').get()
            style = l.xpath(r'.//dd[not(@class)]/p[@class="tel_shop"]/text()').getall()
            style = [re.sub(r' |\r\n', '', x) for x in style]
            # The tel_shop fields are positional and not always all present.
            rooms = style[0] if len(style) > 0 else None
            area = style[1] if len(style) > 1 else None
            height = style[2] if len(style) > 2 else None
            direction = style[3] if len(style) > 3 else None
            year = style[4] if len(style) > 4 else None
            price = "".join(l.xpath(r'.//dd[@class="price_right"]/span[@class="red"]//text()').getall())
            unit_price = l.xpath(r'.//dd[@class="price_right"]/span[not(@class)]/text()').get()
            origin_url = response.urljoin(l.xpath(r'./dt/a/@href').get())
            item2 = esFangItem(province=province, city=city, name=name, address=address,
                               rooms=rooms, area=area, height=height, direction=direction,
                               year=year, price=price, unit_price=unit_price,
                               origin_url=origin_url)
            yield item2
        # Link to the next page.
        next_href = response.xpath(r'//div[@class="page_al"]/p[1]/a/@href').get()
        if next_href:
            yield scrapy.Request(url=response.urljoin(next_href),
                                 callback=self.parse_esf,
                                 meta={'info': (province, city)})
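The subdomain splicing in parse() is easy to sanity-check on its own. A minimal sketch, using a made-up city URL (any non-Beijing city follows the same pattern):

# How parse() rewrites a city URL into the new-house and second-hand URLs.
city_url = 'https://cq.fang.com/'  # hypothetical example input

new_url = ".newhouse.".join(city_url.split('.', 1)) + 'house/s/'
esf_url = ".esf.".join(city_url.split('.', 1))

print(new_url)  # https://cq.newhouse.fang.com/house/s/
print(esf_url)  # https://cq.esf.fang.com/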
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class newFangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # name of the development
    name = scrapy.Field()
    # price
    price = scrapy.Field()
    # number of bedrooms (scraped as a list, joined into a string)
    rooms = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # address
    address = scrapy.Field()
    # administrative district
    district = scrapy.Field()
    # whether it is on sale
    sale = scrapy.Field()
    # URL of the detail page on Fang.com
    origin_url = scrapy.Field()


class esFangItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # name of the residential complex
    name = scrapy.Field()
    # address of the complex
    address = scrapy.Field()
    # layout (rooms and halls)
    rooms = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # floor / building height
    height = scrapy.Field()
    # orientation
    direction = scrapy.Field()
    # year built
    year = scrapy.Field()
    # price
    price = scrapy.Field()
    # price per square meter
    unit_price = scrapy.Field()
    # detail page URL
    origin_url = scrapy.Field()
middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random
class UserAgentDownLoadMiddleware(object):
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; U; ABrowse 0.6; Syllable) AppleWebKit/420+ (KHTML, like Gecko)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 1.1.4322; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Browzar)",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14931",
        "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.13; ko; rv:1.9.1b2) Gecko/20081201 Firefox/60.0",
        "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]

    def process_request(self, request, spider):
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
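To check the middleware without launching a crawl, you can run process_request on a bare Request by hand; this is just a quick sketch, not part of the project:

from scrapy import Request
from fang.middlewares import UserAgentDownLoadMiddleware

# Build a dummy request, run it through the middleware, and inspect the header.
mw = UserAgentDownLoadMiddleware()
req = Request(url='https://www.fang.com/SoufunFamily.htm')
mw.process_request(req, spider=None)
print(req.headers['User-Agent'])  # a random entry from USER_AGENTS (as bytes)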
pipelines.py
pipelines.py contains three classes: FangPipeline writes the items straight to JSON files, newfangSqlPipeline stores the new-house records in the database, and esfangSqlPipeline stores the second-hand-house records in the database.
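The two SQL pipelines below assume that the newfang and esfang tables already exist in the jianshu database. Here is a minimal one-off sketch for creating them with pymysql; the table and column names are taken from the INSERT statements below, but the column types are my own assumptions:

import pymysql

# One-off table setup; run once before enabling the SQL pipelines.
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='124780', database='jianshu', charset='utf8mb4')
with conn.cursor() as cursor:
    # Columns mirror newfangSqlPipeline's INSERT; VARCHAR sizes are guesses.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS newfang (
            id INT AUTO_INCREMENT PRIMARY KEY,
            province VARCHAR(50), city VARCHAR(50), name VARCHAR(255),
            price VARCHAR(100), rooms VARCHAR(255), area VARCHAR(255),
            address VARCHAR(255), district VARCHAR(100), sale VARCHAR(50),
            origin_url VARCHAR(255)
        ) DEFAULT CHARSET=utf8mb4
    ''')
    # Columns mirror esfangSqlPipeline's INSERT.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS esfang (
            id INT AUTO_INCREMENT PRIMARY KEY,
            province VARCHAR(50), city VARCHAR(50), name VARCHAR(255),
            address VARCHAR(255), rooms VARCHAR(100), area VARCHAR(100),
            height VARCHAR(100), direction VARCHAR(100), year VARCHAR(100),
            price VARCHAR(100), unit_price VARCHAR(100), origin_url VARCHAR(255)
        ) DEFAULT CHARSET=utf8mb4
    ''')
conn.commit()
conn.close()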
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter
import pymysql
import os
from fang.items import newFangItem  # used to route items to the right JSON file
os.chdir('G:/爬取内容')  # directory where the JSON files are written
class FangPipeline(object):
    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.eshouse_fp = open('eshouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.eshouse_exporter = JsonLinesItemExporter(self.eshouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # Route each item to the matching JSON file.
        if isinstance(item, newFangItem):
            self.newhouse_exporter.export_item(item)
        else:
            self.eshouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.eshouse_fp.close()
class newfangSqlPipeline(object):
    def __init__(self):
        data = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': '124780',
            'database': 'jianshu',
            'charset': 'utf8mb4'
        }
        self.conn = pymysql.connect(**data)
        self.cursor = self.conn.cursor()
        if self.conn:
            print('=' * 30)
            print('connected successfully')
            print('=' * 30)
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (item['province'], item['city'], item['name'],
                                       item['price'], item['rooms'], item['area'],
                                       item['address'], item['district'], item['sale'],
                                       item['origin_url']))
        self.conn.commit()
        return item

    @property
    def sql(self):
        if not self._sql:
            self._sql = '''
                insert into newfang(province,city,name,price,rooms,area,address,district,sale,origin_url)
                values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            '''
        return self._sql
class esfangSqlPipeline(object):
    def __init__(self):
        data = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': '124780',
            'database': 'jianshu',
            'charset': 'utf8mb4'
        }
        self.conn = pymysql.connect(**data)
        self.cursor = self.conn.cursor()
        if self.conn:
            print('=' * 30)
            print('connected successfully')
            print('=' * 30)
        self._sql = None

    def process_item(self, item2, spider):
        self.cursor.execute(self.sql, (item2['province'], item2['city'], item2['name'],
                                       item2['address'], item2['rooms'], item2['area'],
                                       item2['height'], item2['direction'], item2['year'],
                                       item2['price'], item2['unit_price'],
                                       item2['origin_url']))
        self.conn.commit()
        return item2

    @property
    def sql(self):
        if not self._sql:
            self._sql = '''
                insert into esfang(province,city,name,address,rooms,area,height,direction,year,price,unit_price,origin_url)
                values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            '''
        return self._sql
settings.py
Only a few of the default settings need to be changed or uncommented:
# -*- coding: utf-8 -*-
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 2  # adjust the download delay to your needs
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
DOWNLOADER_MIDDLEWARES = {
    'fang.middlewares.UserAgentDownLoadMiddleware': 300,
}
# enable whichever pipelines you need
ITEM_PIPELINES = {
    # 'fang.pipelines.FangPipeline': 543,
    # 'fang.pipelines.newfangSqlPipeline': 543,
    'fang.pipelines.esfangSqlPipeline': 543,
}
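Finally, launch the crawler with python start.py (or scrapy crawl sfw from the project root); with esfangSqlPipeline enabled as above, the scraped second-hand listings are written into the esfang table.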