Scrape Changchun bus route information from 8684.cn and save it locally; the same script can scrape the bus information of any city covered by the site.
import requests
import os
from lxml import html
import re
import time
etree = html.etree
s = requests.Session()
url = 'https://changchun.8684.cn'
def get_content(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    return s.get(url, headers=headers).text
# Get the route names and their detail-page links from a listing page
def chehao(i):
    url2 = url + i
    page = get_content(url2)
    tree = etree.HTML(page)
    ch = tree.xpath(r'//div[@class="layout-left"]/div[@class="cc-content"]/div[@class="list clearfix"]//a/text()')
    chlj = tree.xpath(r'//div[@class="layout-left"]/div[@class="cc-content"]/div[@class="list clearfix"]//a/@href')
    return (ch, chlj)
# Scrape the details of one route and append them to resu
def paqu(tree, resu):
    d1 = {}
    d2 = {}
    # Route name
    names = tree.xpath(r'//h1[@class="title"]/text()')
    name = names[0].strip() if names else ''
    print('Scraping bus route {}'.format(name))
    # Operating hours
    runtime = tree.xpath(r'//ul[@class="bus-desc"]/li')[0].text
    print(runtime)
    print("*" * 50)
    # Fare information
    price = tree.xpath(r'//ul[@class="bus-desc"]/li')[1].text
    # Last update time
    update = tree.xpath(r'//ul[@class="bus-desc"]/li')[3].text
    # Number of stops in the up (outbound) direction
    ontotal = tree.xpath(r'//div[@class="bus-excerpt mb15"][1]/div[@class="other fr"]/div[@class="total"]/text()')
    # Number of stops in the down (return) direction
    downtotal = tree.xpath(r'//div[@class="bus-excerpt mb15"][2]/div[@class="other fr"]/div[@class="total"]/text()')
    # Up-direction route description
    sx = tree.xpath(r'//div[@class="excerpt fl"]/div[@class="trip"]/text()')[0] if ontotal else ''
    # Up-direction stops
    # orde1 = tree.xpath(r'//div[@class="bus_line_site "][1]/div[@class="bus_site_layer"]/div/i/text()') if ontotal else ''  # stop index (unused)
    content1 = tree.xpath(r'//div[@class="bus-lzlist mb15"][1]/ol//li//text()') if sx else ''  # stop names
    # Down-direction route description
    xx = tree.xpath(r'//div[@class="excerpt fl"]//div[@class="trip"]/text()')[1] if downtotal else ''
    # Down-direction stops
    content2 = tree.xpath(r'//div[@class="bus-lzlist mb15"][2]/ol//li//text()') if xx else ''  # stop names
    # Number the stops in each direction (capped at 35)
    for k, v in zip(range(35), content1):
        d1[str(k)] = v
    for k, v in zip(range(35), content2):
        d2[str(k)] = v
    xinxi = {
        '名称': name,
        '运行时间': runtime,
        '价格': price,
        '更新时间': update,
        '上行路线': sx,
        '上行站数': ontotal,
        '上行站点': d1,
        '下行路线': xx,
        '下行站数': downtotal,
        '下行站点': d2
    }
    resu.append(xinxi)
def main():
    page = get_content(url)
    tree = etree.HTML(page)
    num1 = tree.xpath(r'//div[@class="bus-layer depth w120"]//div[1]/div//a/text()')  # leading digits of the route numbers
    li1 = tree.xpath(r'//div[@class="bus-layer depth w120"]//div[1]/div//a/@href')    # listing-page link for each leading digit
    print(num1)
    print(li1)
    # Loop over the listing page for each leading digit
    for numb, link in zip(num1, li1):
        print('Start scraping listing page {}'.format(link))
        resu = []
        number = chehao(link)  # (route names, route links) on this listing page
        # Scrape each individual route
        for ch, n in zip(number[0], number[1]):
            url3 = url + n
            lx = get_content(url3)
            print('url3:', url3)
            tree = etree.HTML(lx)
            paqu(tree, resu)
        filename = str(numb) + '路公交车' + '.txt'
        filedir = 'G:/爬取内容'
        os.makedirs(filedir, exist_ok=True)  # make sure the output folder exists
        filepath = filedir + '/' + filename
        with open(filepath, 'w', encoding='utf-8') as fp:
            fp.write(str(resu))
        print('Finished scraping page {}, sleeping for 3s'.format(numb))
        time.sleep(3)

if __name__ == '__main__':
    main()
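The base URL is the only city-specific part of the script: 8684.cn serves each city from its own subdomain, so scraping a different city just means pointing the module-level url at that city's subdomain. A minimal sketch, assuming the code above is saved as bus_spider.py (a hypothetical module name) and that other cities' pages share the same layout as the Changchun ones:

import bus_spider

# Swap the city subdomain; Beijing is used here only as an example.
bus_spider.url = 'https://beijing.8684.cn'
bus_spider.main()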