# 爬取长春公交线路信息并存到本地,可以爬取任意城市的公交信息

import requests
import os
from lxml import html
import re
import time
# Newer lxml releases expose etree through lxml.html; alias it for use below.
etree = html.etree
# One shared session so connections (and any cookies) are reused across requests.
s = requests.Session()
# Site base URL; the city subdomain selects which city's routes are crawled.
url = 'https://changchun.8684.cn'


def get_content(url):
    """Fetch *url* through the module-level session and return the page text.

    Sends a desktop-browser User-Agent (the site rejects default library UAs).
    Raises requests.HTTPError on a non-2xx response and
    requests.Timeout if the server does not answer within 10 seconds.
    """
    headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    # timeout so one hung server cannot stall the whole crawl;
    # raise_for_status so a 403/500 page is not silently parsed as data
    resp = s.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    return resp.text


# 得到车号 (fetch the route numbers listed on one index page)
def chehao(i):
    """Fetch the route-list page at site path *i*.

    Returns a 2-tuple ``(names, hrefs)``: the visible route names and the
    matching detail-page links, in document order.
    """
    page = get_content(url + i)
    doc = etree.HTML(page)
    anchors = r'//div[@class="layout-left"]/div[@class="cc-content"]/div[@class="list clearfix"]//a'
    names = doc.xpath(anchors + '/text()')
    hrefs = doc.xpath(anchors + '/@href')
    return (names, hrefs)

#爬取内容 (scrape one route's detail page)
def paqu(tree, resu):
    """Extract one bus route's details from the parsed page *tree* and append
    them as a dict to the list *resu*.

    The dict keys (Chinese) match the format written to the output files.
    """
    # 公交名称 — take the first text node; the old ``str(list)`` stored
    # the Python repr, e.g. "['104路']", brackets included.
    title = tree.xpath(r'//h1[@class="title"]/text()')
    name = title[0] if title else ''
    print('爬取{}公交车'.format(name))
    # 运行时间 / 票价信息 / 最后更新时间 all live in the same <ul>;
    # guard the indexing so a page with fewer <li> items cannot IndexError.
    desc = tree.xpath(r'//ul[@class="bus-desc"]/li')
    # renamed from ``time``: the old name shadowed the time module
    run_time = desc[0].text if len(desc) > 0 else ''
    print(run_time)
    print("*" * 50)
    price = desc[1].text if len(desc) > 1 else ''
    update = desc[3].text if len(desc) > 3 else ''
    # 上行总站数 / 下行总站数
    ontotal = tree.xpath(r'//div[@class="bus-excerpt mb15"][1]/div[@class="other fr"]/div[@class="total"]/text()')
    downtotal = tree.xpath(r'//div[@class="bus-excerpt mb15"][2]/div[@class="other fr"]/div[@class="total"]/text()')
    # 上行路线 / 上行站点
    sx = tree.xpath(r'//div[@class="excerpt fl"]/div[@class="trip"]/text()')[0] if ontotal else ''
    content1 = tree.xpath(r'//div[@class="bus-lzlist mb15"][1]/ol//li//text()') if sx else ''
    # 下行路线 / 下行站点 — guard index 1 as well
    trips = tree.xpath(r'//div[@class="excerpt fl"]//div[@class="trip"]/text()')
    xx = trips[1] if downtotal and len(trips) > 1 else ''
    content2 = tree.xpath(r'//div[@class="bus-lzlist mb15"][2]/ol//li//text()') if xx else ''

    # Number every stop ("0", "1", ...); the old zip(range(35), ...) silently
    # truncated routes with more than 35 stops. Also avoids shadowing ord().
    d1 = {str(k): v for k, v in enumerate(content1)}
    d2 = {str(k): v for k, v in enumerate(content2)}
    resu.append({
        '名称': name,
        '运行时间': run_time,
        '价格': price,
        '更新时间': update,
        '上行路线': sx,
        '上行站数': ontotal,
        '上行站点': d1,
        '下行路线': xx,
        '下行站数': downtotal,
        '下行站点': d2,
    })


def main():
    """Crawl the route index, scrape every route in each leading-digit group,
    and write each group's results to <group>路公交车.txt under G:/爬取内容."""
    # renamed from ``re``: the old name shadowed the imported re module
    index_html = get_content(url)
    tree = etree.HTML(index_html)
    num1 = tree.xpath(r'//div[@class="bus-layer depth w120"]//div[1]/div//a/text()')#以几号数字开头
    li1 = tree.xpath(r'//div[@class="bus-layer depth w120"]//div[1]/div//a/@href')#几号数字开头的车的链接
    print(num1)
    print(li1)
    out_dir = 'G:/爬取内容'
    # ``os`` was imported but never used; create the output directory up
    # front instead of letting open() fail when it is missing.
    os.makedirs(out_dir, exist_ok=True)
    #循环得到以几号数字开头的车的链接
    for numb, link in zip(num1, li1):
        print('开始爬取第{}页内容'.format(link))
        resu = []
        number = chehao(link)  # (route names, route detail hrefs)
        #得到各个具体的车的信息
        for ch, n in zip(number[0], number[1]):
            url3 = url + n
            lx = get_content(url3)
            print("url3")
            print(url3)
            tree = etree.HTML(lx)
            paqu(tree, resu)

        filename = str(numb) + '路公交车' + '.txt'
        filepath = out_dir + '/' + filename
        # utf-8 explicitly: the locale default (gbk on Chinese Windows)
        # cannot always encode the scraped text
        with open(filepath, 'w', encoding='utf-8') as fp:
            fp.write(str(resu))
        # old message hardcoded "第一页" on every iteration
        print('第{}页内容爬取完毕,接下来休息3s'.format(link))
        time.sleep(3)  # be polite to the server between index pages

# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()