Practicing multi-threaded crawling
import threading
import requests
import json
from lxml import html
etree = html.etree  # works around lxml builds where "from lxml import etree" is unavailable
from queue import Queue, Empty
# Holds the crawl (fetch) threads
clist = []
# Holds the parse threads
jlist = []
# Crawl thread: takes page numbers off page_queue, fetches each page,
# and pushes the raw HTML onto data_queue
class CrawlThread(threading.Thread):
    def __init__(self, name, page_queue, data_queue):
        super(CrawlThread, self).__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.url = 'http://www.fanjian.net/jiantu-{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        }

    def run(self):
        print("{}...... thread started".format(self.name))
        while 1:  # loop until the page queue is drained
            # Exit condition: stop once there are no pages left.
            # get_nowait() avoids the race between a separate empty()
            # check and get() when several crawl threads share the queue.
            try:
                page = self.page_queue.get_nowait()
            except Empty:
                break
            # Build the URL for this page and send the request
            url = self.url.format(page)
            r = requests.get(url, headers=self.headers)
            # Hand the response body over to the parse threads
            self.data_queue.put(r.text)
        print("{}...... thread finished".format(self.name))
# Parse thread: pulls raw HTML off data_queue, extracts the items,
# and appends them to the shared output file
class ParseThread(threading.Thread):
    def __init__(self, name, data_queue, fp, lock):
        super(ParseThread, self).__init__()
        self.name = name
        self.data_queue = data_queue
        self.fp = fp
        self.lock = lock

    def run(self):
        print("{}...... thread started".format(self.name))
        while 1:  # loop until no data arrives for 10 seconds
            # Exit condition: an empty() check would make this thread quit
            # before the crawl threads produce anything, so instead block
            # on get() with a 10-second timeout and stop once it expires
            try:
                data = self.data_queue.get(True, 10)
            except Empty:
                break
            # Parse one page of data
            self.parse_content(data)
        print("{}...... thread finished".format(self.name))
    def parse_content(self, data):
        tree = etree.HTML(data)
        items = []
        # Find all the <li> nodes first, then query within each one
        lilist = tree.xpath('//ul[@class="cont-list"]//li')
        for li in lilist:
            # Image title: query relative to the current <li> (li.xpath),
            # not the whole tree, or every item would repeat the first match
            title = li.xpath('.//h2/a/text()')[0]
            # Image URL: lazy-loaded, so it lives in data-src rather than src
            imag = li.xpath('.//p/img/@data-src')[0]
            item = {
                'title': title,
                'image': imag,
            }
            items.append(item)
        # Write to the file: take the lock first so concurrent parse
        # threads cannot interleave their writes
        self.lock.acquire()
        self.fp.write(json.dumps(items, ensure_ascii=False) + '\n')
        # Release the lock
        self.lock.release()
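# queue.Queue is already thread-safe, so put()/get() need no extra locking;
# the Lock above only serializes writes to the shared file handle.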
# Build the two work queues
def create_queue():
    # Page-number queue: pages 1 through 10
    page_queue = Queue()
    for page in range(1, 11):
        page_queue.put(page)
    # Content queue, filled later by the crawl threads
    data_queue = Queue()
    return page_queue, data_queue
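# Queue() is unbounded; Queue(maxsize=20) would apply backpressure if the
# crawl threads ever ran far ahead of the parse threads.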
# Create the crawl threads
def create_crawl_thread(page_queue, data_queue):
    crawl_names = ['crawl-1', 'crawl-2', 'crawl-3']
    for name in crawl_names:
        tcrawl = CrawlThread(name, page_queue, data_queue)
        clist.append(tcrawl)

# Create the parse threads
def create_parse_thread(data_queue, fp, lock):
    parse_names = ['parse-1', 'parse-2', 'parse-3']
    for name in parse_names:
        tparse = ParseThread(name, data_queue, fp, lock)
        jlist.append(tparse)
def main():
    # Build the queues
    page_queue, data_queue = create_queue()
    # Open the output file
    fp = open('jian.json', 'w', encoding='utf8')
    # Lock that serializes writes to the shared file
    lock = threading.Lock()
    # Create the crawl threads
    create_crawl_thread(page_queue, data_queue)
    # Create the parse threads; their 10-second get() timeout keeps them
    # alive while data_queue is still empty at startup, so no sleep is
    # needed before starting them
    create_parse_thread(data_queue, fp, lock)
    # Start all crawl threads
    for tcrawl in clist:
        tcrawl.start()
    # Start all parse threads
    for tparse in jlist:
        tparse.start()
    # Main thread waits for every worker to finish
    for tcrawl in clist:
        tcrawl.join()
    for tparse in jlist:  # was iterating clist here, a copy-paste bug
        tparse.join()
    # Close the file only after all parse threads are done writing
    fp.close()
    print('Main thread and all worker threads finished')
if __name__ == '__main__':
main()
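The timeout-based shutdown above is a judgment call: if the site takes longer than 10 seconds to respond, the parse threads may quit early. A common alternative is a sentinel ("poison pill"): once every crawl thread has finished, the main thread puts one sentinel per parse thread onto data_queue, and each parse thread exits when it sees one. A minimal standalone sketch of that pattern (the names here are illustrative, not part of the script above):

import threading
from queue import Queue

SENTINEL = None  # poison pill meaning "no more data is coming"

def parse_worker(data_queue):
    while True:
        data = data_queue.get()
        if data is SENTINEL:  # shutdown signal: exit cleanly
            break
        print('parsing', data)  # stand-in for real parsing work

data_queue = Queue()
workers = [threading.Thread(target=parse_worker, args=(data_queue,))
           for _ in range(3)]
for w in workers:
    w.start()
for page_html in ['<html>page 1</html>', '<html>page 2</html>']:
    data_queue.put(page_html)
for _ in workers:  # one sentinel per worker so each one wakes up and exits
    data_queue.put(SENTINEL)
for w in workers:
    w.join()
print('all parse workers exited via sentinel')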