1、Python 多线程
参考文档:Python 异步编程 多线程
2、Python 多线程爬虫
相关文档:Python 爬虫入门教程
多线程对爬虫的效率提高是很有效的,但使用 Python 的多线程需要注意:Python 的多线程并不如 Java 的多线程,其差异在于当 Python 解释器开始执行任务时,受制于 GIL(全局解释器锁,Global Interpreter Lock),Python 的线程被限制到同一时刻只允许一个线程执行这样一个执行模型。因此,Python 的线程更适用于处理 I/O 和其他需要并发执行的阻塞操作(如等待 I/O、等待从数据库获取数据等),而不是需要多处理器并行的计算密集型任务。爬虫的耗时主要在网络请求上,因此可以使用多线程来编写爬虫,从而提高性能。
1)多线程使用示例
from queue import Queue
from threading import Thread


def producer(out_q):
    """Endlessly push items onto the shared queue.

    Demo producer: enqueues the constant 1 forever.
    """
    while True:
        out_q.put(1)


def consumer(in_q):
    """Endlessly pull items off the shared queue.

    Demo consumer: retrieves each item and discards it.
    """
    while True:
        data = in_q.get()


if __name__ == '__main__':
    # One thread-safe Queue connects the producer and consumer threads.
    q = Queue()
    t1 = Thread(target=consumer, args=(q, ))
    t2 = Thread(target=producer, args=(q, ))
    t1.start()
    t2.start()
2)多线程爬虫
import threading
import requests
import time
import queue as Queue

# Read the list of target URLs, one per line, from weblist.txt.
link_list = []
with open('weblist.txt', 'r') as f:
    file_list = f.readlines()
    for eachone in file_list:
        link = eachone.replace('\n', '')
        link_list.append(link)

start = time.time()


class myThread(threading.Thread):
    """Worker thread: repeatedly pulls URLs from the shared queue and fetches them."""

    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name  # thread label used in progress output
        self.q = q        # shared work queue of URLs

    def run(self):
        print("Starting " + self.name)
        while True:
            try:
                crawler(self.name, self.q)
            except Queue.Empty:
                # q.get(timeout=2) found the queue drained: this worker is done.
                # Catching only queue.Empty (instead of a bare except) keeps
                # real bugs in crawler() visible rather than silently exiting.
                break
        print("Exiting " + self.name)


def crawler(threadName, q):
    """Fetch one URL taken from the queue.

    Raises queue.Empty (from q.get) after 2 seconds with nothing to do,
    which is the worker's shutdown signal.
    """
    url = q.get(timeout=2)
    try:
        r = requests.get(url, timeout=20)
        print(q.qsize(), threadName, r.status_code, url)
    except Exception as e:
        # A network failure for one URL must not kill the worker thread;
        # log it and let the worker continue with the next URL.
        print(q.qsize(), threadName, url, 'Error: ', e)


threadList = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5"]
workQueue = Queue.Queue(1000)
threads = []

# Create and start the worker threads.
for tName in threadList:
    thread = myThread(tName, workQueue)
    thread.start()
    threads.append(thread)

# Fill the queue; workers tolerate up to 2 seconds of an empty queue,
# so enqueueing right after start() is safe.
for url in link_list:
    workQueue.put(url)

# Wait for every worker to drain the queue and exit.
for t in threads:
    t.join()

end = time.time()
print('Queue多线程爬虫的总时间为:', end - start)
print("Exiting Main Thread")