1、Python 多线程
参考文档:Python 异步编程 多线程
2、Python 多线程爬虫
相关文档:Python 爬虫入门教程
多线程对爬虫的效率提高是很有效的,但使用 Python 的多线程需要注意,Python 的多线程并不如 Java 的多线程,其差异在于当 Python 解释器开始执行任务时,受制于 GIL(全局解释器锁),Python 的线程被限制到同一时刻只允许一个线程执行这样一个执行模型。Python 的线程更适用于处理 I/O 和其他需要并发执行的阻塞操作(如等待 I/O、等待从数据库获取数据等),而不是需要多处理器并行的计算密集型任务。爬虫的耗时主要在网络请求上,因此可以使用多线程来编写爬虫,以提高性能。
1)多线程使用示例
from queue import Queue
from threading import Thread


def producer(out_q):
    """Endlessly push the value 1 onto the shared queue.

    Blocks on ``put`` when the queue has a ``maxsize`` and is full.
    Runs forever — intended as a minimal demo, not production code.
    """
    while True:
        out_q.put(1)


def consumer(in_q):
    """Endlessly pull items off the shared queue.

    ``get`` blocks until an item is available, so the consumer
    naturally waits for the producer. Runs forever.
    """
    while True:
        data = in_q.get()


if __name__ == '__main__':
    # Wire one producer and one consumer to the same thread-safe queue.
    # Neither thread ever exits; stop the demo with Ctrl-C.
    q = Queue()
    t1 = Thread(target=consumer, args=(q, ))
    t2 = Thread(target=producer, args=(q, ))
    t1.start()
    t2.start()
2)多线程爬虫
import threading
import requests
import time
import queue as Queue
# Read the list of target URLs from weblist.txt, one URL per line.
# Stripping '\n' (rather than full strip) mirrors the original logic:
# only the newline is removed, other surrounding whitespace is kept.
with open('weblist.txt', 'r') as f:
    link_list = [line.replace('\n', '') for line in f.readlines()]

start = time.time()  # wall-clock start of the whole crawl, reported at the end
# Worker-thread class: the standard "subclass threading.Thread" pattern.
class myThread(threading.Thread):
    """Worker that repeatedly pulls URLs from a shared queue via crawler().

    The thread exits when the queue has been empty for 2 seconds
    (crawler's ``q.get(timeout=2)`` raises ``queue.Empty``).
    """

    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name  # human-readable label used in progress output
        self.q = q        # shared work queue of URLs

    def run(self):
        print("Starting " + self.name)
        while True:
            try:
                crawler(self.name, self.q)
            except Queue.Empty:
                # Queue drained (get timed out): no more work, stop cleanly.
                # Narrowed from a bare `except:` which also swallowed
                # KeyboardInterrupt and hid genuine bugs in crawler().
                break
        print("Exiting " + self.name)
# Per-thread work unit: fetch a single URL taken from the shared queue.
def crawler(threadName, q):
    """Pop one URL from ``q`` and request it, logging the outcome.

    Deliberately lets ``queue.Empty`` (raised when ``get`` times out
    after 2 seconds) propagate to the caller, which uses it as the
    "no more work" signal. Network failures are caught and reported
    here so they never kill the worker thread.
    """
    url = q.get(timeout=2)  # raises queue.Empty when the queue stays empty
    try:
        response = requests.get(url, timeout=20)
        print(q.qsize(), threadName, response.status_code, url)
    except Exception as e:
        print(q.qsize(), threadName, url, 'Error: ', e)
# One worker thread will be created per name in this list.
threadList = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5"]
workQueue = Queue.Queue(1000)  # bounded FIFO shared by all workers
threads = []

# Start the workers first; they begin polling the (still empty) queue
# immediately and survive on the 2-second get() timeout in crawler().
for tName in threadList:
    worker = myThread(tName, workQueue)
    worker.start()
    threads.append(worker)

# Hand every URL to the workers through the shared queue.
for url in link_list:
    workQueue.put(url)

# Block until each worker has drained the queue, timed out, and exited.
for t in threads:
    t.join()

end = time.time()
print('Queue多线程爬虫的总时间为:', end - start)
print("Exiting Main Thread")