文章目录
线程和python
全局解释器锁GIL
Python代码的执行由Python虚拟机(也叫解释器主循环)来控制。Python在设计之初就考虑到要在主循环中,同时只有一个线程在执行。虽然 Python 解释器中可以“运行”多个线程,但在任意时刻只有一个线程在解释器中运行。
对Python虚拟机的访问由全局解释器锁(GIL)来控制,正是这个锁能保证同一时刻只有一个线程在运行。
在多线程环境中,Python 虚拟机按以下方式执行:
a、设置 GIL;
b、切换到一个线程去运行;
c、运行指定数量的字节码指令或者线程主动让出控制(可以调用 time.sleep(0));
d、把线程设置为睡眠状态;
e、解锁 GIL;
f、再次重复以上所有步骤。
在调用外部代码(如 C/C++扩展函数)的时候,GIL将会被锁定,直到这个函数结束为止(由于在这期间没有Python的字节码被运行,所以不会做线程切换)编写扩展的程序员可以主动解锁GIL。
threading模块
线程的创建
from threading import Thread
import time


def sayhi(name):
    """Simulate two seconds of work, then print a greeting for *name*."""
    time.sleep(2)
    print('%s say hello' % name)


if __name__ == '__main__':
    # Spawn one worker thread; the main thread's print runs immediately,
    # before the worker wakes from its sleep.
    worker = Thread(target=sayhi, args=('egon',))
    worker.start()
    print('主线程')
from threading import Thread
import time


class Sayhi(Thread):
    """Thread subclass whose run() waits briefly and then greets by name."""

    def __init__(self, name):
        super().__init__()
        # NOTE: this also overrides the Thread object's own `name` attribute.
        self.name = name

    def run(self):
        # Body executed in the new thread once start() is called.
        time.sleep(2)
        print('%s say hello' % self.name)


if __name__ == '__main__':
    t = Sayhi('egon')
    t.start()
    print('主线程')
join
from threading import Thread
import time


def sayhi(name):
    """Sleep two seconds, then print a greeting for *name*."""
    time.sleep(2)
    print('%s say hello' % name)


if __name__ == '__main__':
    t = Thread(target=sayhi, args=('egon',))
    t.start()
    # join() blocks the main thread until sayhi finishes, so the worker's
    # output always appears first and is_alive() is False afterwards.
    t.join()
    print('主线程')
    print(t.is_alive())
'''
egon say hello
主线程
False
'''
守护线程
方法1
from threading import Thread
import time


def sayhi(name):
    """Sleep two seconds, then print a greeting for *name*."""
    time.sleep(2)
    print('%s say hello' % name)


if __name__ == '__main__':
    t = Thread(target=sayhi, args=('egon',))
    # Fix: t.setDaemon(True) has been deprecated since Python 3.10 in
    # favour of assigning the daemon attribute directly.  Either way it
    # must be set before t.start().
    t.daemon = True
    t.start()
    print('主线程')
    # The daemon thread is killed when the main thread exits, so its
    # greeting is never printed; it is still alive at this point.
    print(t.is_alive())
'''
主线程
True
'''
方法2
from threading import Thread
import time

# Daemon demo: t1 is a daemon, t2 is not.  The interpreter only exits once
# every non-daemon thread has finished, so t2's 3-second sleep keeps the
# process alive long enough for daemon t1 to complete its 1-second sleep —
# "end123" is still printed, unlike in the previous example.
def foo():
    print(123)
    time.sleep(1)
    print("end123")

def bar():
    print(456)
    time.sleep(3)
    print("end456")

t1=Thread(target=foo)
t2=Thread(target=bar)
# Must be set before t1.start(); setting it afterwards raises RuntimeError.
t1.daemon=True
t1.start()
t2.start()
print("main-------")
queue队列
使用import queue,用法与进程Queue一样
先进先出
import queue

# FIFO demo: queue.Queue hands items back in the order they were put in.
q = queue.Queue()
for item in ('first', 'second', 'third'):
    q.put(item)
print(q.get())
print(q.get())
print(q.get())
'''
结果(先进先出):
first
second
third
'''
后进先出 # last in first out
import queue

# LIFO demo: queue.LifoQueue behaves like a stack — the most recently
# stored item comes out first.
q = queue.LifoQueue()
for item in ('first', 'second', 'third'):
    q.put(item)
print(q.get())
print(q.get())
print(q.get())
'''
结果(后进先出):
third
second
first
'''
优先级队列
import queue

# PriorityQueue demo: each entry is a (priority, data) tuple.  The smallest
# priority value is retrieved first; non-numeric priorities also work as
# long as the entries are mutually comparable.
q = queue.PriorityQueue()
q.put((20, 'a'))
q.put((10, 'b'))
q.put((30, 'c'))
for _ in range(3):
    print(q.get())
'''
结果(数字越小优先级越高,优先级高的优先出队):
(10, 'b')
(20, 'a')
(30, 'c')
'''
锁
import threading

# Shared counter protected by a global lock.
VALUE = 0
gLock = threading.Lock()


def add_value():
    """Increment the shared counter 100000 times while holding the lock.

    Returns the counter value observed at the end of this batch (the
    original only printed it; returning it is backward-compatible).
    """
    global VALUE
    # Fix: use `with` so the lock is released even if the body raises —
    # the manual acquire()/release() pair would leak the lock on error.
    with gLock:
        for _ in range(100000):
            VALUE += 1
        # Fix: snapshot the value inside the critical section; the original
        # printed VALUE after release(), a racy read under contention.
        result = VALUE
    print('value: %d' % result)
    return result


def main():
    """Run three competing incrementer threads and wait for them all."""
    workers = [threading.Thread(target=add_value) for _ in range(3)]
    for t in workers:
        t.start()
    # Fix: join the workers so main() returns only after all output is
    # produced (the original let them finish on their own schedule).
    for t in workers:
        t.join()


if __name__ == '__main__':
    main()
死锁
所谓死锁: 是指两个或两个以上的进程或线程在执行过程中,因争夺资源而造成的一种互相等待的现象,若无外力作用,它们都将无法推进下去。此时称系统处于死锁状态或系统产生了死锁,这些永远在互相等待的进程称为死锁进程,如下就是死锁
from threading import Lock as Lock
import time

# Deliberate deadlock demo: a plain Lock is NOT reentrant, so the second
# acquire() below blocks forever waiting for a release() that can never
# happen — the same thread already holds the lock.  print(123) is never
# reached.  Running this script hangs by design.
mutexA=Lock()
mutexA.acquire()
mutexA.acquire()
print(123)
mutexA.release()
mutexA.release()
解决方法
解决方法,递归锁,在Python中为了支持在同一线程中多次请求同一资源,python提供了可重入锁RLock。
这个RLock内部维护着一个Lock和一个counter变量,counter记录了acquire的次数,从而使得资源可以被多次acquire。直到一个线程所有的acquire都被release,其他的线程才能获得资源。上面的例子如果使用RLock代替Lock,则不会发生死锁:
from threading import RLock as Lock
import time

mutexA = Lock()
# Same double-acquire as the deadlock demo, but an RLock is reentrant:
# the owning thread may take it repeatedly, and it is only freed once
# every acquire has been matched by a release.  Nested `with` blocks
# perform exactly that acquire/acquire/release/release sequence.
with mutexA:
    with mutexA:
        print(123)
concurrent.futures线程池
# -*- coding:utf-8 -*-
# Author : ZRQ
# Data : 2019/6/27 21:22
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from threading import current_thread
import os, time, random


def task(n):
    """Square *n* after a short random pause that mimics I/O.

    Every worker prints the same pid because threads share one process.
    """
    # Fix: current_thread().name replaces currentThread().getName(),
    # camelCase aliases deprecated since Python 3.10.
    print('%s:%s is running' % (current_thread().name, os.getpid()))
    time.sleep(random.randint(1, 3))  # I/O-bound work: threads beat processes here
    return n ** 2


if __name__ == '__main__':
    start = time.time()
    # Without max_workers, ThreadPoolExecutor picks a CPU-based default
    # (min(32, os.cpu_count() + 4) on modern Pythons).
    p = ThreadPoolExecutor()
    l = []
    for i in range(10):  # 10 tasks fan out across the pool
        obj = p.submit(task, i)  # asynchronous, like Pool.apply_async
        # obj = p.submit(task, i).result()  # calling .result() here would make it synchronous
        # print(obj)
        l.append(obj)
    p.shutdown()  # wait=True by default: equivalent to close() + join()
    print('=' * 30)
    print([obj.result() for obj in l])
    print(time.time() - start)
多线程爬虫
import requests
from urllib import request
from queue import Queue
from lxml import etree
# Fix: the original `import 文件os` named a module that does not exist and
# crashed the script at import time; alias the real os module to the old
# name so the existing `文件os.…` call sites keep working unchanged.
import os as 文件os
import threading
class Finder(threading.Thread):
    """Producer thread: drains listing-page URLs from page_queue, scrapes
    each page's album titles and links, and feeds them into img_queue."""

    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        # Keep consuming until no listing pages remain, then exit.
        while not self.page_queue.empty():
            self.jiexi(self.page_queue.get())

    def jiexi(self, url):
        # Fetch and parse one listing page; the site serves GBK-encoded HTML.
        response = requests.get(url, headers=self.headers)
        response.encoding = 'gbk'
        document = etree.HTML(response.text)
        titles = document.xpath('//div[@class="con"]/div/a/@title')
        links = document.xpath('//div[@class="con"]/div/a/@href')
        # Hand the whole batch to the downloader threads in one item.
        self.img_queue.put((titles, links))
class Downder(threading.Thread):
    """Consumer thread: takes (titles, urls) batches from img_queue and
    downloads every picture of every album into E:\\Picture\\<title>."""

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            # Stop only once the producers are done AND nothing is queued.
            if self.img_queue.empty() and self.page_queue.empty():
                break
            titles, urls = self.img_queue.get()
            print(len(urls))
            for title, url in zip(titles, urls):
                print(title, url)
                self.img_xiazai(title, url)

    def img_xiazai(self, title, url):
        # Fix: the module-level `import 文件os` does not exist; import the
        # real os module locally so this class works on its own.
        import os
        res = requests.get(url)
        res.encoding = 'gbk'
        html = etree.HTML(res.text)
        pic_url = html.xpath('//div[@class="contenta"]/img/@src')
        # One folder per album title.
        rosi_dir = 'E:\\Picture' + '\\' + title
        if not os.path.exists(rosi_dir):
            os.makedirs(rosi_dir)
        # Fix: the original loop reused the name `url`, clobbering the page
        # URL that urljoin() below needs as its base.
        for pic in pic_url:
            # The link tail serves as the file name.
            name = pic.split('-')[-1]
            print(name)
            res = requests.get(pic)
            with open(rosi_dir + '\\' + name, 'wb') as f:
                f.write(res.content)
        # Follow the "下一页" (next page) link, found precisely via contains().
        next_url = html.xpath('//div[@class="page"]/ul/a[contains(string(),"下一页")]/@href')
        if next_url:
            # Fix: xpath() returns a list — take its first element; the
            # original passed the list itself to urljoin().
            self.img_xiazai(title, request.urljoin(url, next_url[0]))
def main():
    """Wire up the page/image queues, then launch 2 finder and 5 downloader threads."""
    page_queue = Queue(6)
    img_queue = Queue(1000)
    # Seed the six listing-page URLs the finders will scrape.
    for page in range(1, 7):
        page_queue.put('https://www.meinvtu123.net/a/47/list_47_{}.html'.format(page))
    for _ in range(2):
        Finder(page_queue, img_queue).start()
    for _ in range(5):
        Downder(page_queue, img_queue).start()


if __name__ == '__main__':
    main()