import requests
import re
import time
from bs4 import BeautifulSoup
from multiprocessing import Pool
import os
import sys, io
# Re-wrap stdout so non-ASCII (Chinese) output prints on GB-encoded Windows consoles.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')# change stdout encoding
# First listing page of category 4; get_maxpage() fetches this to read the pagination bar.
url = 'http://www.panduoduo.net/c/4/1'
# Browser-like User-Agent so requests are not rejected as an obvious bot.
headers = {
# 'Cookie': 'Hm_lvt_11b39dcf34d304adbc3f3f49e67cb940=1483436227; CNZZDATA5767138=cnzz_eid%3D1508734121-1483434808-%26ntime%3D1483434808; sh=%E7%94%B5%E5%BD%B1+++python3%E8%A7%86%E9%A2%91%E6%95%99%E7%A8%8B+++python3%20%E4%BC%A0%E6%99%BA%E6%92%AD%E5%AE%A2+++python3+++%E6%B8%B8%E6%88%8F',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2372.400 QQBrowser/9.5.10548.400'
}
# Get the maximum page number of the category listing.
def get_maxpage():
    """Return the highest page number shown in the category's pagination bar.

    Fetches the module-level ``url``, then reads the second-to-last <span>
    inside the ``div.sep`` pagination block and extracts the first integer
    from its text.

    Returns:
        int: the maximum page number.

    Raises:
        requests.RequestException: on network failure or (via
            ``raise_for_status``) on an HTTP error response.
        IndexError: if the pagination markup is missing or has changed.
    """
    # timeout prevents the whole script from hanging forever on a stalled
    # connection (the original request had none)
    req = requests.get(url, headers=headers, timeout=10)
    # fail loudly on 4xx/5xx instead of trying to parse an error page
    req.raise_for_status()
    page_soup = BeautifulSoup(req.text, 'html.parser')
    # the second-to-last <span> in div.sep carries the total-page text
    page_max = page_soup.find('div', class_='sep').find_all('span')[-2].text
    return int(re.findall(r'\d+', page_max.strip())[0])
def testgetHtml(url):
    """Debug helper: fetch a single listing page and print each resource title.

    Note: the ``url`` parameter shadows the module-level ``url`` constant;
    callers pass the page to fetch explicitly.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # each resource title is an <a class="blue"> inside the div.sep container
    links = soup.find('div', class_='sep').find_all('a', class_='blue')
    for link in links:
        print(link.text)
# Scrape the resource file names from one listing page.
def getHtml(page):
    """Scrape one category listing page and print every resource name.

    Runs inside a worker process of the multiprocessing pool; all output goes
    to stdout.

    Args:
        page: 1-based page number of the category-4 listing to fetch.
    """
    # timeout is essential here: without it a stalled connection would hang
    # this worker and permanently occupy one of the pool's 20 slots
    req = requests.get('http://www.panduoduo.net/c/4/{0}'.format(page),
                       headers=headers, timeout=10)
    print('..............................第{0}页..............................'.format(page))
    page_soup = BeautifulSoup(req.text, 'html.parser')
    # resource titles are the <a class="blue"> links inside div.sep
    url_text = page_soup.find('div', class_='sep').find_all('a', class_='blue')
    for i in url_text:
        print(i.text)
    print('完成第{0}页'.format(page))
if __name__ == '__main__':
    # Pages are numbered 1..max, so range() needs max+1 as its stop value.
    num = get_maxpage() + 1
    start = time.time()
    with Pool(processes=20) as pool:  # 20 concurrent worker processes
        # Submit every page asynchronously; results are printed by the workers.
        for i in range(1, num):
            pool.apply_async(func=getHtml, args=(i,))
        # BUG FIX: os.getppid() is the PARENT's pid; the main process's own
        # pid is os.getpid(), which is what the message claims to print.
        print('----------主进程----------pid=%d' % os.getpid())
        # BUG FIX: Pool.__exit__ calls terminate(), which kills the workers
        # while async tasks are still queued/running, so most pages were
        # never scraped. close() + join() waits for every submitted task.
        pool.close()
        pool.join()
    end = time.time()
    print(end - start)