import requests
from urllib.parse import urlencode
import json
def get_page(offset):
    """Fetch one page of Toutiao search results as parsed JSON.

    Args:
        offset: paging offset passed straight to the search API
            (multiples of 20 — one page holds 20 items).

    Returns:
        The decoded JSON payload (dict) on HTTP 200, otherwise None.
    """
    params = {
        'offset': offset,
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": "20",
        "cur_tab": "1",
        "from": "search_tab",
    }
    headers = {
        "User-Agent": "Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50"
    }
    url = "https://www.toutiao.com/search_content/?" + urlencode(params)
    try:
        # timeout so a stalled connection cannot hang a pool worker forever
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
        return None
    # RequestException covers ConnectionError, Timeout and friends,
    # so any transport-level failure degrades to None instead of crashing.
    except requests.RequestException:
        return None
def get_images(json):
    """Yield one dict per picture found in a search-result payload.

    Args:
        json: the payload returned by get_page — a dict with a "data"
            list — or None when the fetch failed. (The parameter name
            shadows the json module; kept for caller compatibility.)

    Yields:
        {"image": <full http URL>, "title": <item title>} for every
        entry of each item's "image_list".
    """
    # get_page returns None on failure; yield nothing instead of raising.
    if not json:
        return
    for item in json.get("data") or []:
        image_list = item.get("image_list")
        if not image_list:
            continue
        title = item.get("title")
        for picture in image_list:
            # API hands back protocol-relative URLs ("//p3...")
            yield {
                "image": "http:" + picture.get("url"),
                "title": title,
            }
import os
from hashlib import md5
def save_image(item):
    """Download one image into D:\\Ajxa\\<title>\\<md5>.jpg.

    Args:
        item: dict with "image" (full URL) and "title" keys, as yielded
            by get_images. NOTE(review): the title is used verbatim as a
            directory name — titles containing characters invalid on
            Windows would fail here; confirm against real API data.

    Side effects:
        Creates the per-title directory if needed and writes the image,
        skipping files already present (content-addressed by MD5).
    """
    # Raw string: "D:\Ajxa" contained the invalid escape sequence \A.
    path = os.path.join(r"D:\Ajxa", item.get("title"))
    # makedirs+exist_ok replaces the exists()/mkdir() pair, which raced
    # between pool workers saving items that share a title.
    os.makedirs(path, exist_ok=True)
    try:
        response = requests.get(item.get("image"), timeout=10)
        if response.status_code == 200:
            # MD5 of the content as filename deduplicates identical images.
            file_path = os.path.join(path, md5(response.content).hexdigest()) + ".jpg"
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print("已经下载过了")
    # Broadened from ConnectionError so timeouts are also handled.
    except requests.RequestException:
        print("连接失败")
from multiprocessing.pool import Pool
def main(offset):
    """Fetch one result page at *offset* and download every image in it.

    Entry point handed to the process pool; prints each item before
    saving so progress is visible per worker.
    """
    page = get_page(offset)
    # get_page returns None on network failure / non-200 — nothing to do.
    if page is None:
        return
    for item in get_images(page):
        print(item)
        save_image(item)
# Pages 1..20 of search results are fetched in parallel.
group_start = 1
group_end = 20

if __name__ == "__main__":
    # Each page holds 20 items, so the API offset advances in steps of 20.
    offsets = [page * 20 for page in range(group_start, group_end + 1)]
    pool = Pool()
    pool.map(main, offsets)
    pool.close()
    pool.join()