- pip install requests
使用第三方库 requests 抓取：
import requests
import re
import json
import multiprocessing
from requests import RequestException
def get_html(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The response body decoded to ``str``, or ``None`` when the request
        raises a ``RequestException`` (which includes timeouts).
    """
    try:
        # requests has no default timeout: without one, a server that stops
        # answering would block the caller forever.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    return response.text
# Compiled once at import time. Raw strings keep ``\d`` a regex escape rather
# than an (invalid) string escape, which newer Python versions warn about.
# Capture groups, in order: the first digit run after "board-index", the
# data-src attribute value, the anchor text following "name", the contents of
# the star paragraph, and the contents of the releasetime paragraph.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?(\d+).*?data-src="(.*?)".*?name.*?'
    r'><a.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>',
    re.S,
)


def parse_html(html):
    """Yield one dict per ``<dd>`` movie entry found in ``html``.

    Args:
        html: Page markup as ``str``. ``None`` or an empty string is
            accepted and produces no items.

    Yields:
        A dict with the keys "排名", "电影名", "封面链接", "主演" and
        "上映时间", holding the captured strings. Only the "主演" value has
        its surrounding whitespace stripped.
    """
    if not html:
        # Nothing to parse (e.g. the download failed and returned None);
        # re.findall would raise TypeError on None.
        return
    for rank, cover, title, stars, release in _MOVIE_PATTERN.findall(html):
        yield {
            "排名": rank,
            "电影名": title,
            "封面链接": cover,
            "主演": stars.strip(),
            "上映时间": release,
        }
def write_to_file(text):
    """Append ``text`` to ``moviestop100.txt`` as a single JSON line.

    Args:
        text: JSON-serialisable object. Non-ASCII characters are written
            as-is (UTF-8) rather than as ``\\uXXXX`` escapes.
    """
    with open('moviestop100.txt', mode='a', encoding='utf-8') as out:
        record = json.dumps(text, ensure_ascii=False)
        out.write(record + '\n')
def main(i):
    """Fetch one board page and append every parsed movie to the output file.

    Args:
        i: Value placed in the ``offset`` query parameter of the board URL.
    """
    # NOTE(review): this host looks like a proxy-rewritten form of
    # http://maoyan.com/board/4 -- confirm the intended target before relying
    # on it. The literal is left unchanged here.
    url = 'http://maoyan.com/board/4?offset=' + str(i)
    html = get_html(url)
    if html is None:
        # get_html returns None when the request fails; skip this page
        # instead of handing None to the regex parser, which would raise
        # TypeError and abort the worker.
        return
    for item in parse_html(html):
        write_to_file(item)
if __name__ == "__main__":
    # Offsets 0, 10, ..., 110 -- one call to main() per offset, spread over
    # the pool's worker processes.
    offsets = list(range(0, 120, 10))
    # The context manager terminates the pool on exit so worker processes
    # are not left behind; map() blocks until every offset has been handled,
    # so no work is cut short.
    with multiprocessing.Pool() as pool:
        pool.map(main, offsets)