导入
import requests
from lxml import etree
具体使用
import requests
from lxml import etree
import os,time
def mz_spider(base_url, headers):
    """Crawl one listing page and process every gallery it links to.

    Args:
        base_url: URL of the listing page to fetch.
        headers: Mapping of HTTP request headers sent with the request.

    Each href found under ``div.TypeList`` is handed to ``img_parse``.
    """
    # Pass headers by keyword: the second positional parameter of
    # requests.get() is ``params``, so a positional dict would be encoded
    # into the query string instead of being sent as HTTP headers.
    res = requests.get(base_url, headers=headers)
    html = etree.HTML(res.text)
    for img_url in html.xpath('//div[@class="TypeList"]/ul/li/a/@href'):
        img_parse(img_url)
def img_parse(img_url):
    """Parse a gallery's first page and download the image of every page.

    Args:
        img_url: URL of the gallery's first page (expected to end in
            ``.htm``); follow-up pages are derived from it.

    The gallery title and the page count are read from the first page, then
    ``download_img`` is called once per page, including the first one.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        'Referer': 'http://www.umei.cc/tags/qingchun_1.htm'
    }
    # Pass headers by keyword: positionally they would be treated as
    # ``params`` and appended to the query string.
    res = requests.get(img_url, headers=headers)
    res.encoding = res.apparent_encoding
    html = etree.HTML(res.text)

    # Gallery title (used by download_img as the folder name).
    titles = html.xpath("//div[@class='ArticleTitle']/strong/text()")
    if not titles:
        print('No title found, skipping gallery: ' + img_url)
        return
    title = titles[0]

    # Total page count: the second-to-last pagination link text.
    # NOTE(review): a missing or non-numeric entry is treated as a
    # single-page gallery -- confirm this matches the site's markup.
    page_links = html.xpath('//div[@class="NewPages"]/ul/li/a/text()')
    try:
        page_count = int(page_links[-2])
    except (IndexError, ValueError):
        page_count = 1

    # Page URLs follow this pattern:
    #   http://www.umei.cc/meinvtupian/meinvxiezhen/198190.htm
    #   http://www.umei.cc/meinvtupian/meinvxiezhen/198190_2.htm
    #   http://www.umei.cc/meinvtupian/meinvxiezhen/198190_3.htm
    # The first page has no numeric suffix, so fetch it explicitly; looping
    # from 2 alone would silently drop the first image of every gallery.
    download_img(img_url, title)

    stem = img_url.split('.htm')[0]
    for num in range(2, page_count + 1):
        download_img('{}_{}.htm'.format(stem, num), title)
def download_img(img_src, title, headers=None):
    """Download the image shown on one gallery page and save it to disk.

    Args:
        img_src: URL of the gallery page that embeds the image.
        title: Gallery title; with spaces removed it becomes the name of
            the sub-directory under ``qing_chun`` the image is saved in.
        headers: Optional mapping of HTTP request headers. When omitted, a
            built-in User-Agent/Referer pair is used, so the function no
            longer depends on a module-level ``headers`` variable that only
            exists when the file is executed as a script.
    """
    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36 ',
            'Referer': 'http://www.umei.cc/tags/qingchun_1.htm'
        }

    res = requests.get(img_src, headers=headers)
    html = etree.HTML(res.text)

    # Direct URL of the image file.
    img_links = html.xpath('//div[@class="ImageBody"]/p//img/@src')
    if not img_links:
        print('No image found on page: ' + img_src)
        return
    img_dizhi = img_links[0]

    img_name = img_dizhi.split('/')[-1]
    title = title.replace(" ", '')
    # os.path.join picks the right separator for the platform; a literal
    # backslash only produces nested directories on Windows.
    save_dir = os.path.join('qing_chun', title)
    os.makedirs(save_dir, exist_ok=True)

    res = requests.get(img_dizhi, headers=headers)
    with open(os.path.join(save_dir, img_name), "wb") as f:
        f.write(res.content)
    print(title+'-'+img_name+'文件保存成功')
if __name__ == '__main__':
    # Script entry point: walk listing pages 1 through 17 in order.
    # `headers` stays bound at module level under this exact name because
    # code elsewhere in the file may look it up as a global.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36 ',
        'Referer': 'http://www.umei.cc/tags/qingchun_1.htm',
    }
    listing_template = "http://www.umei.cc/tags/qingchun_{}.htm"
    first_page, last_page = 1, 17
    for page_url in map(listing_template.format, range(first_page, last_page + 1)):
        mz_spider(page_url, headers)
主要考察xpath的使用
xpath使用链接
requests官方文档:http://2.python-requests.org/zh_CN/latest/user/quickstart.html