Page URL: http://www.mm131.com/qingchun/
To keep things simple, all we need is the src attribute of each img tag, i.e. the image's URL.
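As a minimal illustration of that idea (a sketch only, assuming the page source is already in a string named `html` and that the lxml parser is installed), pulling out the src attributes looks like this:

```python
# Minimal sketch: collect the src attribute of every <img> tag.
# Assumes `html` already holds the page source as a string.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
srcs = [img['src'] for img in soup.find_all('img') if img.has_attr('src')]
print(srcs)
```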
Sample code
**Main steps**
1. Fetch the page data
2. Parse the page and extract the image URLs
3. Download each image by its URL and save it to a file
```python
# CrawBeaGirlImage.py
import os

import requests
from bs4 import BeautifulSoup


# Fetch the page at `url` and return its text, decoded with `code`
def getHtmlText(url, code):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        return ''


# Parse the page and append the image URLs to `images`
def parseHtml(images, url):
    html = getHtmlText(url, 'utf-8')
    soup = BeautifulSoup(html, 'lxml')
    imgs = soup.find_all('img')
    for img in imgs:
        try:
            images.append(img.attrs['src'])
            print(img.attrs['src'])
        except KeyError:
            # <img> tag without a src attribute -- skip it
            continue


# Download each image by its URL and save it to a file
def storeImages(images):
    root = '/Users/xiaolian/images/'
    if not os.path.exists(root):
        os.mkdir(root)  # create the folder if it does not exist
    for i in range(len(images)):
        # keep the file names simple: just use the index
        path = root + str(i) + '.jpg'
        try:
            r = requests.get(images[i])
            r.raise_for_status()
            # open the file in binary mode ('wb'); otherwise writing bytes
            # raises TypeError: write() argument must be str, not bytes
            with open(path, 'wb') as f:
                f.write(r.content)
            print('image saved:', path)
        except requests.RequestException:
            print('failed to save image:', images[i])
            continue


def main():
    url = 'http://www.mm131.com/'
    images = []
    parseHtml(images, url)
    storeImages(images)


if __name__ == '__main__':
    main()
```
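Two practical caveats that the script above does not cover: a src attribute may be a relative path rather than a full URL, and some image hosts reject requests that lack a Referer header (anti-hotlinking). The sketch below shows one way to handle both; the header values and the `download` helper are illustrative assumptions, not something the original code does or that this particular site is confirmed to require.

```python
# Hedged sketch: normalize relative src values and send Referer/User-Agent headers.
# `page_url` is assumed to be the page the <img> tag came from.
from urllib.parse import urljoin

import requests


def download(page_url, src, path):
    full_url = urljoin(page_url, src)   # works for both relative and absolute src
    headers = {
        'User-Agent': 'Mozilla/5.0',    # some sites block the default requests UA
        'Referer': page_url,            # some hosts refuse hotlinked image requests
    }
    r = requests.get(full_url, headers=headers, timeout=10)
    r.raise_for_status()
    with open(path, 'wb') as f:
        f.write(r.content)
```

If downloads still fail, printing `r.status_code` for a failing URL usually tells you whether the problem is a missing header, a bad URL, or something else on the server side.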