This is a simple crawler I put together, for anyone who needs one.
Before using it, please make sure you follow:
Web Crawling Etiquette
When a web crawler collects data from the web, it should follow certain rules of conduct, often called "web crawling etiquette". Following them is not only a matter of professional integrity; it also helps you avoid legal trouble and keeps the load on the target servers reasonable.
Basic Etiquette Guidelines
1. Respect the robots.txt protocol
- Check the target site's robots.txt file: this is the standard way for a site to tell crawlers what may and may not be fetched
- Obey the Disallow rules: if the site explicitly forbids crawling certain directories or pages, stay away from them
- Example: in Python you can parse robots.txt with the urllib.robotparser module, as in the sketch below
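A minimal sketch of how that check might look; the site URL and the "MyCrawler" user-agent string are placeholders, not part of the crawler further down:

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url("https://example.com/robots.txt")  # placeholder site
rp.read()

# can_fetch() reports whether the given user agent may crawl the path
if rp.can_fetch("MyCrawler", "https://example.com/some/page"):
    print("Allowed by robots.txt")
else:
    print("Disallowed by robots.txt")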
2. Set a reasonable crawl rate
- Add a delay between requests: wait a few seconds between requests (e.g. 3-10 seconds)
- Avoid peak hours: crawl during the site's low-traffic periods when possible
- Limit concurrency: do not fire too many requests at the same time (see the sketch below)
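To illustrate the delay and concurrency points, here is a minimal sketch of a polite fetch loop that requests pages one at a time with a fixed pause; the example URLs and the 3-second delay are placeholders:

import time
import requests

session = requests.Session()  # reuse one connection instead of opening many
urls = ["https://example.com/a", "https://example.com/b"]  # placeholder URLs

for url in urls:  # sequential requests, no concurrency
    response = session.get(url, timeout=10)
    print(url, response.status_code)
    time.sleep(3)  # pause between requests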
If you are completely new to this, remember to install the requests and bs4 libraries with pip first (see the command below).
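For example (note that bs4 is published on PyPI as beautifulsoup4):

pip install requests beautifulsoup4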
Here is the code:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import os
import re
from datetime import datetime
class EnhancedWebCrawler:
def __init__(self, base_url, max_pages=10, delay=1, download_videos=False, download_images=False):
self.base_url = base_url
self.max_pages = max_pages
self.delay = delay
self.download_videos = download_videos
self.download_images = download_images
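        # Track visited URLs so each page is fetched at most once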
self.visited_urls = set()
self.domain = urlparse(base_url).netloc
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
self.video_formats = ('.mp4', '.webm', '.mov', '.avi', '.mkv', '.flv')
self.image_formats = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp')
        # Create the output directory structure
self.base_dir = 'crawled_data'
self.text_dir = os.path.join(self.base_dir, 'texts')
self.video_dir = os.path.join(self.base_dir, 'videos')
self.image_dir = os.path.join(self.base_dir, 'images')
os.makedirs(self.text_dir, exist_ok=True)
if self.download_videos:
os.makedirs(self.video_dir, exist_ok=True)
if self.download_images:
os.makedirs(self.image_dir, exist_ok=True)
def is_valid_url(self, url):
"""检查URL是否有效且属于同一域名"""
parsed = urlparse(url)
return bool(parsed.netloc) and parsed.netloc == self.domain
def get_page_content(self, url):
"""获取页面内容"""
try:
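            # Pause before each request so we do not overload the server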
time.sleep(self.delay)
            response = requests.get(url, headers=self.headers, timeout=10)
response.raise_for_status()
return response.text
except requests.RequestException as e:
print(f"获取 {url} 失败: {e}")
return None
def extract_links(self, html):
"""从HTML中提取链接"""
soup = BeautifulSoup(html, 'html.parser')
links = set()
for a_tag in soup.find_all('a', href=True):
href = a_tag['href']
full_url = urljoin(self.base_url, href)
if full_url.startswith('http') and self.is_valid_url(full_url):
links.add(full_url)
return links
def extract_video_links(self, html):
"""从HTML中提取视频链接"""
soup = BeautifulSoup(html, 'html.parser')
video_links = set()
for video_tag in soup.find_all('video'):
if video_tag.get('src'):
video_url = urljoin(self.base_url, video_tag['src'])
if self.is_video_url(video_url):
video_links.add(video_url)
for iframe in soup.find_all('iframe'):
if iframe.get('src') and 'youtube.com/embed/' in iframe['src']:
video_links.add(iframe['src'])
for a_tag in soup.find_all('a', href=True):
href = a_tag['href']
if self.is_video_url(href):
video_url = urljoin(self.base_url, href)
video_links.add(video_url)
return video_links
def extract_image_links(self, html):
"""从HTML中提取图片链接"""
soup = BeautifulSoup(html, 'html.parser')
image_links = set()
for img_tag in soup.find_all('img', src=True):
img_url = urljoin(self.base_url, img_tag['src'])
if self.is_image_url(img_url):
image_links.add(img_url)
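        # Also pick up images referenced from inline style attributes (CSS url(...))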
for tag in soup.find_all(style=True):
style = tag['style']
            urls = re.findall(r'url\((.*?)\)', style)
for url in urls:
img_url = urljoin(self.base_url, url.strip('"\''))
if self.is_image_url(img_url):
image_links.add(img_url)
for a_tag in soup.find_all('a', href=True):
href = a_tag['href']
if self.is_image_url(href):
img_url = urljoin(self.base_url, href)
image_links.add(img_url)
return image_links
def is_video_url(self, url):
"""检查URL是否是视频链接"""
return any(url.lower().endswith(fmt) for fmt in self.video_formats) or 'youtube.com' in url
def is_image_url(self, url):
"""检查URL是否是图片链接"""
return any(url.lower().endswith(fmt) for fmt in self.image_formats)
def download_media(self, media_url, media_type='image'):
"""下载媒体文件(图片或视频)"""
try:
            # Handle YouTube links (not downloaded directly)
            if media_type == 'video' and 'youtube.com' in media_url:
                print(f"Detected a YouTube video: {media_url}")
                print("Use a dedicated library such as youtube-dl or pytube for YouTube videos")
                return None
            # Derive a file name from the URL
parsed_url = urlparse(media_url)
filename = os.path.basename(parsed_url.path)
            # If there is no usable file name, generate one
if not filename or '.' not in filename:
ext = '.mp4' if media_type == 'video' else '.jpg'
filename = f"{media_type}_{int(time.time())}{ext}"
else:
                # Make sure the file name has the right extension
ext = os.path.splitext(filename)[1].lower()
if media_type == 'video' and ext not in self.video_formats:
filename += '.mp4'
elif media_type == 'image' and ext not in self.image_formats:
filename += '.jpg'
            # Decide where to save the file
if media_type == 'video':
save_path = os.path.join(self.video_dir, filename)
else:
save_path = os.path.join(self.image_dir, filename)
print(f"开始下载{media_type}: {media_url}")
response = requests.get(media_url, headers=self.headers, stream=True)
response.raise_for_status()
with open(save_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
print(f"{media_type.capitalize()}已保存到: {save_path}")
return save_path
except Exception as e:
print(f"下载{media_type}失败: {e}")
return None
def extract_content(self, html, url):
"""提取页面主要内容"""
soup = BeautifulSoup(html, 'html.parser')
        # Remove tags we do not need
for script in soup(['script', 'style', 'nav', 'footer']):
script.decompose()
        # Page title
        title = soup.title.string if soup.title else 'Untitled'
        # Body text
text = ' '.join(soup.stripped_strings)
        # Extract video information
video_links = self.extract_video_links(html)
video_info = []
if self.download_videos and video_links:
print(f"在页面 {url} 中发现 {len(video_links)} 个视频")
for video_url in video_links:
video_path = self.download_media(video_url, 'video')
if video_path:
video_info.append({
'url': video_url,
'local_path': os.path.relpath(video_path, self.base_dir)
})
        # Extract image information
image_links = self.extract_image_links(html)
image_info = []
if self.download_images and image_links:
print(f"在页面 {url} 中发现 {len(image_links)} 张图片")
for img_url in image_links:
img_path = self.download_media(img_url, 'image')
if img_path:
image_info.append({
'url': img_url,
'local_path': os.path.relpath(img_path, self.base_dir)
})
return {
'url': url,
'title': title,
'content': text,
'timestamp': datetime.now().isoformat(),
'videos': video_info,
'images': image_info
}
def save_content(self, content):
"""保存爬取的内容到文件"""
# 使用URL的MD5作为文件名
safe_title = re.sub(r'[\\/*?:"<>|]', '_', content['title'])
filename = os.path.join(self.text_dir, f"{safe_title[:50]}.txt")
with open(filename, 'w', encoding='utf-8') as f:
f.write(f"URL: {content['url']}\n")
f.write(f"标题: {content['title']}\n")
f.write(f"爬取时间: {content['timestamp']}\n")
if content['videos']:
f.write("\n===== 视频信息 =====\n")
for video in content['videos']:
f.write(f"视频URL: {video['url']}\n")
f.write(f"本地路径: {video['local_path']}\n\n")
if content['images']:
f.write("\n===== 图片信息 =====\n")
for img in content['images']:
f.write(f"图片URL: {img['url']}\n")
f.write(f"本地路径: {img['local_path']}\n\n")
f.write("\n===== 内容 =====\n")
f.write(content['content'])
print(f"已保存文本内容到: {filename}")
def crawl(self, start_url=None):
"""开始爬取"""
start_url = start_url or self.base_url
queue = [start_url]
pages_crawled = 0
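        # Breadth-first traversal: take URLs from the front of the queue, append newly found ones at the back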
while queue and pages_crawled < self.max_pages:
current_url = queue.pop(0)
if current_url in self.visited_urls:
continue
print(f"\n正在爬取: {current_url}")
html = self.get_page_content(current_url)
if html:
self.visited_urls.add(current_url)
content = self.extract_content(html, current_url)
self.save_content(content)
pages_crawled += 1
                # Add newly discovered links to the queue
new_links = self.extract_links(html)
queue.extend(new_links - self.visited_urls)
if __name__ == "__main__":
print("=== 增强版网站爬虫程序 ===")
print("请遵守目标网站的robots.txt规则,不要过度爬取")
# 用户输入目标URL
target_url = input("请输入要爬取的网站URL: ").strip()
if not target_url.startswith(('http://', 'https://')):
target_url = 'http://' + target_url
    # Ask whether to download videos
    download_videos = input("Download videos? (y/n): ").strip().lower() == 'y'
    # Ask whether to download images
    download_images = input("Download images? (y/n): ").strip().lower() == 'y'
    # Create the crawler instance
crawler = EnhancedWebCrawler(
base_url=target_url,
max_pages=20,
delay=2,
download_videos=download_videos,
download_images=download_images
)
    # Start crawling
try:
crawler.crawl()
print(f"\n爬取完成! 共爬取了 {len(crawler.visited_urls)} 个页面")
print(f"所有数据已保存到: {os.path.abspath(crawler.base_dir)}")
except KeyboardInterrupt:
print("\n用户中断,爬取已停止")
except Exception as e:
print(f"爬取过程中出错: {e}")
Remember to follow me, I'm also a newbie on CSDN awa