This is implemented on Python 3. Compared with the author's original code it is missing the proxy-support part and the crawl-trap-avoidance part, and I have added my own comments to the code. This is my understanding so far; if any readers spot mistakes, please point them out. Still a beginner, learning as I go...
import urllib.request
import urllib.error
import re
import urllib.parse
from urllib.parse import urljoin
import urllib.robotparser
import time
from datetime import datetime
def download(url, user_agent='wswp', proxy=None, num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    # urlopen() does not support authentication, cookies or other advanced HTTP features;
    # to get those you have to create a custom opener object with build_opener()
    try:
        response = opener.open(request)
        html = response.read().decode('utf-8')
        code = response.code  # the HTTP status code is recorded but not used further here
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
            else:
                code = None
    return html
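# As noted in the intro, the proxy-support part of the author's code was dropped, which is why
# the proxy argument above is never used. The helper below is only my own rough sketch of how
# it could be added with urllib's ProxyHandler (the name download_with_proxy and the example
# proxy address are made up for illustration; this is not the book's implementation):
def download_with_proxy(url, user_agent='wswp', proxy=None, num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    if proxy:
        # map the URL's scheme (http or https) to the proxy address,
        # e.g. proxy = 'http://127.0.0.1:8080'
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxy_params))
    else:
        opener = urllib.request.build_opener()
    try:
        html = opener.open(request).read().decode('utf-8')
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # retry 5XX HTTP errors
            html = download_with_proxy(url, user_agent, proxy, num_retries - 1)
    return html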
# def crawl_sitemap(url):
#     # download the sitemap file
#     sitemap = download(url)
#     # extract the sitemap links with a regular expression
#     links = re.findall('<loc>(.*?)</loc>', sitemap)
#     # download each link
#     for link in links:
#         html = download(link)
# This part can be skipped; it just crawls the links listed in the site map.
def link_crawler(seed_url, link_regex, user_agent='wswp', delay=5):
    # crawl from the seed url, following only the links that match the regular expression
    # (delay is accepted but not wired up yet; see the note after the Throttle class below)
    crawl_queue = [seed_url]
    rp = get_robots(seed_url)
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        # take a url off the end of the queue
        if rp.can_fetch(user_agent, url):
            # check whether the given user agent is allowed to fetch this page
            html = download(url)
            # call the download function to fetch the page
            if html is None:
                # skip pages that failed to download
                continue
            for link in get_links(html):
                # loop over every link that get_links() extracted from the page
                if re.match(link_regex, link):
                    # keep only the links that match the regular expression
                    link = urllib.parse.urljoin(seed_url, link)
                    # use urljoin to turn the relative link into an absolute one
                    if link not in seen:
                        # check whether this link is already in the seen set
                        seen.add(link)
                        # if not, add it to the set
                        crawl_queue.append(link)
                        # and append it to the queue so the while loop keeps going
                        # print(crawl_queue)
        else:
            print('Blocked by robots.txt:', url)
# Parse the robots.txt file so we avoid downloading URLs that are off limits to crawlers
def get_robots(url):
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urllib.parse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
# Extract every link from a web page
def get_links(html):
    # regular expression that matches all the links in the page
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)
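# Quick illustration with made-up HTML (not fetched from the site): calling
#     get_links('<a href="/index/1">next</a> <a href="/view/Aland-Islands-2">A</a>')
# returns ['/index/1', '/view/Aland-Islands-2']. The hrefs are relative paths,
# which is why link_crawler() joins them against seed_url before queueing them.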
# A class that rate-limits downloads
class Throttle:
    # Add a delay between two downloads to the same domain, throttling the crawler
    def __init__(self, delay):
        # number of seconds to wait between downloads to the same domain
        self.delay = delay
        # timestamp of the last access to each domain
        self.domains = {}

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        # parse the url and take the netloc part, e.g. 'example.webscraping.com'
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            # compare the configured delay with the time elapsed since the last access
            if sleep_secs > 0:
                time.sleep(sleep_secs)
                # if less time than the delay has passed since the last access, sleep
        self.domains[domain] = datetime.now()
        # record the current time as the latest access to this domain
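# Note: link_crawler() above accepts a delay argument but never creates a Throttle, so this
# class is currently unused. As far as I understand (my assumption, not the author's exact
# code), it would be hooked up inside link_crawler() roughly like this:
#
#     throttle = Throttle(delay)
#     ...
#     if rp.can_fetch(user_agent, url):
#         throttle.wait(url)  # sleep if this domain was requested less than delay seconds ago
#         html = download(url)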
if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, user_agent='BadCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, user_agent='GoodCrawler')
Partial output of a run:
Downloading: http://example.webscraping.com/view/Saint-Pierre-and-Miquelon-192
Downloading: http://example.webscraping.com/view/Saint-Martin-191
Downloading: http://example.webscraping.com/view/Saint-Lucia-190
Downloading: http://example.webscraping.com/view/Saint-Kitts-and-Nevis-189
Downloading: http://example.webscraping.com/view/Saint-Helena-188
Downloading: http://example.webscraping.com/view/Saint-Barthelemy-187
Downloading: http://example.webscraping.com/view/Rwanda-186
Downloading: http://example.webscraping.com/view/Russia-185
Downloading: http://example.webscraping.com/view/Romania-184
Downloading: http://example.webscraping.com/view/Reunion-183
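As a closing note on the crawl-trap avoidance mentioned at the top: it is not implemented here either. The sketch below is only my own rough take (not the author's code, and the function name link_crawler_with_depth is made up) on the usual idea of recording the depth at which each URL was found and refusing to follow links past a maximum depth, so that pages which keep generating new links cannot trap the crawler:

def link_crawler_with_depth(seed_url, link_regex, user_agent='wswp', max_depth=2):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}  # url mapped to the depth at which it was discovered
    rp = get_robots(seed_url)
    while crawl_queue:
        url = crawl_queue.pop()
        if not rp.can_fetch(user_agent, url):
            print('Blocked by robots.txt:', url)
            continue
        html = download(url)
        if html is None:
            continue
        depth = seen[url]
        if depth >= max_depth:
            # too deep: probably a crawler trap, so stop following links from this page
            continue
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urllib.parse.urljoin(seed_url, link)
                if link not in seen:
                    seen[link] = depth + 1
                    crawl_queue.append(link)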