Sharing a project I did a while back, to build up some project experience.
The previous post covered how to configure scrapy-redis; this time let's build an actual scraping example.
The target site is 彼岸图网 (pic.netbian.com): https://pic.netbian.com/4kmeinv/
Analysis
The goal is to collect every image under the category linked above. The category spans 148 pages: on each listing page we read the link to the next page and follow it, and we read each thumbnail's link to enter the detail page, where we grab the full-size image URL.
Running results
Contents stored in redis
Resuming from a breakpoint
If the crawl is stopped partway, you can simply re-enter the command on the command line to continue.
When it runs again, it picks up from where it left off.
Data handling
Database design
The database stores each full-size image's title and download URL.
The table is created with:
CREATE TABLE `db1`.`pictable` (
`id` INT NOT NULL AUTO_INCREMENT,
`title` VARCHAR(100) NOT NULL COMMENT '图片标题',
`picurl` VARCHAR(150) NOT NULL COMMENT '图片地址',
PRIMARY KEY (`id`))
COMMENT = '图片地址存储表';
The items file
The items file only needs the two fields below; the id column is generated automatically by the database.
class MyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    picurl = scrapy.Field()
Pipeline
The pipeline opens and closes the database connection, does a simple duplicate check, and inserts the data into the database.
import pymysql
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem


class MyPipeline:
    def open_spider(self, spider):
        # open the database connection when the spider starts
        self.client = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='sa123456',
            database='db1',
            charset='utf8'
        )
        self.cursor = self.client.cursor()

    def close_spider(self, spider):
        # close the cursor and the connection when the spider finishes
        self.cursor.close()
        self.client.close()

    def process_item(self, item, spider):
        title = item['title']
        picurl = item['picurl']
        print('saving %s' % title)
        # parameterized queries avoid quoting problems and SQL injection
        sql = "select * from pictable where title=%s and picurl=%s"
        rows = self.cursor.execute(sql, (title, picurl))
        if rows == 0:
            sql2 = "insert into pictable(title,picurl) values (%s,%s)"
            self.cursor.execute(sql2, (title, picurl))
            print('inserted one row')
            self.client.commit()
        else:
            print(f'record ({title},{picurl}) already exists')
            raise DropItem
        return item
Basic settings
Add the configuration below to the settings file, and remember to enable the pipeline and adjust the UA (a sketch of those two settings follows right after this block).
# distributed crawling configuration
SCHEDULER = "scrapy_redis.scheduler.Scheduler"  # scheduler
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # deduplication
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'  # request priority queue
SCHEDULER_PERSIST = True
# SCHEDULER_FLUSH_ON_START = True
# if you have a password: redis://user:password@IP:port/db
REDIS_URL = 'redis://your_redis'
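As for enabling the pipeline and setting a fallback UA, a minimal sketch is below, assuming the project package is named my and the pipeline class written above lives in my/pipelines.py (adjust the paths to your own project):

ITEM_PIPELINES = {
    'my.pipelines.MyPipeline': 300,  # enable the MySQL pipeline shown above
}
# any ordinary desktop UA works here; the random-UA middleware later can override it
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'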
Main spider
parse is the default callback; it handles each listing page.
imgdownload handles the detail page behind each image on a listing page.
downloader handles downloading the image file itself.
import scrapy
import logging
from scrapy.selector import Selector
from scrapy_redis.spiders import RedisSpider
from my.items import MyItem


class StaSpider(RedisSpider):
    name = 'sta'
    # allowed_domains = ['*']
    baseurl = 'http://pic.netbian.com'
    # start_urls = ['https://pic.netbian.com/4kmeinv/']

    def parse(self, response):
        # default callback: handles one listing page
        print('starting a crawl')
        logging.warning('starting a crawl')
        # print(response.text)
        selector = Selector(text=response.text)
        pics = selector.xpath('//*[@id="main"]/div[3]/ul/li/a/@href').extract()
        # the link whose text is "下一页" (next page)
        nextselector = selector.xpath('//*[@id="main"]/div[4]/a[text()="下一页"]/@href')
        print(pics)
        print(nextselector)
        if nextselector:
            nextpage = nextselector.extract_first()
            print(nextpage)
        else:
            nextpage = None
        if nextpage:
            print('following next page', self.baseurl + nextpage)
            yield scrapy.Request(self.baseurl + nextpage, callback=self.parse)
        for i in pics:
            print('entering picture page', self.baseurl + i)
            yield scrapy.Request(self.baseurl + i, callback=self.imgdownload)

    def imgdownload(self, response):
        # detail page: extract the title and the full-size image URL
        logging.warning('starting a detail page')
        # print(response.text)
        selector = Selector(text=response.text)
        pics = selector.xpath('//*[@id="img"]/img/@src').extract_first()
        print(pics)
        title = selector.xpath('//*[@id="main"]/div[2]/div[1]/div[1]/h1/text()').extract_first()
        print(title)
        picurl = self.baseurl + pics
        item = MyItem()
        item['title'] = title
        item['picurl'] = picurl
        yield item
        print('requesting the image file', picurl)
        yield scrapy.Request(picurl, callback=self.downloader, meta={'title': title})

    def downloader(self, response):
        # save the image body to disk; the pics/ directory must already exist
        print('downloading image')
        title = response.meta.get('title')
        with open('pics/' + title + '.jpg', 'wb') as f:
            f.write(response.body)
How to run
Running directly
For the first run, open a command line in the project directory and run
scrapy crawl xxx
where xxx is the name of the main spider. Then push your starting URL into the redis database (see the sketch below).
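With scrapy-redis's default start-URLs key pattern ('%(name)s:start_urls') and this spider named sta, a minimal sketch using redis-cli would be:

redis-cli lpush sta:start_urls https://pic.netbian.com/4kmeinv/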
If the crawl is interrupted, simply re-running the same command resumes it from where it stopped; this is also what makes the distributed setup possible.
Deploying with scrapyd
Configure the scrapy.cfg file (a sketch follows).
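A minimal scrapy.cfg sketch matching the names used below, where mypro is the deploy target and my is the project; the url assumes scrapyd runs locally on its default port:

[settings]
default = my.settings

[deploy:mypro]
url = http://localhost:6800/
project = my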
Then start scrapyd in one terminal:
scrapyd
In another terminal, deploy first:
scrapyd-deploy mypro -p my
and then schedule the spider:
curl http://localhost:6800/schedule.json -d project=my -d spider=sta
sta is my spider's name; adjust it to your own spider's name.
my is the project name.
mypro is the name of the deploy target in scrapy.cfg.
Once it is running, the job shows up in scrapyd's web interface.
Push a new URL into redis,
and the logs page shows the spider running.
There are some errors, though: my requests are being closed by the remote server.
That is certainly not scrapyd's fault; we'll debug it below.
Stopping the job
curl http://localhost:6800/cancel.json -d project=my -d job=bf98a5453f6511ecafbd10c37bb765fc
Tweaking the settings
I noticed that visiting the page in a browser gets redirected once. The IP doesn't seem to be banned, but there is a redirect.
Capturing the traffic with Charles makes this obvious.
Modifying the headers
I wrote a random-UA middleware here, but it didn't help; since the problem is a redirect, the next idea is to set keep-alive in the headers.
Add this to the settings:
DEFAULT_REQUEST_HEADERS = {
'referer':'https://pic.netbian.com/4kfengjing',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'navigate',
'sec-fetch-dest': 'document',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'connection':'keep-alive',
}
These headers are injected by one of the default downloader middlewares; the default ordering values for the default-headers middleware and the user-agent middleware can be found in Scrapy's default_settings module.
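For reference, in recent Scrapy versions the relevant entries in DOWNLOADER_MIDDLEWARES_BASE look roughly like this (the exact values may differ between versions):

DOWNLOADER_MIDDLEWARES_BASE = {
    # ...
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 400,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 500,
    # ...
}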
If a custom middleware sets a random UA and is ordered before these header-related middlewares, could that cause one to overwrite the other?
So far I haven't observed the UA being overwritten here.
Random UA code reference
The idea is to take the default UA middleware as a starting point, write a class that generates a random UA, and then set the UA on each request in process_request.
# the imports left over from copying the default user-agent middleware module have been trimmed
import logging
import random

logger = logging.getLogger(__name__)


class MyUA:
    # the version numbers are randomized once, at import time
    first_num = random.randint(55, 62)
    third_num = random.randint(0, 3200)
    fourth_num = random.randint(0, 140)
    os_type = [
        '(Windows NT 6.1; WOW64)', '(Windows NT 10.0; WOW64)',
        '(X11; Linux x86_64)', '(Macintosh; Intel Mac OS X 10_12_6)'
    ]
    chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)

    @classmethod
    def get_ua(cls):
        # assemble a Chrome-like UA string with a random OS token
        return ' '.join(['Mozilla/5.0', random.choice(cls.os_type),
                         'AppleWebKit/537.36', '(KHTML, like Gecko)',
                         cls.chrome_version, 'Safari/537.36'])


class RandomUAMiddleware(object):
    """This middleware allows spiders to override the user_agent"""
    # the stock from_crawler/spider_opened handling of the USER_AGENT setting
    # is deliberately not used here

    def process_request(self, request, spider):
        # set a random UA on every outgoing request
        request.headers[b'User-Agent'] = MyUA.get_ua()
        # print('UA middleware called', request.headers)
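To actually use it, the middleware has to be registered in the settings. A minimal sketch, assuming it lives in my/middlewares.py; the order value 543 is just an example, and disabling the built-in user-agent middleware is optional but avoids any ordering questions:

DOWNLOADER_MIDDLEWARES = {
    # disable the built-in user-agent middleware so it doesn't interfere
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # enable the random-UA middleware defined above
    'my.middlewares.RandomUAMiddleware': 543,
}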
Proxy servers
If you want to go through proxies, you can buy working IPs from the usual providers and handle them with a middleware; the code below is what the requests look like once a proxy IP is in use.
The IPs are only valid for a limited time, so it's worth adding a routine that dynamically re-extracts fresh IPs from the provider's site (a rough sketch of that is given at the end of this section).
Code reference:
The settings need two entries, PROXIES and HTTPPROXY_ENABLED: the former is a list, the latter a boolean (a sketch follows this paragraph).
The proxy IPs I bought come with an extraction API that returns a batch of addresses in ip:port form, so the format has to be adjusted accordingly; what finally goes onto the request is the http://ip:port form.
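A minimal sketch of those two settings, with placeholder addresses standing in for whatever your extraction API returns (the middleware below also has to be registered in DOWNLOADER_MIDDLEWARES, just like the random-UA one):

HTTPPROXY_ENABLED = True
PROXIES = [
    '123.123.123.123:8888',  # placeholder ip:port entries
    '111.111.111.111:8000',
]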
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
from collections import defaultdict

from scrapy.exceptions import NotConfigured
from twisted.internet.error import ConnectionRefusedError, TimeoutError


class RandomProxyMiddleware:
    def __init__(self, settings):
        # 2. read the configuration and initialize state
        self.proxies = settings.getlist('PROXIES')
        self.stats = defaultdict(int)
        self.max_failed = 3

    @classmethod
    def from_crawler(cls, crawler):
        # 1. create the middleware object
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # 3. assign a random proxy IP to each request
        if self.proxies and not request.meta.get('proxy') \
                and request.url not in spider.start_urls:
            print(self.proxies)
            request.meta['proxy'] = 'http://' + random.choice(self.proxies)
            print(request.meta['proxy'])

    def process_response(self, request, response, spider):
        # 4. process_response is called when the request got a response
        cur_proxy = request.meta.get('proxy')
        # check whether the remote site is blocking us
        if response.status in (401, 403):
            # increase the failure count for this IP
            self.stats[cur_proxy] += 1
            print('%s got wrong code %s times' % (cur_proxy, self.stats[cur_proxy]))
            # once an IP has accumulated enough failures...
            if self.stats[cur_proxy] >= self.max_failed:
                print('got wrong http code (%s) when use %s'
                      % (response.status, cur_proxy))
                # ...assume it has been banned and drop it from the pool
                self.remove_proxy(cur_proxy)
                del request.meta['proxy']
                # reschedule the request for another download attempt
                return request
        return response

    def process_exception(self, request, exception, spider):
        # 4. process_exception is called when the request failed outright
        cur_proxy = request.meta.get('proxy')
        # if this request used a proxy and the network call errored, assume the IP is bad
        if cur_proxy and isinstance(exception, (ConnectionRefusedError, TimeoutError)):
            print('error (%s) occur when use proxy %s' % (exception, cur_proxy))
            self.remove_proxy(cur_proxy)
            del request.meta['proxy']
            return request

    def remove_proxy(self, proxy):
        if proxy in self.proxies:
            self.proxies.remove(proxy)
            print('remove %s from proxy list' % proxy)
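As for the dynamic re-extraction of IPs mentioned earlier, a rough, hypothetical sketch is below; the extraction URL and the response format (one ip:port per line) are assumptions that depend entirely on your provider:

import requests

def fetch_proxies(extract_api_url):
    # call the provider's extraction API and return a list of 'ip:port' strings;
    # the middleware above prepends 'http://' itself
    resp = requests.get(extract_api_url, timeout=10)
    return [line.strip() for line in resp.text.splitlines() if line.strip()]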