Analysis:
The China Yongkang hardware index site loads its data via AJAX requests, so we have to construct those requests and crawl the data in the spider, then preprocess the results in the pipelines.
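To make the request construction concrete, here is the URL template the spider fills in (a standalone sketch; the page and class values are illustrative):

base = ("http://www.ykindex.com/desktopmodules/indexviewer/chart.aspx"
        "?pageindex={page}&pagesize=10&classid={class_}&type=1&code=X&theyte=1")
print(base.format(page=1, class_='01'))
# -> .../chart.aspx?pageindex=1&pagesize=10&classid=01&type=1&code=X&theyte=1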
The crawler is structured as follows:
1. spiders: construct the requests and crawl the data
import scrapy

from myproject.items import YongkangItem


class YkindexSpider(scrapy.Spider):
    name = 'ykindex'
    allowed_domains = ['ykindex.com']
    start_urls = ['http://ykindex.com/']

    def start_requests(self):
        base_urls = ["http://www.ykindex.com/desktopmodules/"
                     "indexviewer/chart.aspx?pageindex={page}&pa"
                     "gesize=10&classid={class_}&type=1&code=X&theyte=1"]
        for url in base_urls:
            for cl in range(0, 13):
                for page in range(1, self.settings.get('MAX_PAGE') + 1):
                    cl_ = str(cl).zfill(2)  # pad to two digits: '1' -> '01'
                    new_url = url.format(page=page, class_=cl_)
                    self.logger.info('Crawling {}'.format(new_url))
                    yield scrapy.Request(new_url, callback=self.parse)

    def parse(self, response):
        datas = response.xpath('//set')
        cla = response.xpath('//chart/@caption')
        for data in datas:
            item = YongkangItem()
            # the caption separates levels with '-'; normalize to ':' so it
            # matches the keys of PATTERN in settings.py
            item['class_'] = cla.extract_first().replace('-', ':')
            item['date'] = data.xpath('@label').extract_first()
            item['value'] = data.xpath('@value').extract_first()
            item['chain'] = data.xpath('@tooltext').extract_first()
            yield item
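The chart.aspx endpoint appears to return FusionCharts-style XML, which is why parse() can run XPath directly on the response. A minimal sketch of that mapping, using parsel (the selector library behind Scrapy) and a made-up response body:

from parsel import Selector

# hypothetical response: one <chart> caption plus one <set> per data point
sample = ('<chart caption="五金市场交易周价格指数-门及配件">'
          '<set label="18-01-05" value="102.3" tooltext="环比上涨0.5%"/>'
          '</chart>')
sel = Selector(text=sample, type='xml')
print(sel.xpath('//chart/@caption').extract_first())  # -> series name (class_)
for s in sel.xpath('//set'):
    print(s.xpath('@label').extract_first(),          # -> date label
          s.xpath('@value').extract_first(),          # -> index value
          s.xpath('@tooltext').extract_first())       # -> chain (week-on-week) text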
2. items: define the storage structure for the scraped data
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class MyprojectItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class YongkangItem(scrapy.Item):
    class_ = scrapy.Field()  # index series name (the chart caption)
    date = scrapy.Field()    # date label, normalized to YYYYMMDD by ClassPipeline
    value = scrapy.Field()   # index value
    chain = scrapy.Field()   # week-on-week change (tooltip text)
    code = scrapy.Field()    # series code looked up from PATTERN
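A scrapy.Item behaves like a dict restricted to the declared fields, so a typo in a field name fails loudly instead of silently creating a new key. A quick illustration:

from myproject.items import YongkangItem

item = YongkangItem()
item['date'] = '20180105'
item['value'] = '102.3'
print(dict(item))      # {'date': '20180105', 'value': '102.3'}
# item['dat'] = '...'  # would raise KeyError: YongkangItem does not support field: dat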
3. pipelines: preprocess and save the data
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import csv
import time


class MyprojectPipeline(object):
    def process_item(self, item, spider):
        return item


class ClassPipeline(object):
    # enrich each item: map the series name to its code and normalize the date
    def __init__(self, pattern):
        self.pattern = pattern

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            pattern=crawler.settings.get('PATTERN')
        )

    def process_item(self, item, spider):
        item['code'] = self.pattern.get(item['class_'], 'None')
        # the site labels dates with two-digit years ('%y'); rewrite as YYYYMMDD
        timearray = time.strptime(item['date'], '%y-%m-%d')
        item['date'] = time.strftime('%Y%m%d', timearray)
        return item


class SavePipeline(object):
    # append each item as one row of a CSV file
    def __init__(self, path):
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            path=crawler.settings.get('PATH')
        )

    def process_item(self, item, spider):
        # newline='' keeps the csv module from writing blank rows on Windows
        with open(self.path, 'a+', encoding='gbk', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([item['class_'], item['code'],
                             item['date'], item['value'], item['chain']])
        return item
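The two transformations in ClassPipeline are easy to verify in isolation; the '18-01-05' label below assumes the site's two-digit-year format implied by the '%y' in the code:

import time

PATTERN = {"五金市场交易周价格指数": "代码1"}
print(PATTERN.get("五金市场交易周价格指数", 'None'))  # -> 代码1
print(PATTERN.get("unknown series", 'None'))           # -> None (the fallback string)

t = time.strptime('18-01-05', '%y-%m-%d')  # '%y' reads '18' as the year 2018
print(time.strftime('%Y%m%d', t))          # -> 20180105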
4. settings: configuration
# -*- coding: utf-8 -*-

# Scrapy settings for myproject project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'myproject'

SPIDER_MODULES = ['myproject.spiders']
NEWSPIDER_MODULE = 'myproject.spiders'

# custom settings used by the spider and pipelines
MAX_PAGE = 2                          # pages to request per class id
PATH = 'D:/爬虫/保存的文件/五金.csv'   # CSV file that SavePipeline appends to

# map from normalized series name (caption with '-' replaced by ':')
# to its series code, used by ClassPipeline
PATTERN = {
    "五金市场交易周价格指数": "代码1",
    "五金市场交易周价格指数:门及配件": "代码2",
    "五金市场交易周价格指数:门及配件:门产品(整件)": "代码3",
    "五金市场交易周价格指数:门及配件:窗及配件": "代码4",
}

#LOG_FILE = 'D:/爬虫/myproject/myproject/spiders/log.txt'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'myproject (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'myproject.pipelines.MyprojectPipeline': 300,
    'myproject.pipelines.ClassPipeline': 300,
    'myproject.pipelines.SavePipeline': 400,
}
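With the settings in place, the crawl is started from the project directory with Scrapy's standard command (the spider name comes from the name attribute in step 1):

scrapy crawl ykindex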