Analysis:
The China Yongkang hardware index site loads its data via AJAX requests, so we have to construct those requests and crawl the data in the spider, then preprocess the results in the pipelines.
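To make the request construction concrete, here is the URL template the spider fills in (a standalone sketch; the page and class values are illustrative):

base = ("http://www.ykindex.com/desktopmodules/indexviewer/chart.aspx"
        "?pageindex={page}&pagesize=10&classid={class_}&type=1&code=X&theyte=1")
print(base.format(page=1, class_='01'))
# -> .../chart.aspx?pageindex=1&pagesize=10&classid=01&type=1&code=X&theyte=1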
The crawler is structured as follows:
1. spiders: construct the requests and crawl the data
import scrapy

from myproject.items import YongkangItem


class YkindexSpider(scrapy.Spider):
    name = 'ykindex'
    allowed_domains = ['ykindex.com']
    start_urls = ['http://ykindex.com/']

    def start_requests(self):
        base_urls = ["http://www.ykindex.com/desktopmodules/"
                     "indexviewer/chart.aspx?pageindex={page}&pa"
                     "gesize=10&classid={class_}&type=1&code=X&theyte=1"]
        for url in base_urls:
            for cl in range(0, 13):
                for page in range(1, self.settings.get('MAX_PAGE') + 1):
                    cl_ = str(cl).zfill(2)  # pad to two digits: '1' -> '01'
                    new_url = url.format(page=page, class_=cl_)
                    self.logger.info('Crawling {}'.format(new_url))
                    yield scrapy.Request(new_url, callback=self.parse)

    def parse(self, response):
        datas = response.xpath('//set')
        cla = response.xpath('//chart/@caption')
        for data in datas:
            item = YongkangItem()
            # the caption separates levels with '-'; normalize to ':' so it
            # matches the keys of PATTERN in settings.py
            item['class_'] = cla.extract_first().replace('-', ':')
            item['date'] = data.xpath('@label').extract_first()
            item['value'] = data.xpath('@value').extract_first()
            item['chain'] = data.xpath('@tooltext').extract_first()
            yield item
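The chart.aspx endpoint appears to return FusionCharts-style XML, which is why parse() can run XPath directly on the response. A minimal sketch of that mapping, using parsel (the selector library behind Scrapy) and a made-up response body:

from parsel import Selector

# hypothetical response: one <chart> caption plus one <set> per data point
sample = ('<chart caption="五金市场交易周价格指数-门及配件">'
          '<set label="18-01-05" value="102.3" tooltext="环比上涨0.5%"/>'
          '</chart>')
sel = Selector(text=sample, type='xml')
print(sel.xpath('//chart/@caption').extract_first())  # -> series name (class_)
for s in sel.xpath('//set'):
    print(s.xpath('@label').extract_first(),          # -> date label
          s.xpath('@value').extract_first(),          # -> index value
          s.xpath('@tooltext').extract_first())       # -> chain (week-on-week) text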
2. items: define the storage structure for the scraped data
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class MyprojectItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class YongkangItem(scrapy.Item):
    class_ = scrapy.Field()  # index series name (the chart caption)
    date = scrapy.Field()    # date label, normalized to YYYYMMDD by ClassPipeline
    value = scrapy.Field()   # index value
    chain = scrapy.Field()   # week-on-week change (tooltip text)
    code = scrapy.Field()    # series code looked up from PATTERN
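A scrapy.Item behaves like a dict restricted to the declared fields, so a typo in a field name fails loudly instead of silently creating a new key. A quick illustration:

from myproject.items import YongkangItem

item = YongkangItem()
item['date'] = '20180105'
item['value'] = '102.3'
print(dict(item))      # {'date': '20180105', 'value': '102.3'}
# item['dat'] = '...'  # would raise KeyError: YongkangItem does not support field: dat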
3. pipelines: preprocess and save the data
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import csv
import time


class MyprojectPipeline(object):
    def process_item(self, item, spider):
        return item


class ClassPipeline(object):
    # enrich each item: map the series name to its code and normalize the date
    def __init__(self, pattern):
        self.pattern = pattern

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            pattern=crawler.settings.get('PATTERN')
        )

    def process_item(self, item, spider):
        item['code'] = self.pattern.get(item['class_'], 'None')
        # the site labels dates with two-digit years ('%y'); rewrite as YYYYMMDD
        timearray = time.strptime(item['date'], '%y-%m-%d')
        item['date'] = time.strftime('%Y%m%d', timearray)
        return item


class SavePipeline(object):
    # append each item as one row of a CSV file
    def __init__(self, path):
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            path=crawler.settings.get('PATH')
        )

    def process_item(self, item, spider):
        # newline='' keeps the csv module from writing blank rows on Windows
        with open(self.path, 'a+', encoding='gbk', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([item['class_'], item['code'],
                             item['date'], item['value'], item['chain']])
        return item
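The two transformations in ClassPipeline are easy to verify in isolation; the '18-01-05' label below assumes the site's two-digit-year format implied by the '%y' in the code:

import time

PATTERN = {"五金市场交易周价格指数": "代码1"}
print(PATTERN.get("五金市场交易周价格指数", 'None'))  # -> 代码1
print(PATTERN.get("unknown series", 'None'))           # -> None (the fallback string)

t = time.strptime('18-01-05', '%y-%m-%d')  # '%y' reads '18' as the year 2018
print(time.strftime('%Y%m%d', t))          # -> 20180105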
4. settings: configuration
# -*- coding: utf-8 -*-

# Scrapy settings for myproject project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'myproject'

SPIDER_MODULES = ['myproject.spiders']
NEWSPIDER_MODULE = 'myproject.spiders'

# custom settings used by the spider and pipelines
MAX_PAGE = 2                          # pages to request per class id
PATH = 'D:/爬虫/保存的文件/五金.csv'   # CSV file that SavePipeline appends to

# map from normalized series name (caption with '-' replaced by ':')
# to its series code, used by ClassPipeline
PATTERN = {
    "五金市场交易周价格指数": "代码1",
    "五金市场交易周价格指数:门及配件": "代码2",
    "五金市场交易周价格指数:门及配件:门产品(整件)": "代码3",
    "五金市场交易周价格指数:门及配件:窗及配件": "代码4",
}

#LOG_FILE = 'D:/爬虫/myproject/myproject/spiders/log.txt'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'myproject (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'myproject.pipelines.MyprojectPipeline': 300,
    'myproject.pipelines.ClassPipeline': 300,
    'myproject.pipelines.SavePipeline': 400,
}
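With the settings in place, the crawl is started from the project directory with Scrapy's standard command (the spider name comes from the name attribute in step 1):

scrapy crawl ykindex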