from pyspider.libs.base_handler import *
from lxml import etree
import re
from datetime import datetime
import random
import pymongo
# --- MongoDB setup -----------------------------------------------------------
# BUG FIX: `client` was referenced without ever being defined, so importing
# this module raised NameError before the handler could run. Create the
# connection explicitly. Default MongoClient() connects to localhost:27017
# -- TODO confirm the connection URI used in deployment.
client = pymongo.MongoClient()
db = client['research']
col = db['car_information']
# Compound index on (car_type, brand) to speed up lookups; built in the
# background so index creation does not block writes to the collection.
col.create_index([
    ("car_type", pymongo.ASCENDING),
    ("brand", pymongo.ASCENDING),
], background=True)
class Handler(BaseHandler):
    """pyspider crawler that scrapes Audi China model/price pages into MongoDB.

    Crawl chain:
        on_start -> index_page -> second_page -> four_page
                 -> detail_page_1 / detail_page_2 -> on_result (upsert)

    Each hop carries context forward via pyspider's ``save`` dict:
    ``car_type`` (URL path segment), ``type`` (page heading) and
    ``third_href`` (the price-table URL).
    """

    crawl_config = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        'retries': 10
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point, re-run daily: seed the crawl with the model index page."""
        self.crawl('https://www.audi.cn/cn/web/zh/models.html',
                   callback=self.index_page, validate_cert=False)

    @config(age=0)
    def index_page(self, response):
        """Follow every model link found on the index page.

        The second-to-last path segment of each link is taken as the car
        type (e.g. ``/cn/web/zh/models/<car_type>/...``) and carried along
        in ``save``.
        """
        for href in response.etree.xpath('//div//div[@id="list"]//li/a/@href'):
            car_type = href.split('/')[-2]
            # NOTE(review): host lacks the "www." used by the start URL --
            # presumably the site redirects; confirm before changing.
            second_href = 'https://audi.cn' + href
            print(car_type)
            self.crawl(second_href, callback=self.second_page,
                       validate_cert=False, save={'car_type': car_type})

    @config(age=0)
    def second_page(self, response):
        """From a model page, follow the equipment/price-list navigation tab."""
        # Page heading; used as the human-readable model "type".
        # NOTE(review): types[0] raises IndexError when the heading is
        # missing -- pyspider's retry handles that today; confirm intent.
        types = response.etree.xpath(
            '//div[starts-with(@class,"nm-wrapper")]//div[@class="nm-wrapper-content"]//h2/text()')
        for each in response.doc('ul#nm-navigation > li > a').items():
            if each.text() == u'装备价格表':  # the "equipment price list" tab
                third_href = each.attr.href
                print(third_href)
                self.crawl(third_href, callback=self.four_page, validate_cert=False,
                           save={'car_type': response.save['car_type'],
                                 'type': types[0], 'third_href': third_href})

    @config(age=0)
    def four_page(self, response):
        """Dispatch a price-table page to the appropriate parser.

        Three layouts are handled:
          1. page links out to per-variant sub pages -> recurse via four_page;
          2. page shows variant images with ``ssid`` selectors -> one request
             per variant id -> detail_page_1;
          3. plain multi-row price table -> detail_page_2.
        """
        img_src = response.etree.xpath(
            '//form/div/div[3]/div/div/div/div[2]/div[starts-with(@class, "removec-as")]/img/@src')
        next_hrefs = response.etree.xpath(
            '//div[starts-with(@class, "nm-content")]/div[2]//a[starts-with(@class, " nm-goto-video-link")]//@href')
        if next_hrefs:
            for next_href in next_hrefs:
                print(1)
                self.crawl(next_href, callback=self.four_page, validate_cert=False,
                           save={'car_type': response.save['car_type'],
                                 'type': response.save["type"],
                                 'third_href': next_href})
        elif img_src:
            print(2)
            # One request per variant id; the id selects a variant on the
            # same price-table URL.
            for ssid in response.etree.xpath('//form/div/div[3]/div[2]/div[1]/ul/li/@ssid'):
                four_href = response.save['third_href'] + '&id=' + ssid
                self.crawl(four_href, callback=self.detail_page_1,
                           validate_cert=False, save=response.save)
        else:
            print(3)
            print('third_href=' + response.save['third_href'])
            self.crawl(response.save['third_href'], callback=self.detail_page_2,
                       validate_cert=False, save=response.save)

    def _make_result(self, response, configuration, price_text):
        """Build one result document from a configuration name and its
        comma-grouped RMB price string (e.g. ``'1,234,500'``).

        Shared by detail_page_1 and detail_page_2, which previously
        duplicated this construction.
        """
        # Today's date truncated to midnight, stored as a datetime so
        # MongoDB can index/range-query it.
        day = datetime.strptime(datetime.now().strftime('%Y-%m-%d'), '%Y-%m-%d')
        return {
            'date': day,
            'brand': '奥迪',
            'car_type': response.save['car_type'],
            'type': response.save['type'],
            # Strip thousands separators before converting to int.
            'price_rmb': int(''.join(price_text.strip().split(','))),
            'configuration': configuration.strip(),
            'update_time': datetime.now(),
        }

    @config(age=0)
    def detail_page_2(self, response):
        """Parse a multi-row price table: yield one result per row."""
        print(4)
        configurations = response.etree.xpath('//form/div/div[3]/div/div/div/div[2]/span/text()')
        price_RMBs = response.etree.xpath('//form/div/div[3]/div/div/div/div[3]')
        # BUG FIX: the old code indexed price_RMBs[i] for every i in
        # range(len(configurations)), raising IndexError whenever the two
        # xpath result lists differed in length; zip pairs them safely.
        for configuration, price_cell in zip(configurations, price_RMBs):
            result = self._make_result(response, configuration, price_cell.text)
            print(result)
            yield result

    @config(age=0)
    def detail_page_1(self, response):
        """Parse a single-variant page (selected via ``&id=<ssid>``): yield one result."""
        print(5)
        configuration = response.etree.xpath('//form/div/div[3]/div/div/div/div[2]/span/text()')[0]
        price_cell = response.etree.xpath('//form/div/div[3]/div/div/div/div[3]')[0]
        result = self._make_result(response, configuration, price_cell.text)
        print(result)
        yield result

    def on_result(self, result):
        """Upsert each scraped document into MongoDB, keyed on
        (price_rmb, configuration)."""
        print(result)
        if result is None:
            return
        # NOTE(review): Collection.update() is deprecated in pymongo >= 3.0
        # in favour of update_one(); kept as-is for compatibility with the
        # pymongo version this project runs -- confirm before migrating.
        col.update({'price_rmb': result['price_rmb'], 'configuration': result['configuration']},
                   {'$set': result}, upsert=True)