from pyspider.libs.base_handler import *
from lxml import etree
import re
from datetime import datetime
import random
import pymongo
# --- MongoDB setup -----------------------------------------------------------
# BUG FIX: `client` was referenced without ever being defined, so importing
# this module raised NameError before the handler could run. Create the
# connection explicitly. Default MongoClient() connects to localhost:27017
# -- TODO confirm the connection URI used in deployment.
client = pymongo.MongoClient()
db = client['research']
col = db['car_information']
# Compound index on (car_type, brand) to speed up lookups; built in the
# background so index creation does not block writes to the collection.
col.create_index([
    ("car_type", pymongo.ASCENDING),
    ("brand", pymongo.ASCENDING),
], background=True)
class Handler(BaseHandler):
    """pyspider crawler that scrapes Audi China model/price pages into MongoDB.

    Crawl chain:
        on_start -> index_page -> second_page -> four_page
                 -> detail_page_1 / detail_page_2 -> on_result (upsert)

    Each hop carries context forward via pyspider's ``save`` dict:
    ``car_type`` (URL path segment), ``type`` (page heading) and
    ``third_href`` (the price-table URL).
    """

    crawl_config = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        'retries': 10
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point, re-run daily: seed the crawl with the model index page."""
        self.crawl('https://www.audi.cn/cn/web/zh/models.html',
                   callback=self.index_page, validate_cert=False)

    @config(age=0)
    def index_page(self, response):
        """Follow every model link found on the index page.

        The second-to-last path segment of each link is taken as the car
        type (e.g. ``/cn/web/zh/models/<car_type>/...``) and carried along
        in ``save``.
        """
        for href in response.etree.xpath('//div//div[@id="list"]//li/a/@href'):
            car_type = href.split('/')[-2]
            # NOTE(review): host lacks the "www." used by the start URL --
            # presumably the site redirects; confirm before changing.
            second_href = 'https://audi.cn' + href
            print(car_type)
            self.crawl(second_href, callback=self.second_page,
                       validate_cert=False, save={'car_type': car_type})

    @config(age=0)
    def second_page(self, response):
        """From a model page, follow the equipment/price-list navigation tab."""
        # Page heading; used as the human-readable model "type".
        # NOTE(review): types[0] raises IndexError when the heading is
        # missing -- pyspider's retry handles that today; confirm intent.
        types = response.etree.xpath(
            '//div[starts-with(@class,"nm-wrapper")]//div[@class="nm-wrapper-content"]//h2/text()')
        for each in response.doc('ul#nm-navigation > li > a').items():
            if each.text() == u'装备价格表':  # the "equipment price list" tab
                third_href = each.attr.href
                print(third_href)
                self.crawl(third_href, callback=self.four_page, validate_cert=False,
                           save={'car_type': response.save['car_type'],
                                 'type': types[0], 'third_href': third_href})

    @config(age=0)
    def four_page(self, response):
        """Dispatch a price-table page to the appropriate parser.

        Three layouts are handled:
          1. page links out to per-variant sub pages -> recurse via four_page;
          2. page shows variant images with ``ssid`` selectors -> one request
             per variant id -> detail_page_1;
          3. plain multi-row price table -> detail_page_2.
        """
        img_src = response.etree.xpath(
            '//form/div/div[3]/div/div/div/div[2]/div[starts-with(@class, "removec-as")]/img/@src')
        next_hrefs = response.etree.xpath(
            '//div[starts-with(@class, "nm-content")]/div[2]//a[starts-with(@class, " nm-goto-video-link")]//@href')
        if next_hrefs:
            for next_href in next_hrefs:
                print(1)
                self.crawl(next_href, callback=self.four_page, validate_cert=False,
                           save={'car_type': response.save['car_type'],
                                 'type': response.save["type"],
                                 'third_href': next_href})
        elif img_src:
            print(2)
            # One request per variant id; the id selects a variant on the
            # same price-table URL.
            for ssid in response.etree.xpath('//form/div/div[3]/div[2]/div[1]/ul/li/@ssid'):
                four_href = response.save['third_href'] + '&id=' + ssid
                self.crawl(four_href, callback=self.detail_page_1,
                           validate_cert=False, save=response.save)
        else:
            print(3)
            print('third_href=' + response.save['third_href'])
            self.crawl(response.save['third_href'], callback=self.detail_page_2,
                       validate_cert=False, save=response.save)

    def _make_result(self, response, configuration, price_text):
        """Build one result document from a configuration name and its
        comma-grouped RMB price string (e.g. ``'1,234,500'``).

        Shared by detail_page_1 and detail_page_2, which previously
        duplicated this construction.
        """
        # Today's date truncated to midnight, stored as a datetime so
        # MongoDB can index/range-query it.
        day = datetime.strptime(datetime.now().strftime('%Y-%m-%d'), '%Y-%m-%d')
        return {
            'date': day,
            'brand': '奥迪',
            'car_type': response.save['car_type'],
            'type': response.save['type'],
            # Strip thousands separators before converting to int.
            'price_rmb': int(''.join(price_text.strip().split(','))),
            'configuration': configuration.strip(),
            'update_time': datetime.now(),
        }

    @config(age=0)
    def detail_page_2(self, response):
        """Parse a multi-row price table: yield one result per row."""
        print(4)
        configurations = response.etree.xpath('//form/div/div[3]/div/div/div/div[2]/span/text()')
        price_RMBs = response.etree.xpath('//form/div/div[3]/div/div/div/div[3]')
        # BUG FIX: the old code indexed price_RMBs[i] for every i in
        # range(len(configurations)), raising IndexError whenever the two
        # xpath result lists differed in length; zip pairs them safely.
        for configuration, price_cell in zip(configurations, price_RMBs):
            result = self._make_result(response, configuration, price_cell.text)
            print(result)
            yield result

    @config(age=0)
    def detail_page_1(self, response):
        """Parse a single-variant page (selected via ``&id=<ssid>``): yield one result."""
        print(5)
        configuration = response.etree.xpath('//form/div/div[3]/div/div/div/div[2]/span/text()')[0]
        price_cell = response.etree.xpath('//form/div/div[3]/div/div/div/div[3]')[0]
        result = self._make_result(response, configuration, price_cell.text)
        print(result)
        yield result

    def on_result(self, result):
        """Upsert each scraped document into MongoDB, keyed on
        (price_rmb, configuration)."""
        print(result)
        if result is None:
            return
        # NOTE(review): Collection.update() is deprecated in pymongo >= 3.0
        # in favour of update_one(); kept as-is for compatibility with the
        # pymongo version this project runs -- confirm before migrating.
        col.update({'price_rmb': result['price_rmb'], 'configuration': result['configuration']},
                   {'$set': result}, upsert=True)