# -*- coding:utf-8 -*- import urllib2 import urllib import json import re class JingDong: # 初始化方法 def __init__(self): # 爬的初始页面 self.base_page = r'http://list.jd.com/list.html?cat=9855,9860,9934' self.file_name = 'children_page.txt' def start(self): request = urllib2.Request(self.base_page) response = urllib2.urlopen(request) self.findChildrenPage(response.read()) def findChildrenPage(self, pre_page_content): pattern = re.compile(r'//item.jd.com/\d+.html') children_page = self.delSame(re.findall(pattern, pre_page_content)) self.saveChildrenPage(children_page) def delSame(self, children_page): return {}.fromkeys(children_page).keys() def saveChildrenPage(self, children_page): fp = open(self.file_name, 'w') for child in children_page: fp.write(r'http:'+child) fp.write('\n') fp.close() class JingDongDetail: """ 对获取京东商品价格进行简单封装 """ def __init__(self, url): self.url = url self._response = urllib.urlopen(self.url) self.html = self._response.read() def get_product(self): """ 获取html中,商品的描述(未对数据进行详细处理,粗略的返回str类型) """ product_re = re.compile(r'compatible: true,(.*?)};', re.S) product_info = re.findall(product_re, self.html)[0] return product_info def get_product_skuid(self): """ 通过获取的商品信息,获取商品的skuid """ product_info = self.get_product() skuid_re = re.compile(r'skuid: (.*?),') skuid = re.findall(skuid_re, product_info)[0] return skuid def get_product_name(self): """ 通过获取的商品信息,获取商品的name """ # '\u4e2d\u6587'.decode('unicode-escape') (你可能需要print它才能看到结果) product_info = self.get_product() # 源码中名称左右有两个',所以过滤的时候应该去掉 name_re = re.compile(r"name: '(.*?)',") name = re.findall(name_re, product_info)[0] return name.decode('unicode-escape') # 将其转换为中文 def get_product_price(self): """ 根据商品的skuid信息,请求获得商品price :return: """ price = None # 得到产品的序号和名称,取价格的时候会用得到 skuid = self.get_product_skuid() name = self.get_product_name() print name # 通过httpfox检测得知,每次网页都会访问这个网页去提取价格嵌入到html中 url = 'http://p.3.cn/prices/mgets?skuIds=J_' + skuid + '&type=1' # json调整格式,并将其转化为utf-8,列表中只有一个字典元素所以取出第一个元素就转化为字典 price_json = json.load(urllib.urlopen(url))[0] # p对应的价格是我们想要的 if price_json['p']: price = price_json['p'] return price if __name__ == '__main__': jingdong = JingDong() jingdong.start() fp = open(jingdong.file_name, 'r') for url in fp.readlines(): print url jp = JingDongDetail(url) print "*"*40 print jp.get_product_price() print "*" * 40 print "\n" fp.close()