继续上面爬取腾讯招聘的例子,我们再来说一下 Item 的用法:
在items.py中做以下修改:
import scrapy
class TencentItem(scrapy.Item):
    """Container for one job posting scraped from the Tencent recruitment list.

    Scrapy items only accept keys that are declared as ``Field`` attributes;
    assigning to an undeclared key raises ``KeyError``. The field names below
    must therefore match exactly the keys the spider assigns
    (``title``, ``position``, ``publish_data``).
    """

    # Link text taken from the first cell of a listing row.
    title = scrapy.Field()
    # Text of the second cell of a listing row.
    position = scrapy.Field()
    # Text of the fifth cell of a listing row.
    # NOTE(review): the name looks like it was meant to be "publish_date";
    # kept as "publish_data" so it agrees with the key used by the spider.
    publish_data = scrapy.Field()
此时 hr.py 也要做相应的修改(先导入 TencentItem 类,再在 parse 中实例化):
import scrapy
from tencent.items import TencentItem
class HrSpider(scrapy.Spider):
    """Spider that walks the Tencent recruitment listing page by page.

    Each listing row is turned into a ``TencentItem`` and yielded; afterwards
    the "next page" link is followed with this same ``parse`` callback until
    the link no longer points to a real page.
    """

    name = 'hr'
    # allowed_domains = ['tencent.com']
    start_urls = ['http://hr.tencent.com/position.php']

    def parse(self, response):
        """Extract the job rows of one listing page and queue the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            One ``TencentItem`` per table row, followed by at most one
            ``scrapy.Request`` for the next listing page.
        """
        # The slice drops the first and the last <tr> of the table, which are
        # not treated as job rows here.
        tr_list = response.xpath('//table[@class="tablelist"]/tr')[1:-1]
        print(tr_list)
        for tr in tr_list:
            item = TencentItem()
            item['title'] = tr.xpath('./td[1]/a/text()').extract_first()
            item['position'] = tr.xpath('./td[2]/text()').extract_first()
            item['publish_data'] = tr.xpath('./td[5]/text()').extract_first()
            print("item内容:", item)
            yield item

        # Locate the URL of the next page. The attribute value must be quoted
        # and the attribute step needs a leading "/" to be valid XPath.
        next_url = response.xpath('//a[@id="next"]/@href').extract_first()

        # extract_first() returns None when the link is missing, and the href
        # is the placeholder "javascript:;" when there is no further page.
        if next_url and next_url != "javascript:;":
            yield scrapy.Request(
                # Resolve the (relative) href against the current page URL.
                response.urljoin(next_url),
                # Pass the method itself; calling it here would hand Scrapy a
                # generator object instead of a callable.
                callback=self.parse,
                # No meta is forwarded: every item is already complete when it
                # is yielded above, and parse() never reads response.meta.
            )