Crawling JD.com product comments with Scrapy + bs4
A scrapy-redis spider pulls product IDs off JD list pages with BeautifulSoup, fetches each product's comments from JD's JSONP comment endpoint, and stores them in MySQL.
spiders/comm.py
import json

import jsonpath
import scrapy
from bs4 import BeautifulSoup
from scrapy_redis.spiders import RedisSpider

from commit.items import CommitItem


class CommSpider(RedisSpider):
    name = 'comm'
    # scrapy-redis feeds start URLs from this Redis list instead of start_urls
    redis_key = 'comm:start_urls'
    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        # Every product card on a JD list page carries its SKU in data-sku
        one_page_products = soup.find_all(class_='gl-item')
        for one_product in one_page_products:
            productId = one_product.get('data-sku')
            # Fetch the first two pages of comments for each product
            for i in range(0, 2):
                src = (f'https://sclub.jd.com/comment/productPageComments.action'
                       f'?callback=fetchJSON_comment98vv3067&productId={productId}'
                       f'&score=0&sortType=5&page={i}&pageSize=10&isShadowSku=0&fold=1')
                item = CommitItem()
                item['productid'] = productId
                request = scrapy.Request(src, callback=self.parse_comment)
                request.meta['item'] = item
                yield request
    def parse_comment(self, response):
        # The endpoint returns JSONP: fetchJSON_comment98vv3067({...});
        # keep only what sits between the outermost parentheses, since the
        # comment text itself may contain '(' or ')'
        data = response.text
        data = data[data.find('(') + 1:data.rfind(')')]
        data = json.loads(data)
        comments = jsonpath.jsonpath(data, '$..comments')
        if not comments:  # jsonpath returns False when nothing matches
            return
        for one in comments[0]:
            # Copy the per-product item so each yielded comment is independent
            item = CommitItem(response.meta['item'])
            item['userid'] = one['id']
            item['content'] = one['content']
            item['datatime'] = one['creationTime']
            yield item
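
Because CommSpider is a RedisSpider, it starts idle and waits for a URL to appear in the comm:start_urls list. A minimal sketch of seeding the queue with redis-py (the search URL is only a placeholder; any JD list page whose product cards carry the gl-item class will do):

import redis

r = redis.StrictRedis(host='localhost', port=6379)
# The spider pops URLs from this list and hands them to parse()
r.lpush('comm:start_urls', 'https://search.jd.com/Search?keyword=python')

The same push can be done from the shell with redis-cli lpush comm:start_urls <url>.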
items.py
import scrapy


class CommitItem(scrapy.Item):
    content = scrapy.Field()
    userid = scrapy.Field()
    productid = scrapy.Field()
    datatime = scrapy.Field()
settings.py
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
    # The MySQL pipeline defined in pipelines.py must be registered here too,
    # otherwise items never reach the database
    'commit.pipelines.CommitPipeline': 400,
}

# Connection settings read by CommitPipeline
DB_HOST = '127.0.0.1'
DB_PORT = 3306
DB_USER = 'root'
DB_PASSWORD = 'your_password'  # placeholder: fill in your MySQL password
DB_DATABASE = 'jd'
DB_CHARSET = 'utf8'

# scrapy-redis: shared scheduler and request dedup so several spider
# instances can cooperate on one queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
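
With RedisPipeline enabled, every scraped item is also serialized as JSON into Redis under scrapy-redis's default key pattern <spider>:items. A quick sketch for confirming the crawl is producing data (assumes the default key name is unchanged):

import json
import redis

r = redis.StrictRedis(host='localhost', port=6379)
print(r.llen('comm:items'))        # number of items captured so far
first = r.lindex('comm:items', 0)  # peek at one serialized item
if first:
    print(json.loads(first))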
pipelines.py
import pymysql
from scrapy.utils.project import get_project_settings


class CommitPipeline(object):
    def open_spider(self, spider):
        # Read the DB_* values from settings.py and open one shared connection
        settings = get_project_settings()
        self.conn = pymysql.connect(
            host=settings['DB_HOST'],
            port=settings['DB_PORT'],
            user=settings['DB_USER'],
            password=settings['DB_PASSWORD'],
            database=settings['DB_DATABASE'],
            charset=settings['DB_CHARSET'],
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Parameterized query: pymysql escapes the values, so comment text
        # containing quotes no longer breaks the statement
        sql = 'INSERT INTO nr(productid, userid, datatime, content) VALUES (%s, %s, %s, %s)'
        try:
            self.cursor.execute(sql, (item['productid'], item['userid'],
                                      item['datatime'], item['content']))
            self.conn.commit()
        except Exception:
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
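
The pipeline inserts into a table named nr in the jd database, which has to exist before the crawl starts. A one-off setup sketch; the column types and widths are my assumption, not from the original post:

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='your_password', database='jd', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS nr (
            productid VARCHAR(20),
            userid    VARCHAR(32),
            datatime  VARCHAR(32),
            content   TEXT
        )
    """)
conn.commit()
conn.close()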