Disk file
Based on terminal commands
import scrapy

class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.qiushibaike.com/text']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # xpath is the recommended way to parse (the framework ships with an xpath interface)
        div_list = response.xpath("//div[@id='content-left']/div")
        # holds the parsed page data
        data_list = []
        for div in div_list:
            author = div.xpath('./div/a[2]/h2/text()').extract_first()
            content = div.xpath(".//div[@class='content']/span/text()").extract_first()
            # print(author + '\n' + content)
            item_dict = {'author': author, 'content': content}
            data_list.append(item_dict)
        return data_list
scrapy crawl qiubai -o qiubai.csv --nolog
Fixing garbled characters and the extra blank lines in the exported file:
https://blog.csdn.net/Light__1024/article/details/88655333
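The encoding half of that fix is just a setting; a minimal sketch, assuming you want Excel on Windows to display the Chinese text correctly (the blank-line issue is handled separately in the linked post):

# settings.py
# utf-8-sig writes a BOM so Excel recognizes the encoding
FEED_EXPORT_ENCODING = 'utf-8-sig'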
The difference between / and // in scrapy's xpath():
https://blog.csdn.net/changer_WE/article/details/84553986
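In short, / only steps into direct children, while // (or .// relative to a node) matches any descendant; a small illustration using the selectors from the spider above:

response.xpath("//div[@id='content-left']/div")     # // searches the whole document
div.xpath('./div/a[2]/h2/text()')                   # ./ and / walk through direct children only
div.xpath(".//div[@class='content']/span/text()")   # .// matches any descendant of div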
Workflow summary
- parse() returns an iterable object that holds the parsed page content
- Export it with the terminal command scrapy crawl <spider name> -o <output file>.<extension> (examples below)
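The output format is inferred from the file extension; Scrapy's feed exports support csv, json, jsonlines and xml, for example:

scrapy crawl qiubai -o qiubai.json --nolog
scrapy crawl qiubai -o qiubai.xml --nolog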
Based on pipelines
That is, use an Item object to hand the data over to the pipeline.
test.py
import scrapy
from first.items import FirstItem

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.qiushibaike.com/text']

    def parse(self, response):
        div_list = response.xpath("//div[@id='content-left']/div")
        for div in div_list:
            author = div.xpath('./div/a[2]/h2/text()').extract_first()
            content = div.xpath(".//div[@class='content']/span/text()").extract_first()
            # 1. store the parsed values (author and content) in an Item object,
            #    so import FirstItem and define its fields beforehand
            items = FirstItem()
            items['author'] = author
            items['content'] = content
            # hand the item over to the pipeline
            yield items
items.py
import scrapy

class FirstItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # fields to store
    author = scrapy.Field()
    content = scrapy.Field()
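For reference, a scrapy.Item behaves like a dict restricted to its declared fields, which is why the spider can assign items['author'] directly (illustrative snippet, not project code):

item = FirstItem()
item['author'] = 'someone'   # only declared fields are accepted
print(item['author'])        # someone
print(dict(item))            # {'author': 'someone'}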
pipelines.py
class FirstPipeline(object):
    # 2. improvement: this method runs only once, when the spider is opened
    def open_spider(self, spider):
        self.f = open('./qiubai_pipe.txt', 'w', encoding='utf-8')

    # 1. this method receives the item objects submitted by the spider;
    #    it is called once for every item
    def process_item(self, item, spider):
        author = item['author']
        content = item['content']
        # # naive persistence: opening the file with 'w' on every call
        # # would leave only the last record in the file
        # with open('./qiubai_pipe.txt', 'w', encoding='utf-8') as f:
        #     f.write(author + ':' + content + '\n\n\n')
        # return item
        self.f.write(author + ':' + content + '\n\n\n')
        return item

    def close_spider(self, spider):
        # 3. runs once when the spider closes
        self.f.close()
In settings.py, uncomment the ITEM_PIPELINES block (around line 67):
ITEM_PIPELINES = {
    'first.pipelines.FirstPipeline': 300,
}
Workflow summary:
- Define the storage fields in the Item class
- In the spider file (test.py), import the Item class, create an item, fill it with the parsed data, and hand it to the pipeline with yield items
- In pipelines.py, define the three methods: open the file, write each item, close the file
- Uncomment ITEM_PIPELINES in the settings file, then run the crawl and check the output (see below)
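Running it (the spider's name attribute is test) should leave the records in ./qiubai_pipe.txt:

scrapy crawl test --nolog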
Database
mysql:
import pymysql

class FirstPipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        # connect to the database
        self.conn = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='123',
            db='db2',
            charset='utf8')

    def process_item(self, item, spider):
        author = item['author']
        content = item['content']
        sql = 'insert into test values ("%s","%s")' % (author, content)
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
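This assumes the database db2 already contains a table named test with two string columns; the column names and sizes below are an assumption, shown only as a one-off setup sketch. The parameterized form of the insert is also worth noting, since it avoids breaking on quotes inside the scraped text:

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123', db='db2', charset='utf8')
with conn.cursor() as cursor:
    # assumed schema, adjust as needed
    cursor.execute('create table if not exists test ('
                   'author varchar(100), content varchar(2000))')
conn.commit()
conn.close()

# safer variant of the insert inside process_item:
# self.cursor.execute('insert into test values (%s, %s)', (author, content))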
test.py and items.py are unchanged from the pipeline example above.
Workflow summary:
- The key point is the pymysql syntax for talking to MySQL!
- Otherwise the workflow is the same as the file-based pipeline
redis:
import redis
import json

class FirstPipeline(object):
    conn = None

    def open_spider(self, spider):
        # connect to Redis
        self.conn = redis.Redis(
            host='127.0.0.1',
            port=6379
        )

    def process_item(self, item, spider):
        data_dict = {
            'author': item['author'],
            'content': item['content']
        }
        try:
            # Redis only accepts bytes, strings and numbers,
            # so serialize the dict to a JSON string first
            data_dict = json.dumps(data_dict)
            self.conn.lpush('data', data_dict)
            return item
        except Exception as e:
            print(e)

    def close_spider(self, spider):
        print('ok')
Workflow summary:
- Redis basics: http://www.runoob.com/redis/redis-install.html
  Start the server:  redis-server.exe redis.windows.conf
  Open a client:     redis-cli.exe -h 127.0.0.1 -p 6379
  From Python:       redis.Redis(...) and conn.lpush('data', data_dict)
- Data written to Redis must be bytes, a string or a number (hence the json.dumps above)
- Check the stored list in redis-cli with lrange data 0 -1 (see the read-back sketch below)
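The same check from Python, as a small sketch using redis-py:

import json
import redis

conn = redis.Redis(host='127.0.0.1', port=6379)
for raw in conn.lrange('data', 0, -1):   # the JSON strings pushed by the pipeline
    print(json.loads(raw))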
Storing to a file, MySQL and Redis at the same time:
- Define three pipeline classes in the pipelines file, i.e. merge the three pipelines above (skeleton below)
- Set their priorities in settings; the lower the number, the earlier the pipeline runs (range 0-1000)
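A skeleton of what that could look like (the class names are assumptions, not the original project code); every process_item must return item so the next pipeline in the chain receives it:

# pipelines.py
class FilePipeline(object):
    def process_item(self, item, spider):
        # write to ./qiubai_pipe.txt as in the file pipeline above
        return item          # pass the item on to the next pipeline

class MysqlPipeline(object):
    def process_item(self, item, spider):
        # insert into MySQL as in the pymysql pipeline above
        return item

class RedisPipeline(object):
    def process_item(self, item, spider):
        # lpush into Redis as in the redis pipeline above
        return item

# settings.py -- lower number = runs earlier, values range from 0 to 1000
ITEM_PIPELINES = {
    'first.pipelines.FilePipeline': 200,
    'first.pipelines.MysqlPipeline': 300,
    'first.pipelines.RedisPipeline': 400,
}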
Further reading:
https://www.cnblogs.com/foremostxl/p/10085232.html#_label9