I picked up Python at my lab's request. I never studied it very systematically, but I'm recording some of the commonly used building blocks here so they're easy to reuse later.
File operations
# coding=utf-8
import os
import os.path
import uuid
import re
import sys
import codecs
reload(sys)
sys.setdefaultencoding('utf-8')
# Folder to work on; backslash is an escape character, so it has to be doubled as \\
path = 'D:\\python_code\\Pro\\uniquePro'
# Collect every file name under the directory into files
files = os.listdir(path.decode('utf-8'))
for file_name in files:
    # Build the full path of one txt file
    txt_path = os.path.join(path.decode('utf-8'), file_name)
    # Keep the opened file handle in contents
    contents = codecs.open(txt_path, 'r', encoding='utf-8')
    for content in contents:
        # Process each line of the file here
        pass
    # The processed file goes into a new directory tree
    new_txt_path = os.path.join(u'D:\\python_code\\Pro\\aaa', file_name)
    unique_keywords = codecs.open(new_txt_path, 'w', encoding='utf-8')
    # Release the resources
    unique_keywords.close()
    contents.close()
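A small note on the close() calls above: the same read-process-write loop can also be written with with blocks, which close both files even if the processing raises an exception. A minimal sketch, assuming the same two folders as above:

# coding=utf-8
import os
import codecs

src_dir = u'D:\\python_code\\Pro\\uniquePro'
dst_dir = u'D:\\python_code\\Pro\\aaa'
for file_name in os.listdir(src_dir):
    src_path = os.path.join(src_dir, file_name)
    dst_path = os.path.join(dst_dir, file_name)
    # Both handles are closed automatically when the with block exits
    with codecs.open(src_path, 'r', encoding='utf-8') as src, \
         codecs.open(dst_path, 'w', encoding='utf-8') as dst:
        for line in src:
            dst.write(line)  # placeholder for the real per-line processing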
Database operations
#coding=utf-8
import MySQLdb as mdb
import os.path
import sys
import codecs
reload(sys)
sys.setdefaultencoding('utf-8')

def getProv():
    con_code = mdb.connect(host='127.0.0.1', user='root', passwd='password123', db='techpooldata', charset='utf8')
    cur_code = con_code.cursor()
    # selectSQL = "SELECT universities FROM China_Universities"
    selectSQL = "SELECT dictionary_value FROM `t_dictionary` WHERE `level` = 1 AND dictionary_type = '地区'"
    cur_code.execute(selectSQL)
    rows = cur_code.fetchall()
    unit = []
    for each in rows:
        unit.append(each[0])
    return unit

# getProv returns the rows produced by the SQL query
provList = getProv()
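The snippet above only runs a fixed SELECT. When the query depends on a value coming from Python, it is safer to let the driver do the escaping with a parameterized query. A minimal sketch using the same connection settings; the table and column names (t_unit, unit_name, prov) are made up purely for illustration:

# coding=utf-8
import MySQLdb as mdb

def getUnitsByProv(prov):
    # Same connection settings as above; t_unit / unit_name / prov are placeholder names
    con = mdb.connect(host='127.0.0.1', user='root', passwd='password123',
                      db='techpooldata', charset='utf8')
    cur = con.cursor()
    # %s placeholders let MySQLdb escape the value instead of concatenating it into the SQL
    cur.execute("SELECT unit_name FROM t_unit WHERE prov = %s", (prov,))
    rows = cur.fetchall()
    cur.close()
    con.close()
    return [row[0] for row in rows]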
Web crawler where the response comes back as JSON
#coding=utf-8
from lxml import etree
import requests
import MySQLdb as mdb
import codecs
import sys
import time
import random
import re
reload(sys)
sys.setdefaultencoding('utf8')

# Initialization: fetch the unit names from the database
# (getUnit is assumed to follow the same pattern as getProv above)
unitList = getUnit()
yearList = [2012, 2013, 2014, 2015, 2016]
session = requests.Session()
# Output file for the parsed rows (the file name here is arbitrary)
fileWriter = codecs.open("unit_data.txt", mode='w', encoding='utf-8')
params = {
    "entity": "浙江大学",
    "name": "",
    "type": "",
    "year1": 2013,
    "year2": 2013,
    "_index1": "",
    "_index2": "",
    "pageindex": 1
}
for unit in unitList:
    # Skip empty names and any unit whose name contains "大学" (university)
    if unit != '' and unit.find("大学") == -1:
        params['entity'] = unit
        time.sleep(1)
        params['year1'] = 2012
        params['year2'] = 2016
        try:
            json = session.get("http://beta.cingta.com/askMP", params=params).text
            print "Fetching " + unit + "..."
        except Exception:
            print "Fetching " + unit + " for 2012-2016 failed, waiting about a minute..."
            sleep_time = random.randint(60, 61)
            time.sleep(sleep_time)
            json = session.get("http://beta.cingta.com/askMP", params=params).text
        print json
        # {"state":0} means there was not a single row of data
        if json.find('{"state":0}') != -1:
            continue
        # Pull the data out of the JSON string between two known markers
        num1 = json.find('[', 0)
        num2 = json.find(',"count":', 1)
        dataUnPro = json[num1:num2]
        # Iterate over the rows
        dataList = dataUnPro.split("],[")
        for data in dataList:
            dataArray = data.split(',')
            fields = [dataArray[1], dataArray[2],
                      dataArray[3].replace('"', ""),
                      dataArray[4].replace('"', ""),
                      dataArray[5].replace('"', "").replace(']]', "")]
            line = '\t'.join(fields)
            print line
            fileWriter.write(line + '\n')
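Because the response is JSON, the find/split slicing above is fragile; the standard json module can parse it directly. A rough sketch in the same style; note that "data" below is only a guess at the key that holds the row array, since the real key name is not visible in the snippet above:

# coding=utf-8
import json as jsonlib
import requests

session = requests.Session()
params = {"entity": u"浙江大学", "year1": 2012, "year2": 2016, "pageindex": 1}
resp = session.get("http://beta.cingta.com/askMP", params=params)
result = jsonlib.loads(resp.text)
# {"state": 0} still means the query returned nothing
if result.get("state") != 0:
    # "data" is an assumed key name; check the real response and adjust
    for row in result.get("data", []):
        print u'\t'.join(unicode(field) for field in row)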
Web crawler where the response comes back as an HTML page
#coding=utf-8
from lxml import etree
import requests
import codecs
import MySQLdb as mdb
import sys
import time
import random
reload(sys)
sys.setdefaultencoding('utf8')

fileWriter = codecs.open("newCode.txt", mode='wb', encoding='utf-8')
content = codecs.open("2.html", mode='r', encoding='utf-8')
for html in content:
    print html
    tree = etree.HTML(html)
    liList0 = tree.xpath('//*[@class="level0"]')
    for li in liList0:
        # './' searches relative to the current node, while '//' matches anywhere in the document;
        # see the XPath docs for details. To get an element's XPath in Chrome, right-click the tag
        # you want in the developer tools and choose Copy XPath.
        codeList = li.xpath('./span[@class="node_name node_name-code"][1]/text()')
        codeNameList = li.xpath('./span[@class="node_name"][1]/text()')
        # print code.replace("['","").replace("]'","")
        for code, codeName in zip(codeList, codeNameList):
            fileWriter.write(code + '\t' + codeName + '\t' + "0" + '\n')
            print code + '\t' + codeName
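To make the './' versus '//' distinction from the comment above concrete, here is a tiny self-contained example; the HTML fragment is made up purely for illustration:

# coding=utf-8
from lxml import etree

# A made-up fragment reusing the class names from above
html = u'<ul><li class="level0"><span class="node_name">A</span></li>' \
       u'<li class="level1"><span class="node_name">B</span></li></ul>'
tree = etree.HTML(html)
for li in tree.xpath('//*[@class="level0"]'):
    # './' is relative to the current li, so only "A" is found here
    print li.xpath('./span[@class="node_name"]/text()')
# '//' searches the whole document, so both "A" and "B" are found
print tree.xpath('//span[@class="node_name"]/text()')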