我们使用语言与文字描述自然,语言有其内在的结构;如同原始人用鸟鸣声进行狩猎活动一样,声音带有特定的“含义”。文章、文本将所有的内容统一到一个框架下,我们所要做的只是将其“还原”。
理想状态
面对百万字级别的计算量,采用统计法:对关键字进行分群处理和查找;先用统计的方式收集文本,再对收集到的文本进行归类(群、特征),从而得出相应的情节和事件。
想要的分类标签网上没有;想用神经网络实现,却发现成本有点大;于是直接使用关键字击中(命中)来筛选。
使用 re 对百万字文本进行分析,速度还是可以的。
import re
from collections import Counter
import time
import json
# Read the whole corpus into memory in a single call.
with open("XX.txt", encoding="utf-8") as f:
    h = f.read()

# Replace the characters . * ? | [ ] ( ) + - ` with spaces.
h = re.sub(r"[.*?|\[\]()+\-`]", " ", h)

# --- Pass 1: single-character statistics ---------------------------------
c = Counter(h)
print("# 统计数重复==1的词")
# f1: characters seen exactly once; f2: characters seen more than once.
f1 = {k: v for k, v in c.items() if v == 1}
f2 = {k: v for k, v in c.items() if v > 1}

# Accumulates the `result` mapping of every extension pass below. It is
# created here, above the repeated section, so re-running that section
# keeps adding to it.
out = {}

# --- Extension pass --------------------------------------------------------
# Re-run everything from here down once per extra character of n-gram
# length (x, xx, xxx, xxxx, ...): each run extends the repeated n-grams in
# f2 by one character and replaces f2 with the longer repeated n-grams.
print(time.strftime("2 %H %M %S", time.localtime()))
# One alternative per repeated n-gram: the n-gram followed by any single
# character. Keys are escaped because the substitution above leaves other
# regex metacharacters (^ $ { } \) in the text.
ns = ".|".join(map(re.escape, f2)) + "."
print(ns)
# With no repeated n-grams the pattern would collapse to a bare "." and
# match every character, so skip the scan in that case.
r = Counter(re.findall(ns, h, re.S)) if f2 else Counter()
f1 = {k: v for k, v in r.items() if v == 1}

# Restore the original statistics: for every extended n-gram that occurs
# only once, record its prefix (the n-gram without the last character)
# together with the count that prefix had in f2.
result = {k[:-1]: f2[k[:-1]] for k in f1}
print(time.strftime("2 %H %M %S", time.localtime()))
out.update(result)
f2 = {k: v for k, v in r.items() if v > 1}
--------------------------------------------------------------------------------
import requests
import re
from collections import Counter
# Step 1 - master index. The live request below is disabled; the copy
# stored in total.txt is read instead.
##h = requests.get("https://xxxxxx/{0}.html".format(1))
##h = h.content.decode("utf-8")
##with open("total.txt","w",encoding = "utf-8") as f:
##    f.write(h)
with open("total.txt", "r", encoding="utf-8") as f:
    hcontent = f.read()
##print(h)

# Tag-hit patterns, compiled once instead of on every chapter.
_TAG_PATTERNS = (
    re.compile('门派|穿越|皇帝|王爷|唐朝|宋朝|清朝', re.S),
    re.compile("魔法", re.S),
    re.compile("剑|道士|道人|和尚", re.S),
    re.compile("精神病院|青山医院|练功|针灸", re.S),
)

# Book list: the text between "<a" and "</a>" of every anchor in the index.
content_list = re.findall('<a(.*?)</a>', hcontent, re.S)
##print(content_list)
for entry in content_list[27:1025]:    #27-1025
    print("********************")
    print(entry)
    # 1: book URL - the first double-quoted value inside the anchor.
    href = entry.split('"')[1]
    # 2: book title - the text after the first '>' of the anchor.
    ##    print(href)
    name = entry.split('>')[1]
    # Follow the link to the book page. The timeout keeps a stalled
    # connection from blocking the crawl indefinitely.
    h2 = requests.get(href, timeout=30)    #******
    h2 = h2.content.decode("utf-8")
    ##    with open("222.txg","w",encoding = "utf-8") as f:
    ##        f.write(h2)
    ##    with open("222.txg",encoding = "utf-8") as f:
    ##        h2 = f.read()
    ##    print(h2)
    # Table-of-contents box: every <ul class="vlist"> on the page.
    r2 = re.findall('<ul class="vlist">(.*?)</ul>', h2, re.S)
    r2 = " ".join(r2)
    # Chapter anchors inside the table of contents.
    r2 = re.findall('<a(.*?)</a>', r2, re.S)
    print("****")
    # The last three anchors, in reverse order.
    r3 = r2[-1:-4:-1]
    print(r3)
    # Fetch the content of those chapters (1 2 3).
    for chapter in r3:    #r2[31:29:-1]:
        print(chapter)
        href = "https://xxxxx" + chapter.split('"')[1]
        page = requests.get(href, timeout=30).content.decode("utf-8")
        ##        with open("{0}".format(n)+".txt","w",encoding = "utf-8") as f:
        ##            f.write(h)
        ##
        ##        with open("{0}.txt".format("%d",n),encoding = "utf-8") as f:
        ##            h = f.read()
        paragraphs = re.findall("<p>(.*?)</p>", page, re.S)
        ##        print(r)
        text = " ".join(paragraphs)
        # Tag hits: the distinct keywords of each group found in the text.
        c1, c2, c3, c4 = (
            Counter(pattern.findall(text)).keys() for pattern in _TAG_PATTERNS
        )
        print({name: (c1, c2, c3, c4)})
#------------------------------------------------------------------------------------------
# NOTE(review): the original indentation was lost; this pause is assumed to
# run once after the whole crawl rather than once per book - confirm.
input("bbb")
##print(r[27:100])    #1025
##    print(name)
##with open("page.txt","w",encoding = "utf-8") as f:
##    f.write(h)
with open("page.txt", "r", encoding="utf-8") as f:
    h = f.read()
##print(h)
##print(r[31:29:-1])
# Fetch chapters (1 2 3).
# NOTE(review): `r` (the sequence sliced below) and `name` (used in the final
# print) are not assigned anywhere in this fragment; presumably they come
# from an earlier step - confirm before running.
n = 0
for n, anchor in enumerate(r[31:30:-1], start=1):    #r[31:29:-1]
    print(anchor)
    href = "https://xxxxxx" + anchor.split('"')[1]
    # The timeout keeps a stalled connection from blocking forever.
    page = requests.get(href, timeout=30).content.decode("utf-8")
    print(page)
    # Save the page as "<n>.txt" and read that same file back. The read
    # used to build its name with "{0}.txt".format("%d", n), which always
    # produced "%d.txt" and so never matched the file just written.
    chapter_file = "{0}.txt".format(n)
    with open(chapter_file, "w", encoding="utf-8") as f:
        f.write(page)
    with open(chapter_file, encoding="utf-8") as f:
        page = f.read()
    paragraphs = re.findall("<p>(.*?)</p>", page, re.S)
    print(paragraphs)
    text = " ".join(paragraphs)
    # Tag hits: the distinct keywords of each group found in the text.
    c1 = Counter(re.findall('门派|穿越|皇帝|王爷|唐朝|宋朝|清朝', text, re.S)).keys()
    c2 = Counter(re.findall("魔法", text, re.S)).keys()
    c3 = Counter(re.findall("剑|道士|道人|和尚", text, re.S)).keys()
    c4 = Counter(re.findall("精神病院|青山医院|练功|针灸", text, re.S)).keys()
    print({name: (c1, c2, c3, c4)})
# NOTE(review): the original indentation was lost; this pause is assumed to
# run once after the loop - confirm.
input("aa")
# Restore the original counts *******************************
# NOTE(review): v1, v2 and d0 are read in this fragment before any visible
# assignment; presumably they were set in an earlier interactive step -
# confirm the intended statement order. The timestamps printed between the
# steps show how long each step takes.
print(time.strftime("%H %M %S",time.localtime()))
##h= {}
# Drop the last character of every key in v1. Keys that become equal after
# trimming collapse into one entry (the last value wins).
v1 = {k[0:-1]:v for k,v in v1.items()}
print(time.strftime("%H %M %S",time.localtime()))
# Keys present in both the trimmed v1 and v2.
st = v1.keys() & v2.keys()
print(st)
print(time.strftime("%H %M %S",time.localtime()))
# For each shared key, take its value from v2.
h = {k:v2[k] for k in st}
# Entries of d0 whose value is exactly 1.
v1 = dict(filter(lambda v:v[1]==1,d0.items()))
# Restore the original counts *******************************
# Same trim-and-intersect steps as above, applied to the v1 just rebuilt
# from d0.
print(time.strftime("%H %M %S",time.localtime()))
##h= {}
v1 = {k[0:-1]:v for k,v in v1.items()}
print(time.strftime("%H %M %S",time.localtime()))
st = v1.keys() & v2.keys()
print(st)
-------------------------------------------------------
# NOTE(review): st, v1, v2 and d0 are read here before any assignment inside
# this fragment (d0 is only loaded on the last lines); presumably they come
# from an earlier interactive step - confirm the intended statement order.
print(time.strftime("%H %M %S", time.localtime()))
# For each key in st, take its value from v2.
h = {k: v2[k] for k in st}
##h = { k for k in st}
print(time.strftime("%H %M %S", time.localtime()))
# Order the entries by value, largest first.
h = dict(sorted(h.items(), key=lambda v: v[1], reverse=True))
print(h)
print(time.strftime("%H %M %S", time.localtime()))
# Entries of d0 whose value is greater than 1.
v2 = dict(filter(lambda v: v[1] > 1, d0.items()))
print(len(v1))
print(len(v2))
print("###############5字统计")
# Load d4.txt as JSON into d0. The read is indented under the `with` so it
# runs while the file is still open.
with open("d4.txt", encoding="utf-8") as f:
    d0 = json.load(f)
不成熟的技术不要入局,除非你是头部的那几方。