python 文本击中

原创已于 2025-04-04 07:58:24 修改 · 370 阅读

6 ·

CC 4.0 BY-SA版权

文章标签：

#windows #服务器 #linux

于 2025-02-24 15:23:53 首次发布

我们使用语言与文字描述自然，语言有其内在的结构；如同原始人用鸟鸣声进行狩猎活动一样，特定的声音对应特定的“含义”；文章、文本将所有的内容统一到一个框架下，我们所要做的只是将其“还原”。

理想状态

百万字级的计算量：用统计法对关键字进行分群处理和查找；先使用统计进行文本收集，再对收集到的文本按群、特征进行归类，从而得出相应的情节事件。

想要的分类标签网上没有；想用神经网络实现，却发现成本有点大；于是直接使用关键词“击中”（命中）来筛选。

使用 re 对百万字的文本进行分析，速度还是可以的。

import re
from collections import Counter
import time
import json

# Longest n-gram length to report. The original note says the pass is
# "repeated to count x, xx, xxx, xxxx", i.e. lengths 1 through 4.
MAX_NGRAM_LEN = 4


def clean_text(text):
    """Replace regex metacharacters in *text* with spaces.

    The characters ``. * ? | [ ] ( ) + - ` `` are blanked out. The pattern is
    the same as before, now written as a raw string so Python no longer
    warns about invalid escape sequences.
    """
    return re.sub(r"[\.\*\?\|\[\]\(\)\+\-`]", " ", text)


def build_pattern(keys):
    """Return a regex matching any of *keys* followed by one more character.

    Each key is passed through ``re.escape`` because ``clean_text`` does not
    remove every metacharacter (``^``, ``$``, ``\\``, ``{`` and ``}`` survive).
    """
    return ".|".join(re.escape(key) for key in keys) + "."


def extend_once(text, repeated):
    """Extend every repeated n-gram by one character and recount.

    Args:
        text: The cleaned text to scan.
        repeated: Mapping of n-gram -> count for n-grams seen more than once.

    Returns:
        A ``(finished, still_repeated)`` tuple. ``finished`` maps an n-gram
        that has at least one extension occurring exactly once to its count
        from *repeated*. ``still_repeated`` maps each (n+1)-gram found more
        than once to its count and is the input for the next pass.

    Note:
        ``re.findall`` returns non-overlapping matches, so the counts are
        non-overlapping counts, exactly as in the original code.
    """
    if not repeated:
        # An empty alternation would degrade to "." and match every character.
        return {}, {}
    ns = build_pattern(repeated)
    print(ns)
    counts = Counter(re.findall(ns, text, re.S))
    # Restore the original statistic: drop the extra trailing character and
    # look the prefix's count up in the previous pass.
    finished = {k[:-1]: repeated[k[:-1]] for k, v in counts.items() if v == 1}
    still_repeated = {k: v for k, v in counts.items() if v > 1}
    return finished, still_repeated


def run_ngram_stats():
    """Read XX.txt and collect repeated n-grams up to MAX_NGRAM_LEN characters."""
    with open("XX.txt", encoding="utf-8") as f:
        h = f.read()

    h = clean_text(h)

    # Single-character statistics.
    c = Counter(h)

    print("# 统计数重复==1的词")
    f2 = {k: v for k, v in c.items() if v > 1}

    # Run the extension pass once per n-gram length (x, xx, xxx, xxxx).
    out = {}
    for _ in range(MAX_NGRAM_LEN):
        if not f2:
            break
        print(time.strftime("2 %H %M %S", time.localtime()))
        result, f2 = extend_once(h, f2)
        print(time.strftime("2 %H %M %S", time.localtime()))
        out.update(result)

    # Output the result.
    print(out)
    return out


if __name__ == "__main__":
    run_ngram_stats()

--------------------------------------------------------------------------------

import requests
import re
from collections import Counter

#获取总目录 1
##h = requests.get("https://xxxxxx/{0}.html".format(1))
##h = h.content.decode("utf-8")
##with open("total.txt","w",encoding = "utf-8") as f:
## f.write(h)

# Site prefix prepended to the relative chapter links (placeholder from the post).
CHAPTER_BASE_URL = "https://xxxxx"

# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 10

# Keyword groups used to tag a chapter; one regex alternation per group.
TAG_PATTERNS = (
    '门派|穿越|皇帝|王爷|唐朝|宋朝|清朝',
    "魔法",
    "剑|道士|道人|和尚",
    "精神病院|青山医院|练功|针灸",
)


def hit_tags(text):
    """Return the distinct keywords of each tag group that occur in *text*.

    Returns:
        A 4-tuple ``(c1, c2, c3, c4)``, one list per entry of TAG_PATTERNS,
        holding the matched keywords in first-seen order without duplicates.
    """
    return tuple(
        list(dict.fromkeys(re.findall(pattern, text, re.S)))
        for pattern in TAG_PATTERNS
    )


def fetch_html(url):
    """Download *url* and return the body decoded as UTF-8."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return response.content.decode("utf-8")


def run_tag_scan():
    """Scan the books listed in total.txt and print the tags hit by each chapter.

    total.txt is a saved copy of the site's index page. For each book the
    last three chapters of its chapter list are fetched and tagged.
    """
    with open("total.txt", "r", encoding="utf-8") as f:
        hcontent = f.read()

    # Book list: the inner text of every <a ...>...</a> on the index page.
    content_list = re.findall('<a(.*?)</a>', hcontent, re.S)

    for book in content_list[27:1025]:  # 27-1025
        print("********************")
        print(book)
        quoted = book.split('"')
        titled = book.split('>')
        if len(quoted) < 2 or len(titled) < 2:
            # Anchor without a quoted href or without link text: skip it.
            continue
        href = quoted[1]   # 1: book URL
        name = titled[1]   # 2: book title

        # Open the book's page.
        h2 = fetch_html(href)

        # Chapter list container: <ul class="vlist">.
        r2 = re.findall('<ul class="vlist">(.*?)</ul>', h2, re.S)
        r2 = " ".join(r2)

        # Chapter links; keep the last three, newest first.
        r2 = re.findall('<a(.*?)</a>', r2, re.S)
        print("****")
        r3 = r2[-1:-4:-1]
        print(r3)

        # Fetch each chapter and tag it.
        # NOTE(review): the pasted source lost its indentation. Tagging is
        # placed inside the chapter loop because `r` is overwritten per
        # chapter -- confirm against the author's original.
        for chapter in r3:
            print(chapter)
            chapter_parts = chapter.split('"')
            if len(chapter_parts) < 2:
                continue
            chapter_href = CHAPTER_BASE_URL + chapter_parts[1]
            h = fetch_html(chapter_href)

            r = re.findall("<p>(.*?)</p>", h, re.S)
            r = " ".join(r)

            # Tag hits.
            c1, c2, c3, c4 = hit_tags(r)
            print({name: (c1, c2, c3, c4)})


if __name__ == "__main__":
    run_tag_scan()

#------------------------------------------------------------------------------------------

# Earlier single-book draft of the crawler, kept for reference.
# NOTE(review): this fragment relies on `r` (list of chapter anchors), `name`
# and `c1`..`c4` already being defined by code that ran before it.
input("bbb")

# page.txt is a previously saved copy of the book page.
with open("page.txt", "r", encoding="utf-8") as f:
    h = f.read()

# Fetch chapters 1 2 3.
n = 0
for i in r[31:30:-1]:  # r[31:29:-1]
    print(i)
    n = n + 1
    href = "https://xxxxxx" + i.split('"')[1]
    resp = requests.get(href)
    h = resp.content.decode("utf-8")
    print(h)

    # Cache the chapter on disk, then read it back from the same file.
    # The original read used "{0}.txt".format("%d", n), which always yields
    # "%d.txt" and so never matched the file written just above.
    chapter_file = "{0}.txt".format(n)
    with open(chapter_file, "w", encoding="utf-8") as f:
        f.write(h)

    with open(chapter_file, encoding="utf-8") as f:
        h = f.read()

    r = re.findall("<p>(.*?)</p>", h, re.S)
    print(r)
    r = " ".join(r)

print({name: (c1, c2, c3, c4)})

input("aa")

# Restore the original counts *******************************
# NOTE(review): v1, v2 and d0 are not defined in this fragment; they are
# presumably the count==1 / count>1 splits of a loaded n-gram table -- confirm.
print(time.strftime("%H %M %S",time.localtime()))
##h= {}
# Drop the last character of every key in v1, keeping the value.
v1 = {k[0:-1]:v for k,v in v1.items()}
print(time.strftime("%H %M %S",time.localtime()))
# Keys present in both the trimmed v1 and v2.
st = v1.keys() & v2.keys()
print(st)

print(time.strftime("%H %M %S",time.localtime()))
# For each shared key, take its value from v2.
h = {k:v2[k] for k in st}

# Rebuild v1 as the entries of d0 whose value is exactly 1.
v1 = dict(filter(lambda v:v[1]==1,d0.items()))

-------------------------------------------------------

# Build the shared-key table, sort it by value (descending) and print it.
# The time.strftime prints bracket each step with a wall-clock timestamp.
# NOTE(review): st, v1, v2 and d0 come from code outside this fragment.
print(time.strftime("%H %M %S",time.localtime()))
# For each key in st, take its value from v2.
h = {k:v2[k] for k in st}

##h = { k for k in st}
print(time.strftime("%H %M %S",time.localtime()))
# Sort by value, largest first.
h = dict(sorted(h.items(),key = lambda v:v[1],reverse = True))
print(h)
print(time.strftime("%H %M %S",time.localtime()))

# v2 becomes the entries of d0 whose value is greater than 1.
v2 = dict(filter(lambda v:v[1]>1,d0.items()))
print(len(v1))
print(len(v2))

# Load the saved statistics table from d4.txt (JSON) into d0.
# The `with` body was not indented in the pasted source; restored here.
print("###############5字统计")
with open("d4.txt", encoding="utf-8") as f:
    d0 = json.load(f)

不成熟的技术不要轻易入场，除非你是头部的那几家。