python 文本击中

我们使用语言与文字描述自然,语言有其内在的结构;如同原始人用鸟鸣声进行狩猎活动一样,声音带有特定的“含义”。文章、文本将所有的内容统一到一个框架下,我们所要做的只是将其“还原”。

理想状态

面对百万字的计算量,采用统计法:按关键字分群处理、查找;先使用统计进行文本收集,再对收集的文本进行归类(群、特征),从而得出相应的情节、事件。

想要的分类标签网上没有;想要用神经网络实现,又发现成本有点大;于是直接使用关键字击中来筛选。

用 re 进行百万字分析,速度还是可以的。

import re
from collections import Counter
import time
import json

# Load the whole corpus into memory ("XX.txt" is the placeholder path used in
# the article).
with open("XX.txt", encoding="utf-8") as f:
    h = f.read()

# Replace a set of regex metacharacters with spaces.  The text is later
# spliced into a regular expression, where these characters would otherwise
# be interpreted as operators.  A raw string keeps the backslashes without
# relying on invalid string escapes.
h = re.sub(r"[\.\*\?\|\[\]\(\)\+\-`]", " ", h)

# Single-character frequency table.
c = Counter(h)


print("# 统计数重复==1的词")
# f1: lazy stream of (char, count) pairs for characters seen exactly once.
# f2: dict of the characters seen more than once.
f1 = filter(lambda v: v[1] == 1, c.items())
f2 = {char: count for char, count in c.items() if count > 1}




# Repeat this step to grow the statistics one character at a time:
# x, xx, xxx, xxxx.
# NOTE(review): the number of rounds (3) is inferred from the "x xx xxx xxxx"
# note above -- adjust as needed.  `h` is the cleaned corpus and `f2` the
# fragments that occurred more than once in the previous round.
out = {}
for _ in range(3):
    if not f2:
        # No fragment repeats any more; an empty alternation would degrade
        # to the pattern "." and match every single character.
        break
    print(time.strftime("2 %H %M %S", time.localtime()))

    # Each repeating fragment followed by any one character (re.S lets "."
    # match newlines too).  Fragments are escaped so that a leftover
    # metacharacter such as "^" or "$" is matched literally.
    ns = ".|".join(re.escape(fragment) for fragment in f2) + "."
    print(ns)
    r = Counter(re.findall(ns, h, re.S))

    # Extended fragments that occur exactly once.
    f1 = {k: v for k, v in r.items() if v == 1}
    # Restore the previous statistics: drop the extra character and report
    # the prefix together with its count from the previous round.
    result = {k[:-1]: f2[k[:-1]] for k in f1}  # output
    print(time.strftime("2 %H %M %S", time.localtime()))
    out.update(result)

    # Extended fragments that still repeat seed the next round.
    f2 = {k: v for k, v in r.items() if v > 1}

--------------------------------------------------------------------------------

import requests
import re
from collections import Counter


# Step 1: the site index.  The commented-out lines show how the page was
# fetched and written to "total.txt"; the script works from that saved copy.
##h = requests.get("https://xxxxxx/{0}.html".format(1))
##h = h.content.decode("utf-8")
##with open("total.txt","w",encoding = "utf-8") as f:
##    f.write(h)

with open("total.txt", mode="r", encoding="utf-8") as index_file:
    hcontent = index_file.read()
##print(h)

# Book list: one entry per anchor, holding everything between "<a" and "</a>".
anchor_pattern = re.compile('<a(.*?)</a>', re.S)
content_list = anchor_pattern.findall(hcontent)
##print(content_list)

# Keyword groups used to "hit" tags in the chapter text.  Compiled once here
# instead of on every chapter.
TAG_PATTERNS = (
    re.compile('门派|穿越|皇帝|王爷|唐朝|宋朝|清朝'),
    re.compile("魔法"),
    re.compile("剑|道士|道人|和尚"),
    re.compile("精神病院|青山医院|练功|针灸"),
)
# Seconds to wait for the server; without a timeout a stalled connection
# would block the crawl indefinitely.
REQUEST_TIMEOUT = 30

for book_anchor in content_list[27:1025]:  # 27-1025
    print("********************")
    print(book_anchor)
    # 1. Book URL: the first double-quoted value inside the anchor.
    href = book_anchor.split('"')[1]
    # 2. Book title: the text right after the first ">".
    name = book_anchor.split('>')[1]

    # Follow the link to the book's own page.
    book_response = requests.get(href, timeout=REQUEST_TIMEOUT)
    book_html = book_response.content.decode("utf-8")

    # The table of contents sits in <ul class="vlist"> ... </ul>.
    toc_html = " ".join(
        re.findall('<ul class="vlist">(.*?)</ul>', book_html, re.S)
    )

    # Chapter links inside the table of contents.
    chapter_anchors = re.findall('<a(.*?)</a>', toc_html, re.S)
    print("****")
    # Sample the last three chapter links, in reverse order.
    sampled_chapters = chapter_anchors[-1:-4:-1]

    print(sampled_chapters)

    # Fetch each sampled chapter and look for tag keywords in its text.
    for chapter_anchor in sampled_chapters:
        print(chapter_anchor)
        chapter_url = "https://xxxxx" + chapter_anchor.split('"')[1]
        chapter_response = requests.get(chapter_url, timeout=REQUEST_TIMEOUT)
        chapter_html = chapter_response.content.decode("utf-8")

        # Chapter body: the contents of every <p> element, space-joined.
        chapter_text = " ".join(
            re.findall("<p>(.*?)</p>", chapter_html, re.S)
        )

        # Tag hits: the distinct keywords found for each group.
        hits = tuple(
            Counter(pattern.findall(chapter_text)).keys()
            for pattern in TAG_PATTERNS
        )

        print({name: hits})


#------------------------------------------------------------------------------------------

# Pause until the user presses Enter.
input("bbb")

##print(r[27:100]) #1025

##    print(name)

# Single-book variant of the chapter sampler.  The commented-out lines show
# how the page was written to "page.txt"; the saved copy is read back here.
##with open("page.txt","w",encoding = "utf-8") as f:
##    f.write(h)

with open("page.txt", "r", encoding="utf-8") as f:
    h = f.read()
##print(h)

##print(r[31:29:-1])

# Fetch chapters 1 2 3.
# NOTE(review): `r` is expected to hold the list of anchors and `name` the
# book title; neither is built in this fragment -- confirm where they come
# from before running.
n = 0
for chapter_anchor in r[31:30:-1]:  # r[31:29:-1]
    print(chapter_anchor)
    n += 1
    href = "https://xxxxxx" + chapter_anchor.split('"')[1]
    # Time out rather than hang forever on a stalled server.
    response = requests.get(href, timeout=30)
    h = response.content.decode("utf-8")
    print(h)

    # Use one path for both the write and the read-back so the two can never
    # point at different files.
    chapter_path = "{0}.txt".format(n)
    with open(chapter_path, "w", encoding="utf-8") as f:
        f.write(h)

    with open(chapter_path, encoding="utf-8") as f:
        h = f.read()

    # Chapter body: the contents of every <p> element.  Separate names are
    # used so the anchor list `r` is not overwritten inside the loop.
    paragraphs = re.findall("<p>(.*?)</p>", h, re.S)
    print(paragraphs)
    chapter_text = " ".join(paragraphs)

    # Tag hits: the distinct keywords found for each group.
    c1 = Counter(re.findall('门派|穿越|皇帝|王爷|唐朝|宋朝|清朝', chapter_text, re.S)).keys()
    c2 = Counter(re.findall("魔法", chapter_text, re.S)).keys()
    c3 = Counter(re.findall("剑|道士|道人|和尚", chapter_text, re.S)).keys()
    c4 = Counter(re.findall("精神病院|青山医院|练功|针灸", chapter_text, re.S)).keys()

    print({name: (c1, c2, c3, c4)})


# Pause until the user presses Enter.
input("aa")

# Restore the original counts *******************************
# Scratch fragment: v1, v2 and d0 are not defined anywhere in the visible
# code -- presumably they come from an earlier interactive session (TODO
# confirm).  The timestamps printed around each step are used for timing.
print(time.strftime("%H %M %S",time.localtime()))
##h= {}
# Drop the last character of every key in v1.
v1 = {k[0:-1]:v for k,v in v1.items()}
print(time.strftime("%H %M %S",time.localtime()))
# Keys present in both v1 and v2.
st = v1.keys() & v2.keys()
print(st)


print(time.strftime("%H %M %S",time.localtime()))
# For the shared keys, take the value stored in v2.
h = {k:v2[k] for k in st}

# Rebuild v1 from d0: only the entries whose value is exactly 1.
v1 = dict(filter(lambda v:v[1]==1,d0.items()))

# Restore the original counts *******************************
# Same steps as above, applied to the freshly rebuilt v1.
print(time.strftime("%H %M %S",time.localtime()))
##h= {}
v1 = {k[0:-1]:v for k,v in v1.items()}
print(time.strftime("%H %M %S",time.localtime()))
st = v1.keys() & v2.keys()
print(st)

-------------------------------------------------------

# Scratch fragment: relies on st, v1, v2 and d0 already existing when it runs.
print(time.strftime("%H %M %S",time.localtime()))
# For every key in st, look up its value in v2.
h = {k:v2[k] for k in st}

##h = { k for k in st}
print(time.strftime("%H %M %S",time.localtime()))
# Order the entries by value, largest first.
h = dict(sorted(h.items(),key = lambda v:v[1],reverse = True))
print(h)
print(time.strftime("%H %M %S",time.localtime()))


# Entries of d0 whose value is greater than 1.
# NOTE(review): d0 is read here but only loaded from "d4.txt" further down --
# presumably it was loaded by an earlier run; confirm the intended order.
v2 = dict(filter(lambda v:v[1]>1,d0.items()))
print(len(v1))
print(len(v2))

print("###############5字统计")
# Load d0 from the JSON file "d4.txt".
with open("d4.txt",encoding = "utf-8") as f:
    d0 = json.load(f)
 

不成熟的技术不要轻易入场,除非你是头部的几方。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值