Processing text with Python 2.7 on Windows 10 fails with IOError: [Errno 2] No such file or directory — how can this be fixed?

This post describes an IOError hit while preprocessing text with Python 2.7 on Windows 10; it is raised when a custom segmentation dictionary cannot be found. The post contains a complete jieba-based preprocessing pipeline: loading a custom dictionary, word segmentation with POS tagging, stopword removal, and batch processing of a folder tree of files. When the script is actually run, however, it fails with an IOError while loading the user dictionary.
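For reference, the core jieba pattern the whole script is built on can be reduced to a short, self-contained sketch; the file names userdict.txt and stopwords.txt below are placeholders, not the poster's actual files:

# coding=utf-8
# Minimal sketch of the same pipeline: custom dictionary -> POS-tagged cut -> stopword filter.
# "userdict.txt" and "stopwords.txt" are placeholder file names.
from io import open  # gives open() an encoding argument on Python 2.7
import jieba
import jieba.posseg as pseg

jieba.load_userdict("userdict.txt")  # one entry per line: word [freq] [POS]
stopwords = set(line.strip() for line in open("stopwords.txt", encoding="utf-8"))

text = u"我来到北京清华大学"  # any Chinese text to preprocess
for word, flag in pseg.cut(text):  # yields (word, POS tag) pairs
    if word not in stopwords:
        print(word + "/" + flag)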


# coding=utf-8
import os
import sys
import re
import time

import jieba
import jieba.posseg as pseg
from io import open  # Python 2.7: the built-in open() has no encoding argument; io.open does

sys.path.append("../")

jieba.load_userdict("../Python27/fenci/dict.txt")  # load the custom segmentation dictionary

'''
title: corpus preprocessing with jieba: a single-file processor and a batch file processor
1. walk the input tree and locate the text files
2. create the directory structure for the output
3. run jieba segmentation and stopword removal on the original text
4. normalise the preprocessed result and save it under the original directory layout
author: 白宁超
myblog: http://www.cnblogs.com/baiboy/
time: 2017-04-28 10:03:09
'''

'''
Segmentation, POS tagging and stopword removal (single file)
stopwordspath: path to the stopword list
dealpath: path of the Chinese text file to preprocess
savepath: path where the preprocessing result is saved
'''

"""

def cutTxtWord(dealpath, savepath, stopwordspath):

stopwords = {}.fromkeys([line.rstrip() for line in open(stopwordspath, "r", encoding='utf-8')]) # 停用词表

with open(dealpath, "r", encoding='utf-8') as f:

txtlist = f.read() # 读取待处理的文本

words = pseg.cut(txtlist) # 带词性标注的分词结果

cutresult = "" # 获取去除停用词后的分词结果

for word, flag in words:

if word not in stopwords:

cutresult += word + "/" + flag + " " # 去停用词

getFlag(cutresult, savepath) #

"""

'''
Segmentation, POS tagging and stopword removal (batch)
stopwordspath: path to the stopword list
read_folder_path: root folder of the Chinese files to preprocess
write_folder_path: root folder where the preprocessing results are saved
filescount=300  # upper bound on the number of files per folder
'''

def cutFileWord(read_folder_path, write_folder_path, stopwordspath):
    # stopword table
    stopwords = {}.fromkeys([line.rstrip() for line in open(stopwordspath, "r", encoding='utf-8')])

    # all category folders under the input root
    folder_list = os.listdir(read_folder_path)
    # loop over categories
    for folder in folder_list:
        # path of the current category
        new_folder_path = os.path.join(read_folder_path, folder)

        # create the output directory for this category
        path = write_folder_path + folder  # subfolder where this category's results are saved
        isExists = os.path.exists(path)
        if not isExists:
            os.makedirs(path)
            print(path + ' created successfully')
        else:
            pass
        save_folder_path = os.path.join(write_folder_path, folder)  # output path of the current category
        print('--> please wait, processing...')

        # loop over the files inside the category
        files = os.listdir(new_folder_path)
        j = 1
        for file in files:
            if j > len(files): break
            dealpath = os.path.join(new_folder_path, file)  # path of the single file to process
            with open(dealpath, "r", encoding='utf-8') as f:
                txtlist = f.read()
            # filter out Chinese/English punctuation and special symbols
            # txtlist1 = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "", txtlist)
            words = pseg.cut(txtlist)  # POS-tagged segmentation result
            cutresult = ""  # one file's result after segmentation and stopword removal
            for word, flag in words:
                if word not in stopwords:
                    cutresult += word + "/" + flag + " "  # drop stopwords
            savepath = os.path.join(save_folder_path, file)
            getFlag(cutresult, savepath)
            j += 1

'''
POS filtering
cutresult: str, the initial segmentation result
savepath: path where the result file is saved
'''

def getFlag(cutresult, savepath):
    txtlist = []  # result after the unwanted POS tags have been filtered out
    # POS tags to filter out (customise as needed)
    cixing = ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]
    for line in cutresult.split('\n'):
        line_list2 = re.split('[ ]', line)
        line_list2.append("\n")  # keep the original paragraph layout
        line_list = line_list2[:]
        for segs in line_list2:
            for K in cixing:
                if K in segs:
                    line_list.remove(segs)
                    break
                else:
                    pass
        txtlist.extend(line_list)

    # strip the POS tags
    resultlist = txtlist[:]
    flagresult = ""
    for v in txtlist:
        if "/" in v:
            slope = v.index("/")
            letter = v[0:slope] + " "
            flagresult += letter
        else:
            flagresult += v
    standdata(flagresult, savepath)

'''
Normalisation: remove blank lines, extra whitespace, etc.
flagresult: the filtered result
savepath: path where the result file is saved
'''

def standdata(flagresult, savepath):
    f2 = open(savepath, "w", encoding='utf-8')
    for line in flagresult.split('\n'):
        if len(line) >= 2:
            line_clean = "/ ".join(line.split())
            lines = line_clean + " " + "\n"
            f2.write(lines)
        else:
            pass
    f2.close()

if __name__ == '__main__':
    t1 = time.time()

    # single-file test
    # dealpath = "../Database/SogouC/FileTest/1.txt"
    # savepath = "../Database/SogouCCut/FileTest/1.txt"
    # stopwordspath = '../Database/stopwords/CH_stopWords.txt'
    stopwordspath1 = '../Python27/fenci/chstop.txt'  # HIT (Harbin Institute of Technology) stopword list

    # batch-process the files under a folder
    # rfolder_path = '../Database/SogouC/Sample/'
    rfolder_path = '../Python27/fenci/FileNews/'
    # root folder where the segmented results are saved
    wfolder_path = '../Python27/fenci/result/'

    # Chinese corpus preprocessors
    # cutTxtWord(dealpath, savepath, stopwordspath)  # single-file preprocessor
    cutFileWord(rfolder_path, wfolder_path, stopwordspath1)  # batch preprocessor

    t2 = time.time()
    print("Chinese corpus preprocessing finished, elapsed: " + str(t2 - t1) + " seconds.")  # report the result

The run output is as follows:

Building prefix dict from the default dictionary ...
Loading model from cache c:\users\hp\appdata\local\temp\jieba.cache
Loading model cost 0.478 seconds.
Prefix dict has been built succesfully.
Traceback (most recent call last):
  File "D:/Python27/fenci/fenci4.py", line 10, in <module>
    jieba.load_userdict("../Python27/fenci/dict.txt")  # load the custom segmentation dictionary
  File "D:\Python27\lib\site-packages\jieba\__init__.py", line 374, in load_userdict
    f = open(f, 'rb')
IOError: [Errno 2] No such file or directory: '../Python27/fenci/dict.txt'
Process finished with exit code 1
