说明:该功能是为了将关键词加入到 embedding 模型中,以便在 embedding 模型中对关键词进行 embedding。该功能通过修改 embedding 模型的 tokenizer 来实现,仅对 EMBEDDING_MODEL 参数对应的模型有效,输出后的模型保存在原本模型。
一、提取关键词的方法
使用jieba库对文本中的关键词进行提取
import os
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
def read_files_from_directory(directory):
    """Read every ``.txt`` file directly inside *directory* as UTF-8 text.

    Args:
        directory: Path of the directory to scan. Sub-directories are not
            descended into.

    Returns:
        A ``(filenames, file_contents)`` tuple of two parallel lists: the
        bare file names and the decoded text of each file, both in sorted
        file-name order.
    """
    file_contents = []
    filenames = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # index -> file mapping reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)
        # A directory that merely happens to be named "*.txt" cannot be
        # opened as a file, so skip anything that is not a regular file.
        if not os.path.isfile(filepath):
            continue
        with open(filepath, 'r', encoding='utf-8') as file:
            file_contents.append(file.read())
        filenames.append(filename)
    return filenames, file_contents
def tokenize(text):
    """Segment *text* with jieba and return the segments joined by spaces.

    Args:
        text: The raw text to segment.

    Returns:
        A single string made of the segments produced by ``jieba.cut``,
        separated by one space each. Callers that need a list of tokens
        should split the result on whitespace.
    """
    # The previous debug print of the jieba.cut() result only wrote the
    # iterator object's repr to stdout on every call, so it was removed.
    return " ".join(jieba.cut(text))
def generate_keywords(file_contents, top_k=10):
    """Pick the highest-weighted TF-IDF terms of each document.

    Args:
        file_contents: List of document texts.
        top_k: Maximum number of keywords to keep per document
            (default 10, matching the previous hard-coded value).

    Returns:
        A dict mapping each document's position in *file_contents* to a
        list of at most *top_k* terms, ordered from highest to lowest
        TF-IDF weight. Terms with zero weight for a document are omitted.
        An empty *file_contents* yields an empty dict.
    """
    if not file_contents:
        # fit_transform() cannot build a vocabulary from zero documents.
        return {}

    def _split_into_tokens(text):
        # tokenize() returns one space-joined string, while the vectorizer
        # needs a sequence of tokens; passing the string through unchanged
        # would make every single character (spaces included) a "term".
        return tokenize(text).split()

    # No max_features cap: it would restrict the vocabulary of the WHOLE
    # corpus to a handful of terms, so every document would be ranked over
    # the same few words. token_pattern=None states explicitly that the
    # default regex is unused because a custom tokenizer is supplied.
    vectorizer = TfidfVectorizer(
        tokenizer=_split_into_tokens,
        token_pattern=None,
        stop_words=None,
    )
    X = vectorizer.fit_transform(file_contents)
    terms = vectorizer.get_feature_names_out()
    limit = max(top_k, 0)

    keywords_per_file = {}
    for idx in range(X.shape[0]):
        # Densify one row at a time instead of the whole matrix.
        weights = X[idx].toarray().ravel()
        ranked = weights.argsort()[::-1][:limit]
        # Skip terms the document does not contain (weight 0).
        keywords_per_file[idx] = [terms[i] for i in ranked if weights[i] > 0]
    return keywords_per_file
def main(directory='data'):
    """Print the extracted keywords of every ``.txt`` file in *directory*.

    Args:
        directory: Directory holding the text files. Defaults to ``'data'``;
            replace it with the path of your own document directory.
    """
    filenames, file_contents = read_files_from_directory(directory)
    if not file_contents:
        # Nothing to vectorize; report it instead of failing downstream.
        print(f"No .txt files found in {directory}")
        return
    keywords_per_file = generate_keywords(file_contents)
    # keywords_per_file is keyed by position, which lines up with filenames.
    for idx, keywords in keywords_per_file.items():
        print(f"File: {filenames[idx]}")
        print(f"Keywords: {', '.join(keywords)}\n")


if __name__ == "__main__":
    main()
如果文本是英文文本,则利用 TfidfVectorizer 对文本中的关键词进行提取,并保存到相应的文件中。
该脚本通过使用 TF-IDF 向量化技术,从给定目录中的文本文件中提取关键词,并将文件名和关键词打印出来。可以根据需要进行修改和扩展,例如更改停用词、调整关键词数量等。
import os
from sklearn.feature_extraction.text import TfidfVectorizer
def read_files_from_directory(directory):
file_contents = []
filenames = []
for filename in os.listdir(directory):
if filename.endswith(".txt"):
filepath = os.path.join(directory, filename)
with open(filepath, 'r', encoding='utf