import pickle

import numpy as np

# Number of indices drawn per round; the final round takes whatever is left.
CHUNK_SIZE = 15

# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open(r'C:\Users\sdnugeo\Desktop\target2.pkl', 'rb') as f:
    bi = pickle.load(f)

data = np.arange(0, 156)
print(data)
data = data.tolist()

# Draw all 156 indices without replacement in 11 rounds: ten rounds of 15
# followed by one round holding the remaining 6.
samples = []
c = []
for i in range(11):
    take = min(CHUNK_SIZE, len(data))
    sample = np.random.choice(data, take, replace=False)
    # Items of `bi` selected by this round's indices.
    b = [bi[j] for j in sample]
    print(type(b))
    c.append(b)
    print(sample)
    print(b)
    samples.append(sample)
    # Remove the drawn indices so later rounds cannot pick them again.
    for s in sample:
        data.remove(s)

# Reported failure: np.array(samples).reshape(-1) raised "setting an array
# element with a sequence. The requested array has an inhomogeneous shape
# after 1 dimensions. The detected shape was (11,) + inhomogeneous part."
# The rounds have different lengths (15, ..., 15, 6), so they cannot form a
# rectangular array; joining the 1-D rounds end to end gives the flat vector.
samples = np.concatenate(samples)
print(samples)
print(data)

def sample_data(filepath, idx, path, verbose=False):
    """Draw the indices ``0 .. idx-1`` in random order and gather the matching items.

    The indices are drawn without replacement in at most ``path`` rounds of
    ``ceil(idx / path)`` indices each (the last round may be shorter), and the
    items of the pickled object are looked up for every drawn index.

    Args:
        filepath: Path of a pickle file whose object supports ``obj[j]`` for
            every integer ``j`` in ``range(idx)``.
        idx: Number of indices to draw from (``0 .. idx-1``).
        path: Number of rounds the indices are split into.
        verbose: When true, print the items selected in each round.

    Returns:
        ``(samples, bs)``: the drawn indices as one 1-D integer array and the
        corresponding items stacked along axis 0, in the same order.

    Raises:
        ZeroDivisionError: If ``path`` is 0.
    """
    import math  # local import: the surrounding snippet never imports math

    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(filepath, 'rb') as f:
        bi = pickle.load(f)

    remaining = np.arange(0, idx)
    r = math.ceil(len(remaining) / path)

    samples = []
    bs = []
    for _ in range(path):
        # Rounding up can exhaust the pool before the last round.  An empty
        # draw would be a float array (turning the indices into floats) and a
        # 1-D item array (breaking concatenation for multi-dimensional items).
        if remaining.size == 0:
            break
        take = min(r, remaining.size)
        sample = np.random.choice(remaining, take, replace=False)
        b = [bi[j] for j in sample]
        if verbose:
            print(b)
        samples.append(sample)
        bs.append(np.array(b))
        # Drop the drawn indices while keeping the order of the rest.
        remaining = remaining[~np.isin(remaining, sample)]

    if not samples:
        return np.array([], dtype=remaining.dtype), np.array([])
    return np.concatenate(samples, axis=0), np.concatenate(bs, axis=0)


# Request that accompanied this snippet: print the b values of every round
# (see ``verbose``) and feed the result to a DataLoader for deep learning.

import numpy as np def sample_data(filepath, idx, path): f = open(filepath, 'rb') bi = pickle.load(f) data = np.arange(0, idx) data = data.tolist() samples = [] bs = [] a = path r = math....

import os
from random import shuffle
import pickle
import librosa
import tensorflow.keras as keras
import librosa.display
import numpy as np
from matplotlib import pyplot as plt


def load_files(audio_dir):
    """Split the ``.wav`` files of a directory into training and validation lists.

    Args:
        audio_dir: Directory to scan (non-recursively) for ``.wav`` files.

    Returns:
        ``(train_files, valid_files)``: file names in random order, the first
        70% (rounded down) for training and the rest for validation.
    """
    wav_files = [name for name in os.listdir(audio_dir) if name.endswith('.wav')]
    if not wav_files:
        print('未找到数据集')
    shuffle(wav_files)
    ntrain = int(len(wav_files) * 0.7)
    return wav_files[:ntrain], wav_files[ntrain:]


audio_dir = '../data/recordings/'
dataset_pickle = '../tmp/recordings.pkl'
train_files, valid_files = load_files(audio_dir)
# The original passed the validation *list* back into load_files(), which
# fails inside os.listdir(); the message needs the number of validation files.
print('训练集样本数为{}\n验证集样本数为{}'.format(len(train_files), len(valid_files)))
# Request that accompanied this snippet: check the code for errors.

import numpy as np from matplotlib import pyplot as plt def load_files(audio_dir): files = os.listdir(audio_dir) wav_files = [] for wav in files: if not wav.endswith('.wav'): continue wav_files...

import pickle, numpy as np IndentationError: unexpected indent

import pickle  # the example calls pickle.dump but never imported pickle

import numpy as np

# Correctly indented example: the body of the `with` block is indented.
data = [1, 2, 3, 4, 5]
with open('data.pkl', 'wb') as f:
    pickle.dump(data, f)

# Incorrectly indented example (left commented out so the file still runs):
# with open('data.pkl', 'wb') as f:
# pickle.dump(data, f)

def test(self):
    # Evaluate the model on self.testloader and write two pickle files: the
    # user/item embeddings and a per-user (hr, ndcg) result table.
    # NOTE(review): the original indentation was lost; everything after the
    # forward pass is assumed to sit inside the no_grad block - confirm.
    # NOTE(review): `load_model` and `args` are not defined in this method;
    # the rest of the method reads `self.args`, so confirm that the global
    # `args.checkpoint` is intended here.
    load_model(self.model, args.checkpoint)
    self.model.eval()
    with torch.no_grad():
        rep, user_pool = self.model(self.graph)

        """ Save embeddings """
        # The first n_user rows of rep (plus user_pool) are used as user
        # embeddings; the following n_item rows as item embeddings.
        user_emb = (rep[:self.model.n_user] + user_pool).cpu().numpy()
        item_emb = rep[self.model.n_user: self.model.n_user + self.model.n_item].cpu().numpy()
        with open(f'HGMN-{self.args.dataset}-embeds.pkl', 'wb') as f:
            pickle.dump({'user_embed': user_emb, 'item_embed': item_emb}, f)

        """ Save results """
        tqdm_dataloader = tqdm(self.testloader)
        uids, hrs, ndcgs = [], [], []
        for iteration, batch in enumerate(tqdm_dataloader, start=1):
            user_idx, item_idx = batch
            user = rep[user_idx] + user_pool[user_idx]
            # Item rows are stored after the user rows, hence the offset.
            item = rep[self.model.n_user + item_idx]
            preds = self.model.predict(user, item)
            preds_hrs, preds_ndcgs = self.calc_hr_and_ndcg(preds, self.args.topk)
            hrs += preds_hrs
            ndcgs += preds_ndcgs
            # Keeps every 101st user id - presumably one entry per group of
            # 101 candidate items for the same user; verify against the loader.
            uids += user_idx[::101].tolist()

        # Maps each kept user id to its (hr, ndcg) pair.
        with open(f'HGMN-{self.args.dataset}-test.pkl', 'wb') as f:
            pickle.dump({uid: (hr, ndcg) for uid, hr, ndcg in zip(uids, hrs, ndcgs)}, f)

2. 保存嵌入向量：将用户嵌入向量和物品嵌入向量转换为 NumPy 数组，并使用 pickle 序列化保存到文件中。 3. 保存评估结果：通过遍历测试数据集中的批次，计算并保存每个用户的命中率和 NDCG 值。同时，也保存了每...

# Inspect a pickle file, locate image-like NumPy data inside it and display it.
# The submitted version did not run (reported on macOS): a literal file path
# had been pasted in place of the parameter of load_pkl_data() and of the
# open() argument, and the double underscores of `__name__`/`__main__` were
# missing.
import pickle
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import sys


def load_pkl_data(pkl_path):
    """Load and return the object stored in the pickle file at ``pkl_path``.

    A ``UnicodeDecodeError`` triggers one retry with ``encoding='latin1'``.
    Any other failure prints a message and exits the process with status 1.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    try:
        with open(pkl_path, 'rb') as f:
            # Try different encodings to work around compatibility problems.
            try:
                return pickle.load(f)
            except UnicodeDecodeError:
                # The failed attempt consumed part of the stream; rewind first.
                f.seek(0)
                return pickle.load(f, encoding='latin1')
    except Exception as e:
        print(f"加载PKL文件失败: {str(e)}")
        sys.exit(1)


def diagnose_data_structure(data):
    """Return a report dict describing the loaded object.

    Keys: ``data_type`` (type name), ``keys`` (top-level dict keys),
    ``nested_levels`` (container nesting depth) and ``contains_image`` (whether
    a top-level dict value is a 2-D or 3-D array).  Only dicts are inspected;
    for any other type the defaults are returned.
    """
    report = {
        'data_type': type(data).__name__,
        'keys': [],
        'nested_levels': 0,
        'contains_image': False
    }

    if isinstance(data, dict):
        report['keys'] = list(data.keys())

        # Measure the nesting depth of dicts, lists and tuples.
        def count_nesting(obj, level=1):
            if isinstance(obj, dict):
                return max([count_nesting(v, level+1) for v in obj.values()] + [level])
            elif isinstance(obj, (list, tuple)):
                return max([count_nesting(item, level+1) for item in obj] + [level]) if obj else level
            return level

        report['nested_levels'] = count_nesting(data)

        # Check whether any top-level value looks like image data.
        for value in data.values():
            if isinstance(value, np.ndarray) and value.ndim in (2, 3):
                report['contains_image'] = True
                break

    return report


def find_image_data(data, depth=0, max_depth=5):
    """Recursively search ``data`` for the first 2-D or 3-D NumPy array.

    Dicts are searched under common image key names first and then through all
    values; lists and tuples are searched item by item.  Returns ``None`` when
    nothing is found or ``max_depth`` is exceeded.
    """
    # Guard against unbounded recursion.
    if depth > max_depth:
        return None

    # A NumPy array with image-like dimensions.
    if isinstance(data, np.ndarray):
        if data.ndim == 2:  # grayscale image (height, width)
            return data
        elif data.ndim == 3:  # colour image (height, width, channels)
            return data

    if isinstance(data, dict):
        # Common key names for image payloads, checked first.
        image_keys = ['image', 'img', 'data', 'pixels', 'array', 'frame', 'picture']
        for key in image_keys:
            if key in data:
                result = find_image_data(data[key], depth+1, max_depth)
                if result is not None:
                    return result

        # Fall back to a depth-first search over every value.
        for value in data.values():
            result = find_image_data(value, depth+1, max_depth)
            if result is not None:
                return result

    elif isinstance(data, (list, tuple)):
        for item in data:
            result = find_image_data(item, depth+1, max_depth)
            if result is not None:
                return result

    return None


def analyze_image(image_data):
    """Return a dict with dimensions, shape, dtype, resolution and channel count.

    Non-array input is converted with ``np.array``; if that fails the returned
    dict holds a single ``error`` entry.  ``resolution`` and ``channels`` stay
    ``None`` unless the array is 2-D or 3-D.
    """
    if not isinstance(image_data, np.ndarray):
        try:
            image_data = np.array(image_data)
        except Exception as e:
            return {"error": f"无法转换为图像数组: {str(e)}"}

    info = {
        'dimensions': image_data.ndim,
        'shape': image_data.shape,
        'dtype': str(image_data.dtype),
        'resolution': None,
        'channels': None
    }

    if image_data.ndim == 2:  # grayscale image (height, width)
        height, width = image_data.shape
        info['resolution'] = f"{width}×{height}"
        info['channels'] = 1
    elif image_data.ndim == 3:  # colour image (height, width, channels)
        height, width, channels = image_data.shape
        info['resolution'] = f"{width}×{height}"
        info['channels'] = channels

    return info


def display_images(image_data, num_images=6, title="提取的图像数据"):
    """Show a single image, or the first ``num_images`` of a 4-D image stack."""
    if not isinstance(image_data, np.ndarray):
        print("无法显示: 图像数据不是NumPy数组")
        return

    # Single image: 2-D, or 3-D whose last axis has 1, 3 or 4 entries.
    if image_data.ndim == 2 or (image_data.ndim == 3 and image_data.shape[2] in [1, 3, 4]):
        plt.figure(figsize=(6, 6))
        plt.imshow(image_data, cmap='gray' if image_data.ndim == 2 or image_data.shape[2] == 1 else None)
        plt.title(title)
        plt.axis('off')
        plt.show()
        return

    # Several images: (count, height, width, channels).
    if image_data.ndim == 4:
        num_images = min(num_images, image_data.shape[0])
        fig, axes = plt.subplots(1, num_images, figsize=(12, 3))

        for i in range(num_images):
            # plt.subplots returns a bare Axes (not an array) for one column.
            ax = axes[i] if num_images > 1 else axes
            img = image_data[i]

            if img.shape[2] == 1:  # single-channel grayscale
                ax.imshow(img[:, :, 0], cmap='gray')
            else:  # multi-channel colour
                ax.imshow(img)

            ax.axis('off')

        plt.suptitle(f"{title} (前{num_images}张)")
        plt.tight_layout()
        plt.show()


def main(pkl_file):
    """Load ``pkl_file``, print a structure report, then analyse and show the image."""
    print(f"加载文件: {pkl_file}")
    data = load_pkl_data(pkl_file)

    # Structure diagnosis.
    print("\n=== 数据结构诊断 ===")
    report = diagnose_data_structure(data)
    print(f"数据类型: {report['data_type']}")
    print(f"包含的键: {report['keys'][:10]}{'...' if len(report['keys']) > 10 else ''}")
    print(f"嵌套层级: {report['nested_levels']}")
    print(f"包含图像数据: {'是' if report['contains_image'] else '否'}")

    # Image search.
    print("\n=== 搜索图像数据 ===")
    image_data = find_image_data(data)

    if image_data is None:
        print("错误: 未找到图像数据")
        return

    # Image analysis.
    print("\n=== 图像分析 ===")
    image_info = analyze_image(image_data)
    print(f"图像维度: {image_info['dimensions']}")
    print(f"图像形状: {image_info['shape']}")

    if image_info['resolution']:
        print(f"分辨率: {image_info['resolution']}")
        print(f"通道数: {image_info['channels']}")
    else:
        print("警告: 无法确定分辨率")

    # Display.
    print("\n=== 显示图像 ===")
    display_images(image_data)


if __name__ == "__main__":
    # Replace with the path of your PKL file, or pass it as the first
    # command-line argument.
    PKL_FILE = sys.argv[1] if len(sys.argv) > 1 else "image_data.pkl"
    main(PKL_FILE)

with open(file_path, 'rb') as f: # 二进制模式读取 data = pickle.load(f) - 使用上下文管理器确保文件正确关闭 - rb 模式适用于二进制文件[^4] #### 在Mac上的特殊注意事项 1. **文件权限问题**： ...

# `name == "main"` referenced an undefined variable; the module-name guard
# needs the double underscores on both sides.
if __name__ == "__main__":
    # Declared but not used anywhere in this snippet.
    BMES = []
    print("正在读取本地模型矩阵...")
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    # Initial matrix: the pickled mapping's values, in order, as an array.
    with open(r'01/init_mat.pkl', "rb") as f0:
        init_mat = np.array(list(pickle.load(f0).values()))
    # Transition data: one entry per value of the pickled mapping; each entry
    # is itself a mapping (its .values() are read below).
    with open(r'01/trans_mat.pkl', "rb") as f1:
        init_trans_mat = np.array(list(pickle.load(f1).values()))
    # Emission data, kept as loaded.  It is indexed with .loc below, so it is
    # presumably a pandas DataFrame with rows labelled B/M/E/S - confirm.
    with open(r'01/emit_mat.pkl', "rb") as f2:
        init_emit_mat = pickle.load(f2)
    # Iterating the emission object yields the catalogue printed at the end
    # (for a DataFrame these would be the column labels).
    catalog = list(init_emit_mat)
    trans_mat = []
    emit_mat = []
    # Labels used to pick the emission rows, in this fixed order.
    hidden_state = ["B", "M", "E", "S"]
    # Turn every transition entry into a plain numeric row.
    for item in init_trans_mat:
        trans_mat.append(np.array(list(item.values())))
    # Collect one emission row per label.
    for i in hidden_state:
        emit_mat.append(np.array(list(init_emit_mat.loc[i])))
    # Stack the four rows into a (4, n) array.
    emit_mat = np.array(emit_mat).reshape(4, -1)
    print("读取模型矩阵成功！")
    print("目前模型的汉字库", catalog)

with open(r'01/init_mat.pkl', "rb") as f0: init_mat = np.array(list(pickle.load(f0).values())) # 读取模型中的状态转移矩阵 init_trans_mat with open(r'01/trans_mat.pkl', "rb") as f1: init_trans_mat...

import os
import re
import time
import torch
import torch.nn as nn
import torch.optim as optim  # build_model() uses optim.Adam / optim.lr_scheduler
import pandas as pd
import numpy as np
from datetime import datetime
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
import pickle
import mysql.connector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

# spaCy replaces the slower nltk text processing.
import spacy
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# gensim Word2Vec and related models.
from gensim.models import Word2Vec, TfidfModel
from gensim.corpora import Dictionary

# Custom preprocessing (fast version).
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS


def clean_text(text):
    """Strip everything except ASCII letters, digits and whitespace; lowercase."""
    return re.sub(r'[^a-zA-Z0-9\s]', '', str(text)).strip().lower()


def tokenize(text):
    """Return the alphanumeric, non-stopword spaCy tokens of the cleaned text."""
    doc = nlp(clean_text(text))
    return [token.text for token in doc if token.text not in STOPWORDS and token.text.isalnum()]


def preprocess(text):
    """Return the tokens of ``text`` joined by single spaces."""
    tokens = tokenize(text)
    return " ".join(tokens)


def _split_on_whitespace(text):
    """Tokenizer for TfidfVectorizer: split pre-processed text on whitespace.

    A module-level function is used instead of a lambda because the fitted
    vectorizer is saved with joblib, and lambdas cannot be pickled.
    """
    return text.split()


class SemanticMatchModel(nn.Module):
    """Feed-forward binary classifier: 256 -> 128 -> 64 -> 1 with a sigmoid output."""

    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.fc4 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        x = self.sigmoid(self.fc4(x))
        return x


class QADataset(Dataset):
    """Positive (question, answer) pairs plus random negatives.

    Positive samples carry label 1; negative samples pair a question with the
    answer of a different, randomly chosen pair and carry label 0.
    """

    def __init__(self, qa_pairs, tfidf_vectorizer, negative_ratio=1.0):
        """
        :param qa_pairs: [(question_text, answer_text), ...]
        :param tfidf_vectorizer: an already fitted TfidfVectorizer
        :param negative_ratio: negatives per positive (truncated to an int)
        """
        self.qa_pairs = qa_pairs
        self.vectorizer = tfidf_vectorizer
        self.samples = []

        # Positive samples.
        for i, (q, a) in enumerate(self.qa_pairs):
            self.samples.append((q, a, 1))  # label=1

        # Negative samples: keep the question, swap in a random other answer.
        total_pairs = len(self.qa_pairs)
        # With a single pair there is no "other" answer; the resampling loop
        # below would never terminate, so negatives are skipped in that case.
        if negative_ratio > 0 and total_pairs > 1:
            negative_samples = []
            for i, (q, a) in enumerate(self.qa_pairs):
                for _ in range(int(negative_ratio)):
                    rand_idx = np.random.randint(total_pairs)
                    # Redraw when the same pair was picked.
                    while rand_idx == i:
                        rand_idx = np.random.randint(total_pairs)
                    neg_q, neg_a = self.qa_pairs[rand_idx]
                    negative_samples.append((q, neg_a, 0))
            self.samples.extend(negative_samples)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        q, a, label = self.samples[idx]
        q_vec = self.vectorizer.transform([preprocess(q)]).toarray()[0]
        a_vec = self.vectorizer.transform([preprocess(a)]).toarray()[0]
        pair_vec = np.concatenate((q_vec, a_vec))
        return torch.tensor(pair_vec, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)


class KnowledgeBase:
    """QA knowledge base loaded from MySQL with TF-IDF, Word2Vec and a neural re-ranker."""

    # NOTE(security): a real database password is hard-coded as a default
    # here and in main(); prefer reading it from the environment.
    def __init__(self,
                 host='localhost',
                 user='root',
                 password='hy188747',
                 database='ubuntu_qa',
                 table='qa_pair',
                 model_dir=r"D:\NLP-PT\PT4\model",
                 negative_ratio=1.0):
        print("🔄 初始化知识库...")
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.table = table
        self.model_dir = model_dir
        self.negative_ratio = negative_ratio

        # Make sure the model directory exists.
        os.makedirs(self.model_dir, exist_ok=True)

        self.qa_pairs = []
        self.q_texts = []
        self.a_texts = []
        self.semantic_model = None
        self.word2vec_model = None
        self.tfidf_vectorizer = None
        self.tfidf_matrix = None

        # Step 1: load the data from the database.
        self.load_data_from_mysql()
        # Step 2: load or cache the pre-processed question texts.
        self.load_or_cache_processed_questions()
        # Step 3: load or build the TF-IDF vectorizer and matrix.
        self.load_cached_tfidf()
        # Step 4: load or train Word2Vec.
        self.load_cached_word2vec_model()
        # Step 5: load the PyTorch model when a checkpoint exists.
        model_path = os.path.join(self.model_dir, 'semantic_match_model.pth')
        if os.path.exists(model_path):
            self.load_model()

    def load_data_from_mysql(self):
        """Read all (question_text, answer_text) rows of ``self.table``."""
        print("🔄 正在连接 MySQL，加载问答数据...")
        conn = mysql.connector.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database
        )
        try:
            cursor = conn.cursor()
            # The table name comes from the constructor; identifiers cannot be
            # bound as query parameters, so it must never be user-supplied.
            query = f"SELECT question_text, answer_text FROM {self.table}"
            cursor.execute(query)
            rows = cursor.fetchall()
        finally:
            # Close the connection even when the query fails.
            conn.close()
        self.qa_pairs = [(row[0], row[1]) for row in rows]
        self.q_texts = [pair[0] for pair in self.qa_pairs]
        self.a_texts = [pair[1] for pair in self.qa_pairs]
        print(f"✅ 成功从 MySQL 加载 {len(self.qa_pairs)} 条问答数据。")

    def load_or_cache_processed_questions(self):
        """Use a local cache to avoid re-processing all questions on every start."""
        cache_path = os.path.join(self.model_dir, 'processed_questions.pkl')
        if os.path.exists(cache_path):
            print("🔄 使用缓存预处理后的分词文本。")
            with open(cache_path, 'rb') as f:
                self.processed_q_list = pickle.load(f)
        else:
            print("🔄 正在预处理问题文本（首次较慢）...")
            self.processed_q_list = [preprocess(q) for q in self.q_texts]
            with open(cache_path, 'wb') as f:
                pickle.dump(self.processed_q_list, f)
            print("✅ 预处理缓存已保存。")

    def load_cached_tfidf(self):
        """Load the cached TfidfVectorizer and matrix, or build and cache them."""
        import joblib

        cache_tfidf_matrix = os.path.join(self.model_dir, 'tfidf_matrix.npz')
        cache_qa_list = os.path.join(self.model_dir, 'tfidf_qa.pkl')
        tfidf_path = os.path.join(self.model_dir, 'tfidf_vectorizer.pkl')

        if os.path.exists(tfidf_path) and os.path.exists(cache_tfidf_matrix) and os.path.exists(cache_qa_list):
            print("🔄 加载 TF-IDF 缓存版本。")
            self.tfidf_vectorizer = joblib.load(tfidf_path)
            self.tfidf_matrix = np.load(cache_tfidf_matrix)['tfidf']
            with open(cache_qa_list, 'rb') as f:
                self.tfidf_qa = pickle.load(f)
        else:
            print("🔄 创建并构建 TF-IDF（首次较慢）...")
            self.tfidf_vectorizer = TfidfVectorizer(
                tokenizer=_split_on_whitespace,
                lowercase=False,
                max_features=10000
            )
            self.tfidf_qa = self.processed_q_list
            self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.tfidf_qa).toarray()
            print("✅ TF-IDF 构建完成。")
            joblib.dump(self.tfidf_vectorizer, tfidf_path)
            np.savez_compressed(cache_tfidf_matrix, tfidf=self.tfidf_matrix)
            with open(cache_qa_list, 'wb') as f:
                pickle.dump(self.tfidf_qa, f)

    def load_cached_word2vec_model(self):
        """Load the saved Word2Vec model, or train and save one."""
        word2vec_path = os.path.join(self.model_dir, 'word2vec.model')
        if os.path.exists(word2vec_path):
            print("🔄 加载缓存中的 Word2Vec 模型...")
            self.word2vec_model = Word2Vec.load(word2vec_path)
        else:
            print("🔄 训练 Word2Vec 模型（首次较慢）...")
            tokenized_questions = [preprocess(q).split() for q in self.q_texts]
            self.word2vec_model = Word2Vec(
                sentences=tokenized_questions,
                vector_size=100,
                window=5,
                min_count=1,
                workers=4
            )
            self.word2vec_model.save(word2vec_path)
            print("✅ Word2Vec 模型训练完成并保存。")

    def sentence_to_vec(self, sentence):
        """Return the mean Word2Vec vector of the sentence's known tokens.

        Falls back to a zero vector when no token is known, and to the TF-IDF
        vector when no Word2Vec model is available.
        """
        tokens = preprocess(sentence).split()
        if self.word2vec_model is not None:
            vecs = [self.word2vec_model.wv[w] for w in tokens if w in self.word2vec_model.wv]
            return np.mean(vecs, axis=0) if vecs else np.zeros(self.word2vec_model.vector_size)
        else:
            return self.tfidf_vectorizer.transform([preprocess(sentence)]).toarray()[0]

    def build_model(self, epochs=10, batch_size=128, lr=1e-3):
        """Train the semantic matching model with an 80/20 train/validation split.

        The weights with the best validation accuracy are saved to
        ``semantic_match_model.pth`` and reloaded at the end.
        """
        full_dataset = QADataset(self.qa_pairs, self.tfidf_vectorizer, negative_ratio=self.negative_ratio)

        train_size = int(len(full_dataset) * 0.8)
        val_size = len(full_dataset) - train_size
        train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

        # The input width is taken from one sample (question + answer vector).
        sample_input, _ = full_dataset[0]
        input_dim = sample_input.shape[0]
        self.semantic_model = SemanticMatchModel(input_dim)

        criterion = nn.BCELoss()
        optimizer = optim.Adam(self.semantic_model.parameters(), lr=lr)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

        best_val_acc = 0.0
        print("\n开始模型训练...")
        start_time = time.time()

        for epoch in range(epochs):
            self.semantic_model.train()
            total_loss, total_correct, total_samples = 0.0, 0, 0

            for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs} - 训练中"):
                optimizer.zero_grad()
                # squeeze(-1) keeps the batch axis even for a one-sample batch,
                # so the output shape always matches y_batch.
                outputs = self.semantic_model(X_batch).squeeze(-1)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()

                total_loss += loss.item() * len(y_batch)
                preds = (outputs >= 0.5).float()
                total_correct += (preds == y_batch).sum().item()
                total_samples += len(y_batch)

            train_loss = total_loss / total_samples
            train_acc = total_correct / total_samples

            # Validation phase.
            self.semantic_model.eval()
            val_loss, val_correct, val_samples = 0.0, 0, 0
            with torch.no_grad():
                for X_val, y_val in val_loader:
                    outputs_val = self.semantic_model(X_val).squeeze(-1)
                    loss_val = criterion(outputs_val, y_val)
                    val_loss += loss_val.item() * len(y_val)
                    preds_val = (outputs_val >= 0.5).float()
                    val_correct += (preds_val == y_val).sum().item()
                    val_samples += len(y_val)

            val_loss /= val_samples
            val_acc = val_correct / val_samples

            # Update the learning rate.
            scheduler.step()

            print(f"Epoch [{epoch + 1}/{epochs}] | "
                  f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
                  f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

            # Keep the best model.
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                model_path = os.path.join(self.model_dir, 'semantic_match_model.pth')
                torch.save(self.semantic_model.state_dict(), model_path)
                print(f"✅ 新的最优模型已保存 (Val Acc: {best_val_acc:.4f})")

        end_time = time.time()
        print(f"\n训练完成，共耗时 {end_time - start_time:.2f} 秒。")

        # Reload the best weights.
        model_path = os.path.join(self.model_dir, 'semantic_match_model.pth')
        self.semantic_model.load_state_dict(torch.load(model_path))
        self.semantic_model.eval()

    def load_model(self):
        """Load the trained semantic matching model from ``model_dir``."""
        # The model input is a question vector concatenated with an answer
        # vector, hence twice the TF-IDF width.
        input_dim = self.tfidf_matrix.shape[1] * 2
        model_path = os.path.join(self.model_dir, 'semantic_match_model.pth')
        self.semantic_model = SemanticMatchModel(input_dim)
        self.semantic_model.load_state_dict(torch.load(model_path, map_location='cpu'))
        self.semantic_model.eval()
        print("✅ 语义匹配模型加载完成。")

    def retrieve(self, query, semantic_topk=100):
        """Return ``(best_qa_pair, score)`` for ``query``.

        A coarse pass ranks all questions by TF-IDF plus sentence-vector cosine
        similarity; the top ``semantic_topk`` candidates are then re-scored by
        the semantic model when one is loaded.
        """
        # Coarse retrieval.
        query_tfidf = self.tfidf_vectorizer.transform([preprocess(query)]).toarray()[0]
        tfidf_scores = cosine_similarity([query_tfidf], self.tfidf_matrix).flatten()

        query_sent_vec = self.sentence_to_vec(query)
        # NOTE: this re-embeds every stored question on each call.
        sent_vecs = np.array([self.sentence_to_vec(q) for q in self.q_texts])
        sent_scores = cosine_similarity([query_sent_vec], sent_vecs).flatten()

        sim_scores = tfidf_scores + sent_scores
        # argpartition raises when asked for more entries than exist.
        k = min(semantic_topk, len(sim_scores))
        topk_indices = np.argpartition(sim_scores, -k)[-k:]
        topk_indices = topk_indices[np.argsort(sim_scores[topk_indices])[::-1]]

        # Fine re-ranking.
        if self.semantic_model is not None:
            with torch.no_grad():
                batch_inputs = []
                for i in topk_indices:
                    q = preprocess(self.q_texts[i])
                    a = preprocess(self.a_texts[i])
                    q_vec = self.tfidf_vectorizer.transform([q]).toarray()[0]
                    a_vec = self.tfidf_vectorizer.transform([a]).toarray()[0]
                    pair_input = np.concatenate((q_vec, a_vec))
                    batch_inputs.append(pair_input)

                batch_inputs = torch.tensor(np.stack(batch_inputs), dtype=torch.float32)
                batch_scores = self.semantic_model(batch_inputs).squeeze(-1).cpu().numpy()
                semantic_scores = batch_scores

            # Combined score.
            final_scores = sim_scores[topk_indices] + semantic_scores
            best_idx = topk_indices[np.argmax(final_scores)]
            return self.qa_pairs[best_idx], final_scores.max()
        else:
            # Without a semantic model only the coarse ranking is used.
            best_idx = topk_indices[0]
            return self.qa_pairs[best_idx], sim_scores[best_idx]

    def recommend_similar(self, query, topk=3):
        """Return up to ``topk`` (question, answer) pairs most similar to ``query``."""
        query_tfidf = self.tfidf_vectorizer.transform([preprocess(query)]).toarray()[0]
        scores = cosine_similarity([query_tfidf], self.tfidf_matrix).flatten()
        # `scores` is 1-D; the original `argsort()[0][-topk:]` sliced a scalar
        # and raised.  Take the last topk of the ascending order, best first.
        topk_idx = scores.argsort()[-topk:][::-1]
        return [(self.qa_pairs[i][0], self.qa_pairs[i][1]) for i in topk_idx]


class FeedbackRecorder:
    """Append unanswered questions to a CSV file."""

    def __init__(self, file_path='unanswered_questions.csv'):
        self.file_path = file_path
        if not os.path.exists(self.file_path):
            with open(self.file_path, 'w', newline='', encoding='utf-8') as f:
                import csv
                csv.writer(f).writerow(['time', 'question'])

    def record_question(self, question):
        with open(self.file_path, 'a', newline='', encoding='utf-8') as f:
            import csv
            writer = csv.writer(f)
            writer.writerow([datetime.now().isoformat(), question])


def main():
    """Build the knowledge base, optionally retrain, then run the chat loop."""
    kb = KnowledgeBase(
        host='localhost',
        user='root',
        password='hy188747',
        database='ubuntu_qa',
        table='qa_pair',
        model_dir=r"D:\NLP-PT\PT4\model",
        negative_ratio=1.0
    )

    # Optionally retrain the semantic matching model.
    if input("是否重新训练语义匹配模型？(y/n): ").strip().lower() == 'y':
        kb.build_model(
            epochs=5,  # number of epochs
            batch_size=128,  # batch size
            lr=1e-3  # learning rate
        )

    recorder = FeedbackRecorder()

    print("\n🎯 智能知识问答系统已启动（输入'q'退出聊天）\n")
    while True:
        query = input("🧐 问题：")
        if query.strip().lower() == 'q':
            break

        result, score = kb.retrieve(query)
        if result:
            print("💡 回答：", result[1])
            print(f"📊 匹配信心分数: {score:.4f}\n")
        else:
            print("⚠ 没有找到合适的答案，已将你的问题记录下来。")
            recorder.record_question(query)
            print("🔥 相似问题推荐：")
            for q, a in kb.recommend_similar(query):
                print(f"Q: {q}\nA: {a}\n")


if __name__ == "__main__":
    main()

import numpy as np from datetime import datetime from torch.utils.data import Dataset, DataLoader, random_split from tqdm import tqdm import pickle import mysql.connector from sklearn.feature_...

import os
import re
import time
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import joblib  # added: was missing
from datetime import datetime
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
import pickle
import mysql.connector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS


def clean_text(text):
    """Strip everything except ASCII letters, digits and whitespace; lowercase."""
    return re.sub(r'[^a-zA-Z0-9\s]', '', str(text)).strip().lower()


def tokenize(text):
    """Return the non-stopword spaCy tokens of the cleaned text."""
    doc = nlp(clean_text(text))
    return [token.text for token in doc if token.text not in STOPWORDS]


def preprocess(text):
    """Return the tokens of ``text`` joined by single spaces."""
    tokens = tokenize(text)
    return " ".join(tokens)


def _split_on_whitespace(text):
    """Tokenizer for TfidfVectorizer: split pre-processed text on whitespace.

    A module-level function is used instead of a lambda because the fitted
    vectorizer is saved with joblib, and lambdas cannot be pickled.
    """
    return text.split()


class SemanticMatchModel(nn.Module):
    """Feed-forward binary classifier: 256 -> 128 -> 64 -> 1 with a sigmoid output."""

    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.fc4 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        x = self.sigmoid(self.fc4(x))
        return x


class QADataset(Dataset):
    """Positive (question, answer) pairs (label 1) plus random negatives (label 0)."""

    def __init__(self, qa_pairs, tfidf_vectorizer, negative_ratio=1.0):
        self.qa_pairs = qa_pairs
        self.vectorizer = tfidf_vectorizer
        self.samples = []

        # Positive samples.
        for i, (q, a) in enumerate(self.qa_pairs):
            self.samples.append((q, a, 1))

        # Negative samples.
        if negative_ratio > 0:
            total_pairs = len(self.qa_pairs)
            all_answers = [a for _, a in self.qa_pairs]

            # Draw all candidate answer indices up front.
            neg_indices = np.random.choice(
                len(all_answers),
                size=int(total_pairs * negative_ratio),
                replace=True
            )

            for idx, (q, a) in enumerate(self.qa_pairs):
                # Each question consumes its own slice of the pre-drawn indices.
                sample_count = int(negative_ratio)
                start = idx * sample_count
                end = start + sample_count

                for j in range(start, end):
                    if j < len(neg_indices):
                        neg_a = all_answers[neg_indices[j]]
                        # Skip draws that hit the question's own answer text.
                        if neg_a != a:
                            self.samples.append((q, neg_a, 0))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        q, a, label = self.samples[idx]
        q_vec = self.vectorizer.transform([preprocess(q)]).toarray()[0]
        a_vec = self.vectorizer.transform([preprocess(a)]).toarray()[0]
        pair_vec = np.concatenate((q_vec, a_vec))
        return torch.tensor(pair_vec, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)


class KnowledgeBase:
    """QA knowledge base loaded from MySQL with TF-IDF, Word2Vec and a neural re-ranker."""

    # NOTE(security): a real database password is hard-coded as a default
    # here and in main(); prefer reading it from the environment.
    def __init__(self,
                 host='localhost',
                 user='root',
                 password='hy188747',
                 database='ubuntu_qa',
                 table='qa_pair',
                 model_dir=r"D:\NLP-PT\PT4\model",
                 negative_ratio=1.0):
        print("🔄 初始化知识库...")
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.table = table
        self.model_dir = model_dir
        self.negative_ratio = negative_ratio

        os.makedirs(self.model_dir, exist_ok=True)

        self.qa_pairs = []
        self.q_texts = []
        self.a_texts = []
        self.semantic_model = None
        self.word2vec_model = None
        self.tfidf_vectorizer = None
        self.tfidf_matrix = None

        # Each step relies on the attributes set by the previous ones.
        self.load_data_from_mysql()
        self.load_or_cache_processed_questions()
        self.load_cached_tfidf()
        self.load_cached_word2vec_model()

        # The model is loaded last: load_model() reads self.tfidf_matrix.
        model_path = os.path.join(self.model_dir, 'semantic_match_model.pth')
        if os.path.exists(model_path):
            self.load_model()
        else:
            print("⚠ 语义匹配模型未训练，请先训练模型。")

    def load_data_from_mysql(self):
        """Read all (question_text, answer_text) rows of ``self.table``.

        On any failure a message is printed and ``qa_pairs`` is left empty.
        """
        print("🔄 正在连接 MySQL，加载问答数据...")
        # Bound before the try so the finally clause can test it even when
        # connect() itself raises (the original hit an unbound name there).
        conn = None
        try:
            conn = mysql.connector.connect(
                host=self.host,
                user=self.user,
                password=self.password,
                database=self.database
            )
            cursor = conn.cursor()
            # The table name comes from the constructor; identifiers cannot be
            # bound as query parameters, so it must never be user-supplied.
            query = f"SELECT question_text, answer_text FROM {self.table}"
            cursor.execute(query)
            rows = cursor.fetchall()

            self.qa_pairs = [(row[0], row[1]) for row in rows]
            self.q_texts = [pair[0] for pair in self.qa_pairs]
            self.a_texts = [pair[1] for pair in self.qa_pairs]
            print(f"✅ 成功从 MySQL 加载 {len(self.qa_pairs)} 条问答数据。")
        except Exception as e:
            print(f"❌ 数据库连接失败: {e}")
            self.qa_pairs = []
        finally:
            if conn is not None and conn.is_connected():
                conn.close()

    def load_or_cache_processed_questions(self):
        """Load the cached pre-processed questions, or compute and cache them."""
        cache_path = os.path.join(self.model_dir, 'processed_questions.pkl')
        if os.path.exists(cache_path):
            print("🔄 使用缓存预处理后的分词文本。")
            with open(cache_path, 'rb') as f:
                self.processed_q_list = pickle.load(f)
        else:
            print("🔄 正在预处理问题文本（首次较慢）...")
            self.processed_q_list = [preprocess(q) for q in tqdm(self.q_texts)]
            with open(cache_path, 'wb') as f:
                pickle.dump(self.processed_q_list, f)
            print("✅ 预处理缓存已保存。")

    def load_cached_tfidf(self):
        """Load the cached TfidfVectorizer and matrix, or build and cache them."""
        cache_tfidf_matrix = os.path.join(self.model_dir, 'tfidf_matrix.npz')
        cache_qa_list = os.path.join(self.model_dir, 'tfidf_qa.pkl')
        tfidf_path = os.path.join(self.model_dir, 'tfidf_vectorizer.pkl')

        if os.path.exists(tfidf_path) and os.path.exists(cache_tfidf_matrix) and os.path.exists(cache_qa_list):
            print("🔄 加载 TF-IDF 缓存版本。")
            self.tfidf_vectorizer = joblib.load(tfidf_path)
            self.tfidf_matrix = np.load(cache_tfidf_matrix)['tfidf']
            with open(cache_qa_list, 'rb') as f:
                self.tfidf_qa = pickle.load(f)
        else:
            print("🔄 创建并构建 TF-IDF（首次较慢）...")
            self.tfidf_vectorizer = TfidfVectorizer(
                tokenizer=_split_on_whitespace,
                lowercase=False,
                max_features=10000
            )
            self.tfidf_qa = self.processed_q_list
            self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.tfidf_qa).toarray()
            print("✅ TF-IDF 构建完成。")
            joblib.dump(self.tfidf_vectorizer, tfidf_path)
            np.savez_compressed(cache_tfidf_matrix, tfidf=self.tfidf_matrix)
            with open(cache_qa_list, 'wb') as f:
                pickle.dump(self.tfidf_qa, f)

    def load_cached_word2vec_model(self):
        """Load the saved Word2Vec model, or train and save one."""
        word2vec_path = os.path.join(self.model_dir, 'word2vec.model')
        if os.path.exists(word2vec_path):
            print("🔄 加载缓存中的 Word2Vec 模型...")
            self.word2vec_model = Word2Vec.load(word2vec_path)
        else:
            print("🔄 训练 Word2Vec 模型（首次较慢）...")
            tokenized_questions = [preprocess(q).split() for q in self.q_texts]
            self.word2vec_model = Word2Vec(
                sentences=tokenized_questions,
                vector_size=100,
                window=5,
                min_count=1,
                workers=4,
                epochs=10
            )
            self.word2vec_model.save(word2vec_path)
            print("✅ Word2Vec 模型训练完成并保存。")

    def sentence_to_vec(self, sentence):
        """Return the mean Word2Vec vector of the sentence's known tokens.

        Falls back to a zero vector of the model's size when no token is
        known, and to the TF-IDF vector when no Word2Vec model is available.
        """
        processed = preprocess(sentence)
        tokens = processed.split()

        if self.word2vec_model is not None:
            # An empty token list yields an empty `vecs`, so the zero vector
            # always has the model's own size (the original hard-coded 100).
            vecs = [self.word2vec_model.wv[w] for w in tokens if w in self.word2vec_model.wv]
            return np.mean(vecs, axis=0) if vecs else np.zeros(self.word2vec_model.vector_size)
        else:
            vec = self.tfidf_vectorizer.transform([processed]).toarray()[0]
            return vec

    def build_model(self, epochs=10, batch_size=128, lr=1e-3):
        """Train the semantic matching model with an 80/20 train/validation split.

        The weights with the best validation accuracy are saved to
        ``semantic_match_model.pth`` and reloaded at the end.
        """
        full_dataset = QADataset(self.qa_pairs, self.tfidf_vectorizer, negative_ratio=self.negative_ratio)

        train_size = int(len(full_dataset) * 0.8)
        val_size = len(full_dataset) - train_size
        train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

        # The input width is taken from one sample (question + answer vector).
        sample_input, _ = full_dataset[0]
        input_dim = sample_input.shape[0]
        self.semantic_model = SemanticMatchModel(input_dim)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.semantic_model.to(device)

        criterion = nn.BCELoss()
        optimizer = optim.Adam(self.semantic_model.parameters(), lr=lr)
        # Halve the learning rate when validation accuracy stops improving.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=2, factor=0.5)

        best_val_acc = 0.0
        print("\n开始模型训练...")
        start_time = time.time()

        for epoch in range(epochs):
            self.semantic_model.train()
            total_loss, total_correct, total_samples = 0.0, 0, 0

            for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs} - 训练中"):
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)

                optimizer.zero_grad()
                # squeeze(-1) keeps the batch axis even for a one-sample batch,
                # so the output shape always matches y_batch.
                outputs = self.semantic_model(X_batch).squeeze(-1)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()

                total_loss += loss.item() * len(y_batch)
                preds = (outputs >= 0.5).float()
                total_correct += (preds == y_batch).sum().item()
                total_samples += len(y_batch)

            train_loss = total_loss / total_samples
            train_acc = total_correct / total_samples

            # Validation phase.
            self.semantic_model.eval()
            val_loss, val_correct, val_samples = 0.0, 0, 0
            with torch.no_grad():
                for X_val, y_val in val_loader:
                    X_val, y_val = X_val.to(device), y_val.to(device)
                    outputs_val = self.semantic_model(X_val).squeeze(-1)
                    loss_val = criterion(outputs_val, y_val)
                    val_loss += loss_val.item() * len(y_val)
                    preds_val = (outputs_val >= 0.5).float()
                    val_correct += (preds_val == y_val).sum().item()
                    val_samples += len(y_val)

            val_loss /= val_samples
            val_acc = val_correct / val_samples

            # Update the learning rate.
            scheduler.step(val_acc)

            print(f"Epoch [{epoch + 1}/{epochs}] | "
                  f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
                  f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

            # Keep the best model.
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                model_path = os.path.join(self.model_dir, 'semantic_match_model.pth')
                torch.save(self.semantic_model.state_dict(), model_path)
                print(f"✅ 新的最优模型已保存 (Val Acc: {best_val_acc:.4f})")

        end_time = time.time()
        print(f"\n训练完成，共耗时 {end_time - start_time:.2f} 秒。")

        # Reload the best weights.
        model_path = os.path.join(self.model_dir, 'semantic_match_model.pth')
        self.semantic_model.load_state_dict(torch.load(model_path, map_location=device))
        self.semantic_model.eval()

    def load_model(self):
        """Load the trained semantic matching model from ``model_dir``."""
        # The model input is a question vector concatenated with an answer
        # vector, hence twice the TF-IDF width.
        input_dim = self.tfidf_matrix.shape[1] * 2
        model_path = os.path.join(self.model_dir, 'semantic_match_model.pth')
        self.semantic_model = SemanticMatchModel(input_dim)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.semantic_model.load_state_dict(torch.load(model_path, map_location=device))
        self.semantic_model.to(device)
        self.semantic_model.eval()
        print("✅ 语义匹配模型加载完成。")

    def retrieve(self, query, semantic_topk=100):
        """Return ``(best_qa_pair, score)`` for ``query``.

        A coarse pass ranks all questions by TF-IDF plus sentence-vector cosine
        similarity; the top ``semantic_topk`` candidates are then re-scored by
        the semantic model when one is loaded.  Returns ``(None, 0.0)`` when
        the knowledge base is empty.
        """
        # Checked first: the similarity calls below fail on an empty matrix,
        # so the original check placed after them could never run.  None (not
        # a message string) is returned so main() takes its "no answer" branch
        # instead of indexing into the string.
        if len(self.qa_pairs) == 0:
            return None, 0.0

        # Coarse retrieval.
        query_tfidf = self.tfidf_vectorizer.transform([preprocess(query)]).toarray()[0]
        tfidf_scores = cosine_similarity([query_tfidf], self.tfidf_matrix).flatten()

        query_sent_vec = self.sentence_to_vec(query)
        # NOTE: this re-embeds every stored question on each call.
        sent_vecs = np.array([self.sentence_to_vec(q) for q in self.q_texts])
        sent_scores = cosine_similarity([query_sent_vec], sent_vecs).flatten()

        # Map the sentence-vector similarity from [-1, 1] to [0, 1].
        sent_scores = (sent_scores + 1) / 2
        sim_scores = tfidf_scores + sent_scores

        # argpartition raises when asked for more entries than exist.
        k = min(semantic_topk, len(sim_scores))
        topk_indices = np.argpartition(sim_scores, -k)[-k:]
        topk_indices = topk_indices[np.argsort(sim_scores[topk_indices])[::-1]]

        # Fine re-ranking.
        if self.semantic_model is not None:
            device = next(self.semantic_model.parameters()).device
            with torch.no_grad():
                batch_inputs = []
                for i in topk_indices:
                    q = preprocess(self.q_texts[i])
                    a = preprocess(self.a_texts[i])
                    q_vec = self.tfidf_vectorizer.transform([q]).toarray()[0]
                    a_vec = self.tfidf_vectorizer.transform([a]).toarray()[0]
                    pair_input = np.concatenate((q_vec, a_vec))
                    batch_inputs.append(pair_input)

                if batch_inputs:
                    batch_inputs = torch.tensor(np.stack(batch_inputs), dtype=torch.float32).to(device)
                    batch_scores = self.semantic_model(batch_inputs).squeeze(-1).cpu().numpy()
                    semantic_scores = batch_scores
                else:
                    semantic_scores = np.zeros(len(topk_indices))

            # Weighted combination; the coarse score is a sum of two [0, 1]
            # terms, so halving it brings it back to [0, 1].
            coarse_scores = sim_scores[topk_indices] / 2.0
            final_scores = 0.3 * coarse_scores + 0.7 * semantic_scores
            best_idx_in_topk = np.argmax(final_scores)
            best_idx = topk_indices[best_idx_in_topk]
            return self.qa_pairs[best_idx], final_scores[best_idx_in_topk]
        else:
            best_idx = topk_indices[0] if topk_indices.size > 0 else 0
            return self.qa_pairs[best_idx], sim_scores[best_idx]

    def recommend_similar(self, query, topk=3):
        """Return up to ``topk`` (question, answer) pairs most similar to ``query``."""
        query_tfidf = self.tfidf_vectorizer.transform([preprocess(query)]).toarray()[0]
        scores = cosine_similarity([query_tfidf], self.tfidf_matrix).flatten()

        # Clamp topk so the partition index stays in range.
        if len(scores) == 0:
            return []
        if len(scores) < topk:
            topk = len(scores)

        topk_idx = np.argpartition(scores, -topk)[-topk:]
        topk_idx = topk_idx[np.argsort(scores[topk_idx])[::-1]]
        return [(self.q_texts[i], self.a_texts[i]) for i in topk_idx]


class FeedbackRecorder:
    """Append unanswered questions to a CSV file."""

    def __init__(self, file_path='unanswered_questions.csv'):
        self.file_path = file_path
        if not os.path.exists(self.file_path):
            with open(self.file_path, 'w', newline='', encoding='utf-8') as f:
                import csv
                csv.writer(f).writerow(['time', 'question'])

    def record_question(self, question):
        with open(self.file_path, 'a', newline='', encoding='utf-8') as f:
            import csv
            writer = csv.writer(f)
            writer.writerow([datetime.now().isoformat(), question])


def main():
    """Build the knowledge base, optionally retrain, then run the chat loop."""
    kb = KnowledgeBase(
        host='localhost',
        user='root',
        password='hy188747',
        database='ubuntu_qa',
        table='qa_pair',
        model_dir=r"D:\NLP-PT\PT4\model",
        negative_ratio=1.0
    )

    if input("是否重新训练语义匹配模型？(y/n): ").strip().lower() == 'y':
        kb.build_model(
            epochs=5,
            batch_size=128,
            lr=1e-3
        )

    recorder = FeedbackRecorder()

    print("\n🎯 智能知识问答系统已启动（输入'q'退出聊天）\n")
    while True:
        query = input("🧐 问题：")
        if query.strip().lower() == 'q':
            break

        try:
            result, score = kb.retrieve(query)
            if result:
                print(f"💡 回答：{result[1]}")
                print(f"📊 匹配信心分数: {score:.4f}\n")
            else:
                print("⚠ 没有找到合适的答案，已将你的问题记录下来。")
                recorder.record_question(query)
                print("🔥 相似问题推荐：")
                similar_questions = kb.recommend_similar(query, topk=3)
                for q, a in similar_questions:
                    print(f"Q: {q}\nA: {a}\n")
        except Exception as e:
            print(f"❌ 检索过程中发生错误: {e}")


if __name__ == "__main__":
    main()

import numpy as np from datetime import datetime from torch.utils.data import Dataset, DataLoader, random_split from tqdm import tqdm import pickle import mysql.connector from sklearn.feature_...

# Download the four MNIST archives next to this script and cache them as a
# single pickle of NumPy arrays.
import gzip
import os
import os.path
import pickle
import urllib.request  # a bare ``import urllib`` does not load the ``request`` submodule

import numpy as np

# Canonical MNIST location (the posted snippet carried a proxy-rewritten URL).
url_base = 'http://yann.lecun.com/exdb/mnist/'
key_file = {
    'train_img': 'train-images-idx3-ubyte.gz',
    'train_label': 'train-labels-idx1-ubyte.gz',
    'test_img': 't10k-images-idx3-ubyte.gz',
    'test_label': 't10k-labels-idx1-ubyte.gz',
}

# Directory containing this script; the literal string "_file_" in the posted
# version was ``__file__`` with its underscores stripped.
dataset_dir = os.path.dirname(os.path.abspath(__file__))
save_file = dataset_dir + "/mnist.pkl"

train_num = 60000
test_num = 10000
img_dim = (1, 28, 28)
img_size = 784


def _local_path(file_name):
    """Return the path of *file_name* inside ``dataset_dir``."""
    return os.path.join(dataset_dir, file_name)


def _download(file_name):
    """Fetch one archive from ``url_base`` unless it already exists locally."""
    file_path = _local_path(file_name)
    if os.path.exists(file_path):
        return
    print("Downloading"+file_name+" ... ")
    urllib.request.urlretrieve(url_base + file_name, file_path)
    print("Done")


def download_mnist():
    """Download every archive listed in ``key_file``."""
    for file_name in key_file.values():
        _download(file_name)


def _load_label(file_name):
    """Read a gzipped label file into a 1-D uint8 array, skipping 8 header bytes."""
    print("Converting" + file_name +"to Numpy Array ...")
    with gzip.open(_local_path(file_name), 'rb') as f:
        labels = np.frombuffer(f.read(), np.uint8, offset=8)
    print("Done")
    return labels


def _load_img(file_name):
    """Read a gzipped image file into a ``(N, img_size)`` uint8 array.

    The first 16 bytes are skipped as header; the rest is reshaped so every
    row holds ``img_size`` values.
    """
    print("Converting"+file_name+"to Numpy Array ...")
    with gzip.open(_local_path(file_name), 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=16)
    data = data.reshape(-1, img_size)
    print("Done")
    return data


def _convert_numpy():
    """Load all four archives and return them in a dict keyed like ``key_file``."""
    return {
        'train_img': _load_img(key_file['train_img']),
        'train_label': _load_label(key_file['train_label']),
        'test_img': _load_img(key_file['test_img']),
        'test_label': _load_label(key_file['test_label']),
    }


def init_mnist():
    """Download MNIST, convert it to NumPy arrays and pickle it to ``save_file``."""
    download_mnist()
    dataset = _convert_numpy()
    print("Creating pickle file ...")
    with open(save_file, 'wb') as f:
        pickle.dump(dataset, f, -1)  # -1 selects the highest pickle protocol
    print("Done")


if __name__ == '__main__':
    init_mnist()

这段代码是用于下载MNIST数据集并将数据集转换成Numpy数组格式的函数。MNIST数据集是一个手写数字识别数据集，包含了60000张训练图片和10000张测试图片。在函数中，首先定义了数据集的下载地址和四个文件的名称，...

# Original question attached to this script: "what errors will it raise?"
# Problems in the posted version that are fixed below:
#   * ``streamlit_echarts`` was imported as a module and then called like a
#     function (TypeError: 'module' object is not callable);
#   * the "estimators" slider started at 0, which is not a valid forest size;
#   * the classifier was created only once in ``st.session_state``, so later
#     slider values were silently ignored.
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
import streamlit_echarts as st_echarts
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


def pivot_bar(data):
    """Build an ECharts bar-chart option dict from a table.

    Args:
        data: Table whose index supplies the x-axis categories and whose
            columns each become one bar series (a pandas DataFrame in this
            script).

    Returns:
        The option dictionary with one ``"bar"`` series per column.
    """
    return {
        "xAxis": {"type": "category", "data": data.index.tolist()},
        "legend": {},
        "yAxis": {"type": "value"},
        "series": [
            {"data": data[col].tolist(), "name": col, "type": "bar"}
            for col in data.columns
        ],
    }


st.markdown("mode pracitce")
st.sidebar.markdown("mode pracitce")
df = pd.read_csv(r"D:\课程数据\old.csv")
st.table(df.head())

with st.form("form"):
    index_val = st.multiselect("choose index", df.columns, ["Response"])
    agg_fuc = st.selectbox("choose a way", [np.mean, len, np.sum])
    submitted1 = st.form_submit_button("Submit")
    if submitted1:
        # NOTE(review): aggregating every remaining column may fail if the CSV
        # holds non-numeric columns and a numeric aggregate is chosen — confirm
        # against the data.
        z = df.pivot_table(index=index_val, aggfunc=agg_fuc)
        st.table(z)
        # The chart component is the ``st_echarts`` function inside the module.
        st_echarts.st_echarts(options=pivot_bar(z))

# Encode the table for modelling: drop the name column, map the target to 0/1
# and one-hot encode the categorical columns.
df_copy = df.copy()
df_copy.drop(axis=1, columns="Name", inplace=True)
df_copy["Response"] = df_copy["Response"].map({"no": 0, "yes": 1})
df_copy = pd.get_dummies(df_copy, columns=["Gender", "Area", "Email", "Mobile"])
st.table(df_copy.head())

y = df_copy["Response"].values
x = df_copy.drop(axis=1, columns="Response").values
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

with st.form("my_form"):
    # A forest needs at least one tree, so the slider starts at 1.
    estimators0 = st.slider("estimators", 1, 100, 10)
    max_depth0 = st.slider("max_depth", 1, 10, 2)
    submitted = st.form_submit_button("Submit")

# Rebuild and refit on every run so the submitted slider values take effect;
# the fitted model is kept in session state for the save button below.
st.session_state.model = RandomForestClassifier(
    n_estimators=estimators0, max_depth=max_depth0, random_state=1234
)
st.session_state.model.fit(X_train, y_train)
y_pred = st.session_state.model.predict(X_test)
st.table(confusion_matrix(y_test, y_pred))
st.write(f1_score(y_test, y_pred))

if st.button("save model"):
    pkl_filename = "D:\\pickle_model.pkl"
    with open(pkl_filename, 'wb') as file:
        pickle.dump(st.session_state.model, file)

2. 如果使用了 streamlit_echarts 库，在运行代码前需要先安装该库，可以通过 !pip install streamlit_echarts 命令进行安装。 3. 确保所有的依赖项都已经被正确地导入。 4. 确认你的代码没有语法错误和逻辑...

import numpy as npimport timeimport osimport pickleimport matplotlib.pyplot as plt

import numpy as np # 使用别名提高可读性和效率[^2] import time # 提供时间处理功能 import os # 文件路径操作和其他操作系统接口 import pickle # 数据序列化支持 import matplotlib.pyplot as plt # 可视化工具...

def _load_records(pkl_path):
    """Return the object pickled at *pkl_path*.

    NOTE: ``pickle.load`` can execute arbitrary code; only use trusted files.
    """
    with open(pkl_path, 'rb') as file:
        return pickle.load(file)


def read_data(pkl_path, loc):
    """Load pickled records and split one field of each by class label.

    Args:
        pkl_path: Path to a pickle holding an iterable of records; every
            record is indexed with ``'label'`` and with *loc*.
        loc: Key of the value to collect from each record. The values are
            combined with ``torch.stack``, so they must be tensors of one shape.

    Returns:
        ``(pos_data, neg_data)`` as float tensors: values from records whose
        label equals 0, and values from all other records.

    Raises:
        RuntimeError: if either class is empty (``torch.stack`` rejects an
            empty list).
    """
    pos_data, neg_data = [], []
    for record in _load_records(pkl_path):
        bucket = pos_data if record['label'] == 0 else neg_data
        bucket.append(record[loc])
    return torch.stack(pos_data).float(), torch.stack(neg_data).float()


def read_data_all(pkl_path):
    """Load pickled records and split the whole records by class label.

    Returns:
        ``(pos_data, neg_data)`` lists: records whose label equals 0, and all
        other records.
    """
    pos_data, neg_data = [], []
    for record in _load_records(pkl_path):
        bucket = pos_data if record['label'] == 0 else neg_data
        bucket.append(record)
    return pos_data, neg_data


def train_val_split(pos_data, neg_data, num_shot=10):
    """Randomly split paired positive/negative samples into train and validation.

    The same row indices are taken from both classes. Only the first
    ``min(len(pos_data), len(neg_data))`` rows are considered, so unequal
    class sizes no longer index out of range.

    Args:
        pos_data: Tensor of positive samples, first axis = samples.
        neg_data: Tensor of negative samples, first axis = samples.
        num_shot: Number of rows per class used for training; ``-1`` uses all.

    Returns:
        ``(x_train, y_train, x_val, y_val)``; labels are 0 for rows taken from
        *pos_data* and 1 for rows taken from *neg_data*.

    Raises:
        ValueError: from ``random.sample`` if *num_shot* exceeds the usable
            row count.
    """
    total_num = min(pos_data.shape[0], neg_data.shape[0])
    train_num = int(num_shot) if num_shot != -1 else total_num
    val_num = total_num - train_num

    train_idx = random.sample(range(total_num), train_num)
    # Sorted so the validation order does not depend on set iteration order.
    val_idx = sorted(set(range(total_num)) - set(train_idx))

    x_train = torch.cat([pos_data[train_idx], neg_data[train_idx]], dim=0)
    y_train = torch.cat([torch.zeros(train_num), torch.ones(train_num)])
    x_val = torch.cat([pos_data[val_idx], neg_data[val_idx]], dim=0)
    y_val = torch.cat([torch.zeros(val_num), torch.ones(val_num)])
    return x_train, y_train, x_val, y_val


def mix_data_fitting(x_train, y_train, model_name='lr'):
    """Fit one independent classifier per slice along axis 1 of *x_train*.

    Args:
        x_train: Training features; ``x_train[:, head]`` is fitted separately
            for every ``head`` (the original note reads "32 * 32 = 1024").
        y_train: Labels shared by every classifier.
        model_name: ``'lda'``, ``'lr'`` or ``'sgd'``.

    Returns:
        List of fitted models, one per slice.

    Raises:
        NotImplementedError: for an unknown *model_name*.
    """
    factories = {
        'lda': lambda: LinearDiscriminantAnalysis(),
        'lr': lambda: LogisticRegression(penalty='l2'),
        'sgd': lambda: SGDClassifier(loss='log_loss'),
    }
    if model_name not in factories:
        raise NotImplementedError(f"unknown model_name: {model_name!r}")
    make_model = factories[model_name]

    model_list = []
    for head in range(x_train.shape[1]):
        model = make_model()
        model.fit(x_train[:, head], y_train)
        model_list.append(model)
    return model_list


# Earlier draft kept for reference (superseded by ``objective`` below):
# def data_fitting_opt(x_train):
#     solutions = []
#     pos_data = x_train[:len(x_train) // 2]
#     neg_data = x_train[len(x_train) // 2:]
#     for head in range(x_train.shape[1]):
#         def objective(x,u1,u2):
#             s12 = np.dot(np.transpose(u1-u2),u1-u2)
#             s1 = np.dot(np.transpose(u1),u1)
#             s2 = np.dot(np.transpose(u2),u2)
#             np12 = np.dot(np.dot(x,s12),np.transpose(x))
#             np1 = np.sqrt(np.dot(np.dot(x,s1),np.transpose(x)))
#             np2 = np.sqrt(np.dot(np.dot(x,s2),np.transpose(x)))
#             down = np12 + (np1 + np2)**2
#             return -np12/down

import numpy as np


def objective(x, u1, u2):
    """Return the negated separation score of direction *x* for two sample sets.

    With class means ``mu_x``/``mu_y`` and (unnormalised) scatter matrices
    ``Sigma_x``/``Sigma_y`` the score is::

        alpha = (x.(mu_x - mu_y))**2
                / ((x.(mu_x - mu_y))**2 + (sqrt(x'Sigma_x x) + sqrt(x'Sigma_y x))**2)

    Args:
        x: Direction vector of length ``d``.
        u1: Samples of the first class, array-like of shape ``(n, d)``.
        u2: Samples of the second class, array-like of shape ``(m, d)``.

    Returns:
        ``-alpha`` (so that minimising maximises the score); ``0.0`` when the
        denominator is not positive.
    """
    u1 = np.asarray(u1)
    u2 = np.asarray(u2)

    mu_x = np.mean(u1, axis=0)
    mu_y = np.mean(u2, axis=0)

    # Numerator: squared projection of the mean difference onto x.
    numerator = np.dot(x, mu_x - mu_y) ** 2

    # Scatter matrices; left unnormalised (no 1/(n-1)), which the original
    # note ties to small sample counts.
    u1_centered = u1 - mu_x
    u2_centered = u2 - mu_y
    sigma_x = np.dot(u1_centered.T, u1_centered)
    sigma_y = np.dot(u2_centered.T, u2_centered)

    # Quadratic forms are non-negative in exact arithmetic; clamp rounding
    # noise so the square roots stay real.
    s_x = max(float(np.dot(np.dot(x, sigma_x), x)), 0.0)
    s_y = max(float(np.dot(np.dot(x, sigma_y), x)), 0.0)
    denominator = (np.sqrt(s_x) + np.sqrt(s_y)) ** 2

    total = numerator + denominator
    if not total > 0:
        return 0.0  # avoid 0/0 (e.g. x == 0)
    return -(numerator / total)


def model_opt(pos, neg):
    """Search for the direction that maximises the score used by ``objective``.

    Args:
        pos: Samples of the first class, array-like of shape ``(n, d)``.
        neg: Samples of the second class, array-like of shape ``(m, d)``.

    Returns:
        ``(x, alpha)``: the direction found by ``scipy.optimize.minimize`` from
        a random start, and its score.
    """
    import scipy.optimize as opt

    u1 = np.asarray(pos)
    u2 = np.asarray(neg)
    x0 = np.random.random(u1.shape[1])
    result = opt.minimize(objective, x0, args=(u1, u2))
    return result.x, -result.fun

# (Original poster's note: this is the second code segment.)

$$ \alpha = \frac{(x^T(\mu_x - \mu_y))^2}{(x^T(\mu_x - \mu_y))^2 + (\sqrt{x^T\Sigma_x x} + \sqrt{x^T\Sigma_y x})^2} $$ 最大化该值可使投影方向同时满足： - 类间差异大（分子项） - 类内方差小（分母项...

# Request from the original post: explain in detail what every line of this
# code does. The explanations are given as comments below.
if __name__ == "__main__":  # run only when executed as a script
    BMES = []
    print("正在读取本地模型矩阵...")

    # Initial-state data: a pickled mapping whose values become a NumPy array.
    with open(r'mat_pickle/init_mat.pkl', "rb") as f0:
        init_mat = np.array(list(pickle.load(f0).values()))
    # Transition data: a pickled mapping whose values are themselves mappings
    # (each one is unpacked with ``.values()`` further down).
    with open(r'mat_pickle/trans_mat.pkl', "rb") as f1:
        init_trans_mat = np.array(list(pickle.load(f1).values()))
    # Emission data: an object supporting ``.loc[state]`` — presumably a
    # pandas DataFrame with one row per hidden state; verify against the file.
    with open(r'mat_pickle/emit_mat.pkl', "rb") as f2:
        init_emit_mat = pickle.load(f2)

    # Iterating the emission table yields its labels; they serve as the
    # vocabulary of known characters.
    catalog = list(init_emit_mat)
    # First position of every character, so lookups match ``catalog.index``
    # without a linear search per character.
    char_index = {}
    for position, char in enumerate(catalog):
        char_index.setdefault(char, position)

    trans_mat = []
    emit_mat = []
    hidden_state = ["B", "M", "E", "S"]
    # One array of transition values per entry of the transition data.
    for item in init_trans_mat:
        trans_mat.append(np.array(list(item.values())))
    # One emission row per hidden state, in B, M, E, S order.
    for i in hidden_state:
        emit_mat.append(np.array(list(init_emit_mat.loc[i])))
    # Stack the four rows into a (4, vocabulary) matrix.
    emit_mat = np.array(emit_mat).reshape(4, -1)

    print("读取模型矩阵成功！")
    print("目前模型的汉字库", catalog)

    while True:
        new_sentence = input("请输入你要分词的句子（如：商品和货币）输入0结束分词功能:")
        if new_sentence == '0':  # sentinel that ends the session
            print("输入结束！")
            break

        # Characters missing from the vocabulary used to crash the loop with a
        # ValueError from ``catalog.index``; report them and ask again.
        unknown = [char for char in new_sentence if char not in char_index]
        if unknown:
            print("以下字符不在模型的汉字库中，无法分词：" + "".join(unknown))
            continue

        state_s = [0, 1, 2, 3]  # indices of the hidden states B, M, E, S
        # Observation sequence: vocabulary index of every input character.
        original = [char_index[char] for char in new_sentence]
        # ``compute`` and ``solve_tag`` are defined elsewhere in the file.
        result = compute(original, state_s, init_mat, trans_mat, emit_mat)
        answer = solve_tag(result, new_sentence)

        print("分词的结果为：")
        for item in answer:
            print(item, end='')
        print("\n")

with open(r'mat_pickle/init_mat.pkl', "rb") as f0: init_mat = np.array(list(pickle.load(f0).values())) 这行代码用于从文件 init_mat.pkl 中读取 HMM 模型的初始概率矩阵，并将其转换为 numpy 数组格式...

# Request from the original post: modelled on the function below, write a
# loader for DEAP dataset files in .mat format.
def deap_preprocess(data_file,emotion):
    """Load one pickled DEAP recording and its labels.

    Args:
        data_file: File stem inside the ``../DEAP_pickle_<emotion>/`` directory.
        emotion: Directory suffix selecting the label set (the original names
            it "arousal_or_valence").

    Returns:
        ``(datasets, labels)``: the data reshaped to ``(-1, 384, 32, 1)`` as
        float32, and the transposed labels as int64.
    """
    prefix = "../DEAP_pickle_" + emotion + "/" + data_file

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(prefix + ".mat_win_384_dataset.pkl", "rb") as data_fp:
        datasets = pickle.load(data_fp)
    with open(prefix + ".mat_win_384_label.pkl", "rb") as label_fp:
        labels = pickle.load(label_fp)

    labels = np.transpose(labels).astype('int64')
    datasets = datasets.reshape(-1, 384, 32, 1).astype('float32')
    return datasets, labels

import numpy as np # 加载数据和标签 with open(data_path, 'rb') as f: data = pickle.load(f) with open(label_path, 'rb') as f: labels = pickle.load(f) # reshape数据，假设原始数据是3D，转换为2D ...

# Draw all indices 0..155 in random batches without replacement and collect
# the matching entries of a pickled sequence.
import math
import pickle

import numpy as np

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# A ``with`` block closes the handle, which the posted version leaked.
with open(r'C:\Users\sdnugeo\Desktop\target2.pkl','rb') as f:
    bi = pickle.load(f)

data = np.arange(0, 156)
print(data)
data = data.tolist()

samples = []  # index batches, in drawing order
c = []        # entries of ``bi`` selected by each batch
a = 11        # number of batches
r = math.ceil(len(data)/a)  # batch size; the last batch takes what is left
print(r)

for i in range(a):
    # Take a full batch while enough indices remain, otherwise all of them.
    sample = np.random.choice(data, min(r, len(data)), replace=False)
    b = [bi[j] for j in sample]
    print(type(b))
    c.append(b)
    print(sample)
    samples.append(sample)
    # Remove the drawn indices so later batches cannot repeat them.
    drawn = set(sample.tolist())
    data = [value for value in data if value not in drawn]

samples = np.concatenate([np.array(s) for s in samples], axis=0)
c = np.concatenate([np.array(e) for e in c], axis=0)
print(samples)
print(c.shape)

# Request from the original post: please turn this script into a callable
# function.

好的，你想要将这段代码封装...filepath = r'C:\Users\sdnugeo\Desktop\target2.pkl' samples, c = sample_data(filepath) 其中，filepath是你想要读取的pickle文件的路径。函数返回两个变量，分别为samples和c。

相关推荐

mnist.pkl.gz数据文件

MNIST.pkl.gz数据包及Python下载代码

python中的Pickle文件和npy文件（csdn）————程序.pdf

import pickle, numpy as np IndentationError: unexpected indent

import numpy as npimport timeimport osimport pickleimport matplotlib.pyplot as plt

大家在看

机械臂建模+MATLAB代码+六自由度.zip

易语言WinSock模块应用

VxWorks和RTlinux的性能测试分析

波特率任意设 串口调试助手

十几种水下图像增强算法源代码

最新推荐

Visual C++.NET编程技术实战指南

HarmonyOS内核深度探秘：优化自由行旅游系统的策略

tkinter模块所有控件

局域网五子棋游戏：娱乐与聊天的完美结合

自由行旅游新篇章：HarmonyOS技术融合与系统架构深度解析

足底支撑相到达73%是什么问题

宾馆预约系统开发与优化建议

HarmonyOS在旅游领域的创新：揭秘最前沿应用实践

数据架构师需要具备什么能力

Java Web应用开发教程：Struts与Hibernate实例解析

波特率任意设串口调试助手