```python
from collections import Counter
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, BertModel
import joblib
from sklearn.metrics import confusion_matrix
import seaborn as sns

# 1. ====================== Configuration ======================
MODEL_PATH = r'D:\pythonProject5\bert-base-chinese'
BATCH_SIZE = 64
MAX_LENGTH = 128
SAVE_DIR = r'D:\pythonProject5\BSVMC_model'
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. ====================== Data loading and splitting ======================
def load_data(file_path):
    """Load and preprocess the data."""
    df = pd.read_csv(file_path).dropna(subset=['text', 'label'])
    texts = df['text'].astype(str).str.strip().tolist()
    labels = df['label'].astype(int).tolist()
    return texts, labels

# Load the raw data
texts, labels = load_data("train3.csv")

# First split: hold out the test set (20%)
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42
)

# Second split: training set (56% of the original data) and validation set (24%)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts, train_val_labels,
    test_size=0.3,  # 0.3 * 0.8 = 24% of the original data
    stratify=train_val_labels,
    random_state=42
)

# 3. ====================== Text encoding ======================
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

def encode_texts(texts):
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

# Encode every split
train_encodings = encode_texts(train_texts)
val_encodings = encode_texts(val_texts)
test_encodings = encode_texts(test_texts)

# 4. ====================== Dataset class ======================
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': torch.tensor(self.labels[idx])
        }

    def __len__(self):
        return len(self.labels)

# Build the datasets and data loaders
train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)
test_dataset = TextDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# 5. ====================== Feature extraction ======================
def extract_features(bert_model, dataloader):
    """Extract the [CLS] feature from BERT for every sample."""
    bert_model.eval()
    all_features = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Extracting features"):
            inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != 'labels'}
            outputs = bert_model(**inputs)
            features = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_features.append(features)
            all_labels.append(batch['labels'].numpy())
    return np.vstack(all_features), np.concatenate(all_labels)

# Load BERT and freeze its parameters
bert_model = BertModel.from_pretrained(MODEL_PATH).to(DEVICE)
for param in bert_model.parameters():
    param.requires_grad = False

# Extract features for every split
print("\n" + "=" * 30 + " Feature extraction " + "=" * 30)
train_features, train_labels = extract_features(bert_model, train_loader)
val_features, val_labels = extract_features(bert_model, val_loader)
test_features, test_labels = extract_features(bert_model, test_loader)

# 6. ====================== Feature preprocessing ======================
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)  # fit on the training set only
val_features = scaler.transform(val_features)
test_features = scaler.transform(test_features)

# 7. ====================== Train the SVM ======================
print("\n" + "=" * 30 + " Training the SVM " + "=" * 30)
svm_model = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    probability=True,
    random_state=42
)
svm_model.fit(train_features, train_labels)

# 8. ====================== Evaluation ======================
def evaluate(features, labels, model, dataset_name):
    preds = model.predict(features)
    acc = accuracy_score(labels, preds)
    print(f"\n[{dataset_name}] results:")
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(labels, preds, digits=4))
    return preds

_ = evaluate(train_features, train_labels, svm_model, "Training set")
val_preds = evaluate(val_features, val_labels, svm_model, "Validation set")
test_preds = evaluate(test_features, test_labels, svm_model, "Test set")

# 9. ====================== Save the pipeline ======================
def save_pipeline():
    """Save the complete model pipeline."""
    import os
    os.makedirs(SAVE_DIR, exist_ok=True)

    # BERT weights and tokenizer
    bert_model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)

    # SVM and preprocessing
    joblib.dump(svm_model, f"{SAVE_DIR}/svm_model.pkl")
    joblib.dump(scaler, f"{SAVE_DIR}/scaler.pkl")

    # Label mapping (assumes 0 = neutral, 1 = positive, 2 = negative)
    label_map = {0: "中性", 1: "正面", 2: "负面"}
    joblib.dump(label_map, f"{SAVE_DIR}/label_map.pkl")
    print(f"\nModel saved to {SAVE_DIR}")

save_pipeline()

# 10. ===================== Visualisation ======================
plt.figure(figsize=(15, 5))

# Decision values for the first 100 training samples (one column per class)
plt.subplot(1, 2, 1)
plt.plot(svm_model.decision_function(train_features[:100]), 'o', alpha=0.5)
plt.title("Decision values of the first 100 training samples")
plt.xlabel("Sample index")
plt.ylabel("Decision value")

# Accuracy comparison across the three splits
plt.subplot(1, 2, 2)
accuracies = [
    accuracy_score(train_labels, svm_model.predict(train_features)),
    accuracy_score(val_labels, val_preds),
    accuracy_score(test_labels, test_preds)
]
split_names = ['Train', 'Validation', 'Test']
plt.bar(split_names, accuracies, color=['blue', 'orange', 'green'])
plt.ylim(0, 1)
plt.title("Accuracy on each dataset")
plt.ylabel("Accuracy")
plt.tight_layout()
plt.show()

# Confusion matrix on the test set (its own figure, so it does not disturb the subplots above)
cm = confusion_matrix(y_true=test_labels, y_pred=test_preds)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['0', '1', '2'], yticklabels=['0', '1', '2'])
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion matrix')
plt.show()
```

Please draw the architecture diagram for this model.

To draw an architecture diagram for a BERT + SVM pipeline like the one in your script, first pin down the logical relationship between feature extraction and classification in the three steps below; the diagram itself can then be produced with any diagramming tool (Draw.io, Lucidchart), with Mermaid, or programmatically, as sketched later in this answer.
### 1. Architecture design
1. **BERT layer**: takes the raw text and passes it through 12 (base) or 24 (large) Transformer encoder layers, producing a 768- or 1024-dimensional vector per token; the `bert-base-chinese` model in your script uses 12 layers and 768 dimensions.
$$ \text{BERT}(x) = [h_1, h_2, ..., h_n] $$
2. **Pooling step**: the hidden state at the [CLS] position serves as the sentence representation[^2]; your script takes `last_hidden_state[:, 0, :]` directly rather than BERT's tanh-activated pooler output.
$$ h_{\text{sent}} = h_{[\text{CLS}]} = h_1 $$
3. **SVM classifier**: takes the BERT feature vector and makes the class decision through a kernel function (RBF in your script)
$$ f(h_{\text{sent}}) = \text{sign}\Big(\sum_{i=1}^{m} \alpha_i y_i\, K(h_{\text{sent}}, h^{(i)}) + b\Big) $$
where the $h^{(i)}$ are the support vectors; for three classes, scikit-learn combines several such binary decision functions.
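Before drawing the diagram it can help to pin down the tensor shapes that flow between these blocks. A minimal sketch, assuming the settings from your script (`BATCH_SIZE = 64`, `MAX_LENGTH = 128`, bert-base hidden size 768); the dummy tensors only stand in for the tokenizer/BERT outputs to document shapes:

```python
import torch

BATCH_SIZE, MAX_LENGTH, HIDDEN = 64, 128, 768  # values taken from the script above

# Dummy tensors standing in for the tokenizer / BERT outputs, shapes only.
input_ids = torch.zeros(BATCH_SIZE, MAX_LENGTH, dtype=torch.long)   # token ids per sentence
last_hidden_state = torch.zeros(BATCH_SIZE, MAX_LENGTH, HIDDEN)     # one vector per token
cls_features = last_hidden_state[:, 0, :]                           # sentence vectors for the SVM

assert input_ids.shape == (BATCH_SIZE, MAX_LENGTH)
assert cls_features.shape == (BATCH_SIZE, HIDDEN)  # (64, 768): scaled, then fed to SVC
```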
### 2. Implementation sketch
```python
from transformers import BertModel, BertTokenizer
from sklearn.svm import SVC
import torch

# --- BERT feature extraction ---
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

texts = ["example text one", "example text two"]   # placeholder corpus
labels = [0, 1]                                     # placeholder labels

inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
features = outputs.last_hidden_state[:, 0, :].numpy()  # [CLS] vectors, one per text

# --- SVM classification ---
svm = SVC(kernel='rbf')
svm.fit(features, labels)
```
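For the right-hand side of the diagram it also helps to see how the saved pieces are used at inference time. A minimal sketch, assuming the artifacts written by `save_pipeline()` in your script (`SAVE_DIR`, `svm_model.pkl`, `scaler.pkl`, `label_map.pkl`); the input sentence is a placeholder:

```python
import joblib
import torch
from transformers import AutoTokenizer, BertModel

SAVE_DIR = r'D:\pythonProject5\BSVMC_model'   # directory used by save_pipeline()
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload the pieces of the pipeline: tokenizer + BERT, scaler, SVM, label map.
tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)
bert = BertModel.from_pretrained(SAVE_DIR).to(DEVICE).eval()
scaler = joblib.load(f"{SAVE_DIR}/scaler.pkl")
svm = joblib.load(f"{SAVE_DIR}/svm_model.pkl")
label_map = joblib.load(f"{SAVE_DIR}/label_map.pkl")

def predict(text: str) -> str:
    """Classify one sentence with the BERT -> scaler -> SVM pipeline."""
    enc = tokenizer(text, truncation=True, padding="max_length",
                    max_length=128, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        cls_vec = bert(**enc).last_hidden_state[:, 0, :].cpu().numpy()
    return label_map[int(svm.predict(scaler.transform(cls_vec))[0])]

print(predict("这部电影很好看"))  # placeholder input sentence
```

Because the scaler is part of the deployed pipeline, the architecture diagram should show it as an explicit block between the [CLS] vector and the SVM.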
### 3. Drawing the architecture diagram
A flow diagram in Mermaid syntax:
```mermaid
graph LR
    A[Raw text] --> B(BERT encoder)
    B --> C["[CLS] feature vector"]
    C --> D(SVM classifier)
    D --> E{Predicted class}
```
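If you would rather generate the figure from Python than render Mermaid, here is a rough equivalent using the `graphviz` package (an assumption on my part; it is not used in your script, and it needs the Graphviz binaries installed). The node labels mirror your pipeline: frozen `bert-base-chinese`, `StandardScaler`, and an RBF-kernel SVM:

```python
from graphviz import Digraph  # pip install graphviz; also needs the Graphviz binaries on PATH

dot = Digraph("bert_svm", format="png")
dot.attr(rankdir="LR")  # left-to-right flow, matching the Mermaid sketch above

dot.node("A", "Raw text", shape="box")
dot.node("B", "BERT encoder (frozen bert-base-chinese)", shape="box", style="rounded")
dot.node("C", "[CLS] feature vector, 768-dim", shape="box")
dot.node("D", "StandardScaler", shape="box")
dot.node("E", "SVM classifier (RBF kernel)", shape="box", style="rounded")
dot.node("F", "Predicted class", shape="diamond")

dot.edges([("A", "B"), ("B", "C"), ("C", "D"), ("D", "E"), ("E", "F")])
dot.render("bert_svm_architecture", cleanup=True)  # writes bert_svm_architecture.png
```

Running it produces `bert_svm_architecture.png`, which can be embedded directly in a report.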