In this project, sentiment is divided into just two classes, negative and positive (0 == negative, 1 == positive).
Introduction to LSTM
LSTM (Long Short-Term Memory) is a network architecture built on top of the RNN (Recurrent Neural Network). Compared with a plain RNN, it handles long sequences of information more effectively and is applied in NLP, video signal processing, and other fields.
Figure 1. Parameter computations in the RNN and the LSTM
The core idea of the RNN is to process the previous time step's output together with the current time step's input to produce the current output, which lets the network process sequences dynamically over time.
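In the standard (vanilla) RNN, this recurrence is written as

\[ h_t = \tanh(W_{hh} h_{t-1} + W_{xh} x_t + b_h) \]

where h_t is the hidden state at time t and x_t is the current input.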
LSTM builds on the RNN by introducing four gates: the input gate, forget gate, gate gate, and output gate. Through these gates it controls what information is written to, kept in, and read out of the cell, enabling better processing of long-range sequence information.
input gate: Whether to write to cell (i)
forget gate: Whether to erase cell (f)
gate gate: How much to write to cell (g)
output gate: How much to reveal cell (o)
Figure 2. Structure of the LSTM cell
c_t is the long-term memory (the cell state); h_t is the short-term memory (the hidden state).
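Written out, the standard LSTM cell computes the four gates and the two memories as follows (σ is the sigmoid, ⊙ is elementwise multiplication; the figure's exact notation may differ slightly):

\[
\begin{aligned}
i_t &= \sigma(W_i x_t + U_i h_{t-1} + b_i) \\
f_t &= \sigma(W_f x_t + U_f h_{t-1} + b_f) \\
g_t &= \tanh(W_g x_t + U_g h_{t-1} + b_g) \\
o_t &= \sigma(W_o x_t + U_o h_{t-1} + b_o) \\
c_t &= f_t \odot c_{t-1} + i_t \odot g_t \\
h_t &= o_t \odot \tanh(c_t)
\end{aligned}
\]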
Project Implementation
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import GloVe
from torchtext.data.utils import get_tokenizer
import time
from sklearn.metrics import accuracy_score
# 1. Load the data
train_df = pd.read_json('./train.json')
test_df = pd.read_json('./test.json')
# 2. Preprocessing: tokenizer and hyperparameters
tokenizer = get_tokenizer("basic_english")
batch_size = 64
train_epoch = 100
lr = 0.001
hidden_dim = 256
output_dim = 2
# 3. Define the sentiment analysis dataset
class SentimentDataset(Dataset):
    def __init__(self, reviews, sentiments=None, vocab=None, tokenizer=None):
        self.reviews = reviews
        self.sentiments = sentiments
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = str(self.reviews[idx])
        # Keep only tokens that exist in the GloVe vocabulary
        tokens = [self.vocab[token] for token in self.tokenizer(review) if token in self.vocab]
        if self.sentiments is not None:
            sentiment = int(self.sentiments[idx])
            return torch.tensor(tokens, dtype=torch.long), torch.tensor(sentiment, dtype=torch.long)
        else:
            return torch.tensor(tokens, dtype=torch.long)
# 4. Build the vocabulary and load GloVe word vectors pretrained on the 6B-token corpus
vocab = GloVe(name='6B', dim=100, cache='./glove_cache')
embedding_dim = 100
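For illustration, an individual token's index and embedding can be inspected as follows (a minimal sketch; the word 'good' is just an example, assuming it is in the 6B vocabulary):

idx = vocab.stoi['good']   # integer id of the token in the GloVe vocabulary
vec = vocab.vectors[idx]   # the matching 100-dimensional embedding
print(idx, vec.shape)      # prints the id and torch.Size([100])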
# 5. Prepare the training data and split it into training and validation sets
train_texts = train_df['reviews'].values
train_labels = train_df['sentiments'].values
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=10)
# 6. Create the training and validation datasets
train_dataset = SentimentDataset(reviews=train_texts, sentiments=train_labels, vocab=vocab.stoi, tokenizer=tokenizer)
valid_dataset = SentimentDataset(reviews=val_texts, sentiments=val_labels, vocab=vocab.stoi, tokenizer=tokenizer)
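As a quick sanity check, indexing the dataset yields a (token-id tensor, label tensor) pair (a sketch; the printed shape depends on the actual first review):

sample_tokens, sample_label = train_dataset[0]
print(sample_tokens.shape, sample_label)  # e.g. torch.Size([n_tokens]) and tensor(0) or tensor(1)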
# 7. Define the LSTM model
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SentimentLSTM, self).__init__()
        # Initialize the embedding layer from the pretrained GloVe vectors and fine-tune it
        self.embedding = nn.Embedding.from_pretrained(vocab.vectors, freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        # Classify from the final hidden state of the last LSTM layer
        return self.fc(hidden[-1])
# 8. Initialize the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
model = SentimentLSTM(vocab_size=len(vocab.stoi), embedding_dim=embedding_dim, hidden_dim=hidden_dim, output_dim=output_dim).to(device)
# model.load_state_dict(torch.load(f'./LSTM_model_{hidden_dim}.pth', map_location=device))  # load a previously trained model
# model.to(device)
# print(f'load pretrained model, model name: LSTM_model_{hidden_dim}.pth')
# 9. Define the loss function, optimizer, and learning rate scheduler
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5, verbose=True)
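With mode='max', ReduceLROnPlateau halves the learning rate (factor=0.5) once the monitored metric, here the validation accuracy, has failed to improve for patience=5 consecutive scheduler steps; since scheduler.step(accuracy) is called only every second epoch below, that corresponds to roughly 10 training epochs without improvement.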
# 10. Prepare the data loaders
def collate_fn(batch):
    texts, labels = zip(*batch)
    # Right-pad each batch to the length of its longest sequence (pad index 0)
    texts = pad_sequence(texts, batch_first=True).to(device)
    labels = torch.stack(labels).to(device)
    return texts, labels
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
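To make the padding behavior in collate_fn concrete, here is a toy sketch (the two sequences are made up for illustration):

a = torch.tensor([1, 2, 3])
b = torch.tensor([4, 5, 6, 7, 8])
padded = pad_sequence([a, b], batch_first=True)
print(padded.shape)  # torch.Size([2, 5]); the shorter sequence is right-padded with zeros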
# 11. Model training
print(f"Device: {device}")
start_time = time.time()
best_accuracy = 0.0  # tracks the best validation accuracy seen so far
best_model_path = f'./LSTM_model_{hidden_dim}.pth'
for epoch in range(train_epoch):
    print(f"Epoch {epoch + 1}/{train_epoch}")
    model.train()
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validate every 2 epochs
    if (epoch + 1) % 2 == 0:
        model.eval()
        all_preds = []
        all_labels = []
        wrong_comments = []  # stores the misclassified reviews
        global_index = 0
        with torch.no_grad():
            for texts, labels in valid_loader:
                texts, labels = texts.to(device), labels.to(device)
                outputs = model(texts)
                preds = torch.argmax(outputs, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                # Record each misclassified review together with its true label and prediction
                for i, (pred, label) in enumerate(zip(preds.cpu().numpy(), labels.cpu().numpy())):
                    if pred != label:  # prediction disagrees with the ground-truth label
                        wrong_comments.append({
                            "text": val_texts[global_index + i],  # use the global index to fetch the text
                            "true_label": label,
                            "predicted_label": pred
                        })
                global_index += len(labels)  # advance by the current batch size

        # Save the misclassified reviews to a file
        with open("wrong_comment_LSTM.txt", "w", encoding="utf-8") as f:
            for wc in wrong_comments:
                f.write(f"Text: {wc['text']} | True Label: {wc['true_label']} | Predicted Label: {wc['predicted_label']}\n")

        # Compute the validation accuracy for the current epoch
        accuracy = accuracy_score(all_labels, all_preds)
        print(f'Validation Accuracy after epoch {epoch + 1}: {accuracy * 100:.2f}%')

        # Log the accuracy and learning rate for each validated epoch
        current_lr = optimizer.param_groups[0]['lr']
        with open('LSTM_V3_train_parameter.txt', 'a', encoding="utf-8") as f:
            f.write(f"Epoch {epoch + 1} - Validation Accuracy: {accuracy * 100:.2f}%, Learning Rate: {current_lr}, dim:{hidden_dim}\n")

        # Step the learning rate scheduler on the validation accuracy
        scheduler.step(accuracy)

        # Save the model whenever it reaches a new best accuracy
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), best_model_path)
            print(f"New best model saved with accuracy: {best_accuracy * 100:.2f}%")

end_time = time.time()
print("Training complete")
print("Best Validation Accuracy:", best_accuracy * 100, "%")
print("Total training time:", end_time - start_time)
# 12. Load and evaluate the best model
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()
# 13. Predict on test.json
test_texts = test_df['reviews'].values
test_dataset = SentimentDataset(reviews=test_texts, vocab=vocab.stoi, tokenizer=tokenizer)
# The test set has no labels, so it needs its own collate function that only pads
def test_collate_fn(batch):
    return pad_sequence(batch, batch_first=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=test_collate_fn)
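The listing breaks off here; the remaining prediction step might look like the following sketch (the output file name is illustrative, not from the original script):

# Sketch of the remaining prediction step (output file name is an assumption)
all_test_preds = []
with torch.no_grad():
    for texts in test_loader:
        texts = texts.to(device)
        outputs = model(texts)
        preds = torch.argmax(outputs, dim=1)
        all_test_preds.extend(preds.cpu().numpy())

# Attach the predictions to the test DataFrame and write them out
test_df['sentiments'] = all_test_preds
test_df.to_csv('./LSTM_test_predictions.csv', index=False)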
In summary, this project trains an LSTM sentiment classifier, saves the model with the highest validation accuracy, dynamically reduces the learning rate when validation accuracy stops improving, and writes the misclassified validation reviews to a dedicated file.