I. Overview
The task is to analyze English reviews and classify each one's sentiment as positive or negative.
The overall pipeline: data preprocessing → model construction → model training and testing.
II. Details
1. Data Preprocessing
Strip punctuation from all text and lowercase it, build a vocabulary, map each word of every sentence to an id via the vocabulary, pad each sentence to a fixed length, and finally split the data into training, validation, and test sets.
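Before the full script, a toy walk-through may make the mapping concrete; the vocabulary and ids below are made up purely for illustration:

from string import punctuation

review = "This movie was GREAT!"
cleaned = ''.join(ch.lower() for ch in review if ch not in punctuation)
print(cleaned)                        # this movie was great
word2id = {'this': 1, 'movie': 2, 'was': 3, 'great': 4}   # hypothetical vocabulary
ids = [word2id.get(w, 0) for w in cleaned.split()]
print(ids)                            # [1, 2, 3, 4]
print(ids + [0] * (10 - len(ids)))    # padded to length 10: [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]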
from string import punctuation
import random
import numpy as np
import torch
from collections import Counter
from torch.utils.data import DataLoader, TensorDataset
from torch import optim
from tqdm import tqdm
import torch.nn as nn
from sklearn.metrics import classification_report
import Model
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
if __name__ == '__main__':
    # Load the data
    with open('reviews.txt', 'r') as f:
        text = f.read()
    with open('labels.txt', 'r') as f:
        labels = f.read()

    # Strip punctuation, lowercase, and split into one review per line
    def text_process(text):
        text_no_punc = [char.lower() for char in text if char not in punctuation]
        text_list = ''.join(text_no_punc).split('\n')
        return text_list

    clean_text = text_process(text)
    labels = labels.split('\n')

    # Build the vocabulary: deduplicate all words and map them to ids,
    # reserving id 0 for padding / unknown words
    def get_dict(text):
        all_words = []
        for sentence in text:
            all_words.extend(sentence.split())
        # sorted() makes the word->id assignment reproducible across runs
        # (plain set order is not, due to Python's hash randomization)
        unique_words = sorted(set(all_words))
        id2word = dict(enumerate(unique_words, 1))
        word2id = {word: int(idx) for idx, word in id2word.items()}
        return word2id, id2word

    word2id, id2word = get_dict(clean_text)
    label2id = np.array([1 if x == 'positive' else 0 for x in labels])

    # Drop the shortest reviews (empty lines) and the longest outliers
    length = [len(sentence.split()) for sentence in clean_text]
    counts = Counter(length)
    min_len = min(counts.items())   # (smallest length, its count)
    max_len = max(counts.items())   # (largest length, its count)
    min_idx = [i for i, sentence in enumerate(clean_text) if len(sentence.split()) == min_len[0]]
    max_idx = [i for i, sentence in enumerate(clean_text) if len(sentence.split()) == max_len[0]]
    new_text = np.delete(clean_text, min_idx + max_idx)
    new_label = np.delete(label2id, min_idx + max_idx)

    # Convert each review to a list of word ids
    max_sentence_length = 200
    words_id = []
    for sentence in new_text:
        senten = sentence.split(' ')
        words_id.append([word2id.get(word, 0) for word in senten])

    # Padding: truncate to max_sentence_length, or left-align and zero-pad
    dataset = np.zeros((len(words_id), max_sentence_length))
    for index, words in enumerate(words_id):
        if len(words) >= max_sentence_length:
            dataset[index, :] = words[:max_sentence_length]
        else:
            dataset[index, :len(words)] = words
    data_tensor = torch.tensor(dataset, dtype=torch.long)
    data_label = torch.tensor(new_label, dtype=torch.long)

    # Split into 80% train, 10% validation, 10% test
    train_size = int(data_tensor.shape[0] * 0.8)
    rest_size = data_tensor.shape[0] - train_size
    val_size = int(rest_size * 0.5)
    test_size = rest_size - val_size
    train_data = data_tensor[:train_size]
    train_label = data_label[:train_size]
    rest_data = data_tensor[train_size:]
    rest_label = data_label[train_size:]
    val_data = rest_data[:val_size]
    val_label = rest_label[:val_size]
    test_data = rest_data[val_size:]
    test_label = rest_label[val_size:]
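As a quick sanity check (optional; exact counts depend on reviews.txt), the shapes can be printed at the end of the script before moving on:

    print(data_tensor.shape)   # (number of kept reviews, 200)
    print(train_data.shape, val_data.shape, test_data.shape)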
2. Model Training
Because this code continues directly from the preprocessing code above, it is shown first; the model definition itself is left to section 3 and is simply imported (import Model) when training or testing.
Training uses a bidirectional LSTM. In every epoch the model is evaluated on both the validation and test sets, and the best checkpoint is saved. (Strictly speaking, choosing the checkpoint by test-set F1, as below, leaks test information into model selection; the validation metric would normally be used for this.)
    # Wrap the tensors in DataLoaders
    train_dataset = TensorDataset(train_data, train_label)
    val_dataset = TensorDataset(val_data, val_label)
    test_dataset = TensorDataset(test_data, test_label)
    batch_size = 64
    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              pin_memory=True, shuffle=True, drop_last=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=batch_size,
                            pin_memory=True, shuffle=True, drop_last=True, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=batch_size,
                             pin_memory=True, shuffle=True, drop_last=True, num_workers=4)

    # Hyperparameters
    embed_dim = 300
    hidden_dim = 256
    output_size = 2
    vocab_size = len(word2id) + 1   # +1 because id 0 is reserved for padding/unknown
    epochs = 20
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Build the model
    model = Model.LSTM(embedding_dim=embed_dim, hidden_dim=hidden_dim,
                       output_size=output_size, vocab_size=vocab_size)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.98, 0.999), eps=1e-08)
    criterion = nn.CrossEntropyLoss()

    # Training loop
    best_f1 = 0.0
    for epoch in tqdm(range(epochs), desc='Epoch'):
        model.train()
        train_loss = 0.0   # reset each epoch so the running loss does not accumulate
        for step, (x, y) in enumerate(train_loader):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x)   # shape (B, 2)
            loss = criterion(logits, y)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
        train_loss /= len(train_loader)
        print()
        print('train loss: {}'.format(train_loss))

        # Validate once per epoch to track the loss
        model.eval()
        correct = 0
        val_loss = 0
        with torch.no_grad():
            for step, (x, y) in enumerate(val_loader):
                x, y = x.to(device), y.to(device)
                logits = model(x)
                loss = criterion(logits, y)
                val_loss += loss.item()
                y_pred = torch.argmax(logits, dim=-1)
                correct += y_pred.eq(y).sum().item()
        print('val loss: {} accuracy: {}'.format(val_loss / len(val_loader),
                                                 correct / (len(val_loader) * batch_size)))

        # Test once per epoch: accuracy, F1, etc.; save the best checkpoint
        model.eval()
        y_real = []
        y_pred = []
        test_loss = 0
        with torch.no_grad():
            for step, (x, y) in enumerate(test_loader):
                x, y = x.to(device), y.to(device)
                logits = model(x)
                loss = criterion(logits, y)
                test_loss += loss.item()
                y_pre = torch.argmax(logits, dim=-1).tolist()
                y_real.extend(y.tolist())
                y_pred.extend(y_pre)
        ret = classification_report(y_real, y_pred, output_dict=True)

        # Per-class scores from classification_report
        precision_0 = ret['0']['precision']
        recall_0 = ret['0']['recall']
        f1_0 = ret['0']['f1-score']
        support_0 = ret['0']['support']
        precision_1 = ret['1']['precision']
        recall_1 = ret['1']['recall']
        f1_1 = ret['1']['f1-score']
        support_1 = ret['1']['support']

        # Support-weighted averages
        weight_0 = support_0 / (support_1 + support_0)
        weight_1 = 1 - weight_0
        avg_precision = precision_0 * weight_0 + precision_1 * weight_1
        avg_recall = recall_0 * weight_0 + recall_1 * weight_1
        avg_f1 = f1_0 * weight_0 + f1_1 * weight_1

        # Macro averages (kept for reference)
        all_avg_precision = ret['macro avg']['precision']
        all_avg_recall = ret['macro avg']['recall']
        all_avg_f1 = ret['macro avg']['f1-score']

        print('Testing Epoch: [%d/%d]' % (epoch + 1, epochs))
        print('"0" precision=%.2f, recall=%.2f, f1=%.2f' % (precision_0, recall_0, f1_0))
        print('"1" precision=%.2f, recall=%.2f, f1=%.2f' % (precision_1, recall_1, f1_1))

        # Save the checkpoint with the best weighted F1 so far
        # (the filename must match the one loaded by the test script)
        if avg_f1 > best_f1:
            best_f1 = avg_f1
            torch.save(model.state_dict(), 'best_model.bin')
            print('Saved the best model %s, avg_f1=%.2f' % ('best_model.bin', best_f1))
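Note that the support-weighted averages computed by hand above reproduce exactly what classification_report already returns under its 'weighted avg' key, so the checkpoint criterion could equivalently be read straight from the report:

avg_precision = ret['weighted avg']['precision']
avg_recall = ret['weighted avg']['recall']
avg_f1 = ret['weighted avg']['f1-score']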
3. Model Construction
The model consists of an embedding layer, a bidirectional LSTM layer, and a fully connected layer.
import torch.nn as nn
import torch

class LSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_size, vocab_size, dropout=0.5, num_layers=2):
        super(LSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_size = output_size
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.embedding_layer = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.LSTM = nn.LSTM(self.embedding_dim, self.hidden_dim, num_layers=self.num_layers,
                            dropout=self.dropout, batch_first=True, bidirectional=True)
        # The bidirectional LSTM yields 2 * hidden_dim features
        self.fclayer = nn.Linear(self.hidden_dim * 2, self.output_size)

    def forward(self, x):
        emb = self.embedding_layer(x)
        output, (h_n, c_n) = self.LSTM(emb)
        # Concatenate the final layer's backward (h_n[-1]) and forward (h_n[-2]) states
        out = torch.cat([h_n[-1, ...], h_n[-2, ...]], dim=-1)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax internally,
        # so no sigmoid/softmax belongs here
        logits = self.fclayer(out)
        return logits

if __name__ == '__main__':
    inputs = torch.randint(0, 100, (128, 200))
    model = LSTM(embedding_dim=300, hidden_dim=256, output_size=2, vocab_size=500)
    out = model(inputs)
    y_pre = torch.argmax(out, dim=-1)
    print(out)
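Why h_n[-1, ...] and h_n[-2, ...]? For nn.LSTM with num_layers=2 and bidirectional=True, h_n has shape (num_layers * 2, batch, hidden_dim), ordered layer by layer with the forward direction before the backward one, so the last two slices are the final layer's two directional states. A quick standalone check:

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=8, hidden_size=16, num_layers=2,
               batch_first=True, bidirectional=True)
x = torch.randn(4, 10, 8)                 # (batch, seq_len, input_size)
output, (h_n, c_n) = lstm(x)
print(h_n.shape)                          # torch.Size([4, 4, 16]): (num_layers*2, batch, hidden)
# h_n[-2] is the final layer's forward state, h_n[-1] its backward state;
# concatenating them gives the (batch, 2*hidden) vector fed to the classifier
print(torch.cat([h_n[-1], h_n[-2]], dim=-1).shape)   # torch.Size([4, 32])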
4. Model Testing
Each input sentence is processed the same way as in training: punctuation removed, text lowercased, words mapped to ids, and the sequence padded; the result is then fed to the model to obtain the prediction.
import torch
from Model import LSTM
from string import punctuation

def text_process(text):
    text_no_punc = [char.lower() for char in text if char not in punctuation]
    text_list = ''.join(text_no_punc).split('\n')
    return text_list

def get_dict(text):
    # Same vocabulary construction as in preprocessing:
    # deduplicate all words; id 0 is reserved for padding/unknown
    all_words = []
    for sentence in text:
        all_words.extend(sentence.split())
    # sorted() keeps the word->id assignment identical to training time
    unique_words = sorted(set(all_words))
    id2word = dict(enumerate(unique_words, 1))
    word2id = {word: int(idx) for idx, word in id2word.items()}
    return word2id, id2word

def sentence_process(word2id, sentence, max_sentence_length=200):
    # Strip punctuation, lowercase, map to ids, then pad or truncate
    sentence = ''.join([char for char in sentence if char not in punctuation])
    sentence = sentence.lower().split()
    words_id = [word2id.get(word, 0) for word in sentence]
    if len(words_id) < max_sentence_length:
        words_id += [0] * (max_sentence_length - len(words_id))
    else:
        words_id = words_id[:max_sentence_length]
    words_id = torch.tensor(words_id).unsqueeze(0)
    return words_id

# Rebuild the same vocabulary used at training time
with open('reviews.txt', 'r') as f:
    text = f.read()
clean_text = text_process(text)
word2id, id2word = get_dict(clean_text)

# text = 'I think this is great!'
# text_id = sentence_process(word2id, sentence=text)

embed_dim = 300
hidden_dim = 256
output_size = 2
vocab_size = len(word2id) + 1   # must match the value used at training time
device = "cuda" if torch.cuda.is_available() else "cpu"

model = LSTM(embedding_dim=embed_dim, hidden_dim=hidden_dim,
             output_size=output_size, vocab_size=vocab_size)
model.load_state_dict(torch.load('best_model.bin', map_location=device))
model.to(device)
model.eval()

while True:
    print('============================')
    x = input('Enter an English review (type quit to exit): ')
    if x == 'quit':
        break
    else:
        inputs_ids = sentence_process(word2id, x)
        inputs_ids = inputs_ids.to(device)
        with torch.no_grad():
            logits = model(inputs_ids)
        y_pre = torch.argmax(logits, dim=-1).tolist()
        print('result:', y_pre[0])   # 1 = positive, 0 = negative
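Rebuilding the vocabulary here works only as long as reviews.txt is available and unchanged. A common alternative (a sketch, not part of the original scripts; the word2id.pkl filename is hypothetical) is to persist word2id once during preprocessing and simply load it at inference time:

import pickle

# In the preprocessing script, right after get_dict():
with open('word2id.pkl', 'wb') as f:
    pickle.dump(word2id, f)

# Here, instead of re-reading and re-processing reviews.txt:
with open('word2id.pkl', 'rb') as f:
    word2id = pickle.load(f)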
5. Summary
This project uses a bidirectional LSTM. A unidirectional LSTM was tried first; both versions ended up with test accuracy around 79%. Tuning the optimizer did not bring much improvement; a pretrained BERT model would likely give better results.
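As a rough sketch of that BERT direction (assuming the Hugging Face transformers library and the bert-base-uncased checkpoint, neither of which is part of this project), the pretrained tokenizer and encoder would replace the hand-built vocabulary and the LSTM entirely; the classification head below is randomly initialized and would still need fine-tuning on the review data before its predictions mean anything:

import torch
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.eval()

inputs = tokenizer('I think this is great!', return_tensors='pt',
                   truncation=True, max_length=200)
with torch.no_grad():
    logits = model(**inputs).logits      # shape (1, 2), same interface as the LSTM's output
print(torch.argmax(logits, dim=-1).item())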