Sentiment Analysis: A Beginner's Hands-On Project

This post walks through a project that uses a bidirectional LSTM to classify the sentiment of English reviews, covering data preprocessing, model construction, training, testing, and a final interactive demo.

Contents

I. Overview

II. Details

1. Data Preprocessing

2. Model Training

3. Model Construction

4. Model Testing

5. Summary


I. Overview

The task is to analyze English reviews and determine their sentiment: positive or negative.

The overall pipeline is: data preprocessing → model construction → model training and testing.

II. Details

1. Data Preprocessing

Remove punctuation from all text and lowercase it, then build a vocabulary, map every word in each sentence to an id according to that vocabulary, pad each sentence to a fixed length, and finally split the data into training, validation, and test sets. A toy illustration of the id-mapping and padding step is sketched below, before the full script.
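
As a quick illustration (not part of the project code; the four-word vocabulary here is hypothetical), this is what the word-to-id conversion and zero-padding produce for one short sentence:

# Toy illustration of the id-mapping + padding step (hypothetical vocabulary).
word2id = {'this': 1, 'movie': 2, 'is': 3, 'great': 4}  # ids start at 1; 0 is reserved for padding/unknown
sentence = 'this movie is great'
max_sentence_length = 8  # shortened from 200 for readability

ids = [word2id.get(w, 0) for w in sentence.split()]  # unknown words map to id 0
padded = ids + [0] * (max_sentence_length - len(ids))
print(padded)  # [1, 2, 3, 4, 0, 0, 0, 0]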

from string import punctuation
import random
import numpy as np
import torch
from collections import Counter
from torch.utils.data import DataLoader, TensorDataset
from torch import optim
from tqdm import tqdm
import torch.nn as nn
from sklearn.metrics import classification_report

import Model

random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
if __name__ == '__main__':
    # Load the data
    with open('reviews.txt','r') as f:
        text = f.read()
    with open('labels.txt', 'r') as f:
        labels = f.read()
    # Strip punctuation and lowercase
    def text_process(text):
        text_no_punc = [char.lower() for char in text if char not in punctuation]
        text_list = ''.join(text_no_punc).split('\n')
        return text_list

    clean_text = text_process(text)
    labels = labels.split('\n')
    # Build the vocabulary
    def get_dict(text):
        # Collect every word and deduplicate (split() drops empty strings),
        # then build the word2id mapping (ids start at 1; 0 is reserved for padding/unknown)
        all_words = []
        for sentence in text:
            all_words.extend(sentence.split())
        unique_words = list(set(all_words))
        id2word = dict(enumerate(unique_words, 1))
        word2id = {word: idx for idx, word in id2word.items()}

        return word2id, id2word

    word2id, id2word = get_dict(clean_text)
    label2id = np.array([1 if x == 'positive' else 0 for x in labels])  # positive -> 1, negative -> 0
    # Inspect sentence lengths before padding
    length = [len(sentence.split()) for sentence in clean_text]
    counts = Counter(length)

    # Drop the outlier sentences: the shortest (typically empty lines) and the longest
    min_len = min(counts)
    max_len = max(counts)
    min_idx = [i for i, sentence in enumerate(clean_text) if len(sentence.split()) == min_len]
    max_idx = [i for i, sentence in enumerate(clean_text) if len(sentence.split()) == max_len]
    new_text = np.delete(clean_text, min_idx+max_idx)
    new_label = np.delete(label2id, min_idx+max_idx)

    max_sentence_length = 200
    words_id = []
    for sentence in new_text:
        tokens = sentence.split()  # split() avoids the empty tokens that split(' ') produces
        words_id.append([word2id.get(word, 0) for word in tokens])
    # Pad or truncate every sentence to max_sentence_length
    dataset = np.zeros((len(words_id), max_sentence_length))
    for index, words in enumerate(words_id):
        if len(words) >= max_sentence_length:
            dataset[index, :] = words[:max_sentence_length]
        else:
            dataset[index, :len(words)] = words

    data_tensor = torch.tensor(dataset, dtype=torch.long)
    data_label = torch.tensor(new_label, dtype=torch.long)
    # Split into train / validation / test (a sequential split; assumes the file order is already mixed)
    train_size = int(data_tensor.shape[0] * 0.8)
    rest_size = data_tensor.shape[0] - train_size
    val_size = int(rest_size * 0.5)
    test_size = int(rest_size * 0.5)

    train_data = data_tensor[:train_size]
    train_label = data_label[:train_size]
    rest_data = data_tensor[train_size:]
    rest_label = data_label[train_size:]

    val_data = rest_data[:val_size]
    val_label = rest_label[:val_size]
    test_data = rest_data[val_size:]
    test_label = rest_label[val_size:]
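
At this point it can be worth a quick sanity check that the three splits cover the whole dataset (optional; the printed shapes depend on your copy of reviews.txt):

    # Optional sanity check on the split sizes
    print(train_data.shape, val_data.shape, test_data.shape)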

2. Model Training

Because the training code continues directly from the preprocessing code above, it is shown first; the model definition is given in the next section and is simply imported when training or testing.

Training uses a bidirectional LSTM. In every epoch the model is validated and tested, and the best checkpoint is saved.

    # Wrap the tensors into datasets
    train_dataset = TensorDataset(train_data, train_label)
    val_dataset = TensorDataset(val_data, val_label)
    test_dataset = TensorDataset(test_data, test_label)

    batch_size = 64

    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              pin_memory=True, shuffle=True, drop_last=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=batch_size,
                              pin_memory=True, shuffle=True, drop_last=True, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=batch_size,
                              pin_memory=True, shuffle=True, drop_last=True, num_workers=4)
    
    # Model hyperparameters
    embed_dim = 300
    hidden_dim = 256
    output_size = 2
    vocab_size = len(word2id) + 1  # +1 because ids run 1..len(word2id) and 0 is the padding/unknown id
    epochs = 20
    device = "cuda" if torch.cuda.is_available() else 'cpu'
    # Instantiate the model
    model = Model.LSTM(embedding_dim=embed_dim, hidden_dim=hidden_dim, output_size=output_size, vocab_size=vocab_size)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.98,0.999), eps=1e-08)
    criterion = nn.CrossEntropyLoss()
    # Training loop

    best_f1 = 0.0
    for epoch in tqdm(range(epochs), desc='Epoch'):
        model.train()
        train_loss = 0.0  # reset every epoch; accumulating across epochs would skew the average
        for step, (x, y) in enumerate(train_loader):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x)
            # logits has shape (B, 2)
            loss = criterion(logits, y)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
        train_loss /= len(train_loader)  # average loss per batch
        print()
        print('train loss: {}'.format(train_loss))
        
        # Validate once per epoch to monitor the loss
        model.eval()
        correct = 0
        val_loss = 0.0
        with torch.no_grad():  # no gradients needed during evaluation
            for step, (x, y) in enumerate(val_loader):
                x, y = x.to(device), y.to(device)
                logits = model(x)
                loss = criterion(logits, y)
                val_loss += loss.item()
                y_pred = torch.argmax(logits, dim=-1)
                correct += y_pred.eq(y).sum().item()
        print('val loss: {}  accuracy: {}'.format(val_loss / len(val_loader), correct / (len(val_loader) * batch_size)))
        
        # Test once per epoch to track accuracy, F1, etc., and keep the best checkpoint

        model.eval()
        y_real = []
        y_pred = []
        test_loss = 0.0
        with torch.no_grad():
            for step, (x, y) in enumerate(test_loader):
                x, y = x.to(device), y.to(device)
                logits = model(x)
                loss = criterion(logits, y)
                test_loss += loss.item()
                y_pre = torch.argmax(logits, dim=-1).tolist()

                y_real.extend(y.tolist())
                y_pred.extend(y_pre)
        ret = classification_report(y_real, y_pred, output_dict=True)
        
        # Per-class scores from classification_report
        precision_0 = ret['0']['precision']
        recall_0 = ret['0']['recall']
        f1_0 = ret['0']['f1-score']
        support_0 = ret['0']['support']

        precision_1 = ret['1']['precision']
        recall_1 = ret['1']['recall']
        f1_1 = ret['1']['f1-score']
        support_1 = ret['1']['support']

        # Support-weighted averages (equivalent to ret['weighted avg'])
        weight_0 = support_0 / (support_1 + support_0)
        weight_1 = 1 - weight_0

        avg_precision = precision_0 * weight_0 + precision_1 * weight_1
        avg_recall = recall_0 * weight_0 + recall_1 * weight_1
        avg_f1 = f1_0 * weight_0 + f1_1 * weight_1

        # Macro (unweighted) averages, taken directly from classification_report
        all_avg_precision = ret['macro avg']['precision']
        all_avg_recall = ret['macro avg']['recall']
        all_avg_f1 = ret['macro avg']['f1-score']

        print('Testing Epoch: [%d/%d]' % (epoch + 1, epochs))
        print('"0" precision=%.2f, recall=%.2f, f1=%.2f' % (precision_0, recall_0, f1_0))
        print('"1" precision=%.2f, recall=%.2f, f1=%.2f' % (precision_1, recall_1, f1_1))
        # Save the checkpoint with the best weighted F1
        if avg_f1 > best_f1:
            best_f1 = avg_f1
            torch.save(model.state_dict(), 'best_model.bin')  # same file the test script loads
            print('Save the best model %s, avg_f1=%.2f' % ('best_model.bin', best_f1))

3. Model Construction

The model consists of an embedding layer, a bidirectional LSTM layer, and a fully connected layer.

import torch.nn as nn
import torch

class LSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_size, vocab_size, dropout=0.5, num_layers=2):
        super(LSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_size = output_size
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.dropout = dropout

        self.embedding_layer = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.LSTM = nn.LSTM(self.embedding_dim, self.hidden_dim, num_layers=self.num_layers,
                            dropout=self.dropout, batch_first=True, bidirectional=True)
        self.fclayer = nn.Linear(self.hidden_dim * 2, self.output_size)

    def forward(self, x):
        emb = self.embedding_layer(x)
        output, (h_n, c_n) = self.LSTM(emb)
        # h_n has shape (num_layers * 2, B, hidden_dim); the last two slices hold the
        # final layer's forward and backward hidden states, which we concatenate
        out = torch.cat([h_n[-2, ...], h_n[-1, ...]], dim=-1)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax internally,
        # so applying a sigmoid first would only squash the gradients
        logits = self.fclayer(out)
        return logits

if __name__ == '__main__':
    # Quick smoke test with random token ids
    inputs = torch.randint(0, 100, (128, 200))
    model = LSTM(embedding_dim=300, hidden_dim=256, output_size=2, vocab_size=500)
    out = model(inputs)
    y_pre = torch.argmax(out, dim=-1)
    print(out)

4. Model Testing

Each input sentence goes through the same preprocessing: strip punctuation, lowercase, map words to ids, pad, then feed the result to the model to get a prediction.

import torch
from Model import LSTM
from string import punctuation

def text_process(text):
    text_no_punc = [char.lower() for char in text if char not in punctuation]
    text_list = ''.join(text_no_punc).split('\n')
    return text_list

def get_dict(text):
    # Collect every word and deduplicate (split() drops empty strings),
    # then build the word2id mapping (ids start at 1; 0 is reserved for padding/unknown)
    all_words = []
    for sentence in text:
        all_words.extend(sentence.split())
    unique_words = list(set(all_words))
    id2word = dict(enumerate(unique_words, 1))
    word2id = {word:int(idx) for idx, word in id2word.items()}
    return word2id, id2word

def sentence_process(word2id, sentence, max_sentence_length=200):
    sentence = ''.join([char for char in sentence if char not in punctuation])
    sentence = sentence.lower().split()
    words_id = [word2id.get(word, 0) for word in sentence]
    if len(words_id) < max_sentence_length:
        words_id += [0] * (max_sentence_length - len(words_id))
    else:
        words_id = words_id[:max_sentence_length]

    words_id = torch.tensor(words_id).unsqueeze(0)
    return words_id

with open('reviews.txt','r') as f:
    text = f.read()
clean_text = text_process(text)
word2id, id2word = get_dict(clean_text)

# text = 'I think this is great!'
# text_id = sentence_process(word2id, sentence=text)

embed_dim = 300
hidden_dim = 256
output_size = 2
vocab_size = len(word2id) + 1  # must match the vocab_size used during training

device = "cuda" if torch.cuda.is_available() else "cpu"
model = LSTM(embedding_dim=embed_dim, hidden_dim=hidden_dim, output_size=output_size, vocab_size=vocab_size)
model.load_state_dict(torch.load('best_model.bin', map_location=device))
model.to(device)

model.eval()
while True:
    print('============================')
    x = input('Enter an English review (type quit to exit): ')
    if x == 'quit':
        break
    else:
        inputs_ids = sentence_process(word2id, x)
        inputs_ids = inputs_ids.to(device)
        with torch.no_grad():  # inference only
            logits = model(inputs_ids)
        y_pre = torch.argmax(logits, dim=-1).tolist()
        print('result:', y_pre[0])  # 1 = positive, 0 = negative

5. Summary

This project uses a bidirectional LSTM; a unidirectional LSTM was also tried before switching, and both reach a test accuracy of around 79%. Tweaking the optimizer did not yield much improvement; a pretrained BERT model would likely do better. A rough sketch of what that swap might look like is given below.
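
For reference, here is a minimal sketch of the BERT variant, assuming the Hugging Face transformers library is installed; the model name and usage below are illustrative choices, not part of the original project:

# Minimal sketch of a BERT-based classifier (assumes `pip install transformers`).
import torch
from transformers import BertTokenizer, BertForSequenceClassification

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)
model.eval()

# BERT ships with its own tokenizer, so the word2id/padding pipeline above is not needed.
inputs = tokenizer('I think this movie is great!', return_tensors='pt',
                   truncation=True, max_length=200).to(device)
with torch.no_grad():
    logits = model(**inputs).logits  # shape (1, 2)
print(torch.argmax(logits, dim=-1).item())

Note that the classification head is randomly initialized until the model is fine-tuned on the same train/val/test splits, typically with a much smaller learning rate (around 2e-5), so the prediction above is only meaningful after fine-tuning.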
