数据准备
AclImdb – v1 Dataset 是用于二元情感分类(binary sentiment classification)的大型电影评论数据集,其数据量远多于此前的基准数据集:其中 25,000 条电影评论用于训练,25,000 条用于测试,另有未标注的数据可供使用。
数据预处理和数据装载
import re
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import os
# Matches an HTML tag such as "<br />" (non-greedy; "." does not cross newlines).
_HTML_TAG_RE = re.compile(r"<.*?>")

# Characters that are replaced by a space before the text is split.
# They are listed as plain characters and passed through re.escape so that
# regex metacharacters ("$", ".", "?") are matched literally. Previously the
# unescaped "$" acted as an end-of-string anchor and dollar signs were never
# removed.
_FILTER_CHARS = ['\t', '\n', '\x97', '\x96', '#', '%', '$', '&', '.', '?', '!', ',']
_FILTER_RE = re.compile("|".join(re.escape(ch) for ch in _FILTER_CHARS))


def tokenization(content):
    """Split a raw text string into a list of lowercase tokens.

    Processing steps:
      1. Replace every HTML tag (``<...>``) with a space.
      2. Replace each filtered character (tab, newline, ``\\x97``, ``\\x96``,
         ``# % $ & . ? ! ,``) with a space.
      3. Split on whitespace and lowercase every piece.

    :param content: the text to tokenize (``str``).
    :return: list of lowercase tokens; an empty list for empty or
        whitespace-only input.
    """
    content = _HTML_TAG_RE.sub(" ", content)
    content = _FILTER_RE.sub(" ", content)
    # str.split() with no arguments already discards surrounding whitespace,
    # so no extra strip() is needed on the individual tokens.
    return [token.lower() for token in content.split()]
def collate_fn(batch):
    """Regroup a batch of (tokens, label) samples into two parallel tuples.

    :param batch: sequence of two-element samples, e.g.
        ``([tokens, label], [tokens, label], ...)``
    :return: ``(contents, labels)`` where ``contents`` is a tuple holding the
        first element of every sample and ``labels`` a tuple holding the
        second element, both in batch order.
    """
    # zip(*batch) transposes the batch: one tuple per sample field.
    columns = list(zip(*batch))
    contents, labels = columns
    return (contents, labels)
class ImdbDataset(Dataset):
def __init__(self, train=True):
self.train_data_path = '..\\aclImdb\\train\\'
self.test_data_path = '..\\aclImdb\\test\\'
data_path = self.train_data_path if train else self.test_data_path
#把所有文件名放入列表
temp_data_path = [os.path.join(data_path,"pos"), os.path.join(data_path+"neg")]
print(temp_data_path)
self.total_file_path = []