视频:https://www.bilibili.com/video/BV1Ky4y1g7Nk?p=3
源码:https://github.com/lansinuote/NLP-Toturials
数据准备等
对于名字,以字母来处理会比较好。
不像句子可以分词。
这里的字典:字符和数字的对应表,共29个字符;不认识的字符用0表示。包含 - 和 1。
数据文件
CNN 实现姓名分类
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# 1、定义数据集
class SurnameDataset(Dataset):
    """One split ('train' / 'val' / 'test') of the digitized surname CSV.

    Each row is (comma-separated character codes, class label).
    """

    def __init__(self, part):
        frame = pd.read_csv('./data/surnames/数字化数据.csv')
        # keep only the rows belonging to the requested split
        self.data = frame[frame.part == part]

    def __getitem__(self, i):
        row = self.data.iloc[i]
        # (encoded name, label)
        return row.iloc[0], row.iloc[1]

    def __len__(self):
        return len(self.data)
# build the three dataset splits
train_dataset = SurnameDataset(part='train')
val_dataset = SurnameDataset(part='val')
test_dataset = SurnameDataset(part='test')
# expected sizes: 7680 / 1640 / 1660
for _split in (train_dataset, val_dataset, test_dataset):
    print(len(_split))
# 2、x转one hot编码
def one_hot(data):
    """Collate a batch of (codes, label) rows into one-hot tensors.

    Each sample's first field is a comma-separated string of character
    codes (1..29; 0 marks an unknown character, per the vocabulary notes).
    Names are truncated/zero-padded to 15 positions, each encoded as a
    29-dim one-hot vector.

    Returns:
        (FloatTensor of shape [N, 15, 29], LongTensor of shape [N])
    """
    N = len(data)
    # N samples, 15 positions per name, 29-dim one-hot per position
    xs = np.zeros((N, 15, 29))
    ys = np.empty(N)
    for i in range(N):
        x, y = data[i]
        ys[i] = y
        codes = x.split(',')
        for j in range(min(15, len(codes))):
            c = int(codes[j])
            # Fix: code 0 (unknown char) previously hit index -1, wrapping to
            # position 28 and colliding with code 29. Leave it all-zero instead.
            if c > 0:
                xs[i, j, c - 1] = 1
    return torch.FloatTensor(xs), torch.LongTensor(ys)
# 3、数据加载器
# shared DataLoader settings: fixed batches of 100, shuffled, and
# drop_last so every batch is exactly 100 samples (no ragged tail)
_loader_kwargs = dict(batch_size=100,
                      shuffle=True,
                      drop_last=True,
                      collate_fn=one_hot)
train_dataloader = DataLoader(dataset=train_dataset, **_loader_kwargs)
val_dataloader = DataLoader(dataset=val_dataset, **_loader_kwargs)
test_dataloader = DataLoader(dataset=test_dataset, **_loader_kwargs)
# 4、遍历数据
# peek at one collated batch to check tensor shapes
x, y = next(iter(train_dataloader))
print(x[:2, :2], x.shape)
print(y[:5], y.shape)
# 5、定义网络模型
class SurnameClassifier(nn.Module):
    """1-D CNN surname classifier: one-hot input [b, 15, 29] -> logits [b, 18].

    NOTE(review): the 15 sequence positions are used as the conv *channels*,
    so the convolutions slide over the 29 character dimensions — unusual,
    but preserved as the tutorial designed it.
    """

    def __init__(self):
        super(SurnameClassifier, self).__init__()
        h = 50
        # [b, 15, 29] -> [b, h, 13]   ((29 - 5) // 2 + 1 = 13)
        self.conv1 = nn.Conv1d(in_channels=15,
                               out_channels=h,
                               kernel_size=5,
                               stride=2)
        # [b, h, 13] -> [b, h, 5]
        self.conv2 = nn.Conv1d(in_channels=h,
                               out_channels=h,
                               kernel_size=5,
                               stride=2)
        # [b, h, 5] -> [b, h, 1]
        self.conv3 = nn.Conv1d(in_channels=h,
                               out_channels=h,
                               kernel_size=5,
                               stride=1)
        # activation shared between the conv stages
        self.elu = nn.ELU()
        self.convnet = nn.Sequential(self.conv1, self.elu, self.conv2,
                                     self.elu, self.conv3, self.elu)
        self.fc = nn.Linear(h, 18)

    def forward(self, x):
        # [b, 15, 29] -> [b, h]: the conv axis ends up length 1, squeeze it away
        out = self.convnet(x).squeeze(dim=2)
        # [b, h] -> [b, 18] class logits
        return self.fc(out)
# build the network and smoke-test the forward pass with a random batch
model = SurnameClassifier()
_ = model(torch.randn(2, 15, 29))
def test(dataloader):
    """Return the classification accuracy of the global `model` on `dataloader`."""
    model.eval()
    correct = 0
    total = 0
    # fix: evaluation does not need gradients — avoid building autograd graphs
    with torch.no_grad():
        for x, y in dataloader:
            y_pred = model(x).argmax(dim=1)
            correct += (y_pred == y).sum().item()
            total += len(y)
    return correct / total
# baseline accuracy with random weights
test(val_dataloader)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(10):
    # fix: test() switches the model to eval mode, so re-enter train mode
    # at the start of every epoch (previously only set once before the loop)
    model.train()
    for x, y in train_dataloader:
        optimizer.zero_grad()
        y_pred = model(x)
        loss = loss_func(y_pred, y)
        loss.backward()
        optimizer.step()
    # report validation accuracy every epoch (the old `if epoch % 1 == 0`
    # was always true)
    accuracy = test(val_dataloader)
    print(epoch, loss.item(), accuracy)
test(test_dataloader)  # tutorial reports ~0.659375
RNN 计算过程
拿本次的词和上一次的记忆作为输入。
RNN 实现姓名分类
字典和数据都和前面使用 CNN 对姓名分类一样;
这里不将名字变为 One-hot,而是切割为字;后面补 0。
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# 1、定义数据
class SurnameDataset(Dataset):
    """One split ('train' / 'val' / 'test') of the digitized surname CSV.

    Each row is (comma-separated character codes, class label).
    """

    def __init__(self, part):
        frame = pd.read_csv('./data/surnames/数字化数据.csv')
        # keep only the rows belonging to the requested split
        self.data = frame[frame.part == part]

    def __getitem__(self, i):
        row = self.data.iloc[i]
        # (encoded name, label)
        return row.iloc[0], row.iloc[1]

    def __len__(self):
        return len(self.data)
# build the three dataset splits and show their sizes
train_dataset = SurnameDataset(part='train')
val_dataset = SurnameDataset(part='val')
test_dataset = SurnameDataset(part='test')
for _split in (train_dataset, val_dataset, test_dataset):
    print(len(_split))
# 2、数据转 tensor
def to_tensor(data):
    """Collate (codes, label) rows into integer tensors.

    Names are truncated/right-zero-padded to 15 character codes.

    Returns:
        (LongTensor of shape [N, 15], LongTensor of shape [N])
    """
    batch = len(data)
    # N samples, 15 code positions each; unused tail stays 0 (padding)
    xs = np.zeros((batch, 15))
    ys = np.empty(batch)
    for row, (name, label) in enumerate(data):
        ys[row] = label
        codes = [int(c) for c in name.split(',')][:15]
        xs[row, :len(codes)] = codes
    return torch.LongTensor(xs), torch.LongTensor(ys)
# 3、数据加载器
# shared DataLoader settings: fixed batches of 100, shuffled, and
# drop_last so every batch is exactly 100 samples
_rnn_loader_kwargs = dict(batch_size=100,
                          shuffle=True,
                          drop_last=True,
                          collate_fn=to_tensor)
train_dataloader = DataLoader(dataset=train_dataset, **_rnn_loader_kwargs)
val_dataloader = DataLoader(dataset=val_dataset, **_rnn_loader_kwargs)
test_dataloader = DataLoader(dataset=test_dataset, **_rnn_loader_kwargs)
# 4、遍历数据
# peek at one collated batch to check tensor shapes
x, y = next(iter(train_dataloader))
print(x[:5], x.shape)
print(y[:5], y.shape)
# 5、定义网络模型
class SurnameClassifier(nn.Module):
    """RNNCell surname classifier: int codes [b, 15] -> logits [b, 18]."""

    def __init__(self):
        super(SurnameClassifier, self).__init__()
        # code 0 is padding, so its embedding stays at zero
        self.embedding = nn.Embedding(num_embeddings=30,
                                      embedding_dim=50,
                                      padding_idx=0)
        # step-by-step RNNCell (50-dim input, 100-dim hidden); an nn.RNN
        # layer could process the whole sequence in one call instead
        self.rnn_cell = nn.RNNCell(50, 100)
        self.fc1 = nn.Linear(in_features=100, out_features=100)
        self.fc2 = nn.Linear(in_features=100, out_features=18)

    def forward(self, x):
        b = x.shape[0]
        # [b, 15] -> [b, 15, 50]
        embed = self.embedding(x)
        # run the cell over all 15 positions, carrying the hidden state
        out = torch.zeros(b, 100, device=x.device)
        for i in range(15):
            out = self.rnn_cell(embed[:, i, :], out)
        # fix: pass self.training — F.dropout defaults to training=True and
        # would keep dropping units during evaluation otherwise
        out = F.relu(self.fc1(F.dropout(out, 0.5, self.training)))
        # [b, 100] -> [b, 18] class logits
        out = self.fc2(F.dropout(out, 0.5, self.training))
        return out
# build the network and smoke-test the forward pass with a dummy batch
model = SurnameClassifier()
_ = model(torch.ones(2, 15).long())
# 预测函数
def test(dataloader):
model.eval()
correct = 0
total = 0
for i, data in enumerate(dataloader):
x, y = data
y_pred = model(x)
y_pred = y_pred.argmax(dim=1)
correct += (y_pred == y).sum().item()
total += len(y)
return correct / total
# baseline accuracy with random weights
test(val_dataloader)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(20):
    # fix: test() switches the model to eval mode, so re-enter train mode
    # at the start of every epoch (previously only set once before the loop)
    model.train()
    for x, y in train_dataloader:
        optimizer.zero_grad()
        y_pred = model(x)
        loss = loss_func(y_pred, y)
        loss.backward()
        optimizer.step()
    # report validation accuracy every epoch (the old `if epoch % 1 == 0`
    # was always true)
    accuracy = test(val_dataloader)
    print(epoch, loss.item(), accuracy)
test(test_dataloader)
GRU 实现字符预测
三大循环神经网络
这里的字符预测不需要人工标注(标签直接取自名字的最后一个字符),属于无监督(自监督)学习
#定义数据
class SurnameDataset(Dataset):
    """One split of the digitized surname CSV, keeping only names with
    at least 3 character codes (needed for next-character prediction)."""

    def __init__(self, part):
        frame = pd.read_csv('./data/surnames/数字化数据.csv')
        frame = frame[frame.part == part]

        # drop names shorter than 3 characters
        def filter_by_len(line):
            return len(line.x.split(',')) >= 3

        self.data = frame[frame.apply(filter_by_len, axis=1)]

    def __getitem__(self, i):
        row = self.data.iloc[i]
        return row.iloc[0], row.iloc[1]

    def __len__(self):
        return len(self.data)
# build the three (length-filtered) dataset splits and show their sizes
train_dataset = SurnameDataset(part='train')
val_dataset = SurnameDataset(part='val')
test_dataset = SurnameDataset(part='test')
for _split in (train_dataset, val_dataset, test_dataset):
    print(len(_split))
def to_tensor(data):
    """Collate rows into (input codes, target code) tensors.

    The last character of each name is the prediction target; the preceding
    codes are right-aligned into 14 positions with zero padding on the left
    (equivalent to the original reverse / pad / cut / reverse dance).

    Returns:
        (LongTensor of shape [N, 14], LongTensor of shape [N])
    """
    batch = len(data)
    xs = np.zeros((batch, 14))
    ys = np.empty(batch)
    for row, (name, _) in enumerate(data):
        codes = name.split(',')
        # final letter is the target to predict
        ys[row] = codes[-1]
        # at most the last 14 codes before the target, right-aligned
        prefix = codes[:-1][-14:]
        xs[row, 14 - len(prefix):] = prefix
    return torch.LongTensor(xs), torch.LongTensor(ys)
#数据加载器
# shared DataLoader settings: fixed batches of 100, shuffled, and
# drop_last so every batch is exactly 100 samples
_gru_loader_kwargs = dict(batch_size=100,
                          shuffle=True,
                          drop_last=True,
                          collate_fn=to_tensor)
train_dataloader = DataLoader(dataset=train_dataset, **_gru_loader_kwargs)
val_dataloader = DataLoader(dataset=val_dataset, **_gru_loader_kwargs)
test_dataloader = DataLoader(dataset=test_dataset, **_gru_loader_kwargs)
#遍历数据
# grab one batch; `sample` is also reused later as a model smoke-test input
sample = next(iter(train_dataloader))
x, y = sample
print(x[:3], x.shape)
print(y[:3], y.shape)
#定义网络模型
class SurnameClassifier(nn.Module):
    """GRU next-character model: int codes [b, 14] -> logits [b, 30]."""

    def __init__(self):
        super(SurnameClassifier, self).__init__()
        # code 0 is padding, so its embedding stays at zero
        self.embedding = nn.Embedding(num_embeddings=30,
                                      embedding_dim=50,
                                      padding_idx=0)
        # 50-dim inputs, 100-dim hidden state
        self.rnn = nn.GRU(input_size=50, hidden_size=100, batch_first=True)
        self.fc1 = nn.Linear(in_features=100, out_features=100)
        self.fc2 = nn.Linear(in_features=100, out_features=30)

    def forward(self, x):
        # [b, 14] -> [b, 14, 50]
        embed = self.embedding(x)
        # out: [b, 14, 100] per-step outputs; h: [1, b, 100] final hidden state
        out, h = self.rnn(embed)
        # fix: squeeze only the num_layers dim — a bare .squeeze() would also
        # drop the batch dim when b == 1 and break the linear layers
        h = h.squeeze(0)
        # fix: pass self.training — F.dropout defaults to training=True and
        # would keep dropping units during evaluation otherwise
        out = F.relu(self.fc1(F.dropout(h, 0.2, self.training)))
        # [b, 100] -> [b, 30] character logits
        out = self.fc2(F.dropout(out, 0.2, self.training))
        return out
# build the network and smoke-test the forward pass on the sampled batch
model = SurnameClassifier()
_ = model(sample[0])
def test(dataloader):
    """Return the classification accuracy of the global `model` on `dataloader`."""
    model.eval()
    correct = 0
    total = 0
    # fix: evaluation does not need gradients — avoid building autograd graphs
    with torch.no_grad():
        for x, y in dataloader:
            y_pred = model(x).argmax(dim=1)
            correct += (y_pred == y).sum().item()
            total += len(y)
    return correct / total
# baseline accuracy with random weights
test(val_dataloader)
# -----------
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(20):
    # fix: test() switches the model to eval mode, so re-enter train mode
    # at the start of every epoch (previously only set once before the loop)
    model.train()
    for x, y in train_dataloader:
        optimizer.zero_grad()
        y_pred = model(x)
        loss = loss_func(y_pred, y)
        loss.backward()
        optimizer.step()
    # report validation accuracy every epoch (the old `if epoch % 1 == 0`
    # was always true)
    accuracy = test(val_dataloader)
    print(epoch, loss.item(), accuracy)
test(test_dataloader)
2022-02-19(六) 下雨