- https://github.com/udacity/deep-learning-v2-pytorch/

```python
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
```

Load in Data

```python
# open text file and read in data as `text`
with open('data/anna.txt', 'r') as f:
    text = f.read()
```
Tokenization

We create two dictionaries to convert between characters and integers. Encoding the characters as integers makes them easier to use as input to the network.

```python
# encode the text and map each character to an integer and vice versa
# we create two dictionaries:
# 1. int2char, which maps integers to characters
# 2. char2int, which maps characters to unique integers
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# encode the text
encoded = np.array([char2int[ch] for ch in text])
```
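As a quick sanity check (a minimal sketch, not part of the original notebook), the mapping should round-trip cleanly: decoding the encoded array with `int2char` should reproduce the original text.

```python
# Sanity check (illustrative only): encode/decode should be a lossless round trip.
decoded = ''.join(int2char[ii] for ii in encoded[:100])
print(decoded == text[:100])   # expected: True
print(encoded[:15])            # the first 15 characters as integer IDs
```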
Pre-processing the data
```python
def one_hot_encode(arr, n_labels):

    # Initialize the encoded array
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)

    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.

    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))

    return one_hot


def one_hot_encodeV2(arr, n_labels):
    # Equivalent version: index into the rows of an identity matrix
    I = np.eye(n_labels, n_labels)
    return I[arr]
```
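To see what the encoder produces, here is a small check on a toy sequence (the values below are made up for illustration):

```python
# Illustrative test: a 1x3 array of integer labels, one-hot encoded with 8 classes.
test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(test_seq, 8)
print(one_hot.shape)   # (1, 3, 8)
print(one_hot)         # each row has a single 1 at the label's index
```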
Making training mini-batches
Creating Batches
1. The first thing we need to do is discard some of the text so we only have completely full mini-batches. Each batch contains $N \times M$ characters, where $N$ is the batch size (the number of sequences in a batch) and $M$ is the `seq_length`, or number of time steps in a sequence. To get the total number of batches $K$ we can make from the array `arr`, divide the length of `arr` by the number of characters per batch. Once you know the number of batches, the total number of characters to keep from `arr` is $N * M * K$ (see the small numeric sketch after this list).
2. After that, we need to split `arr` into $N$ batches. You can do this with `arr.reshape(size)`, where `size` is a tuple containing the dimension sizes of the reshaped array. We want $N$ sequences in a batch, so let that be the size of the first dimension. For the second dimension, you can use `-1` as a placeholder; it will fill the array with the appropriate data for you. After this, you should have an array that is $N \times (M * K)$.
3. Now that we have this array, we can iterate through it to get our mini-batches. The idea is that each batch is an $N \times M$ window on the $N \times (M * K)$ array. For each subsequent batch, the window moves over by `seq_length`. We also want to create both the input and target arrays; remember that the targets are just the inputs shifted over by one character. The way I like to build this window is to use `range` to take steps of size `seq_length` from $0$ to `arr.shape[1]`, the total number of tokens in each sequence. That way, the integers obtained from `range` always point to the start of a batch, and each window is `seq_length` wide.
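To make the arithmetic in steps 1 and 2 concrete, here is a small numeric sketch; the sizes are made up for illustration and do not come from the text above.

```python
# Toy sizes, chosen only to illustrate the batching arithmetic.
arr_len, N, M = 1003, 8, 50          # characters available, batch size, seq_length
chars_per_batch = N * M              # 400 characters per batch
K = arr_len // chars_per_batch       # 2 full batches
keep = N * M * K                     # keep 800 characters, drop the remaining 203
reshaped = np.arange(keep).reshape((N, -1))
print(chars_per_batch, K, keep, reshaped.shape)   # 400 2 800 (8, 100) -> N x (M*K)
```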
```python
def get_batches(arr, batch_size, seq_length):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.

       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
    '''
    batch_size_total = batch_size * seq_length
    # total number of batches we can make
    n_batches = len(arr)//batch_size_total

    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size_total]
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))

    # iterate through the array, one sequence at a time
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y


batches = get_batches(encoded, 8, 50)
x, y = next(batches)
```
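A quick look at the first batch (a minimal check) confirms that `y` is just `x` shifted over by one character:

```python
# Inspect the first mini-batch: y should be x shifted by one position.
print('x\n', x[:4, :10])
print('\ny\n', y[:4, :10])
print(x.shape, y.shape)   # expected: (8, 50) for both, given get_batches(encoded, 8, 50)
```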
Defining the network with PyTorch
```python
# check if GPU is available
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU!')
else:
    print('No GPU available, training on CPU; consider making n_epochs very small.')
```

```python
class CharRNN(nn.Module):

    def __init__(self, tokens, n_hidden=256, n_layers=2,
                 drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        ## define the LSTM
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)

        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)

        ## define the final, fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))

    def forward(self, x, hidden):
        ''' Forward pass through the network.
            These inputs are x, and the hidden/cell state `hidden`.

            x: one-hot encoded input, shape [batch, seq_len, len(self.chars)]
            r_output: shape [batch, seq_len, n_hidden]
            hidden: each state has shape [n_layers, batch, n_hidden]
        '''
        ## Get the outputs and the new hidden state from the lstm
        r_output, hidden = self.lstm(x, hidden)

        ## pass through a dropout layer
        out = self.dropout(r_output)

        # Stack up LSTM outputs using view
        # you may need to use contiguous to reshape the output
        out = out.contiguous().view(-1, self.n_hidden)  # we want the output at every time step

        ## put x through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())

        return hidden
```
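Before training, a small shape check can catch wiring mistakes. The sketch below is not part of the original notebook; the names `test_net` and `dummy` and the toy sizes are assumptions for illustration. It runs a dummy batch through a tiny `CharRNN` and verifies that the output is flattened to `(batch * seq_len, n_characters)`.

```python
# Shape check with toy sizes (illustrative only).
test_net = CharRNN(chars, n_hidden=64, n_layers=2)
if train_on_gpu:
    test_net.cuda()

dummy = np.random.randint(0, len(chars), size=(4, 10))       # batch of 4, seq_len of 10
dummy = torch.from_numpy(one_hot_encode(dummy, len(chars)))  # -> [4, 10, len(chars)]
if train_on_gpu:
    dummy = dummy.cuda()

test_h = test_net.init_hidden(batch_size=4)
out, test_h = test_net(dummy, test_h)
print(out.shape)   # expected: torch.Size([40, len(chars)]) == (batch * seq_len, n_chars)
```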
Time to train
```python
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    ''' Training a network

        Arguments
        ---------
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    '''
    net.train()

    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]

    if(train_on_gpu):
        net.cuda()

    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)

        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1

            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            if(train_on_gpu):
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)

            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)

                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])

                    inputs, targets = x, y
                    if(train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length).long())

                    val_losses.append(val_loss.item())

                net.train()  # reset to train mode after iterating through validation data

                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))
```
Instantiating the model
```python
# define and print the net
n_hidden = 512
n_layers = 2

net = CharRNN(chars, n_hidden, n_layers)
print(net)
```

```python
batch_size = 128
seq_length = 100
n_epochs = 20  # start smaller if you are just testing initial behavior

# train the model
train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=10)
```
Checkpoint
```python
# change the name, for saving multiple files
model_name = 'rnn_20_epoch.net'

checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)
```
Making Predictions
We pass in a character and have the network predict the next character. Then we take that character, pass it back in, and get another predicted character. Keep doing this and you will generate a bunch of text!
Top K sampling
```python
def predict(net, char, h=None, top_k=None):
    ''' Given a character, predict the next character.
        Returns the predicted character and the hidden state.
    '''
    # tensor inputs
    x = np.array([[net.char2int[char]]])
    x = one_hot_encode(x, len(net.chars))
    inputs = torch.from_numpy(x)

    if(train_on_gpu):
        inputs = inputs.cuda()

    # detach hidden state from history
    h = tuple([each.data for each in h])
    # get the output of the model
    out, h = net(inputs, h)

    # get the character probabilities
    p = F.softmax(out, dim=1).data
    if(train_on_gpu):
        p = p.cpu()  # move to cpu

    # get top characters
    if top_k is None:
        top_ch = np.arange(len(net.chars))
    else:
        p, top_ch = p.topk(top_k)
        top_ch = top_ch.numpy().squeeze()

    # select the likely next character with some element of randomness
    p = p.numpy().squeeze()
    char = np.random.choice(top_ch, p=p/p.sum())

    # return the encoded value of the predicted char and the hidden state
    return net.int2char[char], h
```
Priming and generating text
```python
def sample(net, size, prime='The', top_k=None):

    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()

    net.eval()  # eval mode

    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)

    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)


print(sample(net, 1000, prime='Anna', top_k=5))
```
Loading a checkpoint
```python
# Here we have loaded in a model that trained over 20 epochs `rnn_20_epoch.net`
with open('rnn_20_epoch.net', 'rb') as f:
    checkpoint = torch.load(f)

loaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])
```

```python
# Sample using a loaded model
print(sample(loaded, 2000, top_k=5, prime="And Levin said"))
```
seq2seq
- https://blog.csdn.net/weixin_43632501/article/details/98731800