- https://github.com/udacity/deep-learning-v2-pytorch
Word embeddings
Word2Vec
Below are the two architectures used to implement Word2Vec, along with the classic N-gram language model for comparison:
- CBOW (Continuous Bag-Of-Words)
[w_{i-2}, w_{i-1}, w_{i+1}, w_{i+2}] -> [w_i]
- Skip-gram
[w_i] -> [w_{i-2}, w_{i-1}, w_{i+1}, w_{i+2}]
- N-gram language model (with N = 2)
[w_{i-2}, w_{i-1}] -> [w_i]
In this implementation, we'll use the Skip-gram architecture because it performs better than CBOW. Here, we pass in a word and try to predict the words that surround it in the text. This way we can train the network to learn representations for words that show up in similar contexts; a toy sketch of how these (input, context) training pairs are built follows below.
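As a toy illustration (not from the original notebook; the sentence and window size below are made up), each word in a corpus becomes the input of several (input, context) training pairs:
# Toy illustration: build (input, context) skip-gram pairs from a made-up sentence
sentence = "the quick brown fox jumps".split()
window = 2
pairs = []
for i, center in enumerate(sentence):
    # every word within `window` positions of the center word is a context word
    for j in range(max(0, i - window), min(len(sentence), i + window + 1)):
        if j != i:
            pairs.append((center, sentence[j]))
print(pairs[:5])
# [('the', 'quick'), ('the', 'brown'), ('quick', 'the'), ('quick', 'brown'), ('quick', 'fox')]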
Loading Data
# read in the extracted text file
with open('data/text8') as f:
    text = f.read()
# print out the first 100 characters
print(text[:100])
Pre-processing
- It converts any punctuation into tokens, so a period is changed to <PERIOD>. In this dataset there aren't any periods, but it will help with other NLP problems.
- It removes all words that show up five or fewer times in the dataset. This greatly reduces issues due to noise in the data and improves the quality of the vector representations.
- It returns a list of the words in the text.
# utils.py
import re
from collections import Counter
def preprocess(text):
    # Replace punctuation with tokens so we can use them in our model
    text = text.lower()
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' <QUOTATION_MARK> ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('(', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('--', ' <HYPHENS> ')
    # text = text.replace('\n', ' <NEW_LINE> ')
    text = text.replace(':', ' <COLON> ')
    words = text.split()
    # Remove all words with 5 or fewer occurrences
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word] > 5]
    return trimmed_words
def create_lookup_tables(words):
    """
    Create lookup tables for vocabulary
    :param words: Input list of words
    :return: Two dictionaries, vocab_to_int, int_to_vocab
    """
    word_counts = Counter(words)
    # sorting the words from most to least frequent in text occurrence
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    # create int_to_vocab and vocab_to_int dictionaries
    int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
    vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}
    return vocab_to_int, int_to_vocab
import utils
# get list of words
words = utils.preprocess(text)
print(words[:30])
vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)
int_words = [vocab_to_int[word] for word in words]
Subsampling
Words that show up often, such as "the", "of", and "for", don't provide much context for the nearby words. If we discard some of them, we can remove some of the noise from our data and in return get faster training and better representations.
This process is what Mikolov calls subsampling. For each word $w_i$ in the training set, we discard it with probability
$P(w_i) = 1 - \sqrt{\frac{t}{f(w_i)}}$
where $t$ is a threshold parameter and $f(w_i)$ is the frequency of word $w_i$ in the total dataset.
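As a quick, illustrative check of this formula (the frequency below is made up, not measured on text8), a word that accounts for 1% of the corpus is dropped almost every time it is seen:
# Illustrative check of the drop probability (made-up frequency, not from text8)
import math
t, f = 1e-5, 0.01            # threshold and an assumed word frequency of 1%
print(1 - math.sqrt(t / f))  # ~0.968: such a frequent word is discarded ~97% of the time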
Implement subsampling for the words in int_words. That is, go through int_words and discard each word with the probability $P(w_i)$ shown above. Note that $P(w_i)$ is the probability that a word is discarded. Assign the subsampled data to train_words.
from collections import Counter
import random
import numpy as np
threshold = 1e-5
word_counts = Counter(int_words)
#print(list(word_counts.items())[0]) # dictionary of int_words, how many times they appear
total_count = len(int_words)
freqs = {word: count/total_count for word, count in word_counts.items()}
p_drop = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts}
# discard some frequent words, according to the subsampling equation
# create a new list of words for training
train_words = [word for word in int_words if random.random() < (1 - p_drop[word])]
print(train_words[:30])
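Optionally (this check is not in the original notebook), compare the corpus size before and after subsampling to see how much of the text the frequent words accounted for:
# Optional: compare corpus size before and after subsampling
print('before:', len(int_words), 'after:', len(train_words))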
Making batches
"Since the more distant words are usually less related to the current word than those close to it, we give less weight to the distant words by sampling less from those words in our training examples... If we choose $C = 5$, for each training word we will select randomly a number $R$ in the range $[1:C]$, and then use $R$ words from the history and $R$ words from the future of the current word as correct labels."
def get_target(words, idx, window_size=5):
    ''' Get a list of words in a window around an index. '''
    R = np.random.randint(1, window_size+1)
    start = idx - R if (idx - R) > 0 else 0
    stop = idx + R
    target_words = words[start:idx] + words[idx+1:stop+1]
    return list(target_words)
# test your code!
# run this cell multiple times to check for random window selection
int_text = [i for i in range(10)]
print('Input: ', int_text)
idx=5 # word index of interest
target = get_target(int_text, idx=idx, window_size=5)
print('Target: ', target) # you should get some indices around the idx
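For example, with idx=5 as above, if the random window size R happens to come out as 4, the call returns [1, 2, 3, 4, 6, 7, 8, 9]: the four words before index 5 and the four words after it.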
Generating Batches
def get_batches(words, batch_size, window_size=5):
    ''' Create a generator of word batches as a tuple (inputs, targets) '''
    n_batches = len(words)//batch_size
    # only full batches
    words = words[:n_batches*batch_size]
    for idx in range(0, len(words), batch_size):
        x, y = [], []
        batch = words[idx:idx+batch_size]
        for ii in range(len(batch)):
            batch_x = batch[ii]
            batch_y = get_target(batch, ii, window_size)
            y.extend(batch_y)
            x.extend([batch_x]*len(batch_y))
        yield x, y
int_text = [i for i in range(20)]
x,y = next(get_batches(int_text, batch_size=4, window_size=5))
print('x\n', x)
print('y\n', y)
x
[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3]
y
[1, 2, 3, 0, 2, 3, 0, 1, 3, 1, 2]
Building the graph
The idea here is to train the embedding layer weight matrix to find efficient representations for our words. We can discard the softmax output layer afterwards, because we don't really care about making predictions with this network. We just want the embedding matrix, so we can use it in other networks we build with this dataset. A minimal sketch of the embedding layer as a lookup table follows below.
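As a minimal sketch (not part of the original notebook; the sizes are made up, whereas the real model uses a len(vocab_to_int) x 300 embedding), an nn.Embedding layer is simply a trainable lookup table: indexing it with integers returns the corresponding rows of its weight matrix.
# Minimal sketch: an embedding layer is a trainable lookup table (illustrative sizes)
import torch
from torch import nn
toy_embed = nn.Embedding(10, 4)                             # 10-word vocab, 4-dim vectors
idx = torch.LongTensor([2, 5])                              # a batch of two word indices
print(toy_embed(idx).shape)                                 # torch.Size([2, 4])
print(torch.equal(toy_embed(idx)[0], toy_embed.weight[2]))  # True: just row 2 of the weights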
Validation
Here, I'm creating a function that will help us observe our model as it learns. We'll pick a few common words and a few uncommon words. Then we'll print out the words closest to them, using the cosine similarity:
def cosine_similarity(embedding, valid_size=16, valid_window=100, device='cpu'):
    """ Returns the cosine similarity of validation words with words in the embedding matrix.
        Here, embedding should be a PyTorch embedding module.
    """
    # Here we're calculating the cosine similarity between some random words and
    # our embedding vectors. With the similarities, we can look at what words are
    # close to our random words.
    # sim = (a . b) / |a||b|
    embed_vectors = embedding.weight
    # magnitude of embedding vectors, |b|
    magnitudes = embed_vectors.pow(2).sum(dim=1).sqrt().unsqueeze(0)
    # pick N words from our ranges (0,window) and (1000,1000+window). lower id implies more frequent
    valid_examples = np.array(random.sample(range(valid_window), valid_size//2))
    valid_examples = np.append(valid_examples,
                               random.sample(range(1000, 1000+valid_window), valid_size//2))
    valid_examples = torch.LongTensor(valid_examples).to(device)
    valid_vectors = embedding(valid_examples)
    similarities = torch.mm(valid_vectors, embed_vectors.t())/magnitudes
    return valid_examples, similarities
SkipGram model
import torch
from torch import nn
import torch.optim as optim
class SkipGram(nn.Module):
    def __init__(self, n_vocab, n_embed):
        super().__init__()
        self.embed = nn.Embedding(n_vocab, n_embed)
        self.output = nn.Linear(n_embed, n_vocab)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """
        x: LongTensor of word indices, shape [batch]
        """
        x = self.embed(x)  # [batch, n_embed]; the embedding weight has shape [n_vocab, n_embed]
                           # (that weight matrix stores the learned vector for every word)
        scores = self.output(x)            # [batch, n_vocab]
        log_ps = self.log_softmax(scores)  # log-probabilities over the vocabulary
        return log_ps
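As an optional smoke test (not in the original notebook; the tiny sizes are made up), one forward pass confirms the expected shapes:
# Optional smoke test with made-up tiny sizes: one forward pass through the model
tiny_model = SkipGram(n_vocab=50, n_embed=8)
dummy_batch = torch.LongTensor([0, 3, 7])   # a batch of three word indices
log_ps = tiny_model(dummy_batch)
print(log_ps.shape)                         # torch.Size([3, 50]): log-probs over the vocab
print(log_ps.exp().sum(dim=1))              # each row sums to ~1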
Training
# check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
embedding_dim=300 # you can change, if you want
model = SkipGram(len(vocab_to_int), embedding_dim).to(device)
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)
print_every = 500
steps = 0
epochs = 5
# train for some number of epochs
for e in range(epochs):
    # get input and target batches
    for inputs, targets in get_batches(train_words, 512):
        steps += 1
        inputs, targets = torch.LongTensor(inputs), torch.LongTensor(targets)
        inputs, targets = inputs.to(device), targets.to(device)

        log_ps = model(inputs)
        loss = criterion(log_ps, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if steps % print_every == 0:
            # getting examples and similarities
            valid_examples, valid_similarities = cosine_similarity(model.embed, device=device)
            _, closest_idxs = valid_similarities.topk(6)  # topk highest similarities
            valid_examples, closest_idxs = valid_examples.to('cpu'), closest_idxs.to('cpu')
            for ii, valid_idx in enumerate(valid_examples):
                closest_words = [int_to_vocab[idx.item()] for idx in closest_idxs[ii]][1:]
                print(int_to_vocab[valid_idx.item()] + " | " + ', '.join(closest_words))
            print("...")
Visualizing the word vectors
Below we'll use T-SNE to visualize how our high-dimensional word vectors cluster together. T-SNE projects these vectors into two dimensions while preserving local structure.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
# getting embeddings from the embedding layer of our model, by name
embeddings = model.embed.weight.to('cpu').data.numpy()
viz_words = 600
tsne = TSNE()
embed_tsne = tsne.fit_transform(embeddings[:viz_words, :])
fig, ax = plt.subplots(figsize=(16, 16))
for idx in range(viz_words):
    plt.scatter(*embed_tsne[idx, :], color='steelblue')
    plt.annotate(int_to_vocab[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)
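As a final optional sketch (not part of the original notebook; the helper name nearest is mine), you can query the trained embeddings directly for the nearest neighbors of a single word, using the same cosine-similarity idea as the validation function:
# Optional helper (illustrative): nearest neighbors of one word by cosine similarity
def nearest(word, topn=5):
    vec = embeddings[vocab_to_int[word]]
    # cosine similarity of this vector against every row of the embedding matrix
    sims = embeddings @ vec / (np.linalg.norm(embeddings, axis=1) * np.linalg.norm(vec))
    best = sims.argsort()[-(topn + 1):][::-1]   # top matches, most similar first
    return [int_to_vocab[int(i)] for i in best if int(i) != vocab_to_int[word]][:topn]
print(nearest('one'))   # expect words that appear in similar contexts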