Thoughts on Word Vectors

I recently started learning about NNLM. During training, the NNLM network produces a word vector for every word, but these vectors do a poor job of capturing each word's semantics. This motivated word2vec, a model dedicated to learning high-quality word-vector representations.
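To make this concrete, here is a minimal sketch of the NNLM idea (a toy layout I'm assuming for illustration, not the exact architecture from the original paper): the model predicts the next word from the previous few words, and the nn.Embedding table it learns along the way is the word-vector matrix.

import torch
import torch.nn as nn

# Toy NNLM: predict the next word from the previous context_size words.
# The embedding table learned here is the "by-product" word-vector matrix.
class NNLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(NNLM, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.hidden = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, context_idxs):
        # context_idxs shape: (batch_size, context_size)
        embeds = self.embeddings(context_idxs)   # (batch_size, context_size, embed_dim)
        flat = embeds.view(embeds.size(0), -1)   # concatenate the context embeddings
        h = torch.tanh(self.hidden(flat))        # hidden layer
        return self.output(h)                    # (batch_size, vocab_size)

# Example forward pass: vocabulary of 10 words, 3-word history
model = NNLM(vocab_size=10, embedding_dim=16, context_size=3, hidden_dim=32)
logits = model(torch.tensor([[1, 4, 7]]))        # shape (1, 10)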

Here is a simple implementation of CBOW:

import torch
import torch.nn as nn
import torch.optim as optim

# 1. Prepare the corpus and vocabulary
corpus = "we are learning natural language processing using word2vec cbow model".split()
vocab = list(set(corpus))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}

# 2. Build training samples (context words → center word)
window_size = 2
def generate_context_target(corpus, window_size):
    dataset = []
    for i in range(window_size, len(corpus) - window_size):
        context = [corpus[j] for j in range(i - window_size, i + window_size + 1) if j != i]
        target = corpus[i]
        dataset.append((context, target))
    return dataset

training_data = generate_context_target(corpus, window_size)
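# With window_size = 2, the first sample looks like:
# (['we', 'are', 'natural', 'language'], 'learning')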

# 3. Define the CBOW model
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_idxs):
        # context_idxs shape: (batch_size, context_size)
        embeds = self.embeddings(context_idxs)  # (batch_size, context_size, embed_dim)
        avg_embeds = embeds.mean(dim=1)         # (batch_size, embed_dim)
        out = self.linear(avg_embeds)           # (batch_size, vocab_size)
        return out

# 4. Hyperparameters and model initialization
embedding_dim = 50
model = CBOW(vocab_size=len(vocab), embedding_dim=embedding_dim)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# 5. Train the model
for epoch in range(100):
    total_loss = 0
    for context, target in training_data:
        context_idxs = torch.tensor([word2idx[w] for w in context], dtype=torch.long).unsqueeze(0)  # shape (1, context_size)
        target_idx = torch.tensor([word2idx[target]], dtype=torch.long)

        output = model(context_idxs)
        loss = loss_fn(output, target_idx)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# 6. Inspect the learned word vectors
word_vec = model.embeddings.weight.data
print("Embedding for 'cbow':", word_vec[word2idx["cbow"]])

Next, here is a simple implementation of Skip-gram:

import torch
import torch.nn as nn
import torch.optim as optim

# 1. Prepare the corpus and vocabulary
corpus = "we are learning natural language processing using word2vec skipgram model".split()
vocab = list(set(corpus))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}

# 2. Build training data (center word → context words)
window_size = 2
def generate_skipgram_data(corpus, window_size):
    pairs = []
    for i in range(window_size, len(corpus) - window_size):
        center = corpus[i]
        context = [corpus[j] for j in range(i - window_size, i + window_size + 1) if j != i]
        for ctx_word in context:
            pairs.append((center, ctx_word))
    return pairs

training_pairs = generate_skipgram_data(corpus, window_size)
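# With window_size = 2, the center word 'learning' yields the pairs:
# ('learning', 'we'), ('learning', 'are'), ('learning', 'natural'), ('learning', 'language')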

# 3. Define the Skip-gram model
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.input_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output_embeddings = nn.Linear(embedding_dim, vocab_size)

    def forward(self, center_word_idx):
        embed = self.input_embeddings(center_word_idx)  # (batch_size, embedding_dim)
        out = self.output_embeddings(embed)             # (batch_size, vocab_size)
        return out

# 4. Initialization
embedding_dim = 50
model = SkipGram(len(vocab), embedding_dim)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# 5. Train the model
for epoch in range(100):
    total_loss = 0
    for center, context in training_pairs:
        center_idx = torch.tensor([word2idx[center]], dtype=torch.long)
        context_idx = torch.tensor([word2idx[context]], dtype=torch.long)

        output = model(center_idx)
        loss = loss_fn(output, context_idx)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# 6. Inspect the learned word vectors
word_vec = model.input_embeddings.weight.data
print("Embedding for 'skipgram':", word_vec[word2idx["skipgram"]])

As you can see, the word2vec networks are much simpler than NNLM.

Later came ELMo, which moved from word-level embeddings to token-level embeddings, making much heavier use of context when producing each token's embedding. So why doesn't the Transformer use ELMo-style embeddings, and instead simply uses an nn.Embedding module?

Answer: in the Transformer, context-dependent information is encoded step by step by the subsequent stacked Self-Attention and Feed-forward layers, rather than being built into the Embedding layer itself.
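A minimal sketch of that point (toy dimensions I'm assuming for illustration): the nn.Embedding lookup is context-free, so the same token id always yields the same vector, and context-dependent representations only appear after the self-attention layers.

import torch
import torch.nn as nn

# Context-free lookup: a token id maps to the same vector regardless of its neighbors.
embedding = nn.Embedding(num_embeddings=100, embedding_dim=32)

# Contextualization happens in the encoder stack, not in the embedding table.
encoder_layer = nn.TransformerEncoderLayer(d_model=32, nhead=4, batch_first=True)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
encoder.eval()  # disable dropout so the comparison below is deterministic

tokens_a = torch.tensor([[5, 7, 9]])   # token id 7 in one context
tokens_b = torch.tensor([[1, 7, 3]])   # token id 7 in another context

emb_a, emb_b = embedding(tokens_a), embedding(tokens_b)
print(torch.allclose(emb_a[0, 1], emb_b[0, 1]))   # True: identical at the embedding layer

out_a, out_b = encoder(emb_a), encoder(emb_b)
print(torch.allclose(out_a[0, 1], out_b[0, 1]))   # False: different after self-attention

In a real Transformer, positional encodings are also added on top of the token embeddings before the encoder stack, but the context mixing itself still comes from the attention layers.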
