目录
1. Transformer系列
2. Scaled Dot-Product Attention 与 Multi-Head Attention (MHA)
2.1. Scaled Dot-Product Attention
2.2. Multi-Head Attention (MHA)
3. Normalization系列(LN, BN, RMSNorm)
4. 位置编码PE系列(正余弦编码, ROPE)
5. 在MHA里加入KV-Cache
6. 激活函数系列
7. 损失函数系列
8. Numpy手撕MLP的前向和反向传播
9. 池化系列(Maxpooling, Averagepooling)
10. 传统ML类
11. 调包实现CLIP图文对比
12. 解码策略系列(Top-p sampling, Top-k sampling)
13. 算法系列
14. 排序系列
1. Transformer系列
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
class MultiHeadAttention(nn.Module):
def __init__(self, d_model=512, n_head=8):
super().__init__()
self.d_k = d_model // n_head
self.n_head = n_head
# 手动实现Q/K/V线性变换
self.q_proj = nn.Linear(d_model, d_model)
self.k_proj = nn.Linear(d_model, d_model)
self.v_proj = nn.Linear(d_model, d_model)
self.out_proj = nn.Linear(d_model, d_model)
def forward(self, q, k, v, mask=None):
# 分头处理
q = self.q_proj(q).view(q.size(0), -1, self.n_head, self.d_k).transpose(1, 2)
k = self.k_proj(k).view(k.size(0), -1, self.n_head, self.d_k).transpose(1, 2)
v = self.v_proj(v).view(v.size(0), -1, self.n_head, self.d_k).transpose(1, 2)
# 手动计算注意力分数
scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
if mask is not None:
scores = scores.masked_fill(mask == 0, -1e9)
attn = torch.softmax(scores, dim=-1)
# 合并多头结果
output = torch.matmul(attn, v).transpose(1, 2).contiguous().view(q.size(0), -1, self.n_head * self.d_k)
return self.out_proj(output)
class PositionWiseFeedForward(nn.Module):
def __init__(self, d_model, d_ff):
super().__init__()
self.fc1 = nn.Linear(d_model, d_ff)
self.fc2 = nn.Linear(d_ff, d_model)
self.dropout = nn.Dropout(0.1)
def forward(self, x):
return self.fc2(self.dropout(F.relu(self.fc1(x))))
class PositionalEncoding(nn.Module):
def __init__(self, d_model, max_seq_len):
super().__init__()
pe = torch.zeros(max_seq_len, d_model)
position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
self.register_buffer('pe', pe.unsqueeze(0))
def forward(self, x):
return x + self.pe[:, :x.size(1)]
class EncoderLayer(nn.Module):
def __init__(self, d_model, num_heads, d_ff):
super().__init__()
self.self_attn = MultiHeadAttention(d_model, num_heads)
self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.dropout = nn.Dropout(0.1)
def forward(self, x, mask):
attn_output = self.self_attn(x, x, x, mask)
x = self.norm1(x + self.dropout(attn_output))
ff_output = self.feed_forward(x)
x = self.norm2(x + self.dropout(ff_output))
return x
class DecoderLayer(nn.Module):
def __init__(self, d_model, num_heads, d_ff):
super().__init__()
self.self_attn = MultiHeadAttention(d_model, num_heads)
self.cross_attn = MultiHeadAttention(d_model, num_heads)
self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.dropout = nn.Dropout(0.1)
def forward(self, x, enc_output, src_mask, tgt_mask):
# 自注意力 + add & norm
attn_output = self.self_attn(x, x, x, tgt_mask)
x = self.norm1(x + self.dropout(attn_output))
# 交叉注意力 + add & norm
attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
x = self.norm2(x + self.dropout(attn_output))
# 前馈网络 + add & norm
ff_output = self.feed_forward(x)
x = self.norm3(x + self.dropout(ff_output))
return x
class Transformer(nn.Module):
def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_len):
super().__init__()
self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
self.positional_encoding = PositionalEncoding(d_model, max_seq_len)
self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)])
self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)])
self.fc = nn.Linear(d_model, tgt_vocab_size)
self.dropout = nn.Dropout(0.1)
def generate_mask(self, src, tgt):
src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)
seq_len = tgt.size(1)
nopeak_mask = (1 - torch.triu(torch.ones(1, seq_len, seq_len), diagonal=1)).bool()
tgt_mask = tgt_mask & nopeak_mask
return src_mask, tgt_mask
def forward(self, src, tgt):
src_mask, tgt_mask = self.generate_mask(src, tgt)
# Encoder
src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
enc_output = src_embedded
for enc_layer in self.encoder_layers:
enc_output = enc_layer(enc_output, src_mask)
# Decoder
tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))
dec_output = tgt_embedded
for dec_layer in self.decoder_layers:
dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
output = self.fc(dec_output)
return output
# 样例测试
src_vocab_size, tgt_vocab_size = 10, 10 # 源语言词汇表大小, 目标语言词汇表大小
batch_size, num_heads, max_seq_len, d_model = 2, 8, 10, 512
d_ff, num_layers = d_model*4, 6 # ffn隐藏层维度, encoder/decoder层数
# 创建随机输入数据
src = torch.randint(1, src_vocab_size, (batch_size, max_seq_len)) # 假设0是padding
tgt = torch.randint(1, tgt_vocab_size, (batch_size, max_seq_len)) # 假设0是padding
# 创建模型
model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_len)
output = model(src, tgt)
print("Input shape:", src.shape, tgt.shape)
print("Output shape:", output.shape) # 应该是 (batch_size, seq_len, tgt_vocab_size)
# 计算损失
criterion = nn.CrossEntropyLoss(ignore_index=0)
loss = criterion(output.view(-1, tgt_vocab_size), tgt.view(-1))
print("Loss:",round( loss.item(), 4))
2. Scaled Dot-Product Attention 与 Multi-Head Attention (MHA)
- 更多‘Scaled Dot-Product Attention、MHA、GQA、MLA、MQA’手撕代码详见:多模态学习路线(2)——DL基础系列-CSDN博客(下面顺带给出一个GQA的简化示意)
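GQA(Grouped-Query Attention)的核心是让多组 query 头共享同一组 K/V 头,从而降低 KV 的参数量和缓存开销。下面是一个简化示意(类名 GroupedQueryAttention、超参数 n_kv_head 等均为示例自拟,并非上述链接原文):
import torch
import torch.nn as nn
class GroupedQueryAttention(nn.Module):
    def __init__(self, d_model=512, n_head=8, n_kv_head=2):
        super().__init__()
        assert n_head % n_kv_head == 0
        self.n_head, self.n_kv_head = n_head, n_kv_head
        self.head_dim = d_model // n_head
        self.q_proj = nn.Linear(d_model, n_head * self.head_dim)
        self.k_proj = nn.Linear(d_model, n_kv_head * self.head_dim)  # K/V 只投影到较少的头
        self.v_proj = nn.Linear(d_model, n_kv_head * self.head_dim)
        self.out_proj = nn.Linear(d_model, d_model)
    def forward(self, x, mask=None):
        B, T, _ = x.shape
        q = self.q_proj(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)     # [B, H, T, d]
        k = self.k_proj(x).view(B, T, self.n_kv_head, self.head_dim).transpose(1, 2)  # [B, Hkv, T, d]
        v = self.v_proj(x).view(B, T, self.n_kv_head, self.head_dim).transpose(1, 2)
        group = self.n_head // self.n_kv_head
        k = k.repeat_interleave(group, dim=1)  # 每组K/V复制给组内所有query头, [B, H, T, d]
        v = v.repeat_interleave(group, dim=1)
        scores = q @ k.transpose(-2, -1) / self.head_dim ** 0.5
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = scores.softmax(dim=-1)
        out = (attn @ v).transpose(1, 2).contiguous().view(B, T, -1)
        return self.out_proj(out)
# 形状检查
x = torch.randn(2, 10, 512)
print(GroupedQueryAttention()(x).shape)  # torch.Size([2, 10, 512])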
2.1. Scaled Dot-Product Attention
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
np.random.seed(42)
torch.manual_seed(42)
class DotProductAttention(nn.Module):
def __init__(self, d_model):
super().__init__()
self.scale = math.sqrt(d_model)
def forward(self, q, k, v, mask=None):
scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale
if mask is not None:
scores = scores.masked_fill(mask == 0, -1e9)
weights = F.softmax(scores, dim=-1)
output = torch.matmul(weights, v)
return output, weights
batch_size, seq_len, d_model = 4, 12, 512
attn = DotProductAttention(d_model)
q = torch.randn(batch_size, seq_len, d_model)
k = v = torch.randn(batch_size, seq_len, d_model) # 简单起见,k和v相同
output, weights = attn(q, k, v)
print(f"输入: q{q.shape}, k{k.shape}, v{v.shape}")
print(f"输出: {output.shape}, 权重形状: {weights.shape}")
print("第一个样本的注意力权重:\n", weights[0].detach().numpy().round(3))
2.2. Multi-Head Attention (MHA)
import torch
import torch.nn as nn
import torch.nn.functional as F
class MultiHeadAttention(nn.Module):
def __init__(self, dim, n_head, dropout=0.1):
super(MultiHeadAttention, self).__init__()
self.dim = dim
self.head_dim = dim // n_head
self.n_head = n_head
assert self.dim == self.head_dim * n_head
self.linear_q = nn.Linear(dim, dim)
self.linear_k = nn.Linear(dim, dim)
self.linear_v = nn.Linear(dim, dim)
self.dropout = nn.Dropout(dropout)
self.fc_out = nn.Linear(dim, dim)
def forward(self, x, mask=None):
b, t, d = x.size()
Q = self.linear_q(x)
K = self.linear_k(x)
V = self.linear_v(x)
# Reshape and transpose for multi-head attention
Q = Q.view(b, t, self.n_head, self.head_dim).transpose(1, 2)
K = K.view(b, t, self.n_head, self.head_dim).transpose(1, 2)
V = V.view(b, t, self.n_head, self.head_dim).transpose(1, 2)
# Scaled dot-product attention
score = torch.matmul(Q, K.transpose(2, 3)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
# Apply mask (for decoder)
if mask is not None:
score = score.masked_fill(mask == 0, -1e9)
# Softmax and dropout
score = F.softmax(score, dim=-1)
if self.dropout is not None:
score = self.dropout(score)
# Combine heads and project back to original dimension
output = torch.matmul(score, V).transpose(1, 2).contiguous().view(b, t, d)
output = self.fc_out(output)
return output
def generate_mask(len_seq):
"""生成下三角掩码矩阵(用于Decoder的自回归掩码)"""
return torch.tril(torch.ones(len_seq, len_seq))
# 示例用法
embed_dim = 512
num_heads = 8
model = MultiHeadAttention(embed_dim, num_heads)
x = torch.randn(16, 10, 512) # 输入形状: [batch_size, seq_len, dim]
mask = generate_mask(10).unsqueeze(0).unsqueeze(0) # [1, 1, seq_len, seq_len],可广播到batch和head维度
output = model(x, mask)
print(output.shape) # 输出: torch.Size([16, 10, 512])
3. Normalization系列(LN, BN,RMSNorm)
3.1. LayerNorm(LN)
### 输入:(batch_size, seq_len, feature_dim)(常见于序列模型Transformer/RNN)
def layernorm(x, gamma, beta, eps=1e-5):
# x shape: (N, L, D)
N = x.shape[0]
mean = np.mean(x, axis=(1, 2)).reshape(N, 1, 1) # (N, 1, 1)
var = np.var(x, axis=(1, 2)).reshape(N, 1, 1) # (N, 1, 1)
x_normalized = (x - mean) / np.sqrt(var + eps) # 归一化
return gamma * x_normalized + beta
### e.g. x形状为 (2, 3, 4)(batch_size=2, seq_len=3, feature_dim=4)
x = [
[ # 样本 1
[1, 2, 3, 4], # seq_len1
[5, 6, 7, 8], # seq_len2
[9, 10, 11, 12] # seq_len3
],
[ # 样本 2
[-1, -2, -3, -4], # seq_len1
[-5, -6, -7, -8], # seq_len2
[-9, -10, -11, -12] # seq_len3
]
]
x = np.array(x)
gamma, beta = 0.1, 0.1
x_ln = layernorm(x, gamma, beta, eps=1e-5)
print(x_ln.shape)
"""
LN 计算:对样本1归一化:计算所有时间步和特征的均值
mean = (1+2+...+12) / 12 = 6.5
var = (1²+2²+...+12²)/12 - mean² = 11.9167
样本2同理(每个样本独立计算)。
"""
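注意:上面的写法是对每个样本在 (seq_len, feature_dim) 两个维度上一起归一化;Transformer 里常用的 nn.LayerNorm(d_model) 则只在最后一个特征维上归一化。下面给出一个只在特征维归一化的简化示意(函数名为自拟),并与 torch 的结果对照:
import numpy as np
import torch
import torch.nn as nn
def layernorm_lastdim(x, gamma=1.0, beta=0.0, eps=1e-5):
    # 只在最后一维(feature_dim)上计算均值和方差,与 nn.LayerNorm(d_model) 的行为一致
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return gamma * (x - mean) / np.sqrt(var + eps) + beta
x = np.random.rand(2, 3, 4).astype(np.float32)
out_np = layernorm_lastdim(x)
out_torch = nn.LayerNorm(4, elementwise_affine=False)(torch.from_numpy(x)).numpy()
print(np.allclose(out_np, out_torch, atol=1e-5))  # 期望输出 True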
3.2. BatchNorm(BN)
# 输入x:(batch_size, channels, height, width)(常见于CNN)
def batchnorm(x, gamma, beta, eps=1e-5):
# x shape: (N, C, H, W)
c = x.shape[1]
mean = np.mean(x, axis=(0, 2, 3)).reshape(1,c,1,1) # 计算每个通道的均值,维度 (1,C,1,1)
var = np.var(x, axis=(0, 2, 3)).reshape(1,c,1,1) # 计算每个通道的方差,维度 (1,C,1,1)
x_normalized = (x - mean) / np.sqrt(var + eps) # 归一化
return gamma * x_normalized + beta # 缩放和偏移
# e.g. x形状为 (2, 3, 2, 2)(batch_size=2, channels=3, height=2, width=2)
x = [
[ # 样本 1
[[1, 2], [3, 4]], # 通道 1
[[5, 6], [7, 8]], # 通道 2
[[9, 10], [11, 12]] # 通道 3
],
[ # 样本 2
[[-1, -2], [-3, -4]], # 通道 1
[[-5, -6], [-7, -8]], # 通道 2
[[-9, -10], [-11, -12]] # 通道 3
]
]
x = np.array(x)
gamma, beta = 0.1, 0.1
x_bn = batchnorm(x, gamma, beta, eps=1e-5)
print(x_bn.shape)
"""
对通道1归一化:计算所有样本和空间位置的均值.
mean = (1+2+3+4 + (-1-2-3-4)) / 8 = 0
var = (1²+2²+...+(-4)²)/8 - mean² = 7.5
其他通道类似(每个通道独立计算)。
"""
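补充:上面的实现只用了当前 batch 的统计量;实际的 BatchNorm 在训练时还会维护滑动均值/方差(running mean/var),推理时改用这组滑动统计量。下面是一个更新与使用的最小示意(momentum 等取值为示例):
import numpy as np
momentum = 0.1
running_mean, running_var = np.zeros(3), np.ones(3)  # 每个通道一份
x = np.random.rand(2, 3, 2, 2)
# 训练阶段:每个 batch 更新一次滑动统计量
batch_mean = x.mean(axis=(0, 2, 3))
batch_var = x.var(axis=(0, 2, 3))
running_mean = (1 - momentum) * running_mean + momentum * batch_mean
running_var = (1 - momentum) * running_var + momentum * batch_var
# 推理阶段:不再用当前 batch 的统计量,改用滑动统计量
x_hat = (x - running_mean.reshape(1, 3, 1, 1)) / np.sqrt(running_var.reshape(1, 3, 1, 1) + 1e-5)
print(x_hat.shape)  # (2, 3, 2, 2)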
3.3. RMSNorm
传统LN需要同时计算均值和方差;RMSNorm去掉均值中心化,只用均方根RMS做缩放:避免减法、计算更简单,数值也更稳定。RMS只在特征维度(feature_dim)上计算。
# (batch_size, seq_len, feature_dim)
def rms_norm(x, gamma, eps=1e-5):
rms = np.sqrt(np.mean(x**2, axis=-1, keepdims=True)) # 只在特征维度(D)计算
return gamma * x / (rms + eps)
# x形状为 (2, 3, 4)
x = [
[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], # 样本1
[[-1, -2, -3, -4], [-5, -6, -7, -8], [-9, -10, -11, -12]] # 样本2
]
x = np.array(x)
gamma = 1.0
x_rms = rms_norm(x, gamma, eps=1e-5)
print(x_rms.shape)
"""
对样本1的第一个时间步 [1, 2, 3, 4] 计算:
RMS = sqrt((1**2 + 2**2 + 3**2 + 4**2)/4) = sqrt(7.5) ≈ 2.7386
归一化后:
[1,2,3,4]/2.7386 ≈ [0.3651,0.7303,1.0954,1.4606]
最终输出(γ=1):与归一化值相同。
"""
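作为补充,下面给出一个带可学习缩放参数 γ 的 PyTorch 版 RMSNorm 简化实现(示意写法,eps 等默认值为示例):
import torch
import torch.nn as nn
class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))  # 可学习缩放参数 γ
    def forward(self, x):
        # 只在最后一维(特征维)上计算均方根
        rms = torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return self.weight * x / rms
x = torch.randn(2, 3, 4)
print(RMSNorm(4)(x).shape)  # torch.Size([2, 3, 4])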
4. 位置编码PE系列(正余弦编码,ROPE)
4.1. 绝对位置编码(正余弦编码)
import torch
import math
def position_encoding(seq_len, d_model):
# 创建一个空的张量用于存储位置编码
position = torch.arange(0, seq_len).unsqueeze(1).float() # [seq_len, 1] --> (10, 1)
dim = torch.arange(0, d_model // 2).float() # [d_model // 2] # (1, 8)
# 计算位置编码
encoding = position / (10000 ** (2 * dim / (d_model))) # [10, 16 // 2]
# 分别计算sin和cos
sin_enc, cos_enc = torch.sin(encoding), torch.cos(encoding) # sin编码, cos编码
print(sin_enc.shape)
# 将sin和cos交替插入,得到完整的编码(sin值放到偶数位置, cos值放到奇数位置)
encoding = torch.zeros(seq_len, d_model) # [seq_len, d_model]
encoding[:, 0::2], encoding[:, 1::2] = sin_enc, cos_enc
return encoding
seq_len, d_model = 10, 16 # 序列长度, 嵌入维度
encoding = position_encoding(seq_len, d_model)
print("Sin_Cos PE:\n", encoding)
4.2. 旋转位置编码(ROPE)
### 手撕ROPE
import torch
import torch.nn as nn
import numpy as np
np.random.seed(42)
torch.manual_seed(42)
class RotaryEmbedding(nn.Module):
def __init__(self, d_model, num_heads, base=10000, max_len=512):
super().__init__()
self.head_dim = d_model // num_heads
half_dim = self.head_dim // 2
pos = torch.arange(max_len).float()
freqs = 1.0 / (base ** (2 * torch.arange(half_dim).float() / self.head_dim))
angles = torch.einsum("i,j->ij", pos, freqs) # m*θ [max_len, half_dim]
sin = torch.sin(angles).repeat_interleave(2, -1) # sin(m*θ)
cos = torch.cos(angles).repeat_interleave(2, -1) # cos(m*θ)
self.register_buffer("cos", cos[None, :, :]) # [1, max_len, head_dim]
self.register_buffer("sin", sin[None, :, :]) # [1, max_len, head_dim]
def forward(self, q): # q: [B, T, D]
B, T, D = q.shape
q = q.view(B, T, -1, self.head_dim).transpose(1, 2) # [B, H, T, D/H]
cos = self.cos[:, :T, :].unsqueeze(1) # [1, 1, T, D/H]
sin = self.sin[:, :T, :].unsqueeze(1)
# 把 q(形状 [B, H, T, D/H])分成两个子张量:
# q1: 取偶数维度(即第 0、2、4…),q2: 取奇数维度(即第 1、3、5…)
q1, q2 = q[..., ::2], q[..., 1::2]
q_rot = torch.stack([q1 * cos[..., ::2] - q2 * sin[..., ::2],
q1 * sin[..., ::2] + q2 * cos[..., ::2]], dim=-1)
return q_rot.flatten(-2, -1) # [B, H, T, (D/H)/2, 2]--> [B, H, T, D/H]
# 示例调用
bs, num_heads, T, d_model = 4, 8, 128, 512
q = torch.randn(bs, T, d_model)
rope = RotaryEmbedding(d_model, num_heads)
q_rope = rope(q)
q_rope.shape # [4, 8, 128, 64]
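实际使用时,RoPE 要同时作用在 q 和 k 上,再计算注意力分数。接着上面的示例补一小段示意(k 为新造的随机张量):
k = torch.randn(bs, T, d_model)
k_rope = rope(k)  # k 同样做旋转位置编码, [B, H, T, head_dim]
scores = q_rope @ k_rope.transpose(-2, -1) / (d_model // num_heads) ** 0.5
print(scores.shape)  # torch.Size([4, 8, 128, 128])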
5. 在MHA里加入KV-Cache
import torch
import torch.nn as nn
import numpy as np
np.random.seed(42)
torch.manual_seed(42)
class MHAWithCache(nn.Module):
def __init__(self, dim, n_head):
super().__init__()
self.dim = dim
self.n_head = n_head
self.head_dim = dim // n_head
self.q_proj = nn.Linear(dim, dim)
self.k_proj = nn.Linear(dim, dim)
self.v_proj = nn.Linear(dim, dim)
self.out = nn.Linear(dim, dim)
self.k_cache = None
self.v_cache = None
def forward(self, x, mask=None, use_cache=False):
B, T, _ = x.shape
q, k, v = self.q_proj(x), self.k_proj(x), self.v_proj(x) # 计算QKV
# KV Cache
if use_cache:
if self.k_cache is not None:
k = torch.cat([self.k_cache, k], dim=1)
v = torch.cat([self.v_cache, v], dim=1)
self.k_cache = k
self.v_cache = v
# 多头处理
q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
k = k.view(B, -1, self.n_head, self.head_dim).transpose(1, 2)
v = v.view(B, -1, self.n_head, self.head_dim).transpose(1, 2)
# attention计算
attn = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)
if mask is not None:
attn = attn.masked_fill(mask == 0, float('-inf'))
attn = attn.softmax(-1)
out = (attn @ v).transpose(1, 2).reshape(B, T, self.dim)
return self.out(out)
def clear_cache(self):
self.k_cache = None
self.v_cache = None
# (1) 首次调用不使用缓存
batch_size, seq_len, n_head, dim = 2, 5, 4, 64
model = MHAWithCache(dim, n_head)
x = torch.randn(batch_size, seq_len, dim) # 2个样本,序列长度5,维度64
out1 = model(x, use_cache=False)
print("首次输出形状:", out1.shape) # 应该输出 torch.Size([2, 5, 64])
# (2) 使用缓存的自回归生成
model.clear_cache()
x1 = torch.randn(batch_size, 1, dim) # 第一个token
out2 = model(x1, use_cache=True)
x2 = torch.randn(batch_size, 1, dim) # 第二个token
out3 = model(x2, use_cache=True)
print("自回归输出形状:", out3.shape) # 应该输出 torch.Size([2, 1, 64])
print("缓存k的形状:", model.k_cache.shape) # 应该输出 torch.Size([2, 2, 64])
print("缓存v的形状:", model.v_cache.shape) # 应该输出 torch.Size([2, 2, 64])
# (3) 清空缓存
model.clear_cache()
print("清空后缓存:", model.k_cache is None) # 应该输出 True
6. 激活函数系列
见多模态学习路线(2)——DL基础系列-CSDN博客,章节“二、激活函数”
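作为补充,这里给出几个常见激活函数的最简 numpy 写法(示意,GELU 采用 tanh 近似):
import numpy as np
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
def relu(x):
    return np.maximum(0, x)
def silu(x):
    return x * sigmoid(x)  # SiLU / Swish: x * sigmoid(x)
def gelu(x):
    # GELU 的 tanh 近似
    return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x ** 3)))
x = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])
print(relu(x), silu(x).round(3), gelu(x).round(3))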
7. 损失函数系列
(MSE损失,交叉熵损失,KL散度,Focal Loss,InfoNCE Loss)
见多模态学习路线(2)——DL基础系列-CSDN博客,章节“四、损失函数”
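作为补充,这里给出其中 Focal Loss(二分类、logits 输入)的一个简化实现示意(函数名与 alpha、gamma 的取值为常见示例):
import torch
import torch.nn.functional as F
def sigmoid_focal_loss(logits, targets, alpha=0.25, gamma=2.0):
    # FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t)
    p = torch.sigmoid(logits)
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')  # 逐元素的 -log(p_t)
    p_t = p * targets + (1 - p) * (1 - targets)             # 预测为真实类别的概率
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
    return (alpha_t * (1 - p_t) ** gamma * ce).mean()
logits = torch.tensor([2.0, -1.0, 0.5])
targets = torch.tensor([1.0, 0.0, 1.0])
print(sigmoid_focal_loss(logits, targets))  # 标量损失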
8. Numpy手撕MLP的前向和反向传播
注释:手撕MLP的回归和二分类任务除了 损失函数 和 输出层激活函数 不同,其他代码部分(例如前向传播、反向传播、梯度更新等)是完全相同的。
- 输出层激活函数:Sigmoid(二分类) vs. 无激活函数/线性激活(回归)。
- 损失函数:BCE Loss(二分类) vs. MSE(回归)。
8.1. Numpy手撕MLP(二分类)
import numpy as np
# 激活函数及其导数
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def sigmoid_derivative(x):
return x * (1 - x)
# 损失函数:二分类交叉熵损失
def binary_crossentropy_loss(y_true, y_pred):
epsilon = 1e-15
y_pred = np.clip(y_pred, epsilon, 1 - epsilon) # 防止log(0)错误
return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
# MLP单层模型定义(适用于二分类)
class MLP:
def __init__(self, input_size, output_size):
self.W = np.random.randn(input_size, output_size) * 0.01 # 权重
self.b = np.zeros((1, output_size)) # 偏置
def forward(self, X):
self.Z = np.dot(X, self.W) + self.b
self.A = sigmoid(self.Z) # Sigmoid 激活后的输出(预测概率)
return self.A
def backward(self, X, y, lr):
m = X.shape[0] # 样本数
output_error = y - self.A # 预测误差(基于激活后的输出A,而不是线性输出Z)
output_delta = output_error * sigmoid_derivative(self.A) # 输出层梯度
# 计算梯度并更新权重和偏置
self.dW = np.dot(X.T, output_delta) / m
self.db = np.sum(output_delta, axis=0, keepdims=True) / m
self.W += lr * self.dW
self.b += lr * self.db
def train(self, X, y, epochs, lr, batch_size):
for epoch in range(epochs):
indices = np.random.permutation(X.shape[0]) # 随机打乱数据
X_shuffled, y_shuffled = X[indices], y[indices]
for i in range(0, X.shape[0], batch_size):
X_batch = X_shuffled[i:i + batch_size]
y_batch = y_shuffled[i:i + batch_size]
# 前向传播和反向传播
self.forward(X_batch)
self.backward(X_batch, y_batch, lr)
if epoch % 100 == 0:
y_pred = self.forward(X)
loss = binary_crossentropy_loss(y, y_pred)
print(f'Epoch {epoch}, Loss: {loss}')
def predict(self, X, threshold=0.5):
# 预测类别标签
y_pred = self.forward(X)
return (y_pred >= threshold).astype(int) # 返回 0 或 1 类别标签
# 测试:二分类(XOR)
if __name__ == '__main__':
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) # 输入
y = np.array([[0], [1], [1], [0]]) # 目标输出
model = MLP(input_size=2, output_size=1)
model.train(X, y, epochs=1000, lr=0.1, batch_size=2)
predictions = model.predict(X) # 输出类别标签(0 或 1)
print("Predictions after training (class labels): \n", predictions)
8.2. Numpy手撕MLP(回归)
import numpy as np
# 激活函数及其导数
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def sigmoid_derivative(x):
return x * (1 - x)
# MSE Loss
def mse_loss(y_true, y_pred):
return np.mean((y_true - y_pred) ** 2)
# MLP单层模型定义(适用于回归)
class MLP:
def __init__(self, input_size, output_size):
self.W = np.random.randn(input_size, output_size) * 0.01 # 权重
self.b = np.zeros((1, output_size)) # 偏置
print(self.W, self.b)
def forward(self, X):
self.Z = np.dot(X, self.W) + self.b
return self.Z # 线性输出,适用于回归
def backward(self, X, y, lr):
m = X.shape[0] # 样本数
output_error = y - self.Z # 预测误差
output_delta = output_error # 回归问题没有激活函数,梯度直接为误差
# 计算梯度并更新权重和偏置
self.dW = np.dot(X.T, output_delta) / m
self.db = np.sum(output_delta, axis=0, keepdims=True) / m
self.W += lr * self.dW
self.b += lr * self.db
def train(self, X, y, epochs, lr, batch_size):
for epoch in range(epochs):
indices = np.random.permutation(X.shape[0]) # 随机打乱数据
X_shuffled, y_shuffled = X[indices], y[indices]
for i in range(0, X.shape[0], batch_size):
X_batch = X_shuffled[i:i + batch_size]
y_batch = y_shuffled[i:i + batch_size]
# 前向传播和反向传播
self.forward(X_batch)
self.backward(X_batch, y_batch, lr)
if epoch % 100 == 0:
y_pred = self.forward(X)
loss = mse_loss(y, y_pred)
print(f'Epoch {epoch}, Loss: {loss}')
def predict(self, X):
return self.forward(X) # 回归任务直接返回线性输出
# 测试:回归(预测y = n1 * x1 + n2 * x2)
if __name__ == '__main__':
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) # 输入
y = np.array([[0], [3], [2], [5]]) # 目标输出(回归目标)
model = MLP(input_size=2, output_size=1)
model.train(X, y, epochs=1000, lr=0.1, batch_size=2)
predictions = model.predict(X)
print("Predictions after training: ", predictions)
8.3. Torch手撕MLP(二分类)
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
# 定义单层 MLP 模型(用于二分类)
class MLPBinaryClassification(nn.Module):
def __init__(self, input_size):
super(MLPBinaryClassification, self).__init__()
self.fc = nn.Linear(input_size, 1) # 单层全连接层
def forward(self, x):
x = torch.sigmoid(self.fc(x)) # Sigmoid 激活函数输出概率
return x
def predict(self, X):
self.eval() # 切换到评估模式
with torch.no_grad():
predictions = self.forward(X)
predicted_labels = (predictions >= 0.5).float() # 阈值0.5判断类别
return predicted_labels
def train_model(self, X_train, y_train, epochs=1000, lr=0.1):
criterion = nn.BCELoss() # 二分类交叉熵损失
optimizer = optim.SGD(self.parameters(), lr=lr)
for epoch in range(epochs):
# 前向传播
outputs = self.forward(X_train)
loss = criterion(outputs, y_train)
# 反向传播
optimizer.zero_grad()
loss.backward()
optimizer.step()
# 每100次输出一次损失值
if epoch % 100 == 0:
print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")
# 二分类
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) # 输入
y = np.array([[0], [1], [1], [0]]) # 目标输出(二分类目标)
# 数据转换为 PyTorch 张量
X_train = torch.tensor(X, dtype=torch.float32)
y_train = torch.tensor(y, dtype=torch.float32).view(-1, 1)
# 训练 & 预测
model = MLPBinaryClassification(input_size=2)
model.train_model(X_train, y_train, epochs=1000, lr=0.1)
predictions = model.predict(X_train) # 使用predict函数进行预测
print("Predictions (class labels) after training: ", predictions.numpy())
8.4. Torch手撕MLP(回归)
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
# 定义单层 MLP 模型(用于回归)
class MLPRegression(nn.Module):
def __init__(self, input_size):
super(MLPRegression, self).__init__()
self.fc = nn.Linear(input_size, 1) # 单层全连接层
def forward(self, x):
return self.fc(x) # 回归问题,直接输出线性值
def predict(self, X):
self.eval() # 切换到评估模式
with torch.no_grad():
predictions = self.forward(X)
return predictions
def train_model(self, X_train, y_train, epochs=1000, lr=0.1):
criterion = nn.MSELoss() # 均方误差损失
optimizer = optim.SGD(self.parameters(), lr=lr)
for epoch in range(epochs):
# 前向传播
outputs = self.forward(X_train)
loss = criterion(outputs, y_train)
# 反向传播
optimizer.zero_grad()
loss.backward()
optimizer.step()
# 每100次输出一次损失值
if epoch % 100 == 0:
print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")
# 回归
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) # 输入
y = np.array([[0], [3], [2], [5]]) # 目标输出(回归目标)
X_train = torch.tensor(X, dtype=torch.float32)
y_train = torch.tensor(y, dtype=torch.float32).view(-1, 1)
model = MLPRegression(input_size=2)
model.train_model(X_train, y_train, epochs=1000, lr=0.1)
# 训练 & 预测
predictions = model.predict(X_train) # 使用predict函数进行预测
print("Predictions after training: ", predictions.numpy())
9. 池化系列(Maxpooling, Averagepooling)
9.1. Maxpooling
import torch
import numpy as np
np.random.seed(42)
torch.manual_seed(42)
class MaxPooling:
def __init__(self, kernel_size=2):
self.k = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
def __call__(self, x):
b, c, h, w = x.shape
kh, kw = self.k
oh, ow = h // kh, w // kw
x = x.view(b, c, oh, kh, ow, kw) # 拆分成 patch [8, 3, 2, 2, 2, 2])
return x.amax(dim=(3, 5))
# 使用示例
maxpool = MaxPooling(2)
b, c, h, w = 8, 3, 4, 4
x = torch.rand(b, c, h, w)
print('maxpool: ', maxpool(x).shape) # maxpool: torch.Size([8, 3, 2, 2])
9.2. Averagepooling
import torch
import numpy as np
np.random.seed(42)
torch.manual_seed(42)
class AvgPooling:
def __init__(self, kernel_size=2):
self.k = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
def __call__(self, x):
b, c, h, w = x.shape
kh, kw = self.k
oh, ow = h // kh, w // kw
x = x.view(b, c, oh, kh, ow, kw) # 拆分成 patch, [8, 3, 2, 2, 2, 2]
return x.mean(dim=(3, 5))
# 使用示例
avgpool = AvgPooling(2)
b, c, h, w = 8, 3, 4, 4
x = torch.rand(b, c, h, w)
print('avgpool: ', avgpool(x).shape) # avgpool: torch.Size([8, 3, 2, 2])
10. 传统ML类
10.1. 手撕线性回归-随机梯度下降sgd
- 解析解(正规方程):θ = (XᵀX)⁻¹Xᵀy,其中X是设计矩阵(含偏置项),y是目标值向量;
import numpy as np
np.random.seed(42)
class LR_W:
def fit(self, X, y):
# 添加偏置项
X_b = np.c_[np.ones((X.shape[0], 1)), X]
# 正规方程求解
self.theta = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y
return self
def predict(self, X):
X_b = np.c_[np.ones((X.shape[0], 1)), X]
return X_b @ self.theta
# 生成数据
X = np.random.rand(100, 2) # (100, 2)
y = np.random.rand(100) # (100,)
# 训练模型
model = LR_W().fit(X, y)
print("Closed-form theta:", model.theta)
- 梯度下降法(SGD):梯度公式 ∇θ = (1/m)·Xᵀ(Xθ − y);参数更新 θ ← θ − lr·∇θ。
import numpy as np
np.random.seed(42)
class LR_SGD:
def __init__(self, lr=0.01, epochs=1000):
self.lr = lr
self.epochs = epochs
def fit(self, X, y):
m, n = X.shape
X_b = np.c_[np.ones((m, 1)), X]
self.theta = np.random.randn(n + 1)
for _ in range(self.epochs):
gradients = X_b.T @ (X_b @ self.theta - y) / m
self.theta -= self.lr * gradients
return self
def predict(self, X):
X_b = np.c_[np.ones((X.shape[0], 1)), X]
return X_b @ self.theta
# 生成数据
X = np.random.rand(100, 2) # (100, 2)
y = np.random.rand(100) # (100,)
# 训练模型
model = LR_SGD(lr=0.1, epochs=100).fit(X, y)
print("SGD theta:", model.theta)
10.2. 实现KMeans聚类
import numpy as np
np.random.seed(42)
import matplotlib.pyplot as plt
class KMeans:
def __init__(self, k=3, max_iters=100):
self.k = k
self.max_iters = max_iters
def fit(self, X):
# 1. 随机初始化质心
self.centroids = X[np.random.choice(len(X), self.k, replace=False)]
for _ in range(self.max_iters):
# 2. 分配样本到最近质心(向量化实现)
distances = np.linalg.norm(X[:, None] - self.centroids, axis=2)
labels = np.argmin(distances, axis=1)
# 3. 更新质心
new_centroids = np.array([X[labels == k].mean(axis=0) for k in range(self.k)])
# 4. 检查收敛
if np.allclose(self.centroids, new_centroids):
break
self.centroids = new_centroids
def predict(self, X):
distances = np.linalg.norm(X[:, None] - self.centroids, axis=2)
return np.argmin(distances, axis=1)
X = np.concatenate([
np.random.normal(0, 1, (50, 3)),
np.random.normal(5, 1, (50, 3))], axis=0)
model = KMeans(k=2)
model.fit(X)
labels = model.predict(X)
plt.scatter(X[:,0], X[:,1], c=labels)
plt.scatter(model.centroids[:,0], model.centroids[:,1], marker='X', s=200, c='red')
plt.show()
10.3. 手写AUC指标
- 计算公式:AUC = (Σ rank_i − M(M+1)/2) / (M×N);
- M为正样本数,N为负样本数;
- rank_i为第i个正样本按预测分从小到大排序后的位次(索引+1)。
def AUC_by_RANK(labels, preds):
f = list(zip(preds, labels))
rank = [i+1 for i, (p, l) in enumerate(sorted(f, key=lambda x: x[0])) if l == 1]
M = sum(labels)
N = len(labels) - M
return (sum(rank) - M*(M+1)/2) / (M * N)
y_true = [0,0,1,1,1]
y_preds = [0.7,0.2,0.5,0.1,0.2]
print('AUC by RANK: ', round(AUC_by_RANK(y_true, y_preds), 4))
11. 调包实现CLIP图文对比
- 训练伪代码(下面给出一个简化示意)
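CLIP 的训练目标是图文对比学习:对一个 batch 内的 N 对图文分别编码、L2 归一化后计算相似度矩阵,以对角线为正样本做双向交叉熵。下面是一个示意性的训练步骤草稿(image_encoder / text_encoder 用线性层占位,非官方实现):
import torch
import torch.nn as nn
import torch.nn.functional as F
image_encoder = nn.Linear(2048, 512)  # 占位:实际是视觉骨干网络
text_encoder = nn.Linear(768, 512)    # 占位:实际是文本 Transformer
logit_scale = nn.Parameter(torch.ones([]) * 2.6592)  # 可学习温度(初始约为 ln(1/0.07))
def clip_train_step(image_feats, text_feats):
    img_emb = F.normalize(image_encoder(image_feats), dim=-1)  # [N, 512]
    txt_emb = F.normalize(text_encoder(text_feats), dim=-1)    # [N, 512]
    logits = logit_scale.exp() * img_emb @ txt_emb.t()         # [N, N] 相似度矩阵
    labels = torch.arange(logits.size(0))                      # 对角线为匹配的图文对
    loss_i = F.cross_entropy(logits, labels)      # 图 -> 文
    loss_t = F.cross_entropy(logits.t(), labels)  # 文 -> 图
    return (loss_i + loss_t) / 2
loss = clip_train_step(torch.randn(8, 2048), torch.randn(8, 768))
print(loss.item())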
- 推理代码
import torch
import clip
from PIL import Image
# (1)配置GPU & 导入model、preprocess
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# (2)导入本地image、text
image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)
# (3)通过CLIP的对比学习算出每个text和image的匹配概率
with torch.no_grad():
image_features = model.encode_image(image)
text_features = model.encode_text(text)
logits_per_image, logits_per_text = model(image, text)
probs = logits_per_image.softmax(dim=-1).cpu().numpy()
print("Label probs:", probs)
# prints: [[0.9927937 0.00421068 0.00299572]]
12. 解码策略系列(Top-p sampling,Top-k sampling)
12.1. Top-k sampling
- 原理:仅从概率最高的前 k 个词中采样,过滤低概率词;
- 特点:平衡多样性与质量,但 k 是固定值,可能在某些上下文下不合适。
import torch
def top_k_sampling(logits, k=2):
values, indices = torch.topk(logits, k, dim=-1)
probs = torch.softmax(values, dim=-1)
sampled = torch.multinomial(probs, 1) # 在top-k范围内采样, 形状 [batch, 1]
return indices.gather(-1, sampled).squeeze(-1) # 映射回原词表的索引
# 示例
logits = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
print(top_k_sampling(logits, k=2)) # 可能输出: tensor([3])
12.2. Top-p sampling
- 原理:从累积概率刚超过阈值 p 的最小词集合中采样;
- 特点:动态调整候选词数量,适应不同上下文。
def top_p_sampling(logits, p=0.9):
probs = torch.softmax(logits, dim=-1)
sorted_probs, sorted_indices = torch.sort(probs, dim=-1, descending=True)
cum_probs = torch.cumsum(sorted_probs, dim=-1)
mask = cum_probs <= p
mask = torch.cat([torch.ones_like(mask[..., :1]), mask[..., :-1]], dim=-1) # 沿词表维右移一位, 保证至少保留概率最高的词
filtered_probs = sorted_probs * mask # 未保留的词概率置0, multinomial按权重采样
sampled = torch.multinomial(filtered_probs, 1)
return sorted_indices.gather(-1, sampled).squeeze(-1)
# 示例
logits = torch.tensor([[0.1, 0.3, 0.2, 0.4]])
print(top_p_sampling(logits, p=0.8)) # 可能输出: tensor([3])
13. 算法系列
13.1. 完全平方数
import math
def numSquares(n):
# 初始化动态规划数组,dp[i]表示和为i的完全平方数的最少数量
dp = [float('inf')] * (n + 1)
# 初始条件:和为0需要0个完全平方数
dp[0] = 0
# 遍历所有可能的i(从1到n)
for i in range(1, n + 1):
# 遍历所有可能的完全平方数j*j(j从1到sqrt(i))
for j in range(1, int(math.sqrt(i)) + 1):
square = j * j
if square <= i:
dp[i] = min(dp[i], dp[i - square] + 1)
return dp[n]
print(numSquares(12)) # 输出: 3 (12 = 4 + 4 + 4)
print(numSquares(13)) # 输出: 2 (13 = 4 + 9)
13.2. sqrt(x, eps)开平方根(小数点后三位,两种方法)
- 二分法;
- 牛顿迭代法:通过迭代公式 x_{n+1} = (x_n + x/x_n) / 2 逼近平方根。
# 方法(1):二分法
def sqrt_binary(x, eps=1e-3):
if x < 0:
raise ValueError("x must be non-negative")
if x == 0:
return 0.0
low, high = 0.0, max(x, 1.0)
while high - low > eps:
mid = (low + high) / 2
if mid * mid < x:
low = mid
else:
high = mid
return round((low + high) / 2, 3)
print(sqrt_binary(8)) # 输出: 2.828
# 方法(2):牛顿迭代法——通过迭代x_n+1 = (x_n + x/x_n) /2 逼近平方根
def sqrt_newton(x, eps=1e-3):
if x < 0:
raise ValueError("x must be non-negative")
if x == 0:
return 0.0
guess = x # 初始猜测值
while abs(guess * guess - x) > eps:
guess = (guess + x / guess) / 2
return round(guess, 3)
print(sqrt_newton(8)) # 输出: 2.828
13.3. 除自身外数组乘积
def productExceptSelf(nums):
n = len(nums)
left, right = [1] * n, [1] * n
# 计算左侧乘积
for i in range(1, n):
left[i] = left[i-1] * nums[i-1]
# 计算右侧乘积
for i in range(n-2, -1, -1):
right[i] = right[i+1] * nums[i+1]
# 合并结果
return [left[i] * right[i] for i in range(n)]
13.4. 最大绝对值子数组和
def maxAbsoluteSum(nums):
max_sum = min_sum = current_max = current_min = nums[0]
for num in nums[1:]:
current_max = max(num, current_max + num)
current_min = min(num, current_min + num)
max_sum = max(max_sum, current_max)
min_sum = min(min_sum, current_min)
return max(abs(max_sum), abs(min_sum))
# 示例测试
print(maxAbsoluteSum([1, -3, 2, 3, -4])) # 输出: 5(子数组 [2,3])
print(maxAbsoluteSum([2, -5, 1, -4, 3, -2])) # 输出: 8(子数组 [-5,1,-4])
- 最大子数组和(思路同上,见下面的示意实现)
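最大子数组和可以用 Kadane 动态规划一次遍历求解(示意实现,与上面 maxAbsoluteSum 里维护 current_max 的写法同理):
def maxSubArray(nums):
    current = best = nums[0]
    for num in nums[1:]:
        current = max(num, current + num)  # 要么延续前面的子数组, 要么从当前元素重新开始
        best = max(best, current)
    return best
print(maxSubArray([-2, 1, -3, 4, -1, 2, 1, -5, 4]))  # 输出: 6(子数组 [4,-1,2,1])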
- 买卖股票的最佳时机2
class Solution:
def maxProfit(self, prices):
max_profit = 0
for i in range(1, len(prices)):
if prices[i] > prices[i-1]:
max_profit += prices[i] - prices[i-1]
return max_profit
14. 排序系列
14.1. 二分查找(旋转数组的最小值)
class Solution:
def minNumberInRotateArray(self, rotateArray):
array_len = len(rotateArray)
left = 0
right = array_len - 1
while left < right:
mid = int((left + right) / 2)
if rotateArray[mid] > rotateArray[right]:
left = mid + 1
elif rotateArray[mid] < rotateArray[right]:
right = mid
elif rotateArray[mid] == rotateArray[right]:
right -= 1
return rotateArray[right]
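上面是二分查找在旋转数组最小值问题上的变体;作为对照,补充一个标准二分查找的示意写法(升序数组中找目标值下标,找不到返回 -1):
def binary_search(nums, target):
    left, right = 0, len(nums) - 1
    while left <= right:
        mid = (left + right) // 2
        if nums[mid] == target:
            return mid
        elif nums[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1
print(binary_search([1, 3, 5, 7, 9], 7))  # 输出: 3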
14.2. 快排
"""
快速排序是一种递归/分治算法,核心思想是:
(1) 选择基准(Pivot or mid):从数组中选择一个元素作为基准(通常选中间元素或随机元素)。
(2) 分区:
将比基准小的元素放在左边(left)。
等于基准的元素放在中间(middle)。
比基准大的元素放在右边(right)。
(3) 递归排序:对左右子数组递归调用快速排序,直到子数组长度为 1 或 0(即已排序)。
(4) 合并结果:最终合并 left + middle + right 得到排序后的数组。
时间复杂度:
最优/平均情况:O(nlogn)
最坏情况(如数组已排序且基准选择不当):O(n²)
"""
def quick_sort(nums):
if len(nums) <= 1:
return nums
mid = nums[len(nums)//2]
left = [x for x in nums if x < mid]
middle = [x for x in nums if x == mid]
right = [x for x in nums if x > mid]
return quick_sort(left) + middle + quick_sort(right)
14.3. 冒泡
"""
冒泡排序是一种交换排序算法,核心思想是:
(1) 相邻比较:从数组的第一个元素开始,依次比较相邻的两个元素。
(2) 交换位置:如果前一个元素比后一个大,就交换它们的位置。
(3) 重复遍历:每一轮遍历都会将当前未排序部分的最大元素“冒泡”到数组末尾。
(4) 终止条件:当某一轮遍历没有发生任何交换时,说明数组已排序完成。
时间复杂度:
最优情况(数组已排序):O(n)(只需一次遍历)。
平均/最坏情况:O(n²)。
"""
def bubble_sort(nums):
n = len(nums)
for i in range(n):
swapped = False
for j in range(n-i-1):
if nums[j] > nums[j+1]:
nums[j], nums[j+1] = nums[j+1], nums[j]
swapped = True
if not swapped: # 终止冒泡
break
return nums