注意力熵保存

最新推荐文章于 2025-08-02 01:34:13 发布

dt3t

最新推荐文章于 2025-08-02 01:34:13 发布

阅读量263

点赞数 1

CC 4.0 BY-SA版权

文章标签：深度学习、人工智能

本文链接：https://blog.csdn.net/qq_36396406/article/details/138122148

def scaled_dot_product_attention(query_states, key_states, attn_mask=None):
    """
    Compute softmax-normalised scaled dot-product attention weights.

    Parameters:
        query_states: torch.Tensor of shape (bs, n_heads, q_len, d_h_qk)
        key_states: torch.Tensor of shape (bs, n_heads, kv_len, d_h_qk)
        attn_mask: optional torch.Tensor broadcastable to
            (bs, n_heads, q_len, kv_len). Two conventions are accepted:
              * floating-point (additive) mask: 0 where attention is allowed,
                -inf or a large negative value where it is blocked;
              * boolean mask: True where attention is allowed, False where it
                is blocked.

    Returns:
        torch.Tensor: attention probabilities of shape
        (bs, n_heads, q_len, kv_len); every row sums to 1 over kv_len.

    Note:
        A query row whose keys are all blocked with -inf has no valid softmax
        and is returned as NaN.
    """
    # Imported locally: no module-level import section is visible in this file.
    import torch

    d_h_qk = query_states.size(-1)  # per-head query/key width

    # Raw similarities, scaled by 1/sqrt(d_h_qk) to keep the logits' variance
    # independent of the head dimension.
    scores = torch.matmul(query_states, key_states.transpose(-2, -1)) / (d_h_qk ** 0.5)

    if attn_mask is not None:
        if attn_mask.dtype == torch.bool:
            # Adding a bool tensor would only shift the logits by 0/1 and mask
            # nothing, so blocked positions are set to -inf explicitly.
            scores = scores.masked_fill(~attn_mask, float("-inf"))
        else:
            scores = scores + attn_mask

    # Normalise over the key axis (kv_len).
    return torch.softmax(scores, dim=-1)


def entropy_of_distributions(tensor):
    """
    Compute the Shannon entropy (in bits) of each distribution along the last axis.

    Parameters:
        tensor: torch.Tensor of shape (bs, n_heads, q_len, kv_len) holding
            weights over kv_len. Negative entries are clamped to 0 and every
            row is renormalised to sum to 1, so unnormalised input is accepted.
            The input tensor is not modified.

    Returns:
        torch.Tensor: entropy values of shape (bs, n_heads, q_len). Zero
        probabilities contribute exactly 0 (the 0 * log 0 = 0 convention), so
        a one-hot row has entropy 0. Rows that contain NaN or sum to zero
        yield NaN.
    """
    # Imported locally: no module-level import section is visible in this file.
    import torch

    # Guard against tiny negative values from numerical noise, then renormalise.
    weights = torch.clamp(tensor, min=0)
    probs = weights / weights.sum(dim=-1, keepdim=True)

    # Take the log only of strictly positive entries. Substituting 1 elsewhere
    # makes the log 0, which avoids the bias that adding an epsilon inside the
    # log introduces (negative entropy for one-hot rows, growing error for
    # long kv_len) while still letting NaN probabilities propagate.
    safe_probs = torch.where(probs > 0, probs, torch.ones_like(probs))
    return -torch.sum(probs * torch.log2(safe_probs), dim=-1)


def save_attention_details(query_states, key_states, causal_mask,
                           output_path='/data2/hkzheng/clm/tensor.txt'):
    """
    Compute attention probabilities and append their shape and entropy to a text file.

    Parameters:
    - query_states: query tensor, shape (bs, n_heads, q_len, d_h_qk)
    - key_states: key tensor, shape (bs, n_heads, kv_len, d_h_qk)
    - causal_mask: causal mask tensor, shape (bs, n_heads, q_len, kv_len)
    - output_path: file the record is appended to. Defaults to the path that
      used to be hard-coded, so existing callers are unaffected.

    Each call appends one record: the shape of the attention probabilities,
    the shape of the entropy array for the first batch element, the entropy
    values formatted as '%.3e' (one text row per entry of the array's first
    axis), and a trailing blank line that separates records.
    """
    # Imported locally: no module-level import section is visible in this file.
    import numpy as np
    import torch

    # This is pure logging, so no autograd graph is needed.
    with torch.no_grad():
        attn_score = scaled_dot_product_attention(query_states, key_states, attn_mask=causal_mask)
        attn_entropy = entropy_of_distributions(attn_score)

    # Only the first batch element is dumped. detach() allows tensors that
    # require grad; moving to CPU and widening to float64 makes half/bfloat16
    # values convertible to NumPy without changing any value.
    tensor_np = attn_entropy[0].detach().cpu().double().numpy()

    # The array is built before the file is opened so a failed conversion
    # cannot leave a half-written record behind.
    with open(output_path, 'a', encoding='utf-8') as f:
        f.write('attn_score shape: ' + str(attn_score.shape) + '\n')
        f.write('attn_entropy shape: ' + str(tensor_np.shape) + '\n')
        np.savetxt(f, tensor_np, fmt='%.3e', delimiter=' ', newline='\n')
        f.write('\n')  # blank line separating consecutive records