【零基础学AI】第15讲：K近邻算法实战

本文链接：https://blog.csdn.net/wiyi9891/article/details/149042541

在这里插入图片描述

本节课你将学到

理解K近邻算法的"近朱者赤"思想
掌握距离度量方法和K值选择技巧
学会构建基于相似度的推荐系统
完成商品推荐项目
了解KNN的优缺点和适用场景

开始之前

环境要求

Python 3.8+
内存: 建议2GB+

需要安装的包

pip install pandas numpy scikit-learn matplotlib seaborn

前置知识

基本的Python语法
距离概念（欧几里得距离）
简单的数据处理

核心概念

什么是K近邻算法？

想象你刚搬到新城市，想找个好餐厅：

传统方法（决策树、SVM）：

先学习什么是好餐厅的规律
根据规律判断新餐厅

KNN方法：

看看你最相似的K个邻居都去哪吃饭
他们喜欢的，你可能也喜欢
近朱者赤，近墨者黑

KNN的核心思想

相似性假设：相似的样本具有相似的标签
局部决策：只关注最近的K个邻居
懒惰学习：没有显式的训练过程，只是保存训练数据，预测时才计算距离
多数投票：邻居们"投票"决定结果

KNN的优势

简单直观：思想容易理解
非参数方法：不对数据分布做任何假设
适合局部模式：能捕捉复杂的局部决策边界
天然多分类：自然支持多类别问题

KNN的挑战

计算密集：每次预测都要计算所有距离
维度诅咒：高维数据中距离失去意义
不平衡敏感：少数类容易被多数类压制
存储需求：需要保存所有训练数据

代码实战

步骤1：生成商品和用户数据

# 导入必要的库
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cosine
import warnings
warnings.filterwarnings('ignore')

# Configure a CJK-capable font so the Chinese titles and labels render.
# SimHei ships only with Windows, so common macOS/Linux fonts are listed as
# fallbacks; matplotlib uses the first family in this list that it can find.
plt.rcParams['font.sans-serif'] = [
    'SimHei',               # Windows
    'Microsoft YaHei',      # Windows
    'PingFang SC',          # macOS
    'Heiti TC',             # macOS
    'Noto Sans CJK SC',     # Linux
    'WenQuanYi Micro Hei',  # Linux
    'Arial Unicode MS',
    'DejaVu Sans',          # matplotlib's bundled font, last resort for Latin text
]
# Render the minus sign as ASCII '-'; many CJK fonts lack the Unicode minus.
plt.rcParams['axes.unicode_minus'] = False

print("🛒 KNN商品推荐系统")
print("=" * 40)

def generate_shopping_data(n_users=1000, seed=42):
    """Generate a synthetic table of users, shopping habits and preferences.

    Each user gets random demographics, shopping-behaviour features and six
    category preference scores in [0, 5]. The scores are then boosted for
    certain age / income groups, clipped back to [0, 5], and the category with
    the highest score becomes the classification target.

    Args:
        n_users: Number of users (rows) to generate. Must be at least 1.
        seed: Value passed to ``np.random.seed`` for reproducibility. Note
            that this reseeds NumPy's *global* generator. ``None`` reseeds
            from OS entropy, giving different data on every call.

    Returns:
        A DataFrame with one row per user. Besides the feature columns it has
        '最喜欢类别' (name of the highest preference column) and '类别标签'
        (that category encoded as an integer 0-5).

    Raises:
        ValueError: If ``n_users`` is smaller than 1.
    """
    if n_users < 1:
        raise ValueError("n_users must be a positive integer")

    print("生成用户购物数据...")

    np.random.seed(seed)

    preference_cols = ['服装偏好', '电子产品偏好', '食品偏好', '家居用品偏好', '运动用品偏好', '图书偏好']

    # NOTE: the order of the entries below is the order in which random
    # numbers are drawn; reordering them changes the generated dataset.
    users_data = {
        # Demographics
        '用户ID': range(1, n_users + 1),
        '年龄': np.random.randint(18, 65, n_users),
        '性别': np.random.choice([0, 1], n_users),  # 0: female, 1: male
        '收入等级': np.random.choice([1, 2, 3, 4, 5], n_users),  # levels 1-5
        '城市等级': np.random.choice([1, 2, 3], n_users),  # city tier 1-3

        # Shopping behaviour
        '月购物次数': np.random.randint(1, 20, n_users),
        '平均订单金额': np.random.randint(50, 1000, n_users),
        '使用优惠券频率': np.random.uniform(0, 1, n_users),
    }
    # Category preference scores, drawn uniformly from [0, 5).
    for col in preference_cols:
        users_data[col] = np.random.uniform(0, 5, n_users)

    df = pd.DataFrame(users_data)

    # Inject structure so the features carry signal about the target.
    is_young = df['年龄'] < 30
    is_middle_aged = (df['年龄'] >= 30) & (df['年龄'] < 50)
    is_high_income = df['收入等级'] >= 4

    # Younger users lean towards electronics and clothing.
    df.loc[is_young, '电子产品偏好'] *= 1.3
    df.loc[is_young, '服装偏好'] *= 1.2

    # Middle-aged users lean towards home goods and food.
    df.loc[is_middle_aged, '家居用品偏好'] *= 1.3
    df.loc[is_middle_aged, '食品偏好'] *= 1.2

    # Higher-income users get an extra boost on clothing and electronics.
    df.loc[is_high_income, ['服装偏好', '电子产品偏好']] *= 1.2

    # The boosts can push scores above 5; bring them back into [0, 5].
    df[preference_cols] = df[preference_cols].clip(0, 5)

    # Target: the category with the highest preference score.
    df['最喜欢类别'] = df[preference_cols].idxmax(axis=1)

    # Encode the category name as an integer label (position in preference_cols).
    category_mapping = {col: label for label, col in enumerate(preference_cols)}
    df['类别标签'] = df['最喜欢类别'].map(category_mapping)

    print(f"数据生成完成！用户数: {len(df)}")
    print(f"类别分布:")
    category_counts = df['最喜欢类别'].value_counts()
    for category, count in category_counts.items():
        print(f"  {category.replace('偏好', '')}: {count}人")

    return df

# Build the synthetic user table, then show a few columns of its first rows
# as a quick sanity check.
df = generate_shopping_data()
preview_columns = ['年龄', '性别', '收入等级', '服装偏好', '电子产品偏好', '最喜欢类别']
print("\n数据预览:")
print(df.loc[:, preview_columns].head())

步骤2：数据探索和可视化

print(f"\n=== 数据探索分析 ===")

# Basic summary statistics.
print(f"数据形状: {df.shape}")
print(f"用户年龄范围: {df['年龄'].min()}-{df['年龄'].max()}岁")
# to_dict() yields plain Python ints; dict(Series) would keep NumPy scalars,
# which NumPy >= 2 prints as np.int64(...).
print(f"收入等级分布: {df['收入等级'].value_counts().sort_index().to_dict()}")

# 2x3 grid of plots describing the user features.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('用户特征分布分析', fontsize=16)

# Age histogram.
axes[0, 0].hist(df['年龄'], bins=20, alpha=0.7, color='skyblue')
axes[0, 0].set_title('年龄分布')
axes[0, 0].set_xlabel('年龄')

# Income level bar chart.
income_counts = df['收入等级'].value_counts().sort_index()
axes[0, 1].bar(income_counts.index, income_counts.values, color='lightgreen')
axes[0, 1].set_title('收入等级分布')
axes[0, 1].set_xlabel('收入等级')

# Monthly purchase count histogram.
axes[0, 2].hist(df['月购物次数'], bins=15, alpha=0.7, color='salmon')
axes[0, 2].set_title('月购物次数分布')
axes[0, 2].set_xlabel('购物次数')

# Correlation heatmap of the six preference scores.
preference_cols = ['服装偏好', '电子产品偏好', '食品偏好', '家居用品偏好', '运动用品偏好', '图书偏好']
correlation_matrix = df[preference_cols].corr()

# Fix the colour range to [-1, 1] so the diverging colormap is centred on
# zero correlation, and add a colorbar so the colours can actually be read.
im = axes[1, 0].imshow(correlation_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
fig.colorbar(im, ax=axes[1, 0])
axes[1, 0].set_xticks(range(len(preference_cols)))
axes[1, 0].set_yticks(range(len(preference_cols)))
axes[1, 0].set_xticklabels([col.replace('偏好', '') for col in preference_cols], rotation=45)
axes[1, 0].set_yticklabels([col.replace('偏好', '') for col in preference_cols])
axes[1, 0].set_title('偏好相关性')

# Pie chart of the target classes (favourite category).
category_counts = df['最喜欢类别'].value_counts()
axes[1, 1].pie(category_counts.values,
               labels=[cat.replace('偏好', '') for cat in category_counts.index],
               autopct='%1.1f%%')
axes[1, 1].set_title('最喜欢类别分布')

# Age vs. income level, coloured by the target label.
scatter = axes[1, 2].scatter(df['年龄'], df['收入等级'],
                            c=df['类别标签'], cmap='Set3', alpha=0.6)
axes[1, 2].set_xlabel('年龄')
axes[1, 2].set_ylabel('收入等级')
axes[1, 2].set_title('年龄vs收入(按偏好着色)')

plt.tight_layout()
plt.show()

步骤3：构建KNN模型

print("\n=== 构建KNN模型 ===")

# The feature matrix combines the demographic / behaviour columns with the
# six preference scores; the target is the integer category label.
feature_cols = ['年龄', '性别', '收入等级', '城市等级', '月购物次数', '平均订单金额', '使用优惠券频率']
preference_cols = ['服装偏好', '电子产品偏好', '食品偏好', '家居用品偏好', '运动用品偏好', '图书偏好']
all_features = feature_cols + preference_cols

X = df[all_features]
y = df['类别标签']

n_samples, n_features = X.shape
print(f"特征数量: {n_features}")
print(f"样本数量: {n_samples}")

# Hold out 20% for testing; stratify keeps class proportions equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# KNN is distance based, so every feature is standardised. The scaler is
# fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"训练集: {len(X_train_scaled)} 样本")
print(f"测试集: {len(X_test_scaled)} 样本")

# Accuracy sweep over candidate K values
def find_best_k(X_train, y_train, X_test, y_test, k_range):
    """Score a KNN classifier on both splits for every K in ``k_range``.

    For each K a fresh ``KNeighborsClassifier`` is fitted on the training
    data and its accuracy is measured on the training and the test data.

    Returns:
        A ``(train_scores, test_scores)`` tuple of two lists, each holding one
        accuracy per K in the order ``k_range`` was iterated.
    """
    train_scores, test_scores = [], []

    for n_neighbors in k_range:
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        model.fit(X_train, y_train)

        train_scores.append(model.score(X_train, y_train))
        test_scores.append(model.score(X_test, y_test))

    return train_scores, test_scores

# Score K = 1..20 on the train and test splits. These curves are used only to
# visualise how K trades over-fitting (small K) against under-fitting (large K).
k_range = range(1, 21)
train_scores, test_scores = find_best_k(X_train_scaled, y_train, X_test_scaled, y_test, k_range)

# Plot accuracy against K for both splits.
plt.figure(figsize=(10, 6))
plt.plot(k_range, train_scores, 'bo-', label='训练集准确率')
plt.plot(k_range, test_scores, 'ro-', label='测试集准确率')
plt.xlabel('K值')
plt.ylabel('准确率')
plt.title('不同K值的模型性能')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Choose K by 5-fold cross-validation on the training split only. Picking the
# K that maximises *test* accuracy and then reporting that same test accuracy
# would leak the test set into model selection and overstate performance.
from sklearn.model_selection import cross_val_score

cv_scores = [
    cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train_scaled, y_train, cv=5).mean()
    for k in k_range
]
best_k = k_range[int(np.argmax(cv_scores))]
best_score = max(cv_scores)
print(f"\n最优K值: {best_k}")
print(f"最佳交叉验证准确率: {best_score:.4f}")

# Fit the final model on the whole training split with the selected K.
knn_classifier = KNeighborsClassifier(n_neighbors=best_k)
knn_classifier.fit(X_train_scaled, y_train)

# The test split is scored exactly once, so this is an unbiased estimate.
y_pred = knn_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n最终模型性能:")
print(f"K值: {best_k}")
print(f"准确率: {accuracy:.4f} ({accuracy*100:.2f}%)")

步骤4：模型评估和分析

print("\n=== 模型详细评估 ===")

# Display names for labels 0-5, in the same order as the label encoding.
category_names = ['服装', '电子产品', '食品', '家居用品', '运动用品', '图书']

# Per-class precision / recall / F1.
print("分类报告:")
report = classification_report(y_test, y_pred, target_names=category_names)
print(report)

# Confusion matrix drawn as an annotated heatmap.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=category_names,
    yticklabels=category_names,
)
plt.title('KNN模型混淆矩阵')
plt.xlabel('预测类别')
plt.ylabel('真实类别')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Inspect the misclassified test users.
is_wrong = y_test != y_pred  # boolean Series indexed like y_test / X_test
wrong_predictions = X_test[is_wrong]
n_wrong = len(wrong_predictions)
if n_wrong > 0:
    print(f"\n预测错误分析:")
    print(f"错误预测数量: {n_wrong}")
    print(f"错误率: {n_wrong / len(y_test):.2%}")

    # Show at most three examples. These are positions within the test split.
    error_indices = np.flatnonzero(is_wrong.to_numpy())[:3]
    for case_no, idx in enumerate(error_indices, start=1):
        # Map the test-split position back to the row label in df.
        real_idx = X_test.index[idx]
        true_name = category_names[y_test.iloc[idx]]
        predicted_name = category_names[y_pred[idx]]
        print(f"\n错误案例 {case_no}:")
        print(f"  真实类别: {true_name}")
        print(f"  预测类别: {predicted_name}")
        print(f"  用户特征: 年龄{df.loc[real_idx, '年龄']}岁, "
              f"收入等级{df.loc[real_idx, '收入等级']}")

步骤5：相似用户推荐系统

print(f"\n=== 构建推荐系统 ===")

class UserRecommendationSystem:
    """User-based recommender built on cosine-distance nearest neighbours.

    ``fit`` standardises the user feature table and indexes it. For a given
    user, ``recommend_for_user`` looks up the most similar other users and
    ranks categories by the similarity-weighted average of those users'
    preference scores.

    Args:
        n_neighbors: Number of similar users to use per recommendation.
        preference_cols: Names of the preference-score columns in the fitted
            table. When ``None`` (default), every column whose name ends with
            '偏好' is used.
    """

    def __init__(self, n_neighbors=5, preference_cols=None):
        self.n_neighbors = n_neighbors
        self.preference_cols = preference_cols
        # Preference columns resolved against the fitted table; set by fit().
        self.preference_cols_ = None
        # Ask for one extra neighbour: the query user is part of the indexed
        # data and is normally returned as its own nearest neighbour.
        self.nn_model = NearestNeighbors(n_neighbors=n_neighbors + 1, metric='cosine')
        self.scaler = StandardScaler()
        self.user_data = None

    def fit(self, user_features):
        """Standardise ``user_features`` (a DataFrame) and index it.

        Row position ``i`` of ``user_features`` is treated as user ID ``i + 1``.
        """
        self.user_data = user_features
        if self.preference_cols is not None:
            self.preference_cols_ = list(self.preference_cols)
        else:
            self.preference_cols_ = [
                col for col in user_features.columns if str(col).endswith('偏好')
            ]
        user_features_scaled = self.scaler.fit_transform(user_features)
        self.nn_model.fit(user_features_scaled)

    def find_similar_users(self, user_id):
        """Return the users most similar to ``user_id``.

        Args:
            user_id: 1-based user ID (row position in the fitted table + 1).

        Returns:
            ``(indices, distances)``: row positions of up to ``n_neighbors``
            other users and their cosine distances to the query user,
            nearest first.

        Raises:
            RuntimeError: If called before ``fit``.
            ValueError: If ``user_id`` is outside ``1..len(fitted table)``.
        """
        if self.user_data is None:
            raise RuntimeError("call fit() before querying for similar users")
        n_users = len(self.user_data)
        if not 1 <= user_id <= n_users:
            raise ValueError(
                f"user_id must be between 1 and {n_users}, got {user_id!r}"
            )

        user_idx = user_id - 1  # user IDs start at 1, row positions at 0
        user_features = self.user_data.iloc[user_idx:user_idx + 1]
        user_features_scaled = self.scaler.transform(user_features)

        distances, indices = self.nn_model.kneighbors(user_features_scaled)
        neighbor_indices = indices[0]
        neighbor_distances = distances[0]

        # Drop the query user by position rather than assuming it is first:
        # a duplicate or tied row may sort ahead of it. If the query user is
        # not among the results, the slice trims the extra neighbour instead.
        not_self = neighbor_indices != user_idx
        similar_indices = neighbor_indices[not_self][:self.n_neighbors]
        similar_distances = neighbor_distances[not_self][:self.n_neighbors]

        return similar_indices, similar_distances

    def recommend_for_user(self, user_id):
        """Rank product categories for ``user_id`` from similar users' scores.

        Returns:
            ``(recommendations, similar_users, distances)`` where
            ``recommendations`` is a list of ``(category_name, score)`` pairs
            sorted by score, highest first, and the other two values are those
            returned by ``find_similar_users``.

        Raises:
            ValueError: If no preference columns are available, or if
                ``user_id`` is out of range.
        """
        similar_users, distances = self.find_similar_users(user_id)

        pref_cols = self.preference_cols_
        if not pref_cols:
            raise ValueError("no preference columns available in the fitted data")

        # Read the neighbours' scores from the fitted table, not from a global.
        similar_preferences = self.user_data.iloc[similar_users][pref_cols]

        # Turn distances into normalised weights: smaller distance, larger weight.
        weights = 1 / (1 + distances)
        weights = weights / weights.sum()

        # Similarity-weighted mean score per preference column.
        weighted_scores = np.average(
            similar_preferences.to_numpy(dtype=float), axis=0, weights=weights
        )
        recommendation_scores = {
            col.replace('偏好', ''): score
            for col, score in zip(pref_cols, weighted_scores)
        }

        # Highest score first.
        sorted_recommendations = sorted(recommendation_scores.items(),
                                        key=lambda item: item[1], reverse=True)

        return sorted_recommendations, similar_users, distances

# Create the recommender (5 similar users per query) and fit it on every
# user's full feature set.
recommender = UserRecommendationSystem(n_neighbors=5)
recommender.fit(df[all_features])

# Recommendation demo
def demo_recommendation(demo_users=(10, 50, 100, 200, 500)):
    """Print a personalised recommendation report for a few users.

    For each user ID this prints basic information, the user's own favourite
    category, the top three recommended categories and the IDs of up to three
    similar users. Reads the module-level ``df``, ``preference_cols`` and
    ``recommender``.

    Args:
        demo_users: Iterable of user IDs to show.
    """
    print("🎯 个性化推荐演示")
    print("=" * 50)

    for user_id in demo_users:
        print(f"\n👤 用户 {user_id} 的个性化推荐:")

        # Basic information about the user.
        user_info = df[df['用户ID'] == user_id].iloc[0]
        print(f"   基本信息: {user_info['年龄']}岁, "
              f"{'男' if user_info['性别'] else '女'}, "
              f"收入等级{user_info['收入等级']}")

        # df also holds a string column, so a single row is an object-dtype
        # Series. pandas rejects idxmax on object dtype, hence the float cast.
        current_preferences = user_info[preference_cols].astype(float)
        top_preference = current_preferences.idxmax().replace('偏好', '')
        print(f"   当前最喜欢: {top_preference}")

        # Fetch the ranked recommendations and the similar users behind them.
        recommendations, similar_users, _ = recommender.recommend_for_user(user_id)

        print(f"   推荐排序:")
        for rank, (category, score) in enumerate(recommendations[:3], start=1):
            print(f"     {rank}. {category} (推荐分数: {score:.2f})")

        # tolist() converts to plain Python ints so the list prints as
        # [245, 891, ...] rather than as NumPy scalar reprs.
        similar_ids = df['用户ID'].iloc[similar_users[:3]].tolist()
        print(f"   相似用户ID: {similar_ids}")

demo_recommendation()

步骤6：不同距离度量对比

print("\n=== 距离度量方法对比 ===")

# Fit one classifier per distance metric, using the K chosen earlier, and
# record its test accuracy.
distance_metrics = ['euclidean', 'manhattan', 'cosine']
metric_results = {}

for metric in distance_metrics:
    model = KNeighborsClassifier(n_neighbors=best_k, metric=metric)
    model.fit(X_train_scaled, y_train)
    metric_results[metric] = model.score(X_test_scaled, y_test)
    print(f"{metric.capitalize()}距离: {metric_results[metric]:.4f}")

# Bar chart comparing the three metrics.
plt.figure(figsize=(10, 6))
metrics = list(metric_results)
scores = [metric_results[name] for name in metrics]

bars = plt.bar(metrics, scores, color=['blue', 'green', 'red'], alpha=0.7)
plt.ylabel('准确率')
plt.title('不同距离度量方法性能对比')
# Zoom the y-axis onto the observed range so small differences are visible.
lowest, highest = min(scores), max(scores)
plt.ylim(lowest - 0.01, highest + 0.01)

# Write each accuracy just above its bar.
for bar, value in zip(bars, scores):
    x_center = bar.get_x() + bar.get_width()/2
    plt.text(x_center, bar.get_height() + 0.001,
             f'{value:.3f}', ha='center', va='bottom')

plt.show()

# Metric with the highest accuracy (the first one wins on ties).
best_name = max(metric_results, key=metric_results.get)
best_metric = (best_name, metric_results[best_name])
print(f"\n🏆 最佳距离度量: {best_metric[0]} ({best_metric[1]:.4f})")

步骤7：新用户推荐预测

print(f"\n=== 新用户推荐测试 ===")

# Five hand-written user profiles, one per persona in user_types below.
new_users = pd.DataFrame({
    '年龄': [25, 35, 45, 28, 50],
    '性别': [0, 1, 0, 1, 0],  # 0: female, 1: male
    '收入等级': [2, 4, 3, 5, 3],
    '城市等级': [1, 1, 2, 1, 3],
    '月购物次数': [8, 15, 6, 12, 4],
    '平均订单金额': [200, 800, 300, 600, 150],
    '使用优惠券频率': [0.8, 0.3, 0.6, 0.4, 0.9],
    '服装偏好': [4.2, 2.1, 3.5, 2.8, 3.0],
    '电子产品偏好': [3.5, 4.8, 2.2, 4.5, 1.8],
    '食品偏好': [2.8, 3.2, 4.1, 3.0, 4.5],
    '家居用品偏好': [2.0, 3.5, 4.8, 2.5, 4.2],
    '运动用品偏好': [3.0, 4.0, 2.5, 4.2, 2.0],
    '图书偏好': [3.8, 2.5, 3.8, 3.2, 4.0]
})

user_types = ["时尚女青年", "科技男", "居家主妇", "运动达人", "文艺中年"]

# Scale with the scaler fitted on the training split, then predict the class
# and the per-class neighbour vote shares.
new_users_scaled = scaler.transform(new_users[all_features])
predictions = knn_classifier.predict(new_users_scaled)
probabilities = knn_classifier.predict_proba(new_users_scaled)

print("新用户偏好预测:")
print("=" * 60)

for i, user_type in enumerate(user_types):
    # Read values column-wise: a row Series (new_users.iloc[i]) would upcast
    # the integer columns to float and print e.g. "25.0岁".
    age = new_users['年龄'].iloc[i]
    is_male = new_users['性别'].iloc[i]
    income_level = new_users['收入等级'].iloc[i]
    pred_category = category_names[predictions[i]]
    confidence = probabilities[i][predictions[i]]

    print(f"\n👤 {user_type}:")
    print(f"   基本信息: {age}岁, "
          f"{'男' if is_male else '女'}, "
          f"收入等级{income_level}")
    print(f"   预测偏好: {pred_category} (置信度: {confidence:.1%})")

    # Vote share for every category.
    prob_str = " | ".join([f"{cat}{prob:.1%}"
                          for cat, prob in zip(category_names, probabilities[i])])
    print(f"   详细概率: {prob_str}")

# Check each prediction against the user's highest stated preference instead
# of printing a fixed "as expected" verdict regardless of the model's output.
expected_categories = (
    new_users[preference_cols].idxmax(axis=1).str.replace('偏好', '', regex=False)
)

print(f"\n📊 预测准确性说明:")
n_matches = 0
for user_type, predicted_label, expected in zip(user_types, predictions, expected_categories):
    predicted = category_names[predicted_label]
    if predicted == expected:
        n_matches += 1
        print(f"✅ {user_type} → 预测{predicted}，符合预期")
    else:
        print(f"❌ {user_type} → 预测{predicted}，与最高偏好（{expected}）不一致")
print(f"共 {n_matches}/{len(user_types)} 位新用户的预测与其最高偏好一致")
print("这些预测结果体现了KNN'近朱者赤'的特点！")

运行效果

控制台输出示例

🛒 KNN商品推荐系统
========================================
生成用户购物数据...
数据生成完成！用户数: 1000
类别分布:
  家居用品: 178人
  服装: 174人
  食品: 169人
  电子产品: 164人
  运动用品: 158人
  图书: 157人

最优K值: 7
最佳测试准确率: 0.8150

最终模型性能:
K值: 7
准确率: 0.8150 (81.50%)

🎯 个性化推荐演示
==================================================

👤 用户 10 的个性化推荐:
   基本信息: 52岁, 女, 收入等级3
   当前最喜欢: 家居用品
   推荐排序:
     1. 家居用品 (推荐分数: 4.21)
     2. 食品 (推荐分数: 3.87)
     3. 服装 (推荐分数: 3.45)
   相似用户ID: [245, 891, 634]

🏆 最佳距离度量: cosine (0.8200)

新用户偏好预测:
============================================================

👤 时尚女青年:
   基本信息: 25岁, 女, 收入等级2
   预测偏好: 服装 (置信度: 42.9%)
   详细概率: 服装42.9% | 电子产品28.6% | 食品14.3% | 家居用品14.3% | 运动用品0.0% | 图书0.0%