Traceback (most recent call last): File "C:\Users\17680\PyCharmMiscProject\过拟合修改5.py", line 102, in <module> plot_learning_curve(model, X_train, y_train, cv=cv, n_jobs=-1) ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\17680\PyCharmMiscProject\过拟合修改5.py", line 73, in plot_learning_curve estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_scores, ^^^^^^^^^^^^ UnboundLocalError: cannot access local variable 'train_scores' where it is not associated with a value
时间: 2025-05-26 09:41:32 浏览: 21
根据你提供的错误信息,问题出在 `plot_learning_curve` 函数中。具体来说,`train_scores` 变量在调用 `learning_curve` 函数之前没有被正确定义。
### 问题原因
在 `plot_learning_curve` 函数中,调用 `learning_curve(...)` 时把关键字参数误写成了 `train_sizes=train_scores`,也就是在局部变量 `train_scores` 被赋值之前就引用了它,因此触发 `UnboundLocalError`。正确写法是把函数形参 `train_sizes` 原样传入,并用 `train_sizes, train_scores, test_scores = learning_curve(...)` 接收三个返回值。
### 解决方法
我们需要修正 `plot_learning_curve` 函数中的变量名冲突问题,并确保 `learning_curve` 的返回值被正确分配给对应的变量。
以下是修复后的完整代码:
```python
import pandas as pd
import numpy as np
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# 读取数据
data = pd.read_csv('D:/数据/数据8.csv')
# 数据预处理
X = data.drop(['station_id', '空间活力'], axis=1)
y = data['空间活力']
# 标准化特征
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# 使用最佳参数创建XGBoost模型,并添加正则化和早停机制
model = xgb.XGBRegressor(
colsample_bytree=0.7,
gamma=0.2,
learning_rate=0.01, # 减小学习率
max_depth=5,
n_estimators=500, # 增加迭代次数
subsample=0.7,
reg_alpha=0.1, # 添加L1正则化
reg_lambda=1.0, # 添加L2正则化
early_stopping_rounds=10, # 添加早停机制
random_state=42
)
# 训练模型(使用验证集)
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set, verbose=False)
# 在测试集上进行预测
y_pred = model.predict(X_test)
# 评估模型性能
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"模型性能评估:")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² 分数: {r2:.4f}")
# 定义函数绘制学习曲线
def plot_learning_curve(estimator, X, y, cv=None, n_jobs=None, train_sizes=np.linspace(0.1, 1.0, 5)):
"""
绘制学习曲线:训练集和验证集上的性能随训练样本量的变化
"""
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('学习曲线')
ax.set_xlabel("训练样本数量")
ax.set_ylabel("分数 (负MSE)")
# 计算学习曲线数据
train_sizes, train_scores, test_scores = learning_curve(
estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
scoring='neg_mean_squared_error', shuffle=True, random_state=42
)
# 计算平均和标准差
train_scores_mean = -np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
# 绘制学习曲线
ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1, color="r")
ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="g")
ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="训练集分数")
ax.plot(train_sizes, test_scores_mean, 'o-', color="g", label="验证集分数")
ax.legend(loc="best")
plt.grid(True)
plt.tight_layout()
plt.savefig('learning_curve.png')
plt.show()
return fig
# 绘制学习曲线
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
plot_learning_curve(model, X_train, y_train, cv=cv, n_jobs=-1)
# 创建SHAP解释器并计算SHAP值
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
# 可视化特征重要性
shap.summary_plot(shap_values, X_test, feature_names=X.columns, plot_type="bar")
plt.title('特征重要性 (SHAP值绝对值的平均)')
plt.tight_layout()
plt.savefig('shap_feature_importance.png')
plt.show()
# 可视化SHAP汇总图
shap.summary_plot(shap_values, X_test, feature_names=X.columns)
plt.tight_layout()
plt.savefig('shap_summary_plot.png')
plt.show()
# 可视化单个预测的SHAP值
shap.force_plot(explainer.expected_value, shap_values[0], X_test[0], feature_names=X.columns)
plt.savefig('shap_force_plot_single.png', bbox_inches='tight')
plt.close()
# 可视化所有预测的SHAP值分布
shap.force_plot(explainer.expected_value, shap_values, X_test, feature_names=X.columns)
plt.savefig('shap_force_plot_all.png', bbox_inches='tight')
plt.close()
# 计算并显示每个特征的平均SHAP值
feature_importance = pd.DataFrame({
'Feature': X.columns,
'SHAP Importance': np.abs(shap_values).mean(axis=0)
}).sort_values('SHAP Importance', ascending=False)
print("\n特征重要性排序:")
print(feature_importance)
# 保存模型
model.save_model('xgb_shap_model.model')
print("\n模型已保存为: xgb_shap_model.model")
```
### 解释
1. **变量名冲突**:
   - 在原始代码中,调用 `learning_curve` 时误把关键字参数写成了 `train_sizes=train_scores`,在 `train_scores` 尚未赋值时就引用了它,因此触发 `UnboundLocalError`。
- 修复后,`train_scores` 和 `test_scores` 正确接收了 `learning_curve` 的输出。
2. **学习曲线功能**:
- 学习曲线展示了模型在不同训练样本数量下的表现,帮助我们判断模型是否过拟合或欠拟合。
---
###
阅读全文
相关推荐


















