```
Traceback (most recent call last):
  File "D:\python\负荷预测测试版.py", line 165, in <module>
    bagging_model = BaggingRegressor(base_estimator=clone(best_rf_model), n_estimators=10, random_state=42)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: BaggingRegressor.__init__() got an unexpected keyword argument 'base_estimator'
```
The error occurs because the scikit-learn version you have installed no longer accepts a `base_estimator` argument in `BaggingRegressor.__init__()`: the parameter was renamed to `estimator` in scikit-learn 1.2, and `base_estimator` was removed entirely in 1.4. Passing `estimator` instead fixes the crash; apart from that one keyword, the rest of the script (including the stacking ensemble) can stay as it is.
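If the same script also has to run on machines with an older scikit-learn, a small compatibility shim can pick the right keyword at runtime. This is only a minimal sketch, assuming the standard scikit-learn import paths; `rf_base` here is a hypothetical stand-in for your tuned `best_rf_model`:

```python
from sklearn.base import clone
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

rf_base = RandomForestRegressor(random_state=42)  # stand-in for the tuned best_rf_model

try:
    # scikit-learn >= 1.2: the keyword is `estimator`
    bagging_model = BaggingRegressor(estimator=clone(rf_base), n_estimators=10, random_state=42)
except TypeError:
    # scikit-learn < 1.2: fall back to the old `base_estimator` keyword
    bagging_model = BaggingRegressor(base_estimator=clone(rf_base), n_estimators=10, random_state=42)
```

If you only need to support a current scikit-learn, simply writing `estimator=...` (as in the full script below) is enough.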
Here is the corrected code:
### Corrected code
```python
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from scipy.stats import randint, uniform
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from mlxtend.regressor import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# Set up a Chinese font so the Chinese plot labels render correctly
matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # SimHei supports Chinese characters
matplotlib.rcParams['axes.unicode_minus'] = False  # keep the minus sign from rendering as a box
# Load the data
data = pd.read_csv('2016年电工数学建模竞赛负荷预测数据集.csv')
# Inspect the first few rows to confirm the file loaded correctly
print(data.head())
# Check for missing values and handle them
print(data.isnull().sum())
data.fillna(data.mean(numeric_only=True), inplace=True)  # fill missing values with the column means
# Create new features
data['日期'] = pd.to_datetime(data.index)  # assumes the date is stored in the index
data['星期几'] = data['日期'].dt.weekday
data['月份'] = data['日期'].dt.month
data['是否节假日'] = data['日期'].apply(lambda x: 1 if x.day_name() in ['Saturday', 'Sunday'] else 0)
# Build the feature matrix
X = data[['最高温度℃', '最低温度℃', '平均温度℃', '相对湿度(平均)', '降雨量(mm)', '星期几', '月份', '是否节假日']]
y = data['日需求负荷(KWh)']
# Feature selection: use recursive feature elimination (RFE) to keep the most important features
rf_initial = RandomForestRegressor(random_state=42)
selector = RFE(rf_initial, n_features_to_select=5, step=1)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.support_]
print("Selected Features:", selected_features)
# Scale the features
scaler = MinMaxScaler()  # MinMaxScaler rescales each feature to [0, 1]
X_scaled = scaler.fit_transform(X_selected)
X_scaled_df = pd.DataFrame(X_scaled, columns=selected_features)
# Add polynomial (interaction) features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X_scaled_df)
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(selected_features))
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_poly_df, y, test_size=0.2, random_state=42)
# Define the random forest hyperparameter search space
param_dist = {
    'n_estimators': randint(100, 501),
    'max_depth': [None] + list(range(10, 51, 10)),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True],
    'max_features': ['sqrt', 'log2'] + list(np.linspace(0.1, 1.0, 10)),  # floats are treated as fractions of the features
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']  # valid loss functions for RandomForestRegressor
}
rf_model = RandomForestRegressor(random_state=42)
# Coarse hyperparameter tuning with randomized search
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=50,  # keep the first pass relatively small
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    error_score='raise'
)
random_search.fit(X_train, y_train)
# Print the best parameters from the coarse search
print("Initial Best Parameters:", random_search.best_params_)
# Fine-grained hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [random_search.best_params_['n_estimators']],
    'max_depth': [random_search.best_params_['max_depth']],
    'min_samples_split': [random_search.best_params_['min_samples_split']],
    'min_samples_leaf': [random_search.best_params_['min_samples_leaf']],
    'bootstrap': [True],
    'max_features': [random_search.best_params_['max_features']],
    'criterion': [random_search.best_params_['criterion']]
}
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
    scoring='neg_mean_squared_error'
)
grid_search.fit(X_train, y_train)
# Print the final best parameters
print("Final Best Parameters:", grid_search.best_params_)
# Train the model with the best parameters
best_rf_model = grid_search.best_estimator_
# Cross-validated evaluation
cv_scores_rmse = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_scores_mae = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
cv_scores_r2 = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='r2')
print(f'Random Forest CV RMSE: {np.sqrt(-cv_scores_rmse.mean()):.2f}')  # scores are negative MSE, so negate before taking the square root
print(f'Random Forest CV MAE: {-cv_scores_mae.mean():.2f}')
print(f'Random Forest CV R^2: {cv_scores_r2.mean():.4f}')
# Predict on the test set
y_pred_rf = best_rf_model.predict(X_test)
# Compute several evaluation metrics
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mape_rf = mean_absolute_percentage_error(y_test, y_pred_rf)
r2_test = r2_score(y_test, y_pred_rf)
print(f'Random Forest Test RMSE: {rmse_rf:.2f}')
print(f'Random Forest Test MAE: {mae_rf:.2f}')
print(f'Random Forest Test MAPE: {mape_rf:.4f}')
print(f'Random Forest Test R^2: {r2_test:.4f}')
# Feature importance analysis
importances = best_rf_model.feature_importances_
feature_importances = pd.Series(importances, index=X_poly_df.columns)
feature_importances.sort_values(ascending=False).plot(kind='bar', figsize=(10, 6))
plt.title('特征重要性')
plt.show()
# Plot predicted vs. actual values
plt.figure(figsize=(10, 6))
plt.plot(y_test.values, label='实际值', color='blue', marker='o')
plt.plot(y_pred_rf, label='RF预测值', color='red', marker='x')
plt.title('实际值 vs 随机森林预测值')
plt.xlabel('样本索引')
plt.ylabel('日需求负荷 (KWh)')
plt.legend()
plt.grid(True)
plt.show()
# Evaluate with the out-of-bag score (only available if the model was trained with oob_score=True)
if hasattr(best_rf_model, 'oob_score_'):
    print(f'OOB Score: {best_rf_model.oob_score_:.4f}')
# Model ensembling: Bagging and Stacking
bagging_model = BaggingRegressor(estimator=best_rf_model, n_estimators=10, random_state=42)
bagging_model.fit(X_train, y_train)
xgb_model = XGBRegressor(objective='reg:squarederror', n_jobs=-1)
lgbm_model = LGBMRegressor(boosting_type='gbdt', objective='regression', n_jobs=-1)
stacking_model = StackingRegressor(
regressors=[best_rf_model, xgb_model, lgbm_model],
meta_regressor=RandomForestRegressor(random_state=42)
)
stacking_model.fit(X_train, y_train)
# Predict on the test set with the ensembles
y_pred_bagging = bagging_model.predict(X_test)
y_pred_stacking = stacking_model.predict(X_test)
# Compute the evaluation metrics for the ensemble models
rmse_bagging = np.sqrt(mean_squared_error(y_test, y_pred_bagging))
mae_bagging = mean_absolute_error(y_test, y_pred_bagging)
mape_bagging = mean_absolute_percentage_error(y_test, y_pred_bagging)
r2_bagging = r2_score(y_test, y_pred_bagging)
rmse_stacking = np.sqrt(mean_squared_error(y_test, y_pred_stacking))
mae_stacking = mean_absolute_error(y_test, y_pred_stacking)
mape_stacking = mean_absolute_percentage_error(y_test, y_pred_stacking)
r2_stacking = r2_score(y_test, y_pred_stacking)
print(f'Bagging Test RMSE: {rmse_bagging:.2f}, MAE: {mae_bagging:.2f}, MAPE: {mape_bagging:.4f}, R^2: {r2_bagging:.4f}')
print(f'Stacking Test RMSE: {rmse_stacking:.2f}, MAE: {mae_stacking:.2f}, MAPE: {mape_stacking:.4f}, R^2: {r2_stacking:.4f}')
# Plot the ensemble predictions against the actual values
plt.figure(figsize=(10, 6))
plt.plot(y_test.values, label='实际值', color='blue', marker='o')
plt.plot(y_pred_bagging, label='Bagging预测值', color='green', marker='^')
plt.plot(y_pred_stacking, label='Stacking预测值', color='purple', marker='s')
plt.title('实际值 vs 集成模型预测值')
plt.xlabel('样本索引')
plt.ylabel('日需求负荷 (KWh)')
plt.legend()
plt.grid(True)
plt.show()
```
### Key fixes
1. **Corrected the `BaggingRegressor` constructor argument**:
   - Pass `estimator` instead of `base_estimator`.
2. **Checked the remaining parameters**:
   - The other argument names were verified against the current scikit-learn and mlxtend APIs so the script runs as intended.
With these fixes, `BaggingRegressor` and `StackingRegressor` should work as expected. If anything else breaks or needs further tuning, let me know!
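If you are unsure which scikit-learn version you have installed, a quick check from Python (just a convenience snippet, nothing project-specific):

```python
import sklearn

# `base_estimator` was deprecated in scikit-learn 1.2 and removed in 1.4
print(sklearn.__version__)
```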
### Further optimization suggestions
1. **Shrink the search space**: if tuning takes too long, narrow the parameter ranges or reduce the number of search iterations.
2. **Use `GridSearchCV` for the key parameters**: a finer grid around the values found by the randomized search can squeeze out a bit more performance.
3. **Ensemble several models**: combining models (e.g. Bagging, Stacking) tends to improve robustness and generalization; see the sketch below for a scikit-learn-only alternative to mlxtend's `StackingRegressor`.
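For the third point, if installing `mlxtend` is inconvenient, scikit-learn has shipped its own `StackingRegressor` since version 0.22. Below is a rough sketch of the equivalent setup; it reuses `best_rf_model`, `xgb_model`, `lgbm_model`, `X_train`, `y_train`, and `X_test` from the script above, and the `estimators`/`final_estimator` argument names follow scikit-learn's API rather than mlxtend's:

```python
from sklearn.ensemble import RandomForestRegressor, StackingRegressor

# scikit-learn's native stacking: base estimators are passed as (name, estimator) pairs
stacking_model = StackingRegressor(
    estimators=[
        ('rf', best_rf_model),
        ('xgb', xgb_model),
        ('lgbm', lgbm_model),
    ],
    final_estimator=RandomForestRegressor(random_state=42),
    n_jobs=-1,
)
stacking_model.fit(X_train, y_train)
y_pred_stacking = stacking_model.predict(X_test)
```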