# Look through unique values in each categorical column
categorical_cols = train_df.select_dtypes(include="object").columns.tolist()
for col in categorical_cols:
    unique_values = train_df[col].unique().tolist()
    print(f"{col}", f"Number of unique entries: {len(unique_values)},")
    print(unique_values)


def plot_bar_chart(df, columns, grid_rows, grid_cols, x_label='', y_label='',
                   title='', whole_numbers_only=False, count_labels=True,
                   as_percentage=True):
    """Draw one bar chart of value counts per column on a shared figure.

    Args:
        df: DataFrame holding the data to plot.
        columns: Names of the columns in ``df`` to plot, one subplot each.
        grid_rows: Kept for backward compatibility. The number of rows that
            is actually created is derived from ``len(columns)`` and
            ``grid_cols``.
        grid_cols: Number of subplot columns in the figure grid.
        x_label: Label applied to the x axis of every subplot.
        y_label: Label applied to the y axis of every subplot.
        title: Title prefix; each subplot is titled ``"<title> - <column>"``.
        whole_numbers_only: If true, keep only values ``v`` with
            ``v % 1 == 0`` before counting.
        count_labels: If true, annotate every bar.
        as_percentage: If true the annotation is the share of rows in
            percent (one decimal); otherwise it is the raw count.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    num_plots = len(columns)
    if num_plots == 0:
        # Nothing to draw; plt.subplots() rejects a grid with zero rows.
        return

    if num_plots == 1:
        fig, axes = plt.subplots(1, 1, figsize=(12, 8))
        axes = [axes]  # Wrap the single axes in a list for consistent handling
    else:
        num_rows = math.ceil(num_plots / grid_cols)
        fig, axes = plt.subplots(num_rows, grid_cols, figsize=(12, 8))
        axes = axes.ravel()  # Flatten the axes array to iterate over it

    for ax, column in zip(axes, columns):
        df_column = df[column]
        if whole_numbers_only:
            df_column = df_column[df_column % 1 == 0]

        # Count once and reuse; value_counts() sorts by descending frequency.
        counts = df_column.value_counts()
        x = counts.index.tolist()
        y = counts.tolist()
        ax.bar(x, y, color='blue', alpha=0.5)

        # Put a tick on every integer between the smallest and largest value.
        # Non-integer or non-numeric categories (and empty columns) cannot
        # form a range, so they keep matplotlib's default ticks.
        try:
            ax.set_xticks(range(min(x), max(x) + 1))
        except (TypeError, ValueError):
            pass

        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(title + ' - ' + column)

        if count_labels:
            percent_labels = (
                df_column.value_counts(normalize=True)
                .mul(100).round(1).astype(str) + '%'
            )
            for category, count in counts.items():
                # The trailing newline lifts the text just above the bar top.
                if as_percentage:
                    ax.annotate(f'{percent_labels[category]}\n',
                                xy=(category, count),
                                ha='center', va='center', size=8)
                else:
                    ax.annotate(f'{count}\n', xy=(category, count),
                                ha='center', va='center')

    # Remove empty subplots if present. Iterate over the axes that were
    # really created rather than grid_rows * grid_cols, which may be larger.
    for unused_ax in axes[num_plots:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

时间: 2024-04-02 08:34:27 浏览: 317

这段代码定义了一个名为plot_bar_chart的函数，它可以绘制柱状图。函数的输入包括一个数据框（df）、一个列名的列表（columns）、网格的行数和列数（grid_rows和grid_cols）、x轴和y轴标签（x_label和y_label）、标题（title）、是否只显示整数（whole_numbers_only）、是否在图上显示数据标签（count_labels）、以及是否以百分比形式显示数据标签（as_percentage）。在函数内部，它首先计算需要绘制的子图的数量，然后根据子图数量和网格的列数计算所需的行数并创建子图。接下来，对于每个给定的列名，函数从数据框中选择该列，并将其存储在变量df_column中。如果whole_numbers_only为True，则函数只保留取值为整数的数据。接着，函数将列中每个唯一值本身作为x轴的值，将该值的出现次数作为y轴的值。最后，函数在每个子图上绘制柱状图，并将数据标签（计数或百分比）添加到每个条形上。如果要绘制的列数少于网格中的子图数量，则函数会删除多余的空子图以使图像更清晰。

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_kmo
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import classification_report


# ======================
# 1. Data loading and cleaning
# ======================
def load_and_clean(data_path):
    """Read the CSV at ``data_path``, impute missing values and cap outliers.

    Args:
        data_path: Path of the CSV file passed to ``pd.read_csv``.

    Returns:
        The cleaned DataFrame.
    """
    df = pd.read_csv(data_path)

    # Missing values: median for numeric fields, mode for categorical fields.
    num_imputer = SimpleImputer(strategy='median')
    cat_imputer = SimpleImputer(strategy='most_frequent')

    numeric_cols = ['付费金额', '活跃时长', '广告收入', '留存']
    df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])

    categorical_cols = ['设备价值档位', '用户初始广告档位']
    df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

    # Outliers: cap the active duration at 24.
    df['活跃时长'] = df['活跃时长'].clip(upper=24)

    # Outliers: cap the paid amount at its 99th percentile. Capping at the
    # same quantile that is used as the threshold keeps the ordering of
    # values intact (a lower replacement value would push the largest payers
    # below users who sit just under the threshold).
    pay_cap = df['付费金额'].quantile(0.99)
    df['付费金额'] = df['付费金额'].clip(upper=pay_cap)

    return df


# ======================
# 2. Feature engineering
# ======================
def feature_engineering(df):
    """Add composite features and ordinal-encode the categorical columns.

    Args:
        df: DataFrame produced by ``load_and_clean``; modified in place.

    Returns:
        The same DataFrame with the new and encoded columns.
    """
    # Composite features
    df['ARPU密度'] = df['付费金额'] / (df['活跃天数'] + 1)

    # Rows with a zero view count get a rate of 0 instead of inf/NaN, which
    # the downstream estimators would reject.
    ad_rate = df['广告曝光次数'] / df['短剧观看次数']
    df['广告展示率'] = ad_rate.where(df['短剧观看次数'] != 0, 0.0)

    # Weighted interaction score: favourites x1, shares x2, comments x3.
    df['内容互动指数'] = np.log1p(
        df['收藏数'] * 1 + df['分享数'] * 2 + df['评论数'] * 3
    )

    # Categorical encoding
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                             unknown_value=-1)
    cat_cols = ['设备价值档位', '用户初始广告档位']
    df[cat_cols] = encoder.fit_transform(df[cat_cols])

    return df


# ======================
# 3. Feature screening and dimensionality reduction
# ======================
def feature_selection(df, target_col='付费意愿档位'):
    """Filter features by mutual information, then optionally factor-analyse.

    Args:
        df: DataFrame containing the features and the target column.
        target_col: Name of the target column.

    Returns:
        A tuple ``(X_factors, y)``. ``X_factors`` holds the factor scores
        when the overall KMO statistic exceeds 0.6, otherwise a copy of the
        selected features. ``y`` is the target column.
    """
    from sklearn.feature_selection import mutual_info_classif

    # Split features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # NOTE(review): the score below is mutual information, used here in the
    # role the original comment calls "IV"; the selection also runs on the
    # full data before the train/test split -- confirm that is intended.
    # random_state makes the selection reproducible between runs.
    iv_values = mutual_info_classif(X, y, random_state=42)
    iv_df = pd.DataFrame({'feature': X.columns, 'iv': iv_values})
    selected_features = iv_df[iv_df['iv'] > 0.02]['feature'].tolist()
    X_selected = X[selected_features]

    # Factor analysis needs at least two variables to correlate.
    if X_selected.shape[1] < 2:
        return X_selected.copy(), y

    # Factor analysis, only when the data is suitable (KMO > 0.6)
    kmo_all, kmo_model = calculate_kmo(X_selected)
    if kmo_model > 0.6:
        # Never request more factors than there are selected features.
        n_factors = min(5, X_selected.shape[1])
        fa = FactorAnalyzer(n_factors=n_factors, rotation='varimax')
        fa.fit(X_selected)
        factor_scores = fa.transform(X_selected)
        factor_cols = [f'Factor_{i}' for i in range(1, n_factors + 1)]
        # Keep the original row index so the scores stay aligned with y.
        X_factors = pd.DataFrame(factor_scores, columns=factor_cols,
                                 index=X_selected.index)
    else:
        X_factors = X_selected.copy()

    return X_factors, y


# ======================
# 4. XGBoost feature screening
# ======================
def xgb_feature_importance(X, y):
    """Keep the (up to) ten features ranked highest by an XGBoost model.

    Args:
        X: Feature DataFrame.
        y: Target labels.

    Returns:
        ``X`` restricted to the top-ranked columns, most important first.
    """
    # Train the XGBoost model
    model = XGBClassifier(
        objective='multi:softmax',
        eval_metric='mlogloss',
        use_label_encoder=False
    )
    model.fit(X, y)

    # Rank the features by importance
    importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    top10_features = importance.head(10)['feature'].tolist()
    return X[top10_features]


# ======================
# 5. LightGBM modelling and prediction
# ======================
def lgb_modeling(X, y):
    """Train a LightGBM classifier and print its hold-out report.

    Args:
        X: Feature DataFrame.
        y: Target labels.

    Returns:
        The fitted ``lgb.LGBMClassifier``.
    """
    # Stratified 80/20 split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Model training
    model = lgb.LGBMClassifier(
        num_leaves=31,
        max_depth=5,
        learning_rate=0.1,
        n_estimators=300,
        class_weight='balanced'
    )
    model.fit(X_train, y_train)

    # Model evaluation
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

    return model


# ======================
# Main pipeline
# ======================
if __name__ == "__main__":
    # Data path
    DATA_PATH = "user_data.csv"

    # Run the pipeline
    df = load_and_clean(DATA_PATH)
    df = feature_engineering(df)
    X, y = feature_selection(df)
    X_top10 = xgb_feature_importance(X, y)
    final_model = lgb_modeling(X_top10, y)

    # Save the model
    final_model.booster_.save_model('user_value_model.txt')

<think>嗯，用户想优化他们的机器学习代码，里面用到了Pandas、NumPy、Scikit-learn、XGBoost和LightGBM。首先，我需要回忆一下这些库的常见优化方法。比如，Pandas的内存优化，可能涉及到数据类型转换，比如把category类型用上，或者用更节省内存的数据类型。然后，数据预处理部分，像缺失值处理，是否有更高效的方法？比如用SimpleImputer或者直接fillna。接着是特征工程，用户有没有提到特征数量太多？可能需要降维，比如PCA或者特征选择方法，比如SelectKBest。还有，分类特征的处理，比如OneHotEncoder可能会增加维度，也许用OrdinalEncoder或者TargetEncoding更高效？然后是模型训练部分，Scikit-learn的Pipeline可以整合预处理和模型，减少代码量，同时避免数据泄露。交叉验证的设置是否正确？比如用StratifiedKFold来处理类别不平衡。超参数优化方面，用户可能用的是GridSearchCV，但如果是参数空间大，可以考虑RandomizedSearchCV或者贝叶斯优化，这样更快。 XGBoost和LightGBM的参数设置有没有优化空间？比如调整n_estimators和learning_rate，或者使用早停法来防止过拟合。并行计算方面，XGBoost的n_jobs参数设置是否正确？LightGBM的categorical_feature参数有没有正确指定，避免重复编码？代码结构方面，是否有重复的部分可以封装成函数？比如数据预处理和模型训练的部分。内存管理，比如删除不再使用的变量，或者使用gc.collect()来释放内存。还有，是否使用了最新的库版本，因为新版本可能有性能改进。另外，评估指标是否合适？比如分类问题用AUC，回归问题用RMSE。模型解释性方面，可以用SHAP或LIME来增强可解释性。最后，代码的可读性和维护性，比如添加注释，使用有意义的变量名，模块化代码结构。可能需要检查用户是否有提到具体的问题，比如运行速度慢，内存不足，或者准确率不高。根据不同的痛点，优化方向会不同。例如，如果内存不够，重点在数据类型的优化和降维；如果速度慢，可能需要并行处理或更高效的算法。另外，确保引用的库函数是最佳实践，比如使用fit_transform而不是分开fit和transform，避免冗余计算。还要考虑数据量的大小，如果数据非常大，可能需要分块处理或者使用Dask来替代Pandas。对于模型部分，是否可以使用增量学习，或者更轻量级的模型。最后，验证优化效果，通过时间统计和内存使用情况对比，确保优化确实有效。</think>我将根据代码优化原则和机器学习最佳实践，分步骤分析可能存在的优化点并提供改进建议： ### 一、内存与数据处理优化 1. **数据类型优化** 使用`category`类型处理低基数特征可减少内存占用： ```python cat_cols = ['gender', 'education_level'] df[cat_cols] = df[cat_cols].astype('category') ``` 2. **高效缺失值处理** 推荐使用Scikit-learn的$SimpleImputer$进行统一处理： ```python from sklearn.impute import SimpleImputer num_imputer = SimpleImputer(strategy='median') X[num_cols] = num_imputer.fit_transform(X[num_cols]) ``` ### 二、特征工程优化 3. **分类特征编码优化** 使用$OrdinalEncoder$替代独热编码减少维度爆炸： ```python from sklearn.preprocessing import OrdinalEncoder encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1) X[cat_cols] = encoder.fit_transform(X[cat_cols]) ``` 4. 
**特征选择** 使用方差阈值或互信息法筛选有效特征： ```python from sklearn.feature_selection import SelectKBest, mutual_info_classif selector = SelectKBest(mutual_info_classif, k=20) X_selected = selector.fit_transform(X, y) ``` ### 三、模型训练优化 5. **构建处理管道** 使用$Pipeline$整合预处理流程： ```python from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer preprocessor = ColumnTransformer([ ('num', num_imputer, num_cols), ('cat', encoder, cat_cols) ]) pipeline = Pipeline([ ('preprocessor', preprocessor), ('classifier', XGBClassifier()) ]) ``` 6. **交叉验证优化** 推荐使用分层交叉验证： ```python from sklearn.model_selection import StratifiedKFold cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) ``` ### 四、模型参数优化 7. **XGBoost/LightGBM参数调整** 设置早停法防止过拟合： ```python xgb_params = { 'n_estimators': 2000, 'learning_rate': 0.05, 'early_stopping_rounds': 50, 'eval_metric': 'auc' } ``` 8. **并行计算设置** 充分利用多核CPU： ```python lgbm_params = { 'num_threads': 8, 'device': 'gpu' # 如果支持GPU加速 } ``` ### 五、代码结构优化 9. **内存管理** 及时释放无用变量： ```python del train_df import gc gc.collect() ``` 10. **版本检查** 确保使用优化后的库版本： ```python import xgboost print(xgboost.__version__) # 推荐 >=1.6.0 ``` ### 六、评估与监控 11. **添加评估指标** 推荐使用更全面的评估方法： ```python from sklearn.metrics import RocCurveDisplay, classification_report RocCurveDisplay.from_estimator(model, X_test, y_test) print(classification_report(y_test, y_pred)) ``` 12. **特征重要性分析** 增强模型可解释性： ```python import shap explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(X) ``` ### 优化效果验证建议 1. 使用$timeit$模块测量代码段执行时间 2. 通过$memory_profiler$监控内存使用 3. 采用超参数优化工具（如Optuna）进行自动调参[^2]

def median_target(var):
    """Return the median of ``var`` per ``Outcome`` class.

    Reads the module-level ``data`` frame and ignores rows where ``var`` is
    null.

    Args:
        var: Name of the column to summarise.

    Returns:
        A DataFrame with one row per ``Outcome`` value and the columns
        ``Outcome`` and ``var`` (the class median).
    """
    temp = data[data[var].notnull()]
    temp = temp[[var, 'Outcome']].groupby(['Outcome'])[[var]].median().reset_index()
    return temp


# Fill each missing value with the median of its own Outcome class. The
# medians are computed from the data via median_target() instead of being
# typed in by hand, so a mistyped constant cannot end up in the features.
for fill_col in ['Insulin', 'Glucose', 'SkinThickness', 'BloodPressure', 'BMI']:
    class_medians = median_target(fill_col).set_index('Outcome')[fill_col]
    data[fill_col] = data[fill_col].fillna(data['Outcome'].map(class_medians))

target_col = ["Outcome"]
unique_counts = data.nunique()

# Low-cardinality columns are treated as categorical; the target is excluded
# so it can never be one-hot encoded away.
cat_cols = [x for x in unique_counts[unique_counts < 12].keys().tolist()
            if x not in target_col]
# Numerical columns
num_cols = [x for x in data.columns if x not in cat_cols + target_col]
# Binary columns with 2 values
bin_cols = unique_counts[unique_counts == 2].keys().tolist()
# Columns with more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]

# Label encoding binary columns
le = LabelEncoder()
for i in bin_cols:
    data[i] = le.fit_transform(data[i])

# One-hot (dummy) columns for multi-value columns
data = pd.get_dummies(data=data, columns=multi_cols)

# Scaling numerical columns; reuse data's index so that the index-based
# merge below lines the rows up even when the index is not 0..n-1.
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
scaled = pd.DataFrame(scaled, columns=num_cols, index=data.index)

# Dropping original values, merging scaled values for numerical columns
df_data_og = data.copy()
data = data.drop(columns=num_cols)
data = data.merge(scaled, left_index=True, right_index=True, how="left")

# Define X and y
X = data.drop('Outcome', axis=1)
y = data['Outcome']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, shuffle=True, random_state=1
)
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

这段代码是在进行数据预处理。首先定义了一个函数`median_target`，用于计算每个`Outcome`类别中某个特征的中位数。然后根据不同的`Outcome`值，对各特征的缺失值进行填充。接着对数据集中的分类特征进行编码：对二元特征使用`LabelEncoder`进行标签编码，对多元特征使用`pd.get_dummies`进行独热编码。随后，对数值特征进行标准化处理，并将处理后的数据集拆分为训练集和测试集。最后，使用`to_categorical`将训练集和测试集的标签转换为独热形式。

阅读全文

相关推荐

keras.utils.to_categorical和one hot格式解析

解决keras,val_categorical_accuracy:,0.0000e+00问题

浅谈keras中的keras.utils.to_categorical用法

AttributeError: 'DataFrame' object has no attribute 'categorical_cols'

胃癌数据库的构建与临床应用的开题报告.docx

大家在看

大唐杯仿真介绍.zip

桌面便签_SimpleStickyNotes.zip

美敦力BIS监护仪串口通讯协议手册

PFC与Fipy耦合技术：基于三角网格单元的双向流固耦合双轴压缩模拟,基于PFC流固耦合原理的双向耦合模拟技术：PFC与Fipy结合，三角网格单元实现渗流与双轴压缩模拟的双向交互作用 ,PFC流固耦合

KR C4 电气元器件检修服务 - 系统技术.pdf

最新推荐

胃癌数据库的构建与临床应用的开题报告.docx

plc可编程控制器综合实训装置.doc

计算机网络期末考试题及答案.doc

计算机档案管理系统的设计与实现分析.docx

全面解析SOAP库包功能与应用

编程语言选择指南：为不同项目量身定制的编程语言策略

手写vue2的插件vue-router

《软件工程：实践者的方法》第6版课件解析

QUARTUS II 13.0全攻略：新手到专家的10个必备技能

IllegalArgumentException.class