import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
train_df = pd.read_csv("D:/python/dataset/DataScienceLondon/train.csv", header=None)
df_test = pd.read_csv("D:/python/dataset/DataScienceLondon/test.csv", header=None)
label_df = pd.read_csv("D:/python/dataset/DataScienceLondon/trainLabels.csv", header=None)
df_train = pd.merge(train_df, label_df, left_index=True, right_index=True)
print(df_train.head())
print(f"Training data shape is: {df_train.shape}")
print(f"Testing data shape is: {df_test.shape}")
describe = df_train.describe()
print(describe)
null_sum = df_train.isnull().sum()
print(null_sum)
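# Quick sanity check (my addition): accuracy is only a fair metric if the
# binary labels are roughly balanced.
print(label_df[0].value_counts())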
def calculate_outlier_percentage(dataframe):
    """Return the percentage of values outside the 1.5*IQR fences for each column."""
    outlier_percentage = {}
    for i, feature in enumerate(dataframe.columns):
        Q1 = dataframe[feature].quantile(0.25)
        Q3 = dataframe[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        num_outliers = len(dataframe[(dataframe[feature] < lower_bound) | (dataframe[feature] > upper_bound)])
        percentage = (num_outliers / len(dataframe)) * 100
        outlier_percentage[f'feature {i}'] = percentage
    return outlier_percentage
# Exclude the label column so only the 40 features are profiled.
unscaled_outlier_percentage = calculate_outlier_percentage(df_train.iloc[:, :-1])
print(unscaled_outlier_percentage)
df_outliers = pd.DataFrame.from_dict(unscaled_outlier_percentage, orient='index', columns=['Percentage'])
plt.figure(figsize=(10, 8))
plt.barh(df_outliers.index, df_outliers['Percentage'])
plt.title('Outlier Percentage for Each Feature')
plt.xlabel('Percentage')
plt.ylabel('Feature')
plt.show()
# Name the merged columns: 40 numeric features plus the binary label.
df_train.columns = list(range(40)) + ['target']
features_with_outliers = np.arange(40)
scaler = RobustScaler()
df_train_scaled = df_train.copy()
df_train_scaled[features_with_outliers] = scaler.fit_transform(df_train[features_with_outliers])
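# RobustScaler centers each feature on its median and scales by its IQR, so
# the 1.5*IQR fences recomputed below are distorted far less by extreme
# values than mean/std scaling would be.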
scaled_outlier_percentage = calculate_outlier_percentage(df_train_scaled.iloc[:, :-1])
df_outliers_scaled = pd.DataFrame.from_dict(scaled_outlier_percentage, orient='index', columns=['Percentage'])
plt.figure(figsize=(10, 8))
plt.barh(df_outliers_scaled.index, df_outliers_scaled['Percentage'])
plt.title('Outlier Percentage for Each Feature After Scaling')
plt.xlabel('Percentage')
plt.ylabel('Feature')
plt.show()
X_train, X_test, y_train, y_test = train_test_split(df_train_scaled.iloc[:, :-1], df_train_scaled['target'],
                                                    test_size=0.2, random_state=42)
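# Optional (my suggestion, not in the original): passing
# stratify=df_train_scaled['target'] to train_test_split keeps the class
# balance identical in both splits.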
training_score = []
testing_score = []
def model_prediction(model):
    """Fit a model and record train/test accuracy for the comparison table."""
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    training_score.append(train_accuracy)
    testing_score.append(test_accuracy)
    print(f"{model.__class__.__name__} training accuracy: {train_accuracy}")
    print(f"{model.__class__.__name__} testing accuracy: {test_accuracy}")
model_prediction(SVC())
model_prediction(RandomForestClassifier())
model_prediction(AdaBoostClassifier())
model_prediction(GradientBoostingClassifier())
model_prediction(LGBMClassifier())
model_prediction(XGBClassifier())
model_prediction(CatBoostClassifier(verbose=False))
models = ["SVC", "RandomForest", "AdaBoost", "GradientBoost", "LGBM", "XGB", "CatBoost"]
df = pd.DataFrame({'Model': models, 'Training Accuracy': training_score, 'Testing Accuracy': testing_score})
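# Display the comparison table (my addition; the original builds it without
# printing), sorted by held-out accuracy.
print(df.sort_values('Testing Accuracy', ascending=False))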
model1 = LGBMClassifier()
parameters1 = {"n_estimators": [100, 200, 300, 400, 500],
               "learning_rate": [0.01, 0.05, 0.1, 0.5, 1],
               "random_state": [42],
               "num_leaves": [16, 17, 18]}
grid_search1 = GridSearchCV(model1, parameters1, scoring='accuracy', n_jobs=-1, cv=5)
grid_search1.fit(X_train, y_train)
print(f"grid_search1.best_score_: {grid_search1.best_score_}")
best_parameters1 = grid_search1.best_params_
print(best_parameters1)
model1 = LGBMClassifier(**best_parameters1)
model1.fit(X_train, y_train)
X_test_pred1 = model1.predict(X_test)
print(f"Tuned LGBM test accuracy: {accuracy_score(y_test, X_test_pred1)}")
model2 = CatBoostClassifier(verbose=False)
parameters2 = {"learning_rate": np.arange(0.1, 0.7, 0.1),  # explicit step: the default step of 1 would yield only [0.1]
               "random_state": [42],
               "depth": [8, 9, 10],
               "iterations": [35, 40, 50]}
grid_search2 = GridSearchCV(model2, parameters2, cv=5, n_jobs=-1, scoring='accuracy')
grid_search2.fit(X_train, y_train)
print(f"grid_search2.best_score_: {grid_search2.best_score_}")
best_parameters2 = grid_search2.best_params_
print(best_parameters2)
model2 = CatBoostClassifier(**best_parameters2, verbose=False)
model2.fit(X_train, y_train)
X_test_pred2 = model2.predict(X_test)
print(f"Tuned CatBoost test accuracy: {accuracy_score(y_test, X_test_pred2)}")
model3 = XGBClassifier()
parameters3 = {"n_estimators": [50, 100, 150],
               "random_state": [42],
               "learning_rate": [0.1, 0.3, 0.5, 1.0]}
grid_search3 = GridSearchCV(model3, parameters3, cv=5, n_jobs=-1, scoring='accuracy')
grid_search3.fit(X_train, y_train)
print(f"grid_search3.best_score_: {grid_search3.best_score_}")
best_parameters3 = grid_search3.best_params_
print(best_parameters3)
model3 = XGBClassifier(**best_parameters3)
model3.fit(X_train, y_train)
X_test_pred3 = model3.predict(X_test)
print(f"Tuned XGBoost test accuracy: {accuracy_score(y_test, X_test_pred3)}")
model4 = RandomForestClassifier()
parameters4 = {'n_estimators': [100, 300, 500, 550],
               'min_samples_split': [7, 8, 9],
               'max_depth': [10, 11, 12],
               'min_samples_leaf': [4, 5, 6]}
grid_search4 = GridSearchCV(model4, parameters4, cv=5, n_jobs=-1, scoring='accuracy')
grid_search4.fit(X_train, y_train)
best_parameters4 = grid_search4.best_params_
model4 = RandomForestClassifier(**best_parameters4)
model4.fit(X_train, y_train)
X_test_pred4 = model4.predict(X_test)
print(f"Tuned RandomForest test accuracy: {accuracy_score(y_test, X_test_pred4)}")
stacking_model = StackingClassifier(estimators=[('LGBM', model1),
                                                ('CatBoost', model2),
                                                ('XGBoost', model3),
                                                ('RandomForest', model4)])
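# Note: with no final_estimator given, StackingClassifier trains the base
# models with 5-fold internal CV and feeds their predictions to a
# LogisticRegression meta-learner.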
stacking_model.fit(X_train, y_train)
X_train_pred5 = stacking_model.predict(X_train)
X_test_pred5 = stacking_model.predict(X_test)
print(f'Stacking model training accuracy is {accuracy_score(y_train, X_train_pred5)}')
print(f'Stacking model testing accuracy is {accuracy_score(y_test, X_test_pred5)}')
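# Optional sanity check (my addition, not part of the original pipeline): a
# 5-fold cross-validated score over the full training set depends less on the
# single 80/20 split used above.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(stacking_model, df_train_scaled.iloc[:, :-1], df_train_scaled['target'],
                            cv=5, scoring='accuracy', n_jobs=-1)
print(f"Stacking 5-fold CV accuracy: {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}")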
# Apply the scaler fitted on the training features to the test set as well;
# predicting on unscaled test data would not match what the models saw in training.
df_test_scaled = pd.DataFrame(scaler.transform(df_test), columns=df_test.columns)
df_preds = pd.DataFrame({'Id': range(1, len(df_test) + 1), 'Solution': stacking_model.predict(df_test_scaled)})
df_preds.to_csv('submission.csv', index=False)
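# Preview the submission to confirm the expected (Id, Solution) format.
print(df_preds.head())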
# Summary: I compared training and testing accuracy across several machine
# learning algorithms, searched for each algorithm's best hyperparameters,
# and finally combined the tuned models with stacking to produce the final
# predictions.