import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
train_df = pd.read_csv("D:/python/dataset/DataScienceLondon/train.csv", header=None)
df_test = pd.read_csv("D:/python/dataset/DataScienceLondon/test.csv", header=None)
label_df = pd.read_csv("D:/python/dataset/DataScienceLondon/trainLabels.csv", header=None)
df_train = pd.merge(train_df, label_df, left_index=True, right_index=True)
print(df_train.head())
print(f"Training data shape is: {df_train.shape}")
print(f"Testing data shape is: {df_test.shape}")
describe = df_train.describe()
print(describe)
null_sum = df_train.isnull().sum()
print(null_sum)
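# Quick sanity check (my addition): accuracy is only a fair metric if the
# binary labels are roughly balanced.
print(label_df[0].value_counts())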
def calculate_outlier_percentage(dataframe):
    """Return the percentage of values outside the 1.5*IQR fences for each column."""
    outlier_percentage = {}
    for i, feature in enumerate(dataframe.columns):
        Q1 = dataframe[feature].quantile(0.25)
        Q3 = dataframe[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        num_outliers = len(dataframe[(dataframe[feature] < lower_bound) | (dataframe[feature] > upper_bound)])
        percentage = (num_outliers / len(dataframe)) * 100
        outlier_percentage[f'feature {i}'] = percentage
    return outlier_percentage
# Exclude the label column so only the 40 features are profiled.
unscaled_outlier_percentage = calculate_outlier_percentage(df_train.iloc[:, :-1])
print(unscaled_outlier_percentage)
df_outliers = pd.DataFrame.from_dict(unscaled_outlier_percentage, orient='index', columns=['Percentage'])
plt.figure(figsize=(10, 8))
plt.barh(df_outliers.index, df_outliers['Percentage'])
plt.title('Outlier Percentage for Each Feature')
plt.xlabel('Percentage')
plt.ylabel('Feature')
plt.show()
# Name the merged columns: 40 numeric features plus the binary label.
df_train.columns = list(range(40)) + ['target']
features_with_outliers = np.arange(40)
scaler = RobustScaler()
df_train_scaled = df_train.copy()
df_train_scaled[features_with_outliers] = scaler.fit_transform(df_train[features_with_outliers])
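# RobustScaler centers each feature on its median and scales by its IQR, so
# the 1.5*IQR fences recomputed below are distorted far less by extreme
# values than mean/std scaling would be.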
scaled_outlier_percentage = calculate_outlier_percentage(df_train_scaled.iloc[:, :-1])
df_outliers_scaled = pd.DataFrame.from_dict(scaled_outlier_percentage, orient='index', columns=['Percentage'])
plt.figure(figsize=(10, 8))
plt.barh(df_outliers_scaled.index, df_outliers_scaled['Percentage'])
plt.title('Outlier Percentage for Each Feature After Scaling')
plt.xlabel('Percentage')
plt.ylabel('Feature')
plt.show()
X_train, X_test, y_train, y_test = train_test_split(df_train_scaled.iloc[:, :-1], df_train_scaled['target'],
                                                    test_size=0.2, random_state=42)
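# Optional (my suggestion, not in the original): passing
# stratify=df_train_scaled['target'] to train_test_split keeps the class
# balance identical in both splits.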
training_score = []
testing_score = []
def model_prediction(model):
    """Fit a model and record train/test accuracy for the comparison table."""
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    training_score.append(train_accuracy)
    testing_score.append(test_accuracy)
    print(f"{model.__class__.__name__} training accuracy: {train_accuracy}")
    print(f"{model.__class__.__name__} testing accuracy: {test_accuracy}")
model_prediction(SVC())
model_prediction(RandomForestClassifier())
model_prediction(AdaBoostClassifier())
model_prediction(GradientBoostingClassifier())
model_prediction(LGBMClassifier())
model_prediction(XGBClassifier())
model_prediction(CatBoostClassifier(verbose=False))
models = ["SVC", "RandomForest", "AdaBoost", "GradientBoost", "LGBM", "XGB", "CatBoost"]
df = pd.DataFrame({'Model': models, 'Training Accuracy': training_score, 'Testing Accuracy': testing_score})
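# Display the comparison table (my addition; the original builds it without
# printing), sorted by held-out accuracy.
print(df.sort_values('Testing Accuracy', ascending=False))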
model1 = LGBMClassifier()
parameters1 = {"n_estimators": [100, 200, 300, 400, 500],
               "learning_rate": [0.01, 0.05, 0.1, 0.5, 1],
               "random_state": [42],
               "num_leaves": [16, 17, 18]}
grid_search1 = GridSearchCV(model1, parameters1, scoring='accuracy', n_jobs=-1, cv=5)
grid_search1.fit(X_train, y_train)
print(f"grid_search1.best_score_: {grid_search1.best_score_}")
best_parameters1 = grid_search1.best_params_
print(best_parameters1)
model1 = LGBMClassifier(**best_parameters1)
model1.fit(X_train, y_train)
X_test_pred1 = model1.predict(X_test)
print(f"Tuned LGBM test accuracy: {accuracy_score(y_test, X_test_pred1)}")
model2 = CatBoostClassifier(verbose=False)
parameters2 = {"learning_rate": np.arange(0.1, 0.7, 0.1),  # explicit step: the default step of 1 would yield only [0.1]
               "random_state": [42],
               "depth": [8, 9, 10],
               "iterations": [35, 40, 50]}
grid_search2 = GridSearchCV(model2, parameters2, cv=5, n_jobs=-1, scoring='accuracy')
grid_search2.fit(X_train, y_train)
print(f"grid_search2.best_score_: {grid_search2.best_score_}")
best_parameters2 = grid_search2.best_params_
print(best_parameters2)
model2 = CatBoostClassifier(**best_parameters2, verbose=False)
model2.fit(X_train, y_train)
X_test_pred2 = model2.predict(X_test)
print(f"Tuned CatBoost test accuracy: {accuracy_score(y_test, X_test_pred2)}")
model3 = XGBClassifier()
parameters3 = {"n_estimators": [50, 100, 150],
               "random_state": [42],
               "learning_rate": [0.1, 0.3, 0.5, 1.0]}
grid_search3 = GridSearchCV(model3, parameters3, cv=5, n_jobs=-1, scoring='accuracy')
grid_search3.fit(X_train, y_train)
print(f"grid_search3.best_score_: {grid_search3.best_score_}")
best_parameters3 = grid_search3.best_params_
print(best_parameters3)
model3 = XGBClassifier(**best_parameters3)
model3.fit(X_train, y_train)
X_test_pred3 = model3.predict(X_test)
print(f"Tuned XGBoost test accuracy: {accuracy_score(y_test, X_test_pred3)}")
model4 = RandomForestClassifier()
parameters4 = {'n_estimators': [100, 300, 500, 550],
               'min_samples_split': [7, 8, 9],
               'max_depth': [10, 11, 12],
               'min_samples_leaf': [4, 5, 6]}
grid_search4 = GridSearchCV(model4, parameters4, cv=5, n_jobs=-1, scoring='accuracy')
grid_search4.fit(X_train, y_train)
best_parameters4 = grid_search4.best_params_
model4 = RandomForestClassifier(**best_parameters4)
model4.fit(X_train, y_train)
X_test_pred4 = model4.predict(X_test)
print(f"Tuned RandomForest test accuracy: {accuracy_score(y_test, X_test_pred4)}")
stacking_model = StackingClassifier(estimators=[('LGBM', model1),
                                                ('CatBoost', model2),
                                                ('XGBoost', model3),
                                                ('RandomForest', model4)])
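# Note: with no final_estimator given, StackingClassifier trains the base
# models with 5-fold internal CV and feeds their predictions to a
# LogisticRegression meta-learner.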
stacking_model.fit(X_train, y_train)
X_train_pred5 = stacking_model.predict(X_train)
X_test_pred5 = stacking_model.predict(X_test)
print(f'Stacking model training accuracy is {accuracy_score(y_train, X_train_pred5)}')
print(f'Stacking model testing accuracy is {accuracy_score(y_test, X_test_pred5)}')
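# Optional sanity check (my addition, not part of the original pipeline): a
# 5-fold cross-validated score over the full training set depends less on the
# single 80/20 split used above.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(stacking_model, df_train_scaled.iloc[:, :-1], df_train_scaled['target'],
                            cv=5, scoring='accuracy', n_jobs=-1)
print(f"Stacking 5-fold CV accuracy: {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}")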
# Apply the scaler fitted on the training features to the test set as well;
# predicting on unscaled test data would not match what the models saw in training.
df_test_scaled = pd.DataFrame(scaler.transform(df_test), columns=df_test.columns)
df_preds = pd.DataFrame({'Id': range(1, len(df_test) + 1), 'Solution': stacking_model.predict(df_test_scaled)})
df_preds.to_csv('submission.csv', index=False)
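# Preview the submission to confirm the expected (Id, Solution) format.
print(df_preds.head())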
# Summary: I compared training and testing accuracy across several machine
# learning algorithms, searched for each algorithm's best hyperparameters,
# and finally combined the tuned models with stacking to produce the final
# predictions.