assig_5_mining

The document presents a Python script that evaluates several machine learning classifiers on the scikit-learn breast cancer dataset using 10-fold stratified cross-validation. For each classifier it records training and test accuracy, precision, recall, F1 score, MCC, ROC AUC, and Precision-Recall AUC, saves the per-fold results to an Excel file, and plots ROC curves, Precision-Recall curves, and a boxplot of test accuracy across classifiers.
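
For context, the script relies on scikit-learn's built-in breast cancer (Wisconsin diagnostic) dataset. A minimal sketch of what load_breast_cancer() returns, not part of the assignment code itself:

from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
print(data.data.shape)    # (569, 30): 569 samples, 30 numeric features
print(data.target_names)  # ['malignant' 'benign']; class 1 ("benign") is the positive label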


import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             matthews_corrcoef, roc_curve)
from sklearn.metrics import auc, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.datasets import load_breast_cancer

import matplotlib.pyplot as plt
import seaborn as sns


# Load dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Define classifiers
classifiers = {
    'RandomForest': RandomForestClassifier(),
    'SVM': SVC(probability=True),
    'KNN': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(),
    'NaiveBayes': GaussianNB(),
    'DecisionTree': DecisionTreeClassifier(),
    'MLP': MLPClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'GaussianProcess': GaussianProcessClassifier(),
}
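
# Optional, not part of the original script: LogisticRegression and MLPClassifier may
# emit convergence warnings on the unscaled features, and scale-sensitive models
# (SVM, KNN, LogisticRegression, MLP) often benefit from standardization. If desired,
# an entry can be swapped for a Pipeline that scales the features first, e.g.:
#
#   from sklearn.pipeline import make_pipeline
#   from sklearn.preprocessing import StandardScaler
#   classifiers['SVM'] = make_pipeline(StandardScaler(), SVC(probability=True))
#
# A Pipeline exposes the same fit/predict/predict_proba interface, so the
# evaluation loop below works unchanged.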

# Cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results = []

for clf_name, clf in classifiers.items():
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        # Use class-1 probabilities when available, otherwise fall back to the hard predictions
        y_prob = clf.predict_proba(X_test)[:, 1] if hasattr(clf, "predict_proba") else y_pred

        acc_train = clf.score(X_train, y_train)
        acc_test = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        mcc = matthews_corrcoef(y_test, y_pred)

        fpr, tpr, _ = roc_curve(y_test, y_prob)
        auc_score = auc(fpr, tpr)
        prec, rec, _ = precision_recall_curve(y_test, y_prob)
        pr_auc = auc(rec, prec)

        # Label the row with the dataset name rather than its first feature name
        results.append(['BreastCancer', clf_name, acc_train, acc_test, precision, recall, f1, mcc,
                        auc_score, pr_auc])
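
# Sanity check (not part of the original script): with 10 folds and 9 classifiers
# there should be one result row per classifier per fold, i.e. 90 rows in total.
assert len(results) == 10 * len(classifiers)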

# Save results to Excel (writing .xlsx requires an Excel writer such as openpyxl)
df_results = pd.DataFrame(results, columns=['Dataset', 'Classifier', 'TrainAccuracy', 'TestAccuracy',
                                            'Precision', 'Recall', 'F1Score', 'MCC', 'AUC', 'PR_AUC'])
df_results.to_excel('UCI_Results.xlsx', index=False)
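
# Not part of the original assignment: a per-classifier summary (mean and standard
# deviation over the 10 folds) can be derived from the same DataFrame, for example:
summary = df_results.groupby('Classifier')[['TestAccuracy', 'AUC']].agg(['mean', 'std'])
print(summary)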

# Plot ROC Curve
plt.figure()
for clf_name, clf in classifiers.items():
    # cross_val_predict pools the out-of-fold class-1 probabilities from every CV split
    y_prob = cross_val_predict(clf, X, y, cv=skf, method='predict_proba')[:, 1]
    fpr, tpr, _ = roc_curve(y, y_prob)
    plt.plot(fpr, tpr, label=f'{clf_name} (AUC = {auc(fpr, tpr):.2f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.title('ROC Curves')
plt.show()

# Plot Precision-Recall Curve
plt.figure()
for clf_name, clf in classifiers.items():
    y_prob = cross_val_predict(clf, X, y, cv=skf, method='predict_proba')[:, 1]
    prec, rec, _ = precision_recall_curve(y, y_prob)
    plt.plot(rec, prec, label=f'{clf_name} (PR AUC = {auc(rec, prec):.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.title('Precision-Recall Curves')
plt.show()

# Plot Boxplot for Test Accuracy
plt.figure(figsize=(10, 6))
sns.boxplot(x='Classifier', y='TestAccuracy', data=df_results)
plt.xticks(rotation=45)
plt.title('Boxplot of Test Accuracy for Different Classifiers')
plt.show()
