1. Load the Fashion MNIST dataset (or the digits dataset if offline). Train and compare the
performance of the following classifiers:
• Logistic Regression
• K-Nearest Neighbors (KNN)
• Decision Tree
• Random Forest
Then:
• Report the accuracy of each model on a test set.
• Visualize a bar chart comparing the test accuracies.
• Display confusion matrices for each model side by side.
• Identify and visualize misclassified images for each model.
# Note: digit MNIST is used here (the task's offline-friendly option);
# a Fashion MNIST swap is sketched after the program.
from tensorflow.keras.datasets import mnist
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load MNIST, flatten each 28x28 image to a 784-dim vector, scale pixels to [0, 1]
(X_train_full, y_train_full), (X_test, y_test) = mnist.load_data()
X_train_full = X_train_full.reshape(-1, 784) / 255.0
X_test = X_test.reshape(-1, 784) / 255.0

# Use a stratified 10,000-sample subset to keep training times manageable
X_train, _, y_train, _ = train_test_split(X_train_full, y_train_full, train_size=10000,
                                          stratify=y_train_full, random_state=42)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

accuracies = {}
conf_matrices = {}
misclassified = {}
predictions = {}
# Train each model, then record its accuracy, confusion matrix, test-set
# predictions, and the indices of the test images it misclassified
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    miscl = np.where(y_pred != y_test)[0]
    accuracies[name] = acc
    conf_matrices[name] = cm
    misclassified[name] = miscl
    predictions[name] = y_pred
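
# Sketch (optional addition): report each model's test accuracy as text,
# best first, to complement the bar chart below
for name in sorted(accuracies, key=accuracies.get, reverse=True):
    print(f"{name}: {accuracies[name]:.4f}")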
# Bar chart comparing the test accuracies
plt.figure(figsize=(8, 5))
sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()))
plt.title("Test Accuracy of Classifiers on MNIST")
plt.ylabel("Accuracy")
plt.ylim(0.8, 1.0)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Confusion matrices for all four models in a 2x2 grid of heatmaps
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, (name, cm) in zip(axes.flatten(), conf_matrices.items()):
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f"{name} Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
plt.tight_layout()
plt.show()
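
# Sketch (optional addition): read each confusion matrix programmatically by
# finding its largest off-diagonal entry, i.e. the most-confused class pair
for name, cm in conf_matrices.items():
    off_diag = cm.copy()
    np.fill_diagonal(off_diag, 0)   # zero out the correct predictions
    t, p = np.unravel_index(off_diag.argmax(), off_diag.shape)
    print(f"{name}: true {t} most often misread as {p} ({off_diag[t, p]} times)")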
# Show up to five misclassified test images per model, with true and
# predicted labels in each subplot title
fig, axes = plt.subplots(len(models), 5, figsize=(12, 10))
fig.suptitle("Misclassified MNIST Images by Model", fontsize=16)
for i, (name, indices) in enumerate(misclassified.items()):
    for j, idx in enumerate(indices[:5]):
        ax = axes[i, j]
        ax.imshow(X_test[idx].reshape(28, 28), cmap='gray')
        ax.set_title(f"True: {y_test[idx]}\nPred: {predictions[name][idx]}")
        ax.axis('off')
    # Blank any unused axes if a model has fewer than five misclassifications
    for j in range(len(indices), 5):
        axes[i, j].axis('off')
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()
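
The program above uses digit MNIST. To target Fashion MNIST as the task asks, only the loader changes; and for a fully offline run, sklearn's bundled digits dataset avoids any download. A minimal sketch of both swaps (note the digits images are 8x8, so the reshape(28, 28) calls above would become reshape(8, 8)):

# Fashion MNIST: same image shape and ten classes, so only the loader changes
from tensorflow.keras.datasets import fashion_mnist
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist.load_data()

# Offline fallback: sklearn's 8x8 digits dataset ships with the library
from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data / 16.0, digits.target   # pixel values range from 0 to 16
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)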
Output:
(Figures: test-accuracy bar chart, 2x2 grid of confusion-matrix heatmaps, and a grid of misclassified images per model.)
2. Plot learning curves to visualize the performance of your model with varying training sizes. Plot
validation curves to determine the optimal values for hyperparameters.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve, validation_curve, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.datasets import mnist

# Load MNIST, flatten each image to a 784-dim vector, scale pixels to [0, 1]
(X_train_full, y_train_full), (X_test, y_test) = mnist.load_data()
X_train_full = X_train_full.reshape(-1, 28*28) / 255.0
X_test = X_test.reshape(-1, 28*28) / 255.0

# Stratified 10,000-sample subset keeps the repeated retraining affordable
X_train, _, y_train, _ = train_test_split(X_train_full, y_train_full, train_size=10000,
                                          stratify=y_train_full, random_state=42)
def plot_learning_curve(estimator, title, X, y, cv=3):
    # learning_curve retrains the estimator on increasing subsets of the data
    # (by default five sizes from 10% to 100%) with cv-fold cross-validation
    train_sizes, train_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
    train_mean = train_scores.mean(axis=1)
    val_mean = val_scores.mean(axis=1)
    plt.figure(figsize=(8, 5))
    plt.plot(train_sizes, train_mean, label="Training score", marker='o')
    plt.plot(train_sizes, val_mean, label="Validation score", marker='s')
    plt.title(title)
    plt.xlabel("Training Set Size")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
def plot_validation_curve(estimator, X, y, param_name, param_range, title, cv=3):
    # validation_curve retrains the estimator for each hyperparameter value
    # and reports cross-validated training and validation accuracy
    train_scores, val_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        scoring='accuracy', cv=cv, n_jobs=-1
    )
    train_mean = train_scores.mean(axis=1)
    val_mean = val_scores.mean(axis=1)
    plt.figure(figsize=(8, 5))
    plt.plot(param_range, train_mean, label="Training score", marker='o')
    plt.plot(param_range, val_mean, label="Validation score", marker='s')
    plt.title(title)
    plt.xlabel(param_name)
    plt.ylabel("Accuracy")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
# Learning and validation curves for KNN and the Decision Tree
plot_learning_curve(KNeighborsClassifier(), "Learning Curve: K-Nearest Neighbors", X_train, y_train)
plot_validation_curve(KNeighborsClassifier(), X_train, y_train, param_name="n_neighbors",
                      param_range=np.arange(1, 11), title="Validation Curve: KNN - n_neighbors")
plot_learning_curve(DecisionTreeClassifier(), "Learning Curve: Decision Tree", X_train, y_train)
plot_validation_curve(DecisionTreeClassifier(), X_train, y_train, param_name="max_depth",
                      param_range=np.arange(1, 21), title="Validation Curve: Decision Tree - max_depth")
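
The curves let the optimal hyperparameter be read off by eye; to extract it programmatically, one can re-run the same sweep and take the value with the best mean validation accuracy. A sketch for KNN's n_neighbors (the Decision Tree max_depth case is analogous):

# Re-run the sweep and pick the n_neighbors with the best mean validation accuracy
param_range = np.arange(1, 11)
train_scores, val_scores = validation_curve(
    KNeighborsClassifier(), X_train, y_train,
    param_name="n_neighbors", param_range=param_range,
    scoring='accuracy', cv=3, n_jobs=-1)
val_mean = val_scores.mean(axis=1)
best_k = param_range[val_mean.argmax()]
print(f"Best n_neighbors = {best_k} (mean CV accuracy {val_mean.max():.4f})")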
Output:
(Figures: learning and validation curves for KNN (n_neighbors) and the Decision Tree (max_depth).)