# Code:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    accuracy_score,
    f1_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
# Load and process the dataset
metabric_df = pd.read_csv("./METABRIC_RNA_Mutation.csv")
metabric_df.shape  # notebook-style inspection; no effect when run as a script
metabric_df.info(verbose=True)  # prints column dtypes and non-null counts
metabric_df.sample(5)  # notebook-style inspection; no effect when run as a script
# Index rows by patient so joins below align on patient identity.
metabric_df = metabric_df.set_index('patient_id')
# Keep columns 30:519 plus the binary survival target.
# NOTE(review): assumes columns 30:519 are the gene-expression features of
# the METABRIC dataset — confirm against the CSV's actual column layout.
df_expression = metabric_df.iloc[:, 30:519].join(metabric_df['overall_survival'], how='inner')
df_expression  # notebook-style display; no effect when run as a script
# Dictionary to store F1 and accuracy scores of each model
metrics_summary = {"Model": [], "F1 Score": [], "Accuracy": []}
# Function to evaluate and display results
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit *model*, print classification metrics, and plot its confusion matrix.

    Also appends the model's weighted F1 and accuracy to the module-level
    ``metrics_summary`` dict so all models can be compared afterwards.

    Args:
        model: sklearn-style estimator exposing ``fit``/``predict``
            (and optionally ``predict_proba``).
        X_train, X_test: Feature matrices for the train/test splits.
        y_train, y_test: Target vectors for the train/test splits.
        model_name: Label used in printed output, the plot title, and
            ``metrics_summary``.
    """
    model.fit(X_train, y_train)
    # Predictions and probabilities (if applicable)
    y_pred = model.predict(X_test)
    # Positive-class probability, needed for the AUC-ROC score below;
    # None for estimators without predict_proba.
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(
        model, 'predict_proba') else None
    # Weighted F1 accounts for class imbalance in the target.
    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    # Store the metrics for the cross-model comparison plots in main().
    metrics_summary["Model"].append(model_name)
    metrics_summary["F1 Score"].append(f1)
    metrics_summary["Accuracy"].append(accuracy)
    # Display classification metrics
    print(f"\n=== {model_name} ===")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    # Confusion Matrix Visualization
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=[
                'Pred 0', 'Pred 1'], yticklabels=['True 0', 'True 1'])
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()
    # AUC-ROC Score (if applicable)
    if y_proba is not None:
        print(f"AUC-ROC Score: {roc_auc_score(y_test, y_proba):.4f}")
# Main function to process the dataset and evaluate models
def main(df, target_column):
    """Split *df*, train each classifier, and plot per-model score comparisons.

    Args:
        df: DataFrame containing feature columns plus the target column.
        target_column: Name of the column in *df* to predict.
    """
    # Separate features from the target.
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    # One-hot encode categorical features; drop_first avoids collinearity.
    X = pd.get_dummies(X, drop_first=True)
    # Encode a string-valued target as integer class labels.
    if y.dtype == 'object':
        y = LabelEncoder().fit_transform(y)
    # 70/30 split; fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)
    # Classifiers to evaluate. probability=True lets SVC expose
    # predict_proba so evaluate_model can report AUC-ROC.
    models = {
        'Random Forest': RandomForestClassifier(),
        'Support Vector Classifier': SVC(probability=True),
    }
    # Train and evaluate all models
    for name, model in models.items():
        evaluate_model(model, X_train, X_test, y_train, y_test, name)
    # Plot F1 Scores of all models
    plt.figure(figsize=(10, 5))
    sns.barplot(x=metrics_summary["Model"],
                y=metrics_summary["F1 Score"], palette='coolwarm')
    plt.xticks(rotation=45)
    plt.title('F1 Scores of All Models')
    plt.ylabel('F1 Score')
    plt.show()
    # Plot Accuracy Scores of all models
    plt.figure(figsize=(10, 5))
    sns.barplot(x=metrics_summary["Model"],
                y=metrics_summary["Accuracy"], palette='coolwarm')
    plt.xticks(rotation=45)
    plt.title('Accuracy Scores of All Models')
    plt.ylabel('Accuracy')
    plt.show()
# Load your dataset and specify the target column
if __name__ == "__main__":
    # Run the full pipeline on the expression dataframe built above,
    # predicting the overall_survival label.
    dataframe = df_expression
    target = 'overall_survival'
    main(dataframe, target)
# Output: