0% found this document useful (0 votes)
36 views12 pages

Python Machine Learning Workflow Guide

Uploaded by

shoaib
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
36 views12 pages

Python Machine Learning Workflow Guide

Uploaded by

shoaib
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd

MACHINE LEARNING

CHEAT SHEET (THE PYTHON WORKFLOW)

A quick guide for beginner and
intermediate users | Python + Scikit-learn

Replace 'target_column' with your


dataset’s target variable
Import Libraries

# Data Manipulation
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing & Scaling
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Regression Metrics
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score)

# Classification Metrics
from sklearn.metrics import (
    accuracy_score, recall_score, f1_score, confusion_matrix,
    classification_report)

If you see "ModuleNotFoundError: No module named '<library_name>'",
install the required libraries first:

!pip install pandas numpy seaborn matplotlib scikit-learn
Data Loading & Exploration

# Load Dataset
df = pd.read_csv('your_file.csv')

# Basic info & stats
print(df.info())
print(df.describe())

# Unique values for categorical columns
for col in df.select_dtypes(include='object').columns:
    print(df[col].value_counts())

# Missing values per column
print(df.isnull().sum())

# Numeric columns -> fill missing with median
# (assign back instead of inplace=True: column-wise inplace fillna is a
# chained-assignment pitfall and is deprecated in pandas >= 2.x)
for col in df.select_dtypes(include='number').columns:
    df[col] = df[col].fillna(df[col].median())

# Categorical columns -> fill missing with mode (most frequent value)
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].fillna(df[col].mode()[0])

Median handles skewed data (ignores outliers).


Drop records if missing % is very low (<5%).
Splitting & Encoding

# Separate Feature (X) and Target (y)
# Replace target_column with actual target column
X = df.drop('target_column', axis=1)
y = df['target_column']

# Split data to 80-20 for training & testing
# This is done before scaling & encoding to prevent data leakage
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Encoding - Turning words to numbers
numeric_cols = X_train.select_dtypes(include=np.number).columns
categorical_cols = X_train.select_dtypes(include='object').columns

# Option 1: One-Hot Encoding
# drop_first=True prevents multicollinearity in your dummy variables
X_train_encoded = pd.get_dummies(X_train[categorical_cols],
                                 drop_first=True)
X_test_encoded = pd.get_dummies(X_test[categorical_cols],
                                drop_first=True)

Use One-Hot if categories <80;


otherwise, apply Label Encoding.
Encoding & Scaling

# Align columns after one-hot encoding
# Test set may be missing categories seen in train; add them as all-zero
# columns, then reorder so train and test have identical column order.
train_cols = X_train_encoded.columns
test_cols = X_test_encoded.columns
missing_in_test = set(train_cols) - set(test_cols)

for c in missing_in_test:
    X_test_encoded[c] = 0

X_test_encoded = X_test_encoded[train_cols]

# Scaling - Resizing number sizes
# Fit the scaler ONLY on the training data; reuse it on test data
# so no information leaks from the test set.
scaler = StandardScaler()

X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train[numeric_cols]),
    columns=numeric_cols, index=X_train.index)

X_test_scaled = pd.DataFrame(
    scaler.transform(X_test[numeric_cols]),
    columns=numeric_cols, index=X_test.index)

Without scaling, features with larger numeric ranges dominate distance-
and gradient-based models — StandardScaler or MinMaxScaler keeps features comparable.
Visualizing

# Combine scaled numeric and encoded categorical features
X_train_final = pd.concat([X_train_scaled, X_train_encoded], axis=1)
X_test_final = pd.concat([X_test_scaled, X_test_encoded], axis=1)

# Histograms & Boxplots (distribution + outliers for each numeric column)
for col in numeric_cols:
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    sns.histplot(X_train[col], kde=True)
    plt.title(f'Histogram of {col}')

    plt.subplot(1, 2, 2)
    sns.boxplot(x=X_train[col])
    plt.title(f'Boxplot of {col}')
    plt.show()

# Correlation heatmap (numeric features only)
plt.figure(figsize=(12, 8))
sns.heatmap(X_train.corr(numeric_only=True), annot=True,
            cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

Avoid multicollinearity: If correlation > 0.8,


drop one feature to reduce redundancy.
Regression Model

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor,
                              GradientBoostingRegressor)
from sklearn.neighbors import KNeighborsRegressor

# Models to compare (name -> unfitted estimator)
reg_models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(max_depth=7),
    'Random Forest': RandomForestRegressor(n_estimators=100),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100),
    'KNN': KNeighborsRegressor(metric="manhattan", n_neighbors=20)
}

# Helper functions for display formatting
pct = lambda x: f"{x*100:.2f}%"
rnd = lambda x: f"{x:.2f}"

# Store Results
results = []

More estimators in Random Forest ≠


always better; watch compute cost.
Regression Model CONT...

# Build Models: fit each regressor, score on train and test sets
for name, model in reg_models.items():
    model.fit(X_train_final, y_train)
    # score() for regressors returns R² on the given data
    train_r2 = model.score(X_train_final, y_train)

    y_pred = model.predict(X_test_final)

    test_r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    results.append({
        'Model': name,
        'Train R²': pct(train_r2),
        'Test R²': pct(test_r2),
        'MAE': rnd(mae),
        'MSE': rnd(mse)
    })

# Convert to DataFrame for side-by-side comparison
results_df = pd.DataFrame(results)
print(results_df)

Overfitting alert - high Train R²


but low Test R² is a red flag.
Classification Model

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score, recall_score, f1_score,
                             confusion_matrix, classification_report)
import pandas as pd

# Models to compare (name -> unfitted estimator)
clf_models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(max_depth=7),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),
    'KNN': KNeighborsClassifier(metric="manhattan", n_neighbors=20)
}

# Helper functions
pct = lambda x: f"{x*100:.2f}%"  # format as percentage

# Store results (name -> dict of metrics)
results = {}

Tree models handle non-linear splits well;


Logistic works best on linear boundaries.
Classification Model CONT...

# Build Models: fit each classifier, score on train and test sets
for name, model in clf_models.items():
    model.fit(X_train_final, y_train)

    # score() for classifiers returns accuracy on the given data
    train_acc = model.score(X_train_final, y_train)

    y_pred = model.predict(X_test_final)

    test_acc = accuracy_score(y_test, y_pred)
    # macro average weighs every class equally (useful with imbalance)
    recall = recall_score(y_test, y_pred, average="macro")
    f1 = f1_score(y_test, y_pred, average="macro")

    results[name] = {
        'Train Accuracy': pct(train_acc),
        'Test Accuracy': pct(test_acc),
        'Recall': pct(recall),
        'F1 Score': pct(f1),
        'Confusion Matrix': confusion_matrix(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred)
    }

Use Precision-Recall tradeoff to explore


class imbalance solutions.
Classification Model CONT...

# Convert results to DataFrame for side-by-side comparison
summary = pd.DataFrame([
    {
        'Model': name,
        'Train Accuracy': metrics['Train Accuracy'],
        'Test Accuracy': metrics['Test Accuracy'],
        'Recall': metrics['Recall'],
        'F1 Score': metrics['F1 Score']
    }
    for name, metrics in results.items()
])

print(summary)

# If you also want to print confusion matrices & reports
for name, metrics in results.items():
    print(f"\n{name}")
    print("Confusion Matrix:\n", metrics["Confusion Matrix"])
    print("Classification Report:\n", metrics["Classification Report"])

ROC & AUC help compare


classifiers beyond raw accuracy.
THANKS FOR READING
I hope this sheet makes your ML journey a bit easier

Here are good add-ons

Handle class imbalance with SMOTE


Remove irrelevant features to improve performance
Cross-Validation for robust evaluation
Hyperparameter tuning matters - e.g. n_neighbors in
KNN, learning_rate in Gradient Boosting

Shared for learning – connect on LinkedIn @Harsh_Bang if helpful

You might also like