MACHINE LEARNING
CHEAT SHEET (THE PYTHON WORKFLOW)
A quick guide for beginners &
intermediates | Python + scikit-learn
Replace 'target_column' with your
dataset’s target variable
Import Libraries
# Data Manipulation
import pandas as pd
import numpy as np
# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# Preprocessing & Scaling
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
# Regression Metrics
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score)
# Classification Metrics
from sklearn.metrics import (
    accuracy_score, recall_score, f1_score, confusion_matrix,
    classification_report)
If you hit ModuleNotFoundError: No module named '<library_name>', install the missing packages:
!pip install pandas numpy seaborn matplotlib scikit-learn
Data Loading & Exploration
# Load Dataset
df = pd.read_csv('your_file.csv')
# Basic info & stats
df.info()  # prints directly; no print() needed
print(df.describe())
# Unique values for categorical columns
for col in df.select_dtypes(include='object').columns:
    print(df[col].value_counts())
# Missing values
print(df.isnull().sum())
# Numeric columns -> median
for col in df.select_dtypes(include='number').columns:
    df[col] = df[col].fillna(df[col].median())
# Categorical columns -> mode
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].fillna(df[col].mode()[0])
The median handles skewed data well (it is robust to outliers).
If the missing % is very low (<5%), drop those records instead.
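A minimal sketch of the drop-instead-of-impute route (the <5% threshold is the rule of thumb above, not a hard rule; run it before the imputation loops if you choose it):
# Percentage of missing values per column
missing_pct = df.isnull().mean() * 100
print(missing_pct[missing_pct > 0])
# Drop rows with missing values in low-missing columns
low_missing_cols = missing_pct[(missing_pct > 0) & (missing_pct < 5)].index
df = df.dropna(subset=low_missing_cols)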
Splitting & Encoding
# Separate Feature (X) and Target (y)
# Replace target_column with actual target column
X = df.drop('target_column', axis=1)
y = df['target_column']
# Split data 80-20 for training & testing
# This is done before scaling & encoding to prevent data leakage
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
# Encoding - turning categories into numbers
numeric_cols = X_train.select_dtypes(include=np.number).columns
categorical_cols = X_train.select_dtypes(include='object').columns
# Option 1: One-Hot Encoding
# drop_first=True prevents multicollinearity in your dummy variables
X_train_encoded = pd.get_dummies(X_train[categorical_cols],
drop_first=True)
X_test_encoded = pd.get_dummies(X_test[categorical_cols],
drop_first=True)
Use One-Hot Encoding if a column has fewer than ~80 categories;
otherwise, apply Label Encoding (a sketch follows below).
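Option 2, a minimal Label Encoding sketch (one encoder per column, fit on the training split only; note LabelEncoder errors on categories it never saw in training):
# Option 2: Label Encoding for high-cardinality columns
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])  # raises on unseen categories
    encoders[col] = le
scikit-learn's OrdinalEncoder is the feature-oriented alternative; it can map unseen categories to a sentinel via handle_unknown='use_encoded_value'.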
Encoding & Scaling
# Align columns after one-hot encoding
# Ensure order is the same
train_cols = X_train_encoded.columns
test_cols = X_test_encoded.columns
missing_in_test = set(train_cols) - set(test_cols)
for c in missing_in_test:
    X_test_encoded[c] = 0
X_test_encoded = X_test_encoded[train_cols]
# Scaling - putting features on a common scale
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train[numeric_cols]),
    columns=numeric_cols, index=X_train.index)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test[numeric_cols]),
    columns=numeric_cols, index=X_test.index)
Features play on an uneven field without scaling -
StandardScaler or MinMaxScaler keeps the game fair.
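If you would rather squeeze features into [0, 1], MinMaxScaler is a drop-in swap (a sketch; the rest of the workflow is unchanged):
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()  # rescales each feature to [0, 1]
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train[numeric_cols]),
    columns=numeric_cols, index=X_train.index)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test[numeric_cols]),  # reuse train min/max - no leakage
    columns=numeric_cols, index=X_test.index)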
Visualizing
# Combine scaled numeric and encoded categorical features
X_train_final = pd.concat([X_train_scaled, X_train_encoded], axis=1)
X_test_final = pd.concat([X_test_scaled, X_test_encoded], axis=1)
# Histograms & Boxplots
for col in numeric_cols:
    plt.figure(figsize=(12,4))
    plt.subplot(1,2,1)
    sns.histplot(X_train[col], kde=True)
    plt.title(f'Histogram of {col}')
    plt.subplot(1,2,2)
    sns.boxplot(x=X_train[col])
    plt.title(f'Boxplot of {col}')
    plt.show()
# Correlation heatmap
plt.figure(figsize=(12,8))
sns.heatmap(X_train.corr(numeric_only=True), annot=True,
            cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()
Avoid multicollinearity: If correlation > 0.8,
drop one feature to reduce redundancy.
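A quick sketch for finding those pairs (0.8 is the rule-of-thumb threshold above; which feature of a pair to drop is your call):
# Absolute correlations, upper triangle only so each pair appears once
corr = X_train[numeric_cols].corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
high_pairs = [(a, b, round(upper.loc[a, b], 2))
              for a in upper.index for b in upper.columns
              if upper.loc[a, b] > 0.8]
print(high_pairs)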
Regression Model
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
# Models
reg_models = {
'Linear Regression': LinearRegression(),
'Decision Tree': DecisionTreeRegressor(max_depth=7),
'Random Forest': RandomForestRegressor(n_estimators=100),
'Gradient Boosting': GradientBoostingRegressor(n_estimators=100),
'KNN': KNeighborsRegressor(metric="manhattan", n_neighbors=20)
}
# Helper functions
pct = lambda x: f"{x*100:.2f}%"
rnd = lambda x: f"{x:.2f}"
# Store Results
results = []
More estimators in Random Forest ≠
always better; watch compute cost.
Regression Model CONT...
# Build Models
for name, model in reg_models.items():
    model.fit(X_train_final, y_train)
    train_r2 = model.score(X_train_final, y_train)
    y_pred = model.predict(X_test_final)
    test_r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    results.append({
        'Model': name,
        'Train R²': pct(train_r2),
        'Test R²': pct(test_r2),
        'MAE': rnd(mae),
        'MSE': rnd(mse)
    })
# Convert to DataFrame for comparison
results_df = pd.DataFrame(results)
print(results_df)
Overfitting alert - high Train R²
but low Test R² is a red flag.
Classification Model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score, recall_score, f1_score,
                             confusion_matrix, classification_report)
import pandas as pd
# Models
clf_models = {
'Logistic Regression': LogisticRegression(max_iter=1000),
'Decision Tree': DecisionTreeClassifier(max_depth=7),
'Random Forest': RandomForestClassifier(n_estimators=100),
'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),
'KNN': KNeighborsClassifier(metric="manhattan", n_neighbors=20)
}
# Helper functions
pct = lambda x: f"{x*100:.2f}%" # format as percentage
# Store results
results = {}
Tree models handle non-linear splits well;
Logistic works best on linear boundaries.
Classification Model CONT...
# Build Models
for name, model in clf_models.items():
    model.fit(X_train_final, y_train)
    train_acc = model.score(X_train_final, y_train)
    y_pred = model.predict(X_test_final)
    test_acc = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average="macro")
    f1 = f1_score(y_test, y_pred, average="macro")
    results[name] = {
        'Train Accuracy': pct(train_acc),
        'Test Accuracy': pct(test_acc),
        'Recall': pct(recall),
        'F1 Score': pct(f1),
        'Confusion Matrix': confusion_matrix(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred)
    }
Use the precision-recall tradeoff to explore
solutions to class imbalance (sketch below).
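A minimal sketch of that tradeoff for a binary target (assumes the classifier exposes predict_proba, which all five above do; for multiclass you would plot one curve per class):
from sklearn.metrics import precision_recall_curve, PrecisionRecallDisplay
# Probability of the positive class from one of the fitted models above
y_scores = clf_models['Random Forest'].predict_proba(X_test_final)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
PrecisionRecallDisplay(precision=precision, recall=recall).plot()
plt.show()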
Classification Model CONT...
# Convert results to DataFrame for comparison
summary = pd.DataFrame([
    {
        'Model': name,
        'Train Accuracy': metrics['Train Accuracy'],
        'Test Accuracy': metrics['Test Accuracy'],
        'Recall': metrics['Recall'],
        'F1 Score': metrics['F1 Score']
    }
    for name, metrics in results.items()
])
print(summary)
# If you also want to print confusion matrices & reports
for name, metrics in results.items():
    print(f"\n{name}")
    print("Confusion Matrix:\n", metrics["Confusion Matrix"])
    print("Classification Report:\n", metrics["Classification Report"])
ROC & AUC help compare
classifiers beyond raw accuracy.
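A minimal ROC-AUC sketch, again assuming a binary target and predict_proba:
from sklearn.metrics import roc_auc_score, RocCurveDisplay
for name, model in clf_models.items():
    y_scores = model.predict_proba(X_test_final)[:, 1]
    print(f"{name}: AUC = {roc_auc_score(y_test, y_scores):.3f}")
# Full ROC curve for one model
RocCurveDisplay.from_estimator(clf_models['Random Forest'],
                               X_test_final, y_test)
plt.show()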
THANKS FOR READING
I hope this sheet makes your ML journey a bit easier.
Here are some good add-ons (a combined sketch follows this list):
Handle class imbalance with SMOTE
Remove irrelevant features to improve performance
Cross-validation for robust evaluation
Hyperparameter tuning matters - e.g. n_neighbors in
KNN, learning_rate in Gradient Boosting
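A combined sketch of those add-ons (the KNN grid values are illustrative, not recommendations; SMOTE lives in the separate imbalanced-learn package):
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
# Cross-validation: 5-fold score instead of a single train/test split
scores = cross_val_score(KNeighborsClassifier(), X_train_final, y_train, cv=5)
print(f"CV accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")
# Hyperparameter tuning: grid-search n_neighbors
grid = GridSearchCV(KNeighborsClassifier(),
                    param_grid={'n_neighbors': [5, 10, 20, 40]}, cv=5)
grid.fit(X_train_final, y_train)
print(grid.best_params_, grid.best_score_)
# Class imbalance: SMOTE (pip install imbalanced-learn first)
from imblearn.over_sampling import SMOTE
X_res, y_res = SMOTE(random_state=42).fit_resample(X_train_final, y_train)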
Shared for learning – connect on LinkedIn @Harsh_Bang if helpful