# Import required libraries for the exploratory data analysis section.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Read the dataset
df = pd.read_csv('medical_dataset.csv')

# 1. Basic Dataset Information: shape, column names, dtypes, and null counts.
print("Dataset Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nData Types:\n", df.dtypes)
print("\nMissing Values:\n", df.isnull().sum())
# 2. Statistical Summary
# describe() on the full frame covers numeric columns; a second pass restricted
# to object dtype summarizes the categorical columns (count/unique/top/freq).
print("\nNumerical Features Summary:")
numerical_summary = df.describe()
display(numerical_summary)

print("\nCategorical Features Summary:")
categorical_summary = df.select_dtypes(include=['object']).describe()
display(categorical_summary)
# 3. Disease Distribution — bar chart of record counts per diagnosis.
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='diagnosis')
plt.title('Distribution of Diseases')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# 4. Age Distribution by Disease — one box per diagnosis group.
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='diagnosis', y='age')
plt.title('Age Distribution by Disease')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# 5. Correlation Analysis for Numerical Features
# Pearson correlations between all numeric columns, rendered as an annotated
# heatmap centered at 0 so positive/negative correlations get opposite hues.
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
correlation_matrix = df[numerical_features].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()
# 6. Symptom Analysis by Disease
# Mean intensity of each symptom column per diagnosis group, shown as a heatmap
# (assumes symptom columns hold numeric intensity scores — verify against data).
symptoms = ['fatigue', 'headache', 'nausea', 'chest_pain', 'shortness_of_breath', 'joint_pain', 'dizziness']
plt.figure(figsize=(15, 8))
df_symptoms_mean = df.groupby('diagnosis')[symptoms].mean()
sns.heatmap(df_symptoms_mean, annot=True, cmap='YlOrRd', fmt='.2f')
plt.title('Average Symptom Intensity by Disease')
plt.tight_layout()
plt.show()
# 7. Vital Signs Distribution — 2x2 grid of boxplots, one vital sign per axis.
vital_signs = ['blood_pressure_systolic', 'blood_pressure_diastolic', 'heart_rate', 'temperature']
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Distribution of Vital Signs')
for i, vital in enumerate(vital_signs):
    # Map the flat index onto the 2x2 grid (row-major).
    row, col = divmod(i, 2)
    sns.boxplot(data=df, x='diagnosis', y=vital, ax=axes[row, col])
    axes[row, col].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
# 8. BMI and Glucose Level Analysis — scatter of BMI vs glucose, colored by diagnosis.
plt.figure(figsize=(12, 6))
sns.scatterplot(data=df, x='bmi', y='glucose_level', hue='diagnosis', alpha=0.6)
plt.title('BMI vs Glucose Level by Disease')
plt.tight_layout()
plt.show()
# 9. Gender Distribution across Diseases
# Row-normalized crosstab (percentages within each diagnosis), stacked bars.
plt.figure(figsize=(12, 6))
gender_disease = pd.crosstab(df['diagnosis'], df['gender'], normalize='index') * 100
gender_disease.plot(kind='bar', stacked=True)
plt.title('Gender Distribution Across Diseases (%)')
plt.ylabel('Percentage')
plt.legend(title='Gender')
plt.tight_layout()
plt.show()
# 10. Statistical Tests
print("\nStatistical Tests:")

# ANOVA: do mean ages differ among diagnosis groups?
# One array of ages per diagnosis group is unpacked into f_oneway.
f_statistic, p_value = stats.f_oneway(
    *[group['age'].values for name, group in df.groupby('diagnosis')]
)
print("\nANOVA Test for Age Differences among Diseases:")
print(f"F-statistic: {f_statistic:.4f}")
print(f"p-value: {p_value:.4f}")

# Chi-square: is gender associated with diagnosis? Uses the raw contingency table.
chi2, p_value, dof, expected = stats.chi2_contingency(pd.crosstab(df['diagnosis'], df['gender']))
print("\nChi-square Test for Gender and Disease Association:")
print(f"Chi-square statistic: {chi2:.4f}")
print(f"p-value: {p_value:.4f}")
# 11. Descriptive Statistics by Disease — mean/std of key metrics per diagnosis.
print("\nKey Metrics by Disease:")
disease_stats = df.groupby('diagnosis').agg({
    'age': ['mean', 'std'],
    'bmi': ['mean', 'std'],
    'glucose_level': ['mean', 'std'],
    'cholesterol': ['mean', 'std']
}).round(2)
display(disease_stats)
# 12. Feature Distribution Plots
def plot_feature_distributions(df, features, ncols=3):
    """Plot a stacked histogram of each feature, colored by diagnosis.

    Args:
        df: DataFrame containing the feature columns and a 'diagnosis' column.
        features: list of column names to plot, one subplot each.
        ncols: number of subplot columns; rows are derived from len(features).
    """
    nrows = (len(features) + ncols - 1) // ncols
    fig, axes = plt.subplots(nrows, ncols, figsize=(15, 4 * nrows))
    axes = axes.flatten()
    last = -1  # guards the cleanup loop when features is empty
    for last, feature in enumerate(features):
        sns.histplot(data=df, x=feature, hue='diagnosis', multiple="stack", ax=axes[last])
        axes[last].tick_params(axis='x', rotation=45)
    # Remove any unused subplots in the final row.
    for j in range(last + 1, len(axes)):
        fig.delaxes(axes[j])
    plt.tight_layout()
    plt.show()

numerical_features = ['age', 'bmi', 'glucose_level', 'cholesterol', 'heart_rate']
plot_feature_distributions(df, numerical_features)
# Imports for the modeling section.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import warnings

# Suppress scikit-learn warnings (e.g. convergence notices) to keep output readable.
warnings.filterwarnings("ignore", category=UserWarning)
# Load and preprocess the data
def prepare_data(file_path='medical_dataset.csv'):
    """Load the dataset, encode categoricals, split, and scale features.

    Args:
        file_path: path to the CSV file with a 'diagnosis' target column,
            a 'patient_id' identifier, and 'gender'/'blood_type' categoricals.

    Returns:
        (X_train_scaled, X_test_scaled, y_train, y_test, encoders), where
        encoders maps 'gender'/'blood_type'/'diagnosis' to fitted
        LabelEncoders, 'unique_classes' to the raw diagnosis labels, and
        'scaler' to the fitted StandardScaler (kept so future data can be
        transformed consistently, e.g. by predict_disease).
    """
    df = pd.read_csv(file_path)

    # Separate features and target; patient_id carries no predictive signal.
    X = df.drop(['diagnosis', 'patient_id'], axis=1)
    y = df['diagnosis']
    unique_classes = y.unique()

    # Encode categorical variables.
    le_gender = LabelEncoder()
    le_blood = LabelEncoder()
    le_diagnosis = LabelEncoder()
    X['gender'] = le_gender.fit_transform(X['gender'])
    X['blood_type'] = le_blood.fit_transform(X['blood_type'])
    y = le_diagnosis.fit_transform(y)

    # Split before scaling so the scaler is fit on training data only.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Store label encoders, raw class labels, and the scaler for future use.
    encoders = {
        'gender': le_gender,
        'blood_type': le_blood,
        'diagnosis': le_diagnosis,
        'unique_classes': unique_classes,
        'scaler': scaler,
    }
    return X_train_scaled, X_test_scaled, y_train, y_test, encoders
def train_and_evaluate_models(X_train, X_test, y_train, y_test, encoders):
    """Train several classifiers and report accuracy, CV score, and confusion matrices.

    Args:
        X_train, X_test: scaled feature matrices.
        y_train, y_test: integer-encoded diagnosis labels.
        encoders: dict from prepare_data; 'diagnosis' LabelEncoder is used to
            recover human-readable class names.

    Returns:
        dict mapping model name -> {'accuracy', 'cv_mean', 'cv_std',
        'predictions', 'model' (fitted estimator)}.
    """
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Decision Tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'Gradient Boosting': GradientBoostingClassifier(),
        'SVM': SVC(),
        'KNN': KNeighborsClassifier(),
        'XGBoost': xgb.XGBClassifier()
    }
    results = {}

    # Class names must be indexed by the *encoded* label. LabelEncoder sorts
    # classes, so le.classes_[i] matches code i; y.unique() (appearance order)
    # does NOT, and indexing it by codes would mislabel reports and matrices.
    diagnosis_encoder = encoders.get('diagnosis')
    if diagnosis_encoder is not None:
        class_names = list(diagnosis_encoder.classes_)
    else:
        class_names = list(encoders['unique_classes'])

    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        cv_scores = cross_val_score(model, X_train, y_train, cv=5)
        results[name] = {
            'accuracy': accuracy,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'predictions': y_pred,
            'model': model  # keep the fitted estimator for later use
        }

        print(f"{name} Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Cross-validation Score: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

        # Restrict the report to labels actually present in test data or
        # predictions, so classification_report's target_names align.
        unique_labels = np.unique(np.concatenate([y_test, y_pred]))
        present_class_names = [class_names[i] for i in unique_labels]
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred, target_names=present_class_names))

        # Confusion matrix heatmap for this model.
        plt.figure(figsize=(10, 8))
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=present_class_names,
                    yticklabels=present_class_names)
        plt.title(f'Confusion Matrix - {name}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    return results
def plot_model_comparison(results):
    """Show a grouped bar chart of test accuracy vs mean CV accuracy per model.

    Args:
        results: dict from train_and_evaluate_models; each value must have
            'accuracy' and 'cv_mean' entries.
    """
    model_names = list(results.keys())
    accuracies = [results[m]['accuracy'] for m in model_names]
    cv_means = [results[m]['cv_mean'] for m in model_names]

    plt.figure(figsize=(12, 6))
    x = np.arange(len(model_names))
    width = 0.35
    # Two bars per model, offset around the tick position.
    plt.bar(x - width / 2, accuracies, width, label='Test Accuracy')
    plt.bar(x + width / 2, cv_means, width, label='CV Mean Accuracy')
    plt.xlabel('Models')
    plt.ylabel('Accuracy')
    plt.title('Model Performance Comparison')
    plt.xticks(x, model_names, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()
def get_best_model(results):
    """Return (name, cv_mean, fitted_model) for the model with the highest CV mean.

    Args:
        results: dict from train_and_evaluate_models.

    Raises:
        ValueError: if results is empty (from max on an empty iterable).
    """
    best_name, best_info = max(results.items(), key=lambda item: item[1]['cv_mean'])
    return best_name, best_info['cv_mean'], best_info['model']
# Function to make predictions with the best model
def predict_disease(model, new_data, encoders, scaler):
    """Predict diagnosis labels for new patient records.

    Args:
        model: a fitted classifier with a predict() method.
        new_data: DataFrame of raw patient features (gender/blood_type as
            strings), or an already-encoded array-like.
        encoders: dict of fitted LabelEncoders from prepare_data.
        scaler: the StandardScaler fitted on the training data.

    Returns:
        Array of decoded (human-readable) diagnosis labels.
    """
    if isinstance(new_data, pd.DataFrame):
        # Work on a copy so the caller's DataFrame is not mutated in place.
        new_data = new_data.copy()
        if 'gender' in new_data.columns:
            new_data['gender'] = encoders['gender'].transform(new_data['gender'])
        if 'blood_type' in new_data.columns:
            new_data['blood_type'] = encoders['blood_type'].transform(new_data['blood_type'])

    # Scale with the training-time scaler, predict, then decode back to labels.
    new_data_scaled = scaler.transform(new_data)
    prediction_encoded = model.predict(new_data_scaled)
    prediction = encoders['diagnosis'].inverse_transform(prediction_encoded)
    return prediction
# Main execution
if __name__ == "__main__":
    # Prepare the data
    X_train, X_test, y_train, y_test, encoders = prepare_data()
    # Train and evaluate every candidate model
    results = train_and_evaluate_models(X_train, X_test, y_train, y_test, encoders)
    # Visual side-by-side comparison of the models
    plot_model_comparison(results)
    # Select the model with the best cross-validation score
    best_model_name, best_score, model = get_best_model(results)
    print(f"\nBest performing model: {best_model_name}")
    print(f"Cross-validation accuracy: {best_score:.4f}")