0% found this document useful (0 votes)
14 views6 pages

Medical Data Analysis and Modeling

Uploaded by

Murali
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
14 views6 pages

Medical Data Analysis and Modeling

Uploaded by

Murali
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd

# Import required libraries for the exploratory analysis.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Read the dataset into a DataFrame used by all following cells.
df = pd.read_csv('medical_dataset.csv')

# 1. Basic dataset information: dimensions, column names, dtypes, missing values.
print("Dataset Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nData Types:\n", df.dtypes)
print("\nMissing Values:\n", df.isnull().sum())

# 2. Statistical summary: describe() for numeric columns, then for object
# (categorical) columns separately.
print("\nNumerical Features Summary:")
numerical_summary = df.describe()
display(numerical_summary)

print("\nCategorical Features Summary:")
categorical_summary = df.select_dtypes(include=['object']).describe()
display(categorical_summary)

# 3. Disease distribution: count of records per diagnosis class.
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='diagnosis')
plt.title('Distribution of Diseases')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 4. Age distribution by disease.
# NOTE(review): the plotting call was garbled in extraction; a boxplot matches
# the x=category / y=numeric signature used here — confirm intended plot kind.
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='diagnosis', y='age')
plt.title('Age Distribution by Disease')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 5. Correlation analysis for numerical features.
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
correlation_matrix = df[numerical_features].corr()

plt.figure(figsize=(12, 10))
# center=0 keeps the diverging colormap symmetric around zero correlation.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

# 6. Symptom analysis by disease: mean symptom intensity per diagnosis group,
# shown as a heatmap.
symptoms = ['fatigue', 'headache', 'nausea', 'chest_pain', 'shortness_of_breath', 'joint_pain', 'dizziness']

plt.figure(figsize=(15, 8))
df_symptoms_mean = df.groupby('diagnosis')[symptoms].mean()
sns.heatmap(df_symptoms_mean, annot=True, cmap='YlOrRd', fmt='.2f')
plt.title('Average Symptom Intensity by Disease')
plt.tight_layout()
plt.show()

# 7. Vital signs distribution: one subplot per vital sign, split by diagnosis
# on a 2x2 grid.
vital_signs = ['blood_pressure_systolic', 'blood_pressure_diastolic', 'heart_rate', 'temperature']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Distribution of Vital Signs')

for i, vital in enumerate(vital_signs):
    # Map the flat index onto the 2x2 grid.
    row = i // 2
    col = i % 2
    # NOTE(review): plot call was garbled; boxplot matches the x=category /
    # y=numeric / ax= signature — confirm intended plot kind.
    sns.boxplot(data=df, x='diagnosis', y=vital, ax=axes[row, col])
    axes[row, col].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# 8. BMI vs. glucose level, colored by diagnosis.
plt.figure(figsize=(12, 6))
sns.scatterplot(data=df, x='bmi', y='glucose_level', hue='diagnosis', alpha=0.6)
plt.title('BMI vs Glucose Level by Disease')
plt.tight_layout()
plt.show()

# 9. Gender distribution across diseases as row-normalized percentages
# (each diagnosis row sums to 100%).
plt.figure(figsize=(12, 6))
gender_disease = pd.crosstab(df['diagnosis'], df['gender'], normalize='index') * 100
gender_disease.plot(kind='bar', stacked=True)
plt.title('Gender Distribution Across Diseases (%)')
plt.ylabel('Percentage')
plt.legend(title='Gender')
plt.tight_layout()
plt.show()

# 10. Statistical tests.
print("\nStatistical Tests:")

# One-way ANOVA: does mean age differ significantly across diagnosis groups?
# f_oneway takes one array of ages per diagnosis group.
f_statistic, p_value = stats.f_oneway(
    *[group['age'].values for name, group in df.groupby('diagnosis')]
)
print("\nANOVA Test for Age Differences among Diseases:")
print(f"F-statistic: {f_statistic:.4f}")
print(f"p-value: {p_value:.4f}")

# Chi-square test of independence between gender and diagnosis,
# computed from the contingency table of counts.
chi2, p_value, dof, expected = stats.chi2_contingency(pd.crosstab(df['diagnosis'], df['gender']))
print("\nChi-square Test for Gender and Disease Association:")
print(f"Chi-square statistic: {chi2:.4f}")
print(f"p-value: {p_value:.4f}")

# 11. Descriptive statistics by disease: mean and std of key metrics
# per diagnosis group, rounded to 2 decimals.
print("\nKey Metrics by Disease:")
disease_stats = df.groupby('diagnosis').agg({
    'age': ['mean', 'std'],
    'bmi': ['mean', 'std'],
    'glucose_level': ['mean', 'std'],
    'cholesterol': ['mean', 'std']
}).round(2)
display(disease_stats)

# 12. Feature distribution plots.
def plot_feature_distributions(df, features, ncols=3):
    """Plot a stacked histogram of each feature, colored by diagnosis.

    Args:
        df: DataFrame containing the feature columns and a 'diagnosis' column.
        features: list of column names to plot, one subplot each.
        ncols: number of subplot columns in the grid.
    """
    # Ceiling division so every feature gets a subplot.
    nrows = (len(features) + ncols - 1) // ncols
    fig, axes = plt.subplots(nrows, ncols, figsize=(15, 4 * nrows))
    axes = axes.flatten()

    for i, feature in enumerate(features):
        sns.histplot(data=df, x=feature, hue='diagnosis', multiple="stack", ax=axes[i])
        axes[i].tick_params(axis='x', rotation=45)

    # Remove unused trailing subplots. Ranging from len(features) (instead of
    # the loop variable) also avoids a NameError when features is empty.
    for j in range(len(features), len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()

numerical_features = ['age', 'bmi', 'glucose_level', 'cholesterol', 'heart_rate']
plot_feature_distributions(df, numerical_features)

# Imports for the modeling section.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import warnings

# Suppress scikit-learn warnings to keep notebook output readable.
warnings.filterwarnings("ignore", category=UserWarning)

# Load and preprocess the data.
def prepare_data(file_path='medical_dataset.csv'):
    """Load the medical dataset and return scaled train/test splits.

    Args:
        file_path: path to the CSV file with a 'diagnosis' target column
            and a 'patient_id' identifier column.

    Returns:
        Tuple (X_train_scaled, X_test_scaled, y_train, y_test, encoders),
        where encoders holds the fitted LabelEncoders for 'gender',
        'blood_type', 'diagnosis', plus the original class labels under
        'unique_classes'.
    """
    df = pd.read_csv(file_path)

    # Separate features and target; drop the identifier, which carries no signal.
    X = df.drop(['diagnosis', 'patient_id'], axis=1)
    y = df['diagnosis']

    # Original (pre-encoding) class labels, kept for reporting.
    unique_classes = y.unique()

    # Encode categorical variables with one encoder per column so each can be
    # inverted independently later.
    le_gender = LabelEncoder()
    le_blood = LabelEncoder()
    le_diagnosis = LabelEncoder()

    X['gender'] = le_gender.fit_transform(X['gender'])
    X['blood_type'] = le_blood.fit_transform(X['blood_type'])
    y = le_diagnosis.fit_transform(y)

    # Store label encoders and class labels for future reference.
    encoders = {
        'gender': le_gender,
        'blood_type': le_blood,
        'diagnosis': le_diagnosis,
        'unique_classes': unique_classes
    }

    # Hold out 20% for testing; fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale features: fit on the training split only to avoid test-set leakage.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, encoders

def train_and_evaluate_models(X_train, X_test, y_train, y_test, encoders):
    """Train several classifiers, report metrics, and plot confusion matrices.

    Args:
        X_train, X_test: scaled feature matrices.
        y_train, y_test: label-encoded targets.
        encoders: dict from prepare_data(); the fitted 'diagnosis' encoder is
            used to map encoded labels back to class names.

    Returns:
        Dict keyed by model name with accuracy, CV mean/std, predictions,
        and the trained model.
    """
    # Initialize models with defaults; LogisticRegression gets a higher
    # iteration cap so it converges on scaled data.
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Decision Tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'Gradient Boosting': GradientBoostingClassifier(),
        'SVM': SVC(),
        'KNN': KNeighborsClassifier(),
        'XGBoost': xgb.XGBClassifier()
    }

    # Dictionary to store results.
    results = {}

    # BUG FIX: encoded labels follow LabelEncoder's sorted classes_, not the
    # appearance order of y.unique() — index names via the fitted encoder so
    # each integer label maps to the correct class name.
    class_names = list(encoders['diagnosis'].classes_)

    for name, model in models.items():
        print(f"\nTraining {name}...")

        # Train the model.
        model.fit(X_train, y_train)

        # Make predictions on the held-out test set.
        y_pred = model.predict(X_test)

        # Calculate metrics: test accuracy plus 5-fold CV on the training set.
        accuracy = accuracy_score(y_test, y_pred)
        cv_scores = cross_val_score(model, X_train, y_train, cv=5)

        results[name] = {
            'accuracy': accuracy,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'predictions': y_pred,
            'model': model  # Store the trained model for later reuse.
        }

        print(f"{name} Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Cross-validation Score: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

        # Restrict the report to the labels actually present in this split,
        # so target_names lines up with the labels argument.
        unique_labels = np.unique(np.concatenate([y_test, y_pred]))
        present_class_names = [class_names[i] for i in unique_labels]

        print("\nClassification Report:")
        print(classification_report(y_test, y_pred, labels=unique_labels,
                                    target_names=present_class_names))

        # Plot the confusion matrix for this model.
        plt.figure(figsize=(10, 8))
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=present_class_names,
                    yticklabels=present_class_names)
        plt.title(f'Confusion Matrix - {name}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    return results

def plot_model_comparison(results):
    """Bar-chart comparison of test accuracy vs. CV mean accuracy per model.

    Args:
        results: dict from train_and_evaluate_models(), keyed by model name
            with 'accuracy' and 'cv_mean' entries.
    """
    # Prepare data for plotting.
    models = list(results.keys())
    accuracies = [results[model]['accuracy'] for model in models]
    cv_means = [results[model]['cv_mean'] for model in models]

    # Grouped bars: side-by-side pair per model.
    plt.figure(figsize=(12, 6))
    x = np.arange(len(models))
    width = 0.35

    plt.bar(x - width/2, accuracies, width, label='Test Accuracy')
    plt.bar(x + width/2, cv_means, width, label='CV Mean Accuracy')

    plt.xlabel('Models')
    plt.ylabel('Accuracy')
    plt.title('Model Performance Comparison')
    plt.xticks(x, models, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()
def get_best_model(results):
    """Return (name, cv_mean, model) of the model with the highest CV score.

    Args:
        results: dict keyed by model name with 'cv_mean' and 'model' entries.
    """
    # Rank by mean cross-validation accuracy, the more robust of the two metrics.
    best_name, best_info = max(results.items(), key=lambda item: item[1]['cv_mean'])
    return best_name, best_info['cv_mean'], best_info['model']

# Function to make predictions with the best model.
def predict_disease(model, new_data, encoders, scaler):
    """Encode, scale, and classify new patient rows.

    Args:
        model: a trained classifier with a predict() method.
        new_data: DataFrame (or pre-encoded array) of patient features.
        encoders: dict from prepare_data() with 'gender', 'blood_type',
            and 'diagnosis' LabelEncoders.
        scaler: the fitted StandardScaler from prepare_data().

    Returns:
        Decoded diagnosis labels for each input row.
    """
    # Preprocess DataFrame input: encode categoricals the same way as training.
    if isinstance(new_data, pd.DataFrame):
        # BUG FIX: copy first so the caller's DataFrame is not mutated in place.
        new_data = new_data.copy()
        if 'gender' in new_data.columns:
            new_data['gender'] = encoders['gender'].transform(new_data['gender'])
        if 'blood_type' in new_data.columns:
            new_data['blood_type'] = encoders['blood_type'].transform(new_data['blood_type'])

    # Scale with the training-fitted scaler.
    new_data_scaled = scaler.transform(new_data)

    # Predict encoded labels, then map back to original class names.
    prediction_encoded = model.predict(new_data_scaled)
    prediction = encoders['diagnosis'].inverse_transform(prediction_encoded)

    return prediction

# Main execution: end-to-end pipeline from raw CSV to best-model report.
if __name__ == "__main__":
    # Prepare the data.
    X_train, X_test, y_train, y_test, encoders = prepare_data()

    # Train and evaluate all models.
    results = train_and_evaluate_models(X_train, X_test, y_train, y_test, encoders)

    # Plot the side-by-side performance comparison.
    plot_model_comparison(results)

    # Report the best model by cross-validation score.
    best_model_name, best_score, model = get_best_model(results)
    print(f"\nBest performing model: {best_model_name}")
    print(f"Cross-validation accuracy: {best_score:.4f}")

In [ ]:

You might also like