1. Write a Python program to compute Central Tendency Measures (Mean, Median, Mode) and Measures of Dispersion (Variance, Standard Deviation)
import statistics as stats

def compute_statistics(data):
    if not data:
        return "Data is empty, please provide valid input."
    # Central Tendency Measures
    mean = stats.mean(data)
    median = stats.median(data)
    try:
        # Note: since Python 3.8, stats.mode() returns the first mode of
        # multimodal data instead of raising StatisticsError.
        mode = stats.mode(data)
    except stats.StatisticsError:
        mode = "No unique mode"
    # Measures of Dispersion (both require at least two data points)
    variance = stats.variance(data) if len(data) > 1 else "Variance requires at least two data points"
    std_dev = stats.stdev(data) if len(data) > 1 else "Standard deviation requires at least two data points"
    # Display Results
    print(f"Mean: {mean}")
    print(f"Median: {median}")
    print(f"Mode: {mode}")
    print(f"Variance: {variance}")
    print(f"Standard Deviation: {std_dev}")

# Example usage
data = [10, 20, 20, 30, 40, 50]
compute_statistics(data)
OUTPUT
Mean: 28.333333333333332
Median: 25.0
Mode: 20
Variance: 216.66666666666666
Standard Deviation: 14.719601443879744
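For data with more than one mode, statistics.multimode() (Python 3.8+) returns all of them; a minimal sketch on hypothetical bimodal data:
import statistics as stats
bimodal = [10, 20, 20, 30, 30, 40]
print(stats.multimode(bimodal))  # [20, 30] -- both values tie for most frequent
print(stats.mode(bimodal))       # 20 -- the first mode encountered (Python 3.8+)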
2. Study of Python Basic Libraries such as statistics, math, NumPy and SciPy
Statistics:
import statistics as stats
data = [10, 20, 20, 30]
print(stats.mean(data)) # 20
print(stats.stdev(data)) # 8.16
Math:
import math
print(math.sqrt(16)) # 4.0
print(math.pi) # 3.141592653589793
NumPy:
import numpy as np
arr = np.array([1, 2, 3, 4])
print(np.mean(arr)) # 2.5
print(np.std(arr)) # 1.118033988749895
SciPy:
from scipy import stats
data = [10, 20, 30, 40]
print(stats.zscore(data)) # Z-scores
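SciPy's stats module goes well beyond z-scores; a small sketch of descriptive statistics and a one-sample t-test on the same data (the popmean of 20 is an arbitrary choice for illustration):
from scipy import stats
data = [10, 20, 30, 40]
summary = stats.describe(data)         # n, min/max, mean, variance, skewness, kurtosis
print(summary.mean, summary.variance)  # 25.0 166.66666666666666 (sample variance)
t_stat, p_value = stats.ttest_1samp(data, popmean=20)  # test H0: population mean == 20
print(t_stat, p_value)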
3. Study of Python Libraries for ML applications such as Pandas and Matplotlib
Pandas:
import pandas as pd
# Load data
data = {'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [25, 30, 35], 'Score': [85, 90, 95]}
df = pd.DataFrame(data)
# Basic operations
print(df.describe()) # Summary statistics
print(df['Age'].mean()) # Mean age
df['Score'] += 5 # Add 5 to all scores
print(df)
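Pandas also supports filtering, sorting, and grouping; a minimal sketch on the same DataFrame:
# Rows matching a condition
print(df[df["Age"] > 28])  # Bob and Charlie
# Sort by score, highest first
print(df.sort_values("Score", ascending=False))
# Average score per age group
print(df.groupby("Age")["Score"].mean())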
Matplotlib:
import matplotlib.pyplot as plt
# Example data
x = [1, 2, 3, 4, 5]
y = [2, 4, 6, 8, 10]
# Create a line plot
plt.plot(x, y, label="Linear Growth", color="blue", marker="o")
plt.title("Example Plot")
plt.xlabel("X-axis")
plt.ylabel("Y-axis")
plt.legend()
plt.show()
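Beyond line plots, the same pyplot interface draws bar charts, scatter plots, and more; a short sketch reusing the data above:
import matplotlib.pyplot as plt
x = [1, 2, 3, 4, 5]
y = [2, 4, 6, 8, 10]
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))
ax1.bar(x, y, color="green")
ax1.set_title("Bar Chart")
ax2.scatter(x, y, color="red")
ax2.set_title("Scatter Plot")
plt.tight_layout()
plt.show()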
4. Write a Python program to implement Simple Linear Regression
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Sample dataset
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).reshape(-1, 1) # Feature (independent variable)
y = np.array([2, 4, 5, 4, 5, 7, 8, 8, 10, 12]) # Target (dependent variable)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a Linear Regression model
model = LinearRegression()
# Train the model
model.fit(X_train, y_train)
# Predict on test data
y_pred = model.predict(X_test)
# Display results
print("Intercept:", model.intercept_)
print("Slope:", model.coef_[0])
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R^2 Score:", r2_score(y_test, y_pred))
# Visualize the results
plt.scatter(X, y, color="blue", label="Actual data")
plt.plot(X, model.predict(X), color="red", label="Regression line")
plt.title("Simple Linear Regression")
plt.xlabel("X (Independent variable)")
plt.ylabel("y (Dependent variable)")
plt.legend()
plt.show()
OUTPUT:
Intercept and Slope: Parameters of the regression line.
Evaluation Metrics:
Mean Squared Error (MSE): Measures average prediction error.
R² Score: Indicates how well the model explains the variability of the data.
Plot: A graph showing the data points and regression line.
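As a sanity check, the same line can be fit in closed form with np.polyfit; a minimal sketch (fit on the full dataset, so the coefficients will differ slightly from the train/test fit above):
import numpy as np
x_vals = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
y_vals = np.array([2, 4, 5, 4, 5, 7, 8, 8, 10, 12])
slope, intercept = np.polyfit(x_vals, y_vals, deg=1)  # least-squares fit of a degree-1 polynomial
print(f"Slope: {slope:.4f}, Intercept: {intercept:.4f}")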
5. Implementation of Multiple Linear Regression for House Price Prediction using sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Sample dataset
data = {
    "Size (sq ft)": [1500, 1700, 1800, 2400, 3000],
    "Bedrooms": [3, 4, 3, 5, 4],
    "Age (years)": [10, 15, 20, 5, 8],
    "Price ($)": [400000, 450000, 420000, 600000, 700000]
}
# Create a DataFrame
df = pd.DataFrame(data)
# Features (independent variables) and Target (dependent variable)
X = df[["Size (sq ft)", "Bedrooms", "Age (years)"]] # Features
y = df["Price ($)"] # Target
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a Multiple Linear Regression model
model = LinearRegression()
# Train the model
model.fit(X_train, y_train)
# Predict house prices for the test set
y_pred = model.predict(X_test)
# Display coefficients and evaluation metrics
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_) # Corresponding to "Size", "Bedrooms", "Age"
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R^2 Score:", r2_score(y_test, y_pred))
# Predict price for a new house (a DataFrame keeps feature names consistent with training)
new_house = pd.DataFrame([[2000, 4, 10]], columns=X.columns)  # 2000 sq ft, 4 bedrooms, 10 years old
predicted_price = model.predict(new_house)
print(f"Predicted price for the new house: ${predicted_price[0]:,.2f}")
OUTPUT (illustrative; with only five rows the test set holds a single sample, so R² is not well defined and real runs may report nan):
Intercept: 250000.0
Coefficients: [100.0, 20000.0, -3000.0]
Mean Squared Error: 2500000000.0
R^2 Score: 0.85
Predicted price for the new house: $500,000.00
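The prediction is just the intercept plus the coefficient-weighted features; a quick arithmetic check using the illustrative numbers above:
intercept = 250000.0
coef = [100.0, 20000.0, -3000.0]  # Size, Bedrooms, Age
# price = intercept + 100*2000 + 20000*4 + (-3000)*10
price = intercept + coef[0] * 2000 + coef[1] * 4 + coef[2] * 10
print(f"${price:,.2f}")  # $500,000.00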
6. Implementation of Decision Tree using sklearn and its parameter tuning
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
# Sample dataset (House price prediction example)
data = {
    "Size (sq ft)": [1500, 1700, 1800, 2400, 3000],
    "Bedrooms": [3, 4, 3, 5, 4],
    "Age (years)": [10, 15, 20, 5, 8],
    "Price ($)": [400000, 450000, 420000, 600000, 700000]
}
# Create a DataFrame
df = pd.DataFrame(data)
# Features and Target
X = df[["Size (sq ft)", "Bedrooms", "Age (years)"]] # Features
y = df["Price ($)"] # Target for regression
# For classification (Example: Classify house prices as "Affordable" or "Expensive")
df['Class'] = ["Affordable", "Affordable", "Affordable", "Expensive", "Expensive"]
y_class = df["Class"] # Target for classification
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# ---- Decision Tree Regressor ----
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)
y_pred_reg = regressor.predict(X_test)
print("Decision Tree Regressor")
print("Mean Squared Error:", mean_squared_error(y_test, y_pred_reg))
# ---- Decision Tree Classifier ----
# Split for classification example
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X, y_class, test_size=0.2, random_state=42)
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X_train_class, y_train_class)
y_pred_class = classifier.predict(X_test_class)
print("\nDecision Tree Classifier")
print("Accuracy:", accuracy_score(y_test_class, y_pred_class))
# ---- Parameter Tuning using GridSearchCV ----
param_grid = {
    "max_depth": [2, 3, 5, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}
# Tuning for Regressor
grid_search_reg = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid, cv=3,
                               scoring="neg_mean_squared_error")
grid_search_reg.fit(X, y)
print("\nBest Parameters for Regressor:", grid_search_reg.best_params_)
# Tuning for Classifier
# cv=2 here: the smallest class ("Expensive") has only 2 samples, so stratified CV allows at most 2 folds
grid_search_class = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=2,
                                 scoring="accuracy")
grid_search_class.fit(X, y_class)
print("Best Parameters for Classifier:", grid_search_class.best_params_)
OUTPUT (illustrative; with such a tiny dataset, exact values depend on the split):
Decision Tree Regressor
Mean Squared Error: 1000000000.0
Decision Tree Classifier
Accuracy: 1.0
Best Parameters for Regressor: {'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best Parameters for Classifier: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
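To see what a fitted tree actually learned, sklearn.tree.export_text prints its split rules; a minimal sketch, assuming the regressor fitted above:
from sklearn.tree import export_text
# Print the learned decision rules of the fitted regressor
rules = export_text(regressor, feature_names=["Size (sq ft)", "Bedrooms", "Age (years)"])
print(rules)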
7. Implementation of KNN using sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
# Sample dataset
data = {
    "Feature1": [1, 2, 3, 4, 5, 6, 7, 8],
    "Feature2": [2, 4, 6, 8, 10, 12, 14, 16],
    "Target_Class": [0, 0, 0, 1, 1, 1, 1, 0],  # Classification labels
    "Target_Value": [1.1, 2.0, 3.0, 4.1, 5.2, 6.3, 7.4, 8.5]  # Regression values
}
# Create a DataFrame
df = pd.DataFrame(data)
# Features and targets
X = df[["Feature1", "Feature2"]]
y_class = df["Target_Class"] # Classification target
y_reg = df["Target_Value"] # Regression target
# Split data for classification
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X, y_class, test_size=0.2, random_state=42)
# Split data for regression
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, test_size=0.2, random_state=42)
# ---- KNN Classifier ----
knn_classifier = KNeighborsClassifier(n_neighbors=3) # Set k = 3
knn_classifier.fit(X_train_class, y_train_class)
y_pred_class = knn_classifier.predict(X_test_class)
print("KNN Classifier")
print("Accuracy:", accuracy_score(y_test_class, y_pred_class))
# ---- KNN Regressor ----
knn_regressor = KNeighborsRegressor(n_neighbors=3) # Set k = 3
knn_regressor.fit(X_train_reg, y_train_reg)
y_pred_reg = knn_regressor.predict(X_test_reg)
print("\nKNN Regressor")
print("Mean Squared Error:", mean_squared_error(y_test_reg, y_pred_reg))
# ---- Example Predictions ----
# Classification (a DataFrame keeps feature names consistent with training)
new_sample_class = pd.DataFrame([[3.5, 7.0]], columns=X.columns)
predicted_class = knn_classifier.predict(new_sample_class)
print("\nPredicted Class for New Sample:", predicted_class)
# Regression
new_sample_reg = pd.DataFrame([[4.5, 9.0]], columns=X.columns)
predicted_value = knn_regressor.predict(new_sample_reg)
print("Predicted Value for New Sample:", predicted_value)
OUTPUT:
KNN Classifier
Accuracy: 1.0
KNN Regressor
Mean Squared Error: 0.04666666666666672
Predicted Class for New Sample: [0]
Predicted Value for New Sample: [4.06666667]
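KNN is distance-based, so features on larger scales dominate the metric; a sketch of adding standardization via a Pipeline (reusing the classification split from above):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
# Scale features to zero mean / unit variance before computing distances
scaled_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
scaled_knn.fit(X_train_class, y_train_class)
print("Scaled KNN accuracy:", scaled_knn.score(X_test_class, y_test_class))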
8. Implementation of Logistic Regression using sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Sample dataset
data = {
    "Feature1": [2.5, 3.0, 4.5, 5.0, 6.5, 7.0, 8.5, 9.0],
    "Feature2": [1.0, 2.0, 1.5, 3.0, 2.5, 4.0, 3.5, 4.5],
    "Target": [0, 0, 0, 0, 1, 1, 1, 1]  # Binary classification (0 or 1)
}
# Create a DataFrame
df = pd.DataFrame(data)
# Features and Target
X = df[["Feature1", "Feature2"]] # Features
y = df["Target"] # Target (0 or 1)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# ---- Logistic Regression Model ----
model = LogisticRegression()
model.fit(X_train, y_train)
# Predict on test data
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)
# Evaluation Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# Example Prediction (a DataFrame keeps feature names consistent with training)
new_sample = pd.DataFrame([[5.5, 3.0]], columns=X.columns)  # New data point
predicted_class = model.predict(new_sample)
predicted_probabilities = model.predict_proba(new_sample)
print("\nPredicted Class for New Sample:", predicted_class)
print("Predicted Probabilities for New Sample:", predicted_probabilities)
OUTPUT (illustrative; the probabilities are rounded)
Accuracy: 1.0
Confusion Matrix:
[[1 0]
[0 1]]
Classification Report:
              precision    recall  f1-score   support
           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2
Predicted Class for New Sample: [1]
Predicted Probabilities for New Sample: [[0.1 0.9]]
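Under the hood, logistic regression models P(y=1) = 1 / (1 + e^-(w·x + b)); a minimal sketch recomputing the probability for the new sample from the fitted parameters above:
import numpy as np
w = model.coef_[0]             # one weight per feature
b = model.intercept_[0]
z = np.dot(w, [5.5, 3.0]) + b
p1 = 1 / (1 + np.exp(-z))      # sigmoid
print("P(class=1):", p1)       # should match predict_proba's second column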
9. Implementation of K-Means Clustering
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
# Generate synthetic data
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=42)
# ---- K-Means Clustering ----
# Initialize and fit the KMeans model (n_init set explicitly; its default changed across sklearn versions)
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)  # 4 clusters
y_kmeans = kmeans.fit_predict(X)
# Display the cluster centers
print("Cluster Centers:\n", kmeans.cluster_centers_)
# ---- Visualizing the Clusters ----
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, cmap='viridis', s=50, alpha=0.7)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=200, c='red',
            marker='X', label='Centroids')
plt.title("K-Means Clustering")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.legend()
plt.show()
# ---- Elbow Method to Determine Optimal Clusters ----
inertia = []
k_range = range(1, 10)
for k in k_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)
# Plot the Elbow Curve
plt.plot(k_range, inertia, marker='o')
plt.title("Elbow Method for Optimal k")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Inertia")
plt.show()
OUTPUT
Clustering Visualization
A scatter plot shows the data points in their respective clusters, with red "X" marks for
centroids.
Elbow Curve
A plot showing the inertia against the number of clusters helps determine the optimal
number of clusters.
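The elbow method is only a heuristic; silhouette analysis gives a complementary score (higher is better, defined for k >= 2). A minimal sketch, assuming X from above:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
for k in range(2, 7):
    labels = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(X)
    print(f"k={k}: silhouette={silhouette_score(X, labels):.3f}")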
10. Performance analysis of Classification Algorithms on a specific dataset (Mini Project)
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
# ---- Load Dataset ----
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name="Target")
# ---- Train-Test Split ----
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# ---- Models ----
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Support Vector Machine": SVC(kernel='linear', probability=True)
}
# ---- Performance Evaluation ----
results = []
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    # Predictions
    y_pred = model.predict(X_test)
    # Evaluation Metrics
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    # Cross-validation
    cv_scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")
    cv_mean = np.mean(cv_scores)
    # Append results
    results.append({
        "Model": name,
        "Accuracy": acc,
        "CV Mean Accuracy": cv_mean,
        "Confusion Matrix": cm,
        "Classification Report": report
    })
# ---- Display Results ----
for result in results:
    print(f"Model: {result['Model']}")
    print(f"Accuracy: {result['Accuracy']:.2f}")
    print(f"Cross-Validation Mean Accuracy: {result['CV Mean Accuracy']:.2f}")
    print("Confusion Matrix:")
    print(result["Confusion Matrix"])
    print("Classification Report:")
    print(pd.DataFrame(result["Classification Report"]).transpose())
    print("-" * 40)
OUTPUT (excerpt; the remaining three models print in the same format)
Model: Logistic Regression
Accuracy: 0.97
Cross-Validation Mean Accuracy: 0.97
Confusion Matrix:
[[10 0 0]
[ 0 9 0]
[ 0 1 10]]
Classification Report:
              precision    recall  f1-score   support
0                  1.00      1.00      1.00      10.0
1                  0.90      1.00      0.95       9.0
2                  1.00      0.91      0.95      11.0
accuracy                               0.97      30.0
macro avg          0.97      0.97      0.97      30.0
weighted avg       0.97      0.97      0.97      30.0
----------------------------------------
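To compare the models at a glance, the collected accuracies can also be plotted; a minimal sketch, assuming the results list built above:
import matplotlib.pyplot as plt
# Bar chart of test accuracy per model
names = [r["Model"] for r in results]
accs = [r["Accuracy"] for r in results]
plt.bar(names, accs, color="steelblue")
plt.ylabel("Test Accuracy")
plt.title("Classifier Comparison on the Iris Dataset")
plt.xticks(rotation=20)
plt.ylim(0, 1.05)
plt.show()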