LAB-5
Build a Logistic Regression model for a given dataset.
OBSERVATION:
CODE:
# Standard imports for numerical work, dataframes, and visualization.
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # NOTE(review): module name reconstructed from plt usage below — confirm
import seaborn as sns
import plotly.express as px  # NOTE(review): module name reconstructed from px usage below — confirm
import pprint
import pickle
# In [4]:
# Load the breast-cancer dataset. NOTE(review): the original CSV filename was
# lost in transcription — "data.csv" assumed; TODO confirm.
df = pd.read_csv('data.csv')

# In [5]:
# Inspect the first rows of the dataframe.
df.head()

# In [6]:
df.drop('id', axis=1, inplace=True)  # drop redundant columns

# In [7]:
# Encode the label into 1/0: 'M' (malignant) -> 1, anything else -> 0.
df['diagnosis'] = (df['diagnosis'] == 'M').astype(int)

# In [8]:
# Pairwise correlation matrix of all (now numeric) columns.
corr = df.corr()

# In [9]:
# Visualize the correlation matrix as an annotated heatmap.
plt.figure(figsize=(20, 20))
sns.heatmap(corr, cmap='mako_r', annot=True)
plt.show()
# In [12]:
# Get the absolute value of the correlation with the target
cor_target = abs(corr["diagnosis"])
# Select highly correlated features (threshold = 0.2)
relevant_features = cor_target[cor_target > 0.2]
# Collect the names of the features
names = [index for index, value in relevant_features.items()]
# Drop the target variable itself from the results
names.remove('diagnosis')
# Display the selected feature names
pprint.pprint(names)
['radius_mean',
'texture_mean',
'perimeter_mean',
'area_mean',
'smoothness_mean',
'compactness_mean',
'concavity_mean',
'concave points_mean',
'symmetry_mean',
'radius_se',
'perimeter_se',
'area_se',
'compactness_se',
'concavity_se',
'concave points_se',
'radius_worst',
'texture_worst',
'perimeter_worst',
'area_worst',
'smoothness_worst',
'compactness_worst',
'concavity_worst',
'concave points_worst',
'symmetry_worst',
'fractal_dimension_worst']
# In [13]:
# Feature matrix and label vector as plain NumPy arrays for the custom model.
X = df[names].values
y = df['diagnosis'].values
# In [14]:
def train_test_split(X, y, random_state=42, test_size=0.2):
    """
    Splits the data into training and testing sets.

    Parameters:
        X (numpy.ndarray): Features array of shape (n_samples, n_features).
        y (numpy.ndarray): Target array of shape (n_samples,).
        random_state (int): Seed for the random number generator. Default is 42.
        test_size (float): Proportion of samples to include in the test set.
            Default is 0.2.

    Returns:
        Tuple[numpy.ndarray]: A tuple containing X_train, X_test, y_train, y_test.
    """
    # Get number of samples
    n_samples = X.shape[0]
    # Seed the global RNG so the split is reproducible
    np.random.seed(random_state)
    # Shuffle the sample indices
    shuffled_indices = np.random.permutation(np.arange(n_samples))
    # Number of samples in the test set (separate name so the
    # test_size parameter is not shadowed by the integer count)
    n_test = int(n_samples * test_size)
    # Split the indices into test and train
    test_indices = shuffled_indices[:n_test]
    train_indices = shuffled_indices[n_test:]
    # Split the features and target arrays into test and train
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    return X_train, X_test, y_train, y_test
# In [15]:
# 80/20 split with the default seed (42) for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y)
# In [16]:
def standardize_data(X_train, X_test):
    """
    Standardizes the input data using mean and standard deviation.

    Parameters:
        X_train (numpy.ndarray): Training data.
        X_test (numpy.ndarray): Testing data.

    Returns:
        Tuple of standardized training and testing data.
    """
    # Statistics come from the training data only, to avoid test-set leakage.
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    # Standardize both sets with the training statistics.
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test
# Z-score both splits using statistics computed from the training set only.
X_train, X_test = standardize_data(X_train, X_test)
# In [17]:
def sigmoid(z):
    """
    Compute the sigmoid function for a given input.

    The sigmoid maps any real-valued number to a value between 0 and 1 and is
    the activation used in logistic regression and neural networks.

    Parameters:
        z (float or numpy.ndarray): The input value(s) for which to compute
            the sigmoid.

    Returns:
        float or numpy.ndarray: The sigmoid of the input value(s).

    Example:
        >>> sigmoid(0)
        0.5
    """
    # Formula: 1 / (1 + e^(-z)).
    return 1 / (1 + np.exp(-z))
# In [18]:
# Visualize the logistic function over [-12, 12].
z = np.linspace(-12, 12, 200)
fig = px.line(x=z, y=sigmoid(z), title='Logistic Function', template="plotly_dark")
fig.update_layout(
    title_font_color="#41BEE9",
    xaxis=dict(color="#41BEE9"),
    yaxis=dict(color="#41BEE9")
)
fig.show()
# In [19]:
class LogisticRegression:
    """
    Logistic Regression model trained with batch gradient descent.

    Parameters:
        learning_rate (float): Step size for gradient-descent updates.

    Methods:
        initialize_parameter(): Initializes the parameters of the model.
        sigmoid(z): Computes the sigmoid activation function for given input z.
        forward(X): Computes forward propagation for given input X.
        compute_cost(predictions): Computes the cost function for given predictions.
        compute_gradient(predictions): Computes the gradients for the model.
        fit(X, y, iterations, plot_cost): Trains the model on X and labels y.
        predict(X): Predicts the labels for given input X.
        save_model(filename): Pickles the learned parameters to a file.
        load_model(filename): Loads a pickled model (classmethod).
    """

    def __init__(self, learning_rate=0.0001):
        # Fixed seed so weight initialization is reproducible across runs.
        np.random.seed(1)
        self.learning_rate = learning_rate

    @staticmethod
    def sigmoid(z):
        """Logistic function 1 / (1 + e^(-z)); maps reals into (0, 1)."""
        return 1 / (1 + np.exp(-z))

    def initialize_parameter(self):
        """
        Initializes the parameters of the model: one weight per feature
        plus a scalar bias.
        """
        # NOTE(review): the original initializer call was lost in
        # transcription; small random weights are assumed — confirm.
        self.W = np.random.randn(self.X.shape[1])
        self.b = 0.0

    def forward(self, X):
        """
        Computes forward propagation for given input X.

        Parameters:
            X (numpy.ndarray): Input array of shape (n_samples, n_features).

        Returns:
            numpy.ndarray: Predicted probabilities, shape (n_samples,).
        """
        Z = np.dot(X, self.W) + self.b
        return self.sigmoid(Z)

    def compute_cost(self, predictions):
        """
        Computes the binary cross-entropy cost for given predictions.

        Parameters:
            predictions (numpy.ndarray): Predicted probabilities.

        Returns:
            float: Mean cross-entropy cost of the model.
        """
        m = self.X.shape[0]  # number of training examples
        # Small epsilon (1e-8) avoids taking log of 0.
        cost = np.sum(
            (-np.log(predictions + 1e-8) * self.y)
            + (-np.log(1 - predictions + 1e-8)) * (1 - self.y)
        )
        return cost / m

    def compute_gradient(self, predictions):
        """
        Computes the gradients of the cost w.r.t. W and b.

        Parameters:
            predictions (numpy.ndarray): Predicted probabilities.
        """
        # number of training examples
        m = self.X.shape[0]
        # dJ/dW = X^T (p - y) / m  and  dJ/db = sum(p - y) / m
        self.dW = np.dot(self.X.T, (predictions - self.y)) / m
        self.db = np.sum(predictions - self.y) / m

    def fit(self, X, y, iterations, plot_cost=True):
        """
        Trains the model on given input X and labels y for the specified
        number of iterations.

        Parameters:
            X (numpy.ndarray): Features array of shape (n_samples, n_features).
            y (numpy.ndarray): Labels array of shape (n_samples,).
            iterations (int): Number of gradient-descent iterations.
            plot_cost (bool): Whether to plot cost over iterations or not.

        Returns:
            None.
        """
        self.X = X
        self.y = y
        self.initialize_parameter()
        costs = []
        for i in range(iterations):
            # forward propagation
            predictions = self.forward(self.X)
            # track the cost for the learning curve
            cost = self.compute_cost(predictions)
            costs.append(cost)
            # compute gradients
            self.compute_gradient(predictions)
            # gradient-descent parameter update
            self.W = self.W - self.learning_rate * self.dW
            self.b = self.b - self.learning_rate * self.db
            # print cost every 10000 iterations
            if i % 10000 == 0:
                print("Cost after iteration {}: {}".format(i, cost))
        if plot_cost:
            fig = px.line(y=costs, title="Cost vs Iteration", template="plotly_dark")
            fig.update_layout(
                title_font_color="#41BEE9",
                xaxis=dict(color="#41BEE9", title="Iterations"),
                yaxis=dict(color="#41BEE9", title="cost")
            )
            fig.show()

    def predict(self, X):
        """
        Predicts the labels for given input X.

        Parameters:
            X (numpy.ndarray): Input features array.

        Returns:
            numpy.ndarray: Predicted 0/1 labels (probabilities rounded at 0.5).
        """
        return np.round(self.forward(X))

    def save_model(self, filename=None):
        """
        Save the trained model to a file using pickle.

        Parameters:
            filename (str): The name of the file to save the model to.
        """
        model_data = {
            'learning_rate': self.learning_rate,
            'W': self.W,
            'b': self.b
        }
        with open(filename, 'wb') as file:
            pickle.dump(model_data, file)

    @classmethod
    def load_model(cls, filename):
        """
        Load a trained model from a file using pickle.

        Parameters:
            filename (str): The name of the file to load the model from.

        Returns:
            LogisticRegression: An instance of the LogisticRegression class
            with loaded parameters.
        """
        with open(filename, 'rb') as file:
            model_data = pickle.load(file)
        # Create a new instance and restore the learned parameters.
        loaded_model = cls(model_data['learning_rate'])
        loaded_model.W = model_data['W']
        loaded_model.b = model_data['b']
        return loaded_model
# In [21]:
# Train for 100k iterations on the standardized training split.
lg = LogisticRegression()
lg.fit(X_train, y_train, 100000)

# In [22]:
# Persist the learned parameters. NOTE(review): original filename lost in
# transcription — "model.pkl" assumed; must match the later load step.
lg.save_model("model.pkl")
# In [23]:
class ClassificationMetrics:
    """Static evaluation metrics for binary (0/1) classification."""

    @staticmethod
    def accuracy(y_true, y_pred):
        """
        Computes the accuracy of a classification model.

        Parameters:
            y_true (numpy array): True labels for each data point.
            y_pred (numpy array): Predicted labels for each data point.

        Returns:
            float: Fraction of predictions that match the true labels.
        """
        y_true = y_true.flatten()
        total_samples = len(y_true)
        correct_predictions = np.sum(y_true == y_pred)
        return correct_predictions / total_samples

    @staticmethod
    def precision(y_true, y_pred):
        """
        Computes the precision of a classification model.

        Parameters:
            y_true (numpy array): True labels for each data point.
            y_pred (numpy array): Predicted labels for each data point.

        Returns:
            float: TP / (TP + FP) — the proportion of true positives out of
            all positive predictions; 0.0 when nothing is predicted positive.
        """
        true_positives = np.sum((y_true == 1) & (y_pred == 1))
        false_positives = np.sum((y_true == 0) & (y_pred == 1))
        predicted_positives = true_positives + false_positives
        # Guard against division by zero when the model predicts no positives.
        if predicted_positives == 0:
            return 0.0
        return true_positives / predicted_positives

    @staticmethod
    def recall(y_true, y_pred):
        """
        Computes the recall (sensitivity) of a classification model.

        Parameters:
            y_true (numpy array): True labels for each data point.
            y_pred (numpy array): Predicted labels for each data point.

        Returns:
            float: TP / (TP + FN) — the proportion of true positives out of
            all actual positives; 0.0 when the dataset has no positives.
        """
        true_positives = np.sum((y_true == 1) & (y_pred == 1))
        false_negatives = np.sum((y_true == 1) & (y_pred == 0))
        actual_positives = true_positives + false_negatives
        # Guard against division by zero when there are no positive samples.
        if actual_positives == 0:
            return 0.0
        return true_positives / actual_positives

    @staticmethod
    def f1_score(y_true, y_pred):
        """
        Computes the F1-score of a classification model.

        Parameters:
            y_true (numpy array): True labels for each data point.
            y_pred (numpy array): Predicted labels for each data point.

        Returns:
            float: Harmonic mean of precision and recall; 0.0 when both are 0.
        """
        precision_value = ClassificationMetrics.precision(y_true, y_pred)
        recall_value = ClassificationMetrics.recall(y_true, y_pred)
        # Guard: the harmonic mean is 0 when both components are 0.
        if precision_value + recall_value == 0:
            return 0.0
        return 2 * (precision_value * recall_value) / (precision_value + recall_value)
# In [24]:
# Reload the persisted model. NOTE(review): filename lost in transcription —
# "model.pkl" assumed; must match the filename used when saving.
model = LogisticRegression.load_model("model.pkl")

# In [25]:
# Score the reloaded model on the held-out test split.
y_pred = model.predict(X_test)
accuracy = ClassificationMetrics.accuracy(y_test, y_pred)
precision = ClassificationMetrics.precision(y_test, y_pred)
recall = ClassificationMetrics.recall(y_test, y_pred)
f1_score = ClassificationMetrics.f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1-Score: {f1_score:.2%}")
OUTPUT: