
Data Mining Practicals

Underfitting:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Generate synthetic data
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a simple linear regression model
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# Evaluate the model on the training data
y_train_pred = linear_reg.predict(X_train)
mse_train = mean_squared_error(y_train, y_train_pred)

# Evaluate the model on the testing data
y_test_pred = linear_reg.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)

# Plot the data and the model
plt.scatter(X, y, label='Data points')
plt.plot(X, linear_reg.predict(X), color='red', label='Linear Regression Model')
plt.title(f'Underfitting Example\nTrain MSE: {mse_train:.2f}, Test MSE: {mse_test:.2f}')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()
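
Note that the data generated above is itself linear, so a straight-line fit is actually adequate here; the script mainly illustrates the train/test workflow and the MSE comparison. A minimal variant that makes underfitting visible (the quadratic data below is assumed purely for illustration and is not part of the original practical) keeps the linear model but generates curved data:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Quadratic data: a straight line cannot follow the curvature, so it underfits
np.random.seed(0)
X_curved = 6 * np.random.rand(100, 1) - 3
y_curved = 0.5 * X_curved ** 2 + X_curved + 2 + np.random.randn(100, 1)

lin = LinearRegression().fit(X_curved, y_curved)
mse_curved = mean_squared_error(y_curved, lin.predict(X_curved))
print(f'MSE of a linear fit on quadratic data: {mse_curved:.2f}')

The reported MSE stays well above the noise level because no straight line can follow the curvature.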
Overfitting:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

# Generate synthetic data
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a polynomial features transformer
poly_features = PolynomialFeatures(degree=15, include_bias=False)
X_poly = poly_features.fit_transform(X_train)

# Fit a linear regression model on the polynomial features
lin_reg = LinearRegression()
lin_reg.fit(X_poly, y_train)

# Visualize the overfitting
X_range = np.linspace(0, 2, 100).reshape(-1, 1)
X_range_poly = poly_features.transform(X_range)
y_pred = lin_reg.predict(X_range_poly)

plt.scatter(X_train, y_train, label='Training Data')
plt.scatter(X_test, y_test, label='Testing Data', color='r')
plt.plot(X_range, y_pred, label='Polynomial Regression', color='g')
plt.title('Overfitting Example')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()

# Evaluate the model on training and testing data
y_train_pred = lin_reg.predict(X_poly)
mse_train = mean_squared_error(y_train, y_train_pred)
print(f'Mean Squared Error on Training Data: {mse_train}')

X_test_poly = poly_features.transform(X_test)
y_test_pred = lin_reg.predict(X_test_poly)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f'Mean Squared Error on Testing Data: {mse_test}')
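
To see how regularization curbs this behaviour, the following short addition (a sketch, not part of the original script; it reuses X_poly, y_train, poly_features, X_test and y_test defined above) swaps plain LinearRegression for Ridge on the same degree-15 features:

from sklearn.linear_model import Ridge

# Ridge penalizes large coefficients, which reins in the wildly oscillating degree-15 fit
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_poly, y_train)

print(f'Ridge MSE on Training Data: {mean_squared_error(y_train, ridge_reg.predict(X_poly))}')
print(f'Ridge MSE on Testing Data: {mean_squared_error(y_test, ridge_reg.predict(poly_features.transform(X_test)))}')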
Cross validation:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Load dataset (replace this with your own dataset)
data = load_iris()
X, y = data.data, data.target

# Initialize a classifier (replace this with your own classifier)
classifier = RandomForestClassifier()

# Set up KFold cross-validation with 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(classifier, X, y, cv=kf)

# Print the cross-validation scores
for i, score in enumerate(cv_scores):
    print(f'Fold {i+1}: {score}')

# Print the mean and standard deviation of the cross-validation scores
print(f'Mean CV Score: {cv_scores.mean()}')
print(f'Standard Deviation of CV Scores: {cv_scores.std()}')
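
For a classification dataset such as Iris, stratified folds keep the class proportions the same in every split; a minimal variant (reusing classifier, X and y from the script above) is:

from sklearn.model_selection import StratifiedKFold, cross_val_score

# StratifiedKFold preserves the class distribution of y in each fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stratified_scores = cross_val_score(classifier, X, y, cv=skf)
print(f'Mean Stratified CV Score: {stratified_scores.mean()}')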

Confusion matrix:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Load dataset (replace this with your own dataset)
data = load_iris()
X, y = data.data, data.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a classifier (replace this with your own classifier)
classifier = RandomForestClassifier()

# Train the classifier on the training data
classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = classifier.predict(X_test)

# Create a confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)
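
The confusion matrix counts can be summarized per class; a small addition (reusing y_test, y_pred and data from above) prints precision, recall and F1-score derived from the same predictions:

from sklearn.metrics import classification_report

# Per-class precision, recall and F1-score computed from the same test predictions
print(classification_report(y_test, y_pred, target_names=data.target_names))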

Gradient descent:
import numpy as np
import matplotlib.pyplot as plt

# Generate some random data for a linear relationship
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Add a bias term to the feature matrix
X_b = np.c_[np.ones((100, 1)), X]

# Set hyperparameters
learning_rate = 0.01
n_iterations = 1000

# Initialize random weights
theta = np.random.randn(2, 1)

# Gradient Descent algorithm
for iteration in range(n_iterations):
    gradients = 2/100 * X_b.T.dot(X_b.dot(theta) - y)
    theta = theta - learning_rate * gradients

# Print the final learned parameters (theta)
print("Final Parameters (theta):", theta)

# Plot the original data and the linear regression line
plt.scatter(X, y)
plt.plot(X, X_b.dot(theta), color='red', label='Linear Regression')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()
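
Because linear regression also has a closed-form solution, the learned theta can be checked against the normal equation theta = (X_b^T X_b)^(-1) X_b^T y; a small added check (reusing X_b, y and theta from above):

# Closed-form (normal equation) solution for comparison with gradient descent
theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
print("Normal Equation Parameters:", theta_best.ravel())
print("Gradient Descent Parameters:", theta.ravel())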
Grid search:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Load dataset (you can replace this with your own dataset)
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

# Define the model
svm_model = SVC()

# Define the hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.1, 0.01, 0.001],
    'degree': [2, 3, 4]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=3, scoring='accuracy')

# Fit the model with the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters: ", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy: {:.2f}%".format(accuracy * 100))

Randomized search CV:

# Import necessary libraries
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load dataset (you can replace this with your dataset)
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

# Define the model
rf = RandomForestClassifier()

# Define the hyperparameter distributions
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=10, cv=3, scoring='accuracy', random_state=42)

# Fit the model with the data
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters: ", random_search.best_params_)

# Get the best model
best_model = random_search.best_estimator_

# Evaluate the model on the test set
accuracy = best_model.score(X_test, y_test)
print("Test Accuracy: {:.2f}%".format(accuracy * 100))

Loss function:
import numpy as np

def mean_squared_error(y_true, y_pred):
    """
    Calculate the mean squared error between the true and predicted values.

    Parameters:
    - y_true: numpy array, true values
    - y_pred: numpy array, predicted values

    Returns:
    - mse: float, mean squared error
    """
    # Ensure the input arrays have the same shape
    assert y_true.shape == y_pred.shape, "Input arrays must have the same shape"

    # Calculate the squared differences
    squared_diff = (y_true - y_pred) ** 2

    # Calculate the mean squared error
    mse = np.mean(squared_diff)

    return mse

# Example usage:
# Replace these arrays with your actual true and predicted values
true_values = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
predicted_values = np.array([1.5, 2.5, 2.8, 3.7, 4.2])

# Calculate the mean squared error
mse_result = mean_squared_error(true_values, predicted_values)

# Print the result
print("Mean Squared Error:", mse_result)

Stochastic gradient descent (SGD):

import numpy as np

def stochastic_gradient_descent(X, y, learning_rate=0.01, epochs=100, batch_size=1):
    """
    Perform Stochastic Gradient Descent for linear regression.

    Parameters:
    - X: numpy array, feature matrix
    - y: numpy array, target values
    - learning_rate: float, step size for updating parameters
    - epochs: int, number of passes through the entire dataset
    - batch_size: int, number of samples in each mini-batch

    Returns:
    - theta: numpy array, learned parameters
    - cost_history: list, history of cost during optimization
    """
    # Initialize parameters
    num_samples, num_features = X.shape
    theta = np.zeros(num_features)
    cost_history = []

    # Stochastic Gradient Descent
    for epoch in range(epochs):
        for i in range(0, num_samples, batch_size):
            X_batch = X[i:i + batch_size]
            y_batch = y[i:i + batch_size]

            # Compute predictions
            predictions = np.dot(X_batch, theta)

            # Compute errors
            errors = predictions - y_batch

            # Update parameters
            gradient = np.dot(X_batch.T, errors) / batch_size
            theta -= learning_rate * gradient

        # Compute and record the cost after each epoch
        cost = np.mean((np.dot(X, theta) - y) ** 2) / 2.0
        cost_history.append(cost)

        if epoch % 10 == 0:
            print(f"Epoch {epoch}/{epochs}, Cost: {cost}")

    return theta, cost_history

# Example usage:
# Replace these arrays with your actual feature matrix (X) and target values (y)
X = np.array([[1, 2], [1, 3], [1, 4]])
y = np.array([5, 6, 7])

# Set hyperparameters
learning_rate = 0.01
epochs = 100
batch_size = 1

# Run stochastic gradient descent
theta, cost_history = stochastic_gradient_descent(X, y, learning_rate, epochs, batch_size)

# Print the learned parameters and cost history
print("Learned Parameters (Theta):", theta)
print("Final Cost:", cost_history[-1])
How to Save & Load Machine Learning Model
### Import Libraries
"""

# import libraries
import numpy as np
import pandas as pd

"""### Load Dataset"""

#load dataset
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

data.data

data.feature_names

data.target

data.target_names

# create dtaframe
df = pd.DataFrame(np.c_[data.data, data.target],
columns=[list(data.feature_names)+['target']])
df.head()

df.tail()

df.shape

"""### Split Data"""

X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=2020)

print ('Shape of X_train = ', X_train.shape)


print('Shape of y_train = ', y_train.shape)
print('Shape of X_test = ', X_test.shape)
print('Shape of y_test = ', y_test.shape)

"""## Train Random Forest Classification Model"""

from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=100, criterion='gini')


classifier.fit(X_train, y_train)
classifier.score(X_test, y_test)

"""## Predict Cancer"""

patient1 = [17.99,
10.38,
122.8,
1001.0,
0.1184,
0.2776,
0.3001,
0.1471,
0.2419,
0.07871,
1.095,
0.9053,
8.589,
153.4,
0.006399,
0.04904,
0.05373,
0.01587,
0.03003,
0.006193,
25.38,
17.33,
184.6,
2019.0,
0.1622,
0.6656,
0.7119,
0.2654,
0.4601,
0.1189]

patient1 = np.array([patient1])
patient1

classifier.predict(patient1)

data.target_names

pred = classifier.predict(patient1)

if pred [0] == 0:
print ('Patient has Cancer (malignant tumor)')
else:
print ('Patient has no Cancer (malignant benign)')

"""# Save Model

## Save Model using Pickle


"""
import pickle

pickle.dump(classifier, open('model_save', 'wb'))

model = pickle.load(open('model_save', 'rb'))

model.predict(patient1)[0]

"""## Save Model using Joblib"""

import joblib

joblib.dump(classifier, 'model_save2')

model2 = joblib.load('model_save2')

model2.predict(patient1)

Recommendation system:
Collaborative filtering:
import pandas as pd
import numpy as np

# Load the u.user file into a dataframe
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

users = pd.read_csv('/content/u.user', sep='|', names=u_cols, encoding='latin-1')

users.head(3)

# Load the u.item file into a dataframe
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDb URL', 'unknown',
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary',
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
          'Thriller', 'War', 'Western']

movies = pd.read_csv('/content/u.item', sep='|', names=i_cols, encoding='latin-1')

movies.head(2)

# Remove all information except Movie ID and title
movies = movies[['movie_id', 'title']]

# Load the u.data file into a dataframe
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

ratings = pd.read_csv('/content/u.data', sep='\t', names=r_cols, encoding='latin-1')

ratings.head(2)

# Drop the timestamp column
ratings = ratings.drop('timestamp', axis=1)

# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Assign X as the original ratings dataframe and y as the user_id column of ratings
X = ratings.copy()
y = ratings['user_id']

# Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

# Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

# Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Define the baseline model to always return 3
def baseline(user_id, movie_id):
    return 3.0

# Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    # Construct a list of user-movie tuples from the testing dataset
    id_pairs = zip(X_test['user_id'], X_test['movie_id'])

    # Predict the rating for every user-movie tuple
    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs])

    # Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['rating'])

    # Return the final RMSE score
    return rmse(y_true, y_pred)

score(baseline)

# Ratings Matrix
# Build the ratings matrix using the pivot_table function
r_matrix = X_train.pivot_table(values='rating', index='user_id', columns='movie_id')

r_matrix.head()

# User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, movie_id):
    # Check if movie_id exists in r_matrix
    if movie_id in r_matrix:
        # Compute the mean of all the ratings given to the movie
        mean_rating = r_matrix[movie_id].mean()
    else:
        # Default to a rating of 3.0 in the absence of any information
        mean_rating = 3.0

    return mean_rating

# Compute RMSE for the Mean model
score(cf_user_mean)

# Weighted Mean
# Create a dummy ratings matrix with all null values imputed to 0
r_matrix_dummy = r_matrix.copy().fillna(0)

# Import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

# Convert into a pandas dataframe
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)

cosine_sim.head(10)

# User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, movie_id):
    # Check if movie_id exists in r_matrix
    if movie_id in r_matrix:
        # Get the similarity scores for the user in question with every other user
        sim_scores = cosine_sim[user_id]

        # Get the user ratings for the movie in question
        m_ratings = r_matrix[movie_id]

        # Extract the indices containing NaN in the m_ratings series
        idx = m_ratings[m_ratings.isnull()].index

        # Drop the NaN values from the m_ratings Series
        m_ratings = m_ratings.dropna()

        # Drop the corresponding cosine scores from the sim_scores series
        sim_scores = sim_scores.drop(idx)

        # Default to a rating of 3.0 if no similar user has rated the movie
        if sim_scores.sum() == 0:
            wmean_rating = 3.0
        else:
            # Compute the final weighted mean
            wmean_rating = np.dot(sim_scores, m_ratings) / sim_scores.sum()
    else:
        # Default to a rating of 3.0 in the absence of any information
        wmean_rating = 3.0

    return wmean_rating

# Now, re-run the score function
score(cf_user_wmean)
Content based recommendation system:
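Only the heading for this practical survives in the text, so the following is just a minimal sketch of a content-based recommender; it assumes item descriptions like those used in the hybrid example below and ranks items by TF-IDF similarity of their descriptions:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical item catalogue with short text descriptions
items = pd.DataFrame({
    'item_id': ['A', 'B', 'C', 'D', 'E'],
    'description': ['Action movie', 'Drama movie', 'Comedy movie',
                    'Science fiction book', 'Mystery book']
})

# Represent each description as a TF-IDF vector and compare items pairwise
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(items['description'])
similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

def recommend_similar(item_id, top_n=3):
    # Rank the other items by how similar their descriptions are to the given item
    idx = items.index[items['item_id'] == item_id][0]
    scores = pd.Series(similarity[idx], index=items['item_id']).drop(item_id)
    return scores.sort_values(ascending=False).head(top_n)

print(recommend_similar('A'))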
Hybrid recommendation system:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Sample user-item interaction data
user_item_data = {
    'user_id': [1, 1, 2, 2, 3, 3, 4, 4],
    'item_id': ['A', 'B', 'A', 'C', 'B', 'C', 'D', 'E']
}

df_user_item = pd.DataFrame(user_item_data)

# Sample item content data
item_content_data = {
    'item_id': ['A', 'B', 'C', 'D', 'E'],
    'description': ['Action movie', 'Drama movie', 'Comedy movie',
                    'Science fiction book', 'Mystery book']
}

df_item_content = pd.DataFrame(item_content_data)

# Collaborative Filtering: user-item interaction matrix (count of interactions per pair)
user_item_matrix = df_user_item.pivot_table(index='user_id', columns='item_id', aggfunc='size', fill_value=0)

# Content-Based Filtering: bag-of-words similarity between item descriptions
vectorizer = CountVectorizer()
item_description_matrix = vectorizer.fit_transform(df_item_content['description'])
cosine_similarities = cosine_similarity(item_description_matrix, item_description_matrix)

# Hybrid Recommendation
def hybrid_recommendation(user_id, item_id):
    # Collaborative Filtering: similarity of every user to the target user's interaction vector
    user_ratings = user_item_matrix.loc[user_id].values.reshape(1, -1)
    collaborative_similarity = cosine_similarity(user_item_matrix.values, user_ratings)

    # Content-Based Filtering: similarity of every item to the target item's description
    item_index = df_item_content[df_item_content['item_id'] == item_id].index[0]
    content_similarity = cosine_similarities[item_index]

    # Hybrid Score: weighted combination of the two signals
    hybrid_score = 0.7 * collaborative_similarity + 0.3 * content_similarity

    # Get recommended items, ranked by the combined score
    recommended_items = user_item_matrix.columns[np.argsort(hybrid_score[0])[::-1]]

    return recommended_items

# Example usage
user_id = 1
item_id = 'B'
recommendations = hybrid_recommendation(user_id, item_id)

print(f"Recommendations for user {user_id} based on item {item_id}: {recommendations}")
Hyperparameter tuning:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Load a sample dataset (Iris dataset in this case)
iris = load_iris()
X, y = iris.data, iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model and the hyperparameter grid
model = RandomForestClassifier()
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy')

# Fit the model with hyperparameter tuning
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Predict using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
