# Machine Learning (ML)
# Types of Machine Learning:
i) Supervised Learning
ii) Unsupervised Learning
iii) Reinforcement Learning

i) Supervised Learning
a) Regression Learning
b) Classification Learning
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
dataset = pd.read_csv("loan.csv")
# print the names of the object (categorical) columns
for i in dataset.select_dtypes(include="object").columns:
    print(i)
# fill missing values in every object (categorical) column with that column's mode;
# looping over only the object columns avoids errors on the numeric columns
for i in dataset.select_dtypes(include="object").columns:
    dataset[i].fillna(dataset[i].mode()[0],inplace=True)
# To fill the numerical values
dataset.isnull().sum()
dataset.info()
dataset.select_dtypes(include="float64").columns
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy="mean")
num_cols = dataset.select_dtypes(include="float64").columns
arr = si.fit_transform(dataset[num_cols])
pd.DataFrame(arr,columns=num_cols)
new_dataset = pd.DataFrame(arr,columns=num_cols)
new_dataset.isnull().sum()
new_dataset
dataset["LoanAmount"].mean()
# models need numerical input, which is why we convert categorical values to numbers
# ONE HOT Encoding
import pandas as pd
dataset.head()
dataset.isnull().sum()
dataset["Gender"].fillna(dataset["Gender"].mode()[0],inplace=True)
dataset["Married"].fillna(dataset["Married"].mode()[0],inplace=True)
# 1st method (One-Hot Encoding) - get_dummies
en_data = dataset[["Gender","Married"]]
pd.get_dummies(en_data)
pd.get_dummies(en_data).info()
# 2nd method - scikit-learn's OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe.fit_transform(en_data).toarray()
ar = ohe.fit_transform(en_data).toarray()
pd.DataFrame(ar,columns=["Gender_Female","Gender_Male","Married_No","Married_Yes"])
ohe1 = OneHotEncoder(drop="first")
ar1 = ohe1.fit_transform(en_data).toarray()
ar1
pd.DataFrame(ar1,columns=["Gender_Male","Married_Yes"])
#Label Encoding
import pandas as pd
df = pd.DataFrame({"name":["wscube","cow","cat","dog","black"]})
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit_transform(df["name"])
df["en_name"] = le.fit_transform(df["name"])
df
dataset = pd.read_csv("loan.csv")
dataset.head(3)
dataset["Property_Area].unique()
la = LabelEncoder()
la.fit(dataset["Property_Area"])
la.transform(dataset["Property_Area"])
dataset["Property_Area"] = la.transform(dataset["Property_Area"])
dataset["Property_Area"].unique()
#Ordinal Encoding
import pandas as pd
df = pd.DataFrame({"Size":["s","m","l","xl","s","m","l","s","s","l","xl","m"]})
df.head(3)
ord_data = [["s","m","l","xl"]] # nested list: OrdinalEncoder expects 2-D categories, one list per column
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories=ord_data)
oe.fit(df[["Size"]])
oe.transform(df[["Size"]])
df["Size_encoding"] = oe.transform(df[["Size"]])
df
--------
# for example (learning purposes): the same ordinal idea with a manual mapping
ord_data1 = {"s":5,"m":6,"l":7,"xl":8}
df["Size_encoding_map"] = df["Size"].map(ord_data1)
df
----------
# OUTLIERS
# how to detect outliers
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
dataset = pd.read_csv("loan.csv")
dataset.head(3)
dataset.info()
dataset.describe()
# box plot
plt.figure(figsize=(15,5))
sns.boxplot(x = "CoapplicantIncome", data=dataset)
plt.show()
sns.boxplot(x = "ApplicantIncome", data=dataset)
plt.show()
dataset.shape
q1 = dataset["CoapplicantIncome"].quantile(0.25)
q3 = dataset["CoapplicantIncome"].quantile(0.75)
q1
q3
IQR = q3-q1
min_range = q1 - (1.5*IQR)
max_range = q3 + (1.5*IQR)
min_range,max_range
dataset
dataset[dataset["CoapplicantIncome"]<=max_range ]
new_dataset = dataset[dataset["CoapplicantIncome"]<=max_range]
new_dataset.shape
sns.boxplot(x = "CoapplicantIncome", data=new_dataset)
plt.show()
# Z score
z_score = (dataset["CoapplicantIncome"] - dataset["CoapplicantIncome"].mean())/(dataset["CoapplicantIncome"].std())
z_score
z_score>3
data["z_score"] = z_score # puting data in orignal dataset
dataset
# removing outliers (use the absolute value so both tails are trimmed)
dataset[dataset["z_score"].abs()<3]
new_dataset.shape
dataset[dataset["z_score"].abs()<3].shape
# Feature Scaling (standardization)
Standardization is a very effective technique that re-scales a feature so that it has a distribution with mean 0 and variance 1.
x_new = (x - mean) / standard deviation
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
dataset = pd.read_csv("loan.csv")
dataset.head(3)
dataset.isnull().sum()
dataset["ApplicantIncome"].fillna(dataset["ApplicantIncome"].mean(),inplace=True)
sns.distplot(dataset["ApplicantIncome"])
plt.show()
dataset.describe()
# scaling through scikit-learn (this step is missing from the notes; a minimal completion)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
dataset["ApplicantIncome_ss"] = ss.fit_transform(dataset[["ApplicantIncome"]])
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
plt.title("Before")
sns.distplot(dataset["ApplicantIncome_ss"])
plt.subplot(1,2,2)
plt.title("After")
sns.distplot(dataset["ApplicantIncome"])
plt.show()
# Feature Scaling (normalization)
# min-max scaler (normalization technique)
Normalization is a scaling technique in which values are shifted and rescaled so that they end up ranging between 0 and 1. It is also known as Min-Max scaling.
x_new = (x - min) / (max - min)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
dataset = pd.read_csv("loan.csv")
dataset.head(3)
dataset.isnull().sum()
dataset.describe()
sns.distplot(dataset["CoapplicantIncome"])
plt.show()
from sklearn.preprocessing import MinMaxScaler
ms = MinMaxScaler()
ms.fit(dataset[["CoapplicantIncome"]])
ms.transform(dataset[["CoapplicantIncome"]])
dataset["CoapplicantIncome_min"] = ms.transform(dataset[["CoapplicantIncome"]])
dataset.head(3)
plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
plt.title("Before")
sns.distplot(dataset["CoapplicantIncome"])
plt.subplot(1,2,2)
plt.title("After")
sns.distplot(dataset["CoapplicantIncome_min"])
plt.show()
# Function Transformer
# with outliers removed (IQR filter applied first)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
dataset = pd.read_csv("loan.csv")
dataset.head(3)
dataset.isnull().sum()
sns.distplot(dataset["CoapplicantIncome"])
plt.show()
#IQR
q1 = dataset["CoapplicantIncome"].quantile(0.25)
q3 = dataset["CoapplicantIncome"].quantile(0.75)
iqr = q3 -q1
min_r = q1 - (1.5*iqr)
max_r = q3 + (1.5*iqr)
min_r,max_r
dataset[dataset["CoapplicantIncome"]<=max_r]
dataset = dataset[dataset["CoapplicantIncome"]<=max_r]
sns.distplot(dataset["CoapplicantIncome"])
plt.show()
plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
sns.distplot(dataset["CoapplicantIncome"])
plt.title("Before")
plt.subplot(1,2,2)
sns.distplot(dataset["CoapplicantIncome_tf"])
plt.title("After")
plt.show()
# Function Transformer
# with outliers kept (no IQR filter)
dataset = pd.read_csv("loan.csv") # reload so the outliers are present again
dataset.head(3)
dataset.isnull().sum()
sns.distplot(dataset["CoapplicantIncome"])
plt.show()
## IQR
q1 = dataset["CoapplicantIncome"].quantile(0.25)
q3 = dataset["CoapplicantIncome"].quantile(0.75)
iqr = q3 -q1
min_r = q1 - (1.5*iqr)
max_r = q3 + (1.5*iqr)
min_r,max_r
#dataset[dataset["CoapplicantIncome"]<=max_r]
#dataset = dataset[dataset["CoapplicantIncome"]<=max_r]
sns.distplot(dataset["CoapplicantIncome"])
plt.show()
## another method
ft1 = FunctionTransformer(func= lambda x : x**2)
ft1.fit(dataset[["CoapplicantIncome"]])
dataset["CoapplicantIncome_tf1"] = ft1.transform(dataset[["CoapplicantIncome"]])
plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
sns.distplot(dataset["CoapplicantIncome"])
plt.title("Before")
plt.subplot(1,2,2)
sns.distplot(dataset["CoapplicantIncome_tf1"])
plt.title("After")
plt.show()
# backward elimination: mlxtend's SequentialFeatureSelector with forward=False
# (lr, x and y come from the feature-selection setup, which is not shown here)
from mlxtend.feature_selection import SequentialFeatureSelector
fs = SequentialFeatureSelector(lr,k_features=5,forward=False)
fs.fit(x,y)
fs.k_feature_names_
fs.k_score_
## Train/Test Split of a Dataset ##
import pandas as pd
dataset = pd.read_csv("Boston.csv")
dataset.head(3)
dataset.shape
input_data = dataset.iloc[:,:-1]
output_data = dataset["House_Price"]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(input_data,output_data,test_size=0.25)
x_test
x_train
y_test
y_train
x_train.shape , y_train.shape
x_test.shape , y_test.shape
#practical
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
dataset = pd.read_csv("placement.csv")
#dataset = pd.read_csv(r"D:\data set\video\placement.csv")
dataset.head(3)
plt.figure(figsize=(5,3))
sns.scatterplot(x="cgpa",y="package",data=dataset)
plt.show()
dataset.isnull().sum()
dataset.ndim # scikit-learn expects 2-D input; a 1-D selection must be made 2-D (hence the double brackets below)
x = dataset[["cgpa"]]
y = dataset["package"]
y = m1*x1 + m2*x2 + m3*x3 + ... + mn*xn + c (multiple linear regression)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
dataset = pd.read_csv("regression_dataset.csv")
dataset.head(3)
dataset.shape
dataset.isnull().sum()
sns.pairplot(data=dataset)
plt.show()
sns.heatmap(data=dataset.corr(),annot=True)
plt.show()
x = dataset.iloc[:,:-1]
x
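The block stops at x; a hedged completion (the target column name is not shown, so the last column is assumed):
y = dataset.iloc[:,-1] # assumption: the target is the last column
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x,y)
lr.score(x,y)*100 # training R^2, as a quick sanity check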
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
dataset = pd.read_csv("multiple_regression_dataset.csv")
dataset.head()
dataset.isnull().sum()
sns.pairplot(data=dataset)
plt.show()
sns.heatmap(data=dataset.corr(),annot=True)
plt.show()
x = dataset.iloc[:,:-1]
y = dataset["Salary"]
x.ndim
dataset.shape
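A minimal sketch of fitting the multiple-regression model (split values assumed):
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
lr = LinearRegression()
lr.fit(x_train,y_train)
lr.score(x_test,y_test)*100
lr.coef_ , lr.intercept_ # one coefficient per input column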
## POLYNOMIAL REGRESSION
Polynomial Regression is a regression algorithm that models the relationship between a dependent variable (y) and an independent variable (x) as an nth-degree polynomial.
y = b0 + b1*x + b2*x^2 + ... + bn*x^n
import pandas as pd
import matplotlib.pyplot as plt
dataset = pd.read_csv("ploynomial.csv")
dataset.head(3)
dataset.corr()
plt.scatter(dataset["Level"],dataset["Salary"])
plt.xlabel("Level")
plt.ylabel("Salary")
plt.show()
x = dataset[["Level"]]
y = dataset[["Salary"]]
from sklearn.preprocessing import PolynomialFeatures
pf = PolynomialFeatures(degree=2)
pf.fit(x)
pf.transform(x)
x = pf.transform(x)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x,y)
prd = lr.predict(x)
plt.scatter(dataset["Level"],dataset["Salary"])
plt.plot(dataset["Level"],prd,c='red')
plt.xlabel("Level")
plt.ylabel("Salary")
plt.legend(["org","prd"])
plt.show()
test = pf.transform([[45]])
test # array([[1.000e+00, 4.500e+01, 2.025e+03]])
lr.predict(test) #([1057494.47922994])
# Cost Function
1 - A cost function is an important parameter that determines how well a machine learning model performs for a given dataset.
2 - The cost function measures how wrong the model is in estimating the relationship between X (input) and Y (output).
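For regression the usual cost is mean squared error (MSE); a tiny worked example with invented numbers:
import numpy as np
y_true = np.array([3.0,5.0,7.0])
y_pred = np.array([2.5,5.5,6.0])
mse = np.mean((y_true-y_pred)**2) # (0.25+0.25+1.0)/3 = 0.5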
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
dataset = pd.read_csv(r"houseprice.csv")
dataset.head(3)
plt.figure(figsize=(10,10))
sns.heatmap(data=dataset.corr(),annot=True)
plt.show()
x = dataset.iloc[:,:-1]
y = dataset["price"]
sc = StandardScaler()
sc.fit(x)
sc.transform(x)
x = pd.DataFrame(sc.transform(x),columns=x.columns)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
from sklearn.linear_model import LinearRegression , Lasso , Ridge
from sklearn.metrics import mean_absolute_error,mean_squared_error
import numpy as np
# LinearRegression
lr = LinearRegression()
lr.fit(x_train,y_train)
lr.score(x_test,y_test)*100 # 3.2286184...
lr.coef_
print(mean_squared_error(y_test,lr.predict(x_test))) #986919392751.0544
print(mean_absolute_error(y_test,lr.predict(x_test))) #210903.52141518658
print(np.sqrt(mean_squared_error(y_test,lr.predict(x_test)))) #993438.167552996
plt.figure(figsize=(15,5))
plt.bar(x.columns,lr.coef_)
plt.title("LinearRegression")
plt.xlabel("columns")
plt.ylabel("coef")
plt.show()
# Lasso(L1)
la =Lasso(alpha=0.5)
la.fit(x_train,y_train)
la.score(x_test,y_test)*100 # 3.228361
la.coef_
print(mean_squared_error(y_test,la.predict(x_test))) #986921772009.158
print(mean_absolute_error(y_test,la.predict(x_test))) #210908.17447564355
print(np.sqrt(mean_squared_error(y_test,la.predict(x_test)))) #993439.3650390335
plt.figure(figsize=(15,5))
plt.bar(x.columns,la.coef_)
plt.title("Lasso")
plt.xlabel("columns")
plt.ylabel("coef")
plt.show()
# Ridge(L2)
ri = Ridge(alpha = 10)
ri.fit(x_train,y_train)
ri.score(x_test,y_test)*100 #3.2401994171
ri.coef_
print(mean_squared_error(y_test,ri.predict(x_test))) #986801284919.7765
print(mean_absolute_error(y_test,ri.predict(x_test))) #210815.94787357954
print(np.sqrt(mean_squared_error(y_test,ri.predict(x_test)))) #993378.7217973699
plt.figure(figsize=(15,5))
plt.bar(x.columns,ri.coef_)
plt.title("Ridge")
plt.xlabel("columns")
plt.ylabel("coef")
plt.show()
df = pd.DataFrame({"col_name":x.columns,"LinearRegression":lr.coef_,"Lasso":la.coef_,"Ridge":ri.coef_})
## Classification
# Classification Algorithm
. The classification algorithm is used to identify the category of a new observation on the basis of training data.
. In classification, a program learns from the given dataset or observations and then classifies new observations into a number of classes or groups.
. Examples: Yes or No, 0 or 1, Spam or Not Spam, cat or dog, etc. The classes can be called targets/labels or categories.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
dataset = pd.read_csv(r"Social_Network_Ads.csv")
dataset.drop(columns=["EstimatedSalary"],inplace=True)
dataset.head(5)
plt.figure(figsize=(4,3))
sns.scatterplot(x="Age",y="Purchased",data=dataset)
plt.show()
x = dataset[["Age"]]
y = dataset["Purchased"]
plt.figure(figsize=(4,3))
sns.scatterplot(x="Age",y="Purchased",data=dataset)
sns.lineplot(x = "Age",y = lr.predict(x),data=dataset,color = 'red')
plt.show()
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
dataset = pd.read_csv(r"placement.csv")
dataset.head(3)
plt.figure(figsize=(5,4))
sns.scatterplot(x="cgpa",y="score",data=dataset,hue="placed")
plt.legend(loc=1)
plt.show()
x = dataset.iloc[:,:-1]
x.ndim
print(x)
y = dataset["placed"]
lr = LogisticRegression()
lr.fit(x_train,y_train)
lr.score(x_test,y_test)*100
lr.predict([[8.14,6.52]]) # array([1], dtype=int64)
lr.coef_
lr.intercept_
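Internally the binary model pushes m*x + c through the sigmoid; a quick check that reproduces predict_proba for the fitted lr above:
import numpy as np
z = x.to_numpy() @ lr.coef_.T + lr.intercept_
p = 1/(1+np.exp(-z)) # matches lr.predict_proba(x)[:,[1]]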
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
dataset = pd.read_csv(r"Polynomial_classification.csv")
dataset.head(5)
plt.figure(figsize=(5,4))
sns.scatterplot(x="data1",y="data2",data=dataset,hue="output")
plt.show()
x = dataset.iloc[:,:-1]
y = dataset["output"]
# multiclass example: a dataset with a "species" target (iris-style data; the filename is assumed)
dataset = pd.read_csv("iris.csv")
x = dataset.iloc[:,:-1]
y = dataset["species"]
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
## OVR (one-vs-rest)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(multi_class="ovr")
lr.fit(x_train,y_train)
lr.score(x_test,y_test)*100 #96.66666
# multinomial
lr1 = LogisticRegression(multi_class="multinomial")
lr1.fit(x_train,y_train)
lr1.score(x_test,y_test)*100
lr2 = LogisticRegression()
lr2.fit(x_train,y_train)
lr2.score(x_test,y_test)*100 #100.0
. Accuracy = (TP+TN)/N
. Error = (FN+FP)/N
. False Negative: the model predicted No, but the actual value was Yes; also called a Type-II error.
. False Positive: the model predicted Yes, but the actual value was No; also called a Type-I error.
Precision = TP/(TP+FP) , Recall = TP/(TP+FN)
F1 Score = 2*Precision*Recall/(Precision+Recall)
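These counts come straight from the confusion matrix; a minimal sketch with toy labels:
from sklearn.metrics import confusion_matrix
y_true = [1,0,1,1,0,0]
y_pred = [1,0,0,1,0,1]
tn, fp, fn, tp = confusion_matrix(y_true,y_pred).ravel()
# tn=2, fp=1, fn=1, tp=2 -> accuracy = (2+2)/6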
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
dataset = pd.read_csv(r"placement.csv")
dataset.head(5)
x = dataset.iloc[:,:-1]
x
y = dataset["placed"]
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42) # split values assumed
lr = LogisticRegression()
lr.fit(x_train,y_train)
lr.score(x_test,y_test)*100 #100.0
precision_score(y_test,lr.predict(x_test))*100 #100.0
recall_score(y_test,lr.predict(x_test))*100 #100.0
f1_score(y_test,lr.predict(x_test))*100 #100
import pandas as pd
dataset = pd.read_csv("Social_Network_Ads.csv")
dataset.head(5)
dataset["Purchased"].value_counts()
x = dataset.iloc[:,:-1]
y = dataset["Purchased"]
=> imblearn
import pandas as pd
dataset = pd.read_csv("Social_Network_Ads.csv")
dataset.head(5)
x = dataset.iloc[:,:-1]
y = dataset["Purchased"]
dataset["Purchased"].value_counts()
## NAIVE BAYES
-> Naive: it is called naive because it assumes that the occurrence of a certain feature is independent of the occurrence of other features.
-> Bayes: it is called Bayes because it depends on the principle of Bayes' Theorem.
-> Bayes' Theorem: also known as Bayes' Rule or Bayes' law, it is used to determine the probability of a hypothesis given prior knowledge. It depends on conditional probability.
P(A|B) = (P(B|A) * P(A)) / P(B)
where:
- P(A|B) is the Posterior probability: the probability of hypothesis A given the observed event B.
- P(B|A) is the Likelihood: the probability of the evidence given that hypothesis A is true.
- P(A) is the Prior probability: the probability of the hypothesis before observing the evidence.
- P(B) is the Marginal probability: the probability of the evidence.
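A quick worked example with invented numbers: if P(spam) = 0.3, P("free"|spam) = 0.6 and P("free") = 0.25, then P(spam|"free") = (0.6*0.3)/0.25 = 0.72.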
-> Practical:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_decision_regions
dataset = pd.read_csv(r"placement.csv")
dataset.head(5)
sns.kdeplot(data=dataset["cgpa"])
plt.show()
sns.kdeplot(data=dataset["score"])
plt.show()
plt.figure(figsize=(4,3))
sns.scatterplot(x="cgpa",y="score",data=dataset,hue="placed")
plt.show()
x = dataset.iloc[:,:-1]
y = dataset["placed"]
mnb = MultinomialNB()
mnb.fit(x_train,y_train)
mnb.score(x_test,y_test)*100 #75.0
mnb.score(x_train,y_train)*100 #73.75
plt.figure(figsize=(4,3))
plot_decision_regions(x.to_numpy(),y.to_numpy(),clf=mnb)
plt.show()
bnb = BernoulliNB()
bnb.fit(x_train,y_train)
bnb.score(x_test,y_test)*100 #50.0
bnb.score(x_train,y_train)*100 #50.0
plt.figure(figsize=(4,3))
plot_decision_regions(x.to_numpy(),y.to_numpy(),clf=bnb)
plt.show()
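cgpa and score are continuous features, so GaussianNB is usually a more natural fit than the count-oriented MultinomialNB; a hedged comparison on the same split:
gnb = GaussianNB()
gnb.fit(x_train,y_train)
gnb.score(x_test,y_test)*100
plt.figure(figsize=(4,3))
plot_decision_regions(x.to_numpy(),y.to_numpy(),clf=gnb)
plt.show()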
. In order to build a tree, we use the CART algorithm, which stands for Classification And Regression Tree.
. Information Gain
. Entropy / Gini Index
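For reference, a tiny sketch of the two impurity measures these criteria are built on (toy labels; the function names are illustrative):
import numpy as np
def gini(labels):
    # Gini index = 1 - sum(p_i^2)
    _, counts = np.unique(labels, return_counts=True)
    p = counts/counts.sum()
    return 1 - np.sum(p**2)
def entropy(labels):
    # Entropy = -sum(p_i * log2(p_i))
    _, counts = np.unique(labels, return_counts=True)
    p = counts/counts.sum()
    return -np.sum(p*np.log2(p))
gini([0,0,1,1])    # 0.5 (maximum impurity for two classes)
entropy([0,0,1,1]) # 1.0 bit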