ML Practical 04
ML Practical 04
Practical 04
Importing Libraries
In [1]:
import numpy as np
import pandas as pd
sns.set()
import warnings
yl
In [2]:
diabetes_data = pd.read_csv('diabetes.csv')
diabetes_data
Out[2]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI Pedigree Age Outcome
0 6 148 72 35 0 336 0627 50 1
1 1 85 66 29 0 266 0351 3 0
2 8 183 64 0 0 233 0872 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2288 33 1
763 10 101 76 48 180 32.9 0171 863 0
764 2 122 70 27 0 36.38 0.340 27 0
765 5 121 72 23 112 26.2 0.245 30 0
768 1 126 60 [1] 0 301 0349 47 1
767 1 a3 70 3 0 304 0315 23 0
768 rows x 9 columns
In [3]:
#Print the first 5 rows of the daotaframe.
diabetes_data.head()
out[3]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI Pedigree Age Outcomes
0 6 148 72 35 0 336 0.627 50 1
1 1 85 66 29 0 268 0.351 31 0
2 8 183 64 o 0 233 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2288 33 1
1M
10/18/23, 2:51 PM
In [4]:
diabetes_data.info({verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, @ to 767
Data columns (total 9 columns):
# Column
Pregnancies
Glucose
BMI
Pedigree
Age
Outcome
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [5]:
}
1
2
3
4 Insulin
5
6
7
8
Non-Null Count
768 non-null
768 non-null
768 non-null
768 non-null
768 non-null
768 non-null
768 non-null
diabetes_data.describe()
float64
float64
inte4
inte4
Out[5]:
Pregnancles Glucose BloodPressure SkinThickness Insulin BMI Pedigree Age Outcome
count 768.000000 768.000000 766.000000 768.000000 768.000000 768.000000 768.000000
768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.7990479 31.992578 0.471876
33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.834160 0.331329 11.760232
0476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000
0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000
0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 28.000000
0.000000
75% 6.000000 140.250000 80.000000 32,000000 127.250000 36.600000 0.626250 41.000000
1.000000
max 17.000000 199.000000 122.000000 99.000000 B846.000000 67.100000 2.420000
81.000000 1.000000
In [6]:
diabetes_data.describe().T
Out[6]:
count mean std min 25% 50% 75% max
Pregnancies 768.0 3.845052 3.369578 0.000 1.00000 3.0000 6.00000 17.00
Glucose 768.0 120.894531 31.972618 0.000 99.00000 117.0000 140.25000 199.00
BloodPressure 768.0 69.105469 19.355807 0.000 62.00000 72.0000 80.00000 122.00
SkinThickness 768.0 20.536458 15952218 0.000 0.00000 23.0000 32.00000 99.00
Insulin 768.0 79.799479 116.244002 0.000 0.00000 30.5000 127.25000 846.00
BMI 768.0 31.992578 7.884160 0.000 27.30000 32.0000 36.60000 67.10
Pedigree 768.0 0.471876 0.331320 0.078 0.24375 0.3725 0.62625 242
Age 768.0 33.240885 11760232 21.000 24.00000 29.0000 41.00000 81.00
Outcome 768.0 0.348958 0.476951 0.000 0.00000 0.0000 1.00000 1.00
In [7]:
Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
Pedigree
Age
Outcome
dtype: int64
227
374
localhost:8888/notebooks/ML_Practical 04.ipynb
2111
10/18/23, 2:51 PM ML _Practical_04 - Jupyter Notebook
In [8]: |
p = diabetes_data.hist(figsize = (2e,20))
200
150
150
125 150
100
100
75 100
] : 1 ]
, i 1. . = _N J . | __ mm -——
00 25 50 [1] 25 50 75 100 125 150 20 40 60 80 1
8
8
| >
localhost:8888/notebooks/ML_Practical 04.ipynb 3M
10/18/23, 2:51 PM
In [10]:
p = diabetes_data_copy.hist(figsize = (20,20))
Pregnancies
,
. bs
00 25 50
SkinThickness
20 40
Pedigree
200
150
I
, CE =
00 05 1.0 15 20
In [11]:
25
diabetes_data.shape
out[11]:
(768, 9)
localhost:8888/notebooks/ML_Practical 04.ipynb
BS]
300
0
|
40 60
80
100
Glucose
120
Insulin
140
160
180 200
8g
BloodPressure
I's
20 40 80 80 100 120
0.0
20 30
02
40
Outcome
04
06
08
4/11
10/18/23, 2:51 PM ML _Practical_04 - Jupyter Notebook
In [12]:
sns.countplot(y=diabetes_data.dtypes ,data=diabetes_data)
plt.xlabel( "count of each data type")
plt.ylabel(“"data types")
plt.show()
int64
data types
float64
0 1 2 3 4 5 6 7
In [13]:
2: "#7bce43"}
[] 500
1 268
Name: Outcome, dtype: int64
400
100
localhost:8888/notebooks/ML_Practical 04.ipynb
5111
10/18/23, 2:51 PM ML _Practical_04 - Jupyter Notebook
In [14]:
plt.figure(figsize=(12,10))
Pregnancies ; ; 0.018
BloodPressure . i . 0.041
0.075
SkinThickness { 0.057
Insulin
BMI 0.018
0.034
@
4
Pedigree
Age
Outcome
Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
Pedigree
Qutcome
In [15]:
plt.figure(figsize=(12,10))
# on this line I just set the size of figure to 12 by 16.
p=sns.heatmap(diabetes_data_copy.corr(), annot=True,cmap ='RdYlGn'}
Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
Pedigree
Age
Outcome
0.082
0.082
0.045
0.034
localhost:8888/notebooks/ML_Practical 04.ipynb
Qutcome
10
-06
-04
7M
10/18/23, 2:51 PM ML _Practical_04 - Jupyter Notebook
In [18]: |
In [19]: |
In [20]: |
In [21]: |
## score that comes from testing on the same datapoints that were used for training
max_train_score = max(train_scores)
In [22]: |
## score that comes from testing on the datapoints that were split in the beginning
to be used for testing solely
max_test_score = max(test_scores)
Result Visualisation
In [23]: |
plt.figure(figsize=(12,5))
p = sns.lineplot(data=(range(1,15),train_scores),marker="'*',label="'Train Score')
p = sns.lineplot(data=(range(1,15),test_scores),marker="0"',label="Test Score’)
14 —.— Train Score ®
— - = Train Score |
y===) be
12
L )
—e— Test Score
10
The best result is captured at k = 11 hence 11 is used for the final mode
In [24]:
knn = KNeighborsClassifier(11)
knn.fit(X_train,y_train)
knn.score(X_test,y_test)
out[24]:
0.765625
In [25]:
print(train_scores)
[8.796875]
Model Performance Analysis
confusion Matrix
In [26]:
#import confusion_matrix
#let us get the predictions using the classifier we had fit above
y_pred = knn.predict(X_test)
confusion _matrix(y_test,y_pred)
out[26]:
Predicted c 1 Al
True
0 142 25 167
1 35 54 89
localhost:8888/notebooks/ML_Practical 04.ipynb
9/11
10/18/23, 2:51 PM ML _Practical_04 - Jupyter Notebook
In [27]:
y_pred = knn.predict(X_test)
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
out[27]:
Text(.5, 20.049999999999997, 'Predicted label')
Confusion matrix
o 25
@
a
=
@©
i=)
<
a 35 54
[4] 1
Predicted label
In [28]:
#import classification report
In [29]:
localhost:8888/notebooks/ML_Practical 04.ipynb
140
120
100
10/11
10/18/23, 2:51 PM ML _Practical_04 - Jupyter Notebook
In [30]:
plt.plot([@,1],[0,1], 'k--"}
plt.plot(fpr,tpr, label="Knn')
plt.xlabel(’fpr'}
plt.ylabel('tpr'})
plt.title('Knn{n_neighbors=11) ROC curve")
plt.show()
08
06
tpr
0.4
02
0.0
In [31]:
Out[31]:
©.8193500639171096
In [32]:
#import GridSearchCV
from sklearn.model_ selection import GridSearchcv
knn = KNeighborsClassifier()
knn_cv= GridSearchCv(knn,param_grid,cv=5)
knn_cv.fit(X,y)
print(“"Best Score:
print("Best Parameters:
+ str(knn_cv.best_score_))
" + str(knn_cv.best_params_))
Best Score:9.7721848251252015
Best Parameters: {'n_neighbors': 25}
localhost:8888/notebooks/ML_Practical 04.ipynb
11/11