Kakauikkla
Kakauikkla
b) Visualization
c) Preprocessing
d) Apply Model
e) Evaluate
f) Tune Hyperparameters
## EDA
df.head()
{"type":"dataframe","variable_name":"df"}
df.tail()
{"type":"dataframe"}
df.shape
(395, 33)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 school 395 non-null object
1 gender 394 non-null object
2 age 392 non-null float64
3 address 392 non-null object
4 famsize 394 non-null object
5 Parrent_status 395 non-null object
6 Mother_edu 394 non-null float64
7 Father_edu 394 non-null float64
8 Mother_job 394 non-null object
9 Father_job 392 non-null object
10 reason_to_chose_school 392 non-null object
11 guardian 393 non-null object
12 traveltime 393 non-null float64
13 weekly_studytime 394 non-null float64
14 failures 393 non-null float64
15 extra_edu_supp 394 non-null object
16 family_edu_supp 395 non-null object
17 extra_paid_class 394 non-null object
18 extra_curr_activities 393 non-null object
19 nursery 394 non-null object
20 Interested_in_higher_edu 394 non-null object
21 internet_access 394 non-null object
22 romantic_relationship 394 non-null object
23 Family_quality_reln 394 non-null float64
24 freetime_after_school 395 non-null int64
25 goout_with_friends 395 non-null int64
26 workday_alcohol_consum 395 non-null int64
27 weekend_alcohol_consum 395 non-null int64
28 health_status 395 non-null int64
29 absences 395 non-null int64
30 G1 395 non-null int64
31 G2 395 non-null int64
32 G3 395 non-null int64
dtypes: float64(7), int64(9), object(17)
memory usage: 102.0+ KB
df.describe()
df.columns
df.describe(include='all')
{"type":"dataframe"}
Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.hist(df['G1'])
(array([ 2., 31., 37., 72., 51., 74., 63., 24., 30., 11.]),
array([ 3. , 4.6, 6.2, 7.8, 9.4, 11. , 12.6, 14.2, 15.8, 17.4,
19. ]),
<BarContainer object of 10 artists>)
plt.hist(df['G2'])
(array([13., 0., 16., 35., 82., 81., 78., 57., 18., 15.]),
array([ 0. , 1.9, 3.8, 5.7, 7.6, 9.5, 11.4, 13.3, 15.2, 17.1,
19. ]),
<BarContainer object of 10 artists>)
plt.hist(df['G3'])
(array([ 38., 0., 8., 24., 60., 103., 62., 60., 22., 18.]),
array([ 0., 2., 4., 6., 8., 10., 12., 14., 16., 18., 20.]),
<BarContainer object of 10 artists>)
plt.hist(df['absences'])
(array([287., 72., 25., 5., 1., 2., 0., 2., 0., 1.]),
array([ 0. , 7.5, 15. , 22.5, 30. , 37.5, 45. , 52.5, 60. , 67.5,
75. ]),
<BarContainer object of 10 artists>)
plt.hist(df['failures'])
(array([310., 0., 0., 50., 0., 0., 17., 0., 0., 16.]),
array([0. , 0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3. ]),
<BarContainer object of 10 artists>)
sns.scatterplot(x=df['G1'], y=df['G2'])
<matplotlib.legend.Legend at 0x79fe811263f0>
Preprocessing
sns.boxplot(x=df['G1'])
<Axes: xlabel='G1'>
sns.boxplot(x=df['G2'])
<Axes: xlabel='G2'>
# Flag every numeric column that contains at least one value outside the
# 1.5 * IQR fences.
# NOTE(review): the loop header and the Q1/Q3/IQR/outliers assignments were
# missing from this export; they are reconstructed here from the surviving
# bound formulas as the standard IQR rule -- confirm against the notebook.
numerical_cols = df.select_dtypes(include='number').columns
features_with_outliers = []
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Rows whose value in this column lies strictly outside the fences.
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    if not outliers.empty:
        features_with_outliers.append(col)
weekly_studytime failures
Family_quality_reln \
age 0.014124 NaN
0.055209
Mother_edu 0.068783 NaN
0.022801
Father_edu 0.009156 NaN
0.013962
traveltime -0.115008 NaN -
0.017217
weekly_studytime 1.000000 NaN
0.063992
failures NaN NaN
NaN
Family_quality_reln 0.063992 NaN
1.000000
freetime_after_school -0.141654 NaN
0.136637
goout_with_friends -0.066011 NaN
0.058834
workday_alcohol_consum -0.219130 NaN -
0.079564
weekend_alcohol_consum -0.260562 NaN -
0.122639
health_status -0.071126 NaN
0.077752
absences -0.080753 NaN -
0.080903
G1 0.163235 NaN
0.027758
G2 0.134537 NaN
0.007214
G3 0.099217 NaN
0.058057
freetime_after_school goout_with_friends \
age 0.014538 0.119841
Mother_edu 0.025119 0.061921
Father_edu -0.018703 0.041183
traveltime -0.026569 0.021811
weekly_studytime -0.141654 -0.066011
failures NaN NaN
Family_quality_reln 0.136637 0.058834
freetime_after_school 1.000000 0.281769
goout_with_friends 0.281769 1.000000
workday_alcohol_consum 0.205032 0.266818
weekend_alcohol_consum 0.146665 0.420386
health_status 0.075318 -0.009577
absences 0.007181 0.105672
G1 0.007524 -0.149104
G2 -0.011653 -0.157180
G3 0.008719 -0.132791
workday_alcohol_consum weekend_alcohol_consum
\
age 0.124073 0.110046
G1 -0.101402 -0.126179
G2 -0.087085 -0.102462
G3 -0.066432 -0.051939
health_status absences G1 G2
G3
age -0.069379 0.186068 -0.060287 -0.153019 -
0.156116
Mother_edu -0.043770 0.116780 0.206500 0.228634
0.217775
Father_edu 0.013469 0.018008 0.192346 0.179830
0.154668
traveltime 0.002497 -0.023451 -0.086274 -0.138590 -
0.114356
weekly_studytime -0.071126 -0.080753 0.163235 0.134537
0.099217
failures NaN NaN NaN NaN
NaN
Family_quality_reln 0.077752 -0.080903 0.027758 0.007214
0.058057
freetime_after_school 0.075318 0.007181 0.007524 -0.011653
0.008719
goout_with_friends -0.009577 0.105672 -0.149104 -0.157180 -
0.132791
workday_alcohol_consum 0.080359 0.146541 -0.101402 -0.087085 -
0.066432
weekend_alcohol_consum 0.092476 0.193614 -0.126179 -0.102462 -
0.051939
health_status 1.000000 -0.052585 -0.073172 -0.089461 -
0.061335
absences -0.052585 1.000000 -0.020177 -0.050567
0.068030
G1 -0.073172 -0.020177 1.000000 0.884067
0.801468
G2 -0.089461 -0.050567 0.884067 1.000000
0.905780
G3 -0.061335 0.068030 0.801468 0.905780
1.000000
<Axes: >
Missing Values
df.isna().sum()
school 0
gender 1
age 3
address 3
famsize 1
Parrent_status 0
Mother_edu 1
Father_edu 1
Mother_job 1
Father_job 3
reason_to_chose_school 3
guardian 2
traveltime 2
weekly_studytime 1
failures 2
extra_edu_supp 1
family_edu_supp 0
extra_paid_class 1
extra_curr_activities 2
nursery 1
Interested_in_higher_edu 1
internet_access 1
romantic_relationship 1
Family_quality_reln 1
freetime_after_school 0
goout_with_friends 0
workday_alcohol_consum 0
weekend_alcohol_consum 0
health_status 0
absences 0
G1 0
G2 0
G3 0
dtype: int64
# Impute missing values in place on `df`:
#   * object (categorical) columns -> most frequent value (mode)
#   * numeric columns              -> median
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    modes = df[col].mode()
    # mode() is empty when the whole column is NaN; leave such a column
    # untouched instead of failing on the [0] lookup.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# 'number' matches every numeric dtype (not only int64/float64), the same
# selector used for the outlier scan earlier in this notebook.
numeric_cols = df.select_dtypes(include='number').columns
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())
df.isna().sum()
school 0
gender 0
age 0
address 0
famsize 0
Parrent_status 0
Mother_edu 0
Father_edu 0
Mother_job 0
Father_job 0
reason_to_chose_school 0
guardian 0
traveltime 0
weekly_studytime 0
failures 0
extra_edu_supp 0
family_edu_supp 0
extra_paid_class 0
extra_curr_activities 0
nursery 0
Interested_in_higher_edu 0
internet_access 0
romantic_relationship 0
Family_quality_reln 0
freetime_after_school 0
goout_with_friends 0
workday_alcohol_consum 0
weekend_alcohol_consum 0
health_status 0
absences 0
G1 0
G2 0
G3 0
dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 school 395 non-null object
1 gender 395 non-null object
2 age 395 non-null float64
3 address 395 non-null object
4 famsize 395 non-null object
5 Parrent_status 395 non-null object
6 Mother_edu 395 non-null float64
7 Father_edu 395 non-null float64
8 Mother_job 395 non-null object
9 Father_job 395 non-null object
10 reason_to_chose_school 395 non-null object
11 guardian 395 non-null object
12 traveltime 395 non-null float64
13 weekly_studytime 395 non-null float64
14 failures 395 non-null float64
15 extra_edu_supp 395 non-null object
16 family_edu_supp 395 non-null object
17 extra_paid_class 395 non-null object
18 extra_curr_activities 395 non-null object
19 nursery 395 non-null object
20 Interested_in_higher_edu 395 non-null object
21 internet_access 395 non-null object
22 romantic_relationship 395 non-null object
23 Family_quality_reln 395 non-null float64
24 freetime_after_school 395 non-null float64
25 goout_with_friends 395 non-null int64
26 workday_alcohol_consum 395 non-null float64
27 weekend_alcohol_consum 395 non-null int64
28 health_status 395 non-null int64
29 absences 395 non-null int64
30 G1 395 non-null int64
31 G2 395 non-null int64
32 G3 395 non-null int64
dtypes: float64(9), int64(7), object(17)
memory usage: 102.0+ KB
df.school.value_counts()
school
GP 349
MS 46
Name: count, dtype: int64
Column: school
school
GP 349
MS 46
Name: count, dtype: int64
Column: gender
gender
F 209
M 186
Name: count, dtype: int64
Column: address
address
U 307
R 88
Name: count, dtype: int64
Column: famsize
famsize
GT3 282
LE3 113
Name: count, dtype: int64
Column: Parrent_status
Parrent_status
T 354
A 41
Name: count, dtype: int64
Column: Mother_job
Mother_job
other 142
services 102
at_home 59
teacher 58
health 34
Name: count, dtype: int64
Column: Father_job
Father_job
other 218
services 110
teacher 29
at_home 20
health 18
Name: count, dtype: int64
Column: reason_to_chose_school
reason_to_chose_school
course 148
home 108
reputation 104
other 35
Name: count, dtype: int64
Column: guardian
guardian
mother 274
father 89
other 32
Name: count, dtype: int64
Column: extra_edu_supp
extra_edu_supp
no 345
yes 50
Name: count, dtype: int64
Column: family_edu_supp
family_edu_supp
yes 242
no 153
Name: count, dtype: int64
Column: extra_paid_class
extra_paid_class
no 214
yes 181
Name: count, dtype: int64
Column: extra_curr_activities
extra_curr_activities
yes 203
no 192
Name: count, dtype: int64
Column: nursery
nursery
yes 314
no 81
Name: count, dtype: int64
Column: Interested_in_higher_edu
Interested_in_higher_edu
yes 375
no 20
Name: count, dtype: int64
Column: internet_access
internet_access
yes 329
no 66
Name: count, dtype: int64
Column: romantic_relationship
romantic_relationship
no 264
yes 131
Name: count, dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 school 395 non-null int64
1 gender 395 non-null int64
2 age 395 non-null float64
3 address 395 non-null int64
4 famsize 395 non-null int64
5 Parrent_status 395 non-null int64
6 Mother_edu 395 non-null float64
7 Father_edu 395 non-null float64
8 Mother_job 395 non-null int64
9 Father_job 395 non-null int64
10 reason_to_chose_school 395 non-null int64
11 guardian 395 non-null int64
12 traveltime 395 non-null float64
13 weekly_studytime 395 non-null float64
14 failures 395 non-null float64
15 extra_edu_supp 395 non-null int64
16 family_edu_supp 395 non-null int64
17 extra_paid_class 395 non-null int64
18 extra_curr_activities 395 non-null int64
19 nursery 395 non-null int64
20 Interested_in_higher_edu 395 non-null int64
21 internet_access 395 non-null int64
22 romantic_relationship 395 non-null int64
23 Family_quality_reln 395 non-null float64
24 freetime_after_school 395 non-null float64
25 goout_with_friends 395 non-null int64
26 workday_alcohol_consum 395 non-null float64
27 weekend_alcohol_consum 395 non-null int64
28 health_status 395 non-null int64
29 absences 395 non-null int64
30 G1 395 non-null int64
31 G2 395 non-null int64
32 G3 395 non-null int64
dtypes: float64(9), int64(24)
memory usage: 102.0 KB
y_pred
lin.intercept_
np.float64(-3.079114806474344)
X_train.info()
<class 'pandas.core.frame.DataFrame'>
Index: 316 entries, 181 to 102
Data columns (total 32 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 school 316 non-null int64
1 gender 316 non-null int64
2 age 316 non-null float64
3 address 316 non-null int64
4 famsize 316 non-null int64
5 Parrent_status 316 non-null int64
6 Mother_edu 316 non-null float64
7 Father_edu 316 non-null float64
8 Mother_job 316 non-null int64
9 Father_job 316 non-null int64
10 reason_to_chose_school 316 non-null int64
11 guardian 316 non-null int64
12 traveltime 316 non-null float64
13 weekly_studytime 316 non-null float64
14 failures 316 non-null float64
15 extra_edu_supp 316 non-null int64
16 family_edu_supp 316 non-null int64
17 extra_paid_class 316 non-null int64
18 extra_curr_activities 316 non-null int64
19 nursery 316 non-null int64
20 Interested_in_higher_edu 316 non-null int64
21 internet_access 316 non-null int64
22 romantic_relationship 316 non-null int64
23 Family_quality_reln 316 non-null float64
24 freetime_after_school 316 non-null float64
25 goout_with_friends 316 non-null int64
26 workday_alcohol_consum 316 non-null float64
27 weekend_alcohol_consum 316 non-null int64
28 health_status 316 non-null int64
29 absences 316 non-null int64
30 G1 316 non-null int64
31 G2 316 non-null int64
dtypes: float64(9), int64(23)
memory usage: 81.5 KB
# Ridge regression estimator with default settings, plus the candidate
# regularisation strengths to search over.
ridge = Ridge()
params = {'alpha': [0.01, 0.1, 1, 10, 100]}

# NOTE(review): `grid` is not constructed anywhere in the visible export;
# presumably a search object built from `ridge` and `params` and fitted on
# the training split -- confirm against the original notebook.
ridge_pred = grid.predict(X_test)
score
0.7489279687734536
import pickle

# Serialise the `grid` object to disk in binary mode so it can be reloaded
# later without refitting.
model_pkl_file = "ML-Assignment-1.pkl"
with open(model_pkl_file, mode='wb') as file:
    pickle.dump(grid, file)