In [1]: import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
sns.set_theme(color_codes=True)
In [2]: train = pd.read_csv(‘train_ctrUa4k.csv')
train
Out[2]:
Lean 10. Gender Maried Dependents Education Self Employed _Appicantincome Coappicantr
0 \Pootoea Wale —~No 0 Gradate No seas
4+ LPoot0es Malo Yee + Graduate No 03
2 LPootoos Male Yes 0 Grate ves 2000
3 UP00%006 Male Yes © raat no ase :
4 \Po01008 Male No 0 rade No coo
so Lpo0as7e Malo Yes 3 Graduate no 06
11 LP002588 Male Yes + raduate no sore
612 Lpooases Male Yes 2 Graduate No 70
618 Lp002380 Female No 0 Graduate ves 4589
614 rows x 13 columnsIn [3]: test = pd.read_csv(‘test_1AUusdG. csv")
test
ouet3}:
Loan Gender Marie Dependents Educaon Sell Employed Appcanncome Coapplcante
2 POI Male Yoo 0 Gee Wo En
1 tpooi0ce Mae Yes 1 Grodute We sors
2 Loos! ale Yoo 2 Geodane Wo sooo
3 Lpoo%03s Mae Yes 2 Grodute We 10
4 UPoO10s1 eee © Gai We sere
362 LP002971 Male Yes * Gracie Yes 4008
36s 1po02s7s Male Yes 0° Greate Wo 88
365 LPotzi86 ale Yes 0 Greate Wo sooo
367 rows x 12 columns
Data Preprocessing Part 1
In [4]: train.drop(columns=[*Loan_Ib"], inplace=True)
train
outta):
Gander Maried Dependents Education Sell Employed Appantinome Coapplentncome Lot
ote Wo 0 Graal Wo oe 00
2 tle Yes 0 Graduate ves 2000 00
3 Mle ves © oyu We 2503 2ss00
4 tte Ne 0 Graduate We sooo 00
608 Female No 0 Graduate We 2200 oo
10 Moe Yes ratte Wo 4108 00
it tae es 1 Groute We sore oa
2 ae Yes 2 Gada Wo 7583 00
613 Female No 0 Grade ves 458s oo
614 rows x 12 columnsIn [5]:
out [5]:
In [6]:
In [7]:
out [7]:
check missing = train.isnull().sum() * 160 / train.shape[0]
check_missing[check missing > 0].sort_values(ascending-False)
Credit History 8.143322
Self_Enployed 5.211726
LoanAmount: 3.583062
Dependents 2.442997
Loan_Anount_Term 2.280130
Gender 2.117264
Narried 0.488599
dtype: floates
train{ ‘Credit History’ ].fillna(@, inplace=True)
train['Self_fmployed'].fillna('No', inplac
train{ ‘Loananount'].fillna(@, inplace=True)
train{ ‘Dependents'].#illna( ‘Other’, inplac
train{ 'Loan_Anount_Tern' ].fillna(@, inplace=True)
train[ 'Gender"].#illna( ‘Other’, inplace=True)
train[ ‘Married’ ].fillna('Other’, inplace=True)
check_missing = train.isnull().sum() * 100 / train.shape[@]
check_missing[check_missing > @].sort_values(ascending=False)
Series({], dtype: floate4)
Exploratory Data AnalysisIn [8]: sns.countplot(data=train, x="Loan_Status", hue="Property Area")
#People with Semiurban has high acceptable chance of Loan Status
Out[8}:
Married
mmm No
250 mm Yes
mE Other
200
$ is
8
100
: | |
0 ——
Y N
Loan_StatusIn [18]: sns.countplot(data-train, x="Loan_status", hue="Gender")
Out[18]:
0 10900 20000 30000 40000 s0000 6000 70000 80000
ApplicantincomeIn [37]: sns.boxplot (x=train[ "CoapplicantIncome" ])
out[37]:
0 70000 20000 30000 40000
CoapplicantincomeIn [38]: sns.boxplot (x=train["LoanAmount” ])
out[38]:
mee m Fe 6 o
0 10 8620000 400 «ssi
LoanAmount
In [39]: import scipy.stats as stats
z = np.abs(stats.zscore(train))
data_clean = train{(z<3).all(axis = 1)]
data_clean. shape
out(39]: (572, 22)
Balanced Class DataIn [40]:
out [49]:
In [41]:
sns. countplot (data_clean[‘Loan_status" ])
data_clean['Loan_status'].value_counts()
D: \anaconda3\1ib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the #
ollowing variable as a keyword arg: x. From version @.12, the only valid positional
argument will be “data’, and passing other arguments without an explicit keyword wi
Ll result in an error or misinterpretation.
warnings .warn(
1 398
e174
Name: Loan_status, dtype: int64
400
Loan_Status
from sklearn.utils import resample
create two different datafrane of majority and minority class
df_majority = data_clean[(data_clean[ ‘Loan_status' ]
df_minority = data_clean[(data_clean[ ‘Loan_status’ }
# upsanple minority class
df_minority_upsampled = resample(df_minority,
replace=True, # sample with replacement
n_samples= 398, # to match majority class
random_state=0) # reproducible results
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([d#_minority_upsampled, df_majority])In [42]:
out [42]:
sns. countplot (df_upsanpled[ ‘Loan_status" ])
df_upsampled['Loan_status' ].value_counts()
D:\anaconda3\1Lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the
ollowing variable as a keyword arg: x. From version @.12, the only valid positional
argument will be “data’, and passing other arguments without an explicit keyword wi
LU result in an error or misinterpretation.
warnings.warn(
e398
1 398
Name: Loan_status, dtype: int64
400
‘count
8
8
Loan_Status
Data CorrelationIn [43]: sns.heatmap(df_upsampled.corr(), fn
2e')
out[43]:
-10
Gender
Married
os
Dependents
Education
06
Self_ Employed
Applicantincome
S 04
Coapplicantincome
LoanAmount
Loan_Amount_Term oe
Credit_ History
00
Property_Area
Loan_Status
EEEEEE GEE EES
[ B58 3 8 82 &
Bape eee die
SsEEe ES we
a oe $8 8 § 238 8
wee GO
8 =
Machine Learning Model Building
In [44]:
*
4f_upsampled.drop("Loan_status’, axis=1)
4#_upsampled{ 'Loan_status' ]
y
In [45]: #test size 26% and train size 80%
from sklearn.model_selection import train_test_split
fron sklearn.metrics import accuracy_score
X_train, X test, y train, y test = train_test_split(X,y, test_siz
-2,random_state=i
Decision TreeIn [46]:
out [46]:
In [47]:
In [48]:
In [49]:
out [49]:
In [50]:
In [51]:
fron sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(random_state=2)
dtree.fit(X_train, y_train)
DecisionTreeClassifier(randon_stati
ypred = dtree.predict(X_test)
print("Accuracy Score :", round(accuracy_score(y test, y pred)*10@ ,2),
Accuracy Score : 61.79 %
from sklearn.metrics import accuracy score, fi_score, precision_score, recall_score
print(‘F-1 Score : ',(f1_score(y_test, y_pred)))
print(‘Precision Score : *, (precision score(y_test, y_pred)))
print(‘Recall Score : ',(recall_score(y test, y_pred)))
F-1 Score : @.7314285714285714
Precision Score : 0.75294117647@5882
Recall Score : @.7111111111111111
Random Forest
from sklearn.ensenble import RandonForestClassifier
rfc = RandonForestClassifier(randon_state=0)
rfc.fit(X_train, y_train)
RandonForestClassifier(randon_stat
y_pred = rfc.predict(X_test)
print("Accuracy Score :", round(accuracy_score(y test, y_pred)*10@ ,2),
Accuracy Score : 74.8 %
from sklearn.metrics import accuracy_score, fi_score, precision_score, recall_score
print('F-1 Score : ',(f1_score(y_test, y_pred)))
print('Precision Score : ',(precision_score(y test, y_pred)))
print(*Recall Score : ',(recall_score(y_test, y_pred)))
F-1 Score : @.8287292817679559
Precision Score : .8241758241758241
Recall Score : @.8333333333333334
Logistic RegressionIn [52]:
out [52]:
In [53]:
In [54]:
from sklearn.linear_model import LogisticRegression
Lp = LogisticRegression(random_state=42)
Ar. fit(X_train, y_train)
D: \anaconda3\1ib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWa
ning: bfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the nunber of iterations (max_iter) or scale the data as shown in:
https://scikit-earn.org/stable/nodules/preprocessing.html (https://scikit-lear
n.org/stable/nodules/preprocessing.html)
Please also refer to the docunentation for alternative solver options:
https: //scikit-learn.org/stable/nodules/linear_nodel .htmiWlogistic-regression
(hetps: //scikit-learn.org/stable/nodules/linear_nodel..html#logistic-regression)
n_iter_i = _check_optimize_result(
Logistickegression(randon_state=42)
y_pred = Ir.predict(Xx_test)
print("Accuracy Score :", round(accuracy_score(y test, y_pred)*10@ ,2),
Accuracy Score : 79.67 %
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
print('F-1 Score : ',(f1_score(y test, y_pred)))
print('Precision Score : ',(precision_score(y test, y_pred)))
print(*Recall Score : ',(recall_score(y_test, y_pred)))
F-1 Score : @.8677248677248678
Precision Score : 0.8282828282828283
Recall Score : @.9111111111111111
Decision Tree Feature ImportancesIn [55]: #Feature Importance
imp_df = pd.DataFrame({
Feature Name": X_train.columns,
Importance": dtree. feature_importances_
imp_df.sort_values(by="Importance", ascending-False)
out [55]:
Feature Name Importance
7
5 Applicantincome 0.196718
8 Credit History 0.188198
6 Coapplicantincame 0.148907
2 Dependents 0.055870
3 Education 0.050723
1 Married 0.040836
LoanAmount 0.208848
8 Loan_Amount_Term 0.035561
10 Properly Area 0.033897
° Gender 0.025186,
4 Sell_Employed 0.015264In [56]:
fi2 = fi.head(10)
plt. Figure(Figsize=(10,8))
sns.barplot(data=fi2, x="Inportance’, y="Feature Name")
plt.title('Feature Importance Each Attributes (Decision Tree)', fontsize=18)
plt.xlabel (‘Inportance', fontsize=16)
plt.ylabel (‘Feature Name", fontsize=16)
plt.show()
Feature Importance Each Attributes (Decision Tree)
sepicanticame
rei History
Cospplcanincme
epee
Fsyaton
Feature Name
Loan_Araunterm
Propenty Aves
(Gender
3
‘000 02500500750 ots as
Importance
Random Forest Feature ImportancesIn [57]: #Feature Importance
imp_df = pd.DataFrame({
Feature Name": X_train.columns,
Importance": rfc. feature_importances_
imp_df.sort_values(by="Importance", ascending-False)
out [57]:
Feature Name Importance
7 LoanAmount 0.216710
8 Credit History 0.176489
© Coapplicantincame 0.126380
2 Dependents 0.058595
10 Property Area 0.051157
‘Applicantincome 0.217521
8 Loan_Amount_Term 0.043985
° Gender 0.034534
3 Education 0.027188,
1 Married 0.026270
4 Sell_Employed 0.021778In [58]:
In [59]:
out [59]:
fi2 = fi-head(10)
plt.figure(figsize=(10,8))
sns.barplot(data=fi2, x='Inportance", y="Feature Name")
plt.title('Feature Importance Each Attributes (Random Forest)’, fontsize=18)
plt.xlabel (‘Inportance’, fontsize=16)
plt.ylabel (‘Feature Name’, fontsize=16)
plt.show()
Feature Importance Each Attributes (Random Forest)
‘Aeplicantname
Loanmount
Cred History
Coapplicaniocome
Dependents
Property Aveo
Feature Name
loan_Arount Term
candor
Futon
Mores
fr
0 005 o10 ass oa
Importance
Test File
check missing = test.isnul1().sum() * 100 / test.shape[0]
check_missing[ check missing > 0].sort_values(ascending-False)
Credit_History 7.901987
Self_Employed 6.267038
Gender 2.997275
Dependents 2.724796
Loan_Amount_Term 1.634877
LoanAmount 1.362398
dtype: floatesIn [60]:
In [61]:
out [61]:
In [62]:
out [62]:
In [63]:
out [63]:
In [64]:
out [64]:
In [65]:
out [65]:
In [66]:
out [66]:
In [67]:
out (67):
In [68]:
out [68]:
test['Credit_History’].fillna(@, inplace=True)
test['Self_tmployed'].fillna('No', inplace=True)
test[ ‘LoanAnount'].fillna(®, inplace=True)
test[ ‘Dependents'].fillna( ‘Other’, inplace=True)
test ['Loan_Anount_Tern'].#il1na(@, inplace=True)
test['Gender'].fillna( ‘Other’, inplace=True)
test[ ‘Gender’ ]= label_encoder.fit_transform(test[ ‘Gender’ ])
test[ ‘Gender’ ] unique()
array([1, @ 21)
test[ ‘Merried" ]= label_encoder.fit_transform(test[ ‘Married’ ])
test[ 'Married® ].unique()
array([1, @])
test['Dependents']= label_encoder. fit_transform(test[ ‘Dependents' ])
test[ ‘Dependents’ ] .unique()
array([@, 1, 2, 3, 4])
test[ Education’ ]= label_encoder.fit_transform(test[ ‘Education’ ])
test[ Education’ ].unique()
array([@, 2])
test['Self_Enployed’]= label_encoder. fit_transform(test{ ‘Self Employed" ])
test['Sel# Employed" ] .unique()
array([®, 1])
test['Property_Area']= label_encoder.fit_transform(test[ 'Property_Area'])
test['Property_Area’].unique()
array([2, 1, @])
test['Loan_Anount_Term']= label_encoder.fit_transform(test[‘Loan_Anount_Term’])
test['Loan_Anount_Term'].unique()
array([11, 8, 7, @, 4, 12, 5, 2, 9, 10, 3, 6, 1], dtyperinte4)
no = test[['Loan_10']]
no.head()
Loan 1D
0 LPOOTOIS
+ LPoot0z2
2 LPootost
3 LP091035
4 Leoot9stIn [69]: test_data = test.drop(‘Loan_1D', axis=1)
In [70]: y_pred_prob = rfc.predict(test_data)
y_pred_prob_df = pd.DataFrame(data=y_pred_prob)
y_pred_prob_df
out [70]:
362 1
3631
364
365
366 1
367 rows x 1 columnsIn [71]:
out [71]:
In [72]:
out [72]:
no[ ‘Loan_status’]
no.head(22)
y_pred_prob_df
C: \Users\Michael \Appbata\Local \Temp \ ipykernel_26032\4219528891.py:1: SettingwithCop
yWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https: //pandas.pydata.org/pandas-docs/stable/
user_guide/ indexing. htnl¥returning-a-view-versus-a-copy (https: //pandas.pydata.org/
pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy)
no['Loan_Status'] = y_pred_prob_df
Loan_ID Loan_Status
0 LPoo101S 1
1 Lpoot0z2 1
2 Leoot0st 1
3 LPo91035 °
4 Leoot0st °
Loo 1054 1
6 LP00t055 1
7 LP901056 °
8 LP00t059 1
8 LP001067 1
10 Leooi07e °
11 LPoot082 1
42 LPoot08s °
43. LPoot094 °
14 LPoot096 1
45. LPo01099 1
46 LPoott0s 1
47 LPoot107 1
48 LPootto8 °
49. LPoottis °
no.dtypes
Loan_1D object
Loan status int32
dtype: objectIn [73]:
out [73]:
In [74]:
out [74]:
no["Loan_status'
no.dtypes
}o["Loan_Status"] .astype(str)
C: \Users\Michael \AppData\Local \Temp\ipykernel_26032\3928728142.py:1: SettingwithCop
yWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
‘Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https: //pandas.pydata.org/pandas-docs/stable/
user_guide/ indexing. htnl#returning-a-view-versus-a-copy (https: //pandas.pydata.org/
pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy)
no["Loan_Status" ]=no["Loan_Status"].astype(str)
Loan_1D object
Loanstatus object
dtype: object
no[ ‘Loan Status'] = no['Loan Status" ].astype(str).str.replace("e", 'N’)
no['Loan_Status"] = no['Loan_Status'].astype(str).str-replace('1", 'Y')
no[ ‘Loan_Status* J unique()
: \Users\Michael \AppData\Local \Temp\ ipykernel_26032\3293911733.py:1: SettingWithCop
ybarning:
‘A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/
user_guide/indexing.htmltireturning-a-view-versus-a-copy (https: //pandas.pydata.org/
pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy)
nol ‘Loan_status’] = no["Loan_status'].astype(str).str.replace("@", 'N")
C:\Users\Michael \Appbata\Local\Temp\ipykernel_26032\3293911733.py:2: SettingWithCop
yWarning:
A value is trying to be set on a copy of a slice from a Datafrane.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/
user_guide/indexing.html#returning-a-view-versus-a-copy (https: //pandas.pydata.org/
pandas-docs/stable/user_guide/indexing. htmlsreturning-a-view-versus-a-copy)
no["Loan_Status'] = no['Loan_Status"].astype(str).str.replace("1', 'Y')
array(['Y', 'N'], dtype=object)In [75]: no
out [75]:
LoanID Loan_Status
0 LPoo1015 Y
+ LPoot022
2 LPoot031
3. LP001035
zz<<
4 LPo01051
362 LP00297:
363 LP002975
364 LP002980
365. LP002986