9/20/2018 komal_DT1_EDAWithFunctions_Titanic
Decision Tree and EDA with functions
In [51]: import numpy as np
import pandas as pd
In [52]: datafile = "D:\komal\SIMPLILEARN\MY COURSES\IN PROGRESS\MACHINE LEARNING RECOR
DINGS\Jul 28 Sat - Aug 25 Sat\Drive downloads\Machine Learning _ Jul 28 - Aug
25 _ Sayan\Decision Trees/[Link]"
In [53]: #BeautifulSoup is the library used for web scrapping
from bs4 import BeautifulSoup
with open(datafile,"r",encoding="Latin-1") as f:
soup = BeautifulSoup(f,"[Link]")
In [54]: table = [Link]('table')
In [55]: import pandas as pd
data = data = pd.read_html(str(table).encode('ascii', errors='replace'), flavo
r='bs4')[0]
In [56]: [Link]()
Out[56]:
Boat Unnamed:
Name Age Class/Dept Ticket Joined Job
[Body] 7
AB??-AL-
MUN??, Mr 3rd Class 2699?18
0 27 Cherbourg ? 15? NaN
N??s??f Passenger 15s 9d
Q??sim
ABBING, Mr 3rd Class 5547?7 Blacksmith
1 42 Southampton ?? NaN
Anthony Passenger 11s ?
ABBOTT,
3rd Class CA2673?
2 Mrs Rhoda 39 Southampton ? A? NaN
Passenger 20 5s
Mary 'Rosa'
ABBOTT, Mr
3rd Class CA2673?
3 Rossmore 16 Southampton Jeweller ? ?[190] NaN
Passenger 20 5s
Edward
ABBOTT, Mr
3rd Class CA2673?
4 Eugene 13 Southampton Scholar ? ?? NaN
Passenger 20 5s
Joseph
[Link] 1/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic
In [57]: def cleanup(value):
return [Link]("?"," ")
In [58]: data['Name'] = data['Name'].apply(cleanup)
data['Boat [Body]'] = data['Boat [Body]'].apply(cleanup)
data['Age'] = data['Age'].apply(pd.to_numeric,errors='coerce')
[Link]()
Out[58]:
Boat Unnamed:
Name Age Class/Dept Ticket Joined Job
[Body] 7
AB -AL-
3rd Class 2699?18
0 MUN , Mr N 27.0 Cherbourg ? 15 NaN
Passenger 15s 9d
s f Q sim
ABBING, Mr 3rd Class 5547?7 Blacksmith
1 42.0 Southampton NaN
Anthony Passenger 11s ?
ABBOTT,
3rd Class CA2673?
2 Mrs Rhoda 39.0 Southampton ? A NaN
Passenger 20 5s
Mary 'Rosa'
ABBOTT, Mr
3rd Class CA2673?
3 Rossmore 16.0 Southampton Jeweller ? [190] NaN
Passenger 20 5s
Edward
ABBOTT, Mr
3rd Class CA2673?
4 Eugene 13.0 Southampton Scholar ? NaN
Passenger 20 5s
Joseph
In [59]: data = data[["Name","Age","Class/Dept","Boat [Body]"]]
[Link]()
Out[59]:
Name Age Class/Dept Boat [Body]
0 AB -AL-MUN , Mr N s f Q sim 27.0 3rd Class Passenger 15
1 ABBING, Mr Anthony 42.0 3rd Class Passenger
2 ABBOTT, Mrs Rhoda Mary 'Rosa' 39.0 3rd Class Passenger A
3 ABBOTT, Mr Rossmore Edward 16.0 3rd Class Passenger [190]
4 ABBOTT, Mr Eugene Joseph 13.0 3rd Class Passenger
[Link] 2/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic
In [60]: def checkPass(class_type):
if "Passenger" in class_type:
return "Passenger"
else:
return "Crew"
data["Crew/Pass"]=data["Class/Dept"].apply(checkPass)
[Link]()
Out[60]:
Name Age Class/Dept Boat [Body] Crew/Pass
0 AB -AL-MUN , Mr N s f Q sim 27.0 3rd Class Passenger 15 Passenger
1 ABBING, Mr Anthony 42.0 3rd Class Passenger Passenger
2 ABBOTT, Mrs Rhoda Mary 'Rosa' 39.0 3rd Class Passenger A Passenger
3 ABBOTT, Mr Rossmore Edward 16.0 3rd Class Passenger [190] Passenger
4 ABBOTT, Mr Eugene Joseph 13.0 3rd Class Passenger Passenger
In [61]: def class_person(class_type):
if "Passenger" in class_type:
return class_type.split(" ")[0]
else:
return 'crew'
data['Class'] = data['Class/Dept'].apply(class_person)
[Link]()
Out[61]:
Boat
Name Age Class/Dept Crew/Pass Class
[Body]
3rd Class
0 AB -AL-MUN , Mr N s f Q sim 27.0 15 Passenger 3rd
Passenger
3rd Class
1 ABBING, Mr Anthony 42.0 Passenger 3rd
Passenger
ABBOTT, Mrs Rhoda Mary 3rd Class
2 39.0 A Passenger 3rd
'Rosa' Passenger
ABBOTT, Mr Rossmore 3rd Class
3 16.0 [190] Passenger 3rd
Edward Passenger
3rd Class
4 ABBOTT, Mr Eugene Joseph 13.0 Passenger 3rd
Passenger
[Link] 3/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic
In [62]: def child_class(value):
if value>=18:
return 'adult'
else:
return 'child'
data['Adult/Child'] = data['Age'].apply(child_class)
[Link]()
Out[62]:
Boat
Name Age Class/Dept Crew/Pass Class Adult/Child
[Body]
AB -AL-MUN , Mr N s f 3rd Class
0 27.0 15 Passenger 3rd adult
Q sim Passenger
3rd Class
1 ABBING, Mr Anthony 42.0 Passenger 3rd adult
Passenger
ABBOTT, Mrs Rhoda 3rd Class
2 39.0 A Passenger 3rd adult
Mary 'Rosa' Passenger
ABBOTT, Mr Rossmore 3rd Class
3 16.0 [190] Passenger 3rd child
Edward Passenger
ABBOTT, Mr Eugene 3rd Class
4 13.0 Passenger 3rd child
Joseph Passenger
[Link] 4/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic
In [63]: def gender_determiner(name):
firstname = name[[Link](",")+ 2:]
salutation = [Link](" ")[0]
if salutation in ['Mr','Master']:
return 'Male'
else:
return 'Female'
data['Gender'] = data['Name'].apply(gender_determiner)
[Link]()
Out[63]:
Boat
Name Age Class/Dept Crew/Pass Class Adult/Child Gender
[Body]
AB -AL-MUN , Mr 3rd Class
0 27.0 15 Passenger 3rd adult Male
N s f Q sim Passenger
ABBING, Mr 3rd Class
1 42.0 Passenger 3rd adult Male
Anthony Passenger
ABBOTT, Mrs
3rd Class
2 Rhoda Mary 39.0 A Passenger 3rd adult Female
Passenger
'Rosa'
ABBOTT, Mr
3rd Class
3 Rossmore 16.0 [190] Passenger 3rd child Male
Passenger
Edward
ABBOTT, Mr 3rd Class
4 13.0 Passenger 3rd child Male
Eugene Joseph Passenger
[Link] 5/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic
In [65]: def checkSurvival(value):
if [Link]() == " " or "[" in value:
return 0
else:
return 1
data["Survival"]=data["Boat [Body]"].apply(checkSurvival)
[Link]()
Out[65]:
Boat
Name Age Class/Dept Crew/Pass Class Adult/Child Gender Survival
[Body]
AB -AL-
MUN , Mr 3rd Class
0 27.0 15 Passenger 3rd adult Male 1
NsfQ Passenger
sim
ABBING,
3rd Class
1 Mr 42.0 Passenger 3rd adult Male 1
Passenger
Anthony
ABBOTT,
Mrs
3rd Class
2 Rhoda 39.0 A Passenger 3rd adult Female 1
Passenger
Mary
'Rosa'
ABBOTT,
Mr 3rd Class
3 16.0 [190] Passenger 3rd child Male 0
Rossmore Passenger
Edward
ABBOTT,
Mr 3rd Class
4 13.0 Passenger 3rd child Male 1
Eugene Passenger
Joseph
In [67]: [Link](['Crew/Pass'])['Survival'].sum()*100/[Link](['Crew/Pass'])[
'Survival'].count()
Out[67]: Crew/Pass
Crew 90.217391
Passenger 90.310651
Name: Survival, dtype: float64
[Link] 6/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic
In [69]: def compare(group,data):
return [Link]([group])['Survival'].sum()*100/[Link]([group])[
'Survival'].count()
compare("Class",data)
Out[69]: Class
1st 89.714286
2nd 88.395904
3rd 91.396333
crew 90.217391
Name: Survival, dtype: float64
In [70]: compare("Gender",data)
Out[70]: Gender
Female 95.840555
Male 88.557743
Name: Survival, dtype: float64
In [71]: compare("Adult/Child",data)
Out[71]: Adult/Child
adult 89.699955
child 95.964126
Name: Survival, dtype: float64
In [72]: trainingData=data[["Age","Crew/Pass","Class","Adult/Child","Gender","Survival"
]]
[Link]()
Out[72]:
Age Crew/Pass Class Adult/Child Gender Survival
0 27.0 Passenger 3rd adult Male 1
1 42.0 Passenger 3rd adult Male 1
2 39.0 Passenger 3rd adult Female 1
3 16.0 Passenger 3rd child Male 0
4 13.0 Passenger 3rd child Male 1
[Link] 7/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic
In [73]: def catToNum(series):
series = [Link]('category')
return [Link]
catData=trainingData[["Crew/Pass","Class","Adult/Child","Gender"]].apply(catTo
Num)
trainingData[["Crew/Pass","Class","Adult/Child","Gender"]]=catData
[Link]()
C:\Users\hariz\Anaconda3\lib\site-packages\pandas\core\[Link]: Setting
WithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: [Link]
able/[Link]#indexing-view-versus-copy
self[k1] = value[k2]
Out[73]:
Age Crew/Pass Class Adult/Child Gender Survival
0 27.0 1 2 0 1 1
1 42.0 1 2 0 1 1
2 39.0 1 2 0 0 1
3 16.0 1 2 1 1 0
4 13.0 1 2 1 1 1
In [74]: len(trainingData)
Out[74]: 2456
In [75]: trainingData = [Link]()
len(trainingData)
Out[75]: 2426
In [76]: from sklearn.model_selection import train_test_split
train, test = train_test_split(trainingData, test_size = 0.2)
In [77]: len(train)
Out[77]: 1940
In [78]: len(test)
Out[78]: 486
In [79]: from [Link] import DecisionTreeClassifier
clf=DecisionTreeClassifier(max_leaf_nodes=25)
clf=[Link](train[["Age","Crew/Pass","Class","Adult/Child","Gender"]],train["S
urvival"])
[Link] 8/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic
In [81]: clf
Out[81]: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=25,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=None,
splitter='best')
In [82]: clf.feature_importances_
Out[82]: array([0.72325166, 0.03119177, 0.15634522, 0. , 0.08921135])
In [83]: predictions = [Link](test[["Age","Crew/Pass","Class","Adult/Child","Gende
r"]])
In [89]: from [Link] import accuracy_score
accuracy_score(test["Survival"], predictions)
Out[89]: 0.8847736625514403
[Link] 9/9