In [1]: # demonstrating working of Decision Tree based on ID3 model
import pandas as pd
from pandas import DataFrame
from collections import Counter # to hold count of each element
In [5]: df_tennis=pd.read_csv('play_tennis.csv')
In [16]: df_tennis.head()
Out[16]:
day outlook temp humidity wind play
0 D1 Sunny Hot High Weak No
1 D2 Sunny Hot High Strong No
2 D3 Overcast Hot High Weak Yes
3 D4 Rain Mild High Weak Yes
4 D5 Rain Cool Normal Weak Yes
In [17]: df_tennis
Out[17]:
day outlook temp humidity wind play
0 D1 Sunny Hot High Weak No
1 D2 Sunny Hot High Strong No
2 D3 Overcast Hot High Weak Yes
3 D4 Rain Mild High Weak Yes
4 D5 Rain Cool Normal Weak Yes
5 D6 Rain Cool Normal Strong No
6 D7 Overcast Cool Normal Strong Yes
7 D8 Sunny Mild High Weak No
8 D9 Sunny Cool Normal Weak Yes
9 D10 Rain Mild Normal Weak Yes
10 D11 Sunny Mild Normal Strong Yes
11 D12 Overcast Mild High Strong Yes
12 D13 Overcast Hot Normal Weak Yes
13 D14 Rain Mild High Strong No
In [6]: df_tennis.keys()[4]
Out[6]: 'wind'
In [7]: # function to compute entropy of individual attribute
def entropy(probs):
import math
return sum([-prob*math.log(prob,2)for prob in probs])
In [8]: # function to compute entropy of given attribute w.r.t. target attribute
def entropy_of_list(a_list):
cnt=Counter(x for x in a_list)
num_instances=len(a_list)
print('\n Number of instances of the current sub class is {0}:'
.format(num_instances))
# to convert into binary form we use .format
probs=[x/num_instances for x in cnt.values()]
print('\n Classes:',min(cnt),max(cnt))
print('\n Probabilities of Class {0} is {1}:'.format(min(cnt),min(probs)))
print('\n Probabilities of Class {0} is {1}:'.format(max(cnt),max(probs)))
return entropy(probs)
In [9]: # wind---strong----yes
# wind---strong---no
# lets make independent and dependent variable i.e. X & Y
# here Y is binary (Play: Yes/No)
print('\n Input dataset for entropy calculation:\n',df_tennis['play'])
Input dataset for entropy calculation:
0 No
1 No
2 Yes
3 Yes
4 Yes
5 No
6 Yes
7 No
8 Yes
9 Yes
10 Yes
11 Yes
12 Yes
13 No
Name: play, dtype: object
In [10]: total_entropy=entropy_of_list(df_tennis['play'])
print('\n Total Entropy of Play Tennis Set is:',total_entropy)
Number of instances of the current sub class is 14:
Classes: No Yes
Probabilities of Class No is 0.35714285714285715:
Probabilities of Class Yes is 0.6428571428571429:
Total Entropy of Play Tennis Set is: 0.9402859586706309
Information Gain = Entropy before splitting - Entropy after splitting IG(S, a) = H(S) – H(S | a)
H(S | a) = sum v in a Sa(v)/S * H(Sa(v)) where
IG(S, a) is the information for the dataset S for the variable a for a random variable H(S) is the
entropy for the dataset before any change H(S | a) is the conditional entropy for the dataset
given the variable a
In [11]: def information_gain(df,split_attribute_name,target_attribute_name):
print("information gain calculation of",split_attribute_name)
df_split=df.groupby(split_attribute_name)
nobs=len(df.index*1.0)
print("NOBS",nobs)
df_agg_ent= df_split.agg({target_attribute_name:
[entropy_of_list,lambda x:len(x)/nobs]})
print('FEATURE',df_agg_ent)
df_agg_ent.columns=['Entropy','PropObservations']
new_entropy=sum(df_agg_ent['Entropy']*df_agg_ent['PropObservations'])
old_entropy=entropy_of_list(df[target_attribute_name])
return old_entropy - new_entropy
NOBS= number of observations .agg function allows you to apply function along one axis
In [12]: print('Information Gain for Outlook is:'
+str(information_gain(df_tennis,'outlook','play')))
information gain calculation of outlook
NOBS 14
Number of instances of the current sub class is 4:
Classes: Yes Yes
Probabilities of Class Yes is 1.0:
Probabilities of Class Yes is 1.0:
Number of instances of the current sub class is 5:
Classes: No Yes
Probabilities of Class No is 0.4:
Probabilities of Class Yes is 0.6:
Number of instances of the current sub class is 5:
Classes: No Yes
Probabilities of Class No is 0.4:
Probabilities of Class Yes is 0.6:
FEATURE play
entropy_of_list <lambda_0>
outlook
Overcast 0.000000 0.285714
Rain 0.970951 0.357143
Sunny 0.970951 0.357143
Number of instances of the current sub class is 14:
Classes: No Yes
Probabilities of Class No is 0.35714285714285715:
Probabilities of Class Yes is 0.6428571428571429:
Information Gain for Outlook is:0.2467498197744391
In [13]: print('Information Gain for Outlook is:'
+str(information_gain(df_tennis,'temp','play')),"\n")
information gain calculation of temp
NOBS 14
Number of instances of the current sub class is 4:
Classes: No Yes
Probabilities of Class No is 0.25:
Probabilities of Class Yes is 0.75:
Number of instances of the current sub class is 4:
Classes: No Yes
Probabilities of Class No is 0.5:
Probabilities of Class Yes is 0.5:
Number of instances of the current sub class is 6:
Classes: No Yes
Probabilities of Class No is 0.3333333333333333:
Probabilities of Class Yes is 0.6666666666666666:
FEATURE play
entropy_of_list <lambda_0>
temp
Cool 0.811278 0.285714
Hot 1.000000 0.285714
Mild 0.918296 0.428571
Number of instances of the current sub class is 14:
Classes: No Yes
Probabilities of Class No is 0.35714285714285715:
Probabilities of Class Yes is 0.6428571428571429:
Information Gain for Outlook is:0.029222565658954647
In [14]: print('Information Gain for Outlook is:'
+str(information_gain(df_tennis,'humidity','play')))
information gain calculation of humidity
NOBS 14
Number of instances of the current sub class is 7:
Classes: No Yes
Probabilities of Class No is 0.42857142857142855:
Probabilities of Class Yes is 0.5714285714285714:
Number of instances of the current sub class is 7:
Classes: No Yes
Probabilities of Class No is 0.14285714285714285:
Probabilities of Class Yes is 0.8571428571428571:
FEATURE play
entropy_of_list <lambda_0>
humidity
High 0.985228 0.5
Normal 0.591673 0.5
Number of instances of the current sub class is 14:
Classes: No Yes
Probabilities of Class No is 0.35714285714285715:
Probabilities of Class Yes is 0.6428571428571429:
Information Gain for Outlook is:0.15183550136234136
In [15]: print('Information Gain for Outlook is:'
+str(information_gain(df_tennis,'wind','play')))
information gain calculation of wind
NOBS 14
Number of instances of the current sub class is 6:
Classes: No Yes
Probabilities of Class No is 0.5:
Probabilities of Class Yes is 0.5:
Number of instances of the current sub class is 8:
Classes: No Yes
Probabilities of Class No is 0.25:
Probabilities of Class Yes is 0.75:
FEATURE play
entropy_of_list <lambda_0>
wind
Strong 1.000000 0.428571
Weak 0.811278 0.571429
Number of instances of the current sub class is 14:
Classes: No Yes
Probabilities of Class No is 0.35714285714285715:
Probabilities of Class Yes is 0.6428571428571429:
Information Gain for Outlook is:0.04812703040826927