ID3 and C4.5 Decision Tree Algorithms: Python Examples
### Python Implementations of the ID3 and C4.5 Decision Tree Algorithms
#### ID3 Decision Tree Implementation
The ID3 decision tree splits nodes using information entropy and information gain: at each node it picks the feature whose split reduces the entropy of the class labels the most.
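For a dataset $D$ with class proportions $p_k$, the standard definitions (which the code below computes directly) are:

$$
H(D) = -\sum_{k=1}^{K} p_k \log_2 p_k,
\qquad
\mathrm{Gain}(D, a) = H(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|}\, H(D^v)
$$

where $D^v$ is the subset of $D$ in which feature $a$ takes its $v$-th value. A simple ID3 tree-building implementation, including the `split_data_set` helper the other functions rely on: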
```python
from math import log


def calc_entropy(data_set):
    """Shannon entropy of the class labels (last column of each row)."""
    num_entries = len(data_set)
    label_counts = {}
    for feat_vec in data_set:
        current_label = feat_vec[-1]
        if current_label not in label_counts:
            label_counts[current_label] = 0
        label_counts[current_label] += 1
    entropy = 0.0
    for key in label_counts:
        prob = float(label_counts[key]) / num_entries
        entropy -= prob * log(prob, 2)
    return entropy


def split_data_set(data_set, axis, value):
    """Rows whose feature `axis` equals `value`, with that column removed."""
    sub_data_set = []
    for feat_vec in data_set:
        if feat_vec[axis] == value:
            sub_data_set.append(feat_vec[:axis] + feat_vec[axis + 1:])
    return sub_data_set


def choose_best_feature_to_split_ID3(data_set):
    """Index of the feature with the highest information gain."""
    base_entropy = calc_entropy(data_set)
    best_info_gain = 0.0
    best_feature = -1
    feature_count = len(data_set[0]) - 1  # last column is the class label
    for i in range(feature_count):
        feat_list = [example[i] for example in data_set]
        unique_vals = set(feat_list)
        new_entropy = 0.0
        for value in unique_vals:
            sub_data_set = split_data_set(data_set, i, value)
            prob = len(sub_data_set) / float(len(data_set))
            new_entropy += prob * calc_entropy(sub_data_set)
        info_gain = base_entropy - new_entropy
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = i
    return best_feature


def create_tree_id3(data_set, labels):
    """Recursively build an ID3 tree as nested dicts keyed by feature name."""
    class_list = [example[-1] for example in data_set]
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]  # all samples share one class: leaf node
    if len(data_set[0]) == 1:
        # no features left to split on: return the majority class
        return max(set(class_list), key=class_list.count)
    best_feat = choose_best_feature_to_split_ID3(data_set)
    best_feat_label = labels[best_feat]
    my_tree = {best_feat_label: {}}
    del labels[best_feat]
    feat_values = [example[best_feat] for example in data_set]
    for value in set(feat_values):
        sub_labels = labels[:]  # copy so sibling branches see the same list
        my_tree[best_feat_label][value] = create_tree_id3(
            split_data_set(data_set, best_feat, value), sub_labels)
    return my_tree
```
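A minimal usage sketch, assuming a made-up weather-style toy dataset (the rows, values, and `labels` names are purely illustrative; the last column of each row is the class):

```python
# Hypothetical toy dataset: each row is [outlook, temperature, class].
data_set = [
    ['sunny', 'hot',  'no'],
    ['sunny', 'mild', 'no'],
    ['rainy', 'mild', 'yes'],
    ['rainy', 'cool', 'yes'],
    ['sunny', 'cool', 'yes'],
]
labels = ['outlook', 'temperature']

# Pass a copy: create_tree_id3 deletes entries from the list it is given.
tree = create_tree_id3(data_set, labels[:])
print(tree)
# {'temperature': {'hot': 'no', 'mild': {'outlook': {'sunny': 'no',
#                  'rainy': 'yes'}}, 'cool': 'yes'}}  (key order may vary)
```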
#### C4.5 Decision Tree Implementation
C4.5 is an improved version of ID3; its key change is splitting on the information gain ratio instead of raw information gain, which corrects ID3's bias toward features with many distinct values.
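Concretely, the gain ratio normalizes the information gain by the feature's intrinsic value (split information), which is exactly the `gain / iv` computation in `gain_ratio` below:

$$
\mathrm{GainRatio}(D, a) = \frac{\mathrm{Gain}(D, a)}{\mathrm{IV}(a)},
\qquad
\mathrm{IV}(a) = -\sum_{v=1}^{V} \frac{|D^v|}{|D|} \log_2 \frac{|D^v|}{|D|}
$$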
```python
def gain_ratio(data_set, index):
    """Information gain ratio of the feature at `index`.

    Reuses calc_entropy (and log) from the ID3 code above.
    """
    total_entropy = calc_entropy(data_set)
    values = set(feat[index] for feat in data_set)
    iv = 0.0                 # intrinsic value (split information)
    weighted_entropy = 0.0
    for val in values:
        subset = [feat for feat in data_set if feat[index] == val]
        probability = len(subset) / float(len(data_set))
        iv -= probability * log(probability, 2)
        weighted_entropy += probability * calc_entropy(subset)
    gain = total_entropy - weighted_entropy
    return gain / iv if iv != 0 else 0


def choose_best_feature_to_split_C4_5(data_set):
    """Index of the feature with the highest gain ratio."""
    num_features = len(data_set[0]) - 1
    best_gain_ratio = 0.0
    best_feature_index = -1
    for i in range(num_features):
        curr_gain_ratio = gain_ratio(data_set, i)
        if curr_gain_ratio > best_gain_ratio:
            best_gain_ratio = curr_gain_ratio
            best_feature_index = i
    return best_feature_index


def create_tree_c45(data_set, labels):
    """Same recursive construction as ID3, but splitting on gain ratio."""
    class_list = [example[-1] for example in data_set]
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]  # all samples share one class: leaf node
    if len(data_set[0]) == 1:
        # no features left to split on: return the majority class
        return max(set(class_list), key=class_list.count)
    best_feat = choose_best_feature_to_split_C4_5(data_set)
    best_feat_label = labels[best_feat]
    tree = {best_feat_label: {}}
    del labels[best_feat]
    feat_val = [ex[best_feat] for ex in data_set]
    for v in set(feat_val):
        copy_labels = labels[:]
        tree[best_feat_label][v] = create_tree_c45(
            split_data_set(data_set, best_feat, v), copy_labels)
    return tree
```
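Driving the C4.5 builder the same way on the toy dataset from the ID3 example above illustrates the practical difference between the two criteria:

```python
# Reuses the hypothetical data_set and labels from the ID3 usage example.
tree_c45 = create_tree_c45(data_set, labels[:])
print(tree_c45)
# {'outlook': {'sunny': {'temperature': {'hot': 'no', 'mild': 'no',
#              'cool': 'yes'}}, 'rainy': 'yes'}}  (key order may vary)
```

On this toy data the two algorithms pick different roots: raw information gain favors `temperature` (three distinct values), while the intrinsic-value penalty in the gain ratio leads C4.5 to split on the two-valued `outlook` first.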