PROTO.H
NEGENTROPY negentropy ( REAL **, UINT, NODE*, UINT );
void print_tree ( NODE* , CHAR** );
void free_tree ( NODE * );
NODE* ID3 ( MATRIX * , NODE* , UINT , UINT );
void err_exit ( CHAR* , UINT );
MATRIX *build_matrix ( UINT, UINT );
void free_matrix ( MATRIX * );
void read_matrix ( CHAR *, MATRIX * );
void file_size ( CHAR * , UINT * , UINT * );
CHAR **read_tags ( CHAR * , UINT );
void free_tags ( CHAR **, UINT);
ID3.h
typedef unsigned int UINT;
typedef unsigned long ULONG;
typedef char CHAR;
typedef unsigned char BOOL;
typedef double REAL;
typedef struct node {
UINT idx; /* ID code for attribute */
REAL threshold; /* Numerical threshold for attribute test */
struct node *on; /* Address of 'on' node */
struct node *off; /* Address of 'off' node */
struct node *parent; /* Addess of parent node */
} NODE;
typedef struct ne_struct {
REAL ne;
UINT status;
} NEGENTROPY;
typedef struct matrix {
UINT width;
UINT height;
REAL **data;
} MATRIX;
enum UINT { INACTIVE, OFF, ON };
#define LN_2 0.693147180559945309417
#define entropy(x) (x > 0 ? x * log(x) / LN_2 : 0.0)
/*
* FILE: id3.c
*
* Author: Andrew Colin
*
* DISCLAIMER: No liability is assumed by the author for any use made
* of this program.
*
* DISTRIBUTION: Any use may be made of this program, as long as the
* clear acknowledgment is made to the author in code and runtime
* executables
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <string.h>
#include <conio.h>
#include <time.h>
#include "id3.h"
#include "proto.h"
/*-------------------------------------------------------------------*/
MATRIX *build_matrix (UINT width, UINT height)
{
MATRIX *_matrix;
UINT i;
_matrix = (MATRIX*) malloc (sizeof (MATRIX));
if (!_matrix)
err_exit (__FILE__, __LINE__);
_matrix->width = width;
_matrix->height = height;
_matrix->data = (REAL**) malloc (height * sizeof (REAL*));
if (_matrix->data == NULL)
err_exit(__FILE__, __LINE__);
for (i=0; i<height; i++)
{
_matrix->data[i] = (REAL*) malloc (width * sizeof(REAL));
if (_matrix->data[i] == NULL)
err_exit(__FILE__, __LINE__);
}
return _matrix;
}
/*-------------------------------------------------------------------*/
/*
* Standard error handler function
*/
void err_exit (CHAR* file, UINT line)
{
printf("\n Fatal error in file %s, line %u", file, line);
exit(0);
}
/*-------------------------------------------------------------------*/
void file_size (CHAR *file_name, UINT *width, UINT *height)
/*
* Given the name of a file of numeric data, this routine counts
* the numbers of rows and columns. It's assumed that the number
* of entries is the same in each row, and an error is flagged if this
* is not the case.
*
*/
{
FILE *f;
UINT buf_size = 0xFF, _width = 0;
CHAR *buffer, *ptr;
*width = *height = 0;
buffer = (CHAR*) malloc (buf_size * sizeof (CHAR));
if (buffer == NULL)
err_exit (__FILE__, __LINE__);
/* Open price file - abort if filename invalid */
f = fopen(file_name, "r");
if (f == NULL)
{
printf("\n File not found : %s\n", file_name);
err_exit (__FILE__, __LINE__);
}
/* Get number of entries in first row */
if (fgets(buffer, buf_size, f) != NULL)
{
++*height;
ptr = strtok (buffer, " ,");
while (ptr != NULL)
{
++*width;
ptr = strtok (NULL, " ,");
}
}
/* Count numbers of subsequent rows */
while (!feof(f))
{
if (fgets(buffer, buf_size, f) != NULL)
{
if (strlen(buffer) > strlen("\n")) /* if line is more than a NL char */
{
++*height;
_width = 0;
ptr = strtok (buffer, " ,");
while (ptr != NULL)
{
++_width;
ptr = strtok (NULL, " ,");
}
if (*width != _width)
{
printf("\n Number of entries in file %s did not agree", file_name);
err_exit (__FILE__, __LINE__);
}
}
}
}
free (buffer);
}
/*-------------------------------------------------------------------*/
void free_matrix (MATRIX *_matrix)
{
UINT i;
for (i=0; i<_matrix->height; i++)
free (_matrix->data[i]);
free (_matrix->data);
free (_matrix);
}
/*-------------------------------------------------------------------*/
void free_tags ( CHAR** varname, UINT width)
{
UINT i;
for (i=0; i<width; i++)
free(varname[i]);
free (varname);
}
/*-------------------------------------------------------------------*/
void free_tree ( NODE *node )
{
/*
* Frees the memory allocated to a tree structure
*/
if (node == NULL)
return;
else
{
free_tree (node->on);
free_tree (node->off);
free(node);
}
}
/*-------------------------------------------------------------------*/
NODE* ID3 ( MATRIX *matrix, NODE* parent, UINT target, UINT state)
/* Routine to build a decision tree, based on Quinlan's ID3 algorithm. */
{
NEGENTROPY negentropy_struct;
NODE *node;
UINT n_vars = matrix->width, n_samples = matrix->height, i, j, split;
REAL **data = matrix->data;
REAL best_threshold, min_negentropy, _negentropy;
/* Allocate memory for this node */
node = (NODE*) malloc (sizeof(NODE));
if (!node)
err_exit (__FILE__, __LINE__);
/* Set up links in decision tree */
node->parent = parent; /* Set address of parent node */
if (parent != NULL) /* parent to child; not relevant for root node */
{
/* Pass address of this node to the parent node */
if (state == ON)
parent->on = node;
else
if (state == OFF)
parent->off = node;
}
/*
* Select attribute with lowest negentropy for splitting. Scan through
* ALL attributes (except the target) and ALL data samples. This is
* pretty inefficient for data sets with repeated values, but will do
* for illustrative purposes
*/
min_negentropy = 1.0;
for (i=0; i<n_vars; i++)
{
for (j=0; j<n_samples; j++)
{
if (i != target)
{
/* Set trial values for this node... */
node->idx = i;
node->threshold = data[j][i];
/* ...and calculate the negentropy of this partition */
negentropy_struct = negentropy (data, n_samples, node, target);
_negentropy = negentropy_struct.ne;
/* If this negentropy is lower than any other, retain the
index and threshold for future use */
if (_negentropy < min_negentropy)
{
min_negentropy = _negentropy;
split = i;
best_threshold = data[j][i];
}
} /*if (i != target)*/
} /*for (j=0; j<n_samples; j++)*/
} /*for (i=0; i<n_vars; i++)*/
/* Save the combination of best attribute and threshold value */
node->idx = split;
node->threshold = best_threshold;
/*
* If the negentropy routine found itself at an end-of-branch
* for the decision tree, the 'status' flag in 'negentropy_struct'
* is set to ON or OFF and the node labelled accordingly. Otherwise,
* ID3 continues to call itself until all end-of-branch nodes are
* found.
- 1
- 2
- 3
- 4
前往页