admin 管理员组文章数量: 887019
2024年1月23日发(作者:wordpress更换域名)
PROTO.H
NEGENTROPY negentropy ( REAL **, UINT, NODE*, UINT );
void print_tree ( NODE* , CHAR** );
void free_tree ( NODE * );
NODE* ID3 ( MATRIX * , NODE* , UINT , UINT );
void err_exit ( CHAR* , UINT );
MATRIX *build_matrix ( UINT, UINT );
void free_matrix ( MATRIX * );
void read_matrix ( CHAR *, MATRIX * );
void file_size ( CHAR * , UINT * , UINT * );
CHAR **read_tags ( CHAR * , UINT );
void free_tags ( CHAR **, UINT);
ID3.h
typedef unsigned int UINT;
typedef unsigned long ULONG;
typedef char CHAR;
typedef unsigned char BOOL;
typedef double REAL;
typedef struct node {
UINT idx; /* ID code for attribute */
REAL threshold; /* Numerical threshold for attribute test */
struct node *on; /* Address of 'on' node */
struct node *off; /* Address of 'off' node */
struct node *parent; /* Addess of parent node */
} NODE;
typedef struct ne_struct {
REAL ne;
UINT status;
} NEGENTROPY;
typedef struct matrix {
UINT width;
UINT height;
REAL **data;
} MATRIX;
enum UINT { INACTIVE, OFF, ON };
#define LN_2 0.693147187
#define entropy(x) (x > 0 ? x * log(x) / LN_2 : 0.0)
/*
* FILE: id3.c
*
* Author: Andrew Colin
*
* DISCLAIMER: No liability is assumed by the author for any use made
* of this program.
*
* DISTRIBUTION: Any use may be made of this program, as long as the
* clear acknowledgment is made to the author in code and runtime
* executables
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include "id3.h"
#include "proto.h"
/*-------------------------------------------------------------------*/
MATRIX *build_matrix (UINT width, UINT height)
{
MATRIX *_matrix;
UINT i;
_matrix = (MATRIX*) malloc (sizeof (MATRIX));
if (!_matrix)
err_exit (__FILE__, __LINE__);
_matrix->width = width;
_matrix->height = height;
_matrix->data = (REAL**) malloc (height * sizeof (REAL*));
if (_matrix->data == NULL)
err_exit(__FILE__, __LINE__);
for (i=0; i { _matrix->data[i] = (REAL*) malloc (width * sizeof(REAL)); if (_matrix->data[i] == NULL) err_exit(__FILE__, __LINE__); } return _matrix; } /*-------------------------------------------------------------------*/ /* * Standard error handler function */ void err_exit (CHAR* file, UINT line) { printf("n Fatal error in file %s, line %u", file, line); exit(0); } /*-------------------------------------------------------------------*/ void file_size (CHAR *file_name, UINT *width, UINT *height) /* * Given the name of a file of numeric data, this routine counts * the numbers of rows and columns. It's assumed that the number * of entries is the same in each row, and an error is flagged if this * is not the case. * */ { FILE *f; UINT buf_size = 0xFF, _width = 0; CHAR *buffer, *ptr; *width = *height = 0; buffer = (CHAR*) malloc (buf_size * sizeof (CHAR)); if (buffer == NULL) err_exit (__FILE__, __LINE__); /* Open price file - abort if filename invalid */ f = fopen(file_name, "r"); if (f == NULL) { printf("n File not found : %sn", file_name); err_exit (__FILE__, __LINE__); } /* Get number of entries in first row */ if (fgets(buffer, buf_size, f) != NULL) { ++*height; ptr = strtok (buffer, " ,"); while (ptr != NULL) { ++*width; ptr = strtok (NULL, " ,"); } } /* Count numbers of subsequent rows */ while (!feof(f)) { if (fgets(buffer, buf_size, f) != NULL) { if (strlen(buffer) > strlen("n")) /* if line is more than a NL char */ { ++*height; _width = 0; ptr = strtok (buffer, " ,"); while (ptr != NULL) { ++_width; ptr = strtok (NULL, " ,"); } if (*width != _width) { printf("n Number of entries in file %s did not agree", file_name); err_exit (__FILE__, __LINE__); } } } } free (buffer); } /*-------------------------------------------------------------------*/ void free_matrix (MATRIX *_matrix) { UINT i; for (i=0; i<_matrix->height; i++) free (_matrix->data[i]); free (_matrix->data); free (_matrix); } /*-------------------------------------------------------------------*/ void free_tags ( CHAR** varname, UINT width) { UINT i; for (i=0; i free(varname[i]); free (varname); } /*-------------------------------------------------------------------*/ void free_tree ( NODE *node ) { /* * Frees the memory allocated to a tree structure */ if (node == NULL) return; else { free_tree (node->on); free_tree (node->off); free(node); } } /*-------------------------------------------------------------------*/ NODE* ID3 ( MATRIX *matrix, NODE* parent, UINT target, UINT state) /* Routine to build a decision tree, based on Quinlan's ID3 algorithm. */ { NEGENTROPY negentropy_struct; NODE *node; UINT n_vars = matrix->width, n_samples = matrix->height, i, j, split; REAL **data = matrix->data; REAL best_threshold, min_negentropy, _negentropy; /* Allocate memory for this node */ node = (NODE*) malloc (sizeof(NODE)); if (!node) err_exit (__FILE__, __LINE__); /* Set up links in decision tree */ node->parent = parent; /* Set address of parent node */ if (parent != NULL) /* parent to child; not relevant for root node */ { /* Pass address of this node to the parent node */ if (state == ON) parent->on = node; else if (state == OFF) parent->off = node; } /* * Select attribute with lowest negentropy for splitting. Scan through * ALL attributes (except the target) and ALL data samples. This is * pretty inefficient for data sets with repeated values, but will do * for illustrative purposes */ min_negentropy = 1.0; for (i=0; i { for (j=0; j { if (i != target) { /* Set trial values for */ node->idx = i; node->threshold = data[j][i]; /* ...and calculate the negentropy of this partition */ negentropy_struct = negentropy (data, n_samples, node, target); _negentropy = negentropy_; /* If this negentropy is lower than any other, retain the index and threshold for future use */ if (_negentropy < min_negentropy) { min_negentropy = _negentropy; split = i; best_threshold = data[j][i]; } } /*if (i != target)*/ } /*for (j=0; j } /*for (i=0; i /* Save the combination of best attribute and threshold value */ node->idx = split; node->threshold = best_threshold; /* * If the negentropy routine found itself at an end-of-branch * for the decision tree, the 'status' flag in 'negentropy_struct' * is set to ON or OFF and the node labelled accordingly. Otherwise, * ID3 continues to call itself until all end-of-branch nodes are * found. */ if (negentropy_ != INACTIVE) { node->on = node->off = NULL; node->idx = negentropy_; } else { node->on = ID3 (matrix, node, target, ON); node->off = ID3 (matrix, node, target, OFF); } return node; } /*-------------------------------------------------------------------*/ void main (int argv, char *argc[]) { MATRIX *matrix; NODE *node; UINT target, n_vars, n_samples; CHAR data_file[13], tag_file[13]; /* Longest file name in DOS */ CHAR **tag_names; /* Set up file names */ if (argv != 2) { printf("nUsage: id3 [datafile]"); exit(0); } else { printf("nWelcome to ID3"); printf("nLast compiled on %s, %s", __DATE__, __TIME__); printf("n"); strcpy(data_file, argc[1]); strcpy(tag_file, argc[1]); strcat(data_file, ".dat"); strcat(tag_file, ".tag"); } /* Read dimensions of data file */ file_size (data_file, &n_vars, &n_samples); /* Read labels for columns of data */ tag_names = read_tags (tag_file, n_vars); /* Allocate storage */ matrix = build_matrix (n_vars, n_samples); /* ...and read it from disk */ read_matrix (data_file, matrix); /* Classification target is last column */ target = n_vars - 1; /* Return root of decision tree - ID3 continues to call itself recursively */ node = ID3 ( matrix, NULL, target, 0 ); print_tree(node, tag_names); printf("n"); free_tags (tag_names, n_vars); free_matrix(matrix); free_tree (node); } /*-------------------------------------------------------------------*/ NEGENTROPY negentropy ( REAL **data, UINT n_samples, NODE *local, UINT target) { /* * Calculates the entropy of classification of an attribute, given * a table of attributes already used, the attribute on which splitting * is to be taken, and the target attribute. Entropy is calculated in * bits, so logs are taken to base 2 by dividing by LN_2. * * The returned value always lies in the (closed) range [0, 1]. */ NEGENTROPY ret_val; NODE *_node, *_parent; UINT on_ctr, off_ctr, p1, p2, i, _match; REAL p_on, p_off, negentropy_on, negentropy_off; on_ctr = off_ctr = p1 = p2 = 0; /* Scan through all supplied data samples */ for (i=0; i { /* * If pattern matches the current position in the decision tree, * then use this vector. The match is determined by passing up * the decision tree and checking whether 'data[idx] >= threshold' * matches at each step, where idx and threshold are taken from * each node in turn. */ _match = 1; _node = local; while (_node->parent != NULL) { /* If at the root node, all entries match*/ _parent = _node->parent; if (_node == _parent->on) { /* if parent node is ON */ if (data[i][_parent->idx] < _parent->threshold) _match = 0; } else if (_node == _parent->off) { /* if parent node is OFF */ if (data[i][_parent->idx] >= _parent->threshold) _match = 0; } _node = _parent; } if (_match) { if (data[i][local->idx] >= local->threshold) { on_ctr++; if (data[i][target] >= 0.5) p1++; } else { off_ctr++; if (data[i][target] >= 0.5) p2++; } } } /* for (i=0; i /* 1: Entropy of subtable with activation ON */ /* * We now have the numbers of samples that match this part of the * decision tree, and the number of samples for which the supplied * condition are true. From these quantities we can find the negentropy of * this partition. */ if (on_ctr > 0) { p_on = (REAL) p1 / (REAL) on_ctr; p_off = 1 - p_on; negentropy_on = -entropy (p_on) - entropy (p_off); } else negentropy_on = 0.0; /* 2: Entropy of subtable with activation OFF */ if (off_ctr > 0) { p_on = (REAL) p2 / (REAL) off_ctr; p_off = 1 - p_on; negentropy_off = -entropy (p_on) - entropy (p_off); } else negentropy_off = 0.0; ret_ = (negentropy_on * on_ctr + negentropy_off * off_ctr); ret_ /= (on_ctr + off_ctr); /* * If all values examined were the same, set 'ret_' to * the target value since this will be an end-of-branch node */ if ((p1 == on_ctr) && (p2 == off_ctr)) ret_ = ON; else if ((p1 == 0) && (p2 == 0)) ret_ = OFF; else ret_ = INACTIVE; return ret_val; } /*-------------------------------------------------------------------*/ void print_tree (NODE *node, CHAR** tag_names) { /* * Displays algebraic equivalent of the decision tree */ if (node->on == NULL) { if (node->idx == ON) printf("ON"); else if (node->idx == OFF) printf("OFF"); return; } else printf("if { %s >= %1.2f then ", tag_names[node->idx], node->threshold); print_tree ( node->on, tag_names ); printf(" else "); print_tree ( node->off, tag_names ); printf( " } " ); } /*-------------------------------------------------------------------*/ void read_matrix (CHAR filename[], MATRIX *matrix) { UINT i, j; UINT height = matrix->height; UINT width = matrix->width; REAL **data = matrix->data; FILE *f; /* Open price file */ if ((f = fopen(filename, "r")) == NULL) { printf("n File not found : %sn", filename); err_exit (__FILE__, __LINE__); } for (i=0; i for (j=0; j { fscanf(f, "%lfn", &data[i][j] ); } fclose(f); } /*-------------------------------------------------------------------*/ CHAR **read_tags (CHAR *tag_file, UINT width) { FILE *f; CHAR **_varname; UINT i; CHAR buffer[0xFF]; f = fopen(tag_file, "r"); if (f == NULL) { printf("n File not found : %sn", tag_file); err_exit (__FILE__, __LINE__); } _varname = (CHAR**) malloc (width * sizeof (CHAR*)); for (i=0; i _varname[i] = (CHAR*) malloc (0xFF * sizeof (CHAR)); i = 0; while (!feof(f)) { if (fgets(buffer, 0xFF, f) != NULL) { if (strlen(buffer) > strlen("n")) { if (i>width-1) { printf("nMore variable names were detected than data items."); printf("nPlease correct this problem before proceeding"); exit(0); } sscanf (buffer, "%[a-zA-Z0-9-_;:!@#$%^&*(){}[]]", _varname[i]); i++; } } } if (i { printf("nFewer variable names than data items were detected."); printf("nPlease correct this problem before proceeding"); exit(0); } fclose (f); return _varname; }
版权声明:本文标题:ID3算法(C语言) 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.freenas.com.cn/jishu/1705958168h495664.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论