// Main constructor: stores the tree-building parameters on the object, then
// sets the class-type string and the tag passed to each of the four logs.
// NOTE(review): this listing omits some original lines (the embedded numbering
// jumps from 31 to 34), so statements may exist that are not shown here.
// NOTE(review): useScaling is accepted as a parameter but no visible line stores
// it - confirm it is assigned on an omitted line, otherwise the argument is ignored.
31 ClusterTree::ClusterTree(
const UINT numSplittingSteps,
const UINT minNumSamplesPerNode,
const UINT maxDepth,
const bool removeFeaturesAtEachSpilt,
const UINT trainingMode,
const bool useScaling,
const Float minRMSErrorPerNode){
// Copy each argument into the member of the same name.
34 this->numSplittingSteps = numSplittingSteps;
35 this->minNumSamplesPerNode = minNumSamplesPerNode;
36 this->maxDepth = maxDepth;
37 this->removeFeaturesAtEachSpilt = removeFeaturesAtEachSpilt;
38 this->trainingMode = trainingMode;
39 this->minRMSErrorPerNode = minRMSErrorPerNode;
// Record the class name on the base class and mirror it into clustererType.
40 Clusterer::classType =
"ClusterTree";
41 clustererType = Clusterer::classType;
// Give each log a ClusterTree-specific tag string.
42 Clusterer::debugLog.setProceedingText(
"[DEBUG ClusterTree]");
43 Clusterer::errorLog.setProceedingText(
"[ERROR ClusterTree]");
44 Clusterer::trainingLog.setProceedingText(
"[TRAINING ClusterTree]");
45 Clusterer::warningLog.setProceedingText(
"[WARNING ClusterTree]");
// Fragment of a second definition whose header is not visible in this listing
// (presumably another constructor - TODO confirm). It repeats the same
// class-type and log-tag setup as the main constructor, statement for statement.
51 Clusterer::classType =
"ClusterTree";
52 clustererType = Clusterer::classType;
53 Clusterer::debugLog.setProceedingText(
"[DEBUG ClusterTree]");
54 Clusterer::errorLog.setProceedingText(
"[ERROR ClusterTree]");
55 Clusterer::trainingLog.setProceedingText(
"[TRAINING ClusterTree]");
56 Clusterer::warningLog.setProceedingText(
"[WARNING ClusterTree]");
// Fragment that copies the six tree-building parameters from `rhs` into this
// object. The enclosing header is not visible (presumably operator= - TODO
// confirm); any self-assignment check or tree copy is on lines omitted here.
75 this->numSplittingSteps = rhs.numSplittingSteps;
76 this->minNumSamplesPerNode = rhs.minNumSamplesPerNode;
77 this->maxDepth = rhs.maxDepth;
78 this->removeFeaturesAtEachSpilt = rhs.removeFeaturesAtEachSpilt;
79 this->trainingMode = rhs.trainingMode;
80 this->minRMSErrorPerNode = rhs.minRMSErrorPerNode;
// Fragment of a copy-from-pointer routine (presumably deepCopyFrom - header not
// visible, TODO confirm). Returns false straight away for a null source.
91 if( clusterer == NULL )
return false;
// Copy the six tree-building parameters from `ptr`. How `ptr` is obtained from
// `clusterer` is on lines this listing omits.
105 this->numSplittingSteps = ptr->numSplittingSteps;
106 this->minNumSamplesPerNode = ptr->minNumSamplesPerNode;
107 this->maxDepth = ptr->maxDepth;
108 this->removeFeaturesAtEachSpilt = ptr->removeFeaturesAtEachSpilt;
109 this->trainingMode = ptr->trainingMode;
110 this->minRMSErrorPerNode = ptr->minRMSErrorPerNode;
// Fragments of the training routine (the log messages below name it
// train_(MatrixFloat &trainingData)); the header and several statements are
// omitted from this listing.
// M = number of rows, N = number of columns of the training matrix.
124 const unsigned int M = trainingData.
getNumRows();
125 const unsigned int N = trainingData.
getNumCols();
// Error reported for empty training data (the guarding condition is not visible).
128 Clusterer::errorLog <<
"train_(MatrixFloat &trainingData) - Training data has zero samples!" << std::endl;
// The model takes N inputs and produces a single output.
132 numInputDimensions = N;
133 numOutputDimensions = 1;
// Rescales the caller's matrix with scale(0, 1). The condition guarding this
// call is not visible here - presumably the useScaling flag, TODO confirm.
139 trainingData.
scale(0, 1);
// Loop over the N columns; the body is omitted (presumably fills `features` - TODO confirm).
144 for(UINT i=0; i<N; i++){
// Label counter handed to buildTree, starting at 0.
149 UINT clusterLabel = 0;
// Build the tree from the root: NULL is passed as the parent node.
151 tree = buildTree( trainingData, NULL, features, clusterLabel, nodeID );
156 Clusterer::errorLog <<
"train_(MatrixFloat &trainingData) - Failed to build tree!" << std::endl;
// Cluster labels are 1-based: entry i holds i+1.
166 clusterLabels[i] = i+1;
// Size the per-cluster result buffers to numClusters, filled with 0.
168 clusterLikelihoods.
resize(numClusters,0);
169 clusterDistances.
resize(numClusters,0);
// Fragments of the prediction routine (the log messages name it
// predict_(VectorFloat &inputVector)); the header, the guards around the first
// two errors and the return statements are omitted from this listing.
177 Clusterer::errorLog <<
"predict_(VectorFloat &inputVector) - Model Not Trained!" << std::endl;
// NOTE(review): the message says "DecisionTree" although this is ClusterTree -
// looks like a copy/paste leftover.
182 Clusterer::errorLog <<
"predict_(VectorFloat &inputVector) - DecisionTree pointer is null!" << std::endl;
// Reject inputs whose length differs from the trained dimensionality.
186 if( inputVector.size() != numInputDimensions ){
// NOTE(review): the message opens "(" before numInputDimensions but never
// closes it, so the logged text ends with an unbalanced parenthesis.
187 Clusterer::errorLog <<
"predict_(VectorFloat &inputVector) - The size of the input Vector (" << inputVector.size() <<
") does not match the num features in the model (" << numInputDimensions << std::endl;
// Rescale every element from its stored [minValue, maxValue] range to [0, 1].
// This overwrites the caller's vector in place. The guard around this loop is
// not visible here (presumably useScaling - TODO confirm).
192 for(UINT n=0; n<numInputDimensions; n++){
193 inputVector[n] =
scale(inputVector[n], ranges[n].minValue, ranges[n].maxValue, 0, 1);
// Ask the tree for the cluster label; a false return is logged as a failure.
198 if( !tree->
predict( inputVector, clusterLabel ) ){
199 Clusterer::errorLog <<
"predict_(VectorFloat &inputVector) - Failed to predict!" << std::endl;
// Fragment (presumably from print() - header not visible): forwards to the
// tree's own print() and returns its result. Whether `tree` is null-checked
// first is on lines this listing omits.
223 return tree->
print();
// Fragments of the model writer (the log messages name it
// saveModelToFile(fstream &file)); the header, braces and return statements
// are omitted from this listing.
// The stream must already be open.
229 if( !file.is_open() )
231 Clusterer::errorLog <<
"saveModelToFile(fstream &file) - The file is not open!" << std::endl;
// File header token.
236 file <<
"GRT_CLUSTER_TREE_MODEL_FILE_V1.0" << std::endl;
240 Clusterer::errorLog <<
"saveModelToFile(fstream &file) - Failed to save clusterer settings to file!" << std::endl;
// One "Key: value" line per parameter.
244 file <<
"NumSplittingSteps: " << numSplittingSteps << std::endl;
245 file <<
"MinNumSamplesPerNode: " << minNumSamplesPerNode << std::endl;
246 file <<
"MaxDepth: " << maxDepth << std::endl;
247 file <<
"RemoveFeaturesAtEachSpilt: " << removeFeaturesAtEachSpilt << std::endl;
248 file <<
"TrainingMode: " << trainingMode << std::endl;
249 file <<
"MinRMSErrorPerNode: " << minRMSErrorPerNode << std::endl;
// TreeBuilt is written as 1 when a tree exists and 0 otherwise.
250 file <<
"TreeBuilt: " << (tree != NULL ? 1 : 0) << std::endl;
255 Clusterer::errorLog <<
"saveModelToFile(fstream &file) - Failed to save tree to file!" << std::endl;
// Fragments of the model reader. The header, the `file >> word` reads, the
// braces and the return statements are omitted from this listing.
// NOTE(review): most messages are prefixed "loadModelFromFile(string filename)"
// while the last two use "loadModelFromFile(fstream &file)" - the prefixes
// disagree with each other.
269 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not open file to load model" << std::endl;
// The first token must be the file header.
277 if(word !=
"GRT_CLUSTER_TREE_MODEL_FILE_V1.0"){
278 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find Model File Header" << std::endl;
284 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Failed to load base settings from file!" << std::endl;
// Each parameter is stored as a key token followed by its value. The key is
// checked first, then the value is read into the matching member.
289 if(word !=
"NumSplittingSteps:"){
290 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the NumSplittingSteps!" << std::endl;
293 file >> numSplittingSteps;
296 if(word !=
"MinNumSamplesPerNode:"){
297 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the MinNumSamplesPerNode!" << std::endl;
300 file >> minNumSamplesPerNode;
// NOTE(review): the read of the MaxDepth value is not visible in this listing
// (the other keys show a `file >> member` line) - presumably omitted, confirm.
303 if(word !=
"MaxDepth:"){
304 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the MaxDepth!" << std::endl;
310 if(word !=
"RemoveFeaturesAtEachSpilt:"){
311 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the RemoveFeaturesAtEachSpilt!" << std::endl;
314 file >> removeFeaturesAtEachSpilt;
317 if(word !=
"TrainingMode:"){
318 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the TrainingMode!" << std::endl;
321 file >> trainingMode;
324 if(word !=
"MinRMSErrorPerNode:"){
325 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the MinRMSErrorPerNode!" << std::endl;
328 file >> minRMSErrorPerNode;
331 if(word !=
"TreeBuilt:"){
332 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the TreeBuilt!" << std::endl;
340 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the Tree!" << std::endl;
// NOTE(review): the message says "RegressionTreeNode" although this class is
// ClusterTree - looks like a copy/paste leftover.
349 Clusterer::errorLog <<
"loadModelFromFile(fstream &file) - Failed to create new RegressionTreeNode!" << std::endl;
// The loaded node becomes the root, so it has no parent.
353 tree->setParent( NULL );
356 Clusterer::errorLog <<
"loadModelFromFile(fstream &file) - Failed to load tree from file!" << std::endl;
// Cluster labels are 1-based: entry i holds i+1.
363 clusterLabels[i] = i+1;
// Size the per-cluster result buffers to numClusters, filled with 0.
365 clusterLikelihoods.
resize(numClusters,0);
366 clusterDistances.
resize(numClusters,0);
// Getter fragment (header not visible): returns the stored minRMSErrorPerNode.
390 return minRMSErrorPerNode;
// Setter fragment (header not visible): stores the argument in the member of
// the same name. No validation of the value is visible in this listing.
394 this->minRMSErrorPerNode = minRMSErrorPerNode;
// Fragments of the recursive tree builder (header, node allocation, braces and
// returns are omitted from this listing; the calls below name it buildTree).
// Initialise the node with its parent, depth and ID.
421 node->initNode( parent, depth, nodeID );
// Stop splitting when no features remain, when the node has fewer than
// minNumSamplesPerNode samples, or when maxDepth has been reached.
424 if( features.
getSize() == 0 || M < minNumSamplesPerNode || depth >= maxDepth ){
430 node->setIsLeafNode(
true );
// Leaf: store the sample count and the cluster label, with feature index and
// threshold both 0.
433 node->
set( M, 0, 0, clusterLabel );
435 Clusterer::trainingLog <<
"Reached leaf node. Depth: " << depth <<
" NumSamples: " << M << std::endl;
// Search for the best split; featureIndex, threshold and minError are
// out-parameters filled by computeBestSpilt.
441 UINT featureIndex = 0;
444 if( !computeBestSpilt( trainingData, features, featureIndex, threshold, minError ) ){
449 Clusterer::trainingLog <<
"Depth: " << depth <<
" FeatureIndex: " << featureIndex <<
" Threshold: " << threshold <<
" MinError: " << minError << std::endl;
// If the best split error is already at or below the configured minimum, make
// this node a leaf instead of splitting further.
452 if( minError <= minRMSErrorPerNode ){
457 node->setIsLeafNode(
true );
460 node->
set( M, featureIndex, threshold, clusterLabel );
462 Clusterer::trainingLog <<
"Reached leaf node. Depth: " << depth <<
" NumSamples: " << M << std::endl;
// Internal node: store the split, with 0 passed as the cluster label.
468 node->
set( M, featureIndex, threshold, 0 );
// Optionally remove the chosen feature so it is not used again.
// NOTE(review): no `break` after the erase is visible in this listing. Without
// one, erasing while advancing i skips the element that slides into slot i -
// confirm the omitted lines leave the loop after erasing.
471 if( removeFeaturesAtEachSpilt ){
472 for(UINT i=0; i<features.
getSize(); i++){
473 if( features[i] == featureIndex ){
474 features.erase( features.begin()+i );
// Loop over the M samples; the body is omitted (presumably partitions rows
// into `lhs` and `rhs` by the threshold - TODO confirm).
484 for(UINT i=0; i<M; i++){
// Recurse into both partitions with this node as their parent.
491 node->setLeftChild( buildTree( lhs, node, features, clusterLabel, nodeID ) );
492 node->setRightChild( buildTree( rhs, node, features, clusterLabel, nodeID ) );
// Dispatches the split search to the implementation selected by trainingMode.
// featureIndex, threshold and minError are out-parameters filled by the chosen
// implementation, whose bool result is returned unchanged.
// NOTE(review): the two enum names are spelled inconsistently
// (BEST_ITERATIVE_SPILT vs BEST_RANDOM_SPLIT).
497 bool ClusterTree::computeBestSpilt(
const MatrixFloat &trainingData,
const Vector< UINT > &features, UINT &featureIndex, Float &threshold, Float &minError ){
499 switch( trainingMode ){
500 case BEST_ITERATIVE_SPILT:
501 return computeBestSpiltBestIterativeSpilt( trainingData, features, featureIndex, threshold, minError );
503 case BEST_RANDOM_SPLIT:
504 return computeBestSpiltBestRandomSpilt( trainingData, features, featureIndex, threshold, minError );
// Any other mode is reported as an error (the return on this path is omitted
// from this listing). NOTE(review): "Uknown" is a typo in the logged text.
507 Clusterer::errorLog <<
"Uknown trainingMode!" << std::endl;
// Searches for a split by stepping a threshold across each candidate feature's
// range and scoring every (feature, threshold) pair; the pair with the lowest
// error is written to the featureIndex/threshold out-parameters.
// Returns false when `features` is empty. Declarations of M, ranges, the
// group* buffers, the minError update, the threshold increment and the final
// return are on lines omitted from this listing.
515 bool ClusterTree::computeBestSpiltBestIterativeSpilt(
const MatrixFloat &trainingData,
const Vector< UINT > &features, UINT &featureIndex, Float &threshold, Float &minError ){
518 const UINT N = (UINT)features.size();
522 if( N == 0 )
return false;
525 UINT bestFeatureIndex = 0;
527 Float bestThreshold = 0;
// Try every candidate feature.
539 for(UINT n=0; n<N; n++){
// NOTE(review): ranges is indexed with n (the position inside `features`),
// while the data below is read at column features[n]. Once features have been
// removed, n and features[n] differ, so the wrong column's range would be
// swept - presumably this should be ranges[featureIndex]; confirm.
540 minRange = ranges[n].minValue;
541 maxRange = ranges[n].maxValue;
// NOTE(review): divides by numSplittingSteps with no visible check for 0.
542 step = (maxRange-minRange)/Float(numSplittingSteps);
543 threshold = minRange;
// The out-parameters featureIndex/threshold double as scratch variables during
// the search and are overwritten with the best values at the end.
544 featureIndex = features[n];
// NOTE(review): if the omitted lines advance threshold by `step`, a feature
// with maxRange == minRange gives step == 0 and this loop would never end -
// confirm.
546 while( threshold <= maxRange ){
// Reset the per-group accumulators for this threshold.
549 groupCounter[0] = groupCounter[1] = 0;
550 groupMean[0] = groupMean[1] = 0;
551 groupMSE[0] = groupMSE[1] = 0;
// Assign each sample to group 1 when its feature value is >= threshold, and to
// group 0 otherwise, accumulating the per-group sum and count.
554 for(UINT i=0; i<M; i++){
555 groupID = trainingData[i][featureIndex] >= threshold ? 1 : 0;
556 groupIndex[i] = groupID;
559 groupMean[ groupID ] += trainingData[i][featureIndex];
560 groupCounter[ groupID ]++;
// Turn sums into means; an empty group divides by 1 instead of 0.
564 groupMean[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
565 groupMean[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);
// Accumulate each sample's squared deviation from its group mean.
568 for(UINT i=0; i<M; i++){
569 groupMSE[ groupIndex[i] ] += grt_sqr( groupMean[ groupIndex[i] ] - trainingData[i][featureIndex] );
571 groupMSE[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
572 groupMSE[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);
// Split score: square root of the sum of the two group mean-squared errors.
574 error = grt_sqrt( groupMSE[0] + groupMSE[1] );
// Keep the best (lowest-error) split seen so far.
577 if( error < minError ){
579 bestThreshold = threshold;
580 bestFeatureIndex = featureIndex;
// Publish the best split through the out-parameters.
589 featureIndex = bestFeatureIndex;
590 threshold = bestThreshold;
// Searches for a split by scoring numSplittingSteps candidate thresholds per
// feature; the pair with the lowest error is written to the
// featureIndex/threshold out-parameters. Returns false when `features` is
// empty. How each candidate threshold is chosen is on lines omitted from this
// listing (presumably drawn at random, per the function name - TODO confirm),
// as are the declarations of M and the group* buffers, the minError update and
// the final return.
595 bool ClusterTree::computeBestSpiltBestRandomSpilt(
const MatrixFloat &trainingData,
const Vector< UINT > &features, UINT &featureIndex, Float &threshold, Float &minError ){
598 const UINT N = (UINT)features.size();
600 Clusterer::debugLog <<
"computeBestSpiltBestRandomSpilt() M: " << M << std::endl;
602 if( N == 0 )
return false;
605 UINT bestFeatureIndex = 0;
607 Float bestThreshold = 0;
// Try every candidate feature.
616 for(UINT n=0; n<N; n++){
// The out-parameter featureIndex doubles as a scratch variable during the
// search and is overwritten with the best value at the end.
617 featureIndex = features[n];
// Score numSplittingSteps candidate thresholds for this feature.
619 for(UINT m=0; m<numSplittingSteps; m++){
// Reset the per-group accumulators for this threshold.
624 groupCounter[0] = groupCounter[1] = 0;
625 groupMean[0] = groupMean[1] = 0;
626 groupMSE[0] = groupMSE[1] = 0;
// Assign each sample to group 1 when its feature value is >= threshold, and to
// group 0 otherwise, accumulating the per-group sum and count.
629 for(UINT i=0; i<M; i++){
630 groupID = trainingData[i][featureIndex] >= threshold ? 1 : 0;
631 groupIndex[i] = groupID;
634 groupMean[ groupID ] += trainingData[i][featureIndex];
635 groupCounter[ groupID ]++;
// Turn sums into means; an empty group divides by 1 instead of 0.
639 groupMean[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
640 groupMean[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);
// Accumulate each sample's squared deviation from its group mean.
// NOTE(review): this uses MLBase::SQR and sqrt, while the iterative variant
// uses grt_sqr and grt_sqrt for the same computation - inconsistent helpers.
643 for(UINT i=0; i<M; i++){
644 groupMSE[ groupIndex[i] ] += MLBase::SQR( groupMean[ groupIndex[i] ] - trainingData[i][featureIndex] );
646 groupMSE[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
647 groupMSE[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);
// Split score: square root of the sum of the two group mean-squared errors.
649 error = sqrt( groupMSE[0] + groupMSE[1] );
// Keep the best (lowest-error) split seen so far.
652 if( error < minError ){
654 bestThreshold = threshold;
655 bestFeatureIndex = featureIndex;
// Publish the best split through the out-parameters.
661 featureIndex = bestFeatureIndex;
662 threshold = bestThreshold;
const ClusterTreeNode * getTree() const
virtual bool train_(MatrixFloat &trainingData)
Float scale(const Float &x, const Float &minSource, const Float &maxSource, const Float &minTarget, const Float &maxTarget, const bool constrain=false)
bool setMinRMSErrorPerNode(const Float minRMSErrorPerNode)
virtual ~ClusterTree(void)
This class implements a Cluster Tree. This can be used to automatically build a cluster model (where ...
virtual bool print() const
std::string getClustererType() const
virtual bool saveModelToFile(std::fstream &file) const
bool scale(const Float minTarget, const Float maxTarget)
virtual bool resize(const unsigned int size)
virtual bool predict(const VectorFloat &x)
bool copyBaseVariables(const Clusterer *clusterer)
bool loadClustererSettingsFromFile(std::fstream &file)
unsigned int getSize() const
bool set(const UINT nodeSize, const UINT featureIndex, const Float threshold, const UINT clusterLabel)
ClusterTreeNode * deepCopyTree() const
UINT predictedClusterLabel
Stores the predicted cluster label from the most recent predict( )
virtual Node * deepCopyNode() const
bool saveClustererSettingsToFile(std::fstream &file) const
virtual bool saveToFile(std::fstream &file) const
ClusterTree & operator=(const ClusterTree &rhs)
virtual bool loadFromFile(std::fstream &file)
UINT numClusters
Number of clusters in the model.
unsigned int getNumRows() const
unsigned int getNumCols() const
virtual bool deepCopyFrom(const Clusterer *cluster)
virtual bool predict_(VectorFloat &inputVector)
virtual bool print() const
virtual bool loadModelFromFile(std::fstream &file)
VectorFloat getRow(const unsigned int r) const
Vector< MinMax > getRanges() const
Float getRandomNumberUniform(Float minRange=0.0, Float maxRange=1.0)
ClusterTree(const UINT numSplittingSteps=100, const UINT minNumSamplesPerNode=5, const UINT maxDepth=10, const bool removeFeaturesAtEachSpilt=false, const UINT trainingMode=BEST_ITERATIVE_SPILT, const bool useScaling=false, const Float minRMSErrorPerNode=0.01)
UINT getPredictedClusterLabel() const
bool push_back(const Vector< T > &sample)
virtual bool predict(const VectorFloat &x)
Float getMinRMSErrorPerNode() const