21 #define GRT_DLL_EXPORTS
32 ClusterTree::ClusterTree(
const UINT numSplittingSteps,
const UINT minNumSamplesPerNode,
const UINT maxDepth,
const bool removeFeaturesAtEachSpilt,
const UINT trainingMode,
const bool useScaling,
const Float minRMSErrorPerNode){
35 this->numSplittingSteps = numSplittingSteps;
36 this->minNumSamplesPerNode = minNumSamplesPerNode;
37 this->maxDepth = maxDepth;
38 this->removeFeaturesAtEachSpilt = removeFeaturesAtEachSpilt;
39 this->trainingMode = trainingMode;
40 this->minRMSErrorPerNode = minRMSErrorPerNode;
41 Clusterer::classType =
"ClusterTree";
42 clustererType = Clusterer::classType;
43 Clusterer::debugLog.setProceedingText(
"[DEBUG ClusterTree]");
44 Clusterer::errorLog.setProceedingText(
"[ERROR ClusterTree]");
45 Clusterer::trainingLog.setProceedingText(
"[TRAINING ClusterTree]");
46 Clusterer::warningLog.setProceedingText(
"[WARNING ClusterTree]");
52 Clusterer::classType =
"ClusterTree";
53 clustererType = Clusterer::classType;
54 Clusterer::debugLog.setProceedingText(
"[DEBUG ClusterTree]");
55 Clusterer::errorLog.setProceedingText(
"[ERROR ClusterTree]");
56 Clusterer::trainingLog.setProceedingText(
"[TRAINING ClusterTree]");
57 Clusterer::warningLog.setProceedingText(
"[WARNING ClusterTree]");
76 this->numSplittingSteps = rhs.numSplittingSteps;
77 this->minNumSamplesPerNode = rhs.minNumSamplesPerNode;
78 this->maxDepth = rhs.maxDepth;
79 this->removeFeaturesAtEachSpilt = rhs.removeFeaturesAtEachSpilt;
80 this->trainingMode = rhs.trainingMode;
81 this->minRMSErrorPerNode = rhs.minRMSErrorPerNode;
92 if( clusterer == NULL )
return false;
106 this->numSplittingSteps = ptr->numSplittingSteps;
107 this->minNumSamplesPerNode = ptr->minNumSamplesPerNode;
108 this->maxDepth = ptr->maxDepth;
109 this->removeFeaturesAtEachSpilt = ptr->removeFeaturesAtEachSpilt;
110 this->trainingMode = ptr->trainingMode;
111 this->minRMSErrorPerNode = ptr->minRMSErrorPerNode;
125 const unsigned int M = trainingData.
getNumRows();
126 const unsigned int N = trainingData.
getNumCols();
129 Clusterer::errorLog <<
"train_(MatrixFloat &trainingData) - Training data has zero samples!" << std::endl;
133 numInputDimensions = N;
134 numOutputDimensions = 1;
140 trainingData.
scale(0, 1);
145 for(UINT i=0; i<N; i++){
150 UINT clusterLabel = 0;
152 tree = buildTree( trainingData, NULL, features, clusterLabel, nodeID );
157 Clusterer::errorLog <<
"train_(MatrixFloat &trainingData) - Failed to build tree!" << std::endl;
167 clusterLabels[i] = i+1;
169 clusterLikelihoods.
resize(numClusters,0);
170 clusterDistances.
resize(numClusters,0);
178 Clusterer::errorLog <<
"predict_(VectorFloat &inputVector) - Model Not Trained!" << std::endl;
183 Clusterer::errorLog <<
"predict_(VectorFloat &inputVector) - DecisionTree pointer is null!" << std::endl;
187 if( inputVector.size() != numInputDimensions ){
188 Clusterer::errorLog <<
"predict_(VectorFloat &inputVector) - The size of the input Vector (" << inputVector.size() <<
") does not match the num features in the model (" << numInputDimensions << std::endl;
193 for(UINT n=0; n<numInputDimensions; n++){
194 inputVector[n] =
scale(inputVector[n], ranges[n].minValue, ranges[n].maxValue, 0, 1);
199 if( !tree->
predict( inputVector, clusterLabel ) ){
200 Clusterer::errorLog <<
"predict_(VectorFloat &inputVector) - Failed to predict!" << std::endl;
224 return tree->
print();
230 if( !file.is_open() )
232 Clusterer::errorLog <<
"saveModelToFile(fstream &file) - The file is not open!" << std::endl;
237 file <<
"GRT_CLUSTER_TREE_MODEL_FILE_V1.0" << std::endl;
241 Clusterer::errorLog <<
"saveModelToFile(fstream &file) - Failed to save clusterer settings to file!" << std::endl;
245 file <<
"NumSplittingSteps: " << numSplittingSteps << std::endl;
246 file <<
"MinNumSamplesPerNode: " << minNumSamplesPerNode << std::endl;
247 file <<
"MaxDepth: " << maxDepth << std::endl;
248 file <<
"RemoveFeaturesAtEachSpilt: " << removeFeaturesAtEachSpilt << std::endl;
249 file <<
"TrainingMode: " << trainingMode << std::endl;
250 file <<
"MinRMSErrorPerNode: " << minRMSErrorPerNode << std::endl;
251 file <<
"TreeBuilt: " << (tree != NULL ? 1 : 0) << std::endl;
255 if( !tree->
save( file ) ){
256 Clusterer::errorLog <<
"saveModelToFile(fstream &file) - Failed to save tree to file!" << std::endl;
270 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not open file to load model" << std::endl;
278 if(word !=
"GRT_CLUSTER_TREE_MODEL_FILE_V1.0"){
279 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find Model File Header" << std::endl;
285 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Failed to load base settings from file!" << std::endl;
290 if(word !=
"NumSplittingSteps:"){
291 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the NumSplittingSteps!" << std::endl;
294 file >> numSplittingSteps;
297 if(word !=
"MinNumSamplesPerNode:"){
298 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the MinNumSamplesPerNode!" << std::endl;
301 file >> minNumSamplesPerNode;
304 if(word !=
"MaxDepth:"){
305 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the MaxDepth!" << std::endl;
311 if(word !=
"RemoveFeaturesAtEachSpilt:"){
312 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the RemoveFeaturesAtEachSpilt!" << std::endl;
315 file >> removeFeaturesAtEachSpilt;
318 if(word !=
"TrainingMode:"){
319 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the TrainingMode!" << std::endl;
322 file >> trainingMode;
325 if(word !=
"MinRMSErrorPerNode:"){
326 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the MinRMSErrorPerNode!" << std::endl;
329 file >> minRMSErrorPerNode;
332 if(word !=
"TreeBuilt:"){
333 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the TreeBuilt!" << std::endl;
341 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the Tree!" << std::endl;
350 Clusterer::errorLog <<
"loadModelFromFile(fstream &file) - Failed to create new RegressionTreeNode!" << std::endl;
354 tree->setParent( NULL );
355 if( !tree->
load( file ) ){
357 Clusterer::errorLog <<
"loadModelFromFile(fstream &file) - Failed to load tree from file!" << std::endl;
364 clusterLabels[i] = i+1;
366 clusterLikelihoods.
resize(numClusters,0);
367 clusterDistances.
resize(numClusters,0);
391 return minRMSErrorPerNode;
395 this->minRMSErrorPerNode = minRMSErrorPerNode;
422 node->initNode( parent, depth, nodeID );
425 if( features.
getSize() == 0 || M < minNumSamplesPerNode || depth >= maxDepth ){
431 node->setIsLeafNode(
true );
434 node->
set( M, 0, 0, clusterLabel );
436 Clusterer::trainingLog <<
"Reached leaf node. Depth: " << depth <<
" NumSamples: " << M << std::endl;
442 UINT featureIndex = 0;
445 if( !computeBestSpilt( trainingData, features, featureIndex, threshold, minError ) ){
450 Clusterer::trainingLog <<
"Depth: " << depth <<
" FeatureIndex: " << featureIndex <<
" Threshold: " << threshold <<
" MinError: " << minError << std::endl;
453 if( minError <= minRMSErrorPerNode ){
458 node->setIsLeafNode(
true );
461 node->
set( M, featureIndex, threshold, clusterLabel );
463 Clusterer::trainingLog <<
"Reached leaf node. Depth: " << depth <<
" NumSamples: " << M << std::endl;
469 node->
set( M, featureIndex, threshold, 0 );
472 if( removeFeaturesAtEachSpilt ){
473 for(UINT i=0; i<features.
getSize(); i++){
474 if( features[i] == featureIndex ){
475 features.erase( features.begin()+i );
485 for(UINT i=0; i<M; i++){
492 node->setLeftChild( buildTree( lhs, node, features, clusterLabel, nodeID ) );
493 node->setRightChild( buildTree( rhs, node, features, clusterLabel, nodeID ) );
498 bool ClusterTree::computeBestSpilt(
const MatrixFloat &trainingData,
const Vector< UINT > &features, UINT &featureIndex, Float &threshold, Float &minError ){
500 switch( trainingMode ){
501 case BEST_ITERATIVE_SPILT:
502 return computeBestSpiltBestIterativeSpilt( trainingData, features, featureIndex, threshold, minError );
504 case BEST_RANDOM_SPLIT:
505 return computeBestSpiltBestRandomSpilt( trainingData, features, featureIndex, threshold, minError );
508 Clusterer::errorLog <<
"Uknown trainingMode!" << std::endl;
516 bool ClusterTree::computeBestSpiltBestIterativeSpilt(
const MatrixFloat &trainingData,
const Vector< UINT > &features, UINT &featureIndex, Float &threshold, Float &minError ){
519 const UINT N = (UINT)features.size();
523 if( N == 0 )
return false;
526 UINT bestFeatureIndex = 0;
528 Float bestThreshold = 0;
540 for(UINT n=0; n<N; n++){
541 minRange = ranges[n].minValue;
542 maxRange = ranges[n].maxValue;
543 step = (maxRange-minRange)/Float(numSplittingSteps);
544 threshold = minRange;
545 featureIndex = features[n];
547 while( threshold <= maxRange ){
550 groupCounter[0] = groupCounter[1] = 0;
551 groupMean[0] = groupMean[1] = 0;
552 groupMSE[0] = groupMSE[1] = 0;
555 for(UINT i=0; i<M; i++){
556 groupID = trainingData[i][featureIndex] >= threshold ? 1 : 0;
557 groupIndex[i] = groupID;
560 groupMean[ groupID ] += trainingData[i][featureIndex];
561 groupCounter[ groupID ]++;
565 groupMean[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
566 groupMean[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);
569 for(UINT i=0; i<M; i++){
570 groupMSE[ groupIndex[i] ] += grt_sqr( groupMean[ groupIndex[i] ] - trainingData[i][featureIndex] );
572 groupMSE[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
573 groupMSE[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);
575 error = grt_sqrt( groupMSE[0] + groupMSE[1] );
578 if( error < minError ){
580 bestThreshold = threshold;
581 bestFeatureIndex = featureIndex;
590 featureIndex = bestFeatureIndex;
591 threshold = bestThreshold;
596 bool ClusterTree::computeBestSpiltBestRandomSpilt(
const MatrixFloat &trainingData,
const Vector< UINT > &features, UINT &featureIndex, Float &threshold, Float &minError ){
599 const UINT N = (UINT)features.size();
601 Clusterer::debugLog <<
"computeBestSpiltBestRandomSpilt() M: " << M << std::endl;
603 if( N == 0 )
return false;
606 UINT bestFeatureIndex = 0;
608 Float bestThreshold = 0;
617 for(UINT n=0; n<N; n++){
618 featureIndex = features[n];
620 for(UINT m=0; m<numSplittingSteps; m++){
625 groupCounter[0] = groupCounter[1] = 0;
626 groupMean[0] = groupMean[1] = 0;
627 groupMSE[0] = groupMSE[1] = 0;
630 for(UINT i=0; i<M; i++){
631 groupID = trainingData[i][featureIndex] >= threshold ? 1 : 0;
632 groupIndex[i] = groupID;
635 groupMean[ groupID ] += trainingData[i][featureIndex];
636 groupCounter[ groupID ]++;
640 groupMean[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
641 groupMean[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);
644 for(UINT i=0; i<M; i++){
645 groupMSE[ groupIndex[i] ] += MLBase::SQR( groupMean[ groupIndex[i] ] - trainingData[i][featureIndex] );
647 groupMSE[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
648 groupMSE[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);
650 error = sqrt( groupMSE[0] + groupMSE[1] );
653 if( error < minError ){
655 bestThreshold = threshold;
656 bestFeatureIndex = featureIndex;
662 featureIndex = bestFeatureIndex;
663 threshold = bestThreshold;
const ClusterTreeNode * getTree() const
virtual bool train_(MatrixFloat &trainingData)
Float scale(const Float &x, const Float &minSource, const Float &maxSource, const Float &minTarget, const Float &maxTarget, const bool constrain=false)
bool setMinRMSErrorPerNode(const Float minRMSErrorPerNode)
virtual ~ClusterTree(void)
This class implements a Cluster Tree. This can be used to automatically build a cluster model (where ...
virtual bool print() const
std::string getClustererType() const
virtual bool saveModelToFile(std::fstream &file) const
bool scale(const Float minTarget, const Float maxTarget)
virtual bool resize(const unsigned int size)
virtual bool predict(const VectorFloat &x)
bool copyBaseVariables(const Clusterer *clusterer)
bool loadClustererSettingsFromFile(std::fstream &file)
bool set(const UINT nodeSize, const UINT featureIndex, const Float threshold, const UINT clusterLabel)
ClusterTreeNode * deepCopyTree() const
virtual bool save(std::fstream &file) const
UINT predictedClusterLabel
Stores the predicted cluster label from the most recent predict( )
virtual Node * deepCopyNode() const
bool saveClustererSettingsToFile(std::fstream &file) const
ClusterTree & operator=(const ClusterTree &rhs)
UINT numClusters
Number of clusters in the model.
unsigned int getNumRows() const
unsigned int getNumCols() const
virtual bool deepCopyFrom(const Clusterer *cluster)
virtual bool predict_(VectorFloat &inputVector)
virtual bool print() const
virtual bool loadModelFromFile(std::fstream &file)
VectorFloat getRow(const unsigned int r) const
Vector< MinMax > getRanges() const
Float getRandomNumberUniform(Float minRange=0.0, Float maxRange=1.0)
ClusterTree(const UINT numSplittingSteps=100, const UINT minNumSamplesPerNode=5, const UINT maxDepth=10, const bool removeFeaturesAtEachSpilt=false, const UINT trainingMode=BEST_ITERATIVE_SPILT, const bool useScaling=false, const Float minRMSErrorPerNode=0.01)
UINT getPredictedClusterLabel() const
bool push_back(const Vector< T > &sample)
virtual bool load(std::fstream &file)
virtual bool predict(const VectorFloat &x)
Float getMinRMSErrorPerNode() const