#define GRT_DLL_EXPORTS
// ...

const std::string ClusterTree::id = "ClusterTree";
ClusterTree::ClusterTree( const UINT numSplittingSteps,
                          const UINT minNumSamplesPerNode,
                          const UINT maxDepth,
                          const bool removeFeaturesAtEachSplit,
                          const Tree::TrainingMode trainingMode,
                          const bool useScaling,
                          const Float minRMSErrorPerNode ) : Clusterer( ClusterTree::getId() )
{
    this->numSplittingSteps = numSplittingSteps;
    this->minNumSamplesPerNode = minNumSamplesPerNode;
    this->maxDepth = maxDepth;
    this->removeFeaturesAtEachSplit = removeFeaturesAtEachSplit;
    this->trainingMode = trainingMode;
    this->minRMSErrorPerNode = minRMSErrorPerNode;
    // ...
}
// Copying the tree parameters from rhs (copy constructor / assignment operator)
this->numSplittingSteps = rhs.numSplittingSteps;
this->minNumSamplesPerNode = rhs.minNumSamplesPerNode;
this->maxDepth = rhs.maxDepth;
this->removeFeaturesAtEachSplit = rhs.removeFeaturesAtEachSplit;
this->trainingMode = rhs.trainingMode;
this->minRMSErrorPerNode = rhs.minRMSErrorPerNode;
// Inside ClusterTree::deepCopyFrom( const Clusterer *clusterer )
if( clusterer == NULL ) return false;
// ...
this->numSplittingSteps = ptr->numSplittingSteps;
this->minNumSamplesPerNode = ptr->minNumSamplesPerNode;
this->maxDepth = ptr->maxDepth;
this->removeFeaturesAtEachSplit = ptr->removeFeaturesAtEachSplit;
this->trainingMode = ptr->trainingMode;
this->minRMSErrorPerNode = ptr->minRMSErrorPerNode;
// Inside ClusterTree::train_( MatrixFloat &trainingData )
const unsigned int M = trainingData.getNumRows();
const unsigned int N = trainingData.getNumCols();

if( M == 0 ){
    errorLog << __GRT_LOG__ << " Training data has zero samples!" << std::endl;
    return false;
}

numInputDimensions = N;
numOutputDimensions = 1;
// ...

// Scale the training data to [0 1] if scaling is enabled
trainingData.scale(0, 1);

// Set up the vector of feature indices that can be used for splitting
for(UINT i=0; i<N; i++){
    // ...
}

// Recursively build the cluster tree
UINT clusterLabel = 0;
// ...
tree = buildTree( trainingData, NULL, features, clusterLabel, nodeID );

if( tree == NULL ){
    errorLog << __GRT_LOG__ << " Failed to build tree!" << std::endl;
    return false;
}

// Assign a label to each cluster and size the output buffers
for(UINT i=0; i<numClusters; i++){ clusterLabels[i] = i+1; }
clusterLikelihoods.resize(numClusters,0);
clusterDistances.resize(numClusters,0);
// Inside ClusterTree::predict_( VectorFloat &inputVector )
if( !trained ){
    errorLog << __GRT_LOG__ << " Model Not Trained!" << std::endl;
    return false;
}

if( tree == NULL ){
    errorLog << __GRT_LOG__ << " Tree pointer is null!" << std::endl;
    return false;
}

if( inputVector.getSize() != numInputDimensions ){
    errorLog << __GRT_LOG__ << " The size of the input Vector (" << inputVector.getSize() << ") does not match the num features in the model (" << numInputDimensions << ")" << std::endl;
    return false;
}

// Scale the input if scaling is enabled
if( useScaling ){
    for(UINT n=0; n<numInputDimensions; n++){
        inputVector[n] = scale(inputVector[n], ranges[n].minValue, ranges[n].maxValue, 0, 1);
    }
}

if( !tree->predict_( inputVector, clusterLabel ) ){
    errorLog << __GRT_LOG__ << " Failed to predict!" << std::endl;
    return false;
}
// Inside ClusterTree::print()
return tree->print();
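Putting the train_ and predict_ paths above together, typical usage looks like the following minimal sketch. The umbrella header path, the synthetic two-blob data, and the specific values are assumptions for illustration only; the ClusterTree calls themselves match the code above.

#include <GRT/GRT.h> // assumed install path for the GRT umbrella header
#include <iostream>
using namespace GRT;

int main(){

    // Two well-separated 2-D blobs so the tree has something to cluster (illustrative data only)
    MatrixFloat trainingData( 100, 2 );
    Random random;
    for(UINT i=0; i<100; i++){
        const Float offset = i < 50 ? 0.0 : 10.0;
        trainingData[i][0] = offset + random.getRandomNumberUniform(0.0,1.0);
        trainingData[i][1] = offset + random.getRandomNumberUniform(0.0,1.0);
    }

    // Train a ClusterTree with the default parameters shown in the constructor above
    ClusterTree ctree;
    if( !ctree.train_( trainingData ) ){
        std::cout << "Failed to train the ClusterTree!" << std::endl;
        return 1;
    }

    // Predict the cluster label for a new sample
    VectorFloat sample(2);
    sample[0] = 0.5;
    sample[1] = 0.5;
    if( ctree.predict_( sample ) ){
        std::cout << "Predicted cluster label: " << ctree.getPredictedClusterLabel() << std::endl;
    }

    return 0;
}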
// Inside ClusterTree::saveModelToFile( std::fstream &file )
if( !file.is_open() )
{
    errorLog << "saveModelToFile(fstream &file) - The file is not open!" << std::endl;
    return false;
}

// Write the model file header
file << "GRT_CLUSTER_TREE_MODEL_FILE_V1.0" << std::endl;

// Write the base clusterer settings
if( !saveClustererSettingsToFile( file ) ){
    errorLog << "saveModelToFile(fstream &file) - Failed to save clusterer settings to file!" << std::endl;
    return false;
}

// Write the tree parameters
file << "NumSplittingSteps: " << numSplittingSteps << std::endl;
file << "MinNumSamplesPerNode: " << minNumSamplesPerNode << std::endl;
file << "MaxDepth: " << maxDepth << std::endl;
file << "RemoveFeaturesAtEachSpilt: " << removeFeaturesAtEachSplit << std::endl; // "Spilt" is the legacy token expected by the loader below
file << "TrainingMode: " << trainingMode << std::endl;
file << "MinRMSErrorPerNode: " << minRMSErrorPerNode << std::endl;
file << "TreeBuilt: " << (tree != NULL ? 1 : 0) << std::endl;

// Write the tree itself, if one has been built
if( tree != NULL ){
    if( !tree->save( file ) ){
        errorLog << "saveModelToFile(fstream &file) - Failed to save tree to file!" << std::endl;
        return false;
    }
}
// Inside ClusterTree::loadModelFromFile( std::fstream &file )
if( !file.is_open() ){
    errorLog << "loadModelFromFile(string filename) - Could not open file to load model" << std::endl;
    return false;
}

std::string word;
file >> word;

// Find the file header
if( word != "GRT_CLUSTER_TREE_MODEL_FILE_V1.0" ){
    errorLog << "loadModelFromFile(string filename) - Could not find Model File Header" << std::endl;
    return false;
}

// Load the base clusterer settings
if( !loadClustererSettingsFromFile( file ) ){
    errorLog << "loadModelFromFile(string filename) - Failed to load base settings from file!" << std::endl;
    return false;
}

file >> word;
if( word != "NumSplittingSteps:" ){
    errorLog << "loadModelFromFile(string filename) - Could not find the NumSplittingSteps!" << std::endl;
    return false;
}
file >> numSplittingSteps;

file >> word;
if( word != "MinNumSamplesPerNode:" ){
    errorLog << "loadModelFromFile(string filename) - Could not find the MinNumSamplesPerNode!" << std::endl;
    return false;
}
file >> minNumSamplesPerNode;

file >> word;
if( word != "MaxDepth:" ){
    errorLog << "loadModelFromFile(string filename) - Could not find the MaxDepth!" << std::endl;
    return false;
}
file >> maxDepth;

file >> word;
if( word != "RemoveFeaturesAtEachSpilt:" ){ // Matches the legacy "Spilt" token written by saveModelToFile
    errorLog << "loadModelFromFile(string filename) - Could not find the RemoveFeaturesAtEachSpilt!" << std::endl;
    return false;
}
file >> removeFeaturesAtEachSplit;

file >> word;
if( word != "TrainingMode:" ){
    errorLog << "loadModelFromFile(string filename) - Could not find the TrainingMode!" << std::endl;
    return false;
}
UINT tempTrainingMode = 0;
file >> tempTrainingMode;
trainingMode = static_cast< Tree::TrainingMode >( tempTrainingMode );

file >> word;
if( word != "MinRMSErrorPerNode:" ){
    errorLog << "loadModelFromFile(string filename) - Could not find the MinRMSErrorPerNode!" << std::endl;
    return false;
}
file >> minRMSErrorPerNode;

file >> word;
if( word != "TreeBuilt:" ){
    errorLog << "loadModelFromFile(string filename) - Could not find the TreeBuilt!" << std::endl;
    return false;
}
// ...

// If a tree was saved, find the tree section and load it
file >> word;
if( word != "Tree:" ){
    errorLog << "loadModelFromFile(string filename) - Could not find the Tree!" << std::endl;
    return false;
}

// Create a new tree node to load the model into
tree = new ClusterTreeNode;
if( tree == NULL ){
    errorLog << "loadModelFromFile(fstream &file) - Failed to create new ClusterTreeNode!" << std::endl;
    return false;
}

tree->setParent( NULL );
if( !tree->load( file ) ){
    errorLog << "loadModelFromFile(fstream &file) - Failed to load tree from file!" << std::endl;
    return false;
}

// Recreate the cluster labels and size the output buffers
for(UINT i=0; i<numClusters; i++){ clusterLabels[i] = i+1; }
clusterLikelihoods.resize(numClusters,0);
clusterDistances.resize(numClusters,0);
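The save and load paths above can be exercised directly through the fstream overloads. A minimal sketch, assuming ctree is an already-trained ClusterTree and that the file name "ClusterTreeModel.grt" is purely illustrative:

#include <fstream>
#include <iostream>

// Persist the trained model
std::fstream file;
file.open( "ClusterTreeModel.grt", std::ios::out );
if( !ctree.saveModelToFile( file ) ){
    std::cout << "Failed to save the model!" << std::endl;
}
file.close();

// Load it back into a fresh instance
ClusterTree loadedTree;
file.open( "ClusterTreeModel.grt", std::ios::in );
if( !loadedTree.loadModelFromFile( file ) ){
    std::cout << "Failed to load the model!" << std::endl;
}
file.close();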
// Simple getters for the tree parameters
Float ClusterTree::getMinRMSErrorPerNode() const{ return minRMSErrorPerNode; }
UINT ClusterTree::getNumSplittingSteps() const{ return numSplittingSteps; }
UINT ClusterTree::getMinNumSamplesPerNode() const{ return minNumSamplesPerNode; }
bool ClusterTree::getRemoveFeaturesAtEachSplit() const{ return removeFeaturesAtEachSplit; }
// Setters for the tree parameters; each validates its argument and warns on an invalid value
if( trainingMode >= Tree::BEST_ITERATIVE_SPILT && trainingMode < Tree::NUM_TRAINING_MODES ) this->trainingMode = trainingMode;
else warningLog << "Unknown trainingMode: " << trainingMode << std::endl;

if( numSplittingSteps > 0 ) this->numSplittingSteps = numSplittingSteps;
else warningLog << "setNumSplittingSteps(const UINT numSplittingSteps) - The number of splitting steps must be greater than zero!" << std::endl;

if( minNumSamplesPerNode > 0 ) this->minNumSamplesPerNode = minNumSamplesPerNode;
else warningLog << "setMinNumSamplesPerNode(const UINT minNumSamplesPerNode) - The minimum number of samples per node must be greater than zero!" << std::endl;

if( maxDepth > 0 ) this->maxDepth = maxDepth;
else warningLog << "setMaxDepth(const UINT maxDepth) - The maximum depth must be greater than zero!" << std::endl;

// setRemoveFeaturesAtEachSplit and setMinRMSErrorPerNode accept any value
this->removeFeaturesAtEachSplit = removeFeaturesAtEachSplit;
this->minRMSErrorPerNode = minRMSErrorPerNode;
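For completeness, these setters are typically called before training. A minimal configuration sketch (the specific values are illustrative, not recommendations):

ClusterTree ctree;
ctree.setTrainingMode( Tree::BEST_RANDOM_SPLIT );
ctree.setNumSplittingSteps( 50 );
ctree.setMinNumSamplesPerNode( 10 );
ctree.setMaxDepth( 8 );
ctree.setRemoveFeaturesAtEachSplit( true );
ctree.setMinRMSErrorPerNode( 0.05 );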
// Inside ClusterTree::buildTree( ... ), called recursively from train_
node->initNode( parent, depth, nodeID );

// If there are no features left, too few samples, or the max depth has been reached, then this is a leaf node
if( features.getSize() == 0 || M < minNumSamplesPerNode || depth >= maxDepth ){
    // ...
    node->setIsLeafNode( true );
    // ...
    node->set( M, 0, 0, clusterLabel );
    Clusterer::trainingLog << "Reached leaf node. Depth: " << depth << " NumSamples: " << M << std::endl;
    return node;
}

// Find the best split across the remaining features
UINT featureIndex = 0;
// ...
if( !computeBestSplit( trainingData, features, featureIndex, threshold, minError ) ){
    // ...
    return NULL;
}

Clusterer::trainingLog << "Depth: " << depth << " FeatureIndex: " << featureIndex << " Threshold: " << threshold << " MinError: " << minError << std::endl;

// If the best split error is already below the minimum RMS error per node, then this is also a leaf node
if( minError <= minRMSErrorPerNode ){
    // ...
    node->setIsLeafNode( true );
    // ...
    node->set( M, featureIndex, threshold, clusterLabel );
    Clusterer::trainingLog << "Reached leaf node. Depth: " << depth << " NumSamples: " << M << std::endl;
    return node;
}

// Otherwise store the split in this node and keep growing the tree
node->set( M, featureIndex, threshold, 0 );

// Optionally remove the chosen feature so it cannot be reused further down this branch
if( removeFeaturesAtEachSplit ){
    for(UINT i=0; i<features.getSize(); i++){
        if( features[i] == featureIndex ){
            features.erase( features.begin()+i );
            break;
        }
    }
}

// Split the samples into the left and right subsets, then recurse
for(UINT i=0; i<M; i++){
    // ... (each sample is routed to lhs or rhs by comparing featureIndex against threshold)
}
node->setLeftChild( buildTree( lhs, node, features, clusterLabel, nodeID ) );
node->setRightChild( buildTree( rhs, node, features, clusterLabel, nodeID ) );
bool ClusterTree::computeBestSplit( const MatrixFloat &trainingData, const Vector< UINT > &features, UINT &featureIndex, Float &threshold, Float &minError ){

    switch( trainingMode ){
        case Tree::BEST_ITERATIVE_SPILT: // "SPILT" is the spelling used by the Tree::TrainingMode enum
            return computeBestSplitBestIterativeSplit( trainingData, features, featureIndex, threshold, minError );
        case Tree::BEST_RANDOM_SPLIT:
            return computeBestSplitBestRandomSplit( trainingData, features, featureIndex, threshold, minError );
        default:
            errorLog << "Unknown trainingMode!" << std::endl;
            break;
    }

    return false;
}
bool ClusterTree::computeBestSplitBestIterativeSplit( const MatrixFloat &trainingData, const Vector< UINT > &features, UINT &featureIndex, Float &threshold, Float &minError ){

    const UINT M = trainingData.getNumRows();
    const UINT N = (UINT)features.size();

    if( N == 0 ) return false;

    UINT bestFeatureIndex = 0;
    Float bestThreshold = 0;
    // ...

    // For each feature, sweep the threshold from the feature's min value to its max value in fixed steps
    for(UINT n=0; n<N; n++){
        minRange = ranges[n].minValue;
        maxRange = ranges[n].maxValue;
        step = (maxRange-minRange)/Float(numSplittingSteps);
        threshold = minRange;
        featureIndex = features[n];

        while( threshold <= maxRange ){
            // Reset the accumulators for the two groups formed by this candidate threshold
            groupCounter[0] = groupCounter[1] = 0;
            groupMean[0] = groupMean[1] = 0;
            groupMSE[0] = groupMSE[1] = 0;

            // Assign each sample to a group and accumulate the group means
            for(UINT i=0; i<M; i++){
                groupID = trainingData[i][featureIndex] >= threshold ? 1 : 0;
                groupIndex[i] = groupID;
                groupMean[ groupID ] += trainingData[i][featureIndex];
                groupCounter[ groupID ]++;
            }
            groupMean[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
            groupMean[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);

            // Compute the mean squared error within each group
            for(UINT i=0; i<M; i++){
                groupMSE[ groupIndex[i] ] += grt_sqr( groupMean[ groupIndex[i] ] - trainingData[i][featureIndex] );
            }
            groupMSE[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
            groupMSE[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);

            error = grt_sqrt( groupMSE[0] + groupMSE[1] );

            // Keep the split with the smallest error found so far
            if( error < minError ){
                minError = error;
                bestThreshold = threshold;
                bestFeatureIndex = featureIndex;
            }

            // Update the threshold for the next step
            threshold += step;
        }
    }

    // Return the best split found
    featureIndex = bestFeatureIndex;
    threshold = bestThreshold;

    return true;
}
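Concretely, each candidate split sends a sample to group 1 when its value in the chosen feature is greater than or equal to the threshold and to group 0 otherwise, and the split is scored as error = sqrt( MSE_0 + MSE_1 ), where MSE_g is the mean squared deviation of group g's feature values from that group's mean. As a small worked example (illustrative numbers, not from the source): for feature values {1, 2, 9, 10}, a threshold of 5 gives groups {1, 2} and {9, 10} with means 1.5 and 9.5, so MSE_0 = MSE_1 = 0.25 and error = sqrt(0.5) ≈ 0.71, whereas a threshold of 2 gives groups {1} and {2, 9, 10} with MSE_0 = 0 and MSE_1 ≈ 12.67, so error ≈ 3.56. The sweep therefore prefers the threshold of 5, which separates the two natural groups.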
bool ClusterTree::computeBestSplitBestRandomSplit( const MatrixFloat &trainingData, const Vector< UINT > &features, UINT &featureIndex, Float &threshold, Float &minError ){

    const UINT M = trainingData.getNumRows();
    const UINT N = (UINT)features.size();

    debugLog << "computeBestSplitBestRandomSplit() M: " << M << std::endl;

    if( N == 0 ) return false;

    UINT bestFeatureIndex = 0;
    Float bestThreshold = 0;
    // ...

    // For each feature, evaluate numSplittingSteps candidate thresholds chosen within the feature's range
    for(UINT n=0; n<N; n++){
        featureIndex = features[n];

        for(UINT m=0; m<numSplittingSteps; m++){
            // ... (a candidate threshold is drawn within the feature's range)

            // Reset the accumulators for the two groups formed by this candidate threshold
            groupCounter[0] = groupCounter[1] = 0;
            groupMean[0] = groupMean[1] = 0;
            groupMSE[0] = groupMSE[1] = 0;

            // Assign each sample to a group and accumulate the group means
            for(UINT i=0; i<M; i++){
                groupID = trainingData[i][featureIndex] >= threshold ? 1 : 0;
                groupIndex[i] = groupID;
                groupMean[ groupID ] += trainingData[i][featureIndex];
                groupCounter[ groupID ]++;
            }
            groupMean[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
            groupMean[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);

            // Compute the mean squared error within each group
            for(UINT i=0; i<M; i++){
                groupMSE[ groupIndex[i] ] += MLBase::SQR( groupMean[ groupIndex[i] ] - trainingData[i][featureIndex] );
            }
            groupMSE[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
            groupMSE[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);

            error = sqrt( groupMSE[0] + groupMSE[1] );

            // Keep the split with the smallest error found so far
            if( error < minError ){
                minError = error;
                bestThreshold = threshold;
                bestFeatureIndex = featureIndex;
            }
        }
    }

    // Return the best split found
    featureIndex = bestFeatureIndex;
    threshold = bestThreshold;

    return true;
}