21 #define GRT_DLL_EXPORTS 27 const std::string HierarchicalClustering::id =
"HierarchicalClustering";
53 this->clusters = rhs.clusters;
54 this->distanceMatrix = rhs.distanceMatrix;
65 if( clusterer == NULL )
return false;
73 this->clusters = ptr->clusters;
74 this->distanceMatrix = ptr->distanceMatrix;
96 distanceMatrix.
clear();
112 for(UINT i=0; i<M; i++){
113 for(UINT j=0; j<N; j++){
114 data[i][j] = trainingData[i][j];
132 for(UINT i=0; i<M; i++){
133 for(UINT j=0; j<N; j++){
134 data[i][j] = trainingData[i][j];
138 return train( data );
145 distanceMatrix.
clear();
156 distanceMatrix.
resize(M,M);
159 for(UINT i=0; i<M; i++){
160 for(UINT j=0; j<M; j++){
163 distanceMatrix[i][j] = squaredEuclideanDistance(data[i], data[j]);
169 UINT uniqueClusterID = 0;
171 for(UINT i=0; i<M; i++){
172 clusterData[i].uniqueClusterID = uniqueClusterID++;
173 clusterData[i].addSampleToCluster(i);
176 trainingLog <<
"Starting clustering..." << std::endl;
181 newLevel.level = level;
182 for(UINT i=0; i<M; i++){
183 newLevel.clusters.push_back( clusterData[i] );
185 clusters.push_back( newLevel );
189 bool keepClustering =
true;
191 while( keepClustering ){
196 UINT K = (UINT)clusterData.size();
197 for(UINT i=0; i<K; i++){
198 for(UINT j=0; j<K; j++){
200 Float dist = computeClusterDistance( clusterData[i], clusterData[j] );
202 if( dist < minDist ){
207 clusterPairs.clear();
208 clusterPairs.push_back( clusterPair );
216 keepClustering =
false;
217 warningLog <<
"train_(MatrixFloat &data) - Failed to find any cluster at level: " << level << std::endl;
223 newLevel.level = level;
227 newCluster.uniqueClusterID = uniqueClusterID++;
229 const UINT numClusterPairs = clusterPairs.
getSize();
231 for(UINT k=0; k<numClusterPairs; k++){
233 UINT numSamplesInClusterA = clusterData[ clusterPairs[k][0] ].getNumSamplesInCluster();
234 for(UINT i=0; i<numSamplesInClusterA; i++){
235 UINT index = clusterData[ clusterPairs[k][0] ][ i ];
236 newCluster.addSampleToCluster( index );
240 UINT numSamplesInClusterB = clusterData[ clusterPairs[k][1] ].getNumSamplesInCluster();
241 for(UINT i=0; i<numSamplesInClusterB; i++){
242 UINT index = clusterData[ clusterPairs[k][1] ][ i ];
243 newCluster.addSampleToCluster( index );
247 newCluster.clusterVariance = computeClusterVariance( newCluster, data );
250 UINT idA = clusterData[ clusterPairs[k][0] ].getUniqueClusterID();
251 UINT idB = clusterData[ clusterPairs[k][1] ].getUniqueClusterID();
254 while( iter != clusterData.end() ){
255 if( iter->getUniqueClusterID() == idA || iter->getUniqueClusterID() == idB ){
256 iter = clusterData.erase( iter );
257 if( ++numRemoved >= 2 )
break;
263 clusterData.push_back( newCluster );
266 newLevel.clusters.push_back( newCluster );
268 clusters.push_back( newLevel );
276 keepClustering =
false;
279 if( clusterData.size() == 0 ){
280 keepClustering =
false;
283 trainingLog <<
"Cluster level: " << level <<
" Number of clusters: " << clusters.back().getNumClusters() << std::endl;
292 clusterLabels[i] = i+1;
294 clusterLikelihoods.
resize(numClusters,0);
295 clusterDistances.
resize(numClusters,0);
300 bool HierarchicalClustering::printModel(){
302 UINT K = (UINT)clusters.size();
304 std::cout <<
"Hierarchical Clustering Model\n\n";
305 for(UINT k=0; k<K; k++){
309 numSamples += clusters[k][i].getNumSamplesInCluster();
312 std::cout <<
"Level: " << clusters[k].level <<
"\tNumClusters: " << numClusters <<
"\tNumSamples: " << numSamples << std::endl;
314 std::cout <<
"ClusterVariance: " << clusters[k][i].clusterVariance << std::endl;
315 std::cout <<
"Indexs: ";
316 UINT numSamplesInCluster = clusters[k][i].getNumSamplesInCluster();
317 for(UINT j=0; j<numSamplesInCluster; j++){
318 std::cout << clusters[k][i][j] <<
"\t";
320 std::cout << std::endl;
327 Float HierarchicalClustering::squaredEuclideanDistance(
const Float *a,
const Float *b){
329 for(UINT i=0; i<N; i++){
330 dist += SQR( a[i] - b[i] );
335 Float HierarchicalClustering::computeClusterDistance(
const ClusterInfo &clusterA,
const ClusterInfo &clusterB ){
338 const UINT numSamplesA = clusterA.getNumSamplesInCluster();
339 const UINT numSamplesB = clusterB.getNumSamplesInCluster();
342 for(UINT i=0; i<numSamplesA; i++){
343 for(UINT j=0; j<numSamplesB; j++){
344 if( distanceMatrix[ clusterA[i] ][ clusterB[j] ] < minDist ){
345 minDist = distanceMatrix[ clusterA[i] ][ clusterB[j] ];
353 Float HierarchicalClustering::computeClusterVariance(
const ClusterInfo &cluster,
const MatrixFloat &data ){
359 UINT numSamples = cluster.getNumSamplesInCluster();
360 for(UINT j=0; j<N; j++){
361 for(UINT i=0; i<numSamples; i++){
362 UINT index = cluster[i];
363 mean[j] += data[ index ][j];
365 mean[j] /= Float( numSamples );
369 for(UINT j=0; j<N; j++){
370 for(UINT i=0; i<numSamples; i++){
371 std[j] += grt_sqr( data[ cluster[i] ][j] - mean[j] );
373 std[j] = grt_sqrt( std[j] / Float( numSamples-1 ) );
377 for(UINT j=0; j<N; j++){
385 if( !file.is_open() ){
386 errorLog <<
"saveModelToFile(string filename) - Failed to open file!" << std::endl;
390 file <<
"GRT_HIERARCHICAL_CLUSTERING_FILE_V1.0\n";
393 errorLog <<
"saveModelToFile(fstream &file) - Failed to save cluster settings to file!" << std::endl;
398 file <<
"M: " << M << std::endl;
399 file <<
"N: " << N << std::endl;
400 file <<
"NumLevels: " << clusters.
getSize() << std::endl;
402 for(UINT i=0; i<clusters.
getSize(); i++){
403 file <<
"Level: " << clusters[i].getLevel() << std::endl;
404 file <<
"NumClusters: " << clusters[i].getNumClusters() << std::endl;
420 if( word !=
"GRT_HIERARCHICAL_CLUSTERING_FILE_V1.0" ){
425 errorLog <<
"loadModelFromFile(fstream &file) - Failed to load cluster settings from file!" << std::endl;
std::string getId() const
virtual bool reset() override
virtual bool loadModelFromFile(std::fstream &file)
virtual bool saveModelToFile(std::fstream &file) const
virtual bool clear() override
UINT getNumDimensions() const
virtual bool resize(const unsigned int size)
virtual bool train(ClassificationData trainingData)
virtual ~HierarchicalClustering()
UINT getNumSamples() const
bool copyBaseVariables(const Clusterer *clusterer)
bool loadClustererSettingsFromFile(std::fstream &file)
UINT getNumSamples() const
virtual bool train_(MatrixFloat &trainingData)
bool saveClustererSettingsToFile(std::fstream &file) const
UINT numClusters
Number of clusters in the model.
unsigned int getNumRows() const
UINT getNumDimensions() const
unsigned int getNumCols() const
virtual bool deepCopyFrom(const Clusterer *clusterer)
HierarchicalClustering & operator=(const HierarchicalClustering &rhs)
static std::string getId()
virtual bool resize(const unsigned int r, const unsigned int c)
This class implements a basic Hierarchical Clustering algorithm.