21 #define GRT_DLL_EXPORTS
31 classType =
"HierarchicalClustering";
32 clustererType = classType;
33 debugLog.setProceedingText(
"[DEBUG HierarchicalClustering]");
34 errorLog.setProceedingText(
"[ERROR HierarchicalClustering]");
35 trainingLog.setProceedingText(
"[TRAINING HierarchicalClustering]");
36 warningLog.setProceedingText(
"[WARNING HierarchicalClustering]");
40 classType =
"HierarchicalClustering";
41 clustererType = classType;
42 debugLog.setProceedingText(
"[DEBUG HierarchicalClustering]");
43 errorLog.setProceedingText(
"[ERROR HierarchicalClustering]");
44 trainingLog.setProceedingText(
"[TRAINING HierarchicalClustering]");
45 warningLog.setProceedingText(
"[WARNING HierarchicalClustering]");
59 this->clusters = rhs.clusters;
60 this->distanceMatrix = rhs.distanceMatrix;
71 if( clusterer == NULL )
return false;
79 this->clusters = ptr->clusters;
80 this->distanceMatrix = ptr->distanceMatrix;
102 distanceMatrix.
clear();
118 for(UINT i=0; i<M; i++){
119 for(UINT j=0; j<N; j++){
120 data[i][j] = trainingData[i][j];
138 for(UINT i=0; i<M; i++){
139 for(UINT j=0; j<N; j++){
140 data[i][j] = trainingData[i][j];
144 return train( data );
151 distanceMatrix.
clear();
162 distanceMatrix.
resize(M,M);
165 for(UINT i=0; i<M; i++){
166 for(UINT j=0; j<M; j++){
169 distanceMatrix[i][j] = squaredEuclideanDistance(data[i], data[j]);
175 UINT uniqueClusterID = 0;
177 for(UINT i=0; i<M; i++){
178 clusterData[i].uniqueClusterID = uniqueClusterID++;
179 clusterData[i].addSampleToCluster(i);
182 trainingLog <<
"Starting clustering..." << std::endl;
187 newLevel.level = level;
188 for(UINT i=0; i<M; i++){
189 newLevel.clusters.push_back( clusterData[i] );
191 clusters.push_back( newLevel );
195 bool keepClustering =
true;
197 while( keepClustering ){
202 UINT K = (UINT)clusterData.size();
203 for(UINT i=0; i<K; i++){
204 for(UINT j=0; j<K; j++){
206 Float dist = computeClusterDistance( clusterData[i], clusterData[j] );
208 if( dist < minDist ){
213 clusterPairs.clear();
214 clusterPairs.push_back( clusterPair );
222 keepClustering =
false;
223 warningLog <<
"train_(MatrixFloat &data) - Failed to find any cluster at level: " << level << std::endl;
229 newLevel.level = level;
233 newCluster.uniqueClusterID = uniqueClusterID++;
235 const UINT numClusterPairs = clusterPairs.
getSize();
237 for(UINT k=0; k<numClusterPairs; k++){
239 UINT numSamplesInClusterA = clusterData[ clusterPairs[k][0] ].getNumSamplesInCluster();
240 for(UINT i=0; i<numSamplesInClusterA; i++){
241 UINT index = clusterData[ clusterPairs[k][0] ][ i ];
242 newCluster.addSampleToCluster( index );
246 UINT numSamplesInClusterB = clusterData[ clusterPairs[k][1] ].getNumSamplesInCluster();
247 for(UINT i=0; i<numSamplesInClusterB; i++){
248 UINT index = clusterData[ clusterPairs[k][1] ][ i ];
249 newCluster.addSampleToCluster( index );
253 newCluster.clusterVariance = computeClusterVariance( newCluster, data );
256 UINT idA = clusterData[ clusterPairs[k][0] ].getUniqueClusterID();
257 UINT idB = clusterData[ clusterPairs[k][1] ].getUniqueClusterID();
260 while( iter != clusterData.end() ){
261 if( iter->getUniqueClusterID() == idA || iter->getUniqueClusterID() == idB ){
262 iter = clusterData.erase( iter );
263 if( ++numRemoved >= 2 )
break;
269 clusterData.push_back( newCluster );
272 newLevel.clusters.push_back( newCluster );
274 clusters.push_back( newLevel );
282 keepClustering =
false;
285 if( clusterData.size() == 0 ){
286 keepClustering =
false;
289 trainingLog <<
"Cluster level: " << level <<
" Number of clusters: " << clusters.back().getNumClusters() << std::endl;
298 clusterLabels[i] = i+1;
300 clusterLikelihoods.
resize(numClusters,0);
301 clusterDistances.
resize(numClusters,0);
306 bool HierarchicalClustering::printModel(){
308 UINT K = (UINT)clusters.size();
310 std::cout <<
"Hierarchical Clustering Model\n\n";
311 for(UINT k=0; k<K; k++){
315 numSamples += clusters[k][i].getNumSamplesInCluster();
318 std::cout <<
"Level: " << clusters[k].level <<
"\tNumClusters: " << numClusters <<
"\tNumSamples: " << numSamples << std::endl;
320 std::cout <<
"ClusterVariance: " << clusters[k][i].clusterVariance << std::endl;
321 std::cout <<
"Indexs: ";
322 UINT numSamplesInCluster = clusters[k][i].getNumSamplesInCluster();
323 for(UINT j=0; j<numSamplesInCluster; j++){
324 std::cout << clusters[k][i][j] <<
"\t";
326 std::cout << std::endl;
333 Float HierarchicalClustering::squaredEuclideanDistance(
const Float *a,
const Float *b){
335 for(UINT i=0; i<N; i++){
336 dist += SQR( a[i] - b[i] );
341 Float HierarchicalClustering::computeClusterDistance(
const ClusterInfo &clusterA,
const ClusterInfo &clusterB ){
344 const UINT numSamplesA = clusterA.getNumSamplesInCluster();
345 const UINT numSamplesB = clusterB.getNumSamplesInCluster();
348 for(UINT i=0; i<numSamplesA; i++){
349 for(UINT j=0; j<numSamplesB; j++){
350 if( distanceMatrix[ clusterA[i] ][ clusterB[j] ] < minDist ){
351 minDist = distanceMatrix[ clusterA[i] ][ clusterB[j] ];
359 Float HierarchicalClustering::computeClusterVariance(
const ClusterInfo &cluster,
const MatrixFloat &data ){
365 UINT numSamples = cluster.getNumSamplesInCluster();
366 for(UINT j=0; j<N; j++){
367 for(UINT i=0; i<numSamples; i++){
368 UINT index = cluster[i];
369 mean[j] += data[ index ][j];
371 mean[j] /= Float( numSamples );
375 for(UINT j=0; j<N; j++){
376 for(UINT i=0; i<numSamples; i++){
377 std[j] += grt_sqr( data[ cluster[i] ][j] - mean[j] );
379 std[j] = grt_sqrt( std[j] / Float( numSamples-1 ) );
383 for(UINT j=0; j<N; j++){
391 if( !file.is_open() ){
392 errorLog <<
"saveModelToFile(string filename) - Failed to open file!" << std::endl;
396 file <<
"GRT_HIERARCHICAL_CLUSTERING_FILE_V1.0\n";
399 errorLog <<
"saveModelToFile(fstream &file) - Failed to save cluster settings to file!" << std::endl;
404 file <<
"M: " << M << std::endl;
405 file <<
"N: " << N << std::endl;
406 file <<
"NumLevels: " << clusters.
getSize() << std::endl;
408 for(UINT i=0; i<clusters.
getSize(); i++){
409 file <<
"Level: " << clusters[i].getLevel() << std::endl;
410 file <<
"NumClusters: " << clusters[i].getNumClusters() << std::endl;
426 if( word !=
"GRT_HIERARCHICAL_CLUSTERING_FILE_V1.0" ){
431 errorLog <<
"loadModelFromFile(fstream &file) - Failed to load cluster settings from file!" << std::endl;
virtual bool loadModelFromFile(std::fstream &file)
std::string getClustererType() const
virtual bool saveModelToFile(std::fstream &file) const
UINT getNumDimensions() const
virtual bool resize(const unsigned int size)
virtual bool train(ClassificationData trainingData)
virtual ~HierarchicalClustering()
UINT getNumSamples() const
bool copyBaseVariables(const Clusterer *clusterer)
bool loadClustererSettingsFromFile(std::fstream &file)
UINT getNumSamples() const
virtual bool train_(MatrixFloat &trainingData)
bool saveClustererSettingsToFile(std::fstream &file) const
UINT numClusters
Number of clusters in the model.
unsigned int getNumRows() const
UINT getNumDimensions() const
unsigned int getNumCols() const
virtual bool deepCopyFrom(const Clusterer *clusterer)
HierarchicalClustering & operator=(const HierarchicalClustering &rhs)
virtual bool resize(const unsigned int r, const unsigned int c)
This class implements a basic Hierarchical Clustering algorithm.