21 #define GRT_DLL_EXPORTS 27 const std::string KMeans::id =
"KMeans";
38 this->minNumEpochs = minNumEpochs;
39 this->maxNumEpochs = maxNumEpochs;
40 this->minChange = minChange;
41 this->computeTheta = computeTheta;
46 numTrainingIterationsToConverge = 0;
56 this->computeTheta = rhs.computeTheta;
57 this->finalTheta = rhs.finalTheta;
58 this->clusters = rhs.clusters;
59 this->assign = rhs.assign;
60 this->count = rhs.count;
61 this->thetaTracker = rhs.thetaTracker;
77 this->computeTheta = rhs.computeTheta;
78 this->finalTheta = rhs.finalTheta;
79 this->clusters = rhs.clusters;
80 this->assign = rhs.assign;
81 this->count = rhs.count;
82 this->thetaTracker = rhs.thetaTracker;
93 if( clusterer == NULL )
return false;
97 const KMeans *ptr =
dynamic_cast<const KMeans*
>(clusterer);
101 this->computeTheta = ptr->computeTheta;
102 this->finalTheta = ptr->finalTheta;
103 this->clusters = ptr->clusters;
104 this->assign = ptr->assign;
105 this->count = ptr->count;
106 this->thetaTracker = ptr->thetaTracker;
117 errorLog <<
"train_(ClassificationData &trainingData) - The training data is empty!" << std::endl;
128 for(UINT i=0; i<M; i++){
129 for(UINT j=0; j<N; j++){
130 data[i][j] = trainingData[i][j];
144 for(UINT i=0; i<M; i++){
145 for(UINT j=0; j<N; j++){
146 data[i][j] = trainingData[i][j];
158 errorLog <<
"train_(MatrixFloat &data) - Failed to train model. NumClusters is zero!" << std::endl;
163 errorLog <<
"train_(MatrixFloat &data) - The number of rows or columns in the data is zero!" << std::endl;
178 std::random_shuffle(randIndexs.begin(), randIndexs.end());
182 for(UINT j=0; j<numInputDimensions; j++){
183 clusters[k][j] = data[ randIndexs[k] ][j];
196 if( inputVector.
getSize() != numInputDimensions ){
201 for(UINT n=0; n<numInputDimensions; n++){
202 inputVector[n] = grt_scale(inputVector[n], ranges[n].minValue, ranges[n].maxValue, 0.0, 1.0);
206 const Float sigma = 1.0;
207 const Float gamma = 1.0 / (2.0*grt_sqr(sigma));
223 for(UINT j=0; j<numInputDimensions; j++){
224 dist += grt_sqr( inputVector[j]-clusters[i][j] );
227 clusterDistances[i] = dist;
228 clusterLikelihoods[i] = exp( - grt_sqr(gamma * dist) );
230 sum += clusterLikelihoods[i];
232 if( dist < bestDistance ){
240 clusterLikelihoods[i] /= sum;
244 maxLikelihood = clusterLikelihoods[ minIndex ];
252 errorLog <<
"trainModel(MatrixFloat &data) - Failed to train model. NumClusters is zero!" << std::endl;
257 errorLog <<
"trainModel(MatrixFloat &data) - Failed to train model. The number of rows in the cluster matrix does not match the number of clusters! You should need to initalize the clusters matrix first before calling this function!" << std::endl;
261 if( clusters.
getNumCols() != numInputDimensions ){
262 errorLog <<
"trainModel(MatrixFloat &data) - Failed to train model. The number of columns in the cluster matrix does not match the number of input dimensions! You should need to initalize the clusters matrix first before calling this function!" << std::endl;
267 UINT currentIter = 0;
269 bool keepTraining =
true;
274 thetaTracker.clear();
276 numTrainingIterationsToConverge = 0;
293 while( keepTraining ){
297 numChanged = estep( data );
307 theta = calculateTheta(data);
308 delta = lastTheta - theta;
310 }
else theta = delta = 0;
313 if( numChanged == 0 && currentIter > minNumEpochs ){ converged =
true; keepTraining =
false; }
314 if( currentIter >= maxNumEpochs ){ keepTraining =
false; }
315 if( fabs( delta ) < minChange && computeTheta && currentIter > minNumEpochs ){ converged =
true; keepTraining =
false; }
316 if( computeTheta ) thetaTracker.push_back( theta );
318 trainingLog <<
"Epoch: " << currentIter <<
"/" << maxNumEpochs;
319 trainingLog <<
" Epoch time: " << (timer.
getMilliSeconds()-startTime)/1000.0 <<
" seconds";
320 trainingLog <<
" Theta: " << theta <<
" Delta: " << delta << std::endl;
322 trainingLog <<
"Model Trained at epoch: " << currentIter <<
" with a theta value of: " << theta << std::endl;
325 numTrainingIterationsToConverge = currentIter;
329 clusterLabels.
resize(numClusters);
331 clusterLabels[i] = i+1;
333 clusterLikelihoods.
resize(numClusters,0);
334 clusterDistances.
resize(numClusters,0);
353 for (n=0; n < numInputDimensions; n++)
354 d += grt_sqr( data[m][n]-clusters[k][n] );
355 if (d <= dmin){ dmin = d; kmin = k; }
357 if ( kmin != assign[m] ){
371 for (n=0;n<numInputDimensions;n++)
376 for(n=0; n < numInputDimensions; n++)
377 clusters[ assign[m] ][n] += data[m][n];
381 Float countNorm = 1.0 / count[k];
382 for (n=0; n < numInputDimensions; n++){
383 clusters[k][n] *= countNorm;
389 Float KMeans::calculateTheta(
const MatrixFloat &data){
397 for(n=0; n < numInputDimensions; n++){
398 sum += grt_sqr(clusters[k][n] - data[m][n]);
400 theta += grt_sqrt(sum);
410 if( !file.is_open() ){
411 errorLog <<
"saveModelToFile(fstream &file) - Failed to save model, file is not open!" << std::endl;
415 file <<
"GRT_KMEANS_MODEL_FILE_V1.0\n";
418 errorLog <<
"saveModelToFile(fstream &file) - Failed to save clusterer settings to file!" << std::endl;
423 file <<
"Clusters:\n";
426 for(UINT n=0; n<numInputDimensions; n++){
427 file << clusters[k][n] <<
"\t";
442 errorLog <<
"loadModelFromFile(string filename) - Failed to open file!" << std::endl;
448 if( word !=
"GRT_KMEANS_MODEL_FILE_V1.0" ){
453 errorLog <<
"loadModelFromFile(string filename) - Failed to open file!" << std::endl;
459 if( word !=
"Clusters:" ){
468 for(UINT n=0; n<numInputDimensions; n++){
469 file >> clusters[k][n];
483 thetaTracker.clear();
496 thetaTracker.clear();
504 bool KMeans::setComputeTheta(
const bool computeTheta){
505 this->computeTheta = computeTheta;
513 this->clusters = clusters;
std::string getId() const
virtual bool saveModelToFile(std::fstream &file) const
KMeans(const UINT numClusters=10, const UINT minNumEpochs=5, const UINT maxNumEpochs=1000, const Float minChange=1.0e-5, const bool computeTheta=true)
virtual bool predict_(VectorFloat &inputVector)
virtual bool reset() override
This file contains the Random class, a useful wrapper for generating cross-platform random functions...
bool scale(const Float minTarget, const Float maxTarget)
virtual bool clear() override
UINT getNumDimensions() const
virtual bool resize(const unsigned int size)
virtual bool train_(MatrixFloat &data)
UINT getNumSamples() const
bool copyBaseVariables(const Clusterer *clusterer)
signed long getMilliSeconds()
UINT nchg
Number of values changed.
bool loadClustererSettingsFromFile(std::fstream &file)
bool setClusters(const MatrixFloat &clusters)
This class implements the KMeans clustering algorithm.
virtual bool loadModelFromFile(std::fstream &file)
UINT getNumSamples() const
UINT predictedClusterLabel
Stores the predicted cluster label from the most recent predict() call.
UINT numTrainingSamples
Number of training examples.
bool saveClustererSettingsToFile(std::fstream &file) const
UINT numClusters
Number of clusters in the model.
unsigned int getNumRows() const
UINT getNumDimensions() const
UINT getNumClasses() const
unsigned int getNumCols() const
bool trainModel(MatrixFloat &data)
Vector< MinMax > getRanges() const
virtual bool deepCopyFrom(const Clusterer *clusterer)
virtual bool resize(const unsigned int r, const unsigned int c)
static std::string getId()
KMeans & operator=(const KMeans &rhs)