21 #define GRT_DLL_EXPORTS
30 KMeans::KMeans(
const UINT numClusters,
const UINT minNumEpochs,
const UINT maxNumEpochs,
const Float minChange,
const bool computeTheta){
33 this->minNumEpochs = minNumEpochs;
34 this->maxNumEpochs = maxNumEpochs;
35 this->minChange = minChange;
36 this->computeTheta = computeTheta;
41 numTrainingIterationsToConverge = 0;
45 clustererType = classType;
46 debugLog.setProceedingText(
"[DEBUG KMeans]");
47 errorLog.setProceedingText(
"[ERROR KMeans]");
48 trainingLog.setProceedingText(
"[TRAINING KMeans]");
49 warningLog.setProceedingText(
"[WARNING KMeans]");
55 clustererType = classType;
56 debugLog.setProceedingText(
"[DEBUG KMeans]");
57 errorLog.setProceedingText(
"[ERROR KMeans]");
58 trainingLog.setProceedingText(
"[TRAINING KMeans]");
59 warningLog.setProceedingText(
"[WARNING KMeans]");
65 this->computeTheta = rhs.computeTheta;
66 this->finalTheta = rhs.finalTheta;
67 this->clusters = rhs.clusters;
68 this->assign = rhs.assign;
69 this->count = rhs.count;
70 this->thetaTracker = rhs.thetaTracker;
87 this->computeTheta = rhs.computeTheta;
88 this->finalTheta = rhs.finalTheta;
89 this->clusters = rhs.clusters;
90 this->assign = rhs.assign;
91 this->count = rhs.count;
92 this->thetaTracker = rhs.thetaTracker;
103 if( clusterer == NULL )
return false;
111 this->computeTheta = ptr->computeTheta;
112 this->finalTheta = ptr->finalTheta;
113 this->clusters = ptr->clusters;
114 this->assign = ptr->assign;
115 this->count = ptr->count;
116 this->thetaTracker = ptr->thetaTracker;
127 errorLog <<
"train_(ClassificationData &trainingData) - The training data is empty!" << std::endl;
138 for(UINT i=0; i<M; i++){
139 for(UINT j=0; j<N; j++){
140 data[i][j] = trainingData[i][j];
154 for(UINT i=0; i<M; i++){
155 for(UINT j=0; j<N; j++){
156 data[i][j] = trainingData[i][j];
168 errorLog <<
"train_(MatrixFloat &data) - Failed to train model. NumClusters is zero!" << std::endl;
173 errorLog <<
"train_(MatrixFloat &data) - The number of rows or columns in the data is zero!" << std::endl;
188 std::random_shuffle(randIndexs.begin(), randIndexs.end());
192 for(UINT j=0; j<numInputDimensions; j++){
193 clusters[k][j] = data[ randIndexs[k] ][j];
206 if( inputVector.
getSize() != numInputDimensions ){
211 for(UINT n=0; n<numInputDimensions; n++){
212 inputVector[n] = grt_scale(inputVector[n], ranges[n].minValue, ranges[n].maxValue, 0.0, 1.0);
216 const Float sigma = 1.0;
217 const Float gamma = 1.0 / (2.0*grt_sqr(sigma));
233 for(UINT j=0; j<numInputDimensions; j++){
234 dist += grt_sqr( inputVector[j]-clusters[i][j] );
237 clusterDistances[i] = dist;
238 clusterLikelihoods[i] = exp( - grt_sqr(gamma * dist) );
240 sum += clusterLikelihoods[i];
242 if( dist < bestDistance ){
250 clusterLikelihoods[i] /= sum;
254 maxLikelihood = clusterLikelihoods[ minIndex ];
262 errorLog <<
"trainModel(MatrixFloat &data) - Failed to train model. NumClusters is zero!" << std::endl;
267 errorLog <<
"trainModel(MatrixFloat &data) - Failed to train model. The number of rows in the cluster matrix does not match the number of clusters! You should need to initalize the clusters matrix first before calling this function!" << std::endl;
271 if( clusters.
getNumCols() != numInputDimensions ){
272 errorLog <<
"trainModel(MatrixFloat &data) - Failed to train model. The number of columns in the cluster matrix does not match the number of input dimensions! You should need to initalize the clusters matrix first before calling this function!" << std::endl;
277 UINT currentIter = 0;
279 bool keepTraining =
true;
284 thetaTracker.clear();
286 numTrainingIterationsToConverge = 0;
303 while( keepTraining ){
307 numChanged = estep( data );
317 theta = calculateTheta(data);
318 delta = lastTheta - theta;
320 }
else theta = delta = 0;
323 if( numChanged == 0 && currentIter > minNumEpochs ){ converged =
true; keepTraining =
false; }
324 if( currentIter >= maxNumEpochs ){ keepTraining =
false; }
325 if( fabs( delta ) < minChange && computeTheta && currentIter > minNumEpochs ){ converged =
true; keepTraining =
false; }
326 if( computeTheta ) thetaTracker.push_back( theta );
328 trainingLog <<
"Epoch: " << currentIter <<
"/" << maxNumEpochs;
329 trainingLog <<
" Epoch time: " << (timer.
getMilliSeconds()-startTime)/1000.0 <<
" seconds";
330 trainingLog <<
" Theta: " << theta <<
" Delta: " << delta << std::endl;
332 trainingLog <<
"Model Trained at epoch: " << currentIter <<
" with a theta value of: " << theta << std::endl;
335 numTrainingIterationsToConverge = currentIter;
339 clusterLabels.
resize(numClusters);
341 clusterLabels[i] = i+1;
343 clusterLikelihoods.
resize(numClusters,0);
344 clusterDistances.
resize(numClusters,0);
363 for (n=0; n < numInputDimensions; n++)
364 d += grt_sqr( data[m][n]-clusters[k][n] );
365 if (d <= dmin){ dmin = d; kmin = k; }
367 if ( kmin != assign[m] ){
381 for (n=0;n<numInputDimensions;n++)
386 for(n=0; n < numInputDimensions; n++)
387 clusters[ assign[m] ][n] += data[m][n];
391 Float countNorm = 1.0 / count[k];
392 for (n=0; n < numInputDimensions; n++){
393 clusters[k][n] *= countNorm;
399 Float KMeans::calculateTheta(
const MatrixFloat &data){
407 for(n=0; n < numInputDimensions; n++){
408 sum += grt_sqr(clusters[k][n] - data[m][n]);
410 theta += grt_sqrt(sum);
420 if( !file.is_open() ){
421 errorLog <<
"saveModelToFile(fstream &file) - Failed to save model, file is not open!" << std::endl;
425 file <<
"GRT_KMEANS_MODEL_FILE_V1.0\n";
428 errorLog <<
"saveModelToFile(fstream &file) - Failed to save clusterer settings to file!" << std::endl;
433 file <<
"Clusters:\n";
436 for(UINT n=0; n<numInputDimensions; n++){
437 file << clusters[k][n] <<
"\t";
452 errorLog <<
"loadModelFromFile(string filename) - Failed to open file!" << std::endl;
458 if( word !=
"GRT_KMEANS_MODEL_FILE_V1.0" ){
463 errorLog <<
"loadModelFromFile(string filename) - Failed to open file!" << std::endl;
469 if( word !=
"Clusters:" ){
474 clusters.
resize(numClusters,numInputDimensions);
478 for(UINT n=0; n<numInputDimensions; n++){
479 file >> clusters[k][n];
490 numTrainingSamples = 0;
493 thetaTracker.clear();
503 numTrainingSamples = 0;
506 thetaTracker.clear();
514 bool KMeans::setComputeTheta(
const bool computeTheta){
515 this->computeTheta = computeTheta;
523 this->clusters = clusters;
virtual bool saveModelToFile(std::fstream &file) const
KMeans(const UINT numClusters=10, const UINT minNumEpochs=5, const UINT maxNumEpochs=1000, const Float minChange=1.0e-5, const bool computeTheta=true)
virtual bool predict_(VectorFloat &inputVector)
std::string getClustererType() const
bool scale(const Float minTarget, const Float maxTarget)
UINT getNumDimensions() const
virtual bool resize(const unsigned int size)
virtual bool train_(MatrixFloat &data)
UINT getNumSamples() const
bool copyBaseVariables(const Clusterer *clusterer)
signed long getMilliSeconds()
UINT nchg
Number of values changed.
bool loadClustererSettingsFromFile(std::fstream &file)
bool setClusters(const MatrixFloat &clusters)
This class implements the KMeans clustering algorithm.
virtual bool loadModelFromFile(std::fstream &file)
UINT getNumSamples() const
UINT predictedClusterLabel
Stores the predicted cluster label from the most recent predict() call.
UINT numTrainingSamples
Number of training examples.
bool saveClustererSettingsToFile(std::fstream &file) const
UINT numClusters
Number of clusters in the model.
unsigned int getNumRows() const
UINT getNumDimensions() const
UINT getNumClasses() const
unsigned int getNumCols() const
bool trainModel(MatrixFloat &data)
Vector< MinMax > getRanges() const
virtual bool deepCopyFrom(const Clusterer *clusterer)
virtual bool resize(const unsigned int r, const unsigned int c)
KMeans & operator=(const KMeans &rhs)