// Primary constructor: stores the supplied training settings on the instance.
// Per the declaration, the defaults are numClusters=10, minNumEpochs=5,
// maxNumEpochs=1000, minChange=1.0e-5 and computeTheta=true.
//
// Parameters:
//   numClusters   - number of clusters in the model.
//                   NOTE(review): the statement that stores this value is not
//                   visible in this listing (source lines are elided) - confirm.
//   minNumEpochs  - the training loop only tests for convergence once its epoch
//                   counter exceeds this value.
//   maxNumEpochs  - the training loop stops once its epoch counter reaches this value.
//   minChange     - when computeTheta is set, training is treated as converged once
//                   the absolute theta delta drops below this value.
//   computeTheta  - whether theta is computed and appended to thetaTracker each epoch;
//                   when false the training loop uses theta = delta = 0.
29 KMeans::KMeans(
const UINT numClusters,
const UINT minNumEpochs,
const UINT maxNumEpochs,
const Float minChange,
const bool computeTheta){
// Copy the constructor arguments into the members of the same name.
32 this->minNumEpochs = minNumEpochs;
33 this->maxNumEpochs = maxNumEpochs;
34 this->minChange = minChange;
35 this->computeTheta = computeTheta;
// Start with a zero convergence-iteration count; the training loop overwrites it.
40 numTrainingIterationsToConverge = 0;
// NOTE(review): lines are elided between the statements above and below; the
// following statements are presumably still part of this constructor - confirm
// against the full file.
// The clusterer type mirrors classType (classType's assignment is not visible here).
44 clustererType = classType;
// Set the proceeding text of each log stream to a KMeans-specific tag.
45 debugLog.setProceedingText(
"[DEBUG KMeans]");
46 errorLog.setProceedingText(
"[ERROR KMeans]");
47 trainingLog.setProceedingText(
"[TRAINING KMeans]");
48 warningLog.setProceedingText(
"[WARNING KMeans]");
54 clustererType = classType;
55 debugLog.setProceedingText(
"[DEBUG KMeans]");
56 errorLog.setProceedingText(
"[ERROR KMeans]");
57 trainingLog.setProceedingText(
"[TRAINING KMeans]");
58 warningLog.setProceedingText(
"[WARNING KMeans]");
64 this->computeTheta = rhs.computeTheta;
65 this->finalTheta = rhs.finalTheta;
66 this->clusters = rhs.clusters;
67 this->assign = rhs.assign;
68 this->count = rhs.count;
69 this->thetaTracker = rhs.thetaTracker;
86 this->computeTheta = rhs.computeTheta;
87 this->finalTheta = rhs.finalTheta;
88 this->clusters = rhs.clusters;
89 this->assign = rhs.assign;
90 this->count = rhs.count;
91 this->thetaTracker = rhs.thetaTracker;
102 if( clusterer == NULL )
return false;
110 this->computeTheta = ptr->computeTheta;
111 this->finalTheta = ptr->finalTheta;
112 this->clusters = ptr->clusters;
113 this->assign = ptr->assign;
114 this->count = ptr->count;
115 this->thetaTracker = ptr->thetaTracker;
126 errorLog <<
"train_(ClassificationData &trainingData) - The training data is empty!" << std::endl;
137 for(UINT i=0; i<M; i++){
138 for(UINT j=0; j<N; j++){
139 data[i][j] = trainingData[i][j];
153 for(UINT i=0; i<M; i++){
154 for(UINT j=0; j<N; j++){
155 data[i][j] = trainingData[i][j];
167 errorLog <<
"train_(MatrixFloat &data) - Failed to train model. NumClusters is zero!" << std::endl;
172 errorLog <<
"train_(MatrixFloat &data) - The number of rows or columns in the data is zero!" << std::endl;
187 std::random_shuffle(randIndexs.begin(), randIndexs.end());
191 for(UINT j=0; j<numInputDimensions; j++){
192 clusters[k][j] = data[ randIndexs[k] ][j];
205 if( inputVector.
getSize() != numInputDimensions ){
210 for(UINT n=0; n<numInputDimensions; n++){
211 inputVector[n] = grt_scale(inputVector[n], ranges[n].minValue, ranges[n].maxValue, 0.0, 1.0);
215 const Float sigma = 1.0;
216 const Float gamma = 1.0 / (2.0*grt_sqr(sigma));
232 for(UINT j=0; j<numInputDimensions; j++){
233 dist += grt_sqr( inputVector[j]-clusters[i][j] );
236 clusterDistances[i] = dist;
237 clusterLikelihoods[i] = exp( - grt_sqr(gamma * dist) );
239 sum += clusterLikelihoods[i];
241 if( dist < bestDistance ){
249 clusterLikelihoods[i] /= sum;
253 maxLikelihood = clusterLikelihoods[ minIndex ];
261 errorLog <<
"trainModel(MatrixFloat &data) - Failed to train model. NumClusters is zero!" << std::endl;
266 errorLog <<
"trainModel(MatrixFloat &data) - Failed to train model. The number of rows in the cluster matrix does not match the number of clusters! You should need to initalize the clusters matrix first before calling this function!" << std::endl;
270 if( clusters.
getNumCols() != numInputDimensions ){
271 errorLog <<
"trainModel(MatrixFloat &data) - Failed to train model. The number of columns in the cluster matrix does not match the number of input dimensions! You should need to initalize the clusters matrix first before calling this function!" << std::endl;
276 UINT currentIter = 0;
278 bool keepTraining =
true;
283 thetaTracker.clear();
285 numTrainingIterationsToConverge = 0;
302 while( keepTraining ){
306 numChanged = estep( data );
316 theta = calculateTheta(data);
317 delta = lastTheta - theta;
319 }
else theta = delta = 0;
322 if( numChanged == 0 && currentIter > minNumEpochs ){ converged =
true; keepTraining =
false; }
323 if( currentIter >= maxNumEpochs ){ keepTraining =
false; }
324 if( fabs( delta ) < minChange && computeTheta && currentIter > minNumEpochs ){ converged =
true; keepTraining =
false; }
325 if( computeTheta ) thetaTracker.push_back( theta );
327 trainingLog <<
"Epoch: " << currentIter <<
"/" << maxNumEpochs;
328 trainingLog <<
" Epoch time: " << (timer.
getMilliSeconds()-startTime)/1000.0 <<
" seconds";
329 trainingLog <<
" Theta: " << theta <<
" Delta: " << delta << std::endl;
331 trainingLog <<
"Model Trained at epoch: " << currentIter <<
" with a theta value of: " << theta << std::endl;
334 numTrainingIterationsToConverge = currentIter;
338 clusterLabels.
resize(numClusters);
340 clusterLabels[i] = i+1;
342 clusterLikelihoods.
resize(numClusters,0);
343 clusterDistances.
resize(numClusters,0);
362 for (n=0; n < numInputDimensions; n++)
363 d += grt_sqr( data[m][n]-clusters[k][n] );
364 if (d <= dmin){ dmin = d; kmin = k; }
366 if ( kmin != assign[m] ){
380 for (n=0;n<numInputDimensions;n++)
385 for(n=0; n < numInputDimensions; n++)
386 clusters[ assign[m] ][n] += data[m][n];
390 Float countNorm = 1.0 / count[k];
391 for (n=0; n < numInputDimensions; n++){
392 clusters[k][n] *= countNorm;
// Computes the theta value that the training loop compares between epochs
// (delta = lastTheta - theta) to decide whether training has converged.
//
// Parameters:
//   data - the sample matrix; data[m][n] is read for each input dimension n.
//
// NOTE(review): the loops that select sample m and cluster k, the reset of
// `sum`, and the return statement are elided in this listing; presumably the
// accumulated `theta` is returned - confirm against the full file.
398 Float KMeans::calculateTheta(
const MatrixFloat &data){
// Accumulate grt_sqr (presumably the square) of the per-dimension difference
// between cluster k and sample m.
406 for(n=0; n < numInputDimensions; n++){
407 sum += grt_sqr(clusters[k][n] - data[m][n]);
// Add grt_sqrt (presumably the square root) of that sum to the running theta.
409 theta += grt_sqrt(sum);
419 if( !file.is_open() ){
420 errorLog <<
"saveModelToFile(fstream &file) - Failed to save model, file is not open!" << std::endl;
424 file <<
"GRT_KMEANS_MODEL_FILE_V1.0\n";
427 errorLog <<
"saveModelToFile(fstream &file) - Failed to save clusterer settings to file!" << std::endl;
432 file <<
"Clusters:\n";
435 for(UINT n=0; n<numInputDimensions; n++){
436 file << clusters[k][n] <<
"\t";
451 errorLog <<
"loadModelFromFile(string filename) - Failed to open file!" << std::endl;
457 if( word !=
"GRT_KMEANS_MODEL_FILE_V1.0" ){
462 errorLog <<
"loadModelFromFile(string filename) - Failed to open file!" << std::endl;
468 if( word !=
"Clusters:" ){
473 clusters.
resize(numClusters,numInputDimensions);
477 for(UINT n=0; n<numInputDimensions; n++){
478 file >> clusters[k][n];
489 numTrainingSamples = 0;
492 thetaTracker.clear();
502 numTrainingSamples = 0;
505 thetaTracker.clear();
// Sets the computeTheta flag. The training loop reads this flag to decide
// whether theta is calculated, appended to thetaTracker, and used in the
// minChange convergence test.
//
// Parameters:
//   computeTheta - the new value of the flag.
//
// NOTE(review): the return statement is elided in this listing, so the
// returned bool is not documented here - confirm against the full file.
513 bool KMeans::setComputeTheta(
const bool computeTheta){
514 this->computeTheta = computeTheta;
522 this->clusters = clusters;
virtual bool saveModelToFile(std::fstream &file) const
KMeans(const UINT numClusters=10, const UINT minNumEpochs=5, const UINT maxNumEpochs=1000, const Float minChange=1.0e-5, const bool computeTheta=true)
virtual bool predict_(VectorFloat &inputVector)
std::string getClustererType() const
bool scale(const Float minTarget, const Float maxTarget)
UINT getNumDimensions() const
virtual bool resize(const unsigned int size)
virtual bool train_(MatrixFloat &data)
UINT getNumSamples() const
bool copyBaseVariables(const Clusterer *clusterer)
signed long getMilliSeconds()
UINT nchg
Number of values changed.
bool loadClustererSettingsFromFile(std::fstream &file)
unsigned int getSize() const
bool setClusters(const MatrixFloat &clusters)
This class implements the KMeans clustering algorithm.
virtual bool loadModelFromFile(std::fstream &file)
UINT getNumSamples() const
UINT predictedClusterLabel
Stores the predicted cluster label from the most recent predict() call.
UINT numTrainingSamples
Number of training examples.
bool saveClustererSettingsToFile(std::fstream &file) const
UINT numClusters
Number of clusters in the model.
unsigned int getNumRows() const
UINT getNumDimensions() const
UINT getNumClasses() const
unsigned int getNumCols() const
bool trainModel(MatrixFloat &data)
Vector< MinMax > getRanges() const
virtual bool deepCopyFrom(const Clusterer *clusterer)
virtual bool resize(const unsigned int r, const unsigned int c)
KMeans & operator=(const KMeans &rhs)