28 KNN::KNN(
unsigned int K,
bool useScaling,
bool useNullRejection,Float nullRejectionCoeff,
bool searchForBestKValue,UINT minKSearchValue,UINT maxKSearchValue){
31 this->useScaling = useScaling;
32 this->useNullRejection = useNullRejection;
33 this->nullRejectionCoeff = nullRejectionCoeff;
37 supportsNullRejection =
true;
39 classifierType = classType;
40 classifierMode = STANDARD_CLASSIFIER_MODE;
42 debugLog.setProceedingText(
"[DEBUG KNN]");
43 errorLog.setProceedingText(
"[ERROR KNN]");
44 trainingLog.setProceedingText(
"[TRAINING KNN]");
45 warningLog.setProceedingText(
"[WARNING KNN]");
50 classifierType = classType;
51 classifierMode = STANDARD_CLASSIFIER_MODE;
52 debugLog.setProceedingText(
"[DEBUG KNN]");
53 errorLog.setProceedingText(
"[ERROR KNN]");
54 trainingLog.setProceedingText(
"[TRAINING KNN]");
55 warningLog.setProceedingText(
"[WARNING KNN]");
83 if( classifier == NULL )
return false;
87 KNN *ptr = (
KNN*)classifier;
110 errorLog <<
"train_(ClassificationData &trainingData) - Training data has zero samples!" << std::endl;
118 trainingData.
scale(0, 1);
129 classLabels.
resize( numClasses );
130 for(UINT k=0; k<numClasses; k++){
135 if( !searchForBestKValue ){
136 return train_(trainingData,K);
141 Float bestAccuracy = 0;
149 if( !
train_(trainingSet, k) ){
150 errorLog <<
"Failed to train model for a k value of " << k << std::endl;
159 if( !predict( sample , k) ){
160 errorLog <<
"Failed to predict label for test sample with a k value of " << k << std::endl;
164 if( testSet[i].getClassLabel() == predictedClassLabel ){
169 accuracy = accuracy /Float( testSet.
getNumSamples() ) * 100.0;
172 trainingLog <<
"K:\t" << k <<
"\tAccuracy:\t" << accuracy << std::endl;
174 if( accuracy > bestAccuracy ){
175 bestAccuracy = accuracy;
183 if( bestAccuracy > 0 ){
185 std::sort(trainingAccuracyLog.begin(),trainingAccuracyLog.end(),IndexedDouble::sortIndexedDoubleByValueDescending);
191 tempLog.push_back( trainingAccuracyLog[0] );
194 for(UINT i=1; i<trainingAccuracyLog.size(); i++){
195 if( trainingAccuracyLog[i].value == tempLog[0].value ){
196 tempLog.push_back( trainingAccuracyLog[i] );
201 std::sort(tempLog.begin(),tempLog.end(),IndexedDouble::sortIndexedDoubleByIndexAscending);
203 trainingLog <<
"Best K Value: " << tempLog[0].index <<
"\tAccuracy:\t" << tempLog[0].value << std::endl;
207 return train_(trainingData,tempLog[0].index);
222 if( useNullRejection ){
225 useNullRejection =
false;
226 nullRejectionThresholds.clear();
232 nullRejectionThresholds.
resize( numClasses, 0 );
235 const unsigned int numTrainingExamples = trainingData.
getNumSamples();
237 for(UINT i=0; i<numTrainingExamples; i++){
238 predict( trainingData[i].getSample(), K);
240 UINT classLabelIndex = 0;
241 for(UINT k=0; k<numClasses; k++){
242 if( predictedClassLabel == classLabels[k] ){
248 predictionResults[ i ].index = classLabelIndex;
249 predictionResults[ i ].value = classDistances[ classLabelIndex ];
251 trainingMu[ classLabelIndex ] += predictionResults[ i ].value;
252 counter[ classLabelIndex ]++;
255 for(UINT j=0; j<numClasses; j++){
260 for(UINT i=0; i<numTrainingExamples; i++){
261 trainingSigma[predictionResults[i].index] += SQR(predictionResults[i].value -
trainingMu[predictionResults[i].index]);
264 for(UINT j=0; j<numClasses; j++){
265 Float count = counter[j];
274 bool errorFound =
false;
275 for(UINT j=0; j<numClasses; j++){
277 warningLog <<
"TrainingMu[ " << j <<
" ] is zero for a K value of " << K << std::endl;
280 warningLog <<
"TrainingSigma[ " << j <<
" ] is zero for a K value of " << K << std::endl;
283 errorLog <<
"TrainingMu[ " << j <<
" ] is NAN for a K value of " << K << std::endl;
287 errorLog <<
"TrainingSigma[ " << j <<
" ] is NAN for a K value of " << K << std::endl;
298 for(
unsigned int j=0; j<numClasses; j++){
303 useNullRejection =
true;
307 nullRejectionThresholds.clear();
308 nullRejectionThresholds.
resize( numClasses, 0 );
317 errorLog <<
"predict_(VectorFloat &inputVector) - KNN model has not been trained" << std::endl;
321 if( inputVector.size() != numInputDimensions ){
322 errorLog <<
"predict_(VectorFloat &inputVector) - the size of the input vector " << inputVector.size() <<
" does not match the number of features " << numInputDimensions << std::endl;
328 for(UINT i=0; i<numInputDimensions; i++){
329 inputVector[i] =
scale(inputVector[i], ranges[i].minValue, ranges[i].maxValue, 0, 1);
334 return predict(inputVector,K);
337 bool KNN::predict(
const VectorFloat &inputVector,
const UINT K){
340 errorLog <<
"predict(VectorFloat inputVector,UINT K) - KNN model has not been trained" << std::endl;
344 if( inputVector.size() != numInputDimensions ){
345 errorLog <<
"predict(VectorFloat inputVector) - the size of the input vector " << inputVector.size() <<
" does not match the number of features " << numInputDimensions << std::endl;
350 errorLog <<
"predict(VectorFloat inputVector,UINT K) - K Is Greater Than The Number Of Training Samples" << std::endl;
358 for(UINT i=0; i<M; i++){
360 UINT classLabel = trainingData[i].getClassLabel();
361 VectorFloat trainingSample = trainingData[i].getSample();
364 case EUCLIDEAN_DISTANCE:
365 dist = computeEuclideanDistance(inputVector,trainingSample);
367 case COSINE_DISTANCE:
368 dist = computeCosineDistance(inputVector,trainingSample);
370 case MANHATTAN_DISTANCE:
371 dist = computeManhattanDistance(inputVector, trainingSample);
374 errorLog <<
"predict(vector< Float > inputVector) - unkown distance measure!" << std::endl;
379 if( neighbours.size() < K ){
383 Float maxValue = neighbours[0].value;
385 for(UINT n=1; n<neighbours.size(); n++){
386 if( neighbours[n].value > maxValue ){
387 maxValue = neighbours[n].value;
393 if( dist < maxValue ){
400 if( classLikelihoods.size() != numClasses ) classLikelihoods.
resize(numClasses);
401 if( classDistances.size() != numClasses ) classDistances.
resize(numClasses);
403 std::fill(classLikelihoods.begin(),classLikelihoods.end(),0);
404 std::fill(classDistances.begin(),classDistances.end(),0);
407 for(UINT k=0; k<neighbours.size(); k++){
408 UINT classLabel = neighbours[k].index;
409 if( classLabel == 0 ){
410 errorLog <<
"predict(VectorFloat inputVector) - Class label of training example can not be zero!" << std::endl;
415 UINT classLabelIndex = 0;
416 for(UINT j=0; j<numClasses; j++){
417 if( classLabel == classLabels[j] ){
422 classLikelihoods[ classLabelIndex ] += 1;
423 classDistances[ classLabelIndex ] += neighbours[k].value;
427 Float maxCount = classLikelihoods[0];
429 for(UINT i=1; i<classLikelihoods.size(); i++){
430 if( classLikelihoods[i] > maxCount ){
431 maxCount = classLikelihoods[i];
437 for(UINT i=0; i<numClasses; i++){
438 if( classLikelihoods[i] > 0 ) classDistances[i] /= classLikelihoods[i];
443 for(UINT i=0; i<numClasses; i++){
444 classLikelihoods[i] /= Float( neighbours.size() );
448 maxLikelihood = classLikelihoods[ maxIndex ];
450 if( useNullRejection ){
451 if( classDistances[ maxIndex ] <= nullRejectionThresholds[ maxIndex ] ){
452 predictedClassLabel = classLabels[maxIndex];
454 predictedClassLabel = GRT_DEFAULT_NULL_CLASS_LABEL;
457 predictedClassLabel = classLabels[maxIndex];
469 trainingData.
clear();
480 errorLog <<
"saveModelToFile(fstream &file) - Could not open file to save model!" << std::endl;
485 file <<
"GRT_KNN_MODEL_FILE_V2.0\n";
489 errorLog <<
"saveModelToFile(fstream &file) - Failed to save classifier base settings to file!" << std::endl;
493 file <<
"K: " << K << std::endl;
495 file <<
"SearchForBestKValue: " << searchForBestKValue << std::endl;
496 file <<
"MinKSearchValue: " << minKSearchValue << std::endl;
497 file <<
"MaxKSearchValue: " << maxKSearchValue << std::endl;
500 if( useNullRejection ){
501 file <<
"TrainingMu: ";
506 file <<
"TrainingSigma: ";
512 file <<
"NumTrainingSamples: " << trainingData.
getNumSamples() << std::endl;
513 file <<
"TrainingData: \n";
517 file<< trainingData[i].getClassLabel() <<
"\t";
519 for(UINT j=0; j<numInputDimensions; j++){
520 file << trainingData[i][j] <<
"\t";
533 errorLog <<
"loadModelFromFile(fstream &file) - Could not open file to load model!" << std::endl;
542 if( word ==
"GRT_KNN_MODEL_FILE_V1.0" ){
547 if(word !=
"GRT_KNN_MODEL_FILE_V2.0"){
548 errorLog <<
"loadModelFromFile(fstream &file) - Could not find Model File Header!" << std::endl;
554 errorLog <<
"loadModelFromFile(string filename) - Failed to load base settings from file!" << std::endl;
560 errorLog <<
"loadModelFromFile(fstream &file) - Could not find K!" << std::endl;
566 if(word !=
"DistanceMethod:"){
567 errorLog <<
"loadModelFromFile(fstream &file) - Could not find DistanceMethod!" << std::endl;
573 if(word !=
"SearchForBestKValue:"){
574 errorLog <<
"loadModelFromFile(fstream &file) - Could not find SearchForBestKValue!" << std::endl;
580 if(word !=
"MinKSearchValue:"){
581 errorLog <<
"loadModelFromFile(fstream &file) - Could not find MinKSearchValue!" << std::endl;
587 if(word !=
"MaxKSearchValue:"){
588 errorLog <<
"loadModelFromFile(fstream &file) - Could not find MaxKSearchValue!" << std::endl;
599 if( useNullRejection ){
601 if(word !=
"TrainingMu:"){
602 errorLog <<
"loadModelFromFile(fstream &file) - Could not find TrainingMu!" << std::endl;
607 for(UINT j=0; j<numClasses; j++){
612 if(word !=
"TrainingSigma:"){
613 errorLog <<
"loadModelFromFile(fstream &file) - Could not find TrainingSigma!" << std::endl;
618 for(UINT j=0; j<numClasses; j++){
624 if(word !=
"NumTrainingSamples:"){
625 errorLog <<
"loadModelFromFile(fstream &file) - Could not find NumTrainingSamples!" << std::endl;
628 unsigned int numTrainingSamples = 0;
629 file >> numTrainingSamples;
632 if(word !=
"TrainingData:"){
633 errorLog <<
"loadModelFromFile(fstream &file) - Could not find TrainingData!" << std::endl;
639 unsigned int classLabel = 0;
641 for(UINT i=0; i<numTrainingSamples; i++){
646 for(UINT j=0; j<numInputDimensions; j++){
651 trainingData.
addSample(classLabel, sample);
655 bestDistance = DEFAULT_NULL_DISTANCE_VALUE;
657 classDistances.
resize(numClasses,DEFAULT_NULL_DISTANCE_VALUE);
669 nullRejectionThresholds.
resize(numClasses,0);
675 for(
unsigned int j=0; j<numClasses; j++){
706 if( nullRejectionCoeff > 0 ){
707 this->nullRejectionCoeff = nullRejectionCoeff;
715 if( distanceMethod == EUCLIDEAN_DISTANCE || distanceMethod == COSINE_DISTANCE || distanceMethod == MANHATTAN_DISTANCE ){
724 for(UINT j=0; j<numInputDimensions; j++){
725 dist += SQR( a[j] - b[j] );
737 for(UINT j=0; j<numInputDimensions; j++){
738 dotAB += a[j] * b[j];
743 dist = dotAB / (sqrt(magA) * sqrt(magB));
751 for(UINT j=0; j<numInputDimensions; j++){
752 dist += fabs( a[j] - b[j] );
764 if(word !=
"NumFeatures:"){
765 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find NumFeatures!" << std::endl;
768 file >> numInputDimensions;
771 if(word !=
"NumClasses:"){
772 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find NumClasses!" << std::endl;
779 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find K!" << std::endl;
785 if(word !=
"DistanceMethod:"){
786 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find DistanceMethod!" << std::endl;
792 if(word !=
"SearchForBestKValue:"){
793 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find SearchForBestKValue!" << std::endl;
799 if(word !=
"MinKSearchValue:"){
800 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find MinKSearchValue!" << std::endl;
806 if(word !=
"MaxKSearchValue:"){
807 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find MaxKSearchValue!" << std::endl;
813 if(word !=
"UseScaling:"){
814 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find UseScaling!" << std::endl;
820 if(word !=
"UseNullRejection:"){
821 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find UseNullRejection!" << std::endl;
824 file >> useNullRejection;
827 if(word !=
"NullRejectionCoeff:"){
828 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find NullRejectionCoeff!" << std::endl;
831 file >> nullRejectionCoeff;
836 ranges.
resize( numInputDimensions );
839 if(word !=
"Ranges:"){
840 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find Ranges!" << std::endl;
841 std::cout <<
"Word: " << word << std::endl;
844 for(UINT n=0; n<ranges.size(); n++){
845 file >> ranges[n].minValue;
846 file >> ranges[n].maxValue;
855 if(word !=
"TrainingMu:"){
856 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find TrainingMu!" << std::endl;
861 for(UINT j=0; j<numClasses; j++){
866 if(word !=
"TrainingSigma:"){
867 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find TrainingSigma!" << std::endl;
872 for(UINT j=0; j<numClasses; j++){
877 if(word !=
"NumTrainingSamples:"){
878 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find NumTrainingSamples!" << std::endl;
881 unsigned int numTrainingSamples = 0;
882 file >> numTrainingSamples;
885 if(word !=
"TrainingData:"){
886 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find TrainingData!" << std::endl;
892 unsigned int classLabel = 0;
894 for(UINT i=0; i<numTrainingSamples; i++){
899 for(UINT j=0; j<numInputDimensions; j++){
904 trainingData.
addSample(classLabel, sample);
bool saveBaseSettingsToFile(std::fstream &file) const
VectorFloat trainingSigma
Holds the average max-class distance of the training data for each of the classes
#define DEFAULT_NULL_LIKELIHOOD_VALUE
Float scale(const Float &x, const Float &minSource, const Float &maxSource, const Float &minTarget, const Float &maxTarget, const bool constrain=false)
bool addSample(UINT classLabel, const VectorFloat &sample)
bool searchForBestKValue
The distance method used to compute the distance between each data point
std::string getClassifierType() const
Vector< ClassTracker > getClassTracker() const
This class implements the K-Nearest Neighbor classification algorithm (http://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm). KNN is a simple but powerful classifier, based on finding the closest K training examples in the feature space for the new input vector. The KNN algorithm is amongst the simplest of all machine learning algorithms: an object is classified by a majority vote of its neighbors, with the object being assigned to the class most common amongst its k nearest neighbors (k is a positive integer, typically small). If k = 1, then the object is simply assigned to the class of its nearest neighbor.
virtual bool resize(const unsigned int size)
bool setNumDimensions(UINT numDimensions)
virtual bool loadModelFromFile(std::fstream &file)
virtual bool train_(ClassificationData &trainingData)
UINT distanceMethod
The number of neighbours to search for
KNN(UINT K=10, bool useScaling=false, bool useNullRejection=false, Float nullRejectionCoeff=10.0, bool searchForBestKValue=false, UINT minKSearchValue=1, UINT maxKSearchValue=10)
ClassificationData trainingData
The maximum K value to end the search at
UINT maxKSearchValue
The minimum K value to start the search from
bool setDistanceMethod(UINT distanceMethod)
virtual bool recomputeNullRejectionThresholds()
KNN & operator=(const KNN &rhs)
bool setMaxKSearchValue(UINT maxKSearchValue)
virtual bool saveModelToFile(std::fstream &file) const
bool enableBestKValueSearch(bool searchForBestKValue)
UINT getNumSamples() const
virtual bool predict_(VectorFloat &inputVector)
virtual bool deepCopyFrom(const Classifier *classifier)
UINT minKSearchValue
Sets if the best K value should be searched for or if the model should be trained with K ...
static RegisterClassifierModule< KNN > registerModule
Holds the stddev of the max-class distance of the training data for each of the classes ...
bool copyBaseVariables(const Classifier *classifier)
bool loadBaseSettingsFromFile(std::fstream &file)
ClassificationData partition(const UINT partitionPercentage, const bool useStratifiedSampling=false)
VectorFloat trainingMu
Holds the trainingData to perform the predictions
UINT getNumDimensions() const
UINT getNumClasses() const
Vector< MinMax > getRanges() const
bool setNullRejectionCoeff(Float nullRejectionCoeff)
bool setMinKSearchValue(UINT minKSearchValue)
bool scale(const Float minTarget, const Float maxTarget)
bool loadLegacyModelFromFile(std::fstream &file)