21 #define GRT_DLL_EXPORTS
29 KNN::KNN(
unsigned int K,
bool useScaling,
bool useNullRejection,Float nullRejectionCoeff,
bool searchForBestKValue,UINT minKSearchValue,UINT maxKSearchValue){
32 this->useScaling = useScaling;
33 this->useNullRejection = useNullRejection;
34 this->nullRejectionCoeff = nullRejectionCoeff;
38 supportsNullRejection =
true;
40 classifierType = classType;
41 classifierMode = STANDARD_CLASSIFIER_MODE;
43 debugLog.setProceedingText(
"[DEBUG KNN]");
44 errorLog.setProceedingText(
"[ERROR KNN]");
45 trainingLog.setProceedingText(
"[TRAINING KNN]");
46 warningLog.setProceedingText(
"[WARNING KNN]");
51 classifierType = classType;
52 classifierMode = STANDARD_CLASSIFIER_MODE;
53 debugLog.setProceedingText(
"[DEBUG KNN]");
54 errorLog.setProceedingText(
"[ERROR KNN]");
55 trainingLog.setProceedingText(
"[TRAINING KNN]");
56 warningLog.setProceedingText(
"[WARNING KNN]");
84 if( classifier == NULL )
return false;
88 KNN *ptr = (
KNN*)classifier;
111 errorLog <<
"train_(ClassificationData &trainingData) - Training data has zero samples!" << std::endl;
119 trainingData.
scale(0, 1);
130 classLabels.
resize( numClasses );
131 for(UINT k=0; k<numClasses; k++){
136 if( !searchForBestKValue ){
137 return train_(trainingData,K);
142 Float bestAccuracy = 0;
150 if( !
train_(trainingSet, k) ){
151 errorLog <<
"Failed to train model for a k value of " << k << std::endl;
160 if( !predict( sample , k) ){
161 errorLog <<
"Failed to predict label for test sample with a k value of " << k << std::endl;
165 if( testSet[i].getClassLabel() == predictedClassLabel ){
170 accuracy = accuracy /Float( testSet.
getNumSamples() ) * 100.0;
173 trainingLog <<
"K:\t" << k <<
"\tAccuracy:\t" << accuracy << std::endl;
175 if( accuracy > bestAccuracy ){
176 bestAccuracy = accuracy;
184 if( bestAccuracy > 0 ){
186 std::sort(trainingAccuracyLog.begin(),trainingAccuracyLog.end(),IndexedDouble::sortIndexedDoubleByValueDescending);
192 tempLog.push_back( trainingAccuracyLog[0] );
195 for(UINT i=1; i<trainingAccuracyLog.size(); i++){
196 if( trainingAccuracyLog[i].value == tempLog[0].value ){
197 tempLog.push_back( trainingAccuracyLog[i] );
202 std::sort(tempLog.begin(),tempLog.end(),IndexedDouble::sortIndexedDoubleByIndexAscending);
204 trainingLog <<
"Best K Value: " << tempLog[0].index <<
"\tAccuracy:\t" << tempLog[0].value << std::endl;
208 return train_(trainingData,tempLog[0].index);
223 if( useNullRejection ){
226 useNullRejection =
false;
227 nullRejectionThresholds.clear();
233 nullRejectionThresholds.
resize( numClasses, 0 );
236 const unsigned int numTrainingExamples = trainingData.
getNumSamples();
238 for(UINT i=0; i<numTrainingExamples; i++){
239 predict( trainingData[i].getSample(), K);
241 UINT classLabelIndex = 0;
242 for(UINT k=0; k<numClasses; k++){
243 if( predictedClassLabel == classLabels[k] ){
249 predictionResults[ i ].index = classLabelIndex;
250 predictionResults[ i ].value = classDistances[ classLabelIndex ];
252 trainingMu[ classLabelIndex ] += predictionResults[ i ].value;
253 counter[ classLabelIndex ]++;
256 for(UINT j=0; j<numClasses; j++){
261 for(UINT i=0; i<numTrainingExamples; i++){
262 trainingSigma[predictionResults[i].index] += SQR(predictionResults[i].value -
trainingMu[predictionResults[i].index]);
265 for(UINT j=0; j<numClasses; j++){
266 Float count = counter[j];
275 bool errorFound =
false;
276 for(UINT j=0; j<numClasses; j++){
278 warningLog <<
"TrainingMu[ " << j <<
" ] is zero for a K value of " << K << std::endl;
281 warningLog <<
"TrainingSigma[ " << j <<
" ] is zero for a K value of " << K << std::endl;
284 errorLog <<
"TrainingMu[ " << j <<
" ] is NAN for a K value of " << K << std::endl;
288 errorLog <<
"TrainingSigma[ " << j <<
" ] is NAN for a K value of " << K << std::endl;
299 for(
unsigned int j=0; j<numClasses; j++){
304 useNullRejection =
true;
308 nullRejectionThresholds.clear();
309 nullRejectionThresholds.
resize( numClasses, 0 );
318 errorLog <<
"predict_(VectorFloat &inputVector) - KNN model has not been trained" << std::endl;
322 if( inputVector.size() != numInputDimensions ){
323 errorLog <<
"predict_(VectorFloat &inputVector) - the size of the input vector " << inputVector.size() <<
" does not match the number of features " << numInputDimensions << std::endl;
329 for(UINT i=0; i<numInputDimensions; i++){
330 inputVector[i] =
scale(inputVector[i], ranges[i].minValue, ranges[i].maxValue, 0, 1);
335 return predict(inputVector,K);
338 bool KNN::predict(
const VectorFloat &inputVector,
const UINT K){
341 errorLog <<
"predict(VectorFloat inputVector,UINT K) - KNN model has not been trained" << std::endl;
345 if( inputVector.size() != numInputDimensions ){
346 errorLog <<
"predict(VectorFloat inputVector) - the size of the input vector " << inputVector.size() <<
" does not match the number of features " << numInputDimensions << std::endl;
351 errorLog <<
"predict(VectorFloat inputVector,UINT K) - K Is Greater Than The Number Of Training Samples" << std::endl;
359 for(UINT i=0; i<M; i++){
361 UINT classLabel = trainingData[i].getClassLabel();
362 VectorFloat trainingSample = trainingData[i].getSample();
365 case EUCLIDEAN_DISTANCE:
366 dist = computeEuclideanDistance(inputVector,trainingSample);
368 case COSINE_DISTANCE:
369 dist = computeCosineDistance(inputVector,trainingSample);
371 case MANHATTAN_DISTANCE:
372 dist = computeManhattanDistance(inputVector, trainingSample);
375 errorLog <<
"predict(vector< Float > inputVector) - unkown distance measure!" << std::endl;
380 if( neighbours.size() < K ){
384 Float maxValue = neighbours[0].value;
386 for(UINT n=1; n<neighbours.size(); n++){
387 if( neighbours[n].value > maxValue ){
388 maxValue = neighbours[n].value;
394 if( dist < maxValue ){
401 if( classLikelihoods.size() != numClasses ) classLikelihoods.
resize(numClasses);
402 if( classDistances.size() != numClasses ) classDistances.
resize(numClasses);
404 std::fill(classLikelihoods.begin(),classLikelihoods.end(),0);
405 std::fill(classDistances.begin(),classDistances.end(),0);
408 for(UINT k=0; k<neighbours.size(); k++){
409 UINT classLabel = neighbours[k].index;
410 if( classLabel == 0 ){
411 errorLog <<
"predict(VectorFloat inputVector) - Class label of training example can not be zero!" << std::endl;
416 UINT classLabelIndex = 0;
417 for(UINT j=0; j<numClasses; j++){
418 if( classLabel == classLabels[j] ){
423 classLikelihoods[ classLabelIndex ] += 1;
424 classDistances[ classLabelIndex ] += neighbours[k].value;
428 Float maxCount = classLikelihoods[0];
430 for(UINT i=1; i<classLikelihoods.size(); i++){
431 if( classLikelihoods[i] > maxCount ){
432 maxCount = classLikelihoods[i];
438 for(UINT i=0; i<numClasses; i++){
439 if( classLikelihoods[i] > 0 ) classDistances[i] /= classLikelihoods[i];
444 for(UINT i=0; i<numClasses; i++){
445 classLikelihoods[i] /= Float( neighbours.size() );
449 maxLikelihood = classLikelihoods[ maxIndex ];
451 if( useNullRejection ){
452 if( classDistances[ maxIndex ] <= nullRejectionThresholds[ maxIndex ] ){
453 predictedClassLabel = classLabels[maxIndex];
455 predictedClassLabel = GRT_DEFAULT_NULL_CLASS_LABEL;
458 predictedClassLabel = classLabels[maxIndex];
470 trainingData.
clear();
481 errorLog <<
"save(fstream &file) - Could not open file to save model!" << std::endl;
486 file <<
"GRT_KNN_MODEL_FILE_V2.0\n";
490 errorLog <<
"save(fstream &file) - Failed to save classifier base settings to file!" << std::endl;
494 file <<
"K: " << K << std::endl;
496 file <<
"SearchForBestKValue: " << searchForBestKValue << std::endl;
497 file <<
"MinKSearchValue: " << minKSearchValue << std::endl;
498 file <<
"MaxKSearchValue: " << maxKSearchValue << std::endl;
501 if( useNullRejection ){
502 file <<
"TrainingMu: ";
507 file <<
"TrainingSigma: ";
513 file <<
"NumTrainingSamples: " << trainingData.
getNumSamples() << std::endl;
514 file <<
"TrainingData: \n";
518 file<< trainingData[i].getClassLabel() <<
"\t";
520 for(UINT j=0; j<numInputDimensions; j++){
521 file << trainingData[i][j] <<
"\t";
534 errorLog <<
"load(fstream &file) - Could not open file to load model!" << std::endl;
543 if( word ==
"GRT_KNN_MODEL_FILE_V1.0" ){
548 if(word !=
"GRT_KNN_MODEL_FILE_V2.0"){
549 errorLog <<
"load(fstream &file) - Could not find Model File Header!" << std::endl;
555 errorLog <<
"load(string filename) - Failed to load base settings from file!" << std::endl;
561 errorLog <<
"load(fstream &file) - Could not find K!" << std::endl;
567 if(word !=
"DistanceMethod:"){
568 errorLog <<
"load(fstream &file) - Could not find DistanceMethod!" << std::endl;
574 if(word !=
"SearchForBestKValue:"){
575 errorLog <<
"load(fstream &file) - Could not find SearchForBestKValue!" << std::endl;
581 if(word !=
"MinKSearchValue:"){
582 errorLog <<
"load(fstream &file) - Could not find MinKSearchValue!" << std::endl;
588 if(word !=
"MaxKSearchValue:"){
589 errorLog <<
"load(fstream &file) - Could not find MaxKSearchValue!" << std::endl;
600 if( useNullRejection ){
602 if(word !=
"TrainingMu:"){
603 errorLog <<
"load(fstream &file) - Could not find TrainingMu!" << std::endl;
608 for(UINT j=0; j<numClasses; j++){
613 if(word !=
"TrainingSigma:"){
614 errorLog <<
"load(fstream &file) - Could not find TrainingSigma!" << std::endl;
619 for(UINT j=0; j<numClasses; j++){
625 if(word !=
"NumTrainingSamples:"){
626 errorLog <<
"load(fstream &file) - Could not find NumTrainingSamples!" << std::endl;
629 unsigned int numTrainingSamples = 0;
630 file >> numTrainingSamples;
633 if(word !=
"TrainingData:"){
634 errorLog <<
"load(fstream &file) - Could not find TrainingData!" << std::endl;
640 unsigned int classLabel = 0;
642 for(UINT i=0; i<numTrainingSamples; i++){
647 for(UINT j=0; j<numInputDimensions; j++){
652 trainingData.
addSample(classLabel, sample);
656 bestDistance = DEFAULT_NULL_DISTANCE_VALUE;
658 classDistances.
resize(numClasses,DEFAULT_NULL_DISTANCE_VALUE);
670 nullRejectionThresholds.
resize(numClasses,0);
676 for(
unsigned int j=0; j<numClasses; j++){
707 if( nullRejectionCoeff > 0 ){
708 this->nullRejectionCoeff = nullRejectionCoeff;
716 if( distanceMethod == EUCLIDEAN_DISTANCE || distanceMethod == COSINE_DISTANCE || distanceMethod == MANHATTAN_DISTANCE ){
725 for(UINT j=0; j<numInputDimensions; j++){
726 dist += SQR( a[j] - b[j] );
738 for(UINT j=0; j<numInputDimensions; j++){
739 dotAB += a[j] * b[j];
744 dist = dotAB / (sqrt(magA) * sqrt(magB));
752 for(UINT j=0; j<numInputDimensions; j++){
753 dist += fabs( a[j] - b[j] );
765 if(word !=
"NumFeatures:"){
766 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find NumFeatures!" << std::endl;
769 file >> numInputDimensions;
772 if(word !=
"NumClasses:"){
773 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find NumClasses!" << std::endl;
780 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find K!" << std::endl;
786 if(word !=
"DistanceMethod:"){
787 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find DistanceMethod!" << std::endl;
793 if(word !=
"SearchForBestKValue:"){
794 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find SearchForBestKValue!" << std::endl;
800 if(word !=
"MinKSearchValue:"){
801 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find MinKSearchValue!" << std::endl;
807 if(word !=
"MaxKSearchValue:"){
808 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find MaxKSearchValue!" << std::endl;
814 if(word !=
"UseScaling:"){
815 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find UseScaling!" << std::endl;
821 if(word !=
"UseNullRejection:"){
822 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find UseNullRejection!" << std::endl;
825 file >> useNullRejection;
828 if(word !=
"NullRejectionCoeff:"){
829 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find NullRejectionCoeff!" << std::endl;
832 file >> nullRejectionCoeff;
837 ranges.
resize( numInputDimensions );
840 if(word !=
"Ranges:"){
841 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find Ranges!" << std::endl;
842 std::cout <<
"Word: " << word << std::endl;
845 for(UINT n=0; n<ranges.size(); n++){
846 file >> ranges[n].minValue;
847 file >> ranges[n].maxValue;
856 if(word !=
"TrainingMu:"){
857 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find TrainingMu!" << std::endl;
862 for(UINT j=0; j<numClasses; j++){
867 if(word !=
"TrainingSigma:"){
868 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find TrainingSigma!" << std::endl;
873 for(UINT j=0; j<numClasses; j++){
878 if(word !=
"NumTrainingSamples:"){
879 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find NumTrainingSamples!" << std::endl;
882 unsigned int numTrainingSamples = 0;
883 file >> numTrainingSamples;
886 if(word !=
"TrainingData:"){
887 errorLog <<
"loadLegacyModelFromFile(fstream &file) - Could not find TrainingData!" << std::endl;
893 unsigned int classLabel = 0;
895 for(UINT i=0; i<numTrainingSamples; i++){
900 for(UINT j=0; j<numInputDimensions; j++){
905 trainingData.
addSample(classLabel, sample);
bool saveBaseSettingsToFile(std::fstream &file) const
VectorFloat trainingSigma
Holds the average max-class distance of the training data for each of the classes
#define DEFAULT_NULL_LIKELIHOOD_VALUE
Float scale(const Float &x, const Float &minSource, const Float &maxSource, const Float &minTarget, const Float &maxTarget, const bool constrain=false)
bool addSample(UINT classLabel, const VectorFloat &sample)
virtual bool save(std::fstream &file) const
virtual bool load(std::fstream &file)
bool searchForBestKValue
The distance method used to compute the distance between the input vector and each training data point
std::string getClassifierType() const
Vector< ClassTracker > getClassTracker() const
This class implements the K-Nearest Neighbor classification algorithm (http://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm). KNN is a simple but powerful classifier, based on finding the closest K training examples in the feature space for the new input vector. The KNN algorithm is amongst the simplest of all machine learning algorithms: an object is classified by a majority vote of its neighbors, with the object being assigned to the class most common amongst its k nearest neighbors (k is a positive integer, typically small). If k = 1, then the object is simply assigned to the class of its nearest neighbor.
virtual bool resize(const unsigned int size)
bool setNumDimensions(UINT numDimensions)
virtual bool train_(ClassificationData &trainingData)
UINT distanceMethod
The number of neighbours to search for
KNN(UINT K=10, bool useScaling=false, bool useNullRejection=false, Float nullRejectionCoeff=10.0, bool searchForBestKValue=false, UINT minKSearchValue=1, UINT maxKSearchValue=10)
ClassificationData trainingData
The maximum K value to end the search at
UINT maxKSearchValue
The minimum K value to start the search from
bool setDistanceMethod(UINT distanceMethod)
virtual bool recomputeNullRejectionThresholds()
KNN & operator=(const KNN &rhs)
bool setMaxKSearchValue(UINT maxKSearchValue)
bool enableBestKValueSearch(bool searchForBestKValue)
UINT getNumSamples() const
virtual bool predict_(VectorFloat &inputVector)
virtual bool deepCopyFrom(const Classifier *classifier)
UINT minKSearchValue
Sets if the best K value should be searched for or if the model should be trained with K ...
static RegisterClassifierModule< KNN > registerModule
Holds the stddev of the max-class distance of the training data for each of the classes
bool copyBaseVariables(const Classifier *classifier)
bool loadBaseSettingsFromFile(std::fstream &file)
VectorFloat trainingMu
Holds the training data used to perform the predictions
UINT getNumDimensions() const
UINT getNumClasses() const
Vector< MinMax > getRanges() const
ClassificationData split(const UINT splitPercentage, const bool useStratifiedSampling=false)
bool setNullRejectionCoeff(Float nullRejectionCoeff)
bool setMinKSearchValue(UINT minKSearchValue)
bool scale(const Float minTarget, const Float maxTarget)
bool loadLegacyModelFromFile(std::fstream &file)