21 #define GRT_DLL_EXPORTS
27 this->datasetName = datasetName;
28 this->numDimensions = numDimensions;
29 this->infoText = infoText;
31 crossValidationSetup =
false;
32 useExternalRanges =
false;
33 allowNullGestureClass =
true;
35 infoLog.setProceedingText(
"[ClassificationData]");
36 debugLog.setProceedingText(
"[DEBUG ClassificationData]");
37 errorLog.setProceedingText(
"[ERROR ClassificationData]");
38 warningLog.setProceedingText(
"[WARNING ClassificationData]");
50 this->datasetName = rhs.datasetName;
51 this->infoText = rhs.infoText;
52 this->numDimensions = rhs.numDimensions;
53 this->totalNumSamples = rhs.totalNumSamples;
54 this->kFoldValue = rhs.kFoldValue;
55 this->crossValidationSetup = rhs.crossValidationSetup;
56 this->useExternalRanges = rhs.useExternalRanges;
57 this->allowNullGestureClass = rhs.allowNullGestureClass;
58 this->externalRanges = rhs.externalRanges;
59 this->classTracker = rhs.classTracker;
60 this->data = rhs.data;
61 this->crossValidationIndexs = rhs.crossValidationIndexs;
62 this->infoLog = rhs.infoLog;
63 this->debugLog = rhs.debugLog;
64 this->errorLog = rhs.errorLog;
65 this->warningLog = rhs.warningLog;
74 crossValidationSetup =
false;
75 crossValidationIndexs.clear();
80 if( numDimensions > 0 ){
85 this->numDimensions = numDimensions;
88 useExternalRanges =
false;
89 externalRanges.clear();
94 errorLog <<
"setNumDimensions(const UINT numDimensions) - The number of dimensions of the dataset must be greater than zero!" << std::endl;
101 if( datasetName.find(
" ") == std::string::npos ){
102 this->datasetName = datasetName;
106 errorLog <<
"setDatasetName(const std::string datasetName) - The dataset name cannot contain any spaces!" << std::endl;
111 this->infoText = infoText;
117 for(UINT i=0; i<classTracker.
getSize(); i++){
118 if( classTracker[i].classLabel == classLabel ){
119 classTracker[i].className = className;
124 errorLog <<
"setClassNameForCorrespondingClassLabel(const std::string className,const UINT classLabel) - Failed to find class with label: " << classLabel << std::endl;
129 this->allowNullGestureClass = allowNullGestureClass;
135 if( sample.
getSize() != numDimensions ){
136 if( totalNumSamples == 0 ){
137 warningLog <<
"addSample(const UINT classLabel, VectorFloat &sample) - the size of the new sample (" << sample.
getSize() <<
") does not match the number of dimensions of the dataset (" << numDimensions <<
"), setting dimensionality to: " << numDimensions << std::endl;
138 numDimensions = sample.
getSize();
140 errorLog <<
"addSample(const UINT classLabel, VectorFloat &sample) - the size of the new sample (" << sample.
getSize() <<
") does not match the number of dimensions of the dataset (" << numDimensions <<
")" << std::endl;
146 if( classLabel == GRT_DEFAULT_NULL_CLASS_LABEL && !allowNullGestureClass ){
147 errorLog <<
"addSample(const UINT classLabel, VectorFloat &sample) - the class label can not be 0!" << std::endl;
152 crossValidationSetup =
false;
153 crossValidationIndexs.clear();
156 data.push_back( newSample );
159 if( classTracker.
getSize() == 0 ){
161 classTracker.push_back(tracker);
163 bool labelFound =
false;
164 for(UINT i=0; i<classTracker.
getSize(); i++){
165 if( classLabel == classTracker[i].classLabel ){
166 classTracker[i].counter++;
173 classTracker.push_back(tracker);
185 if( totalNumSamples == 0 ){
186 warningLog <<
"removeSample( const UINT index ) - Failed to remove sample, the training dataset is empty!" << std::endl;
190 if( index >= totalNumSamples ){
191 warningLog <<
"removeSample( const UINT index ) - Failed to remove sample, the index is out of bounds! Number of training samples: " << totalNumSamples <<
" index: " << index << std::endl;
196 crossValidationSetup =
false;
197 crossValidationIndexs.clear();
200 UINT classLabel = data[ index ].getClassLabel();
203 data.erase( data.begin()+index );
205 totalNumSamples = data.
getSize();
208 for(
size_t i=0; i<classTracker.
getSize(); i++){
209 if( classTracker[i].classLabel == classLabel ){
210 classTracker[i].counter--;
220 if( totalNumSamples == 0 ){
221 warningLog <<
"removeLastSample() - Failed to remove sample, the training dataset is empty!" << std::endl;
232 if( data.capacity() >= N )
return true;
244 for(
size_t i=0; i<classTracker.
getSize(); i++){
245 if( classTracker[i].classLabel == classLabel ){
246 warningLog <<
"addClass(const UINT classLabel,const std::string className) - Failed to add class, it already exists! Class label: " << classLabel << std::endl;
252 classTracker.push_back(
ClassTracker(classLabel,0,className) );
262 UINT numExamplesRemoved = 0;
263 UINT numExamplesToRemove = 0;
266 crossValidationSetup =
false;
267 crossValidationIndexs.clear();
270 for(UINT i=0; i<classTracker.
getSize(); i++){
271 if( classTracker[i].classLabel == classLabel ){
272 numExamplesToRemove = classTracker[i].counter;
273 classTracker.erase(classTracker.begin()+i);
279 if( numExamplesToRemove > 0 ){
281 while( numExamplesRemoved < numExamplesToRemove ){
282 if( data[i].getClassLabel() == classLabel ){
283 data.erase(data.begin()+i);
284 numExamplesRemoved++;
285 }
else if( ++i == data.
getSize() )
break;
289 totalNumSamples = data.
getSize();
291 return numExamplesRemoved;
295 bool oldClassLabelFound =
false;
296 bool newClassLabelAllReadyExists =
false;
297 UINT indexOfOldClassLabel = 0;
298 UINT indexOfNewClassLabel = 0;
301 for(UINT i=0; i<classTracker.
getSize(); i++){
302 if( classTracker[i].classLabel == oldClassLabel ){
303 indexOfOldClassLabel = i;
304 oldClassLabelFound =
true;
306 if( classTracker[i].classLabel == newClassLabel ){
307 indexOfNewClassLabel = i;
308 newClassLabelAllReadyExists =
true;
313 if( !oldClassLabelFound ){
318 for(UINT i=0; i<totalNumSamples; i++){
319 if( data[i].getClassLabel() == oldClassLabel ){
320 data[i].setClassLabel(newClassLabel);
325 if( newClassLabelAllReadyExists ){
327 classTracker[ indexOfNewClassLabel ].counter += classTracker[ indexOfOldClassLabel ].counter;
330 classTracker.push_back(
ClassTracker(newClassLabel,classTracker[ indexOfOldClassLabel ].counter,classTracker[ indexOfOldClassLabel ].className) );
334 classTracker.erase( classTracker.begin() + indexOfOldClassLabel );
344 if( externalRanges.size() != numDimensions )
return false;
346 this->externalRanges = externalRanges;
347 this->useExternalRanges = useExternalRanges;
353 if( externalRanges.
getSize() == numDimensions ){
354 this->useExternalRanges = useExternalRanges;
362 return scale(ranges,minTarget,maxTarget);
366 if( ranges.
getSize() != numDimensions )
return false;
369 for(UINT i=0; i<totalNumSamples; i++){
370 for(UINT j=0; j<numDimensions; j++){
371 data[i][j] = grt_scale(data[i][j],ranges[j].minValue,ranges[j].maxValue,minTarget,maxTarget);
403 file.open(filename.c_str(), std::ios::out);
405 if( !file.is_open() ){
409 file <<
"GRT_LABELLED_CLASSIFICATION_DATA_FILE_V1.0\n";
410 file <<
"DatasetName: " << datasetName << std::endl;
411 file <<
"InfoText: " << infoText << std::endl;
412 file <<
"NumDimensions: " << numDimensions << std::endl;
413 file <<
"TotalNumExamples: " << totalNumSamples << std::endl;
414 file <<
"NumberOfClasses: " << classTracker.size() << std::endl;
415 file <<
"ClassIDsAndCounters: " << std::endl;
417 for(UINT i=0; i<classTracker.size(); i++){
418 file << classTracker[i].classLabel <<
"\t" << classTracker[i].counter <<
"\t" << classTracker[i].className << std::endl;
421 file <<
"UseExternalRanges: " << useExternalRanges << std::endl;
423 if( useExternalRanges ){
424 for(UINT i=0; i<externalRanges.size(); i++){
425 file << externalRanges[i].minValue <<
"\t" << externalRanges[i].maxValue << std::endl;
431 for(UINT i=0; i<totalNumSamples; i++){
432 file << data[i].getClassLabel();
433 for(UINT j=0; j<numDimensions; j++){
434 file <<
"\t" << data[i][j];
446 file.open(filename.c_str(), std::ios::in);
450 if( !file.is_open() ){
451 errorLog <<
"loadDatasetFromFile(const std::string &filename) - could not open file!" << std::endl;
459 if(word !=
"GRT_LABELLED_CLASSIFICATION_DATA_FILE_V1.0"){
460 errorLog <<
"loadDatasetFromFile(const std::string &filename) - could not find file header!" << std::endl;
467 if(word !=
"DatasetName:"){
468 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find DatasetName header!" << std::endl;
469 errorLog << word << std::endl;
476 if(word !=
"InfoText:"){
477 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find InfoText header!" << std::endl;
485 while( word !=
"NumDimensions:" ){
486 infoText += word +
" ";
491 if( word !=
"NumDimensions:" ){
492 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find NumDimensions header!" << std::endl;
496 file >> numDimensions;
500 if( word !=
"TotalNumTrainingExamples:" && word !=
"TotalNumExamples:" ){
501 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find TotalNumTrainingExamples header!" << std::endl;
505 file >> totalNumSamples;
509 if(word !=
"NumberOfClasses:"){
510 errorLog <<
"loadDatasetFromFile(string filename) - failed to find NumberOfClasses header!" << std::endl;
517 classTracker.
resize(numClasses);
521 if(word !=
"ClassIDsAndCounters:"){
522 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find ClassIDsAndCounters header!" << std::endl;
527 for(UINT i=0; i<classTracker.
getSize(); i++){
528 file >> classTracker[i].classLabel;
529 file >> classTracker[i].counter;
530 file >> classTracker[i].className;
535 if(word !=
"UseExternalRanges:"){
536 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find UseExternalRanges header!" << std::endl;
540 file >> useExternalRanges;
543 if( useExternalRanges ){
544 externalRanges.
resize(numDimensions);
545 for(UINT i=0; i<externalRanges.
getSize(); i++){
546 file >> externalRanges[i].minValue;
547 file >> externalRanges[i].maxValue;
553 if( word !=
"LabelledTrainingData:" && word !=
"Data:"){
554 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find LabelledTrainingData header!" << std::endl;
560 data.
resize( totalNumSamples, tempSample );
562 for(UINT i=0; i<totalNumSamples; i++){
566 for(UINT j=0; j<numDimensions; j++){
569 data[i].set(classLabel, sample);
583 file.open(filename.c_str(), std::ios::out );
585 if( !file.is_open() ){
590 for(UINT i=0; i<totalNumSamples; i++){
591 file << data[i].getClassLabel();
592 for(UINT j=0; j<numDimensions; j++){
593 file <<
"," << data[i][j];
606 datasetName =
"NOT_SET";
619 if( !parser.parseCSVFile(filename,
true) ){
620 errorLog <<
"loadDatasetFromCSVFile(const std::string &filename,const UINT classLabelColumnIndex) - Failed to parse CSV file!" << std::endl;
624 if( !parser.getConsistentColumnSize() ){
625 errorLog <<
"loadDatasetFromCSVFile(const std::string &filename,const UINT classLabelColumnIndexe) - The CSV file does not have a consistent number of columns!" << std::endl;
629 if( parser.getColumnSize() <= 1 ){
630 errorLog <<
"loadDatasetFromCSVFile(const std::string &filename,const UINT classLabelColumnIndex) - The CSV file does not have enough columns! It should contain at least two columns!" << std::endl;
635 numDimensions = parser.getColumnSize()-1;
648 totalNumSamples = parser.getRowSize();
649 for(UINT i=0; i<totalNumSamples; i++){
651 classLabel = grt_from_str< UINT >( parser[i][classLabelColumnIndex] );
654 data[i].setClassLabel( classLabel );
659 while( j != numDimensions ){
660 if( n != classLabelColumnIndex ){
661 data[i][j++] = grt_from_str< Float >( parser[i][n] );
667 if( classTracker.size() == 0 ){
669 classTracker.push_back(tracker);
671 bool labelFound =
false;
672 const size_t numClasses = classTracker.size();
673 for(
size_t i=0; i<numClasses; i++){
674 if( classLabel == classTracker[i].classLabel ){
675 classTracker[i].counter++;
682 classTracker.push_back(tracker);
702 sort(classTracker.begin(),classTracker.end(),ClassTracker::sortByClassLabelAscending);
707 ClassificationData ClassificationData::partition(
const UINT trainingSizePercentage,
const bool useStratifiedSampling){
708 return split(trainingSizePercentage, useStratifiedSampling);
719 crossValidationSetup =
false;
720 crossValidationIndexs.clear();
730 UINT randomIndex = 0;
732 if( useStratifiedSampling ){
737 for(UINT i=0; i<totalNumSamples; i++){
743 UINT numSamples = classData[k].
getSize();
744 for(UINT x=0; x<numSamples; x++){
749 SWAP(classData[k][ x ], classData[k][ randomIndex ]);
754 UINT numTrainingSamples = 0;
755 UINT numTestSamples = 0;
758 UINT numTrainingExamples = (UINT) floor( Float(classData[k].size()) / 100.0 * Float(trainingSizePercentage) );
759 UINT numTestExamples = ((UINT)classData[k].size())-numTrainingExamples;
760 numTrainingSamples += numTrainingExamples;
761 numTestSamples += numTestExamples;
764 trainingSet.
reserve( numTrainingSamples );
765 testSet.
reserve( numTestSamples );
769 UINT numTrainingExamples = (UINT) floor( Float(classData[k].getSize()) / 100.0 * Float(trainingSizePercentage) );
772 for(UINT i=0; i<numTrainingExamples; i++){
773 trainingSet.
addSample( data[ classData[k][i] ].getClassLabel(), data[ classData[k][i] ].getSample() );
775 for(UINT i=numTrainingExamples; i<classData[k].
getSize(); i++){
776 testSet.
addSample( data[ classData[k][i] ].getClassLabel(), data[ classData[k][i] ].getSample() );
781 const UINT numTrainingExamples = (UINT) floor( Float(totalNumSamples) / 100.0 * Float(trainingSizePercentage) );
784 UINT randomIndex = 0;
785 for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
786 std::random_shuffle(indexs.begin(), indexs.end());
789 trainingSet.
reserve( numTrainingExamples );
790 testSet.
reserve( totalNumSamples-numTrainingExamples );
793 for(UINT i=0; i<numTrainingExamples; i++){
794 trainingSet.
addSample( data[ indexs[i] ].getClassLabel(), data[ indexs[i] ].getSample() );
796 for(UINT i=numTrainingExamples; i<totalNumSamples; i++){
797 testSet.
addSample( data[ indexs[i] ].getClassLabel(), data[ indexs[i] ].getSample() );
816 errorLog <<
"merge(const ClassificationData &labelledData) - The number of dimensions in the labelledData (" << labelledData.
getNumDimensions() <<
") does not match the number of dimensions of this dataset (" << numDimensions <<
")" << std::endl;
821 crossValidationSetup =
false;
822 crossValidationIndexs.clear();
829 addSample(labelledData[i].getClassLabel(), labelledData[i].getSample());
834 for(UINT i=0; i<classTracker.size(); i++){
846 crossValidationSetup =
false;
847 crossValidationIndexs.clear();
851 errorLog <<
"spiltDataIntoKFolds(const UINT K,const bool useStratifiedSampling) - K can not be zero!" << std::endl;
856 if( K > totalNumSamples ){
857 errorLog <<
"spiltDataIntoKFolds(const UINT K,const bool useStratifiedSampling) - K can not be larger than the total number of samples in the dataset!" << std::endl;
862 if( useStratifiedSampling ){
863 for(UINT c=0; c<classTracker.size(); c++){
864 if( K > classTracker[c].counter ){
865 errorLog <<
"spiltDataIntoKFolds(const UINT K,const bool useStratifiedSampling) - K can not be larger than the number of samples in any given class!" << std::endl;
876 UINT numSamplesPerFold = (UINT) floor( totalNumSamples/Float(K) );
879 crossValidationIndexs.
resize(K);
883 UINT randomIndex = 0;
885 if( useStratifiedSampling ){
890 for(UINT i=0; i<totalNumSamples; i++){
896 UINT numSamples = (UINT)classData[c].size();
897 for(UINT x=0; x<numSamples; x++){
902 SWAP(classData[c][ x ] , classData[c][ randomIndex ]);
909 iter = classData[ c ].begin();
911 while( iter != classData[c].end() ){
912 crossValidationIndexs[ k ].push_back( *iter );
921 for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
922 for(UINT x=0; x<totalNumSamples; x++){
927 SWAP(indexs[ x ] , indexs[ randomIndex ]);
932 for(UINT i=0; i<totalNumSamples; i++){
934 crossValidationIndexs[ foldIndex ].push_back( indexs[i] );
937 if( ++counter == numSamplesPerFold && foldIndex < K-1 ){
944 crossValidationSetup =
true;
955 if( !crossValidationSetup ){
956 errorLog <<
"getTrainingFoldData(const UINT foldIndex) - Cross Validation has not been setup! You need to call the spiltDataIntoKFolds(UINT K,bool useStratifiedSampling) function first before calling this function!" << std::endl;
960 if( foldIndex >= kFoldValue )
return trainingData;
964 trainingData.
addClass( classTracker[k].classLabel, classTracker[k].className );
969 for(UINT k=0; k<kFoldValue; k++){
970 if( k != foldIndex ){
971 for(UINT i=0; i<crossValidationIndexs[k].
getSize(); i++){
973 index = crossValidationIndexs[k][i];
974 trainingData.
addSample( data[ index ].getClassLabel(), data[ index ].getSample() );
991 if( !crossValidationSetup )
return testData;
993 if( foldIndex >= kFoldValue )
return testData;
997 testData.
addClass( classTracker[k].classLabel, classTracker[k].className );
1000 testData.
reserve( crossValidationIndexs[ foldIndex ].getSize() );
1004 for(UINT i=0; i<crossValidationIndexs[ foldIndex ].
getSize(); i++){
1006 index = crossValidationIndexs[ foldIndex ][i];
1007 testData.
addSample( data[ index ].getClassLabel(), data[ index ].getSample() );
1023 for(UINT i=0; i<classTracker.
getSize(); i++){
1024 if( classTracker[i].classLabel == classLabel ){
1025 classData.
reserve( classTracker[i].counter );
1030 for(UINT i=0; i<totalNumSamples; i++){
1031 if( data[i].getClassLabel() == classLabel ){
1032 classData.
addSample(classLabel, data[i].getSample());
1047 if( numSamples == 0 ) numSamples = totalNumSamples;
1049 newDataset.
reserve( numSamples );
1054 for(UINT k=0; k<K; k++){
1055 newDataset.
addClass( classTracker[k].classLabel );
1058 if( balanceDataset ){
1061 for(UINT i=0; i<totalNumSamples; i++){
1066 UINT numSamplesPerClass = (UINT)floor( numSamples / Float(K) );
1069 UINT classIndex = 0;
1070 UINT classCounter = 0;
1071 UINT randomIndex = 0;
1072 for(UINT i=0; i<numSamples; i++){
1074 randomIndex = classIndexs[ classIndex ][ randomIndex ];
1075 newDataset.
addSample(data[ randomIndex ].getClassLabel(), data[ randomIndex ].getSample());
1076 if( classCounter++ >= numSamplesPerClass && classIndex+1 < K ){
1085 for(UINT i=0; i<numSamples; i++){
1087 newDataset.
addSample( data[randomIndex].getClassLabel(), data[randomIndex].getSample() );
1105 if( totalNumSamples == 0 ){
1106 return regressionData;
1109 const UINT numInputDimensions = numDimensions;
1113 for(UINT i=0; i<totalNumSamples; i++){
1117 UINT classLabel = data[i].getClassLabel();
1119 if( classLabel > 0 ){
1120 targetVector[ classLabel-1 ] = 1;
1122 regressionData.
clear();
1123 return regressionData;
1126 regressionData.
addSample(data[i].getSample(),targetVector);
1129 return regressionData;
1136 if( totalNumSamples == 0 ){
1137 return unlabelledData;
1142 for(UINT i=0; i<totalNumSamples; i++){
1143 unlabelledData.
addSample( data[i].getSample() );
1146 return unlabelledData;
1152 for(UINT i=0; i<classTracker.
getSize(); i++){
1153 if( classTracker[i].classLabel < minClassLabel ){
1154 minClassLabel = classTracker[i].classLabel;
1158 return minClassLabel;
1163 UINT maxClassLabel = 0;
1165 for(UINT i=0; i<classTracker.
getSize(); i++){
1166 if( classTracker[i].classLabel > maxClassLabel ){
1167 maxClassLabel = classTracker[i].classLabel;
1171 return maxClassLabel;
1175 for(UINT k=0; k<classTracker.
getSize(); k++){
1176 if( classTracker[k].classLabel == classLabel ){
1180 warningLog <<
"getClassLabelIndexValue(UINT classLabel) - Failed to find class label: " << classLabel <<
" in class tracker!" << std::endl;
1186 for(UINT i=0; i<classTracker.
getSize(); i++){
1187 if( classTracker[i].classLabel == classLabel ){
1188 return classTracker[i].className;
1192 return "CLASS_LABEL_NOT_FOUND";
1196 std::string statsText;
1197 statsText +=
"DatasetName:\t" + datasetName +
"\n";
1198 statsText +=
"DatasetInfo:\t" + infoText +
"\n";
1199 statsText +=
"Number of Dimensions:\t" +
Util::toString( numDimensions ) +
"\n";
1200 statsText +=
"Number of Samples:\t" +
Util::toString( totalNumSamples ) +
"\n";
1202 statsText +=
"ClassStats:\n";
1205 statsText +=
"ClassLabel:\t" +
Util::toString( classTracker[k].classLabel );
1206 statsText +=
"\tNumber of Samples:\t" +
Util::toString(classTracker[k].counter);
1207 statsText +=
"\tClassName:\t" + classTracker[k].className +
"\n";
1212 statsText +=
"Dataset Ranges:\n";
1213 for(UINT j=0; j<ranges.size(); j++){
1223 if( useExternalRanges )
return externalRanges;
1228 if( totalNumSamples > 0 ){
1229 for(UINT j=0; j<numDimensions; j++){
1230 ranges[j].minValue = data[0][j];
1231 ranges[j].maxValue = data[0][j];
1232 for(UINT i=0; i<totalNumSamples; i++){
1233 if( data[i][j] < ranges[j].minValue ){ ranges[j].minValue = data[i][j]; }
1234 else if( data[i][j] > ranges[j].maxValue ){ ranges[j].maxValue = data[i][j]; }
1247 classLabels[i] = classTracker[i].classLabel;
1259 classSampleCounts[i] = classTracker[i].counter;
1262 return classSampleCounts;
1269 for(UINT j=0; j<numDimensions; j++){
1270 for(UINT i=0; i<totalNumSamples; i++){
1271 mean[j] += data[i][j];
1273 mean[j] /= Float(totalNumSamples);
1284 for(UINT j=0; j<numDimensions; j++){
1285 for(UINT i=0; i<totalNumSamples; i++){
1286 stdDev[j] += SQR(data[i][j]-mean[j]);
1288 stdDev[j] = sqrt( stdDev[j] / Float(totalNumSamples-1) );
1301 for(UINT i=0; i<ranges.size(); i++){
1302 binRange[i] = (ranges[i].maxValue-ranges[i].minValue)/Float(numBins);
1309 for(UINT i=0; i<M; i++){
1310 if( data[i].getClassLabel() == classLabel ){
1311 for(UINT j=0; j<N; j++){
1313 bool binFound =
false;
1314 for(UINT k=0; k<numBins-1; k++){
1315 if( data[i][j] >= ranges[i].minValue + (binRange[j]*k) && data[i][j] >= ranges[i].minValue + (binRange[j]*(k+1)) ){
1321 if( !binFound ) binIndex = numBins-1;
1322 histData[j][binIndex]++;
1328 if( norm == 0 )
return histData;
1333 histData[i][j] /= norm;
1347 for(UINT i=0; i<totalNumSamples; i++){
1349 for(UINT j=0; j<numDimensions; j++){
1350 mean[classIndex][j] += data[i][j];
1352 counter[ classIndex ]++;
1356 for(UINT j=0; j<numDimensions; j++){
1357 mean[k][j] = counter[k] > 0 ? mean[k][j]/counter[k] : 0;
1372 for(UINT i=0; i<totalNumSamples; i++){
1374 for(UINT j=0; j<numDimensions; j++){
1375 stdDev[classIndex][j] += SQR(data[i][j]-mean[classIndex][j]);
1377 counter[ classIndex ]++;
1381 for(UINT j=0; j<numDimensions; j++){
1382 stdDev[k][j] = sqrt( stdDev[k][j] / Float(counter[k]-1) );
1392 MatrixFloat covariance(numDimensions,numDimensions);
1394 for(UINT j=0; j<numDimensions; j++){
1395 for(UINT k=0; k<numDimensions; k++){
1396 for(UINT i=0; i<totalNumSamples; i++){
1397 covariance[j][k] += (data[i][j]-mean[j]) * (data[i][k]-mean[k]) ;
1399 covariance[j][k] /= Float(totalNumSamples-1);
1410 for(UINT k=0; k<K; k++){
1417 VectorFloat ClassificationData::getClassProbabilities()
const {
1422 const UINT K = (UINT)classLabels.size();
1426 for(UINT k=0; k<K; k++){
1427 for(UINT n=0; n<N; n++){
1428 if( classLabels[k] == classTracker[n].classLabel ){
1429 x[k] = classTracker[n].counter;
1430 sum += classTracker[n].counter;
1438 for(UINT k=0; k<K; k++){
1453 for(UINT k=0; k<K; k++){
1454 if( classTracker[k].classLabel == classLabel){
1455 N = classTracker[k].counter;
1462 for(UINT i=0; i<M; i++){
1463 if( data[i].getClassLabel() == classLabel ){
1464 classIndexes[index++] = i;
1468 return classIndexes;
1477 for(UINT i=0; i<M; i++){
1478 for(UINT j=0; j<N; j++){
1479 d[i][j] = data[i][j];
1491 for(UINT i=0; i<M; i++){
1492 for(UINT j=0; j<N; j++){
1493 d[i][j] = data[i][j];
1506 for(UINT k=0; k<numClasses; k++){
1507 for(UINT j=0; j<numDimensions; j++){
1516 for(UINT i=0; i<numSamples; i++){
1523 for(UINT j=0; j<numDimensions; j++){
1528 UINT classLabel = k + 1;
1535 return data.
save( filename );
bool saveDatasetToFile(const std::string &filename) const
bool setDatasetName(std::string datasetName)
bool loadDatasetFromFile(const std::string &filename)
static std::string toString(const int &i)
RegressionData reformatAsRegressionData() const
ClassificationData & operator=(const ClassificationData &rhs)
static bool generateGaussDataset(const std::string filename, const UINT numSamples=10000, const UINT numClasses=10, const UINT numDimensions=3, const Float range=10, const Float sigma=1)
bool addSample(const VectorFloat &sample)
The ClassificationData is the main data structure for recording, labeling, managing, saving, and loading training data for supervised learning problems.
bool relabelAllSamplesWithClassLabel(const UINT oldClassLabel, const UINT newClassLabel)
bool addSample(UINT classLabel, const VectorFloat &sample)
ClassificationData getTestFoldData(const UINT foldIndex) const
bool addClass(const UINT classLabel, const std::string className="NOT_SET")
Vector< ClassTracker > getClassTracker() const
ClassificationData getClassData(const UINT classLabel) const
virtual bool resize(const unsigned int size)
bool setNumDimensions(UINT numDimensions)
UINT eraseAllSamplesWithClassLabel(const UINT classLabel)
MatrixDouble getDataAsMatrixDouble() const
MatrixFloat getClassMean() const
Float getRandomNumberGauss(Float mu=0.0, Float sigma=1.0)
std::string getClassNameForCorrespondingClassLabel(const UINT classLabel) const
bool setClassNameForCorrespondingClassLabel(std::string className, UINT classLabel)
Vector< UINT > getClassLabels() const
bool loadDatasetFromCSVFile(const std::string &filename, const UINT classLabelColumnIndex=0)
bool setAllowNullGestureClass(bool allowNullGestureClass)
UINT getMinimumClassLabel() const
Vector< MatrixFloat > getHistogramData(const UINT numBins) const
UINT removeClass(const UINT classLabel)
ClassificationData(UINT numDimensions=0, std::string datasetName="NOT_SET", std::string infoText="")
bool setAllValues(const T &value)
bool setInputAndTargetDimensions(const UINT numInputDimensions, const UINT numTargetDimensions)
bool setInfoText(std::string infoText)
Vector< UINT > getNumSamplesPerClass() const
MatrixFloat getCovarianceMatrix() const
UnlabelledData reformatAsUnlabelledData() const
bool removeSample(const UINT index)
UINT getNumSamples() const
bool spiltDataIntoKFolds(const UINT K, const bool useStratifiedSampling=false)
bool save(const std::string &filename) const
bool setNumDimensions(const UINT numDimensions)
bool enableExternalRangeScaling(const bool useExternalRanges)
bool setExternalRanges(const Vector< MinMax > &externalRanges, const bool useExternalRanges=false)
bool reserve(const UINT N)
bool saveDatasetToCSVFile(const std::string &filename) const
unsigned int getNumRows() const
UINT getNumDimensions() const
UINT getNumClasses() const
unsigned int getNumCols() const
Vector< MinMax > getRanges() const
Float getRandomNumberUniform(Float minRange=0.0, Float maxRange=1.0)
bool merge(const ClassificationData &data)
ClassificationData split(const UINT splitPercentage, const bool useStratifiedSampling=false)
VectorFloat getStdDev() const
Vector< UINT > getClassDataIndexes(const UINT classLabel) const
int getRandomNumberInt(int minRange, int maxRange)
MatrixFloat getDataAsMatrixFloat() const
static bool stringEndsWith(const std::string &str, const std::string &ending)
UINT getClassLabelIndexValue(const UINT classLabel) const
ClassificationData getBootstrappedDataset(UINT numSamples=0, bool balanceDataset=false) const
MatrixFloat getClassHistogramData(const UINT classLabel, const UINT numBins) const
ClassificationData getTrainingFoldData(const UINT foldIndex) const
UINT getMaximumClassLabel() const
bool scale(const Float minTarget, const Float maxTarget)
bool load(const std::string &filename)
MatrixFloat getClassStdDev() const
bool addSample(const VectorFloat &inputVector, const VectorFloat &targetVector)
std::string getStatsAsString() const
virtual ~ClassificationData()
VectorFloat getMean() const