GestureRecognitionToolkit  Version: 0.2.5
The Gesture Recognition Toolkit (GRT) is a cross-platform, open-source, c++ machine learning library for real-time gesture recognition.
RegressionData.cpp
1 /*
2 GRT MIT License
3 Copyright (c) <2012> <Nicholas Gillian, Media Lab, MIT>
4 
5 Permission is hereby granted, free of charge, to any person obtaining a copy of this software
6 and associated documentation files (the "Software"), to deal in the Software without restriction,
7 including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
9 subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in all copies or substantial
12 portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
15 LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
16 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
17 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
18 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 */
20 
21 #define GRT_DLL_EXPORTS
22 #include "RegressionData.h"
23 
24 GRT_BEGIN_NAMESPACE
25 
26 RegressionData::RegressionData(const UINT numInputDimensions,const UINT numTargetDimensions,const std::string datasetName,const std::string infoText):totalNumSamples(0){
27  this->numInputDimensions = numInputDimensions;
28  this->numTargetDimensions = numTargetDimensions;
29  this->datasetName = datasetName;
30  this->infoText = infoText;
31  kFoldValue = 0;
32  crossValidationSetup = false;
33  useExternalRanges = false;
34  debugLog.setKey("[DEBUG RegressionData]");
35  errorLog.setKey("[ERROR RegressionData]");
36  warningLog.setKey("[WARNING RegressionData]");
37 }
38 
40  *this = rhs;
41 }
42 
44 
46  if( this != &rhs){
47  this->datasetName = rhs.datasetName;
48  this->infoText = rhs.infoText;
49  this->numInputDimensions = rhs.numInputDimensions;
50  this->numTargetDimensions = rhs.numTargetDimensions;
51  this->totalNumSamples = rhs.totalNumSamples;
52  this->kFoldValue = rhs.kFoldValue;
53  this->crossValidationSetup = rhs.crossValidationSetup;
54  this->useExternalRanges = rhs.useExternalRanges;
55  this->externalInputRanges = rhs.externalInputRanges;
56  this->externalTargetRanges = rhs.externalTargetRanges;
57  this->data = rhs.data;
58  this->crossValidationIndexs = rhs.crossValidationIndexs;
59  this->debugLog = rhs.debugLog;
60  this->errorLog = rhs.errorLog;
61  this->warningLog = rhs.warningLog;
62  }
63  return *this;
64 }
65 
67  totalNumSamples = 0;
68  kFoldValue = 0;
69  crossValidationSetup = false;
70  data.clear();
71  crossValidationIndexs.clear();
72 }
73 
74 bool RegressionData::setInputAndTargetDimensions(const UINT numInputDimensions,const UINT numTargetDimensions){
75  clear();
76  if( numInputDimensions > 0 && numTargetDimensions > 0 ){
77  this->numInputDimensions = numInputDimensions;
78  this->numTargetDimensions = numTargetDimensions;
79 
80  //Clear the external ranges
81  useExternalRanges = false;
82  externalInputRanges.clear();
83  externalTargetRanges.clear();
84  return true;
85  }
86  errorLog << "setInputAndTargetDimensions(UINT numInputDimensions,UINT numTargetDimensions) - The number of input and target dimensions should be greater than zero!" << std::endl;
87  return false;
88 }
89 
90 bool RegressionData::setDatasetName(const std::string &datasetName){
91 
92  //Make sure there are no spaces in the string
93  if( datasetName.find(" ") == std::string::npos ){
94  this->datasetName = datasetName;
95  return true;
96  }
97 
98  errorLog << "setDatasetName(const string &datasetName) - The dataset name cannot contain any spaces!" << std::endl;
99  return false;
100 }
101 
102 bool RegressionData::setInfoText(const std::string &infoText){
103  this->infoText = infoText;
104  return true;
105 }
106 
107 bool RegressionData::addSample(const VectorFloat &inputVector,const VectorFloat &targetVector){
108  if( inputVector.getSize() == numInputDimensions && targetVector.getSize() == numTargetDimensions ){
109  data.push_back( RegressionSample(inputVector,targetVector) );
110  totalNumSamples++;
111 
112  //The dataset has changed so flag that any previous cross validation setup will now not work
113  crossValidationSetup = false;
114  crossValidationIndexs.clear();
115  return true;
116  }
117  errorLog << "addSample(const VectorFloat &inputVector,const VectorFloat &targetVector) - The inputVector size or targetVector size does not match the size of the numInputDimensions or numTargetDimensions" << std::endl;
118  return false;
119 }
120 
122  if( totalNumSamples > 0 ){
123  //Remove the training example from the buffer
124  data.erase(data.end()-1);
125  totalNumSamples = data.getSize();
126 
127  //The dataset has changed so flag that any previous cross validation setup will now not work
128  crossValidationSetup = false;
129  crossValidationIndexs.clear();
130  return true;
131  }
132  warningLog << "removeLastSample() - There are no samples to remove!" << std::endl;
133  return false;
134 }
135 
136 bool RegressionData::reserve(const UINT N){
137 
138  data.reserve( N );
139 
140  if( data.capacity() >= N ) return true;
141 
142  return false;
143 }
144 
145 bool RegressionData::setExternalRanges(const Vector< MinMax > &externalInputRanges,const Vector< MinMax > & externalTargetRanges,const bool useExternalRanges){
146 
147  if( externalInputRanges.getSize() != numInputDimensions ) return false;
148  if( externalTargetRanges.getSize() != numTargetDimensions ) return false;
149 
150  this->externalInputRanges = externalInputRanges;
151  this->externalTargetRanges = externalTargetRanges;
152  this->useExternalRanges = useExternalRanges;
153 
154  return true;
155 }
156 
157 bool RegressionData::enableExternalRangeScaling(const bool useExternalRanges){
158  if( externalInputRanges.getSize() != numInputDimensions && externalTargetRanges.getSize() != numTargetDimensions ){
159  this->useExternalRanges = useExternalRanges;
160  return true;
161  }
162  return false;
163 }
164 
165 bool RegressionData::scale(const Float minTarget,const Float maxTarget){
166  Vector< MinMax > inputRanges = getInputRanges();
167  Vector< MinMax > targetRanges = getTargetRanges();
168  return scale(inputRanges,targetRanges,minTarget,maxTarget);
169 }
170 
171 bool RegressionData::scale(const Vector< MinMax > &inputVectorRanges,const Vector< MinMax > &targetVectorRanges,const Float minTarget,const Float maxTarget){
172  if( inputVectorRanges.getSize() == numInputDimensions && targetVectorRanges.getSize() == numTargetDimensions ){
173 
174  VectorFloat scaledInputVector(numInputDimensions,0);
175  VectorFloat scaledTargetVector(numTargetDimensions,0);
176  for(UINT i=0; i<totalNumSamples; i++){
177 
178  //Scale the input Vector
179  for(UINT j=0; j<numInputDimensions; j++){
180  scaledInputVector[j] = grt_scale(data[i].getInputVectorValue(j),inputVectorRanges[j].minValue,inputVectorRanges[j].maxValue,minTarget,maxTarget);
181  }
182  //Scale the target Vector
183  for(UINT j=0; j<numTargetDimensions; j++){
184  scaledTargetVector[j] = grt_scale(data[i].getTargetVectorValue(j),targetVectorRanges[j].minValue,targetVectorRanges[j].maxValue,minTarget,maxTarget);
185  }
186  //Update the training sample with the scaled data
187  data[i].set(scaledInputVector,scaledTargetVector);
188  }
189 
190  return true;
191  }
192  return false;
193 }
194 
196 
197  if( useExternalRanges ) return externalInputRanges;
198 
199  Vector< MinMax > ranges(numInputDimensions);
200 
201  if( totalNumSamples > 0 ){
202  for(UINT j=0; j<numInputDimensions; j++){
203  ranges[j].minValue = data[0].getInputVectorValue(j);
204  ranges[j].maxValue = data[0].getInputVectorValue(j);
205  for(UINT i=0; i<totalNumSamples; i++){
206  if( data[i].getInputVectorValue(j) < ranges[j].minValue ){ ranges[j].minValue = data[i].getInputVectorValue(j); } //Search for the min value
207  else if( data[i].getInputVectorValue(j) > ranges[j].maxValue ){ ranges[j].maxValue = data[i].getInputVectorValue(j); } //Search for the max value
208  }
209  }
210  }
211  return ranges;
212 }
213 
215 
216  if( useExternalRanges ) return externalTargetRanges;
217 
218  Vector< MinMax > ranges(numTargetDimensions);
219 
220  if( totalNumSamples > 0 ){
221  for(UINT j=0; j<numTargetDimensions; j++){
222  ranges[j].minValue = data[0].getTargetVectorValue(j);
223  ranges[j].maxValue = data[0].getTargetVectorValue(j);
224  for(UINT i=0; i<totalNumSamples; i++){
225  if( data[i].getTargetVectorValue(j) < ranges[j].minValue ){ ranges[j].minValue = data[i].getTargetVectorValue(j); } //Search for the min value
226  else if( data[i].getTargetVectorValue(j) > ranges[j].maxValue ){ ranges[j].maxValue = data[i].getTargetVectorValue(j); } //Search for the max value
227  }
228  }
229  }
230  return ranges;
231 }
232 
233 std::string RegressionData::getStatsAsString() const{
234 
235  std::string statsText;
236  statsText += "DatasetName:\t" + datasetName + "\n";
237  statsText += "DatasetInfo:\t" + infoText + "\n";
238  statsText += "Number of Input Dimensions:\t" + Util::toString( numInputDimensions ) + "\n";
239  statsText += "Number of Target Dimensions:\t" + Util::toString( numTargetDimensions ) + "\n";
240  statsText += "Number of Samples:\t" + Util::toString( totalNumSamples ) + "\n";
241 
242  Vector< MinMax > inputRanges = getInputRanges();
243 
244  statsText += "Dataset Input Dimension Ranges:\n";
245  for(UINT j=0; j<inputRanges.size(); j++){
246  statsText += "[" + Util::toString( j+1 ) + "] Min:\t" + Util::toString( inputRanges[j].minValue ) + "\tMax: " + Util::toString( inputRanges[j].maxValue ) + "\n";
247  }
248 
249  Vector< MinMax > targetRanges = getTargetRanges();
250 
251  statsText += "Dataset Target Dimension Ranges:\n";
252  for(UINT j=0; j<targetRanges.size(); j++){
253  statsText += "[" + Util::toString( j+1 ) + "] Min:\t" + Util::toString( targetRanges[j].minValue ) + "\tMax: " + Util::toString( targetRanges[j].maxValue ) + "\n";
254  }
255  return statsText;
256 }
257 
258 bool RegressionData::printStats() const{
259  std::cout << getStatsAsString();
260  return true;
261 }
262 
263 RegressionData RegressionData::partition(const UINT trainingSizePercentage){
264  return split( trainingSizePercentage );
265 }
266 
267 RegressionData RegressionData::split(const UINT trainingSizePercentage){
268 
269  //Partitions the dataset into a training dataset (which is kept by this instance of the RegressionData) and
270  //a testing/validation dataset (which is return as a new instance of the RegressionData). The trainingSizePercentage
271  //therefore sets the size of the data which remains in this instance and the remaining percentage of data is then added to
272  //the testing/validation dataset
273 
274  const UINT numTrainingExamples = (UINT) floor( Float(totalNumSamples) / 100.0 * Float(trainingSizePercentage) );
275 
276  RegressionData trainingSet(numInputDimensions,numTargetDimensions);
277  RegressionData testSet(numInputDimensions,numTargetDimensions);
278  Vector< UINT > indexs( totalNumSamples );
279 
280  //Create the random partion indexs
281  Random random;
282  UINT randomIndex = 0;
283  for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
284  for(UINT x=0; x<totalNumSamples; x++){
285  randomIndex = random.getRandomNumberInt(0,totalNumSamples);
286  SWAP( indexs[ x ] , indexs[ randomIndex ] );
287  }
288 
289  //Add the data to the training and test sets
290  for(UINT i=0; i<numTrainingExamples; i++){
291  trainingSet.addSample( data[ indexs[i] ].getInputVector(), data[ indexs[i] ].getTargetVector() );
292  }
293  for(UINT i=numTrainingExamples; i<totalNumSamples; i++){
294  testSet.addSample( data[ indexs[i] ].getInputVector(), data[ indexs[i] ].getTargetVector() );
295  }
296 
297  //Overwrite the training data in this instance with the training data of the trainingSet
298  data = trainingSet.getData();
299  totalNumSamples = trainingSet.getNumSamples();
300 
301  //The dataset has changed so flag that any previous cross validation setup will now not work
302  crossValidationSetup = false;
303  crossValidationIndexs.clear();
304 
305  return testSet;
306 }
307 
308 bool RegressionData::merge(const RegressionData &regressionData){
309 
310  if( regressionData.getNumInputDimensions() != numInputDimensions ){
311  errorLog << "merge(RegressionData &regressionData) - The number of input dimensions in the regressionData (" << regressionData.getNumInputDimensions() << ") does not match the number of input dimensions of this dataset (" << numInputDimensions << ")" << std::endl;
312  return false;
313  }
314 
315  if( regressionData.getNumTargetDimensions() != numTargetDimensions ){
316  errorLog << "merge(RegressionData &regressionData) - The number of target dimensions in the regressionData (" << regressionData.getNumTargetDimensions() << ") does not match the number of target dimensions of this dataset (" << numTargetDimensions << ")" << std::endl;
317  return false;
318  }
319 
320  //Add the data from the labelledData to this instance
321  for(UINT i=0; i<regressionData.getNumSamples(); i++){
322  addSample(regressionData[i].getInputVector(), regressionData[i].getTargetVector());
323  }
324 
325  //The dataset has changed so flag that any previous cross validation setup will now not work
326  crossValidationSetup = false;
327  crossValidationIndexs.clear();
328 
329  return true;
330 }
331 
333 
334  crossValidationSetup = false;
335  crossValidationIndexs.clear();
336 
337  //K can not be zero
338  if( K > totalNumSamples ){
339  errorLog << "spiltDataIntoKFolds(UINT K) - K can not be zero!" << std::endl;
340  return false;
341  }
342 
343  //K can not be larger than the number of examples
344  if( K > totalNumSamples ){
345  errorLog << "spiltDataIntoKFolds(UINT K) - K can not be larger than the total number of samples in the dataset!" << std::endl;
346  return false;
347  }
348 
349  //Setup the dataset for k-fold cross validation
350  kFoldValue = K;
351  Vector< UINT > indexs( totalNumSamples );
352 
353  //Work out how many samples are in each fold, the last fold might have more samples than the others
354  UINT numSamplesPerFold = (UINT) floor( totalNumSamples/Float(K) );
355 
356  //Add the random indexs to each fold
357  crossValidationIndexs.resize(K);
358 
359  //Create the random partion indexs
360  Random random;
361  UINT randomIndex = 0;
362 
363  //Randomize the order of the data
364  for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
365  for(UINT x=0; x<totalNumSamples; x++){
366  //Pick a random index
367  randomIndex = random.getRandomNumberInt(0,totalNumSamples);
368 
369  //Swap the indexs
370  SWAP( indexs[ x ] , indexs[ randomIndex ] );
371  }
372 
373  UINT counter = 0;
374  UINT foldIndex = 0;
375  for(UINT i=0; i<totalNumSamples; i++){
376  //Add the index to the current fold
377  crossValidationIndexs[ foldIndex ].push_back( indexs[i] );
378 
379  //Move to the next fold if ready
380  if( ++counter == numSamplesPerFold && foldIndex < K-1 ){
381  foldIndex++;
382  counter = 0;
383  }
384  }
385 
386  crossValidationSetup = true;
387  return true;
388 
389 }
390 
392  RegressionData trainingData;
393 
394  if( !crossValidationSetup ){
395  errorLog << "getTrainingFoldData(UINT foldIndex) - Cross Validation has not been setup! You need to call the spiltDataIntoKFolds(UINT K,bool useStratifiedSampling) function first before calling this function!" << std::endl;
396  return trainingData;
397  }
398 
399  if( foldIndex >= kFoldValue ) return trainingData;
400 
401  trainingData.setInputAndTargetDimensions(numInputDimensions, numTargetDimensions);
402 
403  //Add the data to the training set, this will consist of all the data that is NOT in the foldIndex
404  UINT index = 0;
405  for(UINT k=0; k<kFoldValue; k++){
406  if( k != foldIndex ){
407  for(UINT i=0; i<crossValidationIndexs[k].size(); i++){
408 
409  index = crossValidationIndexs[k][i];
410  trainingData.addSample( data[ index ].getInputVector(), data[ index ].getTargetVector() );
411  }
412  }
413  }
414 
415  return trainingData;
416 }
417 
419  RegressionData testData;
420 
421  if( !crossValidationSetup ) return testData;
422 
423  if( foldIndex >= kFoldValue ) return testData;
424 
425  //Add the data to the training
426  testData.setInputAndTargetDimensions(numInputDimensions, numTargetDimensions);
427 
428  UINT index = 0;
429  for(UINT i=0; i<crossValidationIndexs[ foldIndex ].size(); i++){
430 
431  index = crossValidationIndexs[ foldIndex ][i];
432  testData.addSample( data[ index ].getInputVector(), data[ index ].getTargetVector() );
433  }
434 
435  return testData;
436 }
437 
438 UINT RegressionData::removeDuplicateSamples(){
439 
440  UINT numSamplesRemoved = 0;
441 
442  //Sort the data
443  sort(data.begin(),data.end(),RegressionSample::sortByInputVectorAscending );
444 
445  //Remove any samples that are very close to each other
446  Float minDist = 1.0e-5;
447  Float dist = 0;
448  Float totalDimensions = numInputDimensions + numTargetDimensions;
449  bool keepSearching = true;
450  Vector< RegressionSample >::iterator currentSample = data.begin();
451  Vector< RegressionSample >::iterator nextSample = data.begin()+1;
452 
453  if( currentSample == data.end() ) keepSearching = false;
454  if( nextSample == data.end() ) keepSearching = false;
455 
456  while( keepSearching ){
457  dist = 0;
458  for(UINT i=0; i<numInputDimensions; i++){
459  dist += SQR( currentSample->getInputVectorValue(i) - nextSample->getInputVectorValue(i) );
460  }
461  for(UINT i=0; i<numTargetDimensions; i++){
462  dist += SQR( currentSample->getTargetVectorValue(i) - nextSample->getTargetVectorValue(i) );
463  }
464  dist /= totalDimensions;
465  if( dist <= minDist ){
466  //Remove the next sample
467  currentSample = data.erase( nextSample );
468  nextSample = currentSample + 1;
469  numSamplesRemoved++;
470  debugLog << "Removing sample with dist: " << dist << std::endl;
471  }else{
472  currentSample++;
473  nextSample++;
474  }
475 
476  if( currentSample == data.end() ) keepSearching = false;
477  if( nextSample == data.end() ) keepSearching = false;
478  }
479 
480  return numSamplesRemoved;
481 }
482 
483 bool RegressionData::save(const std::string &filename) const{
484 
485  //Check if the file should be saved as a csv file
486  if( Util::stringEndsWith( filename, ".csv" ) ){
487  return saveDatasetToCSVFile( filename );
488  }
489 
490  //Otherwise save it as a custom GRT file
491  return saveDatasetToFile( filename );
492 }
493 
494 bool RegressionData::load(const std::string &filename){
495 
496  //Check if the file should be loaded as a csv file
497  if( Util::stringEndsWith( filename, ".csv" ) ){
498  return loadDatasetFromCSVFile( filename, numInputDimensions, numTargetDimensions );
499  }
500 
501  //Otherwise save it as a custom GRT file
502  return loadDatasetFromFile( filename );
503 }
504 
505 bool RegressionData::saveDatasetToFile(const std::string &filename) const{
506 
507  std::fstream file;
508  file.open(filename.c_str(), std::ios::out);
509 
510  if( !file.is_open() ){
511  errorLog << "saveDatasetToFile(const string &filename) - Failed to open file!" << std::endl;
512  return false;
513  }
514 
515  file << "GRT_LABELLED_REGRESSION_DATA_FILE_V1.0\n";
516  file << "DatasetName: " << datasetName << std::endl;
517  file << "InfoText: " << infoText << std::endl;
518  file << "NumInputDimensions: "<<numInputDimensions << std::endl;
519  file << "NumTargetDimensions: "<<numTargetDimensions << std::endl;
520  file << "TotalNumTrainingExamples: "<<totalNumSamples << std::endl;
521  file << "UseExternalRanges: " << useExternalRanges << std::endl;
522 
523  if( useExternalRanges ){
524  for(UINT i=0; i<externalInputRanges.getSize(); i++){
525  file << externalInputRanges[i].minValue << "\t" << externalInputRanges[i].maxValue << std::endl;
526  }
527  for(UINT i=0; i<externalTargetRanges.getSize(); i++){
528  file << externalTargetRanges[i].minValue << "\t" << externalTargetRanges[i].maxValue << std::endl;
529  }
530  }
531 
532  file << "RegressionData:\n";
533 
534  for(UINT i=0; i<totalNumSamples; i++){
535  for(UINT j=0; j<numInputDimensions; j++){
536  file << data[i].getInputVectorValue(j) << "\t";
537  }
538  for(UINT j=0; j<numTargetDimensions; j++){
539  file << data[i].getTargetVectorValue(j);
540  if( j!= numTargetDimensions-1 ) file << "\t";
541  }
542  file << std::endl;
543  }
544 
545  file.close();
546  return true;
547 }
548 
549 bool RegressionData::loadDatasetFromFile(const std::string &filename){
550 
551  std::fstream file;
552  file.open(filename.c_str(), std::ios::in);
553  clear();
554 
555  if( !file.is_open() ){
556  errorLog << "loadDatasetFromFile(const string &filename) - Failed to open file!" << std::endl;
557  return false;
558  }
559 
560  std::string word;
561 
562  //Check to make sure this is a file with the Training File Format
563  file >> word;
564  if(word != "GRT_LABELLED_REGRESSION_DATA_FILE_V1.0"){
565  errorLog << "loadDatasetFromFile(const string &filename) - Unknown file header!" << std::endl;
566  file.close();
567  return false;
568  }
569 
570  //Get the name of the dataset
571  file >> word;
572  if(word != "DatasetName:"){
573  errorLog << "loadDatasetFromFile(const string &filename) - failed to find DatasetName!" << std::endl;
574  file.close();
575  return false;
576  }
577  file >> datasetName;
578 
579  file >> word;
580  if(word != "InfoText:"){
581  errorLog << "loadDatasetFromFile(const string &filename) - failed to find InfoText!" << std::endl;
582  file.close();
583  return false;
584  }
585 
586  //Load the info text
587  file >> word;
588  infoText = "";
589  while( word != "NumInputDimensions:" ){
590  infoText += word + " ";
591  file >> word;
592  }
593 
594  //Get the number of input dimensions in the training data
595  if(word != "NumInputDimensions:"){
596  errorLog << "loadDatasetFromFile(const string &filename) - Failed to find NumInputDimensions!" << std::endl;
597  file.close();
598  return false;
599  }
600  file >> numInputDimensions;
601 
602  //Get the number of target dimensions in the training data
603  file >> word;
604  if(word != "NumTargetDimensions:"){
605  errorLog << "loadDatasetFromFile(const string &filename) - Failed to find NumTargetDimensions!" << std::endl;
606  file.close();
607  return false;
608  }
609  file >> numTargetDimensions;
610 
611  //Get the total number of training examples in the training data
612  file >> word;
613  if(word != "TotalNumTrainingExamples:"){
614  errorLog << "loadDatasetFromFile(const string &filename) - Failed to find TotalNumTrainingExamples!" << std::endl;
615  file.close();
616  return false;
617  }
618  file >> totalNumSamples;
619 
620  //Check if the dataset should be scaled using external ranges
621  file >> word;
622  if(word != "UseExternalRanges:"){
623  errorLog << "loadDatasetFromFile(const string &filename) - failed to find DatasetName!" << std::endl;
624  file.close();
625  return false;
626  }
627  file >> useExternalRanges;
628 
629  //If we are using external ranges then load them
630  if( useExternalRanges ){
631  externalInputRanges.resize(numInputDimensions);
632  externalTargetRanges.resize(numTargetDimensions);
633  for(UINT i=0; i<externalInputRanges.size(); i++){
634  file >> externalInputRanges[i].minValue;
635  file >> externalInputRanges[i].maxValue;
636  }
637  for(UINT i=0; i<externalTargetRanges.size(); i++){
638  file >> externalTargetRanges[i].minValue;
639  file >> externalTargetRanges[i].maxValue;
640  }
641  }
642 
643  //Get the main training data
644  file >> word;
645  if( word != "RegressionData:" && word != "LabelledRegressionData:" ){
646  errorLog << "loadDatasetFromFile(const string &filename) - Failed to find RegressionData!" << std::endl;
647  file.close();
648  return false;
649  }
650 
651  VectorFloat inputVector(numInputDimensions);
652  VectorFloat targetVector(numTargetDimensions);
653  data.resize( totalNumSamples, RegressionSample(inputVector,targetVector) );
654 
655  for(UINT i=0; i<totalNumSamples; i++){
656  //Read the input Vector
657  for(UINT j=0; j<numInputDimensions; j++){
658  file >> inputVector[j];
659  }
660  for(UINT j=0; j<numTargetDimensions; j++){
661  file >> targetVector[j];
662  }
663  data[i].set(inputVector, targetVector);
664  }
665 
666  file.close();
667  return true;
668 }
669 
670 bool RegressionData::saveDatasetToCSVFile(const std::string &filename) const{
671 
672  std::fstream file;
673  file.open(filename.c_str(), std::ios::out );
674 
675  if( !file.is_open() ){
676  errorLog << "saveDatasetToCSVFile(const string &filename) - Failed to open file!" << std::endl;
677  return false;
678  }
679 
680  //Write the data to the CSV file
681  for(UINT i=0; i<totalNumSamples; i++){
682  for(UINT j=0; j<numInputDimensions; j++){
683  file << data[i].getInputVector()[j] << ",";
684  }
685  for(UINT j=0; j<numTargetDimensions; j++){
686  file << data[i].getTargetVector()[j];
687  if( j != numTargetDimensions-1 ) file << ",";
688  }
689  file << std::endl;
690  }
691 
692  file.close();
693 
694  return true;
695 }
696 
697 bool RegressionData::loadDatasetFromCSVFile(const std::string &filename,const UINT numInputDimensions,const UINT numTargetDimensions){
698 
699  std::fstream file;
700  std::string value;
701  clear();
702  datasetName = "NOT_SET";
703  infoText = "";
704 
705  //Clear any previous data
706  clear();
707 
708  //Parse the CSV file
709  FileParser parser;
710 
711  if( !parser.parseCSVFile(filename,true) ){
712  errorLog << "loadDatasetFromCSVFile(...) - Failed to parse CSV file!" << std::endl;
713  return false;
714  }
715 
716  if( !parser.getConsistentColumnSize() ){
717  errorLog << "loadDatasetFromCSVFile(...) - The CSV file does not have a consistent number of columns!" << std::endl;
718  return false;
719  }
720 
721  if( parser.getColumnSize() != numInputDimensions+numTargetDimensions ){
722  errorLog << "loadDatasetFromCSVFile(...) - The number of columns in the CSV file (" << parser.getColumnSize() << ")";
723  errorLog << " does not match the number of input dimensions plus the number of target dimensions (" << numInputDimensions+numTargetDimensions << ")" << std::endl;
724  return false;
725  }
726 
727  //Setup the labelled classification data
728  setInputAndTargetDimensions(numInputDimensions, numTargetDimensions);
729 
730  UINT n = 0;
731  VectorFloat inputVector(numInputDimensions);
732  VectorFloat targetVector(numTargetDimensions);
733  for(UINT i=0; i<parser.getRowSize(); i++){
734 
735  //Reset n
736  n = 0;
737 
738  //Get the input Vector
739  for(UINT j=0; j<numInputDimensions; j++){
740  inputVector[j] = grt_from_str< Float >( parser[i][n++] );
741  }
742 
743  //Get the target Vector
744  for(UINT j=0; j<numTargetDimensions; j++){
745  targetVector[j] = grt_from_str< Float >( parser[i][n++] );
746  }
747 
748  //Add the labelled sample to the dataset
749  if( !addSample(inputVector, targetVector) ){
750  warningLog << "loadDatasetFromCSVFile(string filename) - Could not add sample " << i << " to the dataset!" << std::endl;
751  }
752  }
753 
754  return true;
755 }
756 
757 GRT_END_NAMESPACE
758 
bool merge(const RegressionData &regressionData)
bool loadDatasetFromCSVFile(const std::string &filename, const UINT numInputDimensions, const UINT numTargetDimensions)
static std::string toString(const int &i)
Definition: Util.cpp:81
bool save(const std::string &filename) const
Vector< MinMax > getInputRanges() const
RegressionData & operator=(const RegressionData &rhs)
This file contains the Random class, a useful wrapper for generating cross platform random functions...
Definition: Random.h:46
bool load(const std::string &filename)
virtual bool resize(const unsigned int size)
Definition: Vector.h:133
RegressionData getTrainingFoldData(const UINT foldIndex) const
virtual bool setKey(const std::string &key)
sets the key that gets written at the start of each message, this will be written in the format 'key ...
Definition: Log.h:166
bool setInfoText(const std::string &infoText)
UINT getSize() const
Definition: Vector.h:201
UINT getNumInputDimensions() const
bool setExternalRanges(const Vector< MinMax > &externalInputRanges, const Vector< MinMax > &externalTargetRanges, const bool useExternalRanges)
bool setInputAndTargetDimensions(const UINT numInputDimensions, const UINT numTargetDimensions)
Vector< MinMax > getTargetRanges() const
bool scale(const Float minTarget, const Float maxTarget)
UINT getNumTargetDimensions() const
RegressionData(const UINT numInputDimensions=0, const UINT numTargetDimensions=0, const std::string datasetName="NOT_SET", const std::string infoText="")
bool saveDatasetToCSVFile(const std::string &filename) const
The RegressionData is the main data structure for recording, labeling, managing, saving, and loading datasets that can be used to train and test the GRT supervised regression algorithms.
RegressionData split(const UINT trainingSizePercentage)
bool setDatasetName(const std::string &datasetName)
int getRandomNumberInt(int minRange, int maxRange)
Definition: Random.cpp:59
static bool stringEndsWith(const std::string &str, const std::string &ending)
Definition: Util.cpp:164
bool saveDatasetToFile(const std::string &filename) const
Vector< RegressionSample > getData() const
RegressionData getTestFoldData(const UINT foldIndex) const
bool loadDatasetFromFile(const std::string &filename)
bool enableExternalRangeScaling(const bool useExternalRanges)
bool addSample(const VectorFloat &inputVector, const VectorFloat &targetVector)
bool reserve(const UINT N)
bool spiltDataIntoKFolds(const UINT K)
UINT getNumSamples() const