GestureRecognitionToolkit  Version: 0.1.0
The Gesture Recognition Toolkit (GRT) is a cross-platform, open-source, c++ machine learning library for real-time gesture recognition.
RegressionData.cpp
1 /*
2 GRT MIT License
3 Copyright (c) <2012> <Nicholas Gillian, Media Lab, MIT>
4 
5 Permission is hereby granted, free of charge, to any person obtaining a copy of this software
6 and associated documentation files (the "Software"), to deal in the Software without restriction,
7 including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
9 subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in all copies or substantial
12 portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
15 LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
16 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
17 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
18 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 */
20 
21 #include "RegressionData.h"
22 
23 GRT_BEGIN_NAMESPACE
24 
25 RegressionData::RegressionData(const UINT numInputDimensions,const UINT numTargetDimensions,const std::string datasetName,const std::string infoText):totalNumSamples(0){
26  this->numInputDimensions = numInputDimensions;
27  this->numTargetDimensions = numTargetDimensions;
28  this->datasetName = datasetName;
29  this->infoText = infoText;
30  kFoldValue = 0;
31  crossValidationSetup = false;
32  useExternalRanges = false;
33  debugLog.setProceedingText("[DEBUG LRD]");
34  errorLog.setProceedingText("[ERROR LRD]");
35  warningLog.setProceedingText("[WARNING LRD]");
36 }
37 
39  *this = rhs;
40 }
41 
43 
45  if( this != &rhs){
46  this->datasetName = rhs.datasetName;
47  this->infoText = rhs.infoText;
48  this->numInputDimensions = rhs.numInputDimensions;
49  this->numTargetDimensions = rhs.numTargetDimensions;
50  this->totalNumSamples = rhs.totalNumSamples;
51  this->kFoldValue = rhs.kFoldValue;
52  this->crossValidationSetup = rhs.crossValidationSetup;
53  this->useExternalRanges = rhs.useExternalRanges;
54  this->externalInputRanges = rhs.externalInputRanges;
55  this->externalTargetRanges = rhs.externalTargetRanges;
56  this->data = rhs.data;
57  this->crossValidationIndexs = rhs.crossValidationIndexs;
58  this->debugLog = rhs.debugLog;
59  this->errorLog = rhs.errorLog;
60  this->warningLog = rhs.warningLog;
61  }
62  return *this;
63 }
64 
66  totalNumSamples = 0;
67  kFoldValue = 0;
68  crossValidationSetup = false;
69  data.clear();
70  crossValidationIndexs.clear();
71 }
72 
73 bool RegressionData::setInputAndTargetDimensions(const UINT numInputDimensions,const UINT numTargetDimensions){
74  clear();
75  if( numInputDimensions > 0 && numTargetDimensions > 0 ){
76  this->numInputDimensions = numInputDimensions;
77  this->numTargetDimensions = numTargetDimensions;
78 
79  //Clear the external ranges
80  useExternalRanges = false;
81  externalInputRanges.clear();
82  externalTargetRanges.clear();
83  return true;
84  }
85  errorLog << "setInputAndTargetDimensions(UINT numInputDimensions,UINT numTargetDimensions) - The number of input and target dimensions should be greater than zero!" << std::endl;
86  return false;
87 }
88 
89 bool RegressionData::setDatasetName(const std::string &datasetName){
90 
91  //Make sure there are no spaces in the string
92  if( datasetName.find(" ") == std::string::npos ){
93  this->datasetName = datasetName;
94  return true;
95  }
96 
97  errorLog << "setDatasetName(const string &datasetName) - The dataset name cannot contain any spaces!" << std::endl;
98  return false;
99 }
100 
101 bool RegressionData::setInfoText(const std::string &infoText){
102  this->infoText = infoText;
103  return true;
104 }
105 
106 bool RegressionData::addSample(const VectorFloat &inputVector,const VectorFloat &targetVector){
107  if( inputVector.getSize() == numInputDimensions && targetVector.getSize() == numTargetDimensions ){
108  data.push_back( RegressionSample(inputVector,targetVector) );
109  totalNumSamples++;
110 
111  //The dataset has changed so flag that any previous cross validation setup will now not work
112  crossValidationSetup = false;
113  crossValidationIndexs.clear();
114  return true;
115  }
116  errorLog << "addSample(const VectorFloat &inputVector,const VectorFloat &targetVector) - The inputVector size or targetVector size does not match the size of the numInputDimensions or numTargetDimensions" << std::endl;
117  return false;
118 }
119 
121  if( totalNumSamples > 0 ){
122  //Remove the training example from the buffer
123  data.erase(data.end()-1);
124  totalNumSamples = data.getSize();
125 
126  //The dataset has changed so flag that any previous cross validation setup will now not work
127  crossValidationSetup = false;
128  crossValidationIndexs.clear();
129  return true;
130  }
131  warningLog << "removeLastSample() - There are no samples to remove!" << std::endl;
132  return false;
133 }
134 
135 bool RegressionData::reserve(const UINT N){
136 
137  data.reserve( N );
138 
139  if( data.capacity() >= N ) return true;
140 
141  return false;
142 }
143 
144 bool RegressionData::setExternalRanges(const Vector< MinMax > &externalInputRanges,const Vector< MinMax > & externalTargetRanges,const bool useExternalRanges){
145 
146  if( externalInputRanges.getSize() != numInputDimensions ) return false;
147  if( externalTargetRanges.getSize() != numTargetDimensions ) return false;
148 
149  this->externalInputRanges = externalInputRanges;
150  this->externalTargetRanges = externalTargetRanges;
151  this->useExternalRanges = useExternalRanges;
152 
153  return true;
154 }
155 
156 bool RegressionData::enableExternalRangeScaling(const bool useExternalRanges){
157  if( externalInputRanges.getSize() != numInputDimensions && externalTargetRanges.getSize() != numTargetDimensions ){
158  this->useExternalRanges = useExternalRanges;
159  return true;
160  }
161  return false;
162 }
163 
164 bool RegressionData::scale(const Float minTarget,const Float maxTarget){
165  Vector< MinMax > inputRanges = getInputRanges();
166  Vector< MinMax > targetRanges = getTargetRanges();
167  return scale(inputRanges,targetRanges,minTarget,maxTarget);
168 }
169 
170 bool RegressionData::scale(const Vector< MinMax > &inputVectorRanges,const Vector< MinMax > &targetVectorRanges,const Float minTarget,const Float maxTarget){
171  if( inputVectorRanges.getSize() == numInputDimensions && targetVectorRanges.getSize() == numTargetDimensions ){
172 
173  VectorFloat scaledInputVector(numInputDimensions,0);
174  VectorFloat scaledTargetVector(numTargetDimensions,0);
175  for(UINT i=0; i<totalNumSamples; i++){
176 
177  //Scale the input Vector
178  for(UINT j=0; j<numInputDimensions; j++){
179  scaledInputVector[j] = grt_scale(data[i].getInputVectorValue(j),inputVectorRanges[j].minValue,inputVectorRanges[j].maxValue,minTarget,maxTarget);
180  }
181  //Scale the target Vector
182  for(UINT j=0; j<numTargetDimensions; j++){
183  scaledTargetVector[j] = grt_scale(data[i].getTargetVectorValue(j),targetVectorRanges[j].minValue,targetVectorRanges[j].maxValue,minTarget,maxTarget);
184  }
185  //Update the training sample with the scaled data
186  data[i].set(scaledInputVector,scaledTargetVector);
187  }
188 
189  return true;
190  }
191  return false;
192 }
193 
195 
196  if( useExternalRanges ) return externalInputRanges;
197 
198  Vector< MinMax > ranges(numInputDimensions);
199 
200  if( totalNumSamples > 0 ){
201  for(UINT j=0; j<numInputDimensions; j++){
202  ranges[j].minValue = data[0].getInputVectorValue(j);
203  ranges[j].maxValue = data[0].getInputVectorValue(j);
204  for(UINT i=0; i<totalNumSamples; i++){
205  if( data[i].getInputVectorValue(j) < ranges[j].minValue ){ ranges[j].minValue = data[i].getInputVectorValue(j); } //Search for the min value
206  else if( data[i].getInputVectorValue(j) > ranges[j].maxValue ){ ranges[j].maxValue = data[i].getInputVectorValue(j); } //Search for the max value
207  }
208  }
209  }
210  return ranges;
211 }
212 
214 
215  if( useExternalRanges ) return externalTargetRanges;
216 
217  Vector< MinMax > ranges(numTargetDimensions);
218 
219  if( totalNumSamples > 0 ){
220  for(UINT j=0; j<numTargetDimensions; j++){
221  ranges[j].minValue = data[0].getTargetVectorValue(j);
222  ranges[j].maxValue = data[0].getTargetVectorValue(j);
223  for(UINT i=0; i<totalNumSamples; i++){
224  if( data[i].getTargetVectorValue(j) < ranges[j].minValue ){ ranges[j].minValue = data[i].getTargetVectorValue(j); } //Search for the min value
225  else if( data[i].getTargetVectorValue(j) > ranges[j].maxValue ){ ranges[j].maxValue = data[i].getTargetVectorValue(j); } //Search for the max value
226  }
227  }
228  }
229  return ranges;
230 }
231 
232 std::string RegressionData::getStatsAsString() const{
233 
234  std::string statsText;
235  statsText += "DatasetName:\t" + datasetName + "\n";
236  statsText += "DatasetInfo:\t" + infoText + "\n";
237  statsText += "Number of Input Dimensions:\t" + Util::toString( numInputDimensions ) + "\n";
238  statsText += "Number of Target Dimensions:\t" + Util::toString( numTargetDimensions ) + "\n";
239  statsText += "Number of Samples:\t" + Util::toString( totalNumSamples ) + "\n";
240 
241  Vector< MinMax > inputRanges = getInputRanges();
242 
243  statsText += "Dataset Input Dimension Ranges:\n";
244  for(UINT j=0; j<inputRanges.size(); j++){
245  statsText += "[" + Util::toString( j+1 ) + "] Min:\t" + Util::toString( inputRanges[j].minValue ) + "\tMax: " + Util::toString( inputRanges[j].maxValue ) + "\n";
246  }
247 
248  Vector< MinMax > targetRanges = getTargetRanges();
249 
250  statsText += "Dataset Target Dimension Ranges:\n";
251  for(UINT j=0; j<targetRanges.size(); j++){
252  statsText += "[" + Util::toString( j+1 ) + "] Min:\t" + Util::toString( targetRanges[j].minValue ) + "\tMax: " + Util::toString( targetRanges[j].maxValue ) + "\n";
253  }
254  return statsText;
255 }
256 
257 bool RegressionData::printStats() const{
258  std::cout << getStatsAsString();
259  return true;
260 }
261 
262 RegressionData RegressionData::partition(const UINT trainingSizePercentage){
263 
264  //Partitions the dataset into a training dataset (which is kept by this instance of the RegressionData) and
265  //a testing/validation dataset (which is return as a new instance of the RegressionData). The trainingSizePercentage
266  //therefore sets the size of the data which remains in this instance and the remaining percentage of data is then added to
267  //the testing/validation dataset
268 
269  const UINT numTrainingExamples = (UINT) floor( Float(totalNumSamples) / 100.0 * Float(trainingSizePercentage) );
270 
271  RegressionData trainingSet(numInputDimensions,numTargetDimensions);
272  RegressionData testSet(numInputDimensions,numTargetDimensions);
273  Vector< UINT > indexs( totalNumSamples );
274 
275  //Create the random partion indexs
276  Random random;
277  UINT randomIndex = 0;
278  for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
279  for(UINT x=0; x<totalNumSamples; x++){
280  randomIndex = random.getRandomNumberInt(0,totalNumSamples);
281  SWAP( indexs[ x ] , indexs[ randomIndex ] );
282  }
283 
284  //Add the data to the training and test sets
285  for(UINT i=0; i<numTrainingExamples; i++){
286  trainingSet.addSample( data[ indexs[i] ].getInputVector(), data[ indexs[i] ].getTargetVector() );
287  }
288  for(UINT i=numTrainingExamples; i<totalNumSamples; i++){
289  testSet.addSample( data[ indexs[i] ].getInputVector(), data[ indexs[i] ].getTargetVector() );
290  }
291 
292  //Overwrite the training data in this instance with the training data of the trainingSet
293  data = trainingSet.getData();
294  totalNumSamples = trainingSet.getNumSamples();
295 
296  //The dataset has changed so flag that any previous cross validation setup will now not work
297  crossValidationSetup = false;
298  crossValidationIndexs.clear();
299 
300  return testSet;
301 }
302 
303 bool RegressionData::merge(const RegressionData &regressionData){
304 
305  if( regressionData.getNumInputDimensions() != numInputDimensions ){
306  errorLog << "merge(RegressionData &regressionData) - The number of input dimensions in the regressionData (" << regressionData.getNumInputDimensions() << ") does not match the number of input dimensions of this dataset (" << numInputDimensions << ")" << std::endl;
307  return false;
308  }
309 
310  if( regressionData.getNumTargetDimensions() != numTargetDimensions ){
311  errorLog << "merge(RegressionData &regressionData) - The number of target dimensions in the regressionData (" << regressionData.getNumTargetDimensions() << ") does not match the number of target dimensions of this dataset (" << numTargetDimensions << ")" << std::endl;
312  return false;
313  }
314 
315  //Add the data from the labelledData to this instance
316  for(UINT i=0; i<regressionData.getNumSamples(); i++){
317  addSample(regressionData[i].getInputVector(), regressionData[i].getTargetVector());
318  }
319 
320  //The dataset has changed so flag that any previous cross validation setup will now not work
321  crossValidationSetup = false;
322  crossValidationIndexs.clear();
323 
324  return true;
325 }
326 
328 
329  crossValidationSetup = false;
330  crossValidationIndexs.clear();
331 
332  //K can not be zero
333  if( K > totalNumSamples ){
334  errorLog << "spiltDataIntoKFolds(UINT K) - K can not be zero!" << std::endl;
335  return false;
336  }
337 
338  //K can not be larger than the number of examples
339  if( K > totalNumSamples ){
340  errorLog << "spiltDataIntoKFolds(UINT K) - K can not be larger than the total number of samples in the dataset!" << std::endl;
341  return false;
342  }
343 
344  //Setup the dataset for k-fold cross validation
345  kFoldValue = K;
346  Vector< UINT > indexs( totalNumSamples );
347 
348  //Work out how many samples are in each fold, the last fold might have more samples than the others
349  UINT numSamplesPerFold = (UINT) floor( totalNumSamples/Float(K) );
350 
351  //Add the random indexs to each fold
352  crossValidationIndexs.resize(K);
353 
354  //Create the random partion indexs
355  Random random;
356  UINT randomIndex = 0;
357 
358  //Randomize the order of the data
359  for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
360  for(UINT x=0; x<totalNumSamples; x++){
361  //Pick a random index
362  randomIndex = random.getRandomNumberInt(0,totalNumSamples);
363 
364  //Swap the indexs
365  SWAP( indexs[ x ] , indexs[ randomIndex ] );
366  }
367 
368  UINT counter = 0;
369  UINT foldIndex = 0;
370  for(UINT i=0; i<totalNumSamples; i++){
371  //Add the index to the current fold
372  crossValidationIndexs[ foldIndex ].push_back( indexs[i] );
373 
374  //Move to the next fold if ready
375  if( ++counter == numSamplesPerFold && foldIndex < K-1 ){
376  foldIndex++;
377  counter = 0;
378  }
379  }
380 
381  crossValidationSetup = true;
382  return true;
383 
384 }
385 
387  RegressionData trainingData;
388 
389  if( !crossValidationSetup ){
390  errorLog << "getTrainingFoldData(UINT foldIndex) - Cross Validation has not been setup! You need to call the spiltDataIntoKFolds(UINT K,bool useStratifiedSampling) function first before calling this function!" << std::endl;
391  return trainingData;
392  }
393 
394  if( foldIndex >= kFoldValue ) return trainingData;
395 
396  trainingData.setInputAndTargetDimensions(numInputDimensions, numTargetDimensions);
397 
398  //Add the data to the training set, this will consist of all the data that is NOT in the foldIndex
399  UINT index = 0;
400  for(UINT k=0; k<kFoldValue; k++){
401  if( k != foldIndex ){
402  for(UINT i=0; i<crossValidationIndexs[k].size(); i++){
403 
404  index = crossValidationIndexs[k][i];
405  trainingData.addSample( data[ index ].getInputVector(), data[ index ].getTargetVector() );
406  }
407  }
408  }
409 
410  return trainingData;
411 }
412 
414  RegressionData testData;
415 
416  if( !crossValidationSetup ) return testData;
417 
418  if( foldIndex >= kFoldValue ) return testData;
419 
420  //Add the data to the training
421  testData.setInputAndTargetDimensions(numInputDimensions, numTargetDimensions);
422 
423  UINT index = 0;
424  for(UINT i=0; i<crossValidationIndexs[ foldIndex ].size(); i++){
425 
426  index = crossValidationIndexs[ foldIndex ][i];
427  testData.addSample( data[ index ].getInputVector(), data[ index ].getTargetVector() );
428  }
429 
430  return testData;
431 }
432 
433 UINT RegressionData::removeDuplicateSamples(){
434 
435  UINT numSamplesRemoved = 0;
436 
437  //Sort the data
438  sort(data.begin(),data.end(),RegressionSample::sortByInputVectorAscending );
439 
440  //Remove any samples that are very close to each other
441  Float minDist = 1.0e-5;
442  Float dist = 0;
443  Float totalDimensions = numInputDimensions + numTargetDimensions;
444  bool keepSearching = true;
445  Vector< RegressionSample >::iterator currentSample = data.begin();
446  Vector< RegressionSample >::iterator nextSample = data.begin()+1;
447 
448  if( currentSample == data.end() ) keepSearching = false;
449  if( nextSample == data.end() ) keepSearching = false;
450 
451  while( keepSearching ){
452  dist = 0;
453  for(UINT i=0; i<numInputDimensions; i++){
454  dist += SQR( currentSample->getInputVectorValue(i) - nextSample->getInputVectorValue(i) );
455  }
456  for(UINT i=0; i<numTargetDimensions; i++){
457  dist += SQR( currentSample->getTargetVectorValue(i) - nextSample->getTargetVectorValue(i) );
458  }
459  dist /= totalDimensions;
460  if( dist <= minDist ){
461  //Remove the next sample
462  currentSample = data.erase( nextSample );
463  nextSample = currentSample + 1;
464  numSamplesRemoved++;
465  debugLog << "Removing sample with dist: " << dist << std::endl;
466  }else{
467  currentSample++;
468  nextSample++;
469  }
470 
471  if( currentSample == data.end() ) keepSearching = false;
472  if( nextSample == data.end() ) keepSearching = false;
473  }
474 
475  return numSamplesRemoved;
476 }
477 
478 bool RegressionData::save(const std::string &filename) const{
479 
480  //Check if the file should be saved as a csv file
481  if( Util::stringEndsWith( filename, ".csv" ) ){
482  return saveDatasetToCSVFile( filename );
483  }
484 
485  //Otherwise save it as a custom GRT file
486  return saveDatasetToFile( filename );
487 }
488 
489 bool RegressionData::load(const std::string &filename){
490 
491  //Check if the file should be loaded as a csv file
492  if( Util::stringEndsWith( filename, ".csv" ) ){
493  return loadDatasetFromCSVFile( filename, numInputDimensions, numTargetDimensions );
494  }
495 
496  //Otherwise save it as a custom GRT file
497  return loadDatasetFromFile( filename );
498 }
499 
500 bool RegressionData::saveDatasetToFile(const std::string &filename) const{
501 
502  std::fstream file;
503  file.open(filename.c_str(), std::ios::out);
504 
505  if( !file.is_open() ){
506  errorLog << "saveDatasetToFile(const string &filename) - Failed to open file!" << std::endl;
507  return false;
508  }
509 
510  file << "GRT_LABELLED_REGRESSION_DATA_FILE_V1.0\n";
511  file << "DatasetName: " << datasetName << std::endl;
512  file << "InfoText: " << infoText << std::endl;
513  file << "NumInputDimensions: "<<numInputDimensions << std::endl;
514  file << "NumTargetDimensions: "<<numTargetDimensions << std::endl;
515  file << "TotalNumTrainingExamples: "<<totalNumSamples << std::endl;
516  file << "UseExternalRanges: " << useExternalRanges << std::endl;
517 
518  if( useExternalRanges ){
519  for(UINT i=0; i<externalInputRanges.getSize(); i++){
520  file << externalInputRanges[i].minValue << "\t" << externalInputRanges[i].maxValue << std::endl;
521  }
522  for(UINT i=0; i<externalTargetRanges.getSize(); i++){
523  file << externalTargetRanges[i].minValue << "\t" << externalTargetRanges[i].maxValue << std::endl;
524  }
525  }
526 
527  file << "RegressionData:\n";
528 
529  for(UINT i=0; i<totalNumSamples; i++){
530  for(UINT j=0; j<numInputDimensions; j++){
531  file << data[i].getInputVectorValue(j) << "\t";
532  }
533  for(UINT j=0; j<numTargetDimensions; j++){
534  file << data[i].getTargetVectorValue(j);
535  if( j!= numTargetDimensions-1 ) file << "\t";
536  }
537  file << std::endl;
538  }
539 
540  file.close();
541  return true;
542 }
543 
544 bool RegressionData::loadDatasetFromFile(const std::string &filename){
545 
546  std::fstream file;
547  file.open(filename.c_str(), std::ios::in);
548  clear();
549 
550  if( !file.is_open() ){
551  errorLog << "loadDatasetFromFile(const string &filename) - Failed to open file!" << std::endl;
552  return false;
553  }
554 
555  std::string word;
556 
557  //Check to make sure this is a file with the Training File Format
558  file >> word;
559  if(word != "GRT_LABELLED_REGRESSION_DATA_FILE_V1.0"){
560  errorLog << "loadDatasetFromFile(const string &filename) - Unknown file header!" << std::endl;
561  file.close();
562  return false;
563  }
564 
565  //Get the name of the dataset
566  file >> word;
567  if(word != "DatasetName:"){
568  errorLog << "loadDatasetFromFile(const string &filename) - failed to find DatasetName!" << std::endl;
569  file.close();
570  return false;
571  }
572  file >> datasetName;
573 
574  file >> word;
575  if(word != "InfoText:"){
576  errorLog << "loadDatasetFromFile(const string &filename) - failed to find InfoText!" << std::endl;
577  file.close();
578  return false;
579  }
580 
581  //Load the info text
582  file >> word;
583  infoText = "";
584  while( word != "NumInputDimensions:" ){
585  infoText += word + " ";
586  file >> word;
587  }
588 
589  //Get the number of input dimensions in the training data
590  if(word != "NumInputDimensions:"){
591  errorLog << "loadDatasetFromFile(const string &filename) - Failed to find NumInputDimensions!" << std::endl;
592  file.close();
593  return false;
594  }
595  file >> numInputDimensions;
596 
597  //Get the number of target dimensions in the training data
598  file >> word;
599  if(word != "NumTargetDimensions:"){
600  errorLog << "loadDatasetFromFile(const string &filename) - Failed to find NumTargetDimensions!" << std::endl;
601  file.close();
602  return false;
603  }
604  file >> numTargetDimensions;
605 
606  //Get the total number of training examples in the training data
607  file >> word;
608  if(word != "TotalNumTrainingExamples:"){
609  errorLog << "loadDatasetFromFile(const string &filename) - Failed to find TotalNumTrainingExamples!" << std::endl;
610  file.close();
611  return false;
612  }
613  file >> totalNumSamples;
614 
615  //Check if the dataset should be scaled using external ranges
616  file >> word;
617  if(word != "UseExternalRanges:"){
618  errorLog << "loadDatasetFromFile(const string &filename) - failed to find DatasetName!" << std::endl;
619  file.close();
620  return false;
621  }
622  file >> useExternalRanges;
623 
624  //If we are using external ranges then load them
625  if( useExternalRanges ){
626  externalInputRanges.resize(numInputDimensions);
627  externalTargetRanges.resize(numTargetDimensions);
628  for(UINT i=0; i<externalInputRanges.size(); i++){
629  file >> externalInputRanges[i].minValue;
630  file >> externalInputRanges[i].maxValue;
631  }
632  for(UINT i=0; i<externalTargetRanges.size(); i++){
633  file >> externalTargetRanges[i].minValue;
634  file >> externalTargetRanges[i].maxValue;
635  }
636  }
637 
638  //Get the main training data
639  file >> word;
640  if( word != "RegressionData:" && word != "LabelledRegressionData:" ){
641  errorLog << "loadDatasetFromFile(const string &filename) - Failed to find RegressionData!" << std::endl;
642  file.close();
643  return false;
644  }
645 
646  VectorFloat inputVector(numInputDimensions);
647  VectorFloat targetVector(numTargetDimensions);
648  data.resize( totalNumSamples, RegressionSample(inputVector,targetVector) );
649 
650  for(UINT i=0; i<totalNumSamples; i++){
651  //Read the input Vector
652  for(UINT j=0; j<numInputDimensions; j++){
653  file >> inputVector[j];
654  }
655  for(UINT j=0; j<numTargetDimensions; j++){
656  file >> targetVector[j];
657  }
658  data[i].set(inputVector, targetVector);
659  }
660 
661  file.close();
662  return true;
663 }
664 
665 bool RegressionData::saveDatasetToCSVFile(const std::string &filename) const{
666 
667  std::fstream file;
668  file.open(filename.c_str(), std::ios::out );
669 
670  if( !file.is_open() ){
671  errorLog << "saveDatasetToCSVFile(const string &filename) - Failed to open file!" << std::endl;
672  return false;
673  }
674 
675  //Write the data to the CSV file
676  for(UINT i=0; i<totalNumSamples; i++){
677  for(UINT j=0; j<numInputDimensions; j++){
678  file << data[i].getInputVector()[j] << ",";
679  }
680  for(UINT j=0; j<numTargetDimensions; j++){
681  file << data[i].getTargetVector()[j];
682  if( j != numTargetDimensions-1 ) file << ",";
683  }
684  file << std::endl;
685  }
686 
687  file.close();
688 
689  return true;
690 }
691 
692 bool RegressionData::loadDatasetFromCSVFile(const std::string &filename,const UINT numInputDimensions,const UINT numTargetDimensions){
693 
694  std::fstream file;
695  std::string value;
696  clear();
697  datasetName = "NOT_SET";
698  infoText = "";
699 
700  //Clear any previous data
701  clear();
702 
703  //Parse the CSV file
704  FileParser parser;
705 
706  if( !parser.parseCSVFile(filename,true) ){
707  errorLog << "loadDatasetFromCSVFile(...) - Failed to parse CSV file!" << std::endl;
708  return false;
709  }
710 
711  if( !parser.getConsistentColumnSize() ){
712  errorLog << "loadDatasetFromCSVFile(...) - The CSV file does not have a consistent number of columns!" << std::endl;
713  return false;
714  }
715 
716  if( parser.getColumnSize() != numInputDimensions+numTargetDimensions ){
717  errorLog << "loadDatasetFromCSVFile(...) - The number of columns in the CSV file (" << parser.getColumnSize() << ")";
718  errorLog << " does not match the number of input dimensions plus the number of target dimensions (" << numInputDimensions+numTargetDimensions << ")" << std::endl;
719  return false;
720  }
721 
722  //Setup the labelled classification data
723  setInputAndTargetDimensions(numInputDimensions, numTargetDimensions);
724 
725  UINT n = 0;
726  VectorFloat inputVector(numInputDimensions);
727  VectorFloat targetVector(numTargetDimensions);
728  for(UINT i=0; i<parser.getRowSize(); i++){
729 
730  //Reset n
731  n = 0;
732 
733  //Get the input Vector
734  for(UINT j=0; j<numInputDimensions; j++){
735  inputVector[j] = grt_from_str< Float >( parser[i][n++] );
736  }
737 
738  //Get the target Vector
739  for(UINT j=0; j<numTargetDimensions; j++){
740  targetVector[j] = grt_from_str< Float >( parser[i][n++] );
741  }
742 
743  //Add the labelled sample to the dataset
744  if( !addSample(inputVector, targetVector) ){
745  warningLog << "loadDatasetFromCSVFile(string filename) - Could not add sample " << i << " to the dataset!" << std::endl;
746  }
747  }
748 
749  return true;
750 }
751 
752 GRT_END_NAMESPACE
753 
bool merge(const RegressionData &regressionData)
bool loadDatasetFromCSVFile(const std::string &filename, const UINT numInputDimensions, const UINT numTargetDimensions)
static std::string toString(const int &i)
Definition: Util.cpp:73
bool save(const std::string &filename) const
Vector< MinMax > getInputRanges() const
RegressionData & operator=(const RegressionData &rhs)
Definition: Random.h:40
bool load(const std::string &filename)
virtual bool resize(const unsigned int size)
Definition: Vector.h:133
RegressionData getTrainingFoldData(const UINT foldIndex) const
bool setInfoText(const std::string &infoText)
UINT getNumInputDimensions() const
unsigned int getSize() const
Definition: Vector.h:193
bool setExternalRanges(const Vector< MinMax > &externalInputRanges, const Vector< MinMax > &externalTargetRanges, const bool useExternalRanges)
bool setInputAndTargetDimensions(const UINT numInputDimensions, const UINT numTargetDimensions)
Vector< MinMax > getTargetRanges() const
bool scale(const Float minTarget, const Float maxTarget)
UINT getNumTargetDimensions() const
RegressionData(const UINT numInputDimensions=0, const UINT numTargetDimensions=0, const std::string datasetName="NOT_SET", const std::string infoText="")
bool saveDatasetToCSVFile(const std::string &filename) const
The RegressionData is the main data structure for recording, labeling, managing, saving, and loading datasets that can be used to train and test the GRT supervised regression algorithms.
bool setDatasetName(const std::string &datasetName)
int getRandomNumberInt(int minRange, int maxRange)
Definition: Random.h:88
static bool stringEndsWith(const std::string &str, const std::string &ending)
Definition: Util.cpp:156
bool saveDatasetToFile(const std::string &filename) const
Vector< RegressionSample > getData() const
RegressionData getTestFoldData(const UINT foldIndex) const
bool loadDatasetFromFile(const std::string &filename)
bool enableExternalRangeScaling(const bool useExternalRanges)
RegressionData partition(const UINT trainingSizePercentage)
bool addSample(const VectorFloat &inputVector, const VectorFloat &targetVector)
bool reserve(const UINT N)
bool spiltDataIntoKFolds(const UINT K)
UINT getNumSamples() const