GestureRecognitionToolkit  Version: 0.2.0
The Gesture Recognition Toolkit (GRT) is a cross-platform, open-source, C++ machine learning library for real-time gesture recognition.
TimeSeriesClassificationData.cpp
1 /*
2 GRT MIT License
3 Copyright (c) <2012> <Nicholas Gillian, Media Lab, MIT>
4 
5 Permission is hereby granted, free of charge, to any person obtaining a copy of this software
6 and associated documentation files (the "Software"), to deal in the Software without restriction,
7 including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
9 subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in all copies or substantial
12 portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
15 LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
16 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
17 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
18 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 */
20 
21 #define GRT_DLL_EXPORTS
23 
24 GRT_BEGIN_NAMESPACE
25 
26 TimeSeriesClassificationData::TimeSeriesClassificationData(UINT numDimensions,std::string datasetName,std::string infoText){
27  debugLog.setProceedingText("[DEBUG TSCD]");
28  errorLog.setProceedingText("[ERROR TSCD]");
29  warningLog.setProceedingText("[WARNING TSCD]");
30 
31  this->numDimensions = numDimensions;
32  this->datasetName = datasetName;
33  this->infoText = infoText;
34  totalNumSamples = 0;
35  crossValidationSetup = false;
36  useExternalRanges = false;
37  allowNullGestureClass = true;
38  if( numDimensions > 0 ){
39  setNumDimensions(numDimensions);
40  }
41 }
42 
44 
45  debugLog.setProceedingText("[DEBUG TSCD]");
46  errorLog.setProceedingText("[ERROR TSCD]");
47  warningLog.setProceedingText("[WARNING TSCD]");
48 
49  *this = rhs;
50 }
51 
53 
55  if( this != &rhs){
56  this->datasetName = rhs.datasetName;
57  this->infoText = rhs.infoText;
58  this->numDimensions = rhs.numDimensions;
63  this->totalNumSamples = rhs.totalNumSamples;
64  this->data = rhs.data;
65  this->classTracker = rhs.classTracker;
66  this->externalRanges = rhs.externalRanges;
67  this->debugLog = rhs.debugLog;
68  this->errorLog = rhs.errorLog;
69  this->warningLog = rhs.warningLog;
70  }
71  return *this;
72 }
73 
75  totalNumSamples = 0;
76  data.clear();
77  classTracker.clear();
78 }
79 
80 bool TimeSeriesClassificationData::setNumDimensions(const UINT numDimensions){
81  if( numDimensions > 0 ){
82  //Clear any previous training data
83  clear();
84 
85  //Set the dimensionality of the training data
86  this->numDimensions = numDimensions;
87 
88  useExternalRanges = false;
89  externalRanges.clear();
90 
91  return true;
92  }
93 
94  errorLog << "setNumDimensions(UINT numDimensions) - The number of dimensions of the dataset must be greater than zero!" << std::endl;
95  return false;
96 }
97 
98 bool TimeSeriesClassificationData::setDatasetName(const std::string datasetName){
99 
100  //Make sure there are no spaces in the std::string
101  if( datasetName.find(" ") == std::string::npos ){
102  this->datasetName = datasetName;
103  return true;
104  }
105 
106  errorLog << "setDatasetName(std::string datasetName) - The dataset name cannot contain any spaces!" << std::endl;
107  return false;
108 }
109 
110 bool TimeSeriesClassificationData::setInfoText(const std::string infoText){
111  this->infoText = infoText;
112  return true;
113 }
114 
115 bool TimeSeriesClassificationData::setClassNameForCorrespondingClassLabel(const std::string className,const UINT classLabel){
116 
117  for(UINT i=0; i<classTracker.size(); i++){
118  if( classTracker[i].classLabel == classLabel ){
119  classTracker[i].className = className;
120  return true;
121  }
122  }
123 
124  return false;
125 }
126 
127 bool TimeSeriesClassificationData::setAllowNullGestureClass(const bool allowNullGestureClass){
128  this->allowNullGestureClass = allowNullGestureClass;
129  return true;
130 }
131 
132 bool TimeSeriesClassificationData::addSample(const UINT classLabel,const MatrixFloat &trainingSample){
133 
134  if( trainingSample.getNumCols() != numDimensions ){
135  errorLog << "addSample(UINT classLabel, MatrixFloat trainingSample) - The dimensionality of the training sample (" << trainingSample.getNumCols() << ") does not match that of the dataset (" << numDimensions << ")" << std::endl;
136  return false;
137  }
138 
139  //The class label must be greater than zero (as zero is used for the null rejection class label
140  if( classLabel == GRT_DEFAULT_NULL_CLASS_LABEL && !allowNullGestureClass ){
141  errorLog << "addSample(UINT classLabel, MatrixFloat sample) - the class label can not be 0!" << std::endl;
142  return false;
143  }
144 
145  TimeSeriesClassificationSample newSample(classLabel,trainingSample);
146  data.push_back( newSample );
147  totalNumSamples++;
148 
149  if( classTracker.size() == 0 ){
150  ClassTracker tracker(classLabel,1);
151  classTracker.push_back(tracker);
152  }else{
153  bool labelFound = false;
154  for(UINT i=0; i<classTracker.size(); i++){
155  if( classLabel == classTracker[i].classLabel ){
156  classTracker[i].counter++;
157  labelFound = true;
158  break;
159  }
160  }
161  if( !labelFound ){
162  ClassTracker tracker(classLabel,1);
163  classTracker.push_back(tracker);
164  }
165  }
166  return true;
167 }
168 
170  UINT numExamplesRemoved = 0;
171  UINT numExamplesToRemove = 0;
172 
173  //Find out how many training examples we need to remove
174  for(UINT i=0; i<classTracker.size(); i++){
175  if( classTracker[i].classLabel == classLabel ){
176  numExamplesToRemove = classTracker[i].counter;
177  classTracker.erase(classTracker.begin()+i);
178  break;
179  }
180  }
181 
182  //Remove the samples with the matching class ID
183  if( numExamplesToRemove > 0 ){
184  UINT i=0;
185  while( numExamplesRemoved < numExamplesToRemove ){
186  if( data[i].getClassLabel() == classLabel ){
187  data.erase(data.begin()+i);
188  numExamplesRemoved++;
189  }else if( ++i == data.size() ) break;
190  }
191  }
192 
193  totalNumSamples = (UINT)data.size();
194 
195  return numExamplesRemoved;
196 }
197 
199 
200  if( totalNumSamples > 0 ){
201 
202  //Find the corresponding class ID for the last training example
203  UINT classLabel = data[ totalNumSamples-1 ].getClassLabel();
204 
205  //Remove the training example from the buffer
206  data.erase(data.end()-1);
207 
208  totalNumSamples = (UINT)data.size();
209 
210  //Remove the value from the counter
211  for(UINT i=0; i<classTracker.size(); i++){
212  if( classTracker[i].classLabel == classLabel ){
213  classTracker[i].counter--;
214  break;
215  }
216  }
217 
218  return true;
219 
220  }else return false;
221 
222 }
223 
224 bool TimeSeriesClassificationData::relabelAllSamplesWithClassLabel(const UINT oldClassLabel,const UINT newClassLabel){
225  bool oldClassLabelFound = false;
226  bool newClassLabelAllReadyExists = false;
227  UINT indexOfOldClassLabel = 0;
228  UINT indexOfNewClassLabel = 0;
229 
230  //Find out how many training examples we need to relabel
231  for(UINT i=0; i<classTracker.size(); i++){
232  if( classTracker[i].classLabel == oldClassLabel ){
233  indexOfOldClassLabel = i;
234  oldClassLabelFound = true;
235  }
236  if( classTracker[i].classLabel == newClassLabel ){
237  indexOfNewClassLabel = i;
238  newClassLabelAllReadyExists = true;
239  }
240  }
241 
242  //If the old class label was not found then we can't do anything
243  if( !oldClassLabelFound ){
244  return false;
245  }
246 
247  //Relabel the old class labels
248  for(UINT i=0; i<totalNumSamples; i++){
249  if( data[i].getClassLabel() == oldClassLabel ){
250  data[i].setTrainingSample(newClassLabel, data[i].getData());
251  }
252  }
253 
254  //Update the class label counters
255  if( newClassLabelAllReadyExists ){
256  //Add the old sample count to the new sample count
257  classTracker[ indexOfNewClassLabel ].counter += classTracker[ indexOfOldClassLabel ].counter;
258 
259  //Erase the old class tracker
260  classTracker.erase( classTracker.begin() + indexOfOldClassLabel );
261  }else{
262  //Create a new class tracker
263  classTracker.push_back( ClassTracker(newClassLabel,classTracker[ indexOfOldClassLabel ].counter,classTracker[ indexOfOldClassLabel ].className) );
264  }
265 
266  return true;
267 }
268 
269 bool TimeSeriesClassificationData::setExternalRanges(const Vector< MinMax > &externalRanges,const bool useExternalRanges){
270 
271  if( externalRanges.size() != numDimensions ) return false;
272 
273  this->externalRanges = externalRanges;
274  this->useExternalRanges = useExternalRanges;
275 
276  return true;
277 }
278 
280  if( externalRanges.size() == numDimensions ){
281  this->useExternalRanges = useExternalRanges;
282  return true;
283  }
284  return false;
285 }
286 
287 bool TimeSeriesClassificationData::scale(const Float minTarget,const Float maxTarget){
288  Vector< MinMax > ranges = getRanges();
289  return scale(ranges,minTarget,maxTarget);
290 }
291 
292 bool TimeSeriesClassificationData::scale(const Vector<MinMax> &ranges,const Float minTarget,const Float maxTarget){
293  if( ranges.size() != numDimensions ) return false;
294 
295  //Scale the training data
296  for(UINT i=0; i<totalNumSamples; i++){
297  for(UINT x=0; x<data[i].getLength(); x++){
298  for(UINT j=0; j<numDimensions; j++){
299  data[i][x][j] = Util::scale(data[i][x][j],ranges[j].minValue,ranges[j].maxValue,minTarget,maxTarget);
300  }
301  }
302  }
303 
304  return true;
305 }
306 
307 bool TimeSeriesClassificationData::save(const std::string &filename) const{
308 
309  //Check if the file should be saved as a csv file
310  if( Util::stringEndsWith( filename, ".csv" ) ){
311  return saveDatasetToCSVFile( filename );
312  }
313 
314  //Otherwise save it as a custom GRT file
315  return saveDatasetToFile( filename );
316 }
317 
318 bool TimeSeriesClassificationData::load(const std::string &filename){
319 
320  //Check if the file should be loaded as a csv file
321  if( Util::stringEndsWith( filename, ".csv" ) ){
322  return loadDatasetFromCSVFile( filename );
323  }
324 
325  //Otherwise save it as a custom GRT file
326  return loadDatasetFromFile( filename );
327 }
328 
329 bool TimeSeriesClassificationData::saveDatasetToFile(const std::string fileName) const{
330 
331  std::fstream file;
332  file.open(fileName.c_str(), std::ios::out);
333 
334  if( !file.is_open() ){
335  errorLog << "saveDatasetToFile(std::string fileName) - Failed to open file!" << std::endl;
336  return false;
337  }
338 
339  file << "GRT_LABELLED_TIME_SERIES_CLASSIFICATION_DATA_FILE_V1.0\n";
340  file << "DatasetName: " << datasetName << std::endl;
341  file << "InfoText: " << infoText << std::endl;
342  file << "NumDimensions: "<<numDimensions << std::endl;
343  file << "TotalNumTrainingExamples: "<<totalNumSamples << std::endl;
344  file << "NumberOfClasses: "<<classTracker.size() << std::endl;
345  file << "ClassIDsAndCounters: " << std::endl;
346 
347  for(UINT i=0; i<classTracker.size(); i++){
348  file << classTracker[i].classLabel << "\t" << classTracker[i].counter << std::endl;
349  }
350 
351  file << "UseExternalRanges: " << useExternalRanges << std::endl;
352 
353  if( useExternalRanges ){
354  for(UINT i=0; i<externalRanges.size(); i++){
355  file << externalRanges[i].minValue << "\t" << externalRanges[i].maxValue << std::endl;
356  }
357  }
358 
359  file << "LabelledTimeSeriesTrainingData:\n";
360 
361  for(UINT x=0; x<totalNumSamples; x++){
362  file << "************TIME_SERIES************\n";
363  file << "ClassID: "<<data[x].getClassLabel() << std::endl;
364  file << "TimeSeriesLength: "<<data[x].getLength()<< std::endl;
365  file << "TimeSeriesData: \n";
366  for(UINT i=0; i<data[x].getLength(); i++){
367  for(UINT j=0; j<numDimensions; j++){
368  file << data[x][i][j];
369  if( j<numDimensions-1 ) file << "\t";
370  }file << std::endl;
371  }
372  }
373 
374  file.close();
375  return true;
376 }
377 
378 bool TimeSeriesClassificationData::loadDatasetFromFile(const std::string filename){
379 
380  std::fstream file;
381  file.open(filename.c_str(), std::ios::in);
382  UINT numClasses = 0;
383  clear();
384 
385  if( !file.is_open() ){
386  errorLog << "loadDatasetFromFile(std::string filename) - FILE NOT OPEN!" << std::endl;
387  return false;
388  }
389 
390  std::string word;
391 
392  //Check to make sure this is a file with the Training File Format
393  file >> word;
394  if(word != "GRT_LABELLED_TIME_SERIES_CLASSIFICATION_DATA_FILE_V1.0"){
395  file.close();
396  clear();
397  errorLog << "loadDatasetFromFile(std::string filename) - Failed to find file header!" << std::endl;
398  return false;
399  }
400 
401  //Get the name of the dataset
402  file >> word;
403  if(word != "DatasetName:"){
404  errorLog << "loadDatasetFromFile(std::string filename) - failed to find DatasetName!" << std::endl;
405  file.close();
406  return false;
407  }
408  file >> datasetName;
409 
410  file >> word;
411  if(word != "InfoText:"){
412  errorLog << "loadDatasetFromFile(std::string filename) - failed to find InfoText!" << std::endl;
413  file.close();
414  return false;
415  }
416 
417  //Load the info text
418  file >> word;
419  infoText = "";
420  while( word != "NumDimensions:" ){
421  infoText += word + " ";
422  file >> word;
423  }
424 
425  //Get the number of dimensions in the training data
426  if(word != "NumDimensions:"){
427  file.close();
428  clear();
429  errorLog << "loadDatasetFromFile(std::string filename) - Failed to find NumDimensions!" << std::endl;
430  return false;
431  }
432  file >> numDimensions;
433 
434  //Get the total number of training examples in the training data
435  file >> word;
436  if(word != "TotalNumTrainingExamples:"){
437  file.close();
438  clear();
439  errorLog << "loadDatasetFromFile(std::string filename) - Failed to find TotalNumTrainingExamples!" << std::endl;
440  return false;
441  }
442  file >> totalNumSamples;
443 
444  //Get the total number of classes in the training data
445  file >> word;
446  if(word != "NumberOfClasses:"){
447  file.close();
448  clear();
449  errorLog << "loadDatasetFromFile(std::string filename) - Failed to find NumberOfClasses!" << std::endl;
450  return false;
451  }
452  file >> numClasses;
453 
454  //Resize the class counter buffer and load the counters
455  classTracker.resize(numClasses);
456 
457  //Get the total number of classes in the training data
458  file >> word;
459  if(word != "ClassIDsAndCounters:"){
460  file.close();
461  clear();
462  errorLog << "loadDatasetFromFile(std::string filename) - Failed to find ClassIDsAndCounters!" << std::endl;
463  return false;
464  }
465 
466  for(UINT i=0; i<classTracker.size(); i++){
467  file >> classTracker[i].classLabel;
468  file >> classTracker[i].counter;
469  }
470 
471  //Get the UseExternalRanges
472  file >> word;
473  if(word != "UseExternalRanges:"){
474  file.close();
475  clear();
476  errorLog << "loadDatasetFromFile(std::string filename) - Failed to find UseExternalRanges!" << std::endl;
477  return false;
478  }
479 
480  file >> useExternalRanges;
481 
482  if( useExternalRanges ){
483  externalRanges.resize(numDimensions);
484  for(UINT i=0; i<externalRanges.size(); i++){
485  file >> externalRanges[i].minValue;
486  file >> externalRanges[i].maxValue;
487  }
488  }
489 
490  //Get the main training data
491  file >> word;
492  if(word != "LabelledTimeSeriesTrainingData:"){
493  file.close();
494  clear();
495  errorLog << "loadDatasetFromFile(std::string filename) - Failed to find LabelledTimeSeriesTrainingData!" << std::endl;
496  return false;
497  }
498 
499  //Reset the memory
500  data.resize( totalNumSamples, TimeSeriesClassificationSample() );
501 
502  //Load each of the time series
503  for(UINT x=0; x<totalNumSamples; x++){
504  UINT classLabel = 0;
505  UINT timeSeriesLength = 0;
506 
507  file >> word;
508  if( word != "************TIME_SERIES************" ){
509  file.close();
510  clear();
511  errorLog << "loadDatasetFromFile(std::string filename) - Failed to find TimeSeries Header!" << std::endl;
512  return false;
513  }
514 
515  file >> word;
516  if( word != "ClassID:" ){
517  file.close();
518  clear();
519  errorLog << "loadDatasetFromFile(std::string filename) - Failed to find ClassID!" << std::endl;
520  return false;
521  }
522  file >> classLabel;
523 
524  file >> word;
525  if( word != "TimeSeriesLength:" ){
526  file.close();
527  clear();
528  errorLog << "loadDatasetFromFile(std::string filename) - Failed to find TimeSeriesLength!" << std::endl;
529  return false;
530  }
531  file >> timeSeriesLength;
532 
533  file >> word;
534  if( word != "TimeSeriesData:" ){
535  file.close();
536  clear();
537  errorLog << "loadDatasetFromFile(std::string filename) - Failed to find TimeSeriesData!" << std::endl;
538  return false;
539  }
540 
541  //Load the time series data
542  MatrixFloat trainingExample(timeSeriesLength,numDimensions);
543  for(UINT i=0; i<timeSeriesLength; i++){
544  for(UINT j=0; j<numDimensions; j++){
545  file >> trainingExample[i][j];
546  }
547  }
548 
549  data[x].setTrainingSample(classLabel,trainingExample);
550  }
551 
552  file.close();
553  return true;
554 }
555 
556 bool TimeSeriesClassificationData::saveDatasetToCSVFile(const std::string &filename) const{
557 
558  std::fstream file;
559  file.open(filename.c_str(), std::ios::out );
560 
561  if( !file.is_open() ){
562  return false;
563  }
564 
565  //Write the data to the CSV file
566  for(UINT x=0; x<totalNumSamples; x++){
567  for(UINT i=0; i<data[x].getLength(); i++){
568  file << x+1 << ",";
569  file << data[x].getClassLabel() << ",";
570  for(UINT j=0; j<numDimensions; j++){
571  file << data[x][i][j];
572  if( j+1 < numDimensions ){
573  file << ",";
574  }
575  }
576  file << std::endl;
577  }
578  }
579 
580  file.close();
581 
582  return true;
583 }
584 
586 
587  numDimensions = 0;
588  datasetName = "NOT_SET";
589  infoText = "";
590 
591  //Clear any previous data
592  clear();
593 
594  //Parse the CSV file
595  FileParser parser;
596 
597  if( !parser.parseCSVFile(filename,true) ){
598  errorLog << "loadDatasetFromCSVFile(const std::string &filename) - Failed to parse CSV file!" << std::endl;
599  return false;
600  }
601 
602  if( !parser.getConsistentColumnSize() ){
603  errorLog << "loadDatasetFromCSVFile(const std::string &filename) - The CSV file does not have a consistent number of columns!" << std::endl;
604  return false;
605  }
606 
607  if( parser.getColumnSize() <= 2 ){
608  errorLog << "loadDatasetFromCSVFile(const std::string &filename) - The CSV file does not have enough columns! It should contain at least three columns!" << std::endl;
609  return false;
610  }
611 
612  //Set the number of dimensions
613  numDimensions = parser.getColumnSize()-2;
614 
615  //Reserve the memory for the data
616  data.reserve( parser.getRowSize() );
617 
618  UINT sampleCounter = 0;
619  UINT lastSampleCounter = 0;
620  UINT classLabel = 0;
621  UINT j = 0;
622  UINT n = 0;
623  VectorFloat sample(numDimensions);
624  MatrixFloat timeseries;
625  for(UINT i=0; i<parser.getRowSize(); i++){
626 
627  sampleCounter = grt_from_str< UINT >( parser[i][0] );
628 
629  //Check to see if a new timeseries has started, if so then add the previous time series as a sample and start recording the new time series
630  if( sampleCounter != lastSampleCounter && i != 0 ){
631  //Add the labelled sample to the dataset
632  if( !addSample(classLabel, timeseries) ){
633  warningLog << "loadDatasetFromCSVFile(const std::string &filename,const UINT classLabelColumnIndex) - Could not add sample " << i << " to the dataset!" << std::endl;
634  }
635  timeseries.clear();
636  }
637  lastSampleCounter = sampleCounter;
638 
639  //Get the class label
640  classLabel = grt_from_str< UINT >( parser[i][1] );
641 
642  //Get the sample data
643  j=0;
644  n=2;
645  while( j != numDimensions ){
646  sample[j++] = grt_from_str< Float >( parser[i][n] );
647  n++;
648  }
649 
650  //Add the sample to the timeseries
651  timeseries.push_back( sample );
652  }
653  if ( timeseries.getSize() > 0 )
654  //Add the labelled sample to the dataset
655  if( !addSample(classLabel, timeseries) ){
656  warningLog << "loadDatasetFromCSVFile(const std::string &filename,const UINT classLabelColumnIndex) - Could not add sample " << parser.getRowSize()-1 << " to the dataset!" << std::endl;
657  }
658 
659  return true;
660 }
661 
663 
664  std::cout << getStatsAsString();
665 
666  return true;
667 }
668 
670 
671  std::string stats;
672 
673  stats += "DatasetName:\t" + datasetName + "\n";
674  stats += "DatasetInfo:\t" + infoText + "\n";
675  stats += "Number of Dimensions:\t" + Util::toString(numDimensions) + "\n";
676  stats += "Number of Samples:\t" + Util::toString(totalNumSamples) + "\n";
677  stats += "Number of Classes:\t" + Util::toString(getNumClasses()) + "\n";
678  stats += "ClassStats:\n";
679 
680  for(UINT k=0; k<getNumClasses(); k++){
681  stats += "ClassLabel:\t" + Util::toString(classTracker[k].classLabel);
682  stats += "\tNumber of Samples:\t" + Util::toString( classTracker[k].counter );
683  stats +="\tClassName:\t" + classTracker[k].className + "\n";
684  }
685 
686  Vector< MinMax > ranges = getRanges();
687 
688  stats += "Dataset Ranges:\n";
689  for(UINT j=0; j<ranges.size(); j++){
690  stats += "[" + Util::toString( j+1 ) + "] Min:\t" + Util::toString( ranges[j].minValue ) + "\tMax: " + Util::toString( ranges[j].maxValue ) + "\n";
691  }
692 
693  stats += "Timeseries Lengths:\n";
694  UINT M = (UINT)data.size();
695  for(UINT j=0; j<M; j++){
696  stats += "ClassLabel: " + Util::toString( data[j].getClassLabel() ) + " Length:\t" + Util::toString( data[j].getLength() ) + "\n";
697  }
698 
699  return stats;
700 }
701 
702 TimeSeriesClassificationData TimeSeriesClassificationData::partition(const UINT trainingSizePercentage,const bool useStratifiedSampling){
703  return split( trainingSizePercentage, useStratifiedSampling );
704 }
705 
706 TimeSeriesClassificationData TimeSeriesClassificationData::split(const UINT trainingSizePercentage,const bool useStratifiedSampling){
707 
708  //Partitions the dataset into a training dataset (which is kept by this instance of the TimeSeriesClassificationData) and
709  //a testing/validation dataset (which is return as a new instance of the TimeSeriesClassificationData). The trainingSizePercentage
710  //therefore sets the size of the data which remains in this instance and the remaining percentage of data is then added to
711  //the testing/validation dataset
712 
713  //The dataset has changed so flag that any previous cross validation setup will now not work
714  crossValidationSetup = false;
715  crossValidationIndexs.clear();
716 
722 
723  //Create the random partion indexs
724  Random random;
725  UINT randomIndex = 0;
726 
727  if( useStratifiedSampling ){
728  //Break the data into seperate classes
729  Vector< Vector< UINT > > classData( getNumClasses() );
730 
731  //Add the indexs to their respective classes
732  for(UINT i=0; i<totalNumSamples; i++){
733  classData[ getClassLabelIndexValue( data[i].getClassLabel() ) ].push_back( i );
734  }
735 
736  //Randomize the order of the indexs in each of the class index buffers
737  for(UINT k=0; k<getNumClasses(); k++){
738  UINT numSamples = (UINT)classData[k].size();
739  for(UINT x=0; x<numSamples; x++){
740  //Pick a random index
741  randomIndex = random.getRandomNumberInt(0,numSamples);
742 
743  //Swap the indexs
744  SWAP( classData[k][ x ] ,classData[k][ randomIndex ] );
745  }
746  }
747 
748  //Loop over each class and add the data to the trainingSet and testSet
749  for(UINT k=0; k<getNumClasses(); k++){
750  UINT numTrainingExamples = (UINT) floor( Float(classData[k].size()) / 100.0 * Float(trainingSizePercentage) );
751 
752  //Add the data to the training and test sets
753  for(UINT i=0; i<numTrainingExamples; i++){
754  trainingSet.addSample( data[ classData[k][i] ].getClassLabel(), data[ classData[k][i] ].getData() );
755  }
756  for(UINT i=numTrainingExamples; i<classData[k].size(); i++){
757  testSet.addSample( data[ classData[k][i] ].getClassLabel(), data[ classData[k][i] ].getData() );
758  }
759  }
760 
761  //Overwrite the training data in this instance with the training data of the trainingSet
762  data = trainingSet.getClassificationData();
763  totalNumSamples = trainingSet.getNumSamples();
764  }else{
765 
766  const UINT numTrainingExamples = (UINT) floor( Float(totalNumSamples) / 100.0 * Float(trainingSizePercentage) );
767  //Create the random partion indexs
768  Random random;
769  for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
770  for(UINT x=0; x<totalNumSamples; x++){
771  //Pick a random index
772  randomIndex = random.getRandomNumberInt(0,totalNumSamples);
773 
774  //Swap the indexs
775  SWAP( indexs[ x ] , indexs[ randomIndex ] );
776  }
777 
778  //Add the data to the training and test sets
779  for(UINT i=0; i<numTrainingExamples; i++){
780  trainingSet.addSample( data[ indexs[i] ].getClassLabel(), data[ indexs[i] ].getData() );
781  }
782  for(UINT i=numTrainingExamples; i<totalNumSamples; i++){
783  testSet.addSample( data[ indexs[i] ].getClassLabel(), data[ indexs[i] ].getData() );
784  }
785 
786  //Overwrite the training data in this instance with the training data of the trainingSet
787  data = trainingSet.getClassificationData();
788  totalNumSamples = trainingSet.getNumSamples();
789  }
790 
791  return testSet;
792 }
793 
795 
796  if( labelledData.getNumDimensions() != numDimensions ){
797  errorLog << "merge(TimeSeriesClassificationData &labelledData) - The number of dimensions in the labelledData (" << labelledData.getNumDimensions() << ") does not match the number of dimensions of this dataset (" << numDimensions << ")" << std::endl;
798  return false;
799  }
800 
801  //The dataset has changed so flag that any previous cross validation setup will now not work
802  crossValidationSetup = false;
803  crossValidationIndexs.clear();
804 
805  //Add the data from the labelledData to this instance
806  for(UINT i=0; i<labelledData.getNumSamples(); i++){
807  addSample(labelledData[i].getClassLabel(), labelledData[i].getData());
808  }
809 
810  //Set the class names from the dataset
812  for(UINT i=0; i<classTracker.size(); i++){
813  setClassNameForCorrespondingClassLabel(classTracker[i].className, classTracker[i].classLabel);
814  }
815 
816  return true;
817 }
818 
819 bool TimeSeriesClassificationData::spiltDataIntoKFolds(const UINT K,const bool useStratifiedSampling){
820 
821  crossValidationSetup = false;
822  crossValidationIndexs.clear();
823 
824  //K can not be zero
825  if( K > totalNumSamples ){
826  errorLog << "spiltDataIntoKFolds(UINT K) - K can not be zero!" << std::endl;
827  return false;
828  }
829 
830  //K can not be larger than the number of examples
831  if( K > totalNumSamples ){
832  errorLog << "spiltDataIntoKFolds(UINT K,bool useStratifiedSampling) - K can not be larger than the total number of samples in the dataset!" << std::endl;
833  return false;
834  }
835 
836  //K can not be larger than the number of examples in a specific class if the stratified sampling option is true
837  if( useStratifiedSampling ){
838  for(UINT c=0; c<classTracker.size(); c++){
839  if( K > classTracker[c].counter ){
840  errorLog << "spiltDataIntoKFolds(UINT K,bool useStratifiedSampling) - K can not be larger than the number of samples in any given class!" << std::endl;
841  return false;
842  }
843  }
844  }
845 
846  //Setup the dataset for k-fold cross validation
847  kFoldValue = K;
849 
850  //Work out how many samples are in each fold, the last fold might have more samples than the others
851  UINT numSamplesPerFold = (UINT) floor( totalNumSamples/Float(K) );
852 
853  //Resize the cross validation indexs buffer
855 
856  //Create the random partion indexs
857  Random random;
858  UINT randomIndex = 0;
859 
860  if( useStratifiedSampling ){
861  //Break the data into seperate classes
862  Vector< Vector< UINT > > classData( getNumClasses() );
863 
864  //Add the indexs to their respective classes
865  for(UINT i=0; i<totalNumSamples; i++){
866  classData[ getClassLabelIndexValue( data[i].getClassLabel() ) ].push_back( i );
867  }
868 
869  //Randomize the order of the indexs in each of the class index buffers
870  for(UINT c=0; c<getNumClasses(); c++){
871  UINT numSamples = (UINT)classData[c].size();
872  for(UINT x=0; x<numSamples; x++){
873  //Pick a random index
874  randomIndex = random.getRandomNumberInt(0,numSamples);
875 
876  //Swap the indexs
877  SWAP( classData[c][ x ] , classData[c][ randomIndex ] );
878  }
879  }
880 
881  //Loop over each of the classes and add the data equally to each of the k folds until there is no data left
883  for(UINT c=0; c<getNumClasses(); c++){
884  iter = classData[ c ].begin();
885  UINT k = 0;
886  while( iter != classData[c].end() ){
887  crossValidationIndexs[ k ].push_back( *iter );
888  iter++;
889  k++;
890  k = k % K;
891  }
892  }
893 
894  }else{
895  //Randomize the order of the data
896  for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
897  for(UINT x=0; x<totalNumSamples; x++){
898  //Pick a random index
899  randomIndex = random.getRandomNumberInt(0,totalNumSamples);
900 
901  //Swap the indexs
902  SWAP( indexs[ x ] , indexs[ randomIndex ] );
903  }
904 
905  UINT counter = 0;
906  UINT foldIndex = 0;
907  for(UINT i=0; i<totalNumSamples; i++){
908  //Add the index to the current fold
909  crossValidationIndexs[ foldIndex ].push_back( indexs[i] );
910 
911  //Move to the next fold if ready
912  if( ++counter == numSamplesPerFold && foldIndex < K-1 ){
913  foldIndex++;
914  counter = 0;
915  }
916  }
917  }
918 
919  crossValidationSetup = true;
920  return true;
921 
922 }
923 
925 
926  TimeSeriesClassificationData trainingData;
927 
928  if( !crossValidationSetup ){
929  errorLog << "getTrainingFoldData(UINT foldIndex) - Cross Validation has not been setup! You need to call the spiltDataIntoKFolds(UINT K,bool useStratifiedSampling) function first before calling this function!" << std::endl;
930  return trainingData;
931  }
932 
933  if( foldIndex >= kFoldValue ) return trainingData;
934 
935  trainingData.setNumDimensions( numDimensions );
936 
937  //Add the data to the training set, this will consist of all the data that is NOT in the foldIndex
938  UINT index = 0;
939  for(UINT k=0; k<kFoldValue; k++){
940  if( k != foldIndex ){
941  for(UINT i=0; i<crossValidationIndexs[k].size(); i++){
942 
943  index = crossValidationIndexs[k][i];
944  trainingData.addSample( data[ index ].getClassLabel(), data[ index ].getData() );
945  }
946  }
947  }
948 
949  return trainingData;
950 }
951 
954 
955  if( !crossValidationSetup ) return testData;
956 
957  if( foldIndex >= kFoldValue ) return testData;
958 
959  //Add the data to the training
960  testData.setNumDimensions( numDimensions );
961 
962  UINT index = 0;
963  for(UINT i=0; i<crossValidationIndexs[ foldIndex ].size(); i++){
964 
965  index = crossValidationIndexs[ foldIndex ][i];
966  testData.addSample( data[ index ].getClassLabel(), data[ index ].getData() );
967  }
968 
969  return testData;
970 }
971 
974  for(UINT x=0; x<totalNumSamples; x++){
975  if( data[x].getClassLabel() == classLabel ){
976  classData.addSample( classLabel, data[x].getData() );
977  }
978  }
979  return classData;
980 }
981 
983 
984  UnlabelledData unlabelledData;
985 
986  if( totalNumSamples == 0 ){
987  return unlabelledData;
988  }
989 
990  unlabelledData.setNumDimensions( numDimensions );
991 
992  for(UINT i=0; i<totalNumSamples; i++){
993  for(UINT x=0; x<data[i].getLength(); x++){
994  unlabelledData.addSample( data[i].getData().getRow( x ) );
995  }
996  }
997 
998  return unlabelledData;
999 }
1000 
1002  UINT minClassLabel = 99999;
1003 
1004  for(UINT i=0; i<classTracker.size(); i++){
1005  if( classTracker[i].classLabel < minClassLabel ){
1006  minClassLabel = classTracker[i].classLabel;
1007  }
1008  }
1009 
1010  return minClassLabel;
1011 }
1012 
1013 
1015  UINT maxClassLabel = 0;
1016 
1017  for(UINT i=0; i<classTracker.size(); i++){
1018  if( classTracker[i].classLabel > maxClassLabel ){
1019  maxClassLabel = classTracker[i].classLabel;
1020  }
1021  }
1022 
1023  return maxClassLabel;
1024 }
1025 
1027  for(UINT k=0; k<classTracker.size(); k++){
1028  if( classTracker[k].classLabel == classLabel ){
1029  return k;
1030  }
1031  }
1032  warningLog << "getClassLabelIndexValue(UINT classLabel) - Failed to find class label: " << classLabel << " in class tracker!" << std::endl;
1033  return 0;
1034 }
1035 
1037 
1038  for(UINT i=0; i<classTracker.size(); i++){
1039  if( classTracker[i].classLabel == classLabel ){
1040  return classTracker[i].className;
1041  }
1042  }
1043  return "CLASS_LABEL_NOT_FOUND";
1044 }
1045 
1047 
1048  if( useExternalRanges ) return externalRanges;
1049 
1050  Vector<MinMax> ranges(numDimensions);
1051 
1052  if( totalNumSamples > 0 ){
1053  for(UINT j=0; j<numDimensions; j++){
1054  ranges[j].minValue = data[0][0][0];
1055  ranges[j].maxValue = data[0][0][0];
1056  for(UINT x=0; x<totalNumSamples; x++){
1057  for(UINT i=0; i<data[x].getLength(); i++){
1058  if( data[x][i][j] < ranges[j].minValue ){ ranges[j].minValue = data[x][i][j]; } //Search for the min value
1059  else if( data[x][i][j] > ranges[j].maxValue ){ ranges[j].maxValue = data[x][i][j]; } //Search for the max value
1060  }
1061  }
1062  }
1063  }
1064  return ranges;
1065 }
1066 
1068 
1069  //Count how many samples are in the entire dataset
1070  UINT M = 0;
1071  UINT index = 0;
1072  for(UINT x=0; x<totalNumSamples; x++){
1073  M += data[x].getLength();
1074  }
1075 
1076  if( M == 0 ) MatrixFloat();
1077 
1078  //Get all the data and concatenate it into 1 matrix
1079  MatrixFloat matrixData(M,numDimensions);
1080  for(UINT x=0; x<totalNumSamples; x++){
1081  for(UINT i=0; i<data[x].getLength(); i++){
1082  for(UINT j=0; j<numDimensions; j++){
1083  matrixData[index][j] = data[x][i][j];
1084  }
1085  index++;
1086  }
1087  }
1088  return matrixData;
1089 }
1090 
1091 GRT_END_NAMESPACE
1092 
void clear()
Definition: Matrix.h:522
bool spiltDataIntoKFolds(const UINT K, const bool useStratifiedSampling=false)
bool loadDatasetFromCSVFile(const std::string &filename)
unsigned int getSize() const
Definition: Matrix.h:564
static std::string toString(const int &i)
Definition: Util.cpp:74
bool addSample(const VectorFloat &sample)
UINT numDimensions
The number of dimensions in the dataset.
static Float scale(const Float &x, const Float &minSource, const Float &maxSource, const Float &minTarget, const Float &maxTarget, const bool constrain=false)
Definition: Util.cpp:53
bool setInfoText(const std::string infoText)
Vector< MinMax > externalRanges
A vector containing a set of externalRanges set by the user.
bool merge(const TimeSeriesClassificationData &labelledData)
Vector< TimeSeriesClassificationSample > data
The labelled time series classification data.
UINT kFoldValue
The number of folds the dataset has been split into for cross validation.
Definition: Random.h:40
bool setNumDimensions(const UINT numDimensions)
virtual bool resize(const unsigned int size)
Definition: Vector.h:133
TimeSeriesClassificationData & operator=(const TimeSeriesClassificationData &rhs)
bool saveDatasetToFile(const std::string filename) const
bool useExternalRanges
A flag to show if the dataset should be scaled using the externalRanges values.
UINT totalNumSamples
The total number of samples in the dataset.
Vector< ClassTracker > getClassTracker() const
bool setExternalRanges(const Vector< MinMax > &externalRanges, const bool useExternalRanges=false)
The TimeSeriesClassificationData is the main data structure for recording, labeling, managing, saving, and loading training data for supervised temporal learning problems. Unlike the ClassificationData, in which each sample consists of a single N-dimensional datum, a TimeSeriesClassificationData sample consists of an N-dimensional time series of length M. The length of each time series sample (i.e. M) can be different for each datum in the dataset.
bool setNumDimensions(const UINT numDimensions)
WarningLog warningLog
Default warning log.
bool relabelAllSamplesWithClassLabel(const UINT oldClassLabel, const UINT newClassLabel)
UINT eraseAllSamplesWithClassLabel(const UINT classLabel)
bool allowNullGestureClass
A flag that enables/disables a user from adding new samples with a class label matching the default n...
bool enableExternalRangeScaling(const bool useExternalRanges)
unsigned int getNumCols() const
Definition: Matrix.h:549
std::string getClassNameForCorrespondingClassLabel(const UINT classLabel) const
DebugLog debugLog
Default debugging log.
UINT getClassLabelIndexValue(const UINT classLabel) const
bool crossValidationSetup
A flag to show if the dataset is ready for cross validation.
bool addSample(const UINT classLabel, const MatrixFloat &trainingSample)
std::string datasetName
The name of the dataset.
bool scale(const Float minTarget, const Float maxTarget)
TimeSeriesClassificationData getClassData(const UINT classLabel) const
bool setClassNameForCorrespondingClassLabel(const std::string className, const UINT classLabel)
bool saveDatasetToCSVFile(const std::string &filename) const
ErrorLog errorLog
Default error log.
Vector< ClassTracker > classTracker
A vector of ClassTracker, which keeps track of the number of samples of each class.
UnlabelledData reformatAsUnlabelledData() const
int getRandomNumberInt(int minRange, int maxRange)
Definition: Random.h:88
TimeSeriesClassificationData split(const UINT partitionPercentage, const bool useStratifiedSampling=false)
static bool stringEndsWith(const std::string &str, const std::string &ending)
Definition: Util.cpp:157
TimeSeriesClassificationData(UINT numDimensions=0, std::string datasetName="NOT_SET", std::string infoText="")
std::string infoText
Some infoText about the dataset.
bool setDatasetName(const std::string datasetName)
Vector< TimeSeriesClassificationSample > getClassificationData() const
bool push_back(const Vector< T > &sample)
Definition: Matrix.h:401
Vector< Vector< UINT > > crossValidationIndexs
A vector to hold the indices of the dataset for the cross validation.
bool setAllowNullGestureClass(const bool allowNullGestureClass)
TimeSeriesClassificationData getTrainingFoldData(const UINT foldIndex) const
bool load(const std::string &filename)
bool save(const std::string &filename) const
bool loadDatasetFromFile(const std::string filename)
TimeSeriesClassificationData getTestFoldData(const UINT foldIndex) const