21 #define GRT_DLL_EXPORTS 38 if( numDimensions > 0 ){
81 if( numDimensions > 0 ){
94 errorLog <<
"setNumDimensions(UINT numDimensions) - The number of dimensions of the dataset must be greater than zero!" << std::endl;
101 if( datasetName.find(
" ") == std::string::npos ){
106 errorLog <<
"setDatasetName(std::string datasetName) - The dataset name cannot contain any spaces!" << std::endl;
135 errorLog <<
"addSample(UINT classLabel, MatrixFloat trainingSample) - The dimensionality of the training sample (" << trainingSample.
getNumCols() <<
") does not match that of the dataset (" <<
numDimensions <<
")" << std::endl;
141 errorLog <<
"addSample(UINT classLabel, MatrixFloat sample) - the class label can not be 0!" << std::endl;
146 data.push_back( newSample );
153 bool labelFound =
false;
170 UINT numExamplesRemoved = 0;
171 UINT numExamplesToRemove = 0;
183 if( numExamplesToRemove > 0 ){
185 while( numExamplesRemoved < numExamplesToRemove ){
186 if(
data[i].getClassLabel() == classLabel ){
188 numExamplesRemoved++;
189 }
else if( ++i ==
data.size() )
break;
195 return numExamplesRemoved;
225 bool oldClassLabelFound =
false;
226 bool newClassLabelAllReadyExists =
false;
227 UINT indexOfOldClassLabel = 0;
228 UINT indexOfNewClassLabel = 0;
233 indexOfOldClassLabel = i;
234 oldClassLabelFound =
true;
237 indexOfNewClassLabel = i;
238 newClassLabelAllReadyExists =
true;
243 if( !oldClassLabelFound ){
249 if(
data[i].getClassLabel() == oldClassLabel ){
250 data[i].setTrainingSample(newClassLabel,
data[i].getData());
255 if( newClassLabelAllReadyExists ){
289 return scale(ranges,minTarget,maxTarget);
297 for(UINT x=0; x<
data[i].getLength(); x++){
299 data[i][x][j] =
Util::scale(
data[i][x][j],ranges[j].minValue,ranges[j].maxValue,minTarget,maxTarget);
332 file.open(fileName.c_str(), std::ios::out);
334 if( !file.is_open() ){
335 errorLog <<
"saveDatasetToFile(std::string fileName) - Failed to open file!" << std::endl;
339 file <<
"GRT_LABELLED_TIME_SERIES_CLASSIFICATION_DATA_FILE_V1.0\n";
340 file <<
"DatasetName: " <<
datasetName << std::endl;
341 file <<
"InfoText: " <<
infoText << std::endl;
344 file <<
"NumberOfClasses: "<<
classTracker.size() << std::endl;
345 file <<
"ClassIDsAndCounters: " << std::endl;
359 file <<
"LabelledTimeSeriesTrainingData:\n";
362 file <<
"************TIME_SERIES************\n";
363 file <<
"ClassID: "<<
data[x].getClassLabel() << std::endl;
364 file <<
"TimeSeriesLength: "<<
data[x].getLength()<< std::endl;
365 file <<
"TimeSeriesData: \n";
366 for(UINT i=0; i<
data[x].getLength(); i++){
368 file <<
data[x][i][j];
369 if( j<numDimensions-1 ) file <<
"\t";
381 file.open(filename.c_str(), std::ios::in);
385 if( !file.is_open() ){
386 errorLog <<
"loadDatasetFromFile(std::string filename) - FILE NOT OPEN!" << std::endl;
394 if(word !=
"GRT_LABELLED_TIME_SERIES_CLASSIFICATION_DATA_FILE_V1.0"){
397 errorLog <<
"loadDatasetFromFile(std::string filename) - Failed to find file header!" << std::endl;
403 if(word !=
"DatasetName:"){
404 errorLog <<
"loadDatasetFromFile(std::string filename) - failed to find DatasetName!" << std::endl;
411 if(word !=
"InfoText:"){
412 errorLog <<
"loadDatasetFromFile(std::string filename) - failed to find InfoText!" << std::endl;
420 while( word !=
"NumDimensions:" ){
426 if(word !=
"NumDimensions:"){
429 errorLog <<
"loadDatasetFromFile(std::string filename) - Failed to find NumDimensions!" << std::endl;
436 if(word !=
"TotalNumTrainingExamples:"){
439 errorLog <<
"loadDatasetFromFile(std::string filename) - Failed to find TotalNumTrainingExamples!" << std::endl;
446 if(word !=
"NumberOfClasses:"){
449 errorLog <<
"loadDatasetFromFile(std::string filename) - Failed to find NumberOfClasses!" << std::endl;
459 if(word !=
"ClassIDsAndCounters:"){
462 errorLog <<
"loadDatasetFromFile(std::string filename) - Failed to find ClassIDsAndCounters!" << std::endl;
473 if(word !=
"UseExternalRanges:"){
476 errorLog <<
"loadDatasetFromFile(std::string filename) - Failed to find UseExternalRanges!" << std::endl;
482 if( useExternalRanges ){
492 if(word !=
"LabelledTimeSeriesTrainingData:"){
495 errorLog <<
"loadDatasetFromFile(std::string filename) - Failed to find LabelledTimeSeriesTrainingData!" << std::endl;
505 UINT timeSeriesLength = 0;
508 if( word !=
"************TIME_SERIES************" ){
511 errorLog <<
"loadDatasetFromFile(std::string filename) - Failed to find TimeSeries Header!" << std::endl;
516 if( word !=
"ClassID:" ){
519 errorLog <<
"loadDatasetFromFile(std::string filename) - Failed to find ClassID!" << std::endl;
525 if( word !=
"TimeSeriesLength:" ){
528 errorLog <<
"loadDatasetFromFile(std::string filename) - Failed to find TimeSeriesLength!" << std::endl;
531 file >> timeSeriesLength;
534 if( word !=
"TimeSeriesData:" ){
537 errorLog <<
"loadDatasetFromFile(std::string filename) - Failed to find TimeSeriesData!" << std::endl;
542 MatrixFloat trainingExample(timeSeriesLength,numDimensions);
543 for(UINT i=0; i<timeSeriesLength; i++){
545 file >> trainingExample[i][j];
549 data[x].setTrainingSample(classLabel,trainingExample);
559 file.open(filename.c_str(), std::ios::out );
561 if( !file.is_open() ){
567 for(UINT i=0; i<
data[x].getLength(); i++){
569 file <<
data[x].getClassLabel() <<
",";
571 file <<
data[x][i][j];
572 if( j+1 < numDimensions ){
597 if( !parser.parseCSVFile(filename,
true) ){
598 errorLog <<
"loadDatasetFromCSVFile(const std::string &filename) - Failed to parse CSV file!" << std::endl;
602 if( !parser.getConsistentColumnSize() ){
603 errorLog <<
"loadDatasetFromCSVFile(const std::string &filename) - The CSV file does not have a consistent number of columns!" << std::endl;
607 if( parser.getColumnSize() <= 2 ){
608 errorLog <<
"loadDatasetFromCSVFile(const std::string &filename) - The CSV file does not have enough columns! It should contain at least three columns!" << std::endl;
616 data.reserve( parser.getRowSize() );
618 UINT sampleCounter = 0;
619 UINT lastSampleCounter = 0;
625 for(UINT i=0; i<parser.getRowSize(); i++){
627 sampleCounter = grt_from_str< UINT >( parser[i][0] );
630 if( sampleCounter != lastSampleCounter && i != 0 ){
632 if( !
addSample(classLabel, timeseries) ){
633 warningLog <<
"loadDatasetFromCSVFile(const std::string &filename,const UINT classLabelColumnIndex) - Could not add sample " << i <<
" to the dataset!" << std::endl;
637 lastSampleCounter = sampleCounter;
640 classLabel = grt_from_str< UINT >( parser[i][1] );
646 sample[j++] = grt_from_str< Float >( parser[i][n] );
653 if ( timeseries.
getSize() > 0 )
655 if( !
addSample(classLabel, timeseries) ){
656 warningLog <<
"loadDatasetFromCSVFile(const std::string &filename,const UINT classLabelColumnIndex) - Could not add sample " << parser.getRowSize()-1 <<
" to the dataset!" << std::endl;
674 stats +=
"DatasetInfo:\t" +
infoText +
"\n";
678 stats +=
"ClassStats:\n";
683 stats +=
"\tClassName:\t" +
classTracker[k].className +
"\n";
688 stats +=
"Dataset Ranges:\n";
689 for(UINT j=0; j<ranges.size(); j++){
693 stats +=
"Timeseries Lengths:\n";
694 UINT M = (UINT)
data.size();
695 for(UINT j=0; j<M; j++){
702 TimeSeriesClassificationData TimeSeriesClassificationData::partition(
const UINT trainingSizePercentage,
const bool useStratifiedSampling){
703 return split( trainingSizePercentage, useStratifiedSampling );
725 UINT randomIndex = 0;
727 if( useStratifiedSampling ){
738 UINT numSamples = (UINT)classData[k].size();
739 for(UINT x=0; x<numSamples; x++){
744 SWAP( classData[k][ x ] ,classData[k][ randomIndex ] );
750 UINT numTrainingExamples = (UINT) floor( Float(classData[k].size()) / 100.0 * Float(trainingSizePercentage) );
753 for(UINT i=0; i<numTrainingExamples; i++){
754 trainingSet.
addSample(
data[ classData[k][i] ].getClassLabel(),
data[ classData[k][i] ].getData() );
756 for(UINT i=numTrainingExamples; i<classData[k].size(); i++){
757 testSet.
addSample(
data[ classData[k][i] ].getClassLabel(),
data[ classData[k][i] ].getData() );
766 const UINT numTrainingExamples = (UINT) floor( Float(
totalNumSamples) / 100.0 * Float(trainingSizePercentage) );
775 SWAP( indexs[ x ] , indexs[ randomIndex ] );
779 for(UINT i=0; i<numTrainingExamples; i++){
780 trainingSet.
addSample(
data[ indexs[i] ].getClassLabel(),
data[ indexs[i] ].getData() );
783 testSet.
addSample(
data[ indexs[i] ].getClassLabel(),
data[ indexs[i] ].getData() );
797 errorLog <<
"merge(TimeSeriesClassificationData &labelledData) - The number of dimensions in the labelledData (" << labelledData.
getNumDimensions() <<
") does not match the number of dimensions of this dataset (" <<
numDimensions <<
")" << std::endl;
807 addSample(labelledData[i].getClassLabel(), labelledData[i].getData());
812 for(UINT i=0; i<classTracker.size(); i++){
826 errorLog <<
"spiltDataIntoKFolds(UINT K) - K can not be zero!" << std::endl;
832 errorLog <<
"spiltDataIntoKFolds(UINT K,bool useStratifiedSampling) - K can not be larger than the total number of samples in the dataset!" << std::endl;
837 if( useStratifiedSampling ){
840 errorLog <<
"spiltDataIntoKFolds(UINT K,bool useStratifiedSampling) - K can not be larger than the number of samples in any given class!" << std::endl;
858 UINT randomIndex = 0;
860 if( useStratifiedSampling ){
871 UINT numSamples = (UINT)classData[c].size();
872 for(UINT x=0; x<numSamples; x++){
877 SWAP( classData[c][ x ] , classData[c][ randomIndex ] );
884 iter = classData[ c ].begin();
886 while( iter != classData[c].end() ){
902 SWAP( indexs[ x ] , indexs[ randomIndex ] );
912 if( ++counter == numSamplesPerFold && foldIndex < K-1 ){
929 errorLog <<
"getTrainingFoldData(UINT foldIndex) - Cross Validation has not been setup! You need to call the spiltDataIntoKFolds(UINT K,bool useStratifiedSampling) function first before calling this function!" << std::endl;
933 if( foldIndex >=
kFoldValue )
return trainingData;
940 if( k != foldIndex ){
944 trainingData.
addSample(
data[ index ].getClassLabel(),
data[ index ].getData() );
957 if( foldIndex >=
kFoldValue )
return testData;
975 if(
data[x].getClassLabel() == classLabel ){
987 return unlabelledData;
993 for(UINT x=0; x<
data[i].getLength(); x++){
998 return unlabelledData;
1002 UINT minClassLabel = 99999;
1010 return minClassLabel;
1015 UINT maxClassLabel = 0;
1023 return maxClassLabel;
1032 warningLog <<
"getClassLabelIndexValue(UINT classLabel) - Failed to find class label: " << classLabel <<
" in class tracker!" << std::endl;
1043 return "CLASS_LABEL_NOT_FOUND";
1054 ranges[j].minValue =
data[0][0][0];
1055 ranges[j].maxValue =
data[0][0][0];
1057 for(UINT i=0; i<
data[x].getLength(); i++){
1058 if(
data[x][i][j] < ranges[j].minValue ){ ranges[j].minValue =
data[x][i][j]; }
1059 else if(
data[x][i][j] > ranges[j].maxValue ){ ranges[j].maxValue =
data[x][i][j]; }
1073 M +=
data[x].getLength();
1081 for(UINT i=0; i<
data[x].getLength(); i++){
1083 matrixData[index][j] =
data[x][i][j];
std::string getStatsAsString() const
bool spiltDataIntoKFolds(const UINT K, const bool useStratifiedSampling=false)
bool loadDatasetFromCSVFile(const std::string &filename)
unsigned int getSize() const
static std::string toString(const int &i)
bool addSample(const VectorFloat &sample)
virtual ~TimeSeriesClassificationData()
UINT numDimensions
The number of dimensions in the dataset.
static Float scale(const Float &x, const Float &minSource, const Float &maxSource, const Float &minTarget, const Float &maxTarget, const bool constrain=false)
bool setInfoText(const std::string infoText)
Vector< MinMax > externalRanges
A vector containing a set of externalRanges set by the user.
bool merge(const TimeSeriesClassificationData &labelledData)
Vector< TimeSeriesClassificationSample > data
The labelled time series classification data.
UINT kFoldValue
The number of folds the dataset has been split into for cross-validation.
This file contains the Random class, a useful wrapper for generating cross platform random functions...
bool setNumDimensions(const UINT numDimensions)
virtual bool resize(const unsigned int size)
TimeSeriesClassificationData & operator=(const TimeSeriesClassificationData &rhs)
virtual bool setKey(const std::string &key)
Sets the key that gets written at the start of each message; this will be written in the format 'key ...
bool saveDatasetToFile(const std::string filename) const
Vector< MinMax > getRanges() const
bool useExternalRanges
A flag to show if the dataset should be scaled using the externalRanges values.
UINT totalNumSamples
The total number of samples in the dataset.
Vector< ClassTracker > getClassTracker() const
bool setExternalRanges(const Vector< MinMax > &externalRanges, const bool useExternalRanges=false)
The TimeSeriesClassificationData is the main data structure for recording, labeling, managing, saving, and loading training data for supervised temporal learning problems. Unlike ClassificationData, in which each sample consists of a single N-dimensional datum, each TimeSeriesClassificationData sample consists of an N-dimensional time series of length M. The length of each time series (i.e. M) can differ from sample to sample within the dataset.
MatrixFloat getDataAsMatrixFloat() const
bool setNumDimensions(const UINT numDimensions)
WarningLog warningLog
Default warning log.
bool relabelAllSamplesWithClassLabel(const UINT oldClassLabel, const UINT newClassLabel)
UINT eraseAllSamplesWithClassLabel(const UINT classLabel)
bool allowNullGestureClass
A flag that enables/disables a user from adding new samples with a class label matching the default n...
bool enableExternalRangeScaling(const bool useExternalRanges)
unsigned int getNumCols() const
std::string getClassNameForCorrespondingClassLabel(const UINT classLabel) const
DebugLog debugLog
Default debugging log.
UINT getClassLabelIndexValue(const UINT classLabel) const
bool crossValidationSetup
A flag to show if the dataset is ready for cross validation.
bool addSample(const UINT classLabel, const MatrixFloat &trainingSample)
UINT getMaximumClassLabel() const
std::string datasetName
The name of the dataset.
bool scale(const Float minTarget, const Float maxTarget)
TimeSeriesClassificationData getClassData(const UINT classLabel) const
bool setClassNameForCorrespondingClassLabel(const std::string className, const UINT classLabel)
UINT getNumDimensions() const
bool saveDatasetToCSVFile(const std::string &filename) const
ErrorLog errorLog
Default error log.
Vector< ClassTracker > classTracker
A vector of ClassTracker, which keeps track of the number of samples of each class.
UnlabelledData reformatAsUnlabelledData() const
int getRandomNumberInt(int minRange, int maxRange)
TimeSeriesClassificationData split(const UINT partitionPercentage, const bool useStratifiedSampling=false)
static bool stringEndsWith(const std::string &str, const std::string &ending)
TimeSeriesClassificationData(UINT numDimensions=0, std::string datasetName="NOT_SET", std::string infoText="")
UINT getMinimumClassLabel() const
std::string infoText
Some infoText about the dataset.
UINT getNumClasses() const
bool setDatasetName(const std::string datasetName)
UINT getNumSamples() const
Vector< TimeSeriesClassificationSample > getClassificationData() const
bool push_back(const Vector< T > &sample)
Vector< Vector< UINT > > crossValidationIndexs
A vector to hold the indices of the dataset for cross-validation.
bool setAllowNullGestureClass(const bool allowNullGestureClass)
TimeSeriesClassificationData getTrainingFoldData(const UINT foldIndex) const
bool load(const std::string &filename)
bool save(const std::string &filename) const
bool loadDatasetFromFile(const std::string filename)
TimeSeriesClassificationData getTestFoldData(const UINT foldIndex) const