25 RegressionData::RegressionData(
const UINT numInputDimensions,
const UINT numTargetDimensions,
const std::string datasetName,
const std::string infoText):totalNumSamples(0){
26 this->numInputDimensions = numInputDimensions;
27 this->numTargetDimensions = numTargetDimensions;
28 this->datasetName = datasetName;
29 this->infoText = infoText;
31 crossValidationSetup =
false;
32 useExternalRanges =
false;
33 debugLog.setProceedingText(
"[DEBUG LRD]");
34 errorLog.setProceedingText(
"[ERROR LRD]");
35 warningLog.setProceedingText(
"[WARNING LRD]");
// Member-wise copy of every field from rhs into this object.
// NOTE(review): presumably the body of operator=(const RegressionData &rhs) - confirm against the enclosing definition.
// Descriptive metadata, dimensionality and sample count.
46 this->datasetName = rhs.datasetName;
47 this->infoText = rhs.infoText;
48 this->numInputDimensions = rhs.numInputDimensions;
49 this->numTargetDimensions = rhs.numTargetDimensions;
50 this->totalNumSamples = rhs.totalNumSamples;
// Cross-validation state and external-range configuration.
51 this->kFoldValue = rhs.kFoldValue;
52 this->crossValidationSetup = rhs.crossValidationSetup;
53 this->useExternalRanges = rhs.useExternalRanges;
54 this->externalInputRanges = rhs.externalInputRanges;
55 this->externalTargetRanges = rhs.externalTargetRanges;
// The samples themselves and the per-fold index lists that refer to them.
56 this->data = rhs.data;
57 this->crossValidationIndexs = rhs.crossValidationIndexs;
// The logging objects are copied as well.
58 this->debugLog = rhs.debugLog;
59 this->errorLog = rhs.errorLog;
60 this->warningLog = rhs.warningLog;
// Drop any previously prepared cross-validation split: clear the flag and the fold index lists.
// NOTE(review): presumably part of the dataset's reset routine - confirm against the enclosing definition.
68 crossValidationSetup =
false;
70 crossValidationIndexs.clear();
// setInputAndTargetDimensions: the new dimensions are accepted only when both are non-zero.
75 if( numInputDimensions > 0 && numTargetDimensions > 0 ){
76 this->numInputDimensions = numInputDimensions;
77 this->numTargetDimensions = numTargetDimensions;
// External ranges are size-checked against the dimensions when they are set,
// so disable and discard them whenever the dimensions change.
80 useExternalRanges =
false;
81 externalInputRanges.clear();
82 externalTargetRanges.clear();
// Otherwise the request is reported as invalid.
85 errorLog <<
"setInputAndTargetDimensions(UINT numInputDimensions,UINT numTargetDimensions) - The number of input and target dimensions should be greater than zero!" << std::endl;
// setDatasetName: a name is accepted only when it contains no space character.
92 if( datasetName.find(
" ") == std::string::npos ){
93 this->datasetName = datasetName;
// A name containing a space is rejected and reported.
97 errorLog <<
"setDatasetName(const string &datasetName) - The dataset name cannot contain any spaces!" << std::endl;
// setInfoText: store the caller-supplied description on the dataset (no validation is applied here).
102 this->infoText = infoText;
// addSample: both vectors must match the dataset's configured dimensions exactly.
107 if( inputVector.
getSize() == numInputDimensions && targetVector.
getSize() == numTargetDimensions ){
// The dataset has changed, so any existing cross-validation split no longer describes it.
112 crossValidationSetup =
false;
113 crossValidationIndexs.clear();
// Size mismatch: report it.
116 errorLog <<
"addSample(const VectorFloat &inputVector,const VectorFloat &targetVector) - The inputVector size or targetVector size does not match the size of the numInputDimensions or numTargetDimensions" << std::endl;
// removeLastSample: only act when at least one sample is stored.
121 if( totalNumSamples > 0 ){
// Erase the final element and re-derive the sample count from the buffer itself.
123 data.erase(data.end()-1);
124 totalNumSamples = data.
getSize();
// The dataset has changed, so any existing cross-validation split is discarded.
127 crossValidationSetup =
false;
128 crossValidationIndexs.clear();
// Nothing to remove: emit a warning rather than an error.
131 warningLog <<
"removeLastSample() - There are no samples to remove!" << std::endl;
// reserve: report success when the sample buffer's capacity is at least N.
139 if( data.capacity() >= N )
return true;
// setExternalRanges: reject range vectors whose lengths differ from the dataset's dimensions.
146 if( externalInputRanges.
getSize() != numInputDimensions )
return false;
147 if( externalTargetRanges.
getSize() != numTargetDimensions )
return false;
// Both sizes are valid: store the ranges together with the caller's on/off choice.
149 this->externalInputRanges = externalInputRanges;
150 this->externalTargetRanges = externalTargetRanges;
151 this->useExternalRanges = useExternalRanges;
157 if( externalInputRanges.
getSize() != numInputDimensions && externalTargetRanges.
getSize() != numTargetDimensions ){
158 this->useExternalRanges = useExternalRanges;
// scale(minTarget,maxTarget): forward to the overload that takes explicit input and target ranges.
167 return scale(inputRanges,targetRanges,minTarget,maxTarget);
// scale(ranges...): only proceed when one range is supplied per input and per target dimension.
171 if( inputVectorRanges.
getSize() == numInputDimensions && targetVectorRanges.
getSize() == numTargetDimensions ){
// Scratch vectors that hold the rescaled values for the sample being processed.
173 VectorFloat scaledInputVector(numInputDimensions,0);
174 VectorFloat scaledTargetVector(numTargetDimensions,0);
175 for(UINT i=0; i<totalNumSamples; i++){
// Map each input value from its [minValue,maxValue] range to [minTarget,maxTarget] via grt_scale.
178 for(UINT j=0; j<numInputDimensions; j++){
179 scaledInputVector[j] = grt_scale(data[i].getInputVectorValue(j),inputVectorRanges[j].minValue,inputVectorRanges[j].maxValue,minTarget,maxTarget);
// Same mapping for the target values, using the target ranges.
182 for(UINT j=0; j<numTargetDimensions; j++){
183 scaledTargetVector[j] = grt_scale(data[i].getTargetVectorValue(j),targetVectorRanges[j].minValue,targetVectorRanges[j].maxValue,minTarget,maxTarget);
// Overwrite the stored sample with its scaled version.
186 data[i].set(scaledInputVector,scaledTargetVector);
// getInputRanges: when external ranges are enabled, hand those back unchanged.
196 if( useExternalRanges )
return externalInputRanges;
// Otherwise derive each input dimension's min/max from the stored samples (skipped when empty).
200 if( totalNumSamples > 0 ){
201 for(UINT j=0; j<numInputDimensions; j++){
// Seed both bounds with the first sample so the scan below only ever widens them.
202 ranges[j].minValue = data[0].getInputVectorValue(j);
203 ranges[j].maxValue = data[0].getInputVectorValue(j);
204 for(UINT i=0; i<totalNumSamples; i++){
// A value cannot be both below the minimum and above the maximum, hence the else-if.
205 if( data[i].getInputVectorValue(j) < ranges[j].minValue ){ ranges[j].minValue = data[i].getInputVectorValue(j); }
206 else if( data[i].getInputVectorValue(j) > ranges[j].maxValue ){ ranges[j].maxValue = data[i].getInputVectorValue(j); }
// getTargetRanges: when external ranges are enabled, hand those back unchanged.
215 if( useExternalRanges )
return externalTargetRanges;
// Otherwise derive each target dimension's min/max from the stored samples (skipped when empty).
219 if( totalNumSamples > 0 ){
220 for(UINT j=0; j<numTargetDimensions; j++){
// Seed both bounds with the first sample so the scan below only ever widens them.
221 ranges[j].minValue = data[0].getTargetVectorValue(j);
222 ranges[j].maxValue = data[0].getTargetVectorValue(j);
223 for(UINT i=0; i<totalNumSamples; i++){
// A value cannot be both below the minimum and above the maximum, hence the else-if.
224 if( data[i].getTargetVectorValue(j) < ranges[j].minValue ){ ranges[j].minValue = data[i].getTargetVectorValue(j); }
225 else if( data[i].getTargetVectorValue(j) > ranges[j].maxValue ){ ranges[j].maxValue = data[i].getTargetVectorValue(j); }
// Builds a human-readable, multi-line summary of the dataset and returns it as a string.
232 std::string RegressionData::getStatsAsString()
const{
234 std::string statsText;
// One "label:<tab>value" line each for the name, description, dimensions and sample count.
235 statsText +=
"DatasetName:\t" + datasetName +
"\n";
236 statsText +=
"DatasetInfo:\t" + infoText +
"\n";
237 statsText +=
"Number of Input Dimensions:\t" +
Util::toString( numInputDimensions ) +
"\n";
238 statsText +=
"Number of Target Dimensions:\t" +
Util::toString( numTargetDimensions ) +
"\n";
239 statsText +=
"Number of Samples:\t" +
Util::toString( totalNumSamples ) +
"\n";
// A section listing the range of every input dimension.
243 statsText +=
"Dataset Input Dimension Ranges:\n";
244 for(UINT j=0; j<inputRanges.size(); j++){
// A section listing the range of every target dimension.
250 statsText +=
"Dataset Target Dimension Ranges:\n";
251 for(UINT j=0; j<targetRanges.size(); j++){
// Writes the summary produced by getStatsAsString() to standard output.
257 bool RegressionData::printStats()
const{
258 std::cout << getStatsAsString();
// partition: number of samples for the training portion = floor(totalNumSamples * trainingSizePercentage / 100).
// NOTE(review): a trainingSizePercentage above 100 would make this exceed totalNumSamples and the
// first copy loop below would then index past the available samples - confirm callers never pass that.
269 const UINT numTrainingExamples = (UINT) floor( Float(totalNumSamples) / 100.0 * Float(trainingSizePercentage) );
// Both result sets share this dataset's dimensions.
271 RegressionData trainingSet(numInputDimensions,numTargetDimensions);
// Shuffle an identity index list by swapping every position with a randomly chosen one.
277 UINT randomIndex = 0;
278 for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
279 for(UINT x=0; x<totalNumSamples; x++){
281 SWAP( indexs[ x ] , indexs[ randomIndex ] );
// The first numTrainingExamples shuffled samples form the training set...
285 for(UINT i=0; i<numTrainingExamples; i++){
286 trainingSet.
addSample( data[ indexs[i] ].getInputVector(), data[ indexs[i] ].getTargetVector() );
// ...and the remainder form the test set.
288 for(UINT i=numTrainingExamples; i<totalNumSamples; i++){
289 testSet.
addSample( data[ indexs[i] ].getInputVector(), data[ indexs[i] ].getTargetVector() );
// Any previous cross-validation split is discarded.
297 crossValidationSetup =
false;
298 crossValidationIndexs.clear();
306 errorLog <<
"merge(RegressionData ®ressionData) - The number of input dimensions in the regressionData (" << regressionData.
getNumInputDimensions() <<
") does not match the number of input dimensions of this dataset (" << numInputDimensions <<
")" << std::endl;
311 errorLog <<
"merge(RegressionData ®ressionData) - The number of target dimensions in the regressionData (" << regressionData.
getNumTargetDimensions() <<
") does not match the number of target dimensions of this dataset (" << numTargetDimensions <<
")" << std::endl;
317 addSample(regressionData[i].getInputVector(), regressionData[i].getTargetVector());
321 crossValidationSetup =
false;
322 crossValidationIndexs.clear();
329 crossValidationSetup =
false;
330 crossValidationIndexs.clear();
333 if( K > totalNumSamples ){
334 errorLog <<
"spiltDataIntoKFolds(UINT K) - K can not be zero!" << std::endl;
339 if( K > totalNumSamples ){
340 errorLog <<
"spiltDataIntoKFolds(UINT K) - K can not be larger than the total number of samples in the dataset!" << std::endl;
349 UINT numSamplesPerFold = (UINT) floor( totalNumSamples/Float(K) );
352 crossValidationIndexs.
resize(K);
356 UINT randomIndex = 0;
359 for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
360 for(UINT x=0; x<totalNumSamples; x++){
365 SWAP( indexs[ x ] , indexs[ randomIndex ] );
370 for(UINT i=0; i<totalNumSamples; i++){
372 crossValidationIndexs[ foldIndex ].push_back( indexs[i] );
375 if( ++counter == numSamplesPerFold && foldIndex < K-1 ){
381 crossValidationSetup =
true;
// getTrainingFoldData: folds must have been prepared beforehand; otherwise report the misuse.
389 if( !crossValidationSetup ){
390 errorLog <<
"getTrainingFoldData(UINT foldIndex) - Cross Validation has not been setup! You need to call the spiltDataIntoKFolds(UINT K,bool useStratifiedSampling) function first before calling this function!" << std::endl;
// An out-of-range fold index returns the training set as it stands, without logging.
394 if( foldIndex >= kFoldValue )
return trainingData;
// The training data is every fold except the requested one.
400 for(UINT k=0; k<kFoldValue; k++){
401 if( k != foldIndex ){
402 for(UINT i=0; i<crossValidationIndexs[k].size(); i++){
// Each fold stores indices into data; copy the referenced sample.
404 index = crossValidationIndexs[k][i];
405 trainingData.
addSample( data[ index ].getInputVector(), data[ index ].getTargetVector() );
// getTestFoldData: without a prepared split, or with an out-of-range fold index,
// the test set is returned as it stands and nothing is logged.
416 if( !crossValidationSetup )
return testData;
418 if( foldIndex >= kFoldValue )
return testData;
// The test data is exactly the requested fold.
424 for(UINT i=0; i<crossValidationIndexs[ foldIndex ].size(); i++){
// Each fold stores indices into data; copy the referenced sample.
426 index = crossValidationIndexs[ foldIndex ][i];
427 testData.
addSample( data[ index ].getInputVector(), data[ index ].getTargetVector() );
433 UINT RegressionData::removeDuplicateSamples(){
435 UINT numSamplesRemoved = 0;
438 sort(data.begin(),data.end(),RegressionSample::sortByInputVectorAscending );
441 Float minDist = 1.0e-5;
443 Float totalDimensions = numInputDimensions + numTargetDimensions;
444 bool keepSearching =
true;
448 if( currentSample == data.end() ) keepSearching =
false;
449 if( nextSample == data.end() ) keepSearching =
false;
451 while( keepSearching ){
453 for(UINT i=0; i<numInputDimensions; i++){
454 dist += SQR( currentSample->getInputVectorValue(i) - nextSample->getInputVectorValue(i) );
456 for(UINT i=0; i<numTargetDimensions; i++){
457 dist += SQR( currentSample->getTargetVectorValue(i) - nextSample->getTargetVectorValue(i) );
459 dist /= totalDimensions;
460 if( dist <= minDist ){
462 currentSample = data.erase( nextSample );
463 nextSample = currentSample + 1;
465 debugLog <<
"Removing sample with dist: " << dist << std::endl;
471 if( currentSample == data.end() ) keepSearching =
false;
472 if( nextSample == data.end() ) keepSearching =
false;
475 return numSamplesRemoved;
// saveDatasetToFile: open the destination for writing; report a failure to open.
503 file.open(filename.c_str(), std::ios::out);
505 if( !file.is_open() ){
506 errorLog <<
"saveDatasetToFile(const string &filename) - Failed to open file!" << std::endl;
// Header: format identifier followed by one "Key: value" line per dataset property.
510 file <<
"GRT_LABELLED_REGRESSION_DATA_FILE_V1.0\n";
511 file <<
"DatasetName: " << datasetName << std::endl;
512 file <<
"InfoText: " << infoText << std::endl;
513 file <<
"NumInputDimensions: "<<numInputDimensions << std::endl;
514 file <<
"NumTargetDimensions: "<<numTargetDimensions << std::endl;
515 file <<
"TotalNumTrainingExamples: "<<totalNumSamples << std::endl;
516 file <<
"UseExternalRanges: " << useExternalRanges << std::endl;
// External ranges are written only when enabled: one "min<tab>max" line per dimension,
// input ranges first, then target ranges.
518 if( useExternalRanges ){
519 for(UINT i=0; i<externalInputRanges.
getSize(); i++){
520 file << externalInputRanges[i].minValue <<
"\t" << externalInputRanges[i].maxValue << std::endl;
522 for(UINT i=0; i<externalTargetRanges.
getSize(); i++){
523 file << externalTargetRanges[i].minValue <<
"\t" << externalTargetRanges[i].maxValue << std::endl;
// Data section: each sample's input values then target values, tab separated.
527 file <<
"RegressionData:\n";
529 for(UINT i=0; i<totalNumSamples; i++){
530 for(UINT j=0; j<numInputDimensions; j++){
531 file << data[i].getInputVectorValue(j) <<
"\t";
533 for(UINT j=0; j<numTargetDimensions; j++){
534 file << data[i].getTargetVectorValue(j);
// No trailing tab after the last target value.
535 if( j!= numTargetDimensions-1 ) file <<
"\t";
547 file.open(filename.c_str(), std::ios::in);
550 if( !file.is_open() ){
551 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to open file!" << std::endl;
559 if(word !=
"GRT_LABELLED_REGRESSION_DATA_FILE_V1.0"){
560 errorLog <<
"loadDatasetFromFile(const string &filename) - Unknown file header!" << std::endl;
567 if(word !=
"DatasetName:"){
568 errorLog <<
"loadDatasetFromFile(const string &filename) - failed to find DatasetName!" << std::endl;
575 if(word !=
"InfoText:"){
576 errorLog <<
"loadDatasetFromFile(const string &filename) - failed to find InfoText!" << std::endl;
584 while( word !=
"NumInputDimensions:" ){
585 infoText += word +
" ";
590 if(word !=
"NumInputDimensions:"){
591 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to find NumInputDimensions!" << std::endl;
595 file >> numInputDimensions;
599 if(word !=
"NumTargetDimensions:"){
600 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to find NumTargetDimensions!" << std::endl;
604 file >> numTargetDimensions;
608 if(word !=
"TotalNumTrainingExamples:"){
609 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to find TotalNumTrainingExamples!" << std::endl;
613 file >> totalNumSamples;
617 if(word !=
"UseExternalRanges:"){
618 errorLog <<
"loadDatasetFromFile(const string &filename) - failed to find DatasetName!" << std::endl;
622 file >> useExternalRanges;
625 if( useExternalRanges ){
626 externalInputRanges.
resize(numInputDimensions);
627 externalTargetRanges.
resize(numTargetDimensions);
628 for(UINT i=0; i<externalInputRanges.size(); i++){
629 file >> externalInputRanges[i].minValue;
630 file >> externalInputRanges[i].maxValue;
632 for(UINT i=0; i<externalTargetRanges.size(); i++){
633 file >> externalTargetRanges[i].minValue;
634 file >> externalTargetRanges[i].maxValue;
640 if( word !=
"RegressionData:" && word !=
"LabelledRegressionData:" ){
641 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to find RegressionData!" << std::endl;
650 for(UINT i=0; i<totalNumSamples; i++){
652 for(UINT j=0; j<numInputDimensions; j++){
653 file >> inputVector[j];
655 for(UINT j=0; j<numTargetDimensions; j++){
656 file >> targetVector[j];
658 data[i].set(inputVector, targetVector);
// saveDatasetToCSVFile: open the destination for writing; report a failure to open.
668 file.open(filename.c_str(), std::ios::out );
670 if( !file.is_open() ){
671 errorLog <<
"saveDatasetToCSVFile(const string &filename) - Failed to open file!" << std::endl;
// Each sample is written as its input values followed by its target values, comma separated.
676 for(UINT i=0; i<totalNumSamples; i++){
677 for(UINT j=0; j<numInputDimensions; j++){
678 file << data[i].getInputVector()[j] <<
",";
680 for(UINT j=0; j<numTargetDimensions; j++){
681 file << data[i].getTargetVector()[j];
// No trailing comma after the last target value.
682 if( j != numTargetDimensions-1 ) file <<
",";
// loadDatasetFromCSVFile: the dataset name is set to the placeholder value.
697 datasetName =
"NOT_SET";
// Parse the whole file up front; report a parse failure.
706 if( !parser.parseCSVFile(filename,
true) ){
707 errorLog <<
"loadDatasetFromCSVFile(...) - Failed to parse CSV file!" << std::endl;
// Every row must have the same number of columns...
711 if( !parser.getConsistentColumnSize() ){
712 errorLog <<
"loadDatasetFromCSVFile(...) - The CSV file does not have a consistent number of columns!" << std::endl;
// ...and that number must equal input dimensions + target dimensions.
716 if( parser.getColumnSize() != numInputDimensions+numTargetDimensions ){
717 errorLog <<
"loadDatasetFromCSVFile(...) - The number of columns in the CSV file (" << parser.getColumnSize() <<
")";
718 errorLog <<
" does not match the number of input dimensions plus the number of target dimensions (" << numInputDimensions+numTargetDimensions <<
")" << std::endl;
// Convert each row: n walks the columns, input values first, then target values.
728 for(UINT i=0; i<parser.getRowSize(); i++){
734 for(UINT j=0; j<numInputDimensions; j++){
735 inputVector[j] = grt_from_str< Float >( parser[i][n++] );
739 for(UINT j=0; j<numTargetDimensions; j++){
740 targetVector[j] = grt_from_str< Float >( parser[i][n++] );
// A row that cannot be added is only warned about.
744 if( !
addSample(inputVector, targetVector) ){
745 warningLog <<
"loadDatasetFromCSVFile(string filename) - Could not add sample " << i <<
" to the dataset!" << std::endl;
bool merge(const RegressionData &regressionData)
bool loadDatasetFromCSVFile(const std::string &filename, const UINT numInputDimensions, const UINT numTargetDimensions)
static std::string toString(const int &i)
bool save(const std::string &filename) const
Vector< MinMax > getInputRanges() const
RegressionData & operator=(const RegressionData &rhs)
bool load(const std::string &filename)
virtual bool resize(const unsigned int size)
RegressionData getTrainingFoldData(const UINT foldIndex) const
bool setInfoText(const std::string &infoText)
UINT getNumInputDimensions() const
unsigned int getSize() const
bool setExternalRanges(const Vector< MinMax > &externalInputRanges, const Vector< MinMax > &externalTargetRanges, const bool useExternalRanges)
bool setInputAndTargetDimensions(const UINT numInputDimensions, const UINT numTargetDimensions)
Vector< MinMax > getTargetRanges() const
bool scale(const Float minTarget, const Float maxTarget)
UINT getNumTargetDimensions() const
RegressionData(const UINT numInputDimensions=0, const UINT numTargetDimensions=0, const std::string datasetName="NOT_SET", const std::string infoText="")
bool saveDatasetToCSVFile(const std::string &filename) const
The RegressionData is the main data structure for recording, labeling, managing, saving, and loading datasets that can be used to train and test the GRT supervised regression algorithms.
bool setDatasetName(const std::string &datasetName)
int getRandomNumberInt(int minRange, int maxRange)
static bool stringEndsWith(const std::string &str, const std::string &ending)
bool saveDatasetToFile(const std::string &filename) const
Vector< RegressionSample > getData() const
RegressionData getTestFoldData(const UINT foldIndex) const
bool loadDatasetFromFile(const std::string &filename)
bool enableExternalRangeScaling(const bool useExternalRanges)
RegressionData partition(const UINT trainingSizePercentage)
bool addSample(const VectorFloat &inputVector, const VectorFloat &targetVector)
bool reserve(const UINT N)
bool spiltDataIntoKFolds(const UINT K)
UINT getNumSamples() const