// Constructs an empty dataset: stores the requested input/target dimensionality, the dataset name and
// the info text, starts with zero samples, and disables cross validation and external-range scaling.
// NOTE(review): the "#define GRT_DLL_EXPORTS" shares a line with the signature in this listing, and the
// constructor's closing brace is not among the visible lines.
21 #define GRT_DLL_EXPORTS 26 RegressionData::RegressionData(
const UINT numInputDimensions,
const UINT numTargetDimensions,
const std::string datasetName,
const std::string infoText):totalNumSamples(0){
27 this->numInputDimensions = numInputDimensions;
28 this->numTargetDimensions = numTargetDimensions;
29 this->datasetName = datasetName;
30 this->infoText = infoText;
32 crossValidationSetup =
false;
33 useExternalRanges =
false;
// Give each log its own key so messages from this class can be told apart by severity.
34 debugLog.
setKey(
"[DEBUG RegressionData]");
35 errorLog.
setKey(
"[ERROR RegressionData]");
36 warningLog.setKey(
"[WARNING RegressionData]");
// Body of the assignment operator (its signature is not among the visible lines): member-wise copy of
// every field of rhs, including the cross-validation state, the external ranges and the three logs.
// NOTE(review): no self-assignment guard is visible in these lines — confirm whether one precedes them.
47 this->datasetName = rhs.datasetName;
48 this->infoText = rhs.infoText;
49 this->numInputDimensions = rhs.numInputDimensions;
50 this->numTargetDimensions = rhs.numTargetDimensions;
51 this->totalNumSamples = rhs.totalNumSamples;
52 this->kFoldValue = rhs.kFoldValue;
53 this->crossValidationSetup = rhs.crossValidationSetup;
54 this->useExternalRanges = rhs.useExternalRanges;
55 this->externalInputRanges = rhs.externalInputRanges;
56 this->externalTargetRanges = rhs.externalTargetRanges;
57 this->data = rhs.data;
58 this->crossValidationIndexs = rhs.crossValidationIndexs;
59 this->debugLog = rhs.debugLog;
60 this->errorLog = rhs.errorLog;
61 this->warningLog = rhs.warningLog;
// Invalidates any cross-validation setup: clears the flag and drops the stored fold indexes.
// NOTE(review): presumably part of the dataset's clear/reset routine — the enclosing signature and the
// neighbouring statements are not among the visible lines.
69 crossValidationSetup =
false;
71 crossValidationIndexs.clear();
// Body of setInputAndTargetDimensions: the new dimensions are accepted only when both are non-zero.
76 if( numInputDimensions > 0 && numTargetDimensions > 0 ){
77 this->numInputDimensions = numInputDimensions;
78 this->numTargetDimensions = numTargetDimensions;
// External ranges are sized per dimension, so they are discarded along with the old dimensionality.
81 useExternalRanges =
false;
82 externalInputRanges.clear();
83 externalTargetRanges.clear();
// NOTE(review): presumably the failure path for a zero dimension — the branch structure between these
// lines is not visible.
86 errorLog <<
"setInputAndTargetDimensions(UINT numInputDimensions,UINT numTargetDimensions) - The number of input and target dimensions should be greater than zero!" << std::endl;
// Body of setDatasetName: a name is accepted only if it contains no space character.
93 if( datasetName.find(
" ") == std::string::npos ){
94 this->datasetName = datasetName;
// NOTE(review): presumably the rejection path for names containing a space — the statements between
// these lines are not visible.
98 errorLog <<
"setDatasetName(const string &datasetName) - The dataset name cannot contain any spaces!" << std::endl;
// Stores the caller's free-form info text without validation.
// NOTE(review): presumably the body of setInfoText — the signature is not among the visible lines.
103 this->infoText = infoText;
// Body of addSample: the sample is only considered when both vectors match the dataset's dimensions.
// NOTE(review): the statements that append the sample (listing lines 109-112) are not visible here.
108 if( inputVector.
getSize() == numInputDimensions && targetVector.
getSize() == numTargetDimensions ){
// Changing the dataset invalidates any previously computed folds.
113 crossValidationSetup =
false;
114 crossValidationIndexs.clear();
// Reported when either vector has the wrong size.
117 errorLog <<
"addSample(const VectorFloat &inputVector,const VectorFloat &targetVector) - The inputVector size or targetVector size does not match the size of the numInputDimensions or numTargetDimensions" << std::endl;
// Body of removeLastSample: removes the final element of data when the dataset is not empty.
122 if( totalNumSamples > 0 ){
124 data.erase(data.end()-1);
// Re-read the count from the container rather than decrementing it.
125 totalNumSamples = data.
getSize();
// Changing the dataset invalidates any previously computed folds.
128 crossValidationSetup =
false;
129 crossValidationIndexs.clear();
// Warning for an already-empty dataset.
132 warningLog <<
"removeLastSample() - There are no samples to remove!" << std::endl;
// Reports success when the container's capacity is already at least N.
// NOTE(review): presumably the tail of reserve(const UINT N) — the call that requests the capacity and
// the failure return are not among the visible lines.
140 if( data.capacity() >= N )
return true;
// Body of setExternalRanges: both range vectors must hold exactly one entry per dimension; otherwise
// nothing is stored and false is returned.
147 if( externalInputRanges.
getSize() != numInputDimensions )
return false;
148 if( externalTargetRanges.
getSize() != numTargetDimensions )
return false;
// Sizes are valid: keep the ranges and the caller's choice of whether to use them.
150 this->externalInputRanges = externalInputRanges;
151 this->externalTargetRanges = externalTargetRanges;
152 this->useExternalRanges = useExternalRanges;
158 if( externalInputRanges.
getSize() != numInputDimensions && externalTargetRanges.
getSize() != numTargetDimensions ){
159 this->useExternalRanges = useExternalRanges;
// Delegates to the four-argument scale overload using the locally obtained input and target ranges.
// NOTE(review): the statements that fill inputRanges/targetRanges are not among the visible lines.
168 return scale(inputRanges,targetRanges,minTarget,maxTarget);
// Body of the range-based scale overload: runs only when one range is supplied per input dimension and
// per target dimension, then rewrites every sample in place.
172 if( inputVectorRanges.
getSize() == numInputDimensions && targetVectorRanges.
getSize() == numTargetDimensions ){
// Scratch vectors reused for every sample.
174 VectorFloat scaledInputVector(numInputDimensions,0);
175 VectorFloat scaledTargetVector(numTargetDimensions,0);
176 for(UINT i=0; i<totalNumSamples; i++){
// Map each input value from its dimension's [minValue, maxValue] to [minTarget, maxTarget].
179 for(UINT j=0; j<numInputDimensions; j++){
180 scaledInputVector[j] = grt_scale(data[i].getInputVectorValue(j),inputVectorRanges[j].minValue,inputVectorRanges[j].maxValue,minTarget,maxTarget);
// Target values are mapped to the same [minTarget, maxTarget] interval.
183 for(UINT j=0; j<numTargetDimensions; j++){
184 scaledTargetVector[j] = grt_scale(data[i].getTargetVectorValue(j),targetVectorRanges[j].minValue,targetVectorRanges[j].maxValue,minTarget,maxTarget);
// Replace the sample with its scaled version.
187 data[i].set(scaledInputVector,scaledTargetVector);
// Body of getInputRanges: external ranges take precedence when enabled; otherwise the min/max of every
// input dimension is computed from the stored samples.
197 if( useExternalRanges )
return externalInputRanges;
// NOTE(review): the declaration of `ranges` is not among the visible lines.
201 if( totalNumSamples > 0 ){
202 for(UINT j=0; j<numInputDimensions; j++){
// Seed min and max with the first sample, then widen them over all samples.
203 ranges[j].minValue = data[0].getInputVectorValue(j);
204 ranges[j].maxValue = data[0].getInputVectorValue(j);
205 for(UINT i=0; i<totalNumSamples; i++){
206 if( data[i].getInputVectorValue(j) < ranges[j].minValue ){ ranges[j].minValue = data[i].getInputVectorValue(j); }
207 else if( data[i].getInputVectorValue(j) > ranges[j].maxValue ){ ranges[j].maxValue = data[i].getInputVectorValue(j); }
// Body of getTargetRanges: external ranges take precedence when enabled; otherwise the min/max of every
// target dimension is computed from the stored samples.
216 if( useExternalRanges )
return externalTargetRanges;
// NOTE(review): the declaration of `ranges` is not among the visible lines.
220 if( totalNumSamples > 0 ){
221 for(UINT j=0; j<numTargetDimensions; j++){
// Seed min and max with the first sample, then widen them over all samples.
222 ranges[j].minValue = data[0].getTargetVectorValue(j);
223 ranges[j].maxValue = data[0].getTargetVectorValue(j);
224 for(UINT i=0; i<totalNumSamples; i++){
225 if( data[i].getTargetVectorValue(j) < ranges[j].minValue ){ ranges[j].minValue = data[i].getTargetVectorValue(j); }
226 else if( data[i].getTargetVectorValue(j) > ranges[j].maxValue ){ ranges[j].maxValue = data[i].getTargetVectorValue(j); }
// Builds a multi-line, tab-separated text summary of the dataset: name, info text, dimension counts,
// sample count, followed by one section for the input ranges and one for the target ranges.
233 std::string RegressionData::getStatsAsString()
const{
235 std::string statsText;
236 statsText +=
"DatasetName:\t" + datasetName +
"\n";
237 statsText +=
"DatasetInfo:\t" + infoText +
"\n";
238 statsText +=
"Number of Input Dimensions:\t" +
Util::toString( numInputDimensions ) +
"\n";
239 statsText +=
"Number of Target Dimensions:\t" +
Util::toString( numTargetDimensions ) +
"\n";
240 statsText +=
"Number of Samples:\t" +
Util::toString( totalNumSamples ) +
"\n";
// NOTE(review): the statements that obtain inputRanges/targetRanges and the loop bodies that format each
// range are not among the visible lines.
244 statsText +=
"Dataset Input Dimension Ranges:\n";
245 for(UINT j=0; j<inputRanges.size(); j++){
251 statsText +=
"Dataset Target Dimension Ranges:\n";
252 for(UINT j=0; j<targetRanges.size(); j++){
// Writes the text produced by getStatsAsString() to std::cout.
// NOTE(review): the return statement is not among the visible lines.
258 bool RegressionData::printStats()
const{
259 std::cout << getStatsAsString();
// Thin wrapper that forwards to split() with the same training-size percentage and returns its result.
263 RegressionData RegressionData::partition(
const UINT trainingSizePercentage){
264 return split( trainingSizePercentage );
// Body of split: shuffles the sample indexes, copies the first numTrainingExamples samples into
// trainingSet and the remainder into testSet.
// The training count is the floor of totalNumSamples * trainingSizePercentage / 100.
// NOTE(review): no clamp of trainingSizePercentage to 100 is visible; a larger value would make the first
// copy loop index indexs[] past totalNumSamples — confirm against the missing lines.
274 const UINT numTrainingExamples = (UINT) floor( Float(totalNumSamples) / 100.0 * Float(trainingSizePercentage) );
276 RegressionData trainingSet(numInputDimensions,numTargetDimensions);
// Start from the identity permutation, then swap every position with a randomly chosen one.
// NOTE(review): the line that draws randomIndex is not among the visible lines.
282 UINT randomIndex = 0;
283 for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
284 for(UINT x=0; x<totalNumSamples; x++){
286 SWAP( indexs[ x ] , indexs[ randomIndex ] );
// The first numTrainingExamples shuffled samples form the training set...
290 for(UINT i=0; i<numTrainingExamples; i++){
291 trainingSet.
addSample( data[ indexs[i] ].getInputVector(), data[ indexs[i] ].getTargetVector() );
// ...and everything after them forms the test set.
293 for(UINT i=numTrainingExamples; i<totalNumSamples; i++){
294 testSet.
addSample( data[ indexs[i] ].getInputVector(), data[ indexs[i] ].getTargetVector() );
// Invalidate any previously computed folds.
302 crossValidationSetup =
false;
303 crossValidationIndexs.clear();
311 errorLog <<
"merge(RegressionData ®ressionData) - The number of input dimensions in the regressionData (" << regressionData.
getNumInputDimensions() <<
") does not match the number of input dimensions of this dataset (" << numInputDimensions <<
")" << std::endl;
316 errorLog <<
"merge(RegressionData ®ressionData) - The number of target dimensions in the regressionData (" << regressionData.
getNumTargetDimensions() <<
") does not match the number of target dimensions of this dataset (" << numTargetDimensions <<
")" << std::endl;
322 addSample(regressionData[i].getInputVector(), regressionData[i].getTargetVector());
326 crossValidationSetup =
false;
327 crossValidationIndexs.clear();
334 crossValidationSetup =
false;
335 crossValidationIndexs.clear();
338 if( K > totalNumSamples ){
339 errorLog <<
"spiltDataIntoKFolds(UINT K) - K can not be zero!" << std::endl;
344 if( K > totalNumSamples ){
345 errorLog <<
"spiltDataIntoKFolds(UINT K) - K can not be larger than the total number of samples in the dataset!" << std::endl;
354 UINT numSamplesPerFold = (UINT) floor( totalNumSamples/Float(K) );
357 crossValidationIndexs.
resize(K);
361 UINT randomIndex = 0;
364 for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
365 for(UINT x=0; x<totalNumSamples; x++){
370 SWAP( indexs[ x ] , indexs[ randomIndex ] );
375 for(UINT i=0; i<totalNumSamples; i++){
377 crossValidationIndexs[ foldIndex ].push_back( indexs[i] );
380 if( ++counter == numSamplesPerFold && foldIndex < K-1 ){
386 crossValidationSetup =
true;
// Body of getTrainingFoldData: requires a prior successful spiltDataIntoKFolds call.
394 if( !crossValidationSetup ){
395 errorLog <<
"getTrainingFoldData(UINT foldIndex) - Cross Validation has not been setup! You need to call the spiltDataIntoKFolds(UINT K,bool useStratifiedSampling) function first before calling this function!" << std::endl;
// An out-of-range fold index returns trainingData as it stands at this point, with no message logged.
399 if( foldIndex >= kFoldValue )
return trainingData;
// The training data is every sample from every fold except the requested one.
405 for(UINT k=0; k<kFoldValue; k++){
406 if( k != foldIndex ){
407 for(UINT i=0; i<crossValidationIndexs[k].size(); i++){
409 index = crossValidationIndexs[k][i];
410 trainingData.
addSample( data[ index ].getInputVector(), data[ index ].getTargetVector() );
// Body of getTestFoldData: returns testData early, without logging, when cross validation has not
// been set up or the fold index is out of range.
421 if( !crossValidationSetup )
return testData;
423 if( foldIndex >= kFoldValue )
return testData;
// The test data is exactly the samples whose indexes were assigned to the requested fold.
429 for(UINT i=0; i<crossValidationIndexs[ foldIndex ].size(); i++){
431 index = crossValidationIndexs[ foldIndex ][i];
432 testData.
addSample( data[ index ].getInputVector(), data[ index ].getTargetVector() );
438 UINT RegressionData::removeDuplicateSamples(){
440 UINT numSamplesRemoved = 0;
443 sort(data.begin(),data.end(),RegressionSample::sortByInputVectorAscending );
446 Float minDist = 1.0e-5;
448 Float totalDimensions = numInputDimensions + numTargetDimensions;
449 bool keepSearching =
true;
453 if( currentSample == data.end() ) keepSearching =
false;
454 if( nextSample == data.end() ) keepSearching =
false;
456 while( keepSearching ){
458 for(UINT i=0; i<numInputDimensions; i++){
459 dist += SQR( currentSample->getInputVectorValue(i) - nextSample->getInputVectorValue(i) );
461 for(UINT i=0; i<numTargetDimensions; i++){
462 dist += SQR( currentSample->getTargetVectorValue(i) - nextSample->getTargetVectorValue(i) );
464 dist /= totalDimensions;
465 if( dist <= minDist ){
467 currentSample = data.erase( nextSample );
468 nextSample = currentSample + 1;
470 debugLog <<
"Removing sample with dist: " << dist << std::endl;
476 if( currentSample == data.end() ) keepSearching =
false;
477 if( nextSample == data.end() ) keepSearching =
false;
480 return numSamplesRemoved;
// Body of saveDatasetToFile: writes the dataset as text. The layout is a header line, one "Key: value"
// line per metadata field, the external ranges (only when enabled), then one line per sample.
508 file.open(filename.c_str(), std::ios::out);
510 if( !file.is_open() ){
511 errorLog <<
"saveDatasetToFile(const string &filename) - Failed to open file!" << std::endl;
// Format identifier; loadDatasetFromFile compares the first word of a file against this exact string.
515 file <<
"GRT_LABELLED_REGRESSION_DATA_FILE_V1.0\n";
516 file <<
"DatasetName: " << datasetName << std::endl;
517 file <<
"InfoText: " << infoText << std::endl;
518 file <<
"NumInputDimensions: "<<numInputDimensions << std::endl;
519 file <<
"NumTargetDimensions: "<<numTargetDimensions << std::endl;
520 file <<
"TotalNumTrainingExamples: "<<totalNumSamples << std::endl;
521 file <<
"UseExternalRanges: " << useExternalRanges << std::endl;
// External ranges are written as tab-separated "min max" pairs, input ranges first, then target ranges.
523 if( useExternalRanges ){
524 for(UINT i=0; i<externalInputRanges.
getSize(); i++){
525 file << externalInputRanges[i].minValue <<
"\t" << externalInputRanges[i].maxValue << std::endl;
527 for(UINT i=0; i<externalTargetRanges.
getSize(); i++){
528 file << externalTargetRanges[i].minValue <<
"\t" << externalTargetRanges[i].maxValue << std::endl;
532 file <<
"RegressionData:\n";
// Each sample is written as tab-separated input values followed by the target values, with no tab
// after the final target value.
534 for(UINT i=0; i<totalNumSamples; i++){
535 for(UINT j=0; j<numInputDimensions; j++){
536 file << data[i].getInputVectorValue(j) <<
"\t";
538 for(UINT j=0; j<numTargetDimensions; j++){
539 file << data[i].getTargetVectorValue(j);
540 if( j!= numTargetDimensions-1 ) file <<
"\t";
552 file.open(filename.c_str(), std::ios::in);
555 if( !file.is_open() ){
556 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to open file!" << std::endl;
564 if(word !=
"GRT_LABELLED_REGRESSION_DATA_FILE_V1.0"){
565 errorLog <<
"loadDatasetFromFile(const string &filename) - Unknown file header!" << std::endl;
572 if(word !=
"DatasetName:"){
573 errorLog <<
"loadDatasetFromFile(const string &filename) - failed to find DatasetName!" << std::endl;
580 if(word !=
"InfoText:"){
581 errorLog <<
"loadDatasetFromFile(const string &filename) - failed to find InfoText!" << std::endl;
589 while( word !=
"NumInputDimensions:" ){
590 infoText += word +
" ";
595 if(word !=
"NumInputDimensions:"){
596 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to find NumInputDimensions!" << std::endl;
600 file >> numInputDimensions;
604 if(word !=
"NumTargetDimensions:"){
605 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to find NumTargetDimensions!" << std::endl;
609 file >> numTargetDimensions;
613 if(word !=
"TotalNumTrainingExamples:"){
614 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to find TotalNumTrainingExamples!" << std::endl;
618 file >> totalNumSamples;
622 if(word !=
"UseExternalRanges:"){
623 errorLog <<
"loadDatasetFromFile(const string &filename) - failed to find DatasetName!" << std::endl;
627 file >> useExternalRanges;
630 if( useExternalRanges ){
631 externalInputRanges.
resize(numInputDimensions);
632 externalTargetRanges.
resize(numTargetDimensions);
633 for(UINT i=0; i<externalInputRanges.size(); i++){
634 file >> externalInputRanges[i].minValue;
635 file >> externalInputRanges[i].maxValue;
637 for(UINT i=0; i<externalTargetRanges.size(); i++){
638 file >> externalTargetRanges[i].minValue;
639 file >> externalTargetRanges[i].maxValue;
645 if( word !=
"RegressionData:" && word !=
"LabelledRegressionData:" ){
646 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to find RegressionData!" << std::endl;
655 for(UINT i=0; i<totalNumSamples; i++){
657 for(UINT j=0; j<numInputDimensions; j++){
658 file >> inputVector[j];
660 for(UINT j=0; j<numTargetDimensions; j++){
661 file >> targetVector[j];
663 data[i].set(inputVector, targetVector);
// Body of saveDatasetToCSVFile: writes one comma-separated row per sample, input values first and then
// target values, with no trailing comma after the last target value. No header row is visible here.
673 file.open(filename.c_str(), std::ios::out );
675 if( !file.is_open() ){
676 errorLog <<
"saveDatasetToCSVFile(const string &filename) - Failed to open file!" << std::endl;
681 for(UINT i=0; i<totalNumSamples; i++){
682 for(UINT j=0; j<numInputDimensions; j++){
683 file << data[i].getInputVector()[j] <<
",";
685 for(UINT j=0; j<numTargetDimensions; j++){
686 file << data[i].getTargetVector()[j];
687 if( j != numTargetDimensions-1 ) file <<
",";
// Body of loadDatasetFromCSVFile: resets the dataset name, parses the CSV file, checks that every row
// has numInputDimensions + numTargetDimensions columns, then adds one sample per row.
702 datasetName =
"NOT_SET";
711 if( !parser.parseCSVFile(filename,
true) ){
712 errorLog <<
"loadDatasetFromCSVFile(...) - Failed to parse CSV file!" << std::endl;
// Every row must have the same number of columns...
716 if( !parser.getConsistentColumnSize() ){
717 errorLog <<
"loadDatasetFromCSVFile(...) - The CSV file does not have a consistent number of columns!" << std::endl;
// ...and that number must equal the inputs plus the targets requested by the caller.
721 if( parser.getColumnSize() != numInputDimensions+numTargetDimensions ){
722 errorLog <<
"loadDatasetFromCSVFile(...) - The number of columns in the CSV file (" << parser.getColumnSize() <<
")";
723 errorLog <<
" does not match the number of input dimensions plus the number of target dimensions (" << numInputDimensions+numTargetDimensions <<
")" << std::endl;
// Within a row, n walks the columns left to right: inputs first, then targets.
// NOTE(review): the declaration and per-row reset of n are not among the visible lines.
733 for(UINT i=0; i<parser.getRowSize(); i++){
739 for(UINT j=0; j<numInputDimensions; j++){
740 inputVector[j] = grt_from_str< Float >( parser[i][n++] );
744 for(UINT j=0; j<numTargetDimensions; j++){
745 targetVector[j] = grt_from_str< Float >( parser[i][n++] );
// A row that cannot be added is reported as a warning.
749 if( !
addSample(inputVector, targetVector) ){
750 warningLog <<
"loadDatasetFromCSVFile(string filename) - Could not add sample " << i <<
" to the dataset!" << std::endl;
bool merge(const RegressionData &regressionData)
bool loadDatasetFromCSVFile(const std::string &filename, const UINT numInputDimensions, const UINT numTargetDimensions)
static std::string toString(const int &i)
bool save(const std::string &filename) const
Vector< MinMax > getInputRanges() const
RegressionData & operator=(const RegressionData &rhs)
This file contains the Random class, a useful wrapper for generating cross platform random functions...
bool load(const std::string &filename)
virtual bool resize(const unsigned int size)
RegressionData getTrainingFoldData(const UINT foldIndex) const
virtual bool setKey(const std::string &key)
sets the key that gets written at the start of each message, this will be written in the format 'key ...
bool setInfoText(const std::string &infoText)
UINT getNumInputDimensions() const
bool setExternalRanges(const Vector< MinMax > &externalInputRanges, const Vector< MinMax > &externalTargetRanges, const bool useExternalRanges)
bool setInputAndTargetDimensions(const UINT numInputDimensions, const UINT numTargetDimensions)
Vector< MinMax > getTargetRanges() const
bool scale(const Float minTarget, const Float maxTarget)
UINT getNumTargetDimensions() const
RegressionData(const UINT numInputDimensions=0, const UINT numTargetDimensions=0, const std::string datasetName="NOT_SET", const std::string infoText="")
bool saveDatasetToCSVFile(const std::string &filename) const
The RegressionData is the main data structure for recording, labeling, managing, saving, and loading datasets that can be used to train and test the GRT supervised regression algorithms.
RegressionData split(const UINT trainingSizePercentage)
bool setDatasetName(const std::string &datasetName)
int getRandomNumberInt(int minRange, int maxRange)
static bool stringEndsWith(const std::string &str, const std::string &ending)
bool saveDatasetToFile(const std::string &filename) const
Vector< RegressionSample > getData() const
RegressionData getTestFoldData(const UINT foldIndex) const
bool loadDatasetFromFile(const std::string &filename)
bool enableExternalRangeScaling(const bool useExternalRanges)
bool addSample(const VectorFloat &inputVector, const VectorFloat &targetVector)
bool reserve(const UINT N)
bool spiltDataIntoKFolds(const UINT K)
UINT getNumSamples() const