// Constructor. The initializer list constructs debugLog, errorLog and
// warningLog with their "[... ULCD]" tag strings; the body then stores the
// three arguments and starts with cross-validation and external ranges off.
// NOTE(review): this listing shows only some of the original lines; any other
// initialisation and the closing brace are not visible here.
25 UnlabelledData::UnlabelledData(
const UINT numDimensions,
const std::string datasetName,
const std::string infoText):debugLog(
"[DEBUG ULCD]"),errorLog(
"[ERROR ULCD]"),warningLog(
"[WARNING ULCD]"){
26 this->datasetName = datasetName;
27 this->numDimensions = numDimensions;
28 this->infoText = infoText;
30 crossValidationSetup =
false;
31 useExternalRanges =
false;
// Member-by-member copy from rhs: metadata, sample count, k-fold state,
// external ranges, the sample data itself and the three log objects.
// NOTE(review): presumably the body of operator=(const UnlabelledData &rhs);
// the function header and return statement are outside this view.
43 this->datasetName = rhs.datasetName;
44 this->infoText = rhs.infoText;
45 this->numDimensions = rhs.numDimensions;
46 this->totalNumSamples = rhs.totalNumSamples;
47 this->kFoldValue = rhs.kFoldValue;
48 this->crossValidationSetup = rhs.crossValidationSetup;
49 this->useExternalRanges = rhs.useExternalRanges;
50 this->externalRanges = rhs.externalRanges;
51 this->data = rhs.data;
52 this->crossValidationIndexs = rhs.crossValidationIndexs;
53 this->debugLog = rhs.debugLog;
54 this->errorLog = rhs.errorLog;
55 this->warningLog = rhs.warningLog;
// Drops any cross-validation state: the flag is lowered and the stored fold
// indices are discarded.
// NOTE(review): enclosing function is not visible -- presumably clear(); confirm.
63 crossValidationSetup =
false;
64 crossValidationIndexs.clear();
// Accepts only a positive dimension count. The visible statements store the
// new count and then switch off and empty the external ranges.
// NOTE(review): presumably setNumDimensions(const UINT); header not visible.
69 if( numDimensions > 0 ){
74 this->numDimensions = numDimensions;
77 useExternalRanges =
false;
78 externalRanges.clear();
// The name is stored only when it contains no space character.
// NOTE(review): presumably setDatasetName(const std::string); header and the
// failure path are not visible in this listing.
88 if( datasetName.find(
" ") == std::string::npos ){
89 this->datasetName = datasetName;
// Stores the caller-supplied info text.
// NOTE(review): presumably the body of setInfoText(const std::string).
97 this->infoText = infoText;
// Rejects a sample whose length differs from numDimensions. Otherwise any
// cross-validation state is dropped and the sample is appended to data.
// NOTE(review): presumably addSample(const VectorFloat &sample); the update of
// totalNumSamples, if any, is not visible here.
103 if( sample.size() != numDimensions )
return false;
106 crossValidationSetup =
false;
107 crossValidationIndexs.clear();
109 data.push_back( sample );
// Only acts on a non-empty dataset: drops cross-validation state, has a special
// case for a single remaining sample (its body is not visible here) and erases
// the final element of data.
// NOTE(review): enclosing function not visible -- presumably removeLastSample().
117 if( totalNumSamples > 0 ){
120 crossValidationSetup =
false;
121 crossValidationIndexs.clear();
124 if( totalNumSamples == 1 ){
// data.begin()+data.size()-1 is an iterator to the last stored sample.
129 data.erase( data.begin()+data.size()-1 );
// Reports success when data already has capacity for at least N samples.
// NOTE(review): presumably the tail of reserve(const UINT N); the call that
// requests the capacity is not visible here.
141 if( data.capacity() >= N )
return true;
// Requires exactly one range per dimension; on success stores both the ranges
// and the caller's enable flag.
// NOTE(review): presumably setExternalRanges(externalRanges, useExternalRanges).
148 if( externalRanges.size() != numDimensions )
return false;
150 this->externalRanges = externalRanges;
151 this->useExternalRanges = useExternalRanges;
// The flag is changed only when the stored externalRanges has one entry per
// dimension.
// NOTE(review): presumably enableExternalRangeScaling(const bool).
157 if( externalRanges.size() == numDimensions ){
158 this->useExternalRanges = useExternalRanges;
// Forwards to the scale overload that takes explicit ranges.
// NOTE(review): where `ranges` comes from is not visible -- presumably getRanges().
166 return scale(ranges,minTarget,maxTarget);
// Scales the whole dataset in place. Needs one range per dimension; every value
// data[i][j] is mapped from [ranges[j].minValue, ranges[j].maxValue] to
// [minTarget, maxTarget] through Util::scale.
170 if( ranges.size() != numDimensions )
return false;
173 for(UINT i=0; i<totalNumSamples; i++){
174 for(UINT j=0; j<numDimensions; j++){
175 data[i][j] =
Util::scale(data[i][j],ranges[j].minValue,ranges[j].maxValue,minTarget,maxTarget);
// saveDatasetToFile: writes the dataset as text. Layout, in order: a fixed
// header line, DatasetName, InfoText, NumDimensions, TotalNumTrainingExamples,
// UseExternalRanges, the optional range table, then the sample section.
207 file.open(filename.c_str(), std::ios::out);
// A file that cannot be opened is reported through errorLog.
209 if( !file.is_open() ){
210 errorLog <<
"saveDatasetToFile(const std::string &filename) - Failed to open file!" << std::endl;
214 file <<
"GRT_UNLABELLED_DATA_FILE_V1.0\n";
215 file <<
"DatasetName: " << datasetName << std::endl;
216 file <<
"InfoText: " << infoText << std::endl;
217 file <<
"NumDimensions: " << numDimensions << std::endl;
218 file <<
"TotalNumTrainingExamples: " << totalNumSamples << std::endl;
220 file <<
"UseExternalRanges: " << useExternalRanges << std::endl;
// Range table: one "min<TAB>max" line per stored external range.
222 if( useExternalRanges ){
223 for(UINT i=0; i<externalRanges.size(); i++){
224 file << externalRanges[i].minValue <<
"\t" << externalRanges[i].maxValue << std::endl;
228 file <<
"UnlabelledTrainingData:\n";
// Samples: a tab is written before every value except the first in a row.
230 for(UINT i=0; i<totalNumSamples; i++){
231 for(UINT j=0; j<numDimensions; j++){
232 if( j != 0 ) file <<
"\t";
// loadDatasetFromFile: parses the text layout written by saveDatasetToFile.
// Each expected token is checked in turn and a mismatch is reported through
// errorLog. The error messages now name the token that was actually missing;
// previously four of them all said "failed to find DatasetName!".
245 file.open(filename.c_str(), std::ios::in);
248 if( !file.is_open() ){
249 errorLog <<
"loadDatasetFromFile(const std::string &filename) - could not open file!" << std::endl;
// Two header strings are accepted.
257 if( word !=
"GRT_UNLABELLED_DATA_FILE_V1.0" && word !=
"GRT_UNLABELLED_CLASSIFICATION_DATA_FILE_V1.0" ){
258 errorLog <<
"loadDatasetFromFile(const std::string &filename) - could not find file header!" << std::endl;
265 if(word !=
"DatasetName:"){
266 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find DatasetName!" << std::endl;
273 if(word !=
"InfoText:"){
274 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find InfoText!" << std::endl;
// The info text may span several words: accumulate until the next key appears.
// NOTE(review): the statement that reads the next word is not visible in this
// listing; confirm the loop also stops when the stream fails, otherwise a file
// that ends before "NumDimensions:" would never leave the loop.
282 while( word !=
"NumDimensions:" ){
283 infoText += word +
" ";
288 if(word !=
"NumDimensions:"){
289 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find NumDimensions!" << std::endl;
293 file >> numDimensions;
297 if(word !=
"TotalNumTrainingExamples:"){
298 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find TotalNumTrainingExamples!" << std::endl;
302 file >> totalNumSamples;
306 if(word !=
"UseExternalRanges:"){
307 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find UseExternalRanges!" << std::endl;
311 file >> useExternalRanges;
// When enabled, one min/max pair per dimension follows.
314 if( useExternalRanges ){
315 externalRanges.
resize(numDimensions);
316 for(UINT i=0; i<externalRanges.size(); i++){
317 file >> externalRanges[i].minValue;
318 file >> externalRanges[i].maxValue;
324 if(word !=
"UnlabelledTrainingData:"){
325 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find UnlabelledTrainingData!" << std::endl;
// Sample section: totalNumSamples rows of numDimensions values.
331 for(UINT i=0; i<totalNumSamples; i++){
332 for(UINT j=0; j<numDimensions; j++){
// saveDatasetToCSVFile: writes the samples as comma-separated values. No
// header block is visible in this fragment.
345 file.open(filename.c_str(), std::ios::out );
347 if( !file.is_open() ){
348 errorLog <<
"saveDatasetToCSVFile(const std::string &filename) - Failed to open file!" << std::endl;
// A comma is written before every value except the first in a row.
353 for(UINT i=0; i<totalNumSamples; i++){
354 for(UINT j=0; j<numDimensions; j++){
355 if( j != 0 ) file <<
",";
// loadDatasetFromCSVFile: rebuilds the dataset from a CSV file. The dataset
// name is reset to "NOT_SET" and the column count becomes numDimensions.
369 datasetName =
"NOT_SET";
// NOTE(review): the meaning of the second parseCSVFile argument (true) is not
// visible in this file; check the parser's declaration.
378 if( !parser.parseCSVFile(filename,
true) ){
379 errorLog <<
"loadDatasetFromCSVFile(const std::string &filename) - Failed to parse CSV file!" << std::endl;
// Every row must have the same number of columns.
383 if( !parser.getConsistentColumnSize() ){
384 errorLog <<
"loadDatasetFromCSVFile(const std::string &filename) - The CSV file does not have a consistent number of columns!" << std::endl;
388 const UINT rows = parser.getRowSize();
389 const UINT cols = parser.getColumnSize();
392 numDimensions = cols;
// Capacity is requested up front for all rows.
395 data.reserve( rows );
398 for(UINT i=0; i<rows; i++){
401 for(UINT j=0; j<numDimensions; j++){
// A row that cannot be added produces a warning that names its index.
407 warningLog <<
"loadDatasetFromCSVFile(const std::string &filename) - Could not add sample " << i <<
" to the dataset!" << std::endl;
// Splits the samples into a training set and a test set after shuffling.
// NOTE(review): presumably partition(const UINT); the function header and the
// declarations of indexs, trainingSet and testSet are not visible here.
422 crossValidationSetup =
false;
423 crossValidationIndexs.clear();
// Training size = floor( totalNumSamples * trainingSizePercentage / 100 ).
425 const UINT numTrainingExamples = (UINT) floor( Float(totalNumSamples) / 100.0 * Float(trainingSizePercentage) );
// Shuffle: indexs starts as 0..totalNumSamples-1, then each position x is
// swapped with position randomIndex. How randomIndex is drawn is not visible.
433 UINT randomIndex = 0;
434 for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
435 for(UINT x=0; x<totalNumSamples; x++){
440 SWAP( indexs[ x ] , indexs[ randomIndex ] );
443 trainingSet.
reserve( numTrainingExamples );
444 testSet.
reserve( totalNumSamples-numTrainingExamples );
// The first numTrainingExamples shuffled indices feed the training set; the
// second loop covers the remaining indices.
447 for(UINT i=0; i<numTrainingExamples; i++){
448 trainingSet.
addSample( data[ indexs[i] ] );
450 for(UINT i=numTrainingExamples; i<totalNumSamples; i++){
// merge: the visible part reports a dimension mismatch between the incoming
// dataset and this one, then drops cross-validation state.
// NOTE(review): the condition guarding the error and the loop that copies the
// samples are not visible in this listing.
463 errorLog <<
"merge(const UnlabelledData &unlabelledData) - The number of dimensions in the unlabelledData (" << unlabelledData.
getNumDimensions() <<
") does not match the number of dimensions of this dataset (" << numDimensions <<
")" << std::endl;
468 crossValidationSetup =
false;
469 crossValidationIndexs.clear();
// spiltDataIntoKFolds: shuffles the sample indices and deals them into K folds
// stored in crossValidationIndexs, then marks cross-validation as set up.
483 crossValidationSetup =
false;
484 crossValidationIndexs.clear();
// K must be non-zero. This guard used to test K > totalNumSamples (a copy of
// the next check), so K == 0 slipped through to the division below.
487 if( K == 0 ){
488 errorLog <<
"spiltDataIntoKFolds(const UINT K) - K can not be zero!" << std::endl;
493 if( K > totalNumSamples ){
494 errorLog <<
"spiltDataIntoKFolds(const UINT K) - K can not be larger than the total number of samples in the dataset!" << std::endl;
// Base fold size, rounded down.
503 UINT numSamplesPerFold = (UINT) floor( totalNumSamples/Float(K) );
506 crossValidationIndexs.
resize(K);
// Shuffle: indexs starts as 0..totalNumSamples-1, then each position x is
// swapped with position randomIndex. How randomIndex is drawn is not visible.
510 UINT randomIndex = 0;
513 for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
514 for(UINT x=0; x<totalNumSamples; x++){
519 grt_swap( indexs[ x ] , indexs[ randomIndex ] );
// Deal the shuffled indices into folds. The fold-advance test never fires for
// the last fold (foldIndex < K-1), so that fold also takes any remainder.
524 for(UINT i=0; i<totalNumSamples; i++){
526 crossValidationIndexs[ foldIndex ].push_back( indexs[i] );
529 if( ++counter == numSamplesPerFold && foldIndex < K-1 ){
535 crossValidationSetup =
true;
// getTrainingFoldData: gathers every fold except foldIndex. Requires a prior
// successful spiltDataIntoKFolds call; an out-of-range foldIndex returns
// trainingData as it stands at that point.
543 if( !crossValidationSetup ){
544 errorLog <<
"getTrainingFoldData(const UINT foldIndex) - Cross Validation has not been setup! You need to call the spiltDataIntoKFolds(UINT K) function first before calling this function!" << std::endl;
548 if( foldIndex >= kFoldValue )
return trainingData;
// First pass: count the samples in all other folds so capacity can be
// requested once.
554 for(UINT k=0; k<kFoldValue; k++){
555 if( k != foldIndex ){
556 numSamples += (UINT)crossValidationIndexs[k].size();
559 trainingData.
reserve( numSamples );
// Second pass: visit the stored sample indices of every other fold.
563 for(UINT k=0; k<kFoldValue; k++){
564 if( k != foldIndex ){
565 for(UINT i=0; i<crossValidationIndexs[k].size(); i++){
567 index = crossValidationIndexs[k][i];
// Gathers the samples of the single fold foldIndex. Returns testData as it
// stands when cross-validation is not set up or foldIndex is out of range.
// NOTE(review): presumably getTestFoldData(const UINT foldIndex); unlike the
// training variant, no error is logged in the visible lines.
579 if( !crossValidationSetup )
return testData;
581 if( foldIndex >= kFoldValue )
return testData;
587 UINT numSamples = (UINT)crossValidationIndexs[ foldIndex ].size();
589 testData.
reserve( numSamples );
// Visit the stored sample indices of the requested fold.
592 for(UINT i=0; i<crossValidationIndexs[ foldIndex ].size(); i++){
594 index = crossValidationIndexs[ foldIndex ][i];
// Builds a tab/newline formatted summary: dataset name, info text, dimension
// count and sample count, followed by a "Dataset Ranges" section with one
// iteration per entry of `ranges`.
// NOTE(review): presumably getStatsAsString(); the declaration of `ranges` and
// the per-range text are not visible in this listing.
602 std::string statsText;
603 statsText +=
"DatasetName:\t" + datasetName +
"\n";
604 statsText +=
"DatasetInfo:\t" + infoText +
"\n";
605 statsText +=
"Number of Dimensions:\t" +
Util::toString( numDimensions ) +
"\n";
606 statsText +=
"Number of Samples:\t" +
Util::toString( totalNumSamples ) +
"\n";
610 statsText +=
"Dataset Ranges:\n";
611 for(UINT j=0; j<ranges.size(); j++){
// Returns the per-dimension min/max. When external ranges are enabled they are
// returned as-is; otherwise the ranges are computed from the stored samples.
// NOTE(review): presumably getRanges(); the declaration of `ranges` and the
// final return are not visible in this listing.
621 if( useExternalRanges )
return externalRanges;
626 if( totalNumSamples > 0 ){
627 for(UINT j=0; j<numDimensions; j++){
// Seed min and max for dimension j from the first sample's value in that same
// dimension. This used to read data[0][0] for every j, which gave a wrong
// range whenever column 0's first value lay outside column j's values.
628 ranges[j].minValue = data[0][j];
629 ranges[j].maxValue = data[0][j];
630 for(UINT i=0; i<totalNumSamples; i++){
631 if( data[i][j] < ranges[j].minValue ){ ranges[j].minValue = data[i][j]; }
632 else if( data[i][j] > ranges[j].maxValue ){ ranges[j].maxValue = data[i][j]; }
// Element-by-element copy of the samples into the rows x cols matrix d.
// NOTE(review): presumably one of getDataAsMatrixDouble()/getDataAsMatrixFloat();
// the header and the definitions of rows, cols and d are not visible here.
648 for(UINT i=0; i<rows; i++){
649 for(UINT j=0; j<cols; j++){
650 d[i][j] = data[i][j];
// Element-by-element copy of the samples into the rows x cols matrix d.
// NOTE(review): presumably one of getDataAsMatrixFloat()/getDataAsMatrixDouble();
// the header and the definitions of rows, cols and d are not visible here.
662 for(UINT i=0; i<rows; i++){
663 for(UINT j=0; j<cols; j++){
664 d[i][j] = data[i][j];
bool loadDatasetFromCSVFile(const std::string &filename)
static std::string toString(const int &i)
bool scale(const Float minTarget, const Float maxTarget)
bool addSample(const VectorFloat &sample)
static Float scale(const Float &x, const Float &minSource, const Float &maxSource, const Float &minTarget, const Float &maxTarget, const bool constrain=false)
static Float stringToFloat(const std::string &s)
bool load(const std::string &filename)
MatrixFloat getDataAsMatrixFloat() const
UINT getNumDimensions() const
virtual bool resize(const unsigned int size)
bool setExternalRanges(const Vector< MinMax > &externalRanges, const bool useExternalRanges=false)
UINT getNumSamples() const
UnlabelledData & operator=(const UnlabelledData &rhs)
bool reserve(const UINT N)
Vector< VectorFloat > getData() const
The UnlabelledData class is the main data container for supporting unsupervised learning.
UnlabelledData getTestFoldData(const UINT foldIndex) const
UnlabelledData(const UINT numDimensions=0, const std::string datasetName="NOT_SET", const std::string infoText="")
bool save(const std::string &filename) const
bool saveDatasetToFile(const std::string &filename) const
bool setNumDimensions(const UINT numDimensions)
bool enableExternalRangeScaling(const bool useExternalRanges)
UnlabelledData partition(const UINT partitionPercentage)
bool setInfoText(const std::string infoText)
bool saveDatasetToCSVFile(const std::string &filename) const
Vector< MinMax > getRanges() const
int getRandomNumberInt(int minRange, int maxRange)
static bool stringEndsWith(const std::string &str, const std::string &ending)
UnlabelledData getTrainingFoldData(const UINT foldIndex) const
MatrixDouble getDataAsMatrixDouble() const
bool merge(const UnlabelledData &unlabelledData)
bool setDatasetName(const std::string datasetName)
bool spiltDataIntoKFolds(const UINT K)
bool loadDatasetFromFile(const std::string &filename)
std::string getStatsAsString() const