21 #define GRT_DLL_EXPORTS 26 UnlabelledData::UnlabelledData(
const UINT numDimensions,
const std::string datasetName,
const std::string infoText):debugLog(
"[DEBUG ULCD]"),errorLog(
"[ERROR ULCD]"),warningLog(
"[WARNING ULCD]"){
27 this->datasetName = datasetName;
28 this->numDimensions = numDimensions;
29 this->infoText = infoText;
31 crossValidationSetup =
false;
32 useExternalRanges =
false;
44 this->datasetName = rhs.datasetName;
45 this->infoText = rhs.infoText;
46 this->numDimensions = rhs.numDimensions;
47 this->totalNumSamples = rhs.totalNumSamples;
48 this->kFoldValue = rhs.kFoldValue;
49 this->crossValidationSetup = rhs.crossValidationSetup;
50 this->useExternalRanges = rhs.useExternalRanges;
51 this->externalRanges = rhs.externalRanges;
52 this->data = rhs.data;
53 this->crossValidationIndexs = rhs.crossValidationIndexs;
54 this->debugLog = rhs.debugLog;
55 this->errorLog = rhs.errorLog;
56 this->warningLog = rhs.warningLog;
64 crossValidationSetup =
false;
65 crossValidationIndexs.clear();
70 if( numDimensions > 0 ){
75 this->numDimensions = numDimensions;
78 useExternalRanges =
false;
79 externalRanges.clear();
89 if( datasetName.find(
" ") == std::string::npos ){
90 this->datasetName = datasetName;
98 this->infoText = infoText;
104 if( sample.size() != numDimensions )
return false;
107 crossValidationSetup =
false;
108 crossValidationIndexs.clear();
110 data.push_back( sample );
118 if( totalNumSamples > 0 ){
121 crossValidationSetup =
false;
122 crossValidationIndexs.clear();
125 if( totalNumSamples == 1 ){
130 data.erase( data.begin()+data.size()-1 );
142 if( data.capacity() >= N )
return true;
149 if( externalRanges.size() != numDimensions )
return false;
151 this->externalRanges = externalRanges;
152 this->useExternalRanges = useExternalRanges;
158 if( externalRanges.size() == numDimensions ){
159 this->useExternalRanges = useExternalRanges;
167 return scale(ranges,minTarget,maxTarget);
171 if( ranges.size() != numDimensions )
return false;
174 for(UINT i=0; i<totalNumSamples; i++){
175 for(UINT j=0; j<numDimensions; j++){
176 data[i][j] =
Util::scale(data[i][j],ranges[j].minValue,ranges[j].maxValue,minTarget,maxTarget);
208 file.open(filename.c_str(), std::ios::out);
210 if( !file.is_open() ){
211 errorLog <<
"saveDatasetToFile(const std::string &filename) - Failed to open file!" << std::endl;
215 file <<
"GRT_UNLABELLED_DATA_FILE_V1.0\n";
216 file <<
"DatasetName: " << datasetName << std::endl;
217 file <<
"InfoText: " << infoText << std::endl;
218 file <<
"NumDimensions: " << numDimensions << std::endl;
219 file <<
"TotalNumTrainingExamples: " << totalNumSamples << std::endl;
221 file <<
"UseExternalRanges: " << useExternalRanges << std::endl;
223 if( useExternalRanges ){
224 for(UINT i=0; i<externalRanges.size(); i++){
225 file << externalRanges[i].minValue <<
"\t" << externalRanges[i].maxValue << std::endl;
229 file <<
"UnlabelledTrainingData:\n";
231 for(UINT i=0; i<totalNumSamples; i++){
232 for(UINT j=0; j<numDimensions; j++){
233 if( j != 0 ) file <<
"\t";
246 file.open(filename.c_str(), std::ios::in);
249 if( !file.is_open() ){
250 errorLog <<
"loadDatasetFromFile(const std::string &filename) - could not open file!" << std::endl;
258 if( word !=
"GRT_UNLABELLED_DATA_FILE_V1.0" && word !=
"GRT_UNLABELLED_CLASSIFICATION_DATA_FILE_V1.0" ){
259 errorLog <<
"loadDatasetFromFile(const std::string &filename) - could not find file header!" << std::endl;
266 if(word !=
"DatasetName:"){
267 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find DatasetName!" << std::endl;
274 if(word !=
"InfoText:"){
275 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find InfoText!" << std::endl;
283 while( word !=
"NumDimensions:" ){
284 infoText += word +
" ";
289 if(word !=
"NumDimensions:"){ // each header check below now reports the token it actually tested for
290 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find NumDimensions!" << std::endl;
294 file >> numDimensions;
298 if(word !=
"TotalNumTrainingExamples:"){
299 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find TotalNumTrainingExamples!" << std::endl;
303 file >> totalNumSamples;
307 if(word !=
"UseExternalRanges:"){
308 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find UseExternalRanges!" << std::endl;
312 file >> useExternalRanges;
315 if( useExternalRanges ){ // ranges are only present in the file when this flag was saved as true
316 externalRanges.
resize(numDimensions);
317 for(UINT i=0; i<externalRanges.size(); i++){
318 file >> externalRanges[i].minValue;
319 file >> externalRanges[i].maxValue;
325 if(word !=
"UnlabelledTrainingData:"){
326 errorLog <<
"loadDatasetFromFile(const std::string &filename) - failed to find UnlabelledTrainingData!" << std::endl;
332 for(UINT i=0; i<totalNumSamples; i++){
333 for(UINT j=0; j<numDimensions; j++){
346 file.open(filename.c_str(), std::ios::out );
348 if( !file.is_open() ){
349 errorLog <<
"saveDatasetToCSVFile(const std::string &filename) - Failed to open file!" << std::endl;
354 for(UINT i=0; i<totalNumSamples; i++){
355 for(UINT j=0; j<numDimensions; j++){
356 if( j != 0 ) file <<
",";
370 datasetName =
"NOT_SET";
379 if( !parser.parseCSVFile(filename,
true) ){
380 errorLog <<
"loadDatasetFromCSVFile(const std::string &filename) - Failed to parse CSV file!" << std::endl;
384 if( !parser.getConsistentColumnSize() ){
385 errorLog <<
"loadDatasetFromCSVFile(const std::string &filename) - The CSV file does not have a consistent number of columns!" << std::endl;
389 const UINT rows = parser.getRowSize();
390 const UINT cols = parser.getColumnSize();
393 numDimensions = cols;
396 data.reserve( rows );
399 for(UINT i=0; i<rows; i++){
402 for(UINT j=0; j<numDimensions; j++){
408 warningLog <<
"loadDatasetFromCSVFile(const std::string &filename) - Could not add sample " << i <<
" to the dataset!" << std::endl;
415 UnlabelledData UnlabelledData::partition(
const UINT trainingSizePercentage){
416 return split( trainingSizePercentage );
427 crossValidationSetup =
false;
428 crossValidationIndexs.clear();
430 const UINT numTrainingExamples = (UINT) floor( Float(totalNumSamples) / 100.0 * Float(trainingSizePercentage) );
438 UINT randomIndex = 0;
439 for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
440 for(UINT x=0; x<totalNumSamples; x++){
445 SWAP( indexs[ x ] , indexs[ randomIndex ] );
448 trainingSet.
reserve( numTrainingExamples );
449 testSet.
reserve( totalNumSamples-numTrainingExamples );
452 for(UINT i=0; i<numTrainingExamples; i++){
453 trainingSet.
addSample( data[ indexs[i] ] );
455 for(UINT i=numTrainingExamples; i<totalNumSamples; i++){
468 errorLog <<
"merge(const UnlabelledData &unlabelledData) - The number of dimensions in the unlabelledData (" << unlabelledData.
getNumDimensions() <<
") does not match the number of dimensions of this dataset (" << numDimensions <<
")" << std::endl;
473 crossValidationSetup =
false;
474 crossValidationIndexs.clear();
488 crossValidationSetup =
false;
489 crossValidationIndexs.clear();
492 if( K == 0 ){ // reject zero folds here: K is used as a divisor when computing the samples per fold
493 errorLog <<
"spiltDataIntoKFolds(const UINT K) - K can not be zero!" << std::endl;
498 if( K > totalNumSamples ){
499 errorLog <<
"spiltDataIntoKFolds(const UINT K) - K can not be larger than the total number of samples in the dataset!" << std::endl;
508 UINT numSamplesPerFold = (UINT) floor( totalNumSamples/Float(K) );
511 crossValidationIndexs.
resize(K);
515 UINT randomIndex = 0;
518 for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
519 for(UINT x=0; x<totalNumSamples; x++){
524 grt_swap( indexs[ x ] , indexs[ randomIndex ] );
529 for(UINT i=0; i<totalNumSamples; i++){
531 crossValidationIndexs[ foldIndex ].push_back( indexs[i] );
534 if( ++counter == numSamplesPerFold && foldIndex < K-1 ){
540 crossValidationSetup =
true;
548 if( !crossValidationSetup ){
549 errorLog <<
"getTrainingFoldData(const UINT foldIndex) - Cross Validation has not been setup! You need to call the spiltDataIntoKFolds(UINT K) function first before calling this function!" << std::endl;
553 if( foldIndex >= kFoldValue )
return trainingData;
559 for(UINT k=0; k<kFoldValue; k++){
560 if( k != foldIndex ){
561 numSamples += (UINT)crossValidationIndexs[k].size();
564 trainingData.
reserve( numSamples );
568 for(UINT k=0; k<kFoldValue; k++){
569 if( k != foldIndex ){
570 for(UINT i=0; i<crossValidationIndexs[k].size(); i++){
572 index = crossValidationIndexs[k][i];
584 if( !crossValidationSetup )
return testData;
586 if( foldIndex >= kFoldValue )
return testData;
592 UINT numSamples = (UINT)crossValidationIndexs[ foldIndex ].size();
594 testData.
reserve( numSamples );
597 for(UINT i=0; i<crossValidationIndexs[ foldIndex ].size(); i++){
599 index = crossValidationIndexs[ foldIndex ][i];
607 std::string statsText;
608 statsText +=
"DatasetName:\t" + datasetName +
"\n";
609 statsText +=
"DatasetInfo:\t" + infoText +
"\n";
610 statsText +=
"Number of Dimensions:\t" +
Util::toString( numDimensions ) +
"\n";
611 statsText +=
"Number of Samples:\t" +
Util::toString( totalNumSamples ) +
"\n";
615 statsText +=
"Dataset Ranges:\n";
616 for(UINT j=0; j<ranges.size(); j++){
626 if( useExternalRanges )
return externalRanges;
631 if( totalNumSamples > 0 ){ // data[0] is only read when at least one sample exists
632 for(UINT j=0; j<numDimensions; j++){
633 ranges[j].minValue = data[0][j]; // seed min/max from the first sample's value in this same dimension j
634 ranges[j].maxValue = data[0][j];
635 for(UINT i=0; i<totalNumSamples; i++){
636 if( data[i][j] < ranges[j].minValue ){ ranges[j].minValue = data[i][j]; }
637 else if( data[i][j] > ranges[j].maxValue ){ ranges[j].maxValue = data[i][j]; }
653 for(UINT i=0; i<rows; i++){
654 for(UINT j=0; j<cols; j++){
655 d[i][j] = data[i][j];
667 for(UINT i=0; i<rows; i++){
668 for(UINT j=0; j<cols; j++){
669 d[i][j] = data[i][j];
bool loadDatasetFromCSVFile(const std::string &filename)
static std::string toString(const int &i)
bool scale(const Float minTarget, const Float maxTarget)
bool addSample(const VectorFloat &sample)
static Float scale(const Float &x, const Float &minSource, const Float &maxSource, const Float &minTarget, const Float &maxTarget, const bool constrain=false)
static Float stringToFloat(const std::string &s)
This file contains the Random class, a useful wrapper for generating cross platform random functions...
bool load(const std::string &filename)
MatrixFloat getDataAsMatrixFloat() const
UINT getNumDimensions() const
virtual bool resize(const unsigned int size)
bool setExternalRanges(const Vector< MinMax > &externalRanges, const bool useExternalRanges=false)
UnlabelledData split(const UINT partitionPercentage)
UINT getNumSamples() const
UnlabelledData & operator=(const UnlabelledData &rhs)
bool reserve(const UINT N)
Vector< VectorFloat > getData() const
The UnlabelledData class is the main data container for supporting unsupervised learning.
UnlabelledData getTestFoldData(const UINT foldIndex) const
UnlabelledData(const UINT numDimensions=0, const std::string datasetName="NOT_SET", const std::string infoText="")
bool save(const std::string &filename) const
bool saveDatasetToFile(const std::string &filename) const
bool setNumDimensions(const UINT numDimensions)
bool enableExternalRangeScaling(const bool useExternalRanges)
bool setInfoText(const std::string infoText)
bool saveDatasetToCSVFile(const std::string &filename) const
Vector< MinMax > getRanges() const
int getRandomNumberInt(int minRange, int maxRange)
static bool stringEndsWith(const std::string &str, const std::string &ending)
UnlabelledData getTrainingFoldData(const UINT foldIndex) const
MatrixDouble getDataAsMatrixDouble() const
bool merge(const UnlabelledData &unlabelledData)
bool setDatasetName(const std::string datasetName)
bool spiltDataIntoKFolds(const UINT K)
bool loadDatasetFromFile(const std::string &filename)
std::string getStatsAsString() const