// // Created by martin on 7/13/18. // #include <algorithm> #include <experimental/filesystem> #include <boost/serialization/export.hpp> #include "DataSetSerialization.h" #include "exceptions.h" BOOST_CLASS_EXPORT_IMPLEMENT(lib4neuro::DataSet); namespace lib4neuro { DataSet::DataSet() { this->n_elements = 0; this->input_dim = 0; this->output_dim = 0; } DataSet::DataSet(std::string file_path) { std::ifstream ifs(file_path); if(ifs.is_open()) { try { boost::archive::text_iarchive ia(ifs); ia >> *this; }catch(boost::archive::archive_exception& e) { THROW_RUNTIME_ERROR("Serialized archive error: '" + e.what() + "'! Please, check if your file is really " "the serialized DataSet."); } ifs.close(); } else { THROW_RUNTIME_ERROR("File " + file_path + " couldn't be open!"); } } DataSet::DataSet(std::vector<std::pair<std::vector<double>, std::vector<double>>> *data_ptr, NormalizationStrategy* ns) { this->n_elements = data_ptr->size(); this->data = *data_ptr; this->input_dim = this->data[0].first.size(); this->output_dim = this->data[0].second.size(); if(ns) { this->normalization_strategy = ns; // this->max_min_inp_val.emplace_back(this->normalization_strategy->get_max_value()); // this->max_min_inp_val.emplace_back(this->normalization_strategy->get_min_value()); } //TODO check the complete data set for input/output dimensions } DataSet::DataSet(double lower_bound, double upper_bound, unsigned int size, double output, NormalizationStrategy* ns) { std::vector<std::pair<std::vector<double>, std::vector<double>>> new_data_vec; this->data = new_data_vec; this->n_elements = 0; this->input_dim = 1; this->output_dim = 1; if(ns) { this->normalization_strategy = ns; // this->max_min_inp_val.emplace_back(this->normalization_strategy->get_max_value()); // this->max_min_inp_val.emplace_back(this->normalization_strategy->get_min_value()); } this->add_isotropic_data(lower_bound, upper_bound, size, output); } DataSet::DataSet(std::vector<double> &bounds, unsigned int no_elems_in_one_dim, std::vector<double> (*output_func)(std::vector<double> &), unsigned int output_dim, NormalizationStrategy* ns) { std::vector<std::pair<std::vector<double>, std::vector<double>>> new_data_vec; this->data = new_data_vec; this->input_dim = bounds.size() / 2; this->output_dim = output_dim; this->n_elements = 0; if(ns) { this->normalization_strategy = ns; } this->add_isotropic_data(bounds, no_elems_in_one_dim, output_func); } void DataSet::add_data_pair(std::vector<double> &inputs, std::vector<double> &outputs) { if(this->n_elements == 0 && this->input_dim == 0 && this->output_dim == 0) { this->input_dim = inputs.size(); this->output_dim = outputs.size(); } if (inputs.size() != this->input_dim) { THROW_RUNTIME_ERROR("Bad input dimension."); } else if (outputs.size() != this->output_dim) { THROW_RUNTIME_ERROR("Bad output dimension."); } this->n_elements++; this->data.emplace_back(std::make_pair(inputs, outputs)); } void DataSet::add_isotropic_data(double lower_bound, double upper_bound, unsigned int size, double output) { if (this->input_dim != 1 || this->output_dim != 1) { THROW_RUNTIME_ERROR("Cannot add data with dimensionality 1:1 when the data set " "is of different dimensionality!"); } double frac; if(size < 1) { THROW_INVALID_ARGUMENT_ERROR("Size of added data has to be >=1 !"); } else if (size == 1) { frac = 1; } else { frac = (upper_bound - lower_bound) / (size - 1); } std::vector<double> inp, out; out = {output}; for (unsigned int i = 0; i < size; ++i) { inp = {frac * i}; this->data.emplace_back(std::make_pair(inp, out)); } this->n_elements += size; } void DataSet::add_isotropic_data(std::vector<double> &bounds, unsigned int no_elems_in_one_dim, std::vector<double> (*output_func)(std::vector<double> &)) { // TODO add check of dataset dimensions std::vector<std::vector<double>> grid; std::vector<double> tmp; double frac; if(no_elems_in_one_dim < 1) { THROW_INVALID_ARGUMENT_ERROR("Number of elements in one dimension has to be >=1 !"); } for (unsigned int i = 0; i < bounds.size(); i += 2) { if (no_elems_in_one_dim == 1) { frac = 1; } else { frac = (bounds[i] - bounds[i+1]) / (no_elems_in_one_dim - 1); } tmp.clear(); for (double j = bounds[i]; j <= bounds[i + 1]; j += frac) { tmp.emplace_back(j); } grid.emplace_back(tmp); } grid = this->cartesian_product(&grid); for (auto vec : grid) { this->n_elements++; this->data.emplace_back(std::make_pair(vec, output_func(vec))); } } std::vector<std::pair<std::vector<double>, std::vector<double>>> *DataSet::get_data() { return &(this->data); } size_t DataSet::get_n_elements() { return this->n_elements; } size_t DataSet::get_input_dim() { return this->input_dim; } size_t DataSet::get_output_dim() { return this->output_dim; } void DataSet::print_data() { if (n_elements) { for (auto p : this->data) { /* INPUT */ for (auto v : std::get<0>(p)) { std::cout << v << " "; } std::cout << "-> "; /* OUTPUT */ for (auto v : std::get<1>(p)) { std::cout << v << " "; } std::cout << std::endl; } } } void DataSet::store_text(std::string file_path) { std::ofstream ofs(file_path); if(!ofs.is_open()) { THROW_RUNTIME_ERROR("File " + file_path + " couldn't be open!"); } else { boost::archive::text_oarchive oa(ofs); oa << *this; ofs.close(); } } void DataSet::store_data_text(std::ofstream* file_path) { for (auto e : this->data) { /* First part of the pair */ for (unsigned int i = 0; i < e.first.size() - 1; i++) { *file_path << e.first.at(i) << ","; } *file_path << e.first.back() << " "; /* Second part of the pair */ for (unsigned int i = 0; i < e.second.size() - 1; i++) { *file_path << e.second.at(i) << ","; } *file_path << e.second.back() << std::endl; } } void DataSet::store_data_text(std::string file_path) { std::ofstream ofs(file_path); if(!ofs.is_open()) { THROW_RUNTIME_ERROR("File " + file_path + " couldn't be open!"); } else { for (auto e : this->data) { /* First part of the pair */ for (unsigned int i = 0; i < e.first.size() - 1; i++) { ofs << e.first.at(i) << ","; } ofs << e.first.back() << " "; /* Second part of the pair */ for (unsigned int i = 0; i < e.second.size() - 1; i++) { ofs << e.second.at(i) << ","; } ofs << e.second.back() << std::endl; } } } template<class T> std::vector<std::vector<T>> DataSet::cartesian_product(const std::vector<std::vector<T>> *v) { std::vector<std::vector<double>> v_combined_old, v_combined, v_tmp; std::vector<double> tmp; for (const auto &e : v->at(0)) { tmp = {e}; v_combined.emplace_back(tmp); } for (unsigned int i = 1; i < v->size(); i++) { // Iterate through remaining vectors of 'v' v_combined_old = v_combined; v_combined.clear(); for (const auto &e : v->at(i)) { for (const auto &vec : v_combined_old) { tmp = vec; tmp.emplace_back(e); /* Add only unique elements */ if (std::find(v_combined.begin(), v_combined.end(), tmp) == v_combined.end()) { v_combined.emplace_back(tmp); } } } } return v_combined; } void DataSet::normalize() { this->normalized = false; if(!this->normalization_strategy) { THROW_INVALID_ARGUMENT_ERROR("There is no normalization strategy given for this data set, so it can not be " "normalized!"); } /* Find maximum and minimum values */ if(this->max_min_inp_val.empty()) { this->max_min_inp_val.emplace_back(this->data.at(0).first.at(0)); this->max_min_inp_val.emplace_back(this->data.at(0).first.at(0)); } double tmp, tmp2; for(auto pair : this->data) { /* Finding maximum */ //TODO make more efficiently tmp = *std::max_element(pair.first.begin(), pair.first.end()); tmp2 = *std::max_element(pair.second.begin(), pair.second.end()); tmp = std::max(tmp, tmp2); /* Testing for a new maxima */ if (tmp > this->max_min_inp_val.at(0)) { this->max_min_inp_val.at(0) = tmp; } /* Finding minimum */ tmp = *std::min_element(pair.first.begin(), pair.first.end()); tmp2 = *std::min_element(pair.second.begin(), pair.second.end()); tmp = std::min(tmp, tmp2); /* Testing for a new minima */ if (tmp < this->max_min_inp_val.at(1)) { this->max_min_inp_val.at(1) = tmp; } } /* Normalize every number in the data set */ for(auto& pair : this->data) { for(auto& v : pair.first) { v = this->normalization_strategy->normalize(v, this->max_min_inp_val.at(0), this->max_min_inp_val.at(1)); } for(auto& v : pair.second) { v = this->normalization_strategy->normalize(v, this->max_min_inp_val.at(0), this->max_min_inp_val.at(1)); } } this->normalized = true; } double DataSet::get_normalized_value(double val){ if(!this->normalized || !this->normalization_strategy) { return val; } return this->normalization_strategy->normalize(val, this->max_min_inp_val.at(0), this->max_min_inp_val.at(1)); } void DataSet::get_input(std::vector<double> &d, size_t idx){ assert(d.size() == this->data[idx].first.size()); for (size_t j = 0; j < this->data[idx].first.size(); ++j) { d[j] = this->data[idx].first[j]; } } void DataSet::get_output(std::vector<double> &d, size_t idx){ assert(d.size() == this->data[idx].second.size()); for (size_t j = 0; j < this->data[idx].second.size(); ++j) { d[j] = this->data[idx].second[j]; } } void DataSet::de_normalize() { std::vector<double> tmp_inp(this->data.at(0).first.size()); std::vector<double> tmp_out(this->data.at(0).second.size()); for(auto& pair: this->data) { for(size_t i=0; i < pair.first.size(); i++) { tmp_inp.at(i) = this->normalization_strategy->de_normalize(pair.first.at(i)); } pair.first = tmp_inp; } for(auto& pair: this->data) { for(size_t i=0; i < pair.second.size(); i++) { tmp_out.at(i) = this->normalization_strategy->de_normalize(pair.second.at(i)); } pair.second = tmp_out; } /* Remove found max and minimal values, because of is_normalized() method */ this->max_min_inp_val.clear(); } void DataSet::de_normalize_single(std::vector<double> &d1, std::vector<double> &d2){ assert(d1.size() == d2.size()); for (size_t j = 0; j < d1.size(); ++j) { d2[j] = this->normalization_strategy->de_normalize(d1[j]); } } NormalizationStrategy* DataSet::get_normalization_strategy() { return this->normalization_strategy; } bool DataSet::is_normalized() { return !this->max_min_inp_val.empty(); } double DataSet::get_max_inp_val() { return this->max_min_inp_val.at(0); } double DataSet::get_min_inp_val() { return this->max_min_inp_val.at(1); } /** * Method returning random amount of data pairs between 1-max */ std::vector<std::pair<std::vector<double>, std::vector<double>>> DataSet::get_random_data_batch(size_t max) { if (max <= 0) { return this->data; } else { std::vector<std::pair<std::vector<double>, std::vector<double>>> newData; srand(time(NULL)); //TODO use Mersen twister from Boost size_t n_chosen = rand() % std::min(max, this->data.size())+1; n_chosen = max; std::vector<size_t> chosens; size_t chosen; for (int i = 0; i < n_chosen; i++) { chosen = rand() % this->data.size(); auto it = std::find(chosens.begin(), chosens.end(), chosen); if (it != chosens.end()) { i--; } else { newData.push_back(this->data.at(chosen)); chosens.push_back( chosen ); } } return newData; } } }