Newer
Older
Martin Beseda
committed
#include <algorithm>
#include <filesystem>
#include <numeric>
#include <random>
#include <utility>
Martin Beseda
committed
#include <boost/serialization/export.hpp>
Martin Beseda
committed
#include "DataSetSerialization.h"
#include "exceptions.h"
Martin Beseda
committed
BOOST_CLASS_EXPORT_IMPLEMENT(lib4neuro::DataSet);
Martin Beseda
committed
namespace lib4neuro {
Martin Beseda
committed
Martin Beseda
committed
/**
 * Creates an empty data set: no elements and zero input/output dimensions.
 */
DataSet::DataSet() : n_elements(0), input_dim(0), output_dim(0) {}
Martin Beseda
committed
/**
 * Creates a data set by deserializing it from a Boost text archive.
 *
 * @param file_path Path to the file holding the serialized data set
 *
 * Reports an error through THROW_RUNTIME_ERROR when the file cannot be opened.
 */
DataSet::DataSet(std::string file_path) {
    std::ifstream ifs(file_path);

    if(ifs.is_open()) {
        boost::archive::text_iarchive ia(ifs);
        ia >> *this;
        /* No explicit close() - the stream is closed by its destructor */
    } else {
        THROW_RUNTIME_ERROR("File " + file_path + " couldn't be open!");
    }
}
Martin Beseda
committed
Martin Beseda
committed
/**
 * Creates a data set holding a copy of the given (input, output) pairs.
 *
 * The input/output dimensions are taken from the first pair; an empty vector
 * yields a data set with zero dimensions.
 *
 * @param data_ptr Pointer to the pairs to be copied, must not be null
 * @param ns       Normalization strategy, stored only when non-null
 *
 * Reports an error through THROW_INVALID_ARGUMENT_ERROR when data_ptr is null.
 */
DataSet::DataSet(std::vector<std::pair<std::vector<double>, std::vector<double>>> *data_ptr,
                 NormalizationStrategy* ns) {
    if(!data_ptr) {
        THROW_INVALID_ARGUMENT_ERROR("Data set can not be constructed from a null data pointer!");
    }

    this->data = *data_ptr;
    this->n_elements = this->data.size();

    /* Reading data[0] of an empty vector is undefined behavior */
    if(this->data.empty()) {
        this->input_dim = 0;
        this->output_dim = 0;
    } else {
        this->input_dim = this->data[0].first.size();
        this->output_dim = this->data[0].second.size();
    }

    if(ns) {
        this->normalization_strategy = ns;
    }

    //TODO check the complete data set for input/output dimensions
}
Martin Beseda
committed
/**
 * Creates a 1:1-dimensional data set and fills it through the
 * one-dimensional add_isotropic_data() overload.
 *
 * @param lower_bound Lower bound passed to add_isotropic_data()
 * @param upper_bound Upper bound passed to add_isotropic_data()
 * @param size        Number of data pairs to generate
 * @param output      Output value shared by all generated pairs
 * @param ns          Normalization strategy, stored only when non-null
 */
DataSet::DataSet(double lower_bound,
                 double upper_bound,
                 unsigned int size,
                 double output,
                 NormalizationStrategy* ns) {
    /* 'data' starts out empty, so there is nothing to reset here */
    this->n_elements = 0;
    this->input_dim = 1;
    this->output_dim = 1;

    if(ns) {
        this->normalization_strategy = ns;
    }

    this->add_isotropic_data(lower_bound, upper_bound, size, output);
}
/**
 * Creates a data set and fills it through the multi-dimensional
 * add_isotropic_data() overload.
 *
 * @param bounds              Flat list of bounds, two values per input dimension
 * @param no_elems_in_one_dim Number of grid points generated per dimension
 * @param output_func         Function computing the output vector for an input vector
 * @param output_dim          Dimension of the output vectors
 * @param ns                  Normalization strategy, stored only when non-null
 */
DataSet::DataSet(std::vector<double> &bounds,
                 unsigned int no_elems_in_one_dim,
                 std::vector<double> (*output_func)(std::vector<double> &),
                 unsigned int output_dim,
                 NormalizationStrategy* ns) {
    /* 'data' starts out empty, so there is nothing to reset here */
    this->input_dim = bounds.size() / 2;
    this->output_dim = output_dim;
    this->n_elements = 0;

    if(ns) {
        this->normalization_strategy = ns;
    }

    this->add_isotropic_data(bounds, no_elems_in_one_dim, output_func);
}
Martin Beseda
committed
/**
 * Appends a copy of one (input, output) pair to the data set.
 *
 * When the data set is empty and has no dimensions yet, the dimensions are
 * taken from this first pair.
 *
 * @param inputs  Input vector, its size must match the input dimension
 * @param outputs Output vector, its size must match the output dimension
 *
 * Reports an error through THROW_RUNTIME_ERROR on a dimension mismatch.
 */
void DataSet::add_data_pair(std::vector<double> &inputs, std::vector<double> &outputs) {
    if(this->n_elements == 0 && this->input_dim == 0 && this->output_dim == 0) {
        this->input_dim = inputs.size();
        this->output_dim = outputs.size();
    }

    if (inputs.size() != this->input_dim) {
        THROW_RUNTIME_ERROR("Bad input dimension.");
    } else if (outputs.size() != this->output_dim) {
        THROW_RUNTIME_ERROR("Bad output dimension.");
    }

    this->n_elements++;
    /* Construct the pair in place instead of building a temporary first */
    this->data.emplace_back(inputs, outputs);
}
Martin Beseda
committed
void DataSet::add_isotropic_data(double lower_bound, double upper_bound, unsigned int size, double output) {
Martin Beseda
committed
if (this->input_dim != 1 || this->output_dim != 1) {
THROW_RUNTIME_ERROR("Cannot add data with dimensionality 1:1 when the data set "
"is of different dimensionality!");
Martin Beseda
committed
}
Martin Beseda
committed
double frac = (upper_bound - lower_bound) / (size - 1);
std::vector<double> inp, out;
Martin Beseda
committed
out = {output};
Martin Beseda
committed
for (unsigned int i = 0; i < size; ++i) {
inp = {frac * i};
this->data.emplace_back(std::make_pair(inp, out));
}
Martin Beseda
committed
this->n_elements += size;
Martin Beseda
committed
void DataSet::add_isotropic_data(std::vector<double> &bounds, unsigned int no_elems_in_one_dim,
std::vector<double> (*output_func)(std::vector<double> &)) {
// TODO add check of dataset dimensions
Martin Beseda
committed
std::vector<std::vector<double>> grid;
std::vector<double> tmp;
double frac;
Martin Beseda
committed
for (unsigned int i = 0; i < bounds.size(); i += 2) {
frac = (bounds[i] + bounds[i + 1]) / (no_elems_in_one_dim - 1);
tmp.clear();
for (double j = bounds[i]; j <= bounds[i + 1]; j += frac) {
tmp.emplace_back(j);
}
Martin Beseda
committed
grid.emplace_back(tmp);
Martin Beseda
committed
grid = this->cartesian_product(&grid);
for (auto vec : grid) {
this->n_elements++;
this->data.emplace_back(std::make_pair(vec, output_func(vec)));
}
Martin Beseda
committed
std::vector<std::pair<std::vector<double>, std::vector<double>>> *DataSet::get_data() {
return &(this->data);
}
Martin Beseda
committed
/**
 * @return Value of the element counter kept by the data set
 */
size_t DataSet::get_n_elements() {
    return this->n_elements;
}
Martin Beseda
committed
size_t DataSet::get_input_dim() {
return this->input_dim;
}
Martin Beseda
committed
size_t DataSet::get_output_dim() {
return this->output_dim;
}
/**
 * Prints every data pair to the standard output, one pair per line, in the
 * form "<inputs> -> <outputs>" with the values separated by spaces.
 * Nothing is printed when the element counter is zero.
 */
void DataSet::print_data() {
    if (n_elements) {
        /* Iterate by const reference - there is no need to copy every pair */
        for (const auto &p : this->data) {
            /* INPUT */
            for (const auto &v : p.first) {
                std::cout << v << " ";
            }

            std::cout << "-> ";

            /* OUTPUT */
            for (const auto &v : p.second) {
                std::cout << v << " ";
            }

            std::cout << std::endl;
        }
    }
}
Martin Beseda
committed
void DataSet::store_text(std::string file_path) {
Martin Beseda
committed
std::ofstream ofs(file_path);
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
if(!ofs.is_open()) {
THROW_RUNTIME_ERROR("File " + file_path + " couldn't be open!");
} else {
boost::archive::text_oarchive oa(ofs);
oa << *this;
ofs.close();
}
}
/**
 * Writes the data pairs into a plain text file, one pair per line: the input
 * values separated by commas, a single space, then the output values
 * separated by commas.
 *
 * @param file_path Path of the file to be written
 *
 * Reports an error through THROW_RUNTIME_ERROR when the file cannot be opened.
 */
void DataSet::store_data_text(std::string file_path) {
    std::ofstream ofs(file_path);

    if(!ofs.is_open()) {
        THROW_RUNTIME_ERROR("File " + file_path + " couldn't be open!");
    } else {
        /* Writes comma-separated values without a trailing comma; an empty vector
         * writes nothing (the 'size() - 1' form underflowed in that case) */
        auto write_joined = [&ofs](const std::vector<double> &values) {
            for (size_t i = 0; i < values.size(); ++i) {
                if (i > 0) {
                    ofs << ",";
                }
                ofs << values[i];
            }
        };

        for (const auto &e : this->data) {
            /* First part of the pair */
            write_joined(e.first);
            ofs << " ";

            /* Second part of the pair */
            write_joined(e.second);
            ofs << std::endl;
        }
    }
}
Martin Beseda
committed
/**
 * Computes the Cartesian product of the given vectors.
 *
 * Every resulting vector contains one element from each vector of 'v', in the
 * order of the vectors in 'v'. Combinations already present in the result are
 * not added again.
 *
 * @param v Pointer to the vectors to be combined; 'v->at(0)' is accessed, so
 *          an empty 'v' makes std::vector::at throw std::out_of_range
 * @return Vector of all combinations
 */
template<class T>
std::vector<std::vector<T>> DataSet::cartesian_product(const std::vector<std::vector<T>> *v) {
    /* The locals use T, so the template works for element types other than double */
    std::vector<std::vector<T>> v_combined_old, v_combined;
    std::vector<T> tmp;

    /* Start with one single-element combination per element of the first vector */
    for (const auto &e : v->at(0)) {
        v_combined.push_back(std::vector<T>{e});
    }

    for (size_t i = 1; i < v->size(); i++) {  // Iterate through remaining vectors of 'v'
        /* Swapping avoids copying all combinations built so far */
        v_combined_old.swap(v_combined);
        v_combined.clear();

        for (const auto &e : v->at(i)) {
            for (const auto &vec : v_combined_old) {
                tmp = vec;
                tmp.emplace_back(e);

                /* Add only unique elements */
                if (std::find(v_combined.begin(), v_combined.end(), tmp) == v_combined.end()) {
                    v_combined.emplace_back(tmp);
                }
            }
        }
    }

    return v_combined;
}
Martin Beseda
committed
void DataSet::normalize() {
if(!this->normalization_strategy) {
THROW_INVALID_ARGUMENT_ERROR("There is no normalization strategy given for this data set, so it can not be "
"normalized!");
Martin Beseda
committed
/* Find maximum and minimum values */
if(this->max_min_inp_val.empty()) {
this->max_min_inp_val.emplace_back(this->data.at(0).first.at(0));
this->max_min_inp_val.emplace_back(this->data.at(0).first.at(0));
}
Martin Beseda
committed
double tmp, tmp2;
for(auto pair : this->data) {
/* Finding maximum */
//TODO make more efficiently
tmp = *std::max_element(pair.first.begin(), pair.first.end());
tmp2 = *std::max_element(pair.second.begin(), pair.second.end());
tmp = std::max(tmp, tmp2);
Martin Beseda
committed
/* Testing for a new maxima */
if (tmp > this->max_min_inp_val.at(0)) {
this->max_min_inp_val.at(0) = tmp;
Martin Beseda
committed
}
/* Finding minimum */
tmp = *std::min_element(pair.first.begin(), pair.first.end());
tmp2 = *std::min_element(pair.second.begin(), pair.second.end());
tmp = std::min(tmp, tmp2);
Martin Beseda
committed
/* Testing for a new minima */
if (tmp < this->max_min_inp_val.at(1)) {
this->max_min_inp_val.at(1) = tmp;
Martin Beseda
committed
}
}
/* Normalize every number in the data set */
for(auto& pair : this->data) {
for(auto& v : pair.first) {
Martin Beseda
committed
v = this->normalization_strategy->normalize(v, this->max_min_inp_val.at(0), this->max_min_inp_val.at(1));
Martin Beseda
committed
}
for(auto& v : pair.second) {
Martin Beseda
committed
v = this->normalization_strategy->normalize(v, this->max_min_inp_val.at(0), this->max_min_inp_val.at(1));
Martin Beseda
committed
}
}
// this->normalized = true;
Martin Beseda
committed
}
/**
 * Copies the input vector of the pair at position 'idx' into 'd'.
 *
 * @param d   Destination vector, asserted to have the same size as the stored input
 * @param idx Index of the data pair (not range-checked)
 */
void DataSet::get_input(std::vector<double> &d, size_t idx){
    const std::vector<double> &stored = this->data[idx].first;
    assert(d.size() == stored.size());

    std::copy(stored.begin(), stored.end(), d.begin());
}
/**
 * Copies the output vector of the pair at position 'idx' into 'd'.
 *
 * @param d   Destination vector, asserted to have the same size as the stored output
 * @param idx Index of the data pair (not range-checked)
 */
void DataSet::get_output(std::vector<double> &d, size_t idx){
    const std::vector<double> &stored = this->data[idx].second;
    assert(d.size() == stored.size());

    std::copy(stored.begin(), stored.end(), d.begin());
}
void DataSet::de_normalize_single(std::vector<double> &d1, std::vector<double> &d2){
assert(d1.size() == d2.size());
for (size_t j = 0; j < d1.size(); ++j) {
d2[j] = this->normalization_strategy->de_normalize(d1[j]);
/**
 * @return Pointer to the normalization strategy stored in the data set
 */
NormalizationStrategy* DataSet::get_normalization_strategy() {
    NormalizationStrategy* strategy = normalization_strategy;
    return strategy;
}
bool DataSet::is_normalized() {
return !this->max_min_inp_val.empty();
}
double DataSet::get_max_inp_val() {
Martin Beseda
committed
return this->max_min_inp_val.at(0);
}
double DataSet::get_min_inp_val() {
Martin Beseda
committed
return this->max_min_inp_val.at(1);
Martin Beseda
committed
/**
 * Method returning random amount of data pairs between 1-max
 *
 * The batch size is drawn uniformly from 1 to min(max, number of stored pairs)
 * and every stored pair is contained in the batch at most once.
 *
 * @param max Upper limit of the batch size; 0 returns a copy of the whole data set
 * @return Copies of the chosen (input, output) pairs; empty when the data set is empty
 */
std::vector<std::pair<std::vector<double>, std::vector<double>>> DataSet::get_random_data_batch(size_t max) {
    /* An empty data set would otherwise lead to a modulo/division by zero */
    if (max == 0 || this->data.empty()) {
        return this->data;
    }

    /* Seeded once per thread - re-seeding with the current time on every call
     * produced identical batches for calls within the same second */
    static thread_local std::mt19937 rng{std::random_device{}()};

    const size_t n_total = this->data.size();
    const size_t n_chosen = std::uniform_int_distribution<size_t>(1, std::min(max, n_total))(rng);

    std::vector<size_t> indices(n_total);
    std::iota(indices.begin(), indices.end(), size_t{0});

    std::vector<std::pair<std::vector<double>, std::vector<double>>> newData;
    newData.reserve(n_chosen);

    /* Partial Fisher-Yates shuffle: position i receives a random not-yet-chosen index,
     * so no pair can be picked twice and no retry loop is needed */
    for (size_t i = 0; i < n_chosen; ++i) {
        std::uniform_int_distribution<size_t> pick(i, n_total - 1);
        std::swap(indices[i], indices[pick(rng)]);
        newData.push_back(this->data[indices[i]]);
    }

    return newData;
}