Newer
Older
Martin Beseda
committed
#include <algorithm>
Martin Beseda
committed
#include <boost/serialization/export.hpp>
Martin Beseda
committed
#include "DataSetSerialization.h"
#include "exceptions.h"
Martin Beseda
committed
BOOST_CLASS_EXPORT_IMPLEMENT(lib4neuro::DataSet);
Martin Beseda
committed
namespace lib4neuro {
Martin Beseda
committed
Martin Beseda
committed
/**
 * Creates an empty data set.
 *
 * The element count and both dimensions start at zero and the normalization
 * strategy is set to a DoubleUnitStrategy instance.
 */
DataSet::DataSet() {
    n_elements = 0;
    input_dim = 0;
    output_dim = 0;

    auto default_strategy = std::make_shared<DoubleUnitStrategy>(DoubleUnitStrategy());
    normalization_strategy = default_strategy;
}
Martin Beseda
committed
/**
 * Loads a data set from a Boost text archive stored in a file.
 *
 * @param file_path Path to the file holding the serialized DataSet.
 *
 * An error is raised through THROW_RUNTIME_ERROR when the file cannot be
 * opened or when Boost reports an archive error while reading it.
 */
DataSet::DataSet(std::string file_path) {
    std::ifstream ifs(file_path);

    if (ifs.is_open()) {
        try {
            boost::archive::text_iarchive ia(ifs);
            ia >> *this;
        }
        catch (const boost::archive::archive_exception& e) {
            /* Start from std::string so the concatenation is valid on its own. */
            THROW_RUNTIME_ERROR(std::string("Serialized archive error: '") + e.what()
                                + "'! Please, check if your file is really "
                                  "the serialized DataSet.");
        }
        ifs.close();
    } else {
        THROW_RUNTIME_ERROR("File " + file_path + " couldn't be open!");
    }

    /* The strategy is always replaced by a fresh DoubleUnitStrategy after loading. */
    this->normalization_strategy = std::make_shared<DoubleUnitStrategy>(DoubleUnitStrategy());
}
Martin Beseda
committed
/**
 * Creates a data set from an existing collection of (input, output) pairs.
 *
 * The pairs are copied. Input and output dimensions are taken from the first
 * pair; an empty collection yields zero dimensions.
 *
 * @param data_ptr Pointer to the pairs to copy; a null pointer is rejected
 *                 through THROW_INVALID_ARGUMENT_ERROR.
 * @param ns Normalization strategy; the data set takes ownership of it via
 *           its shared pointer.
 */
DataSet::DataSet(std::vector<std::pair<std::vector<double>, std::vector<double>>>* data_ptr,
                 NormalizationStrategy* ns) {
    if (data_ptr == nullptr) {
        THROW_INVALID_ARGUMENT_ERROR("Data pointer must not be null!");
    }

    this->data = *data_ptr;
    this->n_elements = this->data.size();

    /* Reading the first pair of an empty vector would be undefined behaviour. */
    if (this->data.empty()) {
        this->input_dim = 0;
        this->output_dim = 0;
    } else {
        this->input_dim = this->data.front().first.size();
        this->output_dim = this->data.front().second.size();
    }

    this->normalization_strategy.reset(ns);

    //TODO check the complete data set for input/output dimensions
}
Martin Beseda
committed
/**
 * Creates a 1:1 data set and fills it through add_isotropic_data().
 *
 * @param lower_bound Forwarded to add_isotropic_data().
 * @param upper_bound Forwarded to add_isotropic_data().
 * @param size Number of pairs to generate.
 * @param output Output value shared by all generated pairs.
 * @param ns Normalization strategy; when non-null the data set takes
 *           ownership of it, otherwise the current strategy is left untouched.
 */
DataSet::DataSet(double lower_bound,
                 double upper_bound,
                 unsigned int size,
                 double output,
                 NormalizationStrategy* ns) {
    this->n_elements = 0;

    /* add_isotropic_data() rejects any dimensionality other than 1:1, so both
     * dimensions have to be set before it is called. */
    this->input_dim = 1;
    this->output_dim = 1;

    /* Store the strategy in the member; wrapping it only in a local shared
     * pointer would delete it as soon as the constructor returns. */
    if (ns != nullptr) {
        this->normalization_strategy.reset(ns);
    }

    this->add_isotropic_data(lower_bound,
                             upper_bound,
                             size,
                             output);
}
unsigned int no_elems_in_one_dim,
std::vector<double> (* output_func)(std::vector<double>&),
unsigned int output_dim,
NormalizationStrategy* ns) {
Martin Beseda
committed
std::vector<std::pair<std::vector<double>, std::vector<double>>> new_data_vec;
this->data = new_data_vec;
this->input_dim = bounds.size() / 2;
Martin Beseda
committed
this->output_dim = output_dim;
this->n_elements = 0;
std::shared_ptr<NormalizationStrategy> ns_tmp;
ns_tmp.reset(ns);
this->normalization_strategy = ns_tmp;
this->add_isotropic_data(bounds,
no_elems_in_one_dim,
output_func);
Martin Beseda
committed
}
/**
 * Subtracts the first pair's output vector from the output of every pair.
 *
 * The reference vector is copied up front because the first pair itself is
 * shifted as well. Accessing the first pair uses at(), so an empty data set
 * results in std::out_of_range.
 */
void DataSet::shift_outputs_to_zero() {
    const std::vector<double> reference_output = this->data.at(0).second;
    const size_t n_outputs = this->get_output_dim();

    for (auto& sample : this->data) {
        for (size_t col = 0; col < n_outputs; ++col) {
            sample.second[col] -= reference_output[col];
        }
    }
}
/**
 * Appends one (input, output) pair to the data set.
 *
 * When the data set is empty and has no dimensions yet, the pair defines both
 * the input and the output dimension. Otherwise the sizes of both vectors
 * have to match the stored dimensions; a mismatch is reported through
 * THROW_RUNTIME_ERROR.
 *
 * @param inputs Input vector of the new pair (copied).
 * @param outputs Output vector of the new pair (copied).
 */
void DataSet::add_data_pair(std::vector<double>& inputs,
                            std::vector<double>& outputs) {
    if (this->n_elements == 0 && this->input_dim == 0 && this->output_dim == 0) {
        /* Both dimensions must be adopted; setting only the output dimension
         * would make every non-empty first input fail the check below. */
        this->input_dim = inputs.size();
        this->output_dim = outputs.size();
    }

    if (inputs.size() != this->input_dim) {
        THROW_RUNTIME_ERROR("Bad input dimension.");
    } else if (outputs.size() != this->output_dim) {
        THROW_RUNTIME_ERROR("Bad output dimension.");
    }

    this->n_elements++;
    this->data.emplace_back(inputs, outputs);
}
void DataSet::add_isotropic_data(double lower_bound,
double upper_bound,
unsigned int size,
double output) {
Martin Beseda
committed
if (this->input_dim != 1 || this->output_dim != 1) {
THROW_RUNTIME_ERROR("Cannot add data with dimensionality 1:1 when the data set "
"is of different dimensionality!");
Martin Beseda
committed
}
THROW_INVALID_ARGUMENT_ERROR("Size of added data has to be >=1 !");
} else if (size == 1) {
frac = 1;
} else {
frac = (upper_bound - lower_bound) / (size - 1);
}
Martin Beseda
committed
std::vector<double> inp, out;
Martin Beseda
committed
out = {output};
Martin Beseda
committed
for (unsigned int i = 0; i < size; ++i) {
inp = {frac * i};
this->data.emplace_back(std::make_pair(inp,
out));
Martin Beseda
committed
}
Martin Beseda
committed
this->n_elements += size;
void DataSet::add_isotropic_data(std::vector<double>& bounds,
unsigned int no_elems_in_one_dim,
std::vector<double> (* output_func)(std::vector<double>&)) {
Martin Beseda
committed
// TODO add check of dataset dimensions
Martin Beseda
committed
std::vector<std::vector<double>> grid;
std::vector<double> tmp;
double frac;
THROW_INVALID_ARGUMENT_ERROR("Number of elements in one dimension has to be >=1 !");
}
Martin Beseda
committed
for (unsigned int i = 0; i < bounds.size(); i += 2) {
frac = (bounds[i] - bounds[i + 1]) / (no_elems_in_one_dim - 1);
Martin Beseda
committed
tmp.clear();
for (double j = bounds[i]; j <= bounds[i + 1]; j += frac) {
tmp.emplace_back(j);
}
Martin Beseda
committed
grid.emplace_back(tmp);
Martin Beseda
committed
grid = this->cartesian_product(&grid);
for (auto vec : grid) {
this->n_elements++;
this->data.emplace_back(std::make_pair(vec,
output_func(vec)));
Martin Beseda
committed
}
std::vector<std::pair<std::vector<double>, std::vector<double>>>* DataSet::get_data() {
Martin Beseda
committed
return &(this->data);
}
Martin Beseda
committed
/**
 * @return Value of the element counter maintained by the data set.
 */
size_t DataSet::get_n_elements() {
    return this->n_elements;
}
size_t DataSet::get_input_dim() {
return this->input_dim;
}
Martin Beseda
committed
size_t DataSet::get_output_dim() {
return this->output_dim;
}
/**
 * Prints every pair to standard output, one pair per line, in the form
 * "<inputs> -> <outputs>" with values separated by spaces.
 *
 * Nothing is printed when the element counter is zero.
 */
void DataSet::print_data() {
    if (n_elements) {
        /* Iterate by reference: copying each pair only to print it is wasteful. */
        for (const auto& p : this->data) {
            /* INPUT */
            for (const double v : p.first) {
                std::cout << v << " ";
            }

            std::cout << "-> ";

            /* OUTPUT */
            for (const double v : p.second) {
                std::cout << v << " ";
            }

            std::cout << std::endl;
        }
    }
}
void DataSet::store_text(std::string file_path) {
Martin Beseda
committed
std::ofstream ofs(file_path);
THROW_RUNTIME_ERROR("File " + file_path + " couldn't be open!");
} else {
boost::archive::text_oarchive oa(ofs);
oa << *this;
ofs.close();
}
}
/**
 * Writes all pairs to an already opened stream, one pair per line.
 *
 * Each line holds the comma-separated inputs, a single space and the
 * comma-separated outputs. Every value is passed through
 * get_denormalized_value() before being written.
 *
 * @param file_path Pointer to the output stream to write to.
 */
void DataSet::store_data_text(std::ofstream* file_path) {
    for (const auto& e : this->data) {
        /* First part of the pair */
        /* "i + 1 < size" avoids the unsigned underflow of "size - 1" on an empty vector. */
        for (size_t i = 0; i + 1 < e.first.size(); i++) {
            *file_path << this->get_denormalized_value(e.first.at(i)) << ",";
        }
        if (!e.first.empty()) {
            *file_path << this->get_denormalized_value(e.first.back());
        }
        *file_path << " ";

        /* Second part of the pair */
        for (size_t i = 0; i + 1 < e.second.size(); i++) {
            *file_path << this->get_denormalized_value(e.second.at(i)) << ",";
        }
        if (!e.second.empty()) {
            *file_path << this->get_denormalized_value(e.second.back());
        }
        *file_path << std::endl;
    }
}
void DataSet::store_data_text(std::string file_path) {
std::ofstream ofs(file_path);
THROW_RUNTIME_ERROR("File " + file_path + " couldn't be open!");
} else {
ofs.close();
Martin Beseda
committed
/**
 * Computes the Cartesian product of the given vectors.
 *
 * Every resulting tuple holds one element from each vector of 'v', in the
 * order of the vectors. Tuples built from the second vector onwards are added
 * only when not already present in the result.
 *
 * @tparam T Element type of the combined vectors.
 * @param v Pointer to the vectors to combine; the first vector is accessed
 *          with at(0), so an empty 'v' results in std::out_of_range.
 * @return All combined tuples.
 */
template<class T>
std::vector<std::vector<T>> DataSet::cartesian_product(const std::vector<std::vector<T>>* v) {
    /* The working containers use T so the template is valid for any element
     * type, not only for double. */
    std::vector<std::vector<T>> v_combined_old, v_combined;
    std::vector<T> tmp;

    /* Start with one single-element tuple per element of the first vector */
    for (const auto& e : v->at(0)) {
        tmp = {e};
        v_combined.emplace_back(tmp);
    }

    for (size_t i = 1; i < v->size(); i++) {  // Iterate through remaining vectors of 'v'
        v_combined_old = v_combined;
        v_combined.clear();

        for (const auto& e : v->at(i)) {
            for (const auto& vec : v_combined_old) {
                tmp = vec;
                tmp.emplace_back(e);

                /* Add only unique elements */
                if (std::find(v_combined.begin(),
                              v_combined.end(),
                              tmp) == v_combined.end()) {
                    v_combined.emplace_back(tmp);
                }
            }
        }
    }

    return v_combined;
}
/**
 * Normalizes every input and output value with the normalization strategy.
 *
 * The largest and smallest value over all inputs and outputs are kept in
 * max_min_inp_val (index 0 = maximum, index 1 = minimum) and handed to the
 * strategy for each value. The 'normalized' flag is cleared at the start and
 * set once all values have been processed.
 *
 * A missing strategy is reported through THROW_INVALID_ARGUMENT_ERROR. The
 * first input value is read with at(), so an empty data set results in
 * std::out_of_range.
 */
void DataSet::normalize() {
    this->normalized = false;
    if (!this->normalization_strategy) {
        THROW_INVALID_ARGUMENT_ERROR("There is no normalization strategy given for this data set, so it can not be "
                                     "normalized!");
    }

    /* Find maximum and minimum values */
    if (this->max_min_inp_val.empty()) {
        this->max_min_inp_val.emplace_back(this->data.at(0).first.at(0));
        this->max_min_inp_val.emplace_back(this->data.at(0).first.at(0));
    }

    double& max_val = this->max_min_inp_val.at(0);
    double& min_val = this->max_min_inp_val.at(1);

    /* Scanning value by value stays well defined for empty vectors, unlike
     * dereferencing the result of max_element/min_element on an empty range. */
    const auto widen_range = [&max_val, &min_val](const std::vector<double>& values) {
        for (const double v : values) {
            /* Testing for a new maxima */
            if (v > max_val) {
                max_val = v;
            }
            /* Testing for a new minima */
            if (v < min_val) {
                min_val = v;
            }
        }
    };

    for (const auto& pair : this->data) {
        widen_range(pair.first);
        widen_range(pair.second);
    }

    /* Normalize every number in the data set */
    for (auto& pair : this->data) {
        for (auto& v : pair.first) {
            v = this->normalization_strategy->normalize(v,
                                                        this->max_min_inp_val.at(0),
                                                        this->max_min_inp_val.at(1));
        }

        for (auto& v : pair.second) {
            v = this->normalization_strategy->normalize(v,
                                                        this->max_min_inp_val.at(0),
                                                        this->max_min_inp_val.at(1));
        }
    }

    this->normalized = true;
}
/**
 * Normalizes a single value with the stored extrema.
 *
 * @param val Value to normalize.
 * @return The strategy's result for 'val' when the data set is flagged as
 *         normalized and a strategy is present; otherwise 'val' unchanged.
 */
double DataSet::get_normalized_value(double val) {
    /* Both the flag and the strategy are required for normalization */
    if (this->normalized && this->normalization_strategy) {
        return this->normalization_strategy->normalize(val,
                                                       this->max_min_inp_val.at(0),
                                                       this->max_min_inp_val.at(1));
    }

    return val;
}
/**
 * Converts a single value back from its normalized form.
 *
 * @param val Value to de-normalize.
 * @return The strategy's de_normalize() result for 'val' when the data set is
 *         flagged as normalized and a strategy is present; otherwise 'val'
 *         unchanged.
 */
double DataSet::get_denormalized_value(double val) {
    if (!this->normalized || !this->normalization_strategy) {
        return val;
    }
    return this->normalization_strategy->de_normalize(val);
}
/**
 * Copies the input vector of one pair into a caller-provided vector.
 *
 * @param d Destination; asserted to have the same size as the stored input.
 * @param idx Index of the pair to read (not range-checked).
 */
void DataSet::get_input(std::vector<double>& d,
                        size_t idx) {
    const std::vector<double>& stored_input = this->data[idx].first;
    assert(d.size() == stored_input.size());

    size_t pos = 0;
    for (const double value : stored_input) {
        d[pos++] = value;
    }
}
/**
 * Copies the output vector of one pair into a caller-provided vector.
 *
 * @param d Destination; asserted to have the same size as the stored output.
 * @param idx Index of the pair to read (not range-checked).
 */
void DataSet::get_output(std::vector<double>& d,
                         size_t idx) {
    const std::vector<double>& stored_output = this->data[idx].second;
    assert(d.size() == stored_output.size());

    size_t pos = 0;
    for (const double value : stored_output) {
        d[pos++] = value;
    }
}
/**
 * Replaces every stored input and output value by the strategy's
 * de_normalize() result and resets the normalization state.
 *
 * The values are converted in place, so pairs of any size (and an empty data
 * set) are handled without scratch vectors sized from the first pair.
 */
void DataSet::de_normalize() {
    for (auto& pair : this->data) {
        for (auto& v : pair.first) {
            v = this->normalization_strategy->de_normalize(v);
        }
        for (auto& v : pair.second) {
            v = this->normalization_strategy->de_normalize(v);
        }
    }

    /* Remove found max and minimal values, because of is_normalized() method */
    this->max_min_inp_val.clear();

    /* Keep the flag in step with the cleared extrema: get_normalized_value()
     * and get_denormalized_value() consult it before converting a value. */
    this->normalized = false;
}
/**
 * De-normalizes one vector element by element into another vector.
 *
 * @param d1 Source values.
 * @param d2 Destination; asserted to have the same size as 'd1'.
 */
void DataSet::de_normalize_single(std::vector<double>& d1,
                                  std::vector<double>& d2) {
    assert(d1.size() == d2.size());
    for (size_t j = 0; j < d1.size(); ++j) {
        d2[j] = this->normalization_strategy->de_normalize(d1[j]);
    }
}
/**
 * @return Non-owning pointer to the current normalization strategy; null when
 *         no strategy is held.
 */
NormalizationStrategy* DataSet::get_normalization_strategy() {
    return this->normalization_strategy.get();
}
/**
 * Replaces the normalization strategy.
 *
 * @param ns New strategy; the data set takes ownership of it. A null pointer
 *           is ignored and the current strategy is kept.
 */
void DataSet::set_normalization_strategy(NormalizationStrategy* ns) {
    if (ns != nullptr) {
        this->normalization_strategy.reset(ns);
    }
}
/**
 * @return True when max_min_inp_val holds at least one value, false otherwise.
 */
bool DataSet::is_normalized() {
    return this->max_min_inp_val.size() > 0;
}
double DataSet::get_max_inp_val() {
Martin Beseda
committed
return this->max_min_inp_val.at(0);
}
double DataSet::get_min_inp_val() {
Martin Beseda
committed
return this->max_min_inp_val.at(1);
Martin Beseda
committed
/**
* Method returning random amount of data pairs between 1-max
*/
std::vector<std::pair<std::vector<double>, std::vector<double>>> DataSet::get_random_data_batch(size_t max) {
Martin Beseda
committed
return this->data;
} else {
std::vector<std::pair<std::vector<double>, std::vector<double>>> newData;
size_t n_chosen = rand() % std::min(max,
this->data.size()) + 1;
std::vector<size_t> chosens;
for (size_t i = 0; i < n_chosen; i++) {
auto it = std::find(chosens.begin(),
chosens.end(),
chosen);
Martin Beseda
committed
if (it != chosens.end()) {
i--;
} else {
newData.push_back(this->data.at(chosen));
return newData;
}
}
/**
 * Appends zero-valued columns to the output vector of the first n_elements
 * pairs and increases the output dimension accordingly.
 *
 * @param n_columns Number of zero entries appended to each output vector.
 */
void DataSet::add_zero_output_columns(size_t n_columns)
{
    for (size_t row = 0; row < this->n_elements; ++row)
    {
        std::vector<double>& outputs = this->data.at(row).second;
        outputs.insert(outputs.end(), n_columns, 0.0);
    }

    this->output_dim += n_columns;
}