Skip to content
Snippets Groups Projects
DataSet.cpp 13.9 KiB
Newer Older
  • Learn to ignore specific revisions
  • Martin Beseda's avatar
    Martin Beseda committed
    //
    // Created by martin on 7/13/18.
    //
    
    
    #include <boost/serialization/export.hpp>
    
    /* Instantiates the Boost.Serialization class-export code for lib4neuro::DataSet in this translation unit. */
    BOOST_CLASS_EXPORT_IMPLEMENT(lib4neuro::DataSet);
    
        /**
         * Creates an empty data set: no stored elements and both the input and
         * the output dimension set to zero.
         */
        DataSet::DataSet() {
            /* All three counters start at zero */
            this->n_elements = this->input_dim = this->output_dim = 0;
        }
    
        /**
         * Loads a data set from a file holding a Boost text archive.
         *
         * @param file_path Path to the file with the serialized DataSet
         *
         * A failure to open the file is reported through THROW_RUNTIME_ERROR; so is
         * a boost::archive::archive_exception raised while reading the archive.
         */
        DataSet::DataSet(std::string file_path) {
            std::ifstream ifs(file_path);

            /* The archive may only be read from a stream that was really opened */
            if(ifs.is_open()) {
                try {
                    boost::archive::text_iarchive ia(ifs);
                    ia >> *this;
                } catch(const boost::archive::archive_exception& e) {
                    /* std::string makes the concatenation valid independently of the macro expansion */
                    THROW_RUNTIME_ERROR(std::string("Serialized archive error: '") + e.what() + "'! Please, check if your file is really "
                                                                                                "the serialized DataSet.");
                }
            } else {
                THROW_RUNTIME_ERROR("File " + file_path + " couldn't be open!");
            }
        }
    
        /**
         * Builds a data set by copying already prepared input/output pairs.
         *
         * @param data_ptr Pointer to the vector of (input, output) pairs to be copied
         * @param ns Normalization strategy to attach; left untouched when null
         *
         * The input and output dimensions are taken from the first pair; an empty
         * vector yields dimensions 0:0.
         */
        DataSet::DataSet(std::vector<std::pair<std::vector<double>, std::vector<double>>> *data_ptr,
                         NormalizationStrategy* ns) {

            this->data = *data_ptr;
            this->n_elements = this->data.size();

            /* Reading element 0 of an empty vector is undefined, so guard it */
            if(this->data.empty()) {
                this->input_dim = 0;
                this->output_dim = 0;
            } else {
                this->input_dim = this->data.front().first.size();
                this->output_dim = this->data.front().second.size();
            }

            if(ns) {
                this->normalization_strategy = ns;
    //            this->max_min_inp_val.emplace_back(this->normalization_strategy->get_max_value());
    //            this->max_min_inp_val.emplace_back(this->normalization_strategy->get_min_value());
            }

            //TODO check the complete data set for input/output dimensions
        }
    
        /**
         * Builds a data set with one-dimensional inputs and outputs by calling
         * add_isotropic_data(lower_bound, upper_bound, size, output).
         *
         * @param lower_bound Lower bound forwarded to add_isotropic_data
         * @param upper_bound Upper bound forwarded to add_isotropic_data
         * @param size Number of samples forwarded to add_isotropic_data
         * @param output Output value forwarded to add_isotropic_data
         * @param ns Normalization strategy to attach; left untouched when null
         */
        DataSet::DataSet(double lower_bound,
                         double upper_bound,
                         unsigned int size,
                         double output,
                         NormalizationStrategy* ns) {

            this->data.clear();
            this->n_elements = 0;
            this->input_dim = 1;
            this->output_dim = 1;

            if(ns) {
                this->normalization_strategy = ns;
    //            this->max_min_inp_val.emplace_back(this->normalization_strategy->get_max_value());
    //            this->max_min_inp_val.emplace_back(this->normalization_strategy->get_min_value());
            }

            this->add_isotropic_data(lower_bound, upper_bound, size, output);
        }
    
        /**
         * Builds a data set by calling
         * add_isotropic_data(bounds, no_elems_in_one_dim, output_func).
         *
         * @param bounds Bounds vector; the input dimension is set to half of its size
         * @param no_elems_in_one_dim Forwarded to add_isotropic_data
         * @param output_func Function evaluated by add_isotropic_data to obtain the outputs
         * @param output_dim Output dimension stored in the data set
         * @param ns Normalization strategy to attach; left untouched when null
         */
        DataSet::DataSet(std::vector<double> &bounds,
                         unsigned int no_elems_in_one_dim,
                         std::vector<double> (*output_func)(std::vector<double> &),
                         unsigned int output_dim,
                         NormalizationStrategy* ns) {

            /* Start from an empty container with zero stored elements */
            this->data.clear();
            this->n_elements = 0;

            this->output_dim = output_dim;
            this->input_dim = bounds.size() / 2;

            if(ns != nullptr) {
                this->normalization_strategy = ns;
            }

            this->add_isotropic_data(bounds, no_elems_in_one_dim, output_func);
        }
    
        /**
         * Appends one (input, output) pair to the data set.
         *
         * @param inputs Input vector of the new pair
         * @param outputs Output vector of the new pair
         *
         * When the data set is still empty with dimensions 0:0, the dimensions are
         * taken from this pair. A pair whose sizes do not match the data set
         * dimensions is reported through THROW_RUNTIME_ERROR.
         */
        void DataSet::add_data_pair(std::vector<double> &inputs, std::vector<double> &outputs) {

            if(this->n_elements == 0 && this->input_dim == 0 && this->output_dim == 0) {
                this->input_dim = inputs.size();
                this->output_dim = outputs.size();
            }

            if (inputs.size() != this->input_dim) {
                THROW_RUNTIME_ERROR("Bad input dimension.");
            } else if (outputs.size() != this->output_dim) {
                THROW_RUNTIME_ERROR("Bad output dimension.");
            }

            /* Both vectors are copied into the stored pair */
            this->n_elements++;
            this->data.emplace_back(inputs, outputs);
        }
    
        /**
         * Adds `size` pairs with one-dimensional input and output. The inputs are
         * equidistant samples of [lower_bound, upper_bound] (both ends included for
         * size > 1, only lower_bound for size == 1); every output equals `output`.
         *
         * @param lower_bound First sampled input value
         * @param upper_bound Last sampled input value (when size > 1)
         * @param size Number of added pairs, has to be >= 1
         * @param output Output value shared by all added pairs
         *
         * A data set of dimensionality other than 1:1 is reported through
         * THROW_RUNTIME_ERROR, size < 1 through THROW_INVALID_ARGUMENT_ERROR.
         */
        void DataSet::add_isotropic_data(double lower_bound, double upper_bound, unsigned int size, double output) {

            if (this->input_dim != 1 || this->output_dim != 1) {
                THROW_RUNTIME_ERROR("Cannot add data with dimensionality 1:1 when the data set "
                                    "is of different dimensionality!");
            }

            double frac;
            if(size < 1) {
                THROW_INVALID_ARGUMENT_ERROR("Size of added data has to be >=1 !");
            } else if (size == 1) {
                /* A single sample sits on the lower bound, the step is irrelevant */
                frac = 0;
            } else {
                frac = (upper_bound - lower_bound) / (size - 1);
            }

            std::vector<double> inp;
            const std::vector<double> out = {output};

            for (unsigned int i = 0; i < size; ++i) {
                /* Offset by lower_bound so the samples really cover the requested interval */
                inp = {lower_bound + frac * i};
                this->data.emplace_back(inp, out);
            }

            /* Keep the element counter in sync, as the other add_* methods do */
            this->n_elements += size;
        }
    
        /**
         * Adds a regular grid of points together with the outputs computed for them.
         *
         * @param bounds Consecutive (lower, upper) pairs, one pair per input dimension
         * @param no_elems_in_one_dim Number of equidistant samples per dimension, has to be >= 1
         * @param output_func Function called for every grid point to obtain its output vector
         *
         * Every dimension is sampled from its lower to its upper bound (only the
         * lower bound for a single sample); the grid is the result of
         * cartesian_product() applied to the per-dimension samples. Invalid
         * arguments are reported through THROW_INVALID_ARGUMENT_ERROR.
         */
        void DataSet::add_isotropic_data(std::vector<double> &bounds, unsigned int no_elems_in_one_dim,
                                         std::vector<double> (*output_func)(std::vector<double> &)) {
            // TODO add check of dataset dimensions

            if(no_elems_in_one_dim < 1) {
                THROW_INVALID_ARGUMENT_ERROR("Number of elements in one dimension has to be >=1 !");
            }

            /* bounds[i + 1] is read below, so the bounds have to come in complete pairs */
            if(bounds.empty() || bounds.size() % 2 != 0) {
                THROW_INVALID_ARGUMENT_ERROR("Bounds have to be given as a non-empty list of (lower, upper) pairs!");
            }

            std::vector<std::vector<double>> grid;
            std::vector<double> tmp;

            for (size_t i = 0; i < bounds.size(); i += 2) {
                const double lower = bounds[i];
                const double upper = bounds[i + 1];

                /* Step from lower towards upper; zero when only one sample is requested */
                double frac = 0;
                if (no_elems_in_one_dim > 1) {
                    frac = (upper - lower) / (no_elems_in_one_dim - 1);
                }

                /* Index-based sampling yields exactly no_elems_in_one_dim values and always terminates */
                tmp.clear();
                for (unsigned int j = 0; j < no_elems_in_one_dim; ++j) {
                    tmp.emplace_back(lower + frac * j);
                }

                grid.emplace_back(tmp);
            }

            grid = this->cartesian_product(&grid);

            for (auto& vec : grid) {
                this->n_elements++;
                this->data.emplace_back(std::make_pair(vec, output_func(vec)));
            }
        }
    
        /**
         * Gives access to the stored (input, output) pairs.
         *
         * @return Pointer to the internal data vector of this object
         */
        std::vector<std::pair<std::vector<double>, std::vector<double>>> *DataSet::get_data() {
            std::vector<std::pair<std::vector<double>, std::vector<double>>> *stored_pairs = &this->data;
            return stored_pairs;
        }
    
        /**
         * @return Value of the element counter of this data set
         */
        size_t DataSet::get_n_elements() {
            return this->n_elements;
        }
    
        /**
         * @return Input dimension stored in this data set
         */
        size_t DataSet::get_input_dim() {
            const size_t dimension = this->input_dim;
            return dimension;
        }
    
        /**
         * @return Output dimension stored in this data set
         */
        size_t DataSet::get_output_dim() {
            const size_t dimension = this->output_dim;
            return dimension;
        }
    
        /**
         * Prints all stored pairs to std::cout, one pair per line, in the form
         * "<inputs> -> <outputs>" with the values separated by spaces.
         * Nothing is printed when the element counter is zero.
         */
        void DataSet::print_data() {
            if (n_elements) {
                /* Iterate by reference, the pairs do not need to be copied for printing */
                for (const auto &p : this->data) {
                    /* INPUT */
                    for (auto v : std::get<0>(p)) {
                        std::cout << v << " ";
                    }

                    std::cout << "-> ";

                    /* OUTPUT */
                    for (auto v : std::get<1>(p)) {
                        std::cout << v << " ";
                    }

                    std::cout << std::endl;
                }
            }
        }
    
        /**
         * Serializes this object into a Boost text archive written to a file.
         *
         * @param file_path Path of the file to be written
         *
         * A failure to open the file is reported through THROW_RUNTIME_ERROR.
         */
        void DataSet::store_text(std::string file_path) {
            std::ofstream ofs(file_path);

            if(!ofs.is_open()) {
                THROW_RUNTIME_ERROR("File " + file_path + " couldn't be open!");
            } else {
                boost::archive::text_oarchive oa(ofs);
                oa << *this;
                ofs.close();
            }
        }
    
    
        void DataSet::store_data_text(std::ofstream* file_path) {
            for (auto e : this->data) {
                /* First part of the pair */
                for (unsigned int i = 0; i < e.first.size() - 1; i++) {
                    *file_path << e.first.at(i) << ",";
                }
                *file_path << e.first.back() << " ";
    
                /* Second part of the pair */
                for (unsigned int i = 0; i < e.second.size() - 1; i++) {
                    *file_path << e.second.at(i) << ",";
                }
                *file_path << e.second.back() << std::endl;
            }
        }
    
    
        /**
         * Writes the stored pairs into a text file in the format produced by
         * store_data_text(std::ofstream*).
         *
         * @param file_path Path of the file to be written
         *
         * A failure to open the file is reported through THROW_RUNTIME_ERROR.
         */
        void DataSet::store_data_text(std::string file_path) {
            std::ofstream ofs(file_path);

            if(!ofs.is_open()) {
                THROW_RUNTIME_ERROR("File " + file_path + " couldn't be open!");
            } else {
                /* Reuse the stream overload instead of duplicating the formatting loop */
                this->store_data_text(&ofs);
            }
        }
    
        template<class T>
        std::vector<std::vector<T>> DataSet::cartesian_product(const std::vector<std::vector<T>> *v) {
            std::vector<std::vector<double>> v_combined_old, v_combined, v_tmp;
            std::vector<double> tmp;
    
            for (const auto &e : v->at(0)) {
                tmp = {e};
                v_combined.emplace_back(tmp);
            }
    
            for (unsigned int i = 1; i < v->size(); i++) {  // Iterate through remaining vectors of 'v'
                v_combined_old = v_combined;
                v_combined.clear();
    
                for (const auto &e : v->at(i)) {
                    for (const auto &vec : v_combined_old) {
                        tmp = vec;
                        tmp.emplace_back(e);
    
                        /* Add only unique elements */
                        if (std::find(v_combined.begin(), v_combined.end(), tmp) == v_combined.end()) {
                            v_combined.emplace_back(tmp);
                        }
    
            if(!this->normalization_strategy) {
    
                THROW_INVALID_ARGUMENT_ERROR("There is no normalization strategy given for this data set, so it can not be "
                                             "normalized!");
    
            if(this->max_min_inp_val.empty()) {
                this->max_min_inp_val.emplace_back(this->data.at(0).first.at(0));
                this->max_min_inp_val.emplace_back(this->data.at(0).first.at(0));
            }
    
    
            double tmp, tmp2;
            for(auto pair : this->data) {
                /* Finding maximum */
                //TODO make more efficiently
                tmp = *std::max_element(pair.first.begin(), pair.first.end());
                tmp2 = *std::max_element(pair.second.begin(), pair.second.end());
    
                tmp = std::max(tmp, tmp2);
    
    
                /* Testing for a new maxima */
                if (tmp > this->max_min_inp_val.at(0)) {
                    this->max_min_inp_val.at(0) = tmp;
    
                }
    
                /* Finding minimum */
                tmp = *std::min_element(pair.first.begin(), pair.first.end());
                tmp2 = *std::min_element(pair.second.begin(), pair.second.end());
    
                tmp = std::min(tmp, tmp2);
    
    
                /* Testing for a new minima */
                if (tmp < this->max_min_inp_val.at(1)) {
                    this->max_min_inp_val.at(1) = tmp;
    
                }
            }
    
            /* Normalize every number in the data set */
            for(auto& pair : this->data) {
                for(auto& v : pair.first) {
    
                    v = this->normalization_strategy->normalize(v, this->max_min_inp_val.at(0), this->max_min_inp_val.at(1));
    
                    v = this->normalization_strategy->normalize(v, this->max_min_inp_val.at(0), this->max_min_inp_val.at(1));
    
        /**
         * Copies the input vector of the pair at position idx into d.
         *
         * @param d Destination vector; asserted to have the same size as the stored input
         * @param idx Index of the pair (accessed without a range check)
         */
        void DataSet::get_input(std::vector<double> &d, size_t idx){
            const std::vector<double> &stored = this->data[idx].first;
            assert(d.size() == stored.size());

            size_t pos = 0;
            while (pos < stored.size()) {
                d[pos] = stored[pos];
                ++pos;
            }
        }
    
        /**
         * Copies the output vector of the pair at position idx into d.
         *
         * @param d Destination vector; asserted to have the same size as the stored output
         * @param idx Index of the pair (accessed without a range check)
         */
        void DataSet::get_output(std::vector<double> &d, size_t idx){
            const std::vector<double> &stored = this->data[idx].second;
            assert(d.size() == stored.size());

            size_t pos = 0;
            while (pos < stored.size()) {
                d[pos] = stored[pos];
                ++pos;
            }
        }
    
    
        void DataSet::de_normalize() {
            std::vector<double> tmp_inp(this->data.at(0).first.size());
            std::vector<double> tmp_out(this->data.at(0).second.size());
    
            for(auto& pair: this->data) {
                for(size_t i=0; i < pair.first.size(); i++) {
                    tmp_inp.at(i) = this->normalization_strategy->de_normalize(pair.first.at(i));
                }
                pair.first = tmp_inp;
            }
    
            for(auto& pair: this->data) {
                for(size_t i=0; i < pair.second.size(); i++) {
                    tmp_out.at(i) = this->normalization_strategy->de_normalize(pair.second.at(i));
                }
                pair.second = tmp_out;
            }
    
    
            /* Remove found max and minimal values, because of is_normalized() method */
            this->max_min_inp_val.clear();
    
        /**
         * De-normalizes the values of d1 with the attached normalization strategy
         * and stores the results at the same positions of d2.
         *
         * @param d1 Source vector with the values to be de-normalized
         * @param d2 Destination vector; asserted to have the same size as d1
         */
        void DataSet::de_normalize_single(std::vector<double> &d1, std::vector<double> &d2){
            assert(d1.size() == d2.size());
            for (size_t j = 0; j < d1.size(); ++j) {
                d2[j] = this->normalization_strategy->de_normalize(d1[j]);
            }
        }
    
        /**
         * @return Pointer to the normalization strategy stored in this data set
         */
        NormalizationStrategy* DataSet::get_normalization_strategy() {
            NormalizationStrategy* strategy = this->normalization_strategy;
            return strategy;
        }
    
    
        /**
         * @return true when the stored maximum/minimum container is not empty,
         *         false otherwise
         */
        bool DataSet::is_normalized() {
            if (this->max_min_inp_val.empty()) {
                return false;
            }
            return true;
        }
    
    
        double DataSet::get_max_inp_val() {
    
    
        /**
         * Method returning random amount of data pairs between 1-max
         */
        std::vector<std::pair<std::vector<double>, std::vector<double>>> DataSet::get_random_data_batch(size_t max) {
    
    Martin Beseda's avatar
    Martin Beseda committed
            if (max <= 0) {
    
            } else {
                std::vector<std::pair<std::vector<double>, std::vector<double>>> newData;
    
    Martin Beseda's avatar
    Martin Beseda committed
                srand(time(NULL));  //TODO use Mersen twister from Boost
    
    Martin Beseda's avatar
    Martin Beseda committed
                size_t n_chosen = rand() % std::min(max, this->data.size())+1;
                n_chosen = max;
    
                std::vector<size_t> chosens;
                size_t chosen;
    
    
    Martin Beseda's avatar
    Martin Beseda committed
                for (int i = 0; i < n_chosen; i++) {
                    chosen = rand() % this->data.size();
    
                    auto it = std::find(chosens.begin(), chosens.end(), chosen);
    
    
                        i--;
                    } else {
                        newData.push_back(this->data.at(chosen));
    
    Michal Kravcenko's avatar
    Michal Kravcenko committed
                        chosens.push_back( chosen );