//
// Created by martin on 14.11.18.
//
#include <string>
#include <fstream>
#include <sstream>
#include <filesystem>
#include <algorithm>
#include <boost/lexical_cast.hpp>
#include <boost/algorithm/string/erase.hpp>
#include "../exceptions.h"
namespace lib4neuro {
/**
 * Creates a reader bound to a CSV file. Nothing is read from the file here;
 * the constructor only validates the path and prepares an empty data table.
 *
 * @param file_path Path to the CSV file; it has to exist at construction time
 * @param delimiter String separating the individual fields on one line
 * @param ignore_first_line Stored both as `ignore_first_line` and as `header_included`
 *
 * Raises an error through THROW_RUNTIME_ERROR when `file_path` does not exist
 * (presumably a std::runtime_error - the macro is defined in exceptions.h).
 */
CSVReader::CSVReader(std::string file_path, std::string delimiter, bool ignore_first_line) {
    /* Fail early - the members are only initialised for an existing file */
    if(!std::filesystem::exists(file_path)) {
        THROW_RUNTIME_ERROR("The specified file path in CSVReader does not exist!");
    }

    this->file_path = file_path;
    this->delimiter = delimiter;
    this->ignore_first_line = ignore_first_line;
    this->header_included = ignore_first_line;

    /* Raw owning pointer, as required by the member's declared type */
    // NOTE(review): the matching delete is not visible in this file section - confirm the destructor releases it
    this->data = new std::vector<std::vector<std::string>>;
}
namespace {
    /**
     * Splits one line of text into fields at every occurrence of `delimiter`.
     *
     * Consecutive delimiters produce empty fields and a trailing delimiter produces
     * a trailing empty field. An empty delimiter yields the whole line as a single field.
     *
     * @param line Text of one line, without the line terminator
     * @param delimiter Separator between fields; may be longer than one character
     * @return Fields of the line in their original order (always at least one element)
     */
    std::vector<std::string> split_csv_line(const std::string& line, const std::string& delimiter) {
        std::vector<std::string> fields;

        /* An empty delimiter matches at every position, so the search below would never advance */
        if(delimiter.empty()) {
            fields.emplace_back(line);
            return fields;
        }

        size_t last = 0;
        size_t next = 0;
        while((next = line.find(delimiter, last)) != std::string::npos) {
            fields.emplace_back(line.substr(last, next - last));

            /* Skip the whole delimiter, not just its first character */
            last = next + delimiter.size();
        }

        /* Remainder after the last delimiter (or the whole line, if there was none) */
        fields.emplace_back(line.substr(last));

        return fields;
    }
}

/**
 * Reads the file at `file_path` line by line and appends one row of string fields
 * per non-empty line to `data`.
 *
 * The first line is discarded when `ignore_first_line` is set. Rows already stored
 * in `data` are kept, so repeated calls append the file contents again.
 * A file that cannot be opened yields no rows.
 */
void CSVReader::read() {
    std::ifstream ifs(this->file_path);
    std::string line;

    /* Consume the first line without storing it */
    if(this->ignore_first_line) {
        std::getline(ifs, line);
    }

    /* Read single line from the file */
    while(std::getline(ifs, line)) {
        /* Drop the carriage return left over from CRLF line endings */
        if(!line.empty() && line.back() == '\r') {
            line.pop_back();
        }

        /* Ignore empty line */
        if(line.empty()) {
            continue;
        }

        /* Separate elements of the line according to the delimiter and store them;
         * the temporary row is moved into the table, not copied */
        this->data->push_back(split_csv_line(line, this->delimiter));
    }

    /* The stream is closed by its destructor */
}
std::vector<std::vector<std::string>>* CSVReader::get_data() {
return this->data;
}
/**
 * Prints the stored table to standard output: one row per output line,
 * every field followed by a single space.
 */
void CSVReader::print_data() {
    /* Iterate by const reference - printing must not deep-copy rows or fields */
    for(const auto& line : *this->data) {
        for(const auto& e : line) {
            std::cout << e << " ";
        }

        /* std::endl kept on purpose so the output is still flushed after every row */
        std::cout << std::endl;
    }
}
DataSet CSVReader::get_data_set(std::vector<unsigned int>* input_col_indices,
std::vector<unsigned int>* output_col_indices) {
std::vector<std::pair<std::vector<double>, std::vector<double>>> data_set_contents;
for(auto line : *this->data) {
//TODO check empty values in data
std::vector<double> input;
for(auto ind : *input_col_indices) {
std::string s;
/* Remove remaining spaces */
s = line.at(ind);
boost::algorithm::erase_all(s, " ");
/* Strip BOM */
// TODO solve in another way - work properly with different encodings!
boost::algorithm::erase_all(s, "\uEFBBBF"); // UTF-8
boost::algorithm::erase_all(s, "\uFEFF"); // UTF-16
/* Check, if the string is a number */
double tmp = boost::lexical_cast<double>(s);
/* Add loaded number to the vector of inputs */
input.push_back(tmp);
} catch(const std::out_of_range& e) {
THROW_OUT_OF_RANGE_ERROR("Non-existing index specified (" + std::to_string(ind) + ")!");
} catch (const boost::bad_lexical_cast& e) {
THROW_RUNTIME_ERROR("Value \"" + s + "\" is not numerical and so it cannot be used in Data Set!");
}
std::vector<double> output;
for(auto ind : *output_col_indices) {
output.emplace_back(std::stod(line.at(ind)));
}
data_set_contents.emplace_back(std::make_pair(input, output));