/**
 * Declaration of the gradient descent (back-propagation) learning method
 *
 * @author Michal Kravčenko
 * @date 30.7.18
 */

#ifndef INC_4NEURO_GRADIENTDESCENT_H
#define INC_4NEURO_GRADIENTDESCENT_H

#include "../settings.h"
#include "../constants.h"
#include "LearningMethod.h"
#include "../ErrorFunction/ErrorFunctions.h"

namespace lib4neuro {

    /**
     * Gradient descent optimization method (i.e. back-propagation)
     */
    class GradientDescent : public GradientLearningMethod {

    private:

        /**
         * Threshold for the successful ending of the optimization - deviation from minima
         */
        double tolerance;

        /**
         * Number of iterations after which the step size is reset to tolerance/10.0
         */
        size_t restart_frequency;

        /**
         * Size of the batch used to evaluate the gradient (0: the whole data set is used)
         */
        size_t batch;

        /**
         * Maximal number of iterations - the optimization stops afterwards, even if not converged
         */
        long long int maximum_niters;

        /**
         * Vector of minima coordinates
         */
        std::vector<double> optimal_parameters;

        /**
         * Adaptive calculation of the step size based on several historical characteristics.
         * ----------------------------------------------------------------------------------
         * If the current error @fi is larger than the error in the previous step @fim, the
         * rate of step-size change decreases (the algorithm is moving in this direction too
         * quickly); otherwise the rate of step-size change increases (the algorithm is on
         * the right path, so we can attempt to push through more rapidly).
         * ----------------------------------------------------------------------------------
         * The step size is then calculated as: @c^(1-2@beta) * (@grad_norm_prev/@grad_norm)^(1/@c)
         * If the previous gradient norm is lower than the current gradient norm, the step size
         * decreases (as we probably took too large a step); otherwise it increases (as we are
         * likely on the right track and can try to speed up the convergence).
         *
         * @param gamma[in, out] step size used in the last iteration
         * @param beta[in] number in the interval [0, 1] measuring the direction change between the two last steps; 0: no change, 1: opposite directions
         * @param c[in, out] number greater than zero measuring the non-linear step-size change; the higher @c is, the more rapidly the step size increases/decreases
         * @param grad_norm_prev[in] gradient norm of the error in the previous iteration
         * @param grad_norm[in] gradient norm of the error in the current iteration
         * @param fi[in] value of the error in the current iteration
         * @param fim[in] value of the error in the previous iteration
         */
        virtual void eval_step_size_mk(double &gamma,
                                       double beta,
                                       double &c,
                                       double grad_norm_prev,
                                       double grad_norm,
                                       double fi,
                                       double fim);
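        /*
         * Illustrative sketch of the update rule documented above (not part of
         * the class interface; assumes <cmath>). The new step size would be
         * computed roughly as
         *
         *   gamma = std::pow(c, 1.0 - 2.0 * beta)
         *         * std::pow(grad_norm_prev / grad_norm, 1.0 / c);
         *
         * so that for c > 1 a large direction change (beta > 0.5) shrinks the
         * step, while a falling gradient norm (grad_norm < grad_norm_prev)
         * enlarges it.
         */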
        /**
         * Analyzes the direction of the parameter change and performs the most feasible step in one parameter
         * @param ef[in] error function to be optimized
         * @param error_previous[in] value of the error function in the @parameters_before state
         * @param step_coefficient[in] scalar value scaling the step in one direction
         * @param direction[in] direction vector to be analyzed
         * @param parameters_before[in] state of the parameter space before the analysis
         * @param parameters_after[out] suggested state of the parameters after the analysis completes
         */
        virtual bool perform_feasible_1D_step(lib4neuro::ErrorFunction &ef,
                                              double error_previous,
                                              double step_coefficient,
                                              std::vector<double> *direction,
                                              std::vector<double> *parameters_before,
                                              std::vector<double> *parameters_after);

    public:

        /**
         * Creates an instance of the Gradient Descent optimizer (i.e. back-propagation)
         * @param epsilon Threshold for the successful ending of the optimization - deviation from minima
         * @param n_to_restart Number of iterations after which the step size is reset to tolerance/10.0
         * @param max_iters Maximal number of iterations - the optimization stops afterwards, even if not converged
         * @param batch Size of the batch used to evaluate the gradient (0: the whole data set is used)
         */
        LIB4NEURO_API explicit GradientDescent(double epsilon = 1e-3,
                                               size_t n_to_restart = 100,
                                               int max_iters = 1000,
                                               size_t batch = 0);

        /**
         * Deallocates the instance
         */
        LIB4NEURO_API ~GradientDescent();

        /**
         * Minimizes the given error function via gradient descent
         * @param ef error function to be optimized
         * @param ofs optional output stream for logging
         */
        LIB4NEURO_API void optimize(lib4neuro::ErrorFunction &ef,
                                    std::ofstream *ofs = nullptr) override;

        /**
         * Returns the parameters found by the last optimization run
         * @return vector of the optimal parameters
         */
        LIB4NEURO_API std::shared_ptr<std::vector<double>> get_parameters() override;
    };

}

#endif //INC_4NEURO_GRADIENTDESCENT_H
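/*
 * Usage sketch (illustrative only; assumes an ErrorFunction instance `mse`
 * constructed elsewhere, e.g. via the classes declared in ErrorFunctions.h):
 *
 *   lib4neuro::GradientDescent gd(1e-4, 200, 10000, 32);
 *   gd.optimize(mse);
 *   std::shared_ptr<std::vector<double>> params = gd.get_parameters();
 */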