#define ARMA_ALLOW_FAKE_GCC

#include "../mpi_wrapper.h"

#include "LevenbergMarquardt.h"
#include "../message.h"
    
    struct lib4neuro::LevenbergMarquardt::LevenbergMarquardtImpl {
    /**
     * Threshold for the successful ending of the optimization - deviation from minima
     */
    double tolerance;

    double tolerance_gradient;

    double tolerance_parameters;

    double LM_step_acceptance_threshold;

    double lambda_initial;

    double lambda_increase;

    double lambda_decrease;
    /**
     * Maximal number of iterations - optimization will stop after that, even if not converged
     */
    size_t maximum_niters;

    /**
     * Size of the data subset (batch) used to assemble the system; 0 means the full data set
     */
    size_t batch_size;

    /**
     * Assembles an approximation of the Hessian matrix (via the Jacobian matrix of the
     * residual function, computed by the backpropagation algorithm) together with the
     * right-hand side of the resulting system of equations related to the data errors
     * and the approximating function
     * @param ef
     * @param H
     * @param rhs
     * @param data_subset_size
     */
    void get_jacobian_and_rhs(lib4neuro::ErrorFunction& ef,
                              arma::Mat<double>& H,
                              arma::Col<double>& rhs,
                              size_t data_subset_size);

    /**
     * Solves the linear system A * sol = rhs via the conjugate gradient method;
     * A is assumed to be symmetric positive definite
     */
    size_t solve_system_CG(const arma::Mat<double>& A,
                           const arma::Col<double>& rhs,
                           arma::Col<double>& sol);
};
    //*****************************************************************
    // Iterative template routine -- GMRES
    //
    // GMRES solves the unsymmetric linear system Ax = b using the
    // Generalized Minimum Residual method
    //
    // GMRES follows the algorithm described on p. 20 of the
    // SIAM Templates book.
    //
    // The return value indicates convergence within max_iter (input)
    // iterations (0), or no convergence within max_iter iterations (1).
    //
    // Upon successful return, output arguments have the following values:
    //
    //        x  --  approximate solution to Ax = b
    // max_iter  --  the number of iterations performed before the
    //               tolerance was reached
    //      tol  --  the residual after the final iteration
    //
    //*****************************************************************
    
    
    //template < class Matrix, class Vector >
    //void
    //Update(Vector &x, int k, Matrix &h, Vector &s, Vector v[])
    //{
    //  Vector y(s);
    //
    //  // Backsolve:
    //  for (int i = k; i >= 0; i--) {
    //    y(i) /= h(i,i);
    //    for (int j = i - 1; j >= 0; j--)
    //      y(j) -= h(j,i) * y(i);
    //  }
    //
    //  for (int j = 0; j <= k; j++)
    //    x += v[j] * y(j);
    //}
    //
    //
    //template < class Real >
    //Real
    //abs(Real x)
    //{
    //  return (x > 0 ? x : -x);
    //}
    //
    //
    //template < class Operator, class Vector, class Preconditioner,
    //           class Matrix, class Real >
    //int
    //GMRES(const Operator &A, Vector &x, const Vector &b,
    //      const Preconditioner &M, Matrix &H, int &m, int &max_iter,
    //      Real &tol)
    //{
    //  Real resid;
    //  int i, j = 1, k;
    //  Vector s(m+1), cs(m+1), sn(m+1), w;
    //
    //  Real normb = norm(M.solve(b));
    //  Vector r = M.solve(b - A * x);
    //  Real beta = norm(r);
    //
    //  if (normb == 0.0)
    //    normb = 1;
    //
    //  if ((resid = norm(r) / normb) <= tol) {
    //    tol = resid;
    //    max_iter = 0;
    //    return 0;
    //  }
    //
    //  Vector *v = new Vector[m+1];
    //
    //  while (j <= max_iter) {
    //    v[0] = r * (1.0 / beta);    // ??? r / beta
    //    s = 0.0;
    //    s(0) = beta;
    //
    //    for (i = 0; i < m && j <= max_iter; i++, j++) {
    //      w = M.solve(A * v[i]);
    //      for (k = 0; k <= i; k++) {
    //        H(k, i) = dot(w, v[k]);
    //        w -= H(k, i) * v[k];
    //      }
    //      H(i+1, i) = norm(w);
    //      v[i+1] = w * (1.0 / H(i+1, i)); // ??? w / H(i+1, i)
    //
    //      for (k = 0; k < i; k++)
    //        ApplyPlaneRotation(H(k,i), H(k+1,i), cs(k), sn(k));
    //
    //      GeneratePlaneRotation(H(i,i), H(i+1,i), cs(i), sn(i));
    //      ApplyPlaneRotation(H(i,i), H(i+1,i), cs(i), sn(i));
    //      ApplyPlaneRotation(s(i), s(i+1), cs(i), sn(i));
    //
    //      if ((resid = abs(s(i+1)) / normb) < tol) {
    //        Update(x, i, H, s, v);
    //        tol = resid;
    //        max_iter = j;
    //        delete [] v;
    //        return 0;
    //      }
    //    }
    //    Update(x, m - 1, H, s, v);
    //    r = M.solve(b - A * x);
    //    beta = norm(r);
    //    if ((resid = beta / normb) < tol) {
    //      tol = resid;
    //      max_iter = j;
    //      delete [] v;
    //      return 0;
    //    }
    //  }
    //
    //  tol = resid;
    //  delete [] v;
    //  return 1;
    //}
    //
    //
    //#include <math.h>
    //
    //
    //template<class Real>
    //void GeneratePlaneRotation(Real &dx, Real &dy, Real &cs, Real &sn)
    //{
    //  if (dy == 0.0) {
    //    cs = 1.0;
    //    sn = 0.0;
    //  } else if (abs(dy) > abs(dx)) {
    //    Real temp = dx / dy;
    //    sn = 1.0 / sqrt( 1.0 + temp*temp );
    //    cs = temp * sn;
    //  } else {
    //    Real temp = dy / dx;
    //    cs = 1.0 / sqrt( 1.0 + temp*temp );
    //    sn = temp * cs;
    //  }
    //}
    //
    //
    //template<class Real>
    //void ApplyPlaneRotation(Real &dx, Real &dy, Real &cs, Real &sn)
    //{
    //  Real temp  =  cs * dx + sn * dy;
    //  dy = -sn * dx + cs * dy;
    //  dx = temp;
    //}
    
    
size_t lib4neuro::LevenbergMarquardt::LevenbergMarquardtImpl::solve_system_CG(
    const arma::Mat<double>& A,
    const arma::Col<double>& rhs,
    arma::Col<double>& sol
) {
    const double zero = 1e-32;
    const double prec = 1e-6;  // note: currently unused
    const auto   n    = rhs.n_elem;
    size_t       iter = 1;

    if (sol.n_elem != n) {
        sol.resize(n);
    }

    /* Initial residual r = rhs - A * sol */
    arma::Col<double> r(n);
    r = rhs - A * sol;

    double norm_r = arma::dot(r, r);
    double norm_r_prev;
    if (norm_r < zero) {
        return 0;
    }

    /* Initial search direction */
    arma::Col<double> p(n);
    p = r;

    arma::Col<double> Ap(n);
    double alpha, beta;
    while (true) {
        ++iter;
        norm_r_prev = norm_r;

        /* Step length along the current search direction */
        Ap    = A * p;
        alpha = norm_r_prev / arma::dot(p, Ap);
        sol  += alpha * p;

        /* Update the residual and check for convergence */
        r -= alpha * Ap;
        norm_r = arma::dot(r, r);
        if (norm_r < zero || iter >= n) {
            break;
        }

        /* New search direction, A-conjugate to the previous ones */
        beta = norm_r / norm_r_prev;
        p    = r + beta * p;
    }

    return iter;
}
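/* Illustrative usage sketch (added comment, not from the original source). CG requires the
 * system matrix to be symmetric positive definite, which holds for H + lambda*I with
 * lambda > 0 whenever H = J^T * J:
 *
 *   arma::Mat<double> A;                               // n x n SPD matrix
 *   arma::Col<double> b;                               // right-hand side
 *   arma::Col<double> x(b.n_elem, arma::fill::zeros);  // initial guess
 *   size_t n_iters = solve_system_CG(A, b, x);         // on return, x ~ A^-1 * b
 */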
    
void lib4neuro::LevenbergMarquardt::LevenbergMarquardtImpl::get_jacobian_and_rhs(
    lib4neuro::ErrorFunction& ef,
    arma::Mat<double>& H,
    arma::Col<double>& rhs,
    size_t data_subset_size) {

    if (data_subset_size <= 0) {
        data_subset_size = ef.get_n_data_set();
    }

    if (data_subset_size < ef.get_n_data_set()) {
        ef.divide_data_train_test((double) data_subset_size / (double) ef.get_n_data_set());
    }

    size_t n_parameters = ef.get_dimension();
    if (H.n_rows != n_parameters || H.n_cols != n_parameters) {
        H.reshape(n_parameters, n_parameters);
    }
    if (rhs.n_elem != n_parameters) {
        rhs.resize(n_parameters);
    }
    H.fill(0.0);
    rhs.fill(0.0);

    /* Accumulate the contribution of every data entry to the Hessian approximation and the RHS */
    for (size_t entry_idx = 0; entry_idx < ef.get_n_data_set(); ++entry_idx) {
        ef.add_to_hessian_and_rhs_single(H, rhs, entry_idx);
    }

    if (data_subset_size < ef.get_n_data_set()) {
        ef.return_full_data_set_for_training();
    }
}
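/* Background note (added comment, not from the original source): for a least-squares error,
 * the matrix assembled here is the Gauss-Newton approximation of the Hessian, H ~= J^T * J,
 * and rhs ~= J^T * r (up to sign conventions), where J is the Jacobian of the residual
 * vector r. The damped system (H + lambda*I) * update = rhs is then solved in optimize(). */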
    
namespace lib4neuro {
    LevenbergMarquardt::LevenbergMarquardt(size_t max_iters,
                                           size_t bs,
                                           double tolerance,
                                           double tolerance_gradient,
                                           double tolerance_parameters,
                                           double LM_step_acceptance_threshold,
                                           double lambda_initial,
                                           double lambda_increase,
                                           double lambda_decrease) : p_impl(new LevenbergMarquardtImpl()) {

        this->p_impl->batch_size                   = bs;
        this->p_impl->tolerance                    = tolerance;
        this->p_impl->tolerance_gradient           = tolerance_gradient;
        this->p_impl->tolerance_parameters         = tolerance_parameters;
        this->p_impl->LM_step_acceptance_threshold = LM_step_acceptance_threshold;
        this->p_impl->lambda_initial               = lambda_initial;
        this->p_impl->lambda_increase              = lambda_increase;
        this->p_impl->lambda_decrease              = lambda_decrease;
        this->p_impl->maximum_niters               = max_iters;
    }
    void LevenbergMarquardt::optimize(lib4neuro::ErrorFunction& ef,
                                      std::ofstream* ofs) {
        optimize(ef,
                 LM_UPDATE_TYPE::MARQUARDT,
                 ofs);
    }

    void LevenbergMarquardt::optimize(lib4neuro::ErrorFunction& ef,
                                      lib4neuro::LM_UPDATE_TYPE update_type,
                                      std::ofstream* ofs) {
    
        double current_err = ef.eval();

        COUT_INFO(
            "Finding a solution via the Levenberg-Marquardt method... Starting error: " << current_err);

        if (ofs && ofs->is_open() && lib4neuro::mpi_rank == 0) {
            *ofs << "Finding a solution via the Levenberg-Marquardt method... Starting error: " << current_err;
        }
            size_t n_parameters  = ef.get_dimension();
    
            size_t n_data_points = ef.get_n_data_set();
    
        if (this->p_impl->batch_size > 0) {
            n_data_points = this->p_impl->batch_size;
        }
    
            std::vector<double>* params_current = new std::vector<double>(ef.get_parameters());
    
    
            std::shared_ptr<std::vector<double>> params_tmp;
            params_tmp.reset(new std::vector<double>(n_parameters));
    
        arma::Mat<double> H(n_parameters,
                            n_parameters);      // Hessian matrix
        arma::Mat<double> H_new(n_parameters,
                                n_parameters);  // Damped Hessian matrix

        double lambda   = this->p_impl->lambda_initial;  // Damping parameter
    
            double prev_err = 0, update_norm = 0, gradient_norm = 0, mem_double = 0, jacobian_norm = 1;
    
            bool                update_J = true;
            arma::Col<double>   update;
            arma::Col<double>   rhs;
            arma::Col<double>   d;
    
            //-------------------//
            // Solver iterations //
            //-------------------//
    
        size_t iter_counter   = 0;
        size_t update_counter = 0;

        double update_time     = 0.0;
        double update_sync_rhs = 0.0;
        double update_sync_H   = 0.0;
        double compute_time    = 0.0;
        double compute_sync    = 0.0;
        double total_time      = -MPI_Wtime();
        size_t ninner_iters    = 0;
    
    
        do {
            iter_counter++;

//          COUT_INFO("Iteration: " << iter_counter << " Current error: " << current_err << ", Current gradient norm: "
//                                  << gradient_norm << ", Direction norm: " << update_norm << "\r");

            if (update_J) {
                update_counter++;

                /* Reassemble the Hessian approximation and the right-hand side */
                update_time -= MPI_Wtime();
    
                this->p_impl->get_jacobian_and_rhs(ef,
                                                   H,
                                                   rhs,
                                                   n_data_points);
                update_time += MPI_Wtime();

                /* Sum the locally assembled right-hand sides across all MPI ranks */
                update_sync_rhs -= MPI_Wtime();
                MPI_Allreduce(
                    MPI_IN_PLACE,
                    rhs.begin(),
                    rhs.n_elem,
                    MPI_DOUBLE,
                    MPI_SUM,
                    lib4neuro::mpi_active_comm
                );
                update_sync_rhs += MPI_Wtime();

                /* Sum the locally assembled Hessian contributions across all MPI ranks */
                update_sync_H -= MPI_Wtime();
                MPI_Allreduce(
                    MPI_IN_PLACE,
                    H.begin(),
                    H.n_elem,
                    MPI_DOUBLE,
                    MPI_SUM,
                    lib4neuro::mpi_active_comm
                );
                update_sync_H += MPI_Wtime();
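                /* Note (added comment, not from the original source): each MPI rank assembles
                 * its share of H and rhs; the two in-place Allreduce sums above leave every rank
                 * with the complete system, so the dense solve below is replicated on each rank
                 * rather than distributed. */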
    
                // TODO parallelize with OpenMP?
                gradient_norm = 0;  /* reset the accumulator each time the system is rebuilt */
                for (size_t ci = 0; ci < n_parameters; ++ci) {
                    mem_double = rhs[ci];
                    mem_double *= mem_double;
                    gradient_norm += mem_double;
                }
                gradient_norm = std::sqrt(gradient_norm) / n_parameters;

                /* Evaluate the error before updating parameters */
                prev_err = ef.eval();
            }
    
            /* H_new = H + lambda*I */
            H_new = H;
            for (size_t i = 0; i < n_parameters; ++i) {
                // H_new.at( i, i ) += lambda * H.at( i, i );
                H_new.at(i, i) += lambda;
            }
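            /* Note (added comment, not from the original source): adding lambda to the diagonal
             * is Levenberg's damping, H + lambda*I; the commented-out line above instead scales
             * the diagonal of H, giving H + lambda*diag(H), which is Marquardt's variant. */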
    
            /* Solve the damped system H_new * update = rhs for the parameter update */
            // update = arma::solve(
            //     H_new,
            //     rhs,
            //     arma::solve_opts::flag_fast + arma::solve_opts::flag_allow_ugly + arma::solve_opts::flag_no_band
            // );
            compute_time -= MPI_Wtime();
            update = arma::solve(
                H_new,
                rhs
            );
//          ninner_iters += this->p_impl->solve_system_CG( H, rhs, update );
            compute_time += MPI_Wtime();

            // compute_sync -= MPI_Wtime( );
            // MPI_Allreduce( MPI_IN_PLACE, &update[0], update.size(), MPI_DOUBLE, MPI_SUM, lib4neuro::mpi_active_comm );
            // compute_sync += MPI_Wtime( );
    
    
            /* Compute the error after the update of parameters */
            update_norm = 0.0;
            for (size_t i = 0; i < n_parameters; i++) {
                params_tmp->at(i) = params_current->at(i) + update.at(i);
                update_norm += update.at(i) * update.at(i);
            }
            update_norm = std::sqrt(update_norm);
            current_err = ef.eval(params_tmp.get());
    
    
            /* Check if the parameter update improved the function */
            if (current_err < prev_err) {

                /* If the error is lowered after the parameter update, accept the new parameters
                 * and lower the damping factor lambda */

                //TODO rewrite without division!
                lambda /= this->p_impl->lambda_decrease;

                for (size_t i = 0; i < n_parameters; i++) {
                    params_current->at(i) = params_tmp->at(i);
                }

                prev_err = current_err;
                update_J = true;
    
    
                    /* If the convergence threshold is achieved, finish the computation */
                    if (current_err < this->p_impl->tolerance) {
                        break;
                    }
    
    
                } else {
                    /* If the error after parameters update is not lower, increase the damping factor lambda */
                    update_J = false;
                    lambda *= this->p_impl->lambda_increase;
                }
    
            // COUT_DEBUG("Iteration: " << iter_counter << " Current error: " << current_err << ", Current gradient norm: "
            //            << gradient_norm << ", Direction norm: " << update_norm << "\r");
    
                if (ofs && ofs->is_open() && lib4neuro::mpi_rank == 0) {
    
                    *ofs << "Iteration: " << iter_counter << " Current error: " << current_err << ", Current gradient norm: "
                         << gradient_norm << ", Direction norm: " << update_norm << std::endl;
                }
    
        } while (iter_counter < this->p_impl->maximum_niters);
        total_time += MPI_Wtime();
    
    
    
            COUT_DEBUG("Iteration: " << iter_counter << " Current error: " << current_err << ", Current gradient norm: "
                                     << gradient_norm << ", Direction norm: " << update_norm << std::endl);
    
            if (ofs && ofs->is_open() && lib4neuro::mpi_rank == 0) {
    
                *ofs << "Iteration: " << iter_counter << " Current error: " << current_err << ", Current gradient norm: "
                     << gradient_norm << ", Direction norm: " << update_norm << std::endl;
            }
    
            /* Store the optimized parameters */
    
            this->optimal_parameters = *params_current;
    
            if (params_current) {
    
                delete params_current;
                params_current = nullptr;
            }
    
    
            ef.set_parameters(this->optimal_parameters);
    
        // COUT_INFO("Finished in " << iter_counter << " iterations with error: " << current_err << ", gradient norm: " << gradient_norm << ", update norm: " << update_norm << std::endl);
        COUT_INFO("Finished in " << total_time << " [s], with " << update_counter << " updates of the Hessian, in " << iter_counter << " outer iterations and " << ninner_iters << " inner iterations");
        COUT_INFO(" error " << current_err);
        COUT_INFO(" update norm " << update_norm);
        COUT_INFO(" calculation of the Hessian: " << 100 * (update_time + update_sync_rhs + update_sync_H) / (update_time + update_sync_rhs + update_sync_H + compute_time + compute_sync) << " [%], time per one calculation " << (update_time + update_sync_rhs + update_sync_H) / update_counter << " [s]");
        COUT_INFO(" calculation of the direction: " << 100 * (compute_time + compute_sync) / (update_time + update_sync_rhs + update_sync_H + compute_time + compute_sync) << " [%], time per one iteration " << (compute_time + compute_sync) / iter_counter << " [s]");
    }

    LevenbergMarquardt::~LevenbergMarquardt() = default;
}
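
/* Illustrative usage sketch (added comment, not from the original source; the argument values
 * are placeholders, and `ef` stands for an ErrorFunction instance prepared elsewhere):
 *
 *   lib4neuro::LevenbergMarquardt lm(100,     // max_iters
 *                                    0,       // bs (0 = full data set)
 *                                    1e-6,    // tolerance
 *                                    1e-6,    // tolerance_gradient
 *                                    1e-6,    // tolerance_parameters
 *                                    1e-2,    // LM_step_acceptance_threshold
 *                                    1e-3,    // lambda_initial
 *                                    11.0,    // lambda_increase
 *                                    9.0);    // lambda_decrease
 *   lm.optimize(ef, nullptr);  // runs the damped Gauss-Newton iteration above
 */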