// Review notes for the patch below. The text that follows is a git unified
// diff (its line breaks appear to have been collapsed), not a compilable
// C++ translation unit; these notes sit before the first "diff --git"
// header and the patch text itself is left untouched.
//
// What the patch changes, as visible in its hunks:
//   * include/MpiUncertainty_exc.h: removes a blank line and adds the member
//     m_number_of_active_nodes, initialised to 1.
//   * src/MpiUncertainty_exc.cc, MpiUncertainty::onRun(): removes one blank
//     line after the RunMC() call.
//   * src/MpiUncertainty_exc.cc, MpiUncertainty::onMonitor():
//       - goal_gap becomes an int declared near the top of the function,
//         next to accelerate_flags, a vector of m_numProc zeros.
//       - In the branch before the `else` (presumably the master rank's
//         branch; its `if` is outside the visible hunks): remaining_time is
//         no longer clamped to >= 1.0. When remaining_time <= 0 the goal gap
//         is forced to -1000; otherwise it is the percentage difference
//         between the current and the ideal jobs-per-second rate.
//       - m_number_of_active_nodes is decremented when goal_gap >= 20 and
//         incremented when goal_gap <= -20, then clamped to [0, m_numProc];
//         when it is non-zero, goal_gap is divided by it.
//       - The resulting goal_gap is passed to SetGoalGap() and MPI_Bcast();
//         the first m_number_of_active_nodes entries of accelerate_flags are
//         set to 1 and the vector is distributed with MPI_Scatter() into
//         m_accelerate.
//       - The `else` branch now takes part in the same MPI_Bcast() and
//         MPI_Scatter() calls, calls SetGoalGap() with the received value and
//         logs the received accelerate flag.
//
// NOTE(review) - points to confirm before merging:
//   1. ideal_jobs_per_second is computed (and later logged) before the
//      remaining_time <= 0 check, so it divides by zero or by a negative
//      value once the time frame has elapsed.
//   2. If remaining_jobs is 0 while time remains, ideal_jobs_per_second is 0
//      and the goal_gap expression divides by zero; converting a non-finite
//      float to int is undefined behaviour.
//   3. The lower clamp lets m_number_of_active_nodes reach 0, in which case
//      fill_n writes no 1s, while the adjacent comment says the master rank
//      should always have accelerate equal to 1 - a lower bound of 1 may
//      have been intended.
//   4. The `GetCPS() == 0.0` early return sits before the new collective
//      calls; if ranks can disagree on that condition the MPI_Bcast /
//      MPI_Scatter calls would be unmatched - verify against the full
//      function.
//   5. m_accelerate is not declared in the visible header hunk; presumably
//      it already exists in the class - confirm.
diff --git a/include/MpiUncertainty_exc.h b/include/MpiUncertainty_exc.h index 8425751eefe2505b6f358c163613d6a2bc6cd1f3..bda5c0aef9f5bd231f9dbe2798771c8282a21bd0 100644 --- a/include/MpiUncertainty_exc.h +++ b/include/MpiUncertainty_exc.h @@ -46,7 +46,7 @@ int m_rank = -1; int m_numProc = -1; int m_master = -1; int m_threadsNumberTotal = 1; - +int m_number_of_active_nodes = 1; //1, 2, 4, 8, 16. int m_workDone = 0; public: diff --git a/src/MpiUncertainty_exc.cc b/src/MpiUncertainty_exc.cc index 92ad5d58b587d67b53484f9eae9b8137a1f1c55b..f320c231e1de7b0d54636a2dfe81dbef2b1d6d49 100644 --- a/src/MpiUncertainty_exc.cc +++ b/src/MpiUncertainty_exc.cc @@ -234,7 +234,6 @@ RTLIB_ExitCode_t MpiUncertainty::onRun() { { // Set number of OMP threads to uncertainty, generate chunk of MC samples and execute simulations m_uncertainty->RunMC(m_threadsNumber); - } if(m_rank == m_master) @@ -315,6 +314,8 @@ RTLIB_ExitCode_t MpiUncertainty::onMonitor() { 100.0 * (float)m_jobsDone / (float)m_jobsNumber[m_qosMode], Cycles(), m_threadsNumber, GetCPS() * m_threadsNumber); + int goal_gap = 0; + std::vector<int> accelerate_flags(m_numProc, 0); if (GetCPS() == 0.0) return RTLIB_OK; @@ -330,56 +331,64 @@ RTLIB_ExitCode_t MpiUncertainty::onMonitor() { // defined by the QoS mode, the higher AWM is requested // int timeFrameMs = m_timeFrame[m_qosMode] * 1000; // int goalGap = (1 - (timeFrameMs / estimatedTotalTime)) * 100; - int remaining_jobs = m_jobsNumber[m_qosMode] - m_jobsDone; - float remaining_time = - std::max(1.0, m_timeFrame[m_qosMode] - (m_timer.getElapsedTimeMs() / 1000.0)); - float jobs_per_second = (float)m_threadsNumber * GetCPS(); - float ideal_jobs_per_second = (float)remaining_jobs / remaining_time; - float goal_gap = 100.0 * (jobs_per_second - ideal_jobs_per_second) / ideal_jobs_per_second; - logger->Warn("Time spend: %f", m_timer.getElapsedTimeMs() / 1000.0); + + int remaining_jobs = m_jobsNumber[m_qosMode] - m_jobsDone; + float jobs_per_second = (float) m_threadsNumber * 
GetCPS(); + float remaining_time = m_timeFrame[m_qosMode] - (m_timer.getElapsedTimeMs() / 1000.0); + float ideal_jobs_per_second = (float) remaining_jobs / remaining_time; + + if(remaining_time <= 0.0) + { + + goal_gap = -1000; // Really low value to motivate BBQUE to give the highest amount of resources possible + } + else + { + + goal_gap = 100 * (jobs_per_second - ideal_jobs_per_second) / ideal_jobs_per_second; + } + + logger->Warn("Time spend: %f s", m_timer.getElapsedTimeMs() / 1000.0); logger->Warn("Remaining jobs: %d", remaining_jobs); logger->Warn("Remaining time: %f", remaining_time); logger->Warn("Current cps: %f", jobs_per_second); logger->Warn("Ideal cps: %f", ideal_jobs_per_second); - logger->Warn("Goal Gap for Cycle %d is %.2f", Cycles(), goal_gap); - SetGoalGap(goal_gap); - MPI_Bcast(&goal_gap, 1, MPI_INT, m_master, MPI_COMM_WORLD);//tonipat@201610101 send Goal gap to the slaves goal_gap/(#of slaves) - - //std::cout << "FRAME " << m_timeFrame[m_qosMode] << std::endl; - // if(estimatedTotalTime > timeFrameMs) - //{ - //int goalGap = (1 - (estimatedTotalTime / timeRequired - 1)) * 100; - // logger->Warn(RANK("MpiUncertainty::onMonitor() : Requesting higher AWM, gap: %d, estimation: %f, limit: %d"), - // goalGap, estimatedTotalTime/1000.0, m_timeFrame[m_qosMode]); - // SetGoalGap(goalGap); - // } - // else if (estimatedTotalTime < 0.85 * timeFrameMs) - // { - // logger->Warn(RANK("MpiUncertainty::onMonitor() : Requesting lower AWM, gap: %d, estimation: %f, limit: %f"), - // goalGap, estimatedTotalTime/1000.0, m_timeFrame[m_qosMode]); - // //SetGoalGap(0); - // } - //else - //{ - // logger->Info(RANK("MpiUncertainty::onMonitor() : Current AWM is OK, gap: %d, estimation: %f, limit: %d"), - // goalGap, estimatedTotalTime/1000.0, m_timeFrame[m_qosMode]); - //SetGoalGap(0); - // } - - // Exploit less threads if less jobs remain - // if ( m_jobsNumber[m_qosMode] - m_jobsDone < m_threadsNumber ) - // m_threadsNumber = m_jobsNumber[m_qosMode] - m_jobsDone; - + 
logger->Warn("Goal Gap for Cycle %d is %d", Cycles(), goal_gap); + + //Option A ==> Goal Gap is sent to all the slaves + // MPI_Bcast(&goal_gap, 1, MPI_INT, m_master, MPI_COMM_WORLD);//tonipat@201610101 send Goal gap to the slaves goal_gap/(#of slaves) + //Option B + + int goal_gap_threshold = 20; //if it is higher or lower of a threshold, then we activate more nodes1->2->8->16. + + + if(goal_gap >= goal_gap_threshold) + --m_number_of_active_nodes; + if(goal_gap <= -goal_gap_threshold) + ++m_number_of_active_nodes; + + + if(m_number_of_active_nodes <= 0) m_number_of_active_nodes = 0; + if(m_number_of_active_nodes > m_numProc) m_number_of_active_nodes = m_numProc; + logger->Info(RANK("MpiUncertainty::onMonitor() : Active slaves: %d"), m_number_of_active_nodes); + + if(m_number_of_active_nodes) + goal_gap = goal_gap / m_number_of_active_nodes; + + SetGoalGap(goal_gap); + MPI_Bcast(&goal_gap, 1, MPI_INT, m_master, MPI_COMM_WORLD); + + std::fill_n(accelerate_flags.begin(), m_number_of_active_nodes, 1); // Master rank should have accelerate always eq 1 + MPI_Scatter(accelerate_flags.data(), 1, MPI_INT, &m_accelerate, 1, MPI_INT, m_master, MPI_COMM_WORLD); } else { - // TODO always ask for higher AWM (accelerator mode) ? - - // Receive goal gap from master rank - //int goalGap = 0; - // MPI_Bcast(&goal_gap, 1, MPI_INT, m_master, MPI_COMM_WORLD); - //logger->Info(RANK("MpiUncertainty::onMonitor() : Received goal gap %d"),goal_gap); - //SetGoalGap(goal_gap); + // Receive broadcasted goal gap + MPI_Bcast(&goal_gap, 1, MPI_INT, m_master, MPI_COMM_WORLD); + SetGoalGap(goal_gap); + // Receive accelerate flag + MPI_Scatter(accelerate_flags.data(), 1, MPI_INT, &m_accelerate, 1, MPI_INT, m_master, MPI_COMM_WORLD); + logger->Info(RANK("MpiUncertainty::onMonitor() : Received accelerate flag %d"), m_accelerate); } return RTLIB_OK;