Added goal gap threshold for slaves

ae66d7a4 · root · 65198ca1 · ae66d7a4 · ae66d7a4
Commit ae66d7a4 authored 8 years ago by root
--- a/include/MpiUncertainty_exc.h
+++ b/include/MpiUncertainty_exc.h
@@ -46,7 +46,7 @@ int m_rank = -1;
 int m_numProc = -1;
 int m_master = -1;
 int m_threadsNumberTotal = 1;
+int m_number_of_active_nodes = 1; //1, 2, 4, 8, 16.
 int m_workDone = 0;
 public:

--- a/src/MpiUncertainty_exc.cc
+++ b/src/MpiUncertainty_exc.cc
@@ -234,7 +234,6 @@ RTLIB_ExitCode_t MpiUncertainty::onRun() {
 	{
 		// Set number of OMP threads to uncertainty, generate chunk of MC samples and execute simulations
 		m_uncertainty->RunMC(m_threadsNumber);
 	}
 	if(m_rank == m_master)
@@ -315,6 +314,8 @@ RTLIB_ExitCode_t MpiUncertainty::onMonitor() {
 				100.0 * (float)m_jobsDone / (float)m_jobsNumber[m_qosMode],
 			  Cycles(), m_threadsNumber, GetCPS() * m_threadsNumber);
+	int goal_gap = 0;
+	std::vector<int> accelerate_flags(m_numProc, 0);
 	if (GetCPS() == 0.0)
 		return RTLIB_OK;
@@ -330,56 +331,64 @@ RTLIB_ExitCode_t MpiUncertainty::onMonitor() {
 		// defined by the QoS mode, the higher AWM is requested
 	  //	int timeFrameMs = m_timeFrame[m_qosMode] * 1000; 
 	  //	int goalGap = (1 - (timeFrameMs / estimatedTotalTime)) * 100;
-	  int   remaining_jobs   = m_jobsNumber[m_qosMode] - m_jobsDone;
-	  float remaining_time   =
+	  int remaining_jobs = m_jobsNumber[m_qosMode] - m_jobsDone;
-	    std::max(1.0, m_timeFrame[m_qosMode] - (m_timer.getElapsedTimeMs() / 1000.0));
+	  float jobs_per_second = (float) m_threadsNumber * GetCPS();
-	  float jobs_per_second = (float)m_threadsNumber * GetCPS();
+	  float remaining_time = m_timeFrame[m_qosMode] - (m_timer.getElapsedTimeMs() / 1000.0);
-	  float ideal_jobs_per_second = (float)remaining_jobs / remaining_time;
+	  float ideal_jobs_per_second = (float) remaining_jobs / remaining_time;
-	  float goal_gap = 100.0 * (jobs_per_second - ideal_jobs_per_second) / ideal_jobs_per_second;
-	  logger->Warn("Time spend: %f", m_timer.getElapsedTimeMs() / 1000.0);
+          if(remaining_time <= 0.0)
+	  {
+          	goal_gap = -1000; // Really low value to motivate BBQUE to give the highest amount of resources possible	
+          }
+	  else
+	  {
+      	  	goal_gap = 100 * (jobs_per_second - ideal_jobs_per_second) / ideal_jobs_per_second;
+	  }
+	  logger->Warn("Time spend: %f s", m_timer.getElapsedTimeMs() / 1000.0);
 	  logger->Warn("Remaining jobs: %d", remaining_jobs);
 	  logger->Warn("Remaining time: %f", remaining_time);
 	  logger->Warn("Current cps: %f", jobs_per_second);
 	  logger->Warn("Ideal cps: %f", ideal_jobs_per_second);
-	  logger->Warn("Goal Gap for Cycle %d is %.2f", Cycles(), goal_gap);
+	  logger->Warn("Goal Gap for Cycle %d is %d", Cycles(), goal_gap);
-	  SetGoalGap(goal_gap);
-	  MPI_Bcast(&goal_gap, 1, MPI_INT, m_master, MPI_COMM_WORLD);//tonipat@201610101 send Goal gap to the slaves goal_gap/(#of slaves) 
+          //Option A ==> Goal Gap is sent to all the slaves
+	  // MPI_Bcast(&goal_gap, 1, MPI_INT, m_master, MPI_COMM_WORLD);//tonipat@201610101 send Goal gap to the slaves goal_gap/(#of slaves) 
-		//std::cout << "FRAME " << m_timeFrame[m_qosMode] << std::endl;
+          //Option B
-		//	if(estimatedTotalTime > timeFrameMs)
-		//{
+	  int goal_gap_threshold = 20; //if it is higher or lower of a threshold, then we activate more nodes1->2->8->16.
-			//int goalGap = (1 - (estimatedTotalTime / timeRequired - 1)) * 100;
-		//	logger->Warn(RANK("MpiUncertainty::onMonitor()  : Requesting higher AWM, gap: %d, estimation: %f, limit: %d"), 
-		//		goalGap, estimatedTotalTime/1000.0, m_timeFrame[m_qosMode]);
+		if(goal_gap >= goal_gap_threshold)
-		//	SetGoalGap(goalGap);
+			--m_number_of_active_nodes;
-		//	}
+		if(goal_gap <= -goal_gap_threshold)
-		// else if (estimatedTotalTime < 0.85 * timeFrameMs)
+			++m_number_of_active_nodes;
-		// {
-		// 	logger->Warn(RANK("MpiUncertainty::onMonitor()  : Requesting lower AWM, gap: %d, estimation: %f, limit: %f"),
-		// 	 goalGap, estimatedTotalTime/1000.0, m_timeFrame[m_qosMode]);
+		if(m_number_of_active_nodes <= 0) m_number_of_active_nodes = 0;
-		// 	//SetGoalGap(0);
+		if(m_number_of_active_nodes > m_numProc) m_number_of_active_nodes = m_numProc;	
-		// }
+	  	logger->Info(RANK("MpiUncertainty::onMonitor() : Active slaves: %d"), m_number_of_active_nodes);
-		//else
-		//{
+		if(m_number_of_active_nodes)
-		//	logger->Info(RANK("MpiUncertainty::onMonitor()  : Current AWM is OK, gap: %d, estimation: %f, limit: %d"),
+			goal_gap = goal_gap / m_number_of_active_nodes;
-		//	 goalGap, estimatedTotalTime/1000.0, m_timeFrame[m_qosMode]);
-			//SetGoalGap(0);
+	  	SetGoalGap(goal_gap);
-		//	}
+		MPI_Bcast(&goal_gap, 1, MPI_INT, m_master, MPI_COMM_WORLD);	  
-		// Exploit less threads if less jobs remain
+		std::fill_n(accelerate_flags.begin(), m_number_of_active_nodes, 1); // Master rank should have accelerate always eq 1 
-		// if ( m_jobsNumber[m_qosMode] - m_jobsDone < m_threadsNumber )
+		MPI_Scatter(accelerate_flags.data(), 1, MPI_INT, &m_accelerate, 1, MPI_INT, m_master, MPI_COMM_WORLD);  		
-		// 	m_threadsNumber = m_jobsNumber[m_qosMode] - m_jobsDone;
 	}
 	else
 	{
-		// TODO always ask for higher AWM (accelerator mode) ?
+		// Receive broadcasted goal gap
+	 	MPI_Bcast(&goal_gap, 1, MPI_INT, m_master, MPI_COMM_WORLD);	  
-		// Receive goal gap from master rank
+		SetGoalGap(goal_gap);
-		//int goalGap = 0;
+		// Receive accelerate flag	
-		//	MPI_Bcast(&goal_gap, 1, MPI_INT, m_master, MPI_COMM_WORLD);
+		MPI_Scatter(accelerate_flags.data(), 1, MPI_INT, &m_accelerate, 1, MPI_INT, m_master, MPI_COMM_WORLD);  		
-		//logger->Info(RANK("MpiUncertainty::onMonitor() : Received goal gap %d"),goal_gap);	
+	  	logger->Info(RANK("MpiUncertainty::onMonitor() : Received accelerate flag %d"), m_accelerate);
-		//SetGoalGap(goal_gap);			
 	}
 	return RTLIB_OK;