Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
harpa-mpiuncertainty
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Radim Vavřík
harpa-mpiuncertainty
Commits
ae66d7a4
Commit
ae66d7a4
authored
8 years ago
by
root
Browse files
Options
Downloads
Patches
Plain Diff
Added goal gap threshold for slaves
parent
65198ca1
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
include/MpiUncertainty_exc.h
+1
-1
1 addition, 1 deletion
include/MpiUncertainty_exc.h
src/MpiUncertainty_exc.cc
+53
-44
53 additions, 44 deletions
src/MpiUncertainty_exc.cc
with
54 additions
and
45 deletions
include/MpiUncertainty_exc.h
+
1
−
1
View file @
ae66d7a4
...
@@ -46,7 +46,7 @@ int m_rank = -1;
...
@@ -46,7 +46,7 @@ int m_rank = -1;
int
m_numProc
=
-
1
;
int
m_numProc
=
-
1
;
int
m_master
=
-
1
;
int
m_master
=
-
1
;
int
m_threadsNumberTotal
=
1
;
int
m_threadsNumberTotal
=
1
;
int
m_number_of_active_nodes
=
1
;
//1, 2, 4, 8, 16.
int
m_workDone
=
0
;
int
m_workDone
=
0
;
public
:
public
:
...
...
This diff is collapsed.
Click to expand it.
src/MpiUncertainty_exc.cc
+
53
−
44
View file @
ae66d7a4
...
@@ -234,7 +234,6 @@ RTLIB_ExitCode_t MpiUncertainty::onRun() {
...
@@ -234,7 +234,6 @@ RTLIB_ExitCode_t MpiUncertainty::onRun() {
{
{
// Set number of OMP threads to uncertainty, generate chunk of MC samples and execute simulations
// Set number of OMP threads to uncertainty, generate chunk of MC samples and execute simulations
m_uncertainty
->
RunMC
(
m_threadsNumber
);
m_uncertainty
->
RunMC
(
m_threadsNumber
);
}
}
if
(
m_rank
==
m_master
)
if
(
m_rank
==
m_master
)
...
@@ -315,6 +314,8 @@ RTLIB_ExitCode_t MpiUncertainty::onMonitor() {
...
@@ -315,6 +314,8 @@ RTLIB_ExitCode_t MpiUncertainty::onMonitor() {
100.0
*
(
float
)
m_jobsDone
/
(
float
)
m_jobsNumber
[
m_qosMode
],
100.0
*
(
float
)
m_jobsDone
/
(
float
)
m_jobsNumber
[
m_qosMode
],
Cycles
(),
m_threadsNumber
,
GetCPS
()
*
m_threadsNumber
);
Cycles
(),
m_threadsNumber
,
GetCPS
()
*
m_threadsNumber
);
int
goal_gap
=
0
;
std
::
vector
<
int
>
accelerate_flags
(
m_numProc
,
0
);
if
(
GetCPS
()
==
0.0
)
if
(
GetCPS
()
==
0.0
)
return
RTLIB_OK
;
return
RTLIB_OK
;
...
@@ -330,56 +331,64 @@ RTLIB_ExitCode_t MpiUncertainty::onMonitor() {
...
@@ -330,56 +331,64 @@ RTLIB_ExitCode_t MpiUncertainty::onMonitor() {
// defined by the QoS mode, the higher AWM is requested
// defined by the QoS mode, the higher AWM is requested
// int timeFrameMs = m_timeFrame[m_qosMode] * 1000;
// int timeFrameMs = m_timeFrame[m_qosMode] * 1000;
// int goalGap = (1 - (timeFrameMs / estimatedTotalTime)) * 100;
// int goalGap = (1 - (timeFrameMs / estimatedTotalTime)) * 100;
int
remaining_jobs
=
m_jobsNumber
[
m_qosMode
]
-
m_jobsDone
;
float
remaining_time
=
int
remaining_jobs
=
m_jobsNumber
[
m_qosMode
]
-
m_jobsDone
;
std
::
max
(
1.0
,
m_timeFrame
[
m_qosMode
]
-
(
m_timer
.
getElapsedTimeMs
()
/
1000.0
));
float
jobs_per_second
=
(
float
)
m_threadsNumber
*
GetCPS
();
float
jobs_per_second
=
(
float
)
m_threadsNumber
*
GetCPS
();
float
remaining_time
=
m_timeFrame
[
m_qosMode
]
-
(
m_timer
.
getElapsedTimeMs
()
/
1000.0
);
float
ideal_jobs_per_second
=
(
float
)
remaining_jobs
/
remaining_time
;
float
ideal_jobs_per_second
=
(
float
)
remaining_jobs
/
remaining_time
;
float
goal_gap
=
100.0
*
(
jobs_per_second
-
ideal_jobs_per_second
)
/
ideal_jobs_per_second
;
logger
->
Warn
(
"Time spend: %f"
,
m_timer
.
getElapsedTimeMs
()
/
1000.0
);
if
(
remaining_time
<=
0.0
)
{
goal_gap
=
-
1000
;
// Really low value to motivate BBQUE to give the highest amount of resources possible
}
else
{
goal_gap
=
100
*
(
jobs_per_second
-
ideal_jobs_per_second
)
/
ideal_jobs_per_second
;
}
logger
->
Warn
(
"Time spend: %f s"
,
m_timer
.
getElapsedTimeMs
()
/
1000.0
);
logger
->
Warn
(
"Remaining jobs: %d"
,
remaining_jobs
);
logger
->
Warn
(
"Remaining jobs: %d"
,
remaining_jobs
);
logger
->
Warn
(
"Remaining time: %f"
,
remaining_time
);
logger
->
Warn
(
"Remaining time: %f"
,
remaining_time
);
logger
->
Warn
(
"Current cps: %f"
,
jobs_per_second
);
logger
->
Warn
(
"Current cps: %f"
,
jobs_per_second
);
logger
->
Warn
(
"Ideal cps: %f"
,
ideal_jobs_per_second
);
logger
->
Warn
(
"Ideal cps: %f"
,
ideal_jobs_per_second
);
logger
->
Warn
(
"Goal Gap for Cycle %d is %.2f"
,
Cycles
(),
goal_gap
);
logger
->
Warn
(
"Goal Gap for Cycle %d is %d"
,
Cycles
(),
goal_gap
);
SetGoalGap
(
goal_gap
);
MPI_Bcast
(
&
goal_gap
,
1
,
MPI_INT
,
m_master
,
MPI_COMM_WORLD
);
//tonipat@201610101 send Goal gap to the slaves goal_gap/(#of slaves)
//Option A ==> Goal Gap is sent to all the slaves
// MPI_Bcast(&goal_gap, 1, MPI_INT, m_master, MPI_COMM_WORLD);//tonipat@201610101 send Goal gap to the slaves goal_gap/(#of slaves)
//std::cout << "FRAME " << m_timeFrame[m_qosMode] << std::endl;
//Option B
// if(estimatedTotalTime > timeFrameMs)
//{
int
goal_gap_threshold
=
20
;
//if it is higher or lower of a threshold, then we activate more nodes1->2->8->16.
//int goalGap = (1 - (estimatedTotalTime / timeRequired - 1)) * 100;
// logger->Warn(RANK("MpiUncertainty::onMonitor() : Requesting higher AWM, gap: %d, estimation: %f, limit: %d"),
// goalGap, estimatedTotalTime/1000.0, m_timeFrame[m_qosMode]);
if
(
goal_gap
>=
goal_gap_threshold
)
// SetGoalGap(goalGap);
--
m_number_of_active_nodes
;
// }
if
(
goal_gap
<=
-
goal_gap_threshold
)
// else if (estimatedTotalTime < 0.85 * timeFrameMs)
++
m_number_of_active_nodes
;
// {
// logger->Warn(RANK("MpiUncertainty::onMonitor() : Requesting lower AWM, gap: %d, estimation: %f, limit: %f"),
// goalGap, estimatedTotalTime/1000.0, m_timeFrame[m_qosMode]);
if
(
m_number_of_active_nodes
<=
0
)
m_number_of_active_nodes
=
0
;
// //SetGoalGap(0);
if
(
m_number_of_active_nodes
>
m_numProc
)
m_number_of_active_nodes
=
m_numProc
;
// }
logger
->
Info
(
RANK
(
"MpiUncertainty::onMonitor() : Active slaves: %d"
),
m_number_of_active_nodes
);
//else
//{
if
(
m_number_of_active_nodes
)
// logger->Info(RANK("MpiUncertainty::onMonitor() : Current AWM is OK, gap: %d, estimation: %f, limit: %d"),
goal_gap
=
goal_gap
/
m_number_of_active_nodes
;
// goalGap, estimatedTotalTime/1000.0, m_timeFrame[m_qosMode]);
//SetGoalGap(0);
SetGoalGap
(
goal_gap
);
// }
MPI_Bcast
(
&
goal_gap
,
1
,
MPI_INT
,
m_master
,
MPI_COMM_WORLD
);
// Exploit less threads if less jobs remain
std
::
fill_n
(
accelerate_flags
.
begin
(),
m_number_of_active_nodes
,
1
);
// Master rank should have accelerate always eq 1
// if ( m_jobsNumber[m_qosMode] - m_jobsDone < m_threadsNumber )
MPI_Scatter
(
accelerate_flags
.
data
(),
1
,
MPI_INT
,
&
m_accelerate
,
1
,
MPI_INT
,
m_master
,
MPI_COMM_WORLD
);
// m_threadsNumber = m_jobsNumber[m_qosMode] - m_jobsDone;
}
}
else
else
{
{
// TODO always ask for higher AWM (accelerator mode) ?
// Receive broadcasted goal gap
MPI_Bcast
(
&
goal_gap
,
1
,
MPI_INT
,
m_master
,
MPI_COMM_WORLD
);
// Receive goal gap from master rank
SetGoalGap
(
goal_gap
);
//int goalGap = 0;
// Receive accelerate flag
// MPI_Bcast(&goal_gap, 1, MPI_INT, m_master, MPI_COMM_WORLD);
MPI_Scatter
(
accelerate_flags
.
data
(),
1
,
MPI_INT
,
&
m_accelerate
,
1
,
MPI_INT
,
m_master
,
MPI_COMM_WORLD
);
//logger->Info(RANK("MpiUncertainty::onMonitor() : Received goal gap %d"),goal_gap);
logger
->
Info
(
RANK
(
"MpiUncertainty::onMonitor() : Received accelerate flag %d"
),
m_accelerate
);
//SetGoalGap(goal_gap);
}
}
return
RTLIB_OK
;
return
RTLIB_OK
;
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment