Commit 7509f496 authored by iff's avatar iff
Browse files

opt-pilot: expose group id and use in simulation as unique work group identifier

parent 3ca77f74
......@@ -8,8 +8,6 @@
#include "Util/CmdArguments.h"
#include "Util/OptPilotException.h"
//TODO: what is the performance difference between using MPI_COMM_WORLD and
// p2p communication vs. communicator groups??
/**
* \brief Role assignment according to strategy (that might use hardware
* network information).
......@@ -89,9 +87,10 @@ public:
bundle.island_id = Strategy_t::group_id_;
bundle.leader_pid = Strategy_t::leader_;
bundle.leader_local_pid = Strategy_t::leader_local_pid_;
bundle.master_pid = Strategy_t::poller_;
//FIXME: is it always 0?
bundle.master_local_pid = 0; //poller_local_pid_;
bundle.master_local_pid = Strategy_t::master_local_pid_;
bundle.worker = my_worker_comm_;
bundle.opt = my_opt_comm_;
......@@ -117,6 +116,7 @@ private:
/// local (wrt. the communicator group) rank of the master/pilot process
int poller_local_pid_;
int leader_local_pid_;
};
#endif
......@@ -40,48 +40,60 @@ public:
group_id_ = rank_ / group_size;
Comm::id_t group_start = group_id_ * group_size;
// Fix Pilot to core start_group + 1
// fix Pilot to the first rank of the group
poller_ = group_start;
master_local_pid_ = 0;
// Master and Optimizer fixed to first two cores of group
worker_group_id_ = 0;
// Pilot/Master and Optimizer fixed to first two cores of group
if(rank_ % group_size == 0) {
role_ = POLLER;
leader_ = 0;
leader_local_pid_ = 0;
} else if(rank_ % group_size == 1) {
role_ = OPTIMIZER;
leader_ = 1;
leader_local_pid_ = 1;
} else {
role_ = WORKER;
Comm::localId_t worker_group = ((rank_ % group_size) - 2) /
worker_group_id_ = ((rank_ - 2) % (group_size - 2)) /
num_coworkers_worker_;
leader_ = group_start + 2 + worker_group * num_coworkers_worker_;
leader_ = leader_ % group_size;
leader_local_pid_ = 2 + worker_group_id_ * num_coworkers_worker_;
// worker group id is global rank of leader
worker_group_id_ = group_start + leader_;
}
// define coloring for splitting starting with INTERNAL comm
colorings_.push_back(group_start + leader_);
// define coloring for splitting starting with INTERGROUP comm
// this is always the "color" of the leader (global rank)
leader_ = leader_local_pid_ + group_start;
colorings_.push_back(leader_);
// now we can create individual comm group between leaders of
// different groups, i.e. the optimizer leader and pilot leader.
// all non-participating ranks put MPI_UNDEFINED.
// .. and optimizer -- poller leaders
// for optimizer -- pilot leaders
if(role_ == WORKER ||
rank_ % group_size != static_cast<size_t>(leader_))
rank_ % group_size != static_cast<size_t>(local_leader_pid_))
colorings_.push_back(MPI_UNDEFINED);
else
colorings_.push_back(group_id_);
// .. and worker -- poller leaders
// for worker -- pilot leaders
if(role_ == OPTIMIZER ||
rank_ % group_size != static_cast<size_t>(leader_))
rank_ % group_size != static_cast<size_t>(local_leader_pid_))
colorings_.push_back(MPI_UNDEFINED);
else
colorings_.push_back(group_id_);
colorings_.push_back(worker_group_id_);
// .. and finally the "world" communicator
// and finally the "world" communicator for all ranks
// with the same role
if(role_ == WORKER)
colorings_.push_back(0);
else if(role_ == OPTIMIZER)
......@@ -89,9 +101,11 @@ public:
else
colorings_.push_back(2);
//FIXME:
if(role_ == POLLER)
leader_ = 1;
//FIXME: pilot/master is its own leader? unused most likely!
if(role_ == POLLER) {
poller_ = MPI_UNDEFINED;
local_leader_pid_ = MPI_UNDEFINED;
}
}
private:
......@@ -110,7 +124,6 @@ private:
MPI_Abort(getComm(), -111);
}
num_masters_ = 1;
try {
num_masters_ = cmd_args_->getArg<size_t>("num-masters");
......
......@@ -9,7 +9,24 @@
enum commGroupColorings_t {Internal, ExternalToOpt, ExternalToWorker};
/// Defines an interface for splitter strategy implementations
/**
* \brief Defines an interface for splitter strategy implementations.
* \see CommSplitter
*
* A split strategy is a coloring from MPI ranks to colors (ints) to form distinct
* communicator groups (MPI_Comms):
*
* - Internal: all ranks belonging to the same subgroup, i.e. all workers running
*     one simulation concurrently (as e.g. OPAL needs a "comm group" and
*     not a list of ranks).
* - ExternalToOpt: the leader ranks to exchange information about the optimization
* part.
* - ExternalToWorker: the leader ranks to start new simulations, get results, ..
* - Broadcast: all ranks with the same role to broadcast commands, i.e. STOP.
*
* Every implementation needs to populate the colorings_ vector, which is in turn used
* by the CommSplitter.
*/
class SplitStrategy {
public:
......@@ -45,9 +62,11 @@ public:
MPI_Comm getComm() const { return comm_; }
int getRank() const { return rank_; }
int getGlobalRank() const { return global_rank_; }
int getNP() const { return num_procs_; }
int getRank() const { return rank_; }
int getGlobalRank() const { return global_rank_; }
int getNP() const { return num_procs_; }
int getGroupId() const { return group_id_; }
int getWorkerGroupId() const { return worker_group_id_; }
Role_t getRole() const { return role_; }
int getLeader() const { return leader_; }
......@@ -70,6 +89,7 @@ protected:
int global_rank_;
int num_procs_;
int group_id_;
int worker_group_id_;
CmdArguments_t cmd_args_;
......@@ -81,7 +101,13 @@ protected:
/// every core specifies a leader (master is its own leader)
int leader_;
/// every core can specify a master
/// the leader's local pid within its worker group
int leader_local_pid_;
/// the pilot's local pid within the group
int master_local_pid_;
/// the master running the pilot
int poller_;
/// used in master <-> workers communicator
......
......@@ -28,13 +28,28 @@ namespace Comm {
/// bundles all communicators for a specific role/pid
struct Bundle_t {
/// unique island identifier (group of pilot, opt, workers)
int island_id;
/// global pid of the group (pilot, opt, worker) leader
int leader_pid;
/// global pid of the pilot for this group
int master_pid;
/// local pid of the pilot in the communicator groups
int master_local_pid;
/// communicator of all workers
MPI_Comm worker;
/// communicator of all optimizers
MPI_Comm opt;
/// communicator of coworkers, i.e. a subset of ranks assigned to one worker
MPI_Comm coworkers;
/// world communicator as passed to the CommSplitter
MPI_Comm world;
};
}
......
......@@ -43,7 +43,9 @@ public:
pilot_rank_ = comms.master_local_pid;
is_idle_ = true;
coworker_comm_ = comms.coworkers;
group_id_ = comms.leader_pid;
// the leader is fixed to the first rank in the internal comm group
leader_pid_ = 0;
int my_local_pid = 0;
MPI_Comm_rank(coworker_comm_, &my_local_pid);
......@@ -65,6 +67,7 @@ private:
bool is_idle_;
MPI_Comm coworker_comm_;
int group_id_;
Expressions::Named_t objectives_;
Expressions::Named_t constraints_;
......@@ -102,7 +105,7 @@ private:
try {
SimPtr_t sim(new Sim_t(objectives_, constraints_,
params, simulation_name_, coworker_comm_,
cmd_args_));
group_id_, cmd_args_));
sim->run();
} catch(OptPilotException &ex) {
......@@ -184,7 +187,8 @@ protected:
reqVarContainer_t requested_results;
try {
SimPtr_t sim(new Sim_t(objectives_, constraints_,
params, simulation_name_, coworker_comm_, cmd_args_));
params, simulation_name_, coworker_comm_,
group_id_, cmd_args_));
// run simulation in a "blocking" fashion
sim->run();
......
......@@ -34,7 +34,8 @@
OpalSimulation::OpalSimulation(Expressions::Named_t objectives,
Expressions::Named_t constraints,
Param_t params, std::string name,
MPI_Comm comm, CmdArguments_t args)
MPI_Comm comm, int groupId,
CmdArguments_t args)
: Simulation(args)
, objectives_(objectives)
, constraints_(constraints)
......@@ -67,34 +68,13 @@ OpalSimulation::OpalSimulation(Expressions::Named_t objectives,
std::pair<std::string, std::string>(parameter.first, value.str()));
}
/*
This is a copy from Comm/Splitter/ManyMasterSplit.h
in order to calculate the leader which is the unique ID in case
of more than one core per worker.
*/
int my_rank=0;
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
int world_size=0;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
unsigned num_coworkers_worker_ = 0;
num_coworkers_worker_ = args->getArg<size_t>("num-coworkers");
unsigned group_start = 0;
unsigned worker_group = ((my_rank % world_size) - 2) / num_coworkers_worker_;
unsigned leader_ = group_start + 2 + worker_group * num_coworkers_worker_;
leader_ = leader_ % world_size;
// hash the dictionary to get a short unique directory name for temporary
// simulation data
std::string hash = HashNameGenerator::generate(dict);
std::ostringstream tmp;
tmp.precision(15);
tmp << simTmpDir_ << "/" << hash << "_" << leader_;
tmp << simTmpDir_ << "/" << hash << "_" << groupId;
simulationDirName_ = tmp.str();
......@@ -466,4 +446,4 @@ void OpalSimulation::cleanUp() {
}
#endif
}
\ No newline at end of file
}
......@@ -39,12 +39,13 @@ public:
* @param[in] params
* @param[in] name of the simulation
* @param[in] comm MPI communicator used for running the simulation
* @param[in] groupId worker group (unique) identifier
* @param[in] args command line arguments passed to the framework
*/
OpalSimulation(Expressions::Named_t objectives,
Expressions::Named_t constraints,
Param_t params, std::string name, MPI_Comm comm,
CmdArguments_t args);
int groupId, CmdArguments_t args);
virtual ~OpalSimulation();
......@@ -112,4 +113,4 @@ private:
void restoreOut();
};
#endif
\ No newline at end of file
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment