Code indexing in gitaly is broken and leads to code not being visible to the user. We work on the issue with highest priority.

Skip to content
Snippets Groups Projects

Resolve "mixed-precision"

Merged montan_v requested to merge 146-mixed-precision into master
9 files
+ 512
525
Compare changes
  • Side-by-side
  • Inline
Files
9
+ 70
82
@@ -11,14 +11,14 @@
// stype = Field solver type (FFT and CG supported)
// lbthres = Load balancing threshold i.e., lbthres*100 is the maximum load imbalance
// percentage which can be tolerated and beyond which
// particle load balancing occurs. A value of 0.01 is good for many typical
// particle load balancing occurs. A value of 0.01 is good for many typical
// simulations.
// ovfactor = Over-allocation factor for the buffers used in the communication. Typical
// values are 1.0, 2.0. Value 1.0 means no over-allocation.
// Example:
// srun ./PenningTrap 128 128 128 10000 300 FFT 0.01 --overallocate 1.0 --info 10
//
// Copyright (c) 2021, Sriramkrishnan Muralikrishnan,
// Copyright (c) 2021, Sriramkrishnan Muralikrishnan,
// Paul Scherrer Institut, Villigen PSI, Switzerland
// All rights reserved
//
@@ -89,19 +89,17 @@ struct Newton1D {
}
};
template <typename T, class GeneratorPool, unsigned Dim>
struct generate_random {
using view_type = typename ippl::detail::ViewType<T, 1>::view_type;
using value_type = typename T::value_type;
// Output View for the random numbers
view_type x, v;
using view_type = typename ippl::detail::ViewType<T, 1>::view_type;
using value_type = typename T::value_type;
// Output View for the random numbers
view_type x, v;
// The GeneratorPool
GeneratorPool rand_pool;
// The GeneratorPool
GeneratorPool rand_pool;
T mu, sigma, minU, maxU;
T mu, sigma, minU, maxU;
double pi = Kokkos::numbers::pi_v<double>;
// Initialize all members
@@ -128,17 +126,16 @@ struct generate_random {
v(i)[d] = rand_gen.normal(0.0, 1.0);
}
// Give the state back, which will allow another thread to acquire it
rand_pool.free_state(rand_gen);
}
// Give the state back, which will allow another thread to acquire it
rand_pool.free_state(rand_gen);
}
};
double CDF(const double& x, const double& mu, const double& sigma) {
double cdf = 0.5 * (1.0 + std::erf((x - mu)/(sigma * std::sqrt(2))));
return cdf;
double cdf = 0.5 * (1.0 + std::erf((x - mu) / (sigma * std::sqrt(2))));
return cdf;
}
KOKKOS_FUNCTION
double PDF(const Vector_t<Dim>& xvec, const Vector_t<Dim>& mu, const Vector_t<Dim>& sigma,
const unsigned Dim) {
@@ -158,8 +155,7 @@ int main(int argc, char* argv[]) {
static_assert(Dim == 3, "Penning trap must be 3D");
Ippl ippl(argc, argv);
Inform msg("PenningTrap");
Inform msg2all("PenningTrap",INFORM_ALL_NODES);
Inform msg2all("PenningTrap", INFORM_ALL_NODES);
auto start = std::chrono::high_resolution_clock::now();
ippl::Vector<int, Dim> nr = {std::atoi(argv[1]), std::atoi(argv[2]), std::atoi(argv[3])};
@@ -173,26 +169,20 @@ int main(int argc, char* argv[]) {
static IpplTimings::TimerRef DummySolveTimer = IpplTimings::getTimer("solveWarmup");
static IpplTimings::TimerRef SolveTimer = IpplTimings::getTimer("Solve");
static IpplTimings::TimerRef domainDecomposition = IpplTimings::getTimer("loadBalance");
IpplTimings::startTimer(mainTimer);
size_type totalP = std::atol(argv[4]);
const unsigned int nt = std::atoi(argv[5]);
msg << "Penning Trap "
<< endl
<< "nt " << nt << " Np= "
<< totalP << " grid = " << nr
<< endl;
size_type totalP = std::atol(argv[4]);
const unsigned int nt = std::atoi(argv[5]);
msg << "Penning Trap " << endl << "nt " << nt << " Np= " << totalP << " grid = " << nr << endl;
using bunch_type = ChargedParticles<PLayout_t<Dim>, Dim, double>;
std::unique_ptr<bunch_type> P;
std::unique_ptr<bunch_type> P;
ippl::NDIndex<Dim> domain;
for (unsigned i = 0; i< Dim; i++) {
for (unsigned i = 0; i < Dim; i++) {
domain[i] = ippl::Index(nr[i]);
}
@@ -227,27 +217,27 @@ int main(int argc, char* argv[]) {
Vector_t<Dim> mu, sd;
for (unsigned d = 0; d<Dim; d++) {
for (unsigned d = 0; d < Dim; d++) {
mu[d] = 0.5 * length[d];
}
sd[0] = 0.15*length[0];
sd[1] = 0.05*length[1];
sd[2] = 0.20*length[2];
sd[0] = 0.15 * length[0];
sd[1] = 0.05 * length[1];
sd[2] = 0.20 * length[2];
P->initializeFields(mesh, FL);
bunch_type bunchBuffer(PL);
P->initSolver();
P->time_m = 0.0;
P->time_m = 0.0;
P->loadbalancethreshold_m = std::atof(argv[7]);
bool isFirstRepartition;
if ((P->loadbalancethreshold_m != 1.0) && (Ippl::Comm->size() > 1)) {
msg << "Starting first repartition" << endl;
IpplTimings::startTimer(domainDecomposition);
isFirstRepartition = true;
isFirstRepartition = true;
const ippl::NDIndex<Dim>& lDom = FL.getLocalNDIndex();
const int nghost = P->rho_m.getNghost();
using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
@@ -273,13 +263,12 @@ int main(int argc, char* argv[]) {
});
Kokkos::fence();
P->initializeORB(FL, mesh);
P->repartition(FL, mesh, bunchBuffer, isFirstRepartition);
IpplTimings::stopTimer(domainDecomposition);
}
msg << "First domain decomposition done" << endl;
IpplTimings::startTimer(particleCreation);
@@ -288,47 +277,46 @@ int main(int argc, char* argv[]) {
const typename RegionLayout_t::host_mirror_type Regions = RLayout.gethLocalRegions();
Vector_t<Dim> Nr, Dr, minU, maxU;
int myRank = Ippl::Comm->rank();
for (unsigned d = 0; d <Dim; ++d) {
Nr[d] = CDF(Regions(myRank)[d].max(), mu[d], sd[d]) -
CDF(Regions(myRank)[d].min(), mu[d], sd[d]);
Dr[d] = CDF(rmax[d], mu[d], sd[d]) - CDF(rmin[d], mu[d], sd[d]);
for (unsigned d = 0; d < Dim; ++d) {
Nr[d] = CDF(Regions(myRank)[d].max(), mu[d], sd[d])
- CDF(Regions(myRank)[d].min(), mu[d], sd[d]);
Dr[d] = CDF(rmax[d], mu[d], sd[d]) - CDF(rmin[d], mu[d], sd[d]);
minU[d] = CDF(Regions(myRank)[d].min(), mu[d], sd[d]);
maxU[d] = CDF(Regions(myRank)[d].max(), mu[d], sd[d]);
maxU[d] = CDF(Regions(myRank)[d].max(), mu[d], sd[d]);
}
double factor = (Nr[0] * Nr[1] * Nr[2]) / (Dr[0] * Dr[1] * Dr[2]);
size_type nloc = (size_type)(factor * totalP);
double factor = (Nr[0] * Nr[1] * Nr[2]) / (Dr[0] * Dr[1] * Dr[2]);
size_type nloc = (size_type)(factor * totalP);
size_type Total_particles = 0;
MPI_Allreduce(&nloc, &Total_particles, 1,
MPI_UNSIGNED_LONG, MPI_SUM, Ippl::getComm());
MPI_Allreduce(&nloc, &Total_particles, 1, MPI_UNSIGNED_LONG, MPI_SUM, Ippl::getComm());
int rest = (int) (totalP - Total_particles);
int rest = (int)(totalP - Total_particles);
if ( Ippl::Comm->rank() < rest )
if (Ippl::Comm->rank() < rest)
++nloc;
P->create(nloc);
Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank()));
Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100 * Ippl::Comm->rank()));
Kokkos::parallel_for(nloc,
generate_random<Vector_t<Dim>, Kokkos::Random_XorShift64_Pool<>, Dim>(
P->R.getView(), P->P.getView(), rand_pool64, mu, sd, minU, maxU));
Kokkos::fence();
Ippl::Comm->barrier();
IpplTimings::stopTimer(particleCreation);
P->q = P->Q_m/totalP;
IpplTimings::stopTimer(particleCreation);
P->q = P->Q_m / totalP;
msg << "particles created and initial conditions assigned " << endl;
isFirstRepartition = false;
//The update after the particle creation is not needed as the
//particles are generated locally
// The update after the particle creation is not needed as the
// particles are generated locally
IpplTimings::startTimer(DummySolveTimer);
P->rho_m = 0.0;
P->runSolver();
IpplTimings::stopTimer(DummySolveTimer);
P->scatterCIC(totalP, 0, hr);
IpplTimings::startTimer(SolveTimer);
@@ -340,16 +328,15 @@ int main(int argc, char* argv[]) {
IpplTimings::startTimer(dumpDataTimer);
P->dumpData();
P->gatherStatistics(totalP);
//P->dumpLocalDomains(FL, 0);
// P->dumpLocalDomains(FL, 0);
IpplTimings::stopTimer(dumpDataTimer);
double alpha = -0.5 * dt;
double DrInv = 1.0 / (1 + (std::pow((alpha * Bext), 2)));
double DrInv = 1.0 / (1 + (std::pow((alpha * Bext), 2)));
// begin main timestep loop
msg << "Starting iterations ..." << endl;
for (unsigned int it=0; it<nt; it++) {
// Staggered Leap frog or Boris algorithm as per
for (unsigned int it = 0; it < nt; it++) {
// Staggered Leap frog or Boris algorithm as per
// https://www.sciencedirect.com/science/article/pii/S2590055219300526
// eqns 4(a)-4(c). Note we don't use the Boris trick here and do
// the analytical matrix inversion which is not complex in this case.
@@ -380,31 +367,31 @@ int main(int argc, char* argv[]) {
});
IpplTimings::stopTimer(PTimer);
//drift
// drift
IpplTimings::startTimer(RTimer);
P->R = P->R + dt * P->P;
IpplTimings::stopTimer(RTimer);
//Since the particles have moved spatially update them to correct processors
IpplTimings::startTimer(updateTimer);
// Since the particles have moved spatially update them to correct processors
IpplTimings::startTimer(updateTimer);
PL.update(*P, bunchBuffer);
IpplTimings::stopTimer(updateTimer);
// Domain Decomposition
if (P->balance(totalP, it+1)) {
msg << "Starting repartition" << endl;
IpplTimings::startTimer(domainDecomposition);
P->repartition(FL, mesh, bunchBuffer, isFirstRepartition);
IpplTimings::stopTimer(domainDecomposition);
//IpplTimings::startTimer(dumpDataTimer);
//P->dumpLocalDomains(FL, it+1);
//IpplTimings::stopTimer(dumpDataTimer);
if (P->balance(totalP, it + 1)) {
msg << "Starting repartition" << endl;
IpplTimings::startTimer(domainDecomposition);
P->repartition(FL, mesh, bunchBuffer, isFirstRepartition);
IpplTimings::stopTimer(domainDecomposition);
// IpplTimings::startTimer(dumpDataTimer);
// P->dumpLocalDomains(FL, it+1);
// IpplTimings::stopTimer(dumpDataTimer);
}
//scatter the charge onto the underlying grid
P->scatterCIC(totalP, it+1, hr);
//Field solve
// scatter the charge onto the underlying grid
P->scatterCIC(totalP, it + 1, hr);
// Field solve
IpplTimings::startTimer(SolveTimer);
P->runSolver();
IpplTimings::stopTimer(SolveTimer);
@@ -412,7 +399,7 @@ int main(int argc, char* argv[]) {
// gather E field
P->gatherCIC();
//kick
// kick
IpplTimings::startTimer(PTimer);
auto R2view = P->R.getView();
auto P2view = P->P.getView();
@@ -447,7 +434,7 @@ int main(int argc, char* argv[]) {
P->dumpData();
P->gatherStatistics(totalP);
IpplTimings::stopTimer(dumpDataTimer);
msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl;
msg << "Finished time step: " << it + 1 << " time: " << P->time_m << endl;
}
msg << "Penning Trap: End." << endl;
@@ -456,7 +443,8 @@ int main(int argc, char* argv[]) {
IpplTimings::print(std::string("timing.dat"));
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> time_chrono = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
std::chrono::duration<double> time_chrono =
std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
std::cout << "Elapsed time: " << time_chrono.count() << std::endl;
return 0;
Loading