Commit 7475754a authored by snuverink_j's avatar snuverink_j
Browse files

Resolve "Remove DKS"

parent db2860f9
......@@ -137,26 +137,6 @@ message ("Found Boost library dir: ${Boost_LIBRARY_DIR}")
message ("Found Boost libraries: ${Boost_LIBRARIES}")
include_directories (SYSTEM ${Boost_INCLUDE_DIRS})
# Optional DKS support; switched off unless the user passes -DENABLE_DKS=ON.
option (ENABLE_DKS "Enable DKS" OFF)
if (ENABLE_DKS)
    # Look for DKS (version 1.1.1 requested); the DKS_PREFIX, DKS_DIR and DKS
    # environment variables are used as search hints.
    find_package (DKS 1.1.1 REQUIRED
        HINTS $ENV{DKS_PREFIX} $ENV{DKS_DIR} $ENV{DKS})

    # Version string wrapped in escaped quotes so that it can be substituted
    # into generated sources as a C string literal.
    set (DKS_VERSION_OPAL \"${DKS_VERSION}\")

    message ("Found DKS version: ${DKS_VERSION}")
    message ("Found DKS library: ${DKS_LIBRARY}")
    message ("Found DKS include dir: ${DKS_INCLUDE_DIR}")

    # OpenCL backend (currently disabled):
    #add_compile_options (-lOpenCL -pthread -DDKS_OPENCL)

    # Preprocessor symbols, in order: MPI support in DKS, the CUDA backend,
    # and the switches that enable the DKS code paths in IPPL and OPAL.
    add_definitions (-DDKS_MPI -DDKS_CUDA -DIPPL_DKS -DIPPL_DKS_CUDA -DOPAL_DKS)
endif ()
# Handle options
option (BUILD_OPAL_UNIT_TESTS "Unit tests" OFF)
if (BUILD_OPAL_UNIT_TESTS)
......
......@@ -86,21 +86,7 @@ include_directories (
add_library ( ippl ${IPPL_SRCS} ${IPPL_SRCS_FORT} )
# Link step for the ippl library.
# With DKS enabled the DKS include and library directories are added and the
# dks library plus ${DKS_CUDA_LIBS} are put on the link line; otherwise ippl is
# linked without any additional libraries.
if (ENABLE_DKS)
message ( "DKS include dirs: ${DKS_INCLUDE_DIR}")
include_directories (${DKS_INCLUDE_DIR})
link_directories (${DKS_LIBRARY_DIR})
# NOTE(review): the first argument of target_link_libraries is the target name.
# ${TARGET_LINK_LIBRARIES} is not set anywhere in the visible sources; if it
# expands to nothing, "ippl" ends up being the target (presumably the intent).
# If it is ever non-empty, ippl would instead be linked INTO that target - confirm.
target_link_libraries (
${TARGET_LINK_LIBRARIES}
ippl
dks
${DKS_CUDA_LIBS}
)
else()
# No extra libraries are listed here, so this call adds nothing to ippl's link line.
target_link_libraries(
ippl
)
endif ()
target_link_libraries(ippl)
install (TARGETS ippl DESTINATION lib)
install (FILES ${IPPL_BASEDIR_HDRS} DESTINATION include)
......
......@@ -20,10 +20,6 @@
#include "FFT/FFTBase.h"
#ifdef IPPL_DKS
#include "DKSOPAL.h"
#endif
// forward declarations
//template <unsigned Dim> class FieldLayout;
#include "FieldLayout/FieldLayout.h"
......@@ -57,11 +53,6 @@ class FFT : public FFTBase<Dim,T> {};
template <size_t Dim, class T>
class FFT<CCTransform,Dim,T> : public FFTBase<Dim,T> {
private:
#ifdef IPPL_DKS
DKSOPAL base;
#endif
public:
typedef FieldLayout<Dim> Layout_t;
......@@ -102,18 +93,6 @@ FFT(const Domain_t& cdomain, const bool& compressTemps=false)
normFact /= lengths[d];
}
#if defined(IPPL_DKS) && defined(IPPL_DKS_CUDA)
INFOMSG("Init DKS base cuda" << endl);
base.setAPI("Cuda", 4);
base.setDevice("-gpu", 4);
base.initDevice();
#elseif defined(IPPL_DKS) && defined(IPPL_DKS_OPENCL)
INFOMSG("Init DKS base opencl" << endl);
base.setAPI("OpenCL", 6);
base.setDevice("-gpu", 4);
base.initDevice();
#endif
// set up FFT Engine
this->getEngine().setup(Dim, transformTypes, lengths);
// set up the temporary fields
......@@ -461,11 +440,7 @@ public:
/** real-to-complex FFT on GPU (only available when IPPL_DKS is defined):
the real field f is transferred into the buffer real_ptr and the FFT is
executed through dksbase; the complex result is left in the buffer comp_ptr.
The function itself returns nothing.
direction   +1 additionally normalizes the complex result
f           real input field
real_ptr    buffer that receives the real data
comp_ptr    buffer that receives the complex result
dksbase     DKS object used for the transfers and the FFT
streamId    -1 (default) selects the gather3DData path when several nodes
            take part, any other value selects the asynchronous transfer path
constInput  not used by the implementation
*/
#ifdef IPPL_DKS
void transformDKSRC(int direction, RealField_t &f, void* real_ptr, void* comp_ptr,
DKSOPAL &dksbase, int streamId = -1, const bool& constInput=false);
#endif
*/
/** complex-to-real FFT
Same as above, but with input and output field types reversed.
*/
......@@ -473,14 +448,10 @@ public:
const bool& constInput=false);
void transform(const char* directionName, ComplexField_t& f,
RealField_t& g, const bool& constInput=false);
/** complex-to-real FFT on GPU (only available when IPPL_DKS is defined):
takes the complex field stored in the buffer comp_ptr, performs the inverse
FFT into the buffer real_ptr through dksbase and transfers the real field
back into the host storage of g.
direction   +1 additionally normalizes the real result
g           real output field
real_ptr    buffer holding the real result of the inverse FFT
comp_ptr    buffer holding the complex input
dksbase     DKS object used for the FFT and the transfers
streamId    stream id forwarded to the DKS FFT, normalization and read calls
constInput  not used by the implementation
*/
#ifdef IPPL_DKS
void transformDKSCR(int direction, RealField_t& g, void* real_ptr, void* comp_ptr,
DKSOPAL &dksbase, int streamId = -1, const bool& constInput=false);
#endif
private:
......
......@@ -937,135 +937,6 @@ FFT<RCTransform,Dim,T>::~FFT(void) {
}
//-----------------------------------------------------------------------------
// real-to-complex fft; direction is +1 or -1
//-----------------------------------------------------------------------------
/*
  GPU (DKS) version of the real-to-complex FFT.

  The local data of the real field f is moved into the buffer real_ptr
  (gathered from all nodes when more than one node takes part). Node 0 then
  runs the real-to-complex FFT from real_ptr into comp_ptr and, if direction
  is +1, normalizes the complex result. Nothing is copied back to the host.

  direction   +1 triggers the normalization of the complex result
  f           real input field; it is used as is, keeping its decomposition
  real_ptr    buffer that receives the real data
  comp_ptr    buffer that receives the complex result
  dksbase     DKS object performing the transfers and the FFT
  streamId    -1: blocking gather3DData path on multi-node runs;
              otherwise: asynchronous transfer on that stream
  constInput  unused

  NOTE(review): sizereal and the idx/idy/idz bookkeeping address dimensions
  0..2 explicitly, so this routine presumably expects Dim == 3 - confirm.
  NOTE(review): gather3DData is called with MPI_DOUBLE although the element
  type is the template parameter T - confirm T is always double here.
*/
#ifdef IPPL_DKS
template <size_t Dim, class T>
void
FFT<RCTransform,Dim,T>::transformDKSRC(
    int direction,
    typename FFT<RCTransform,Dim,T>::RealField_t& f,
    void* real_ptr,
    void* comp_ptr,
    DKSOPAL &dksbase,
    int streamId,
    const bool& constInput)
{
    // check the domain of the incoming field
    const Layout_t& in_layout = f.getLayout();
    const Domain_t& in_dom = in_layout.getDomain();
    PAssert_EQ( this->checkDomain(this->getDomain(), in_dom), true);

    size_t nTransformDims = this->numTransformDims();

    // just use the field f as is and keep the decomposition as defined in the input file
    RealField_t* tempR = &f;

    typename RealField_t::const_iterator_if rl_i = tempR->begin_if();

    // get the LField
    RealLField_t* rldf = (*rl_i).second.get();
    // make sure we are uncompressed
    rldf->Uncompress();
    // get the raw data pointer
    T* localreal = rldf->getP();

    // global dimensions of the real domain, local dimensions of the real
    // subdomain and global dimensions of the complex domain
    int NR_l[Dim], NR_g[Dim], NC_g[Dim];
    for (size_t d = 0; d < Dim; d++) {
        NR_l[d] = (int)rldf->size(d);
        NR_g[d] = (int)tempR->getDomain()[d].length();
        NC_g[d] = NR_g[d];
    }
    NC_g[0] = (NC_g[0] / 2) + 1;

    // local and global number of real values
    int sizereal = NR_l[0]*NR_l[1]*NR_l[2];
    int totalreal = tempR->getDomain().size();

    // per-node starting positions of the real field subdomains
    int *idx = new int[Ippl::getNodes()];
    int *idy = new int[Ippl::getNodes()];
    int *idz = new int[Ippl::getNodes()];

    // local vnodes
    for (typename Layout_t::const_iterator_iv i_s = tempR->getLayout().begin_iv(); i_s != tempR->getLayout().end_iv(); ++i_s) {
        Domain_t tmp = (*i_s).second->getDomain();
        int node = (*i_s).second->getNode();
        idx[node] = tmp[0].min();
        idy[node] = tmp[1].min();
        idz[node] = tmp[2].min();
    }

    // remote vnodes
    for (typename Layout_t::iterator_dv remote = tempR->getLayout().begin_rdv(); remote != tempR->getLayout().end_rdv(); ++remote) {
        Domain_t tmp = (*remote).second->getDomain();
        int node = (*remote).second->getNode();
        idx[node] = tmp[0].min();
        idy[node] = tmp[1].min();
        idz[node] = tmp[2].min();
    }

    // starting position of this node's subdomain
    int id[3] = {idx[Ippl::myNode()], idy[Ippl::myNode()], idz[Ippl::myNode()]};

    /* DKS part */
    if (Ippl::myNode() == 0) {

        // if only one node is working do a plain dksbase write, otherwise gather
        if (Ippl::getNodes() > 1) {

            if (streamId == -1) {
                // gather data from the different MPI processes directly into the gpu buffer
                dksbase.gather3DData( real_ptr, localreal, sizereal, MPI_DOUBLE, NR_g, NR_l,
                                      idx, idy, idz,
                                      Ippl::getNodes(), Ippl::myNode(), 0, Ippl::getComm() );
            } else {
                // gather data using CUDA IPC for async data transfer
                dksbase.gather3DDataAsync<T>( real_ptr, localreal, NR_g, NR_l, id, streamId);
                // sync needed to wait for the data transfer to finish
                dksbase.syncDevice();
                MPI_Barrier(Ippl::getComm());
            }

        } else {
            // write real data to device
            dksbase.writeDataAsync<T>(real_ptr, localreal, totalreal, streamId);
        }

        // call real to complex fft
        dksbase.callR2CFFT(real_ptr, comp_ptr, nTransformDims, (int*)NR_g, streamId);

        // normalize fft
        if (direction == +1)
            dksbase.callNormalizeFFT(comp_ptr, nTransformDims, (int*) NC_g, streamId);

    } else {

        if (streamId == -1) {
            // send data via gatherv to the gpu controlled by the root process
            dksbase.gather3DData( NULL, localreal, sizereal, MPI_DOUBLE, NR_g, NR_l, idx, idy, idz,
                                  Ippl::getNodes(), Ippl::myNode(), 0, Ippl::getComm() );
        } else {
            // transfer data to device memory
            dksbase.gather3DDataAsync<T>( real_ptr, localreal, NR_g, NR_l, id, streamId);
            // sync needed to wait for the data transfer to finish
            dksbase.syncDevice();
            MPI_Barrier(Ippl::getComm());
        }

    }
    /* end dks part */

    // release the per-node offset tables; they were previously leaked on every call
    delete[] idx;
    delete[] idy;
    delete[] idz;
}
#endif
template <size_t Dim, class T>
void
FFT<RCTransform,Dim,T>::transform(
......@@ -1074,8 +945,6 @@ FFT<RCTransform,Dim,T>::transform(
typename FFT<RCTransform,Dim,T>::ComplexField_t& g,
const bool& constInput)
{
// time the whole mess
// indicate we're doing another fft
// incipplstat(incffts);
......@@ -1314,140 +1183,6 @@ FFT<RCTransform,Dim,T>::transform(
// RC FFT; opposite direction, from complex to real
//-----------------------------------------------------------------------------
/*
  GPU (DKS) version of the complex-to-real inverse FFT.

  Node 0 runs the complex-to-real FFT from the buffer comp_ptr into the
  buffer real_ptr and, if direction is +1, normalizes the real result. The
  real data is then moved into the local storage of g: scattered to all nodes
  when more than one node takes part, otherwise read back directly.

  direction   +1 triggers the normalization of the real result
  g           real output field; it is used as is, keeping its decomposition
  real_ptr    buffer holding the real result of the inverse FFT
  comp_ptr    buffer holding the complex input
  dksbase     DKS object performing the FFT and the transfers
  streamId    stream id forwarded to the FFT, normalization and read calls
  constInput  unused

  NOTE(review): the idx/idy/idz bookkeeping addresses dimensions 0..2
  explicitly, so this routine presumably expects Dim == 3 - confirm.
*/
#ifdef IPPL_DKS
template <size_t Dim, class T>
void
FFT<RCTransform,Dim,T>::transformDKSCR(
    int direction,
    RealField_t& g,
    void* real_ptr,
    void* comp_ptr,
    DKSOPAL &dksbase,
    int streamId,
    const bool& constInput)
{
    // check the domain of the output field
    const Layout_t& out_layout = g.getLayout();
    const Domain_t& out_dom = out_layout.getDomain();
    PAssert_EQ( this->checkDomain(this->getDomain(),out_dom), true);

    size_t nTransformDims = this->numTransformDims();

    // use g as is and keep the decomposition as defined in the input file;
    // the result is therefore written directly into g
    RealField_t* tempR = &g;

    typename RealField_t::const_iterator_if rl_i = tempR->begin_if();

    // get the LField
    RealLField_t* rldf = (*rl_i).second.get();
    // make sure we are uncompressed
    rldf->Uncompress();
    // get the raw data pointer
    T* localreal = rldf->getP();

    // sizes of the global real domain and of the local real subdomain
    int NR_l[Dim], NR_g[Dim];
    for (size_t d=0; d<Dim; d++) {
        NR_l[d] = (int)rldf->size(d);
        NR_g[d] = (int)tempR->getDomain()[d].length();
    }

    // global number of real values
    int totalreal = tempR->getDomain().size();

    // per-node starting positions of the real field subdomains
    int *idx = new int[Ippl::getNodes()];
    int *idy = new int[Ippl::getNodes()];
    int *idz = new int[Ippl::getNodes()];

    // local vnodes
    for (typename Layout_t::const_iterator_iv i_s = tempR->getLayout().begin_iv(); i_s != tempR->getLayout().end_iv(); ++i_s) {
        Domain_t tmp = (*i_s).second->getDomain();
        int node = (*i_s).second->getNode();
        idx[node] = tmp[0].min();
        idy[node] = tmp[1].min();
        idz[node] = tmp[2].min();
    }

    // remote vnodes
    for (typename Layout_t::iterator_dv remote = tempR->getLayout().begin_rdv(); remote != tempR->getLayout().end_rdv(); ++remote) {
        Domain_t tmp = (*remote).second->getDomain();
        int node = (*remote).second->getNode();
        idx[node] = tmp[0].min();
        idy[node] = tmp[1].min();
        idz[node] = tmp[2].min();
    }

    // starting position of this node's subdomain
    int id[3] = {idx[Ippl::myNode()], idy[Ippl::myNode()], idz[Ippl::myNode()]};

    /* DKS part */
    if (Ippl::myNode() == 0) {

        // call complex to real fft
        dksbase.callC2RFFT(real_ptr, comp_ptr, nTransformDims, (int*)NR_g, streamId);

        // normalize
        if (direction == +1)
            dksbase.callNormalizeC2RFFT(real_ptr, nTransformDims, (int*)NR_g, streamId);

        if (Ippl::getNodes() > 1) {
            dksbase.syncDevice();
            MPI_Barrier(Ippl::getComm());
            // distribute the real data to the participating nodes
            dksbase.scatter3DDataAsync<T>(real_ptr, localreal, NR_g, NR_l, id);
            MPI_Barrier(Ippl::getComm());
            dksbase.syncDevice();
            MPI_Barrier(Ippl::getComm());
        } else {
            // read real data from device
            dksbase.readDataAsync<T>(real_ptr, localreal, totalreal, streamId);
            dksbase.syncDevice();
        }

    } else {

        // receive data from the GPU controlled by the root process
        MPI_Barrier(Ippl::getComm());
        dksbase.scatter3DDataAsync<T>(real_ptr, localreal, NR_g, NR_l, id);
        MPI_Barrier(Ippl::getComm());
        dksbase.syncDevice();
        MPI_Barrier(Ippl::getComm());
    }
    /* end dks part */

    // release the per-node offset tables; they were previously leaked on every call
    delete[] idx;
    delete[] idy;
    delete[] idz;
}
#endif
template <size_t Dim, class T>
void
FFT<RCTransform,Dim,T>::transform(
......@@ -3563,4 +3298,4 @@ FFT<SineTransform,1U,T>::transform(
// c-basic-offset: 4
// indent-tabs-mode: nil
// require-final-newline: nil
// End:
// End:
\ No newline at end of file
......@@ -698,8 +698,6 @@ LField<T,Dim>::swapData( LField<T,Dim>& a )
//
//////////////////////////////////////////////////////////////////////
// allocate memory for LField and if DKS is used and page-locked (pl) is +1 allocate
// page-locked memory for storage
template<class T, unsigned Dim>
void
LField<T,Dim>::allocateStorage(int newsize)
......
......@@ -12,7 +12,4 @@ const char *ippl_compile_machine = ${IPPL_COMPILE_MACHINE};
const char *ippl_compile_options = ${IPPL_COMPILE_OPTIONS};
const char *ippl_compile_user = ${IPPL_COMPILE_USER};
/* DKS version number */
#define IPPL_DKS_VERSION ${DKS_VERSION_OPAL}
#endif
......@@ -81,15 +81,6 @@ void IpplInfo::deleteGlobals() {
std::stack<StaticIpplInfo> IpplInfo::stashedStaticMembers;
// DKS-related static members of IpplInfo, initialized to their default values.
// DKSEnabled stays false until the --use-dks command line flag turns it on.
bool IpplInfo::DKSEnabled = false;

// The DKS object is created for the backend selected at compile time.
// "#elif" is required here: "#elseif" is not a preprocessor directive, so the
// OpenCL definition could never be selected.
// NOTE(review): if IPPL_DKS is defined without IPPL_DKS_CUDA or
// IPPL_DKS_OPENCL, no definition of IpplInfo::DKS is emitted - confirm that
// configuration is not supported.
#if defined(IPPL_DKS) && defined(IPPL_DKS_CUDA)
DKSOPAL *IpplInfo::DKS = new DKSOPAL("Cuda", "-gpu");
#elif defined(IPPL_DKS) && defined(IPPL_DKS_OPENCL)
DKSOPAL *IpplInfo::DKS = new DKSOPAL("OpenCL", "-gpu");
#endif
// should we use the optimization of deferring guard cell fills until
// absolutely needed? Can be changed to true by specifying the
// flag --defergcfill
......@@ -365,27 +356,6 @@ IpplInfo::IpplInfo(int& argc, char**& argv, int removeargs, MPI_Comm mpicomm) {
param_error(argv[i],
"Please specify an output level from 0 to 5", 0);
} else if ( ( strcmp(argv[i], "--use-dks") == 0 ) ) {
// Set DKSEnabled to true if OPAL is compiled with DKS.
#ifdef IPPL_DKS
int ndev = 0;
DKS->getDeviceCount(ndev);
if (ndev > 0) {
DKSEnabled = true;
INFOMSG("DKS enabled OPAL will use GPU where possible");
INFOMSG(endl);
} else {
DKSEnabled = false;
INFOMSG("No GPU device detected! --use-dks flag will have no effect");
INFOMSG(endl);
}
//TODO: check if any device is available and disable DKS if there isn't
#else
DKSEnabled = false;
INFOMSG("OPAL compiled without DKS, " << argv[i] << " flag has no effect");
INFOMSG(endl);
#endif
} else if ( ( strcmp(argv[i], "--warn") == 0 ) ) {
// Set the output level for warning messages.
if ( (i + 1) < argc && argv[i+1][0] != '-' && atoi(argv[i+1]) >= 0 )
......@@ -760,9 +730,6 @@ int IpplInfo::mySMPNode() {
// printVersion: print out a version summary. If the argument is true,
// print out a detailed listing, otherwise a summary.
void IpplInfo::printVersion(void) {
#ifdef OPAL_DKS
INFOMSG("DKS Version " << IPPL_DKS_VERSION << endl);
#endif
INFOMSG("IPPL Framework version " << version() << endl);
INFOMSG("Last build date: " << compileDate() << " by user ");
INFOMSG(compileUser() << endl);
......
......@@ -55,11 +55,6 @@
//(without further increasing the number of defines).
#include <mpi.h>
//DKS include
#ifdef IPPL_DKS
#include "DKSOPAL.h"
#endif
// forward declarations
class IpplStats;
class IpplInfo;
......@@ -87,10 +82,6 @@ public:
static IpplStats *Stats;
#ifdef IPPL_DKS
static DKSOPAL *DKS;
#endif
// Constructor 1: specify the argc, argv values from the cmd line.
// The second argument controls whether the IPPL-specific command line
// arguments are stripped out (the default) or left in (if the setting
......@@ -252,9 +243,6 @@ public:
// library (from IpplVersions.h)
static const char *compileUser();
// Static flag telling whether to use DKS when running OPAL
static bool DKSEnabled;
// stash all static members
static void stash();
......
......@@ -2,9 +2,5 @@
#include "Message/Communicate.h"
#include "Utility/IpplStats.h"
#ifdef IPPL_DKS
#include "DKSOPAL.h"
#endif
StaticIpplInfo::StaticIpplInfo() { }
StaticIpplInfo::~StaticIpplInfo() { }
......@@ -6,10 +6,6 @@
class Communicate;
class IpplStats;
class Inform;
//DKS include
#ifdef IPPL_DKS
class DKSOPAL;
#endif
class StaticIpplInfo {
public:
......@@ -32,10 +28,6 @@ public:
Inform *Error;
Inform *Debug;
#ifdef IPPL_DKS
DKSOPAL *DKS;
#endif
// flag telling whether to use optimization for reducing
// communication by deferring guard cell fills.
bool deferGuardCellFills;
......
......@@ -16,12 +16,6 @@ link_directories (
set (IPPL_LIBS ippl)
# With DKS enabled, the test programs additionally need the DKS headers and
# link against the static DKS library.
if (ENABLE_DKS)
    include_directories (${DKS_INCLUDE_DIR})
    link_directories (${DKS_LIBRARY_DIR})
    list (APPEND IPPL_LIBS ${DKS_LIBRARY_DIR}/libdks.a)
endif ()
add_executable (fftspeed fftspeed.cpp)
add_executable (TestFFT TestFFT.cpp)
add_executable (TestFFT-1 TestFFT-1.cpp)
......@@ -45,7 +39,7 @@ target_link_libraries (TestRCMIC ${IPPL_LIBS} ${MPI_CXX_LIBRARIES} boost_timer
add_subdirectory (SeaborgRes)
# vi: set et ts=4 sw=4 sts=4:
# Local Variables:
# mode: cmake
# cmake-tab-width: 4
......
......@@ -2,8 +2,8 @@
/***************************************************************************
*
* The IPPL Framework
*
* This program was prepared by PSI.
*
* This program was prepared by PSI.
* All rights in the program are reserved by PSI.
* Neither PSI nor the author(s)
* makes any warranty, express or implied, or assumes any liability or
......@@ -16,15 +16,11 @@
#include <complex>
#include <string>
#ifdef IPPL_DKS
#include "DKSOPAL.h"
#endif
using namespace std;
bool Configure(int argc, char *argv[],
unsigned int *nx, unsigned int *ny, unsigned int *nz,
int *serialDim, unsigned int *processes, unsigned int *nLoop)
unsigned int *nx, unsigned int *ny, unsigned int *nz,
int *serialDim, unsigned int *processes, unsigned int *nLoop)
{
Inform msg("Configure ");
......@@ -43,7 +39,7 @@ bool Configure(int argc, char *argv[],
*nLoop = atoi(argv[++i]);
} else if (s == "-Decomp") {
*serialDim = atoi(argv[++i]);
}
}
else {
errmsg << "Illegal format for or unknown option '" << s.c_str() << "'.";
errmsg << endl;
......@@ -72,7 +68,7 @@ std::complex<double> printComplex(std::complex<double> in) {
r = in.real();
if (in.imag() > 0.00001 || in.imag() < -0.00001)
i = in.imag();
return std::complex<double>(r, i);