Commit b3a12e02 authored by Uldis Locans's avatar Uldis Locans
Browse files

OpenCL FFT using clfft and tests

parent 4432d324
......@@ -167,6 +167,40 @@ public:
return DKS_SUCCESS;
}
/** Zero CUDA memory.
 * Clears a run of elements of a device array through cudaMemset.
 *
 * mem_ptr - device pointer to the start of the array
 * size    - number of elements of type T to set to zero
 * offset  - index of the first element to clear, counted in elements of T
 *
 * Return: DKS_SUCCESS when cudaMemset reports cudaSuccess, DKS_ERROR otherwise
 */
template<typename T>
int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {

  // cudaMemset works on bytes, so translate the element count first
  T *first = mem_ptr + offset;
  const size_t nbytes = sizeof(T) * size;

  if (cudaMemset(first, 0, nbytes) == cudaSuccess)
    return DKS_SUCCESS;

  DEBUG_MSG("Error zeroing cuda memory!\n");
  return DKS_ERROR;
}
/** Zero CUDA memory.
* Set all the elements of the array on the device to zero.
*/
template<typename T>
int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
int dkserror = DKS_SUCCESS;
cudaError cerror;
if (streamId < cuda_numberOfStreams()) {
cerror = cudaMemsetAsync(mem_ptr + offset, 0, sizeof(T) * size,
cuda_getStream(streamId));
if (cerror != cudaSuccess)
dkserror = DKS_ERROR;
} else
dkserror = DKS_ERROR;
return dkserror;
}
/**
* Info: write data to memory
* Return: success or error code
......
......@@ -189,12 +189,11 @@ __global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
}
}
......@@ -273,7 +272,6 @@ __global__ void mirroredRhoField(double *rho2_m,
id7 = rk * NI * NJ + rj * NI + i;
id8 = rk * NI * NJ + rj * NI + ri;
double data = rho2_m[id1];
if (i != 0) rho2_m[id2] = data;
......@@ -389,8 +387,10 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
int thread = 128;
int block = (I * J * K / thread) + 1;
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
if (streamId == -1) {
m_base->cuda_zeroMemory( (double*)rho2_m, sizerho, 0 );
kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen,
2*(I - 1), 2*(J - 1), I, J, K);
return DKS_SUCCESS;
......@@ -399,6 +399,7 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
if (streamId < m_base->cuda_numberOfStreams()) {
cudaStream_t cs = m_base->cuda_getStream(streamId);
m_base->cuda_zeroMemoryAsync( (double*)rho2_m, sizerho, 0, streamId);
kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen,
2*(I - 1), 2*(J - 1), I, J, K);
return DKS_SUCCESS;
......
......@@ -114,6 +114,7 @@ DKSBase::DKSBase() {
oclfft = new OpenCLFFT(oclbase);
oclchi = new OpenCLChiSquare(oclbase);
oclcol = new OpenCLCollimatorPhysics(oclbase);
oclgreens = new OpenCLGreensFunction(oclbase);
#endif
#ifdef DKS_MIC
......@@ -149,6 +150,7 @@ DKSBase::DKSBase(const char* api_name, const char* device_name) {
oclfft = new OpenCLFFT(oclbase);
oclchi = new OpenCLChiSquare(oclbase);
oclcol = new OpenCLCollimatorPhysics(oclbase);
oclgreens = new OpenCLGreensFunction(oclbase);
#endif
#ifdef DKS_MIC
......@@ -187,6 +189,7 @@ DKSBase::~DKSBase() {
delete oclchi;
delete oclcol;
delete oclbase;
delete oclgreens;
#endif
......@@ -613,6 +616,9 @@ int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int
if (apiCuda()) {
return CUDA_SAFECALL(cgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
hz_m0, hz_m1, hz_m2, streamId) );
} else if (apiOpenCL()) {
return OPENCL_SAFECALL(oclgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
hz_m0, hz_m1, hz_m2) );
} else if (apiOpenMP()) {
//BENI:
return MIC_SAFECALL(micgreens->greensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
......@@ -627,6 +633,8 @@ int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
if (apiCuda())
return CUDA_SAFECALL(cgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
else if (apiOpenCL())
return OPENCL_SAFECALL(oclgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
else if (apiOpenMP())
return MIC_SAFECALL(micgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
......@@ -638,6 +646,8 @@ int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId
if (apiCuda())
return CUDA_SAFECALL(cgreens->mirrorRhoField(mem_ptr, I, J, K, streamId));
else if (apiOpenCL())
return OPENCL_SAFECALL(oclgreens->mirrorRhoField(mem_ptr, I, J, K, streamId));
else if (apiOpenMP())
return MIC_SAFECALL(micgreens->mirrorRhoField(mem_ptr, I, J, K));
......@@ -649,6 +659,8 @@ int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size,
if (apiCuda())
return CUDA_SAFECALL(cgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
else if (apiOpenCL())
return OPENCL_SAFECALL(oclgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size));
else if (apiOpenMP())
return MIC_SAFECALL(micgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size));
......
......@@ -32,6 +32,7 @@
#include "OpenCL/OpenCLFFT.h"
#include "OpenCL/OpenCLChiSquare.h"
#include "OpenCL/OpenCLCollimatorPhysics.h"
#include "OpenCL/OpenCLGreensFunction.h"
#endif
#ifdef DKS_CUDA
......@@ -76,6 +77,7 @@ private:
OpenCLFFT *oclfft;
OpenCLChiSquare *oclchi;
OpenCLCollimatorPhysics *oclcol;
OpenCLGreensFunction *oclgreens;
#endif
#ifdef DKS_CUDA
......
......@@ -4,6 +4,7 @@ SET (_SRCS
OpenCLChiSquare.cpp
OpenCLCollimatorPhysics.cpp
OpenCLChiSquareRuntime.cpp
OpenCLGreensFunction.cpp
)
SET (_HDRS
......@@ -12,6 +13,7 @@ SET (_HDRS
OpenCLChiSquare.h
OpenCLCollimatorPhysics.h
OpenCLChiSquareRuntime.h
OpenCLGreensFunction.h
)
#INCLUDE_DIRECTORIES (
......@@ -25,6 +27,7 @@ SET (_KERNELS
OpenCLKernels/OpenCLTranspose.cl
OpenCLKernels/OpenCLCollimatorPhysics.cl
OpenCLKernels/OpenCLChiSquareRuntime.cl
OpenCLKernels/OpenCLGreensFunction.cl
)
ADD_SOURCES (${_SRCS})
......
......@@ -428,7 +428,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
int ierr;
//create program from kernel
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr);
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source,
NULL, &ierr);
if (ierr != CL_SUCCESS) {
DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
return DKS_ERROR;
......@@ -438,7 +439,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
/*
check if compileng kernel source succeded, if failed return error code
check if compiling kernel source succeeded, if failed return error code
if in debug mode get compilation info and print program build log which
will give indication what made the compilation fail
*/
......@@ -447,7 +448,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
//get build status
cl_build_status status;
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status), &status, NULL);
//get log size
size_t log_size;
......
......@@ -30,24 +30,11 @@
#include <CL/cl_ext.h>
#endif
#include "clRNG/clRNG.h"
#include "clRNG/mrg31k3p.h"
#include "../DKSDefinitions.h"
/* struct for random number state */
typedef struct {
double s10;
double s11;
double s12;
double s20;
double s21;
double s22;
double z;
bool gen;
} RNDState;
class OpenCLBase {
private:
......@@ -195,7 +182,7 @@ public:
Return: return pointer to memory
*/
cl_mem ocl_allocateMemory(size_t size, int &ierr);
/*
Name: allocateMemory
Info: allocate memory on device
......@@ -203,6 +190,20 @@ public:
*/
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
/** Fill OpenCL memory buffer with a value.
 * Enqueues a clEnqueueFillBuffer on m_command_queue that sets elements of the
 * device buffer to value (pass a zero value to clear the buffer). No event is
 * waited on or returned.
 *
 * mem_ptr - device buffer to write to
 * size    - number of elements of type T to set
 * value   - value every element is set to
 * offset  - index of the first element to set, counted in elements of T
 *           (same unit as size); it is converted to bytes internally
 *
 * Return: DKS_SUCCESS when the fill was enqueued, DKS_ERROR when offset is
 * negative or clEnqueueFillBuffer does not return CL_SUCCESS
 */
template <typename T>
int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {

  // a negative offset would wrap around to a huge unsigned byte offset
  if (offset < 0)
    return DKS_ERROR;

  // clEnqueueFillBuffer takes offset and size in bytes and requires both to be
  // multiples of the pattern size, so scale the element counts by sizeof(T)
  const size_t pattern_bytes = sizeof(T);
  const size_t offset_bytes = pattern_bytes * (size_t)offset;
  const size_t fill_bytes = pattern_bytes * size;

  cl_int ierr = clEnqueueFillBuffer(m_command_queue, mem_ptr, &value, pattern_bytes,
                                    offset_bytes, fill_bytes, 0, nullptr, nullptr);
  if (ierr != CL_SUCCESS)
    return DKS_ERROR;

  return DKS_SUCCESS;
}
/*
Name: writeData
Info: write data to device memory (needs ptr to mem object)
......
......@@ -117,15 +117,13 @@ int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool for
*/
int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
std::cout << "execute RCFFT" << std::endl;
int dkserr = DKS_SUCCESS;
cl_int ierr;
cl_mem real_in = (cl_mem)real_ptr;
cl_mem comp_out = (cl_mem)comp_ptr;
ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
0, NULL, NULL, &real_in, &comp_out, NULL);
0, NULL, NULL, &real_in, &comp_out, NULL);
if (ierr != OCL_SUCCESS) {
dkserr = DKS_ERROR;
......@@ -144,8 +142,6 @@ int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3],
*/
int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
std::cout << "execute CRFFT" << std::endl;
int dkserr = DKS_SUCCESS;
cl_int ierr;
cl_mem real_in = (cl_mem)real_ptr;
......@@ -214,7 +210,13 @@ int OpenCLFFT::setupFFT(int ndim, int N[3]) {
cl_int err;
clfftDim dim = CLFFT_3D;
clfftDim dim;
if (ndim == 1)
dim = CLFFT_1D;
else if (ndim == 2)
dim = CLFFT_2D;
else
dim = CLFFT_3D;
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
/* Create 3D fft plan*/
......@@ -244,9 +246,20 @@ int OpenCLFFT::setupFFT(int ndim, int N[3]) {
int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
cl_int err;
clfftDim dim = CLFFT_3D;
clfftDim dim;
if (ndim == 1)
dim = CLFFT_1D;
else if (ndim == 2)
dim = CLFFT_2D;
else
dim = CLFFT_3D;
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
size_t half = (size_t)N[0] / 2 + 1;
size_t clInStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
size_t clOutStride[3] = {1, half, half * N[1]};
/* Create fft plan with the dimension selected from ndim */
err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);
......@@ -254,6 +267,8 @@ int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
err = clfftSetPlanInStride(planHandleD2Z, dim, clInStride);
err = clfftSetPlanOutStride(planHandleD2Z, dim, clOutStride);
/* Bake the plan */
err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
......@@ -269,9 +284,20 @@ int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
cl_int err;
clfftDim dim = CLFFT_3D;
clfftDim dim;
if (ndim == 1)
dim = CLFFT_1D;
else if (ndim == 2)
dim = CLFFT_2D;
else
dim = CLFFT_3D;
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
size_t half = (size_t)N[0] / 2 + 1;
size_t clInStride[3] = {1, half, half * N[1]};
size_t clOutStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
/* Create fft plan with the dimension selected from ndim */
err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);
......@@ -279,6 +305,8 @@ int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
err = clfftSetPlanInStride(planHandleZ2D, dim, clInStride);
err = clfftSetPlanOutStride(planHandleZ2D, dim, clOutStride);
/* Bake the plan */
err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);
......
#include "OpenCLGreensFunction.h"
#define GREENS_KERNEL "OpenCLKernels/OpenCLGreensFunction.cl"
#define GREENS_KERNEL "OpenCL/OpenCLKernels/OpenCLGreensFunction.cl"
OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) {
m_base = base;
......@@ -29,6 +29,8 @@ int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, in
double hr_m0, double hr_m1, double hr_m2,
int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
......@@ -42,26 +44,28 @@ int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, in
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_oclbase->ocl_createKernel("kernelTmpgreen");
ierr = m_base->ocl_createKernel("kernelTmpgreen");
//set kernel parameters
m_base->setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
m_base->setKernelArg(1, sizeof(double), &hr_m0);
m_base->setKernelArg(2, sizeof(double), &hr_m1);
m_base->setKernelArg(3, sizeof(double), &hr_m2);
m_base->setKernelArg(4, sizeof(int), &I);
m_base->setKernelArg(5, sizeof(int), &J);
m_base->setKernelArg(6, sizeof(int), &K);
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
m_base->ocl_setKernelArg(1, sizeof(double), &hr_m0);
m_base->ocl_setKernelArg(2, sizeof(double), &hr_m1);
m_base->ocl_setKernelArg(3, sizeof(double), &hr_m2);
m_base->ocl_setKernelArg(4, sizeof(int), &I);
m_base->ocl_setKernelArg(5, sizeof(int), &J);
m_base->ocl_setKernelArg(6, sizeof(int), &K);
//execute kernel
ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
}
int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
int streamId)
int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J,
int K, int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
......@@ -70,8 +74,6 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
int NI = 2*(I - 1);
int NJ = 2*(J - 1);
int NK = 2*(K - 1);
//set the work item size
size_t work_size = 128;
......@@ -80,20 +82,22 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_oclbase->ocl_createKernel("kernelIntegration");
ierr = m_base->ocl_createKernel("kernelIntegration");
//set kernel parameters
m_base->setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
m_base->setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
m_base->setKernelArg(2, sizeof(int), &I);
m_base->setKernelArg(3, sizeof(int), &J);
m_base->setKernelArg(4, sizeof(int), &K);
m_base->setKernelArg(5, sizeof(int), &NI);
m_base->setKernelArg(6, sizeof(int), &NJ);
m_base->setKernelArg(7, sizeof(int), &NK);
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
m_base->ocl_setKernelArg(2, sizeof(int), &NI);
m_base->ocl_setKernelArg(3, sizeof(int), &NJ);
m_base->ocl_setKernelArg(4, sizeof(int), &I);
m_base->ocl_setKernelArg(5, sizeof(int), &J);
m_base->ocl_setKernelArg(6, sizeof(int), &K);
//execute kernel
ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
double zero = 0.0;
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
m_base->ocl_fillMemory(rho2_ptr, sizerho, zero, 0);
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
......@@ -102,6 +106,8 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
......@@ -114,6 +120,8 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
int J2 = 2*J;
int K2 = 2*K;
int rhosize = ( (I - 1) * 2 ) * ( (J - 1) * 2 ) * ( (K - 1) * 2 );
//set the work item size
size_t work_size = 128;
size_t work_items = NI * NJ * NK;
......@@ -121,19 +129,20 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_oclbase->ocl_createKernel("kernelMirroredRhoField");
ierr = m_base->ocl_createKernel("kernelMirroredRhoField");
//set kernel parameters
m_base->setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
m_base->setKernelArg(1, sizeof(int), &I2);
m_base->setKernelArg(2, sizeof(int), &J2);
m_base->setKernelArg(3, sizeof(int), &K2);
m_base->setKernelArg(4, sizeof(int), &NI);
m_base->setKernelArg(5, sizeof(int), &NJ);
m_base->setKernelArg(6, sizeof(int), &NK);
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
m_base->ocl_setKernelArg(1, sizeof(int), &I2);
m_base->ocl_setKernelArg(2, sizeof(int), &J2);
m_base->ocl_setKernelArg(3, sizeof(int), &K2);
m_base->ocl_setKernelArg(4, sizeof(int), &NI);
m_base->ocl_setKernelArg(5, sizeof(int), &NJ);
m_base->ocl_setKernelArg(6, sizeof(int), &NK);
m_base->ocl_setKernelArg(7, sizeof(int), &rhosize);
//execute kernel
ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
}
......@@ -141,4 +150,32 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
/** Run the OpenCL kernel "multiplyComplexFields" on two device buffers.
 * Presumably an element-wise complex multiplication of the two fields; the
 * kernel source is not part of this function, verify against the .cl file.
 *
 * ptr1     - cl_mem handle (passed as void*), bound to kernel argument 0
 * ptr2     - cl_mem handle (passed as void*), bound to kernel argument 1
 * size     - bound to kernel argument 2; also the number of work items
 *            requested before rounding up to a multiple of the work group size
 * streamId - not used by this function
 *
 * Return: the error code of ocl_createKernel if creating the kernel failed,
 * otherwise the code returned by ocl_executeKernel
 */
int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId)
{

  int ierr = DKS_SUCCESS;

  //compile opencl program from source
  //NOTE(review): the result of buildProgram() is not checked here - confirm
  //whether it reports build failures through its return value
  buildProgram();

  //cast the input data ptr to cl_mem
  cl_mem mem_ptr1 = (cl_mem) ptr1;
  cl_mem mem_ptr2 = (cl_mem) ptr2;

  //set the work item size, global size is rounded up to a multiple of it
  size_t work_size = 128;
  size_t work_items = size;
  if (work_items % work_size > 0)
    work_items = (work_items / work_size + 1) * work_size;

  //create kernel, stop here on failure so the error code is not overwritten
  //by the execute call and no kernel is launched without a valid kernel object
  ierr = m_base->ocl_createKernel("multiplyComplexFields");
  if (ierr != DKS_SUCCESS)
    return ierr;

  //set kernel parameters
  m_base->ocl_setKernelArg(0, sizeof(cl_mem), &mem_ptr1);
  m_base->ocl_setKernelArg(1, sizeof(cl_mem), &mem_ptr2);
  m_base->ocl_setKernelArg(2, sizeof(int), &size);

  //execute kernel
  ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);

  return ierr;
}
......@@ -60,4 +60,4 @@ public:
};
#endif H_OPENCL_GREENSFUNCTION
#endif
......@@ -81,28 +81,29 @@ __kernel void kernelIntegration(__global double *rho2_m, __global double *tmpgre
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
}
}
/** mirror rho-field */
__kernel void mirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
__kernel void kernelMirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
rho2_m[0] = rho2_m[NI*NJ];
}
__kernel void mirroredRhoField(__global double *rho2_m,
int NI, int NJ, int NK,
int NI_tmp, int NJ_tmp, int NK_tmp) {
__kernel void kernelMirroredRhoField(__global double *rho2_m,
int NI, int NJ, int NK,
int NI_tmp, int NJ_tmp, int NK_tmp,
int size)
{
int tid = get_local_id(0);
int id = get_global_id(0);
if (id == 0)
rho2_m[0] = rho2_m[NI * NJ];
rho2_m[0] = rho2_m[NI * NJ];
barrier(CLK_GLOBAL_MEM_FENCE);
......@@ -127,27 +128,29 @@ __kernel void mirroredRhoField(__global double *rho2_m,
id7 = rk * NI * NJ + rj * NI + i;
id8 = rk * NI * NJ + rj * NI + ri;
double data = 0.0;
if (id1 < size)
data = rho2_m[id1];
double data = rho2_m[id1];
if (i != 0) rho2_m[id2] = data;
if (i != 0 && id2 < size) rho2_m[id2] = data;
if (j != 0) rho2_m[id3] = data;
if (j != 0 && id3 < size) rho2_m[id3] = data;
if (i != 0 && j != 0) rho2_m[id4] = data;
if (i != 0 && j != 0 && id4 < size) rho2_m[id4] = data;
if (k != 0) rho2_m[id5] = data;
if (k != 0 && id5 < size) rho2_m[id5] = data;