00001
00022 #include "gpu_forces.cuh"
00023
00024
00025
00026
__device__ void gpuAddLJForce(FLOAT_ARRAY_TYPE *dForce, FLOAT_ARRAY_TYPE *dA, FLOAT_ARRAY_TYPE *dB, float *boxsize)
{
    // Accumulate onto *dForce the Lennard-Jones pair force exerted on particle
    // A (at *dA) by particle B (at *dB), using the minimum-image separation d
    // returned by gpuDistPBC for a periodic box of edge *boxsize.
    // Reduced LJ units: fd = 48 * r^-8 * (r^-6 - 0.5), force = fd * d.
    // No cutoff and no r == 0 guard: callers must not pass coincident particles.
    float invR2, invR6, fd;
    FLOAT_ARRAY_TYPE d;
    gpuDistPBC( &d, dA, dB, boxsize);
    invR2 = 1.0f / ( d.x * d.x + d.y * d.y + d.z * d.z );
    // Exact integer cube by multiplication; powf(invR2, 3.0f) is a needless
    // transcendental call in the innermost loop of every force kernel.
    invR6 = invR2 * invR2 * invR2;
    fd = 48.0f * invR2 * invR6 * ( invR6 - 0.5f );
    (*dForce).x += fd * d.x;
    (*dForce).y += fd * d.y;
    (*dForce).z += fd * d.z;
}
00039
__device__ void gpuAddLJForceCut(FLOAT_ARRAY_TYPE *dForce, FLOAT_ARRAY_TYPE *dA, FLOAT_ARRAY_TYPE *dB, float *boxsize, float *rCutSq)
{
    // Same as gpuAddLJForce, but the interaction is skipped entirely when the
    // squared minimum-image distance is >= *rCutSq (truncated LJ potential).
    float invR2, invR6, fd;
    FLOAT_ARRAY_TYPE d;
    gpuDistPBC( &d, dA, dB, boxsize);
    // invR2 first holds r^2; only inverted once the pair passes the cutoff.
    invR2 = d.x * d.x + d.y * d.y + d.z * d.z;
    if ( (*rCutSq) > invR2 )
    {
        invR2 = 1.0f / invR2;
        // Exact integer cube by multiplication instead of powf(invR2, 3.0f):
        // avoids a transcendental call on the hot path.
        invR6 = invR2 * invR2 * invR2;
        fd = 48.0f * invR2 * invR6 * ( invR6 - 0.5f );
        (*dForce).x += fd * d.x;
        (*dForce).y += fd * d.y;
        (*dForce).z += fd * d.z;
    }
}
00056
00057
__device__ void gpuAddHarmonicBondForce(FLOAT_ARRAY_TYPE *dForce, FLOAT_ARRAY_TYPE *dA, FLOAT_ARRAY_TYPE *dB, float *dK, float *boxsize)
{
    // Accumulate onto *dForce the harmonic (zero rest length) bond force
    // F = -k * d acting on particle A, where d is the minimum-image
    // separation of A and B in a periodic box of edge *boxsize.
    FLOAT_ARRAY_TYPE sep;
    gpuDistPBC(&sep, dA, dB, boxsize);
    const float k = *dK;
    dForce->x -= k * sep.x;
    dForce->y -= k * sep.y;
    dForce->z -= k * sep.z;
}
00066
00067
__global__ void gpuLJForces(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize)
{
    // Brute-force O(N^2) all-pairs LJ forces, one thread per particle.
    // Launch with any 1D grid covering N threads; excess threads exit early.
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= N)
        return;

    // Own position cached in a register; partial force accumulated locally
    // and folded into dForce[idx] once at the end.
    FLOAT_ARRAY_TYPE myPos = dPos[idx];
    FLOAT_ARRAY_TYPE acc;
    acc.x = 0.0f;
    acc.y = 0.0f;
    acc.z = 0.0f;

    for (int j = 0; j < N; ++j)
    {
        if (j == idx)
            continue;   // no self-interaction
        gpuAddLJForce(&acc, &myPos, &dPos[j], &boxsize);
    }
    gpuCumulateVec(&dForce[idx], &acc);
}
00089
00090
// All-pairs LJ forces with the calling thread's own block tile staged in
// shared memory; positions of other blocks are read straight from global
// memory. One thread per particle.
//
// Launch requirements:
//   - dynamic shared memory: blockDim.x * sizeof(FLOAT_ARRAY_TYPE)
//   - NOTE(review): there is no idx < N guard, and __syncthreads() is
//     executed unconditionally, so N MUST be an exact multiple of
//     blockDim.x; a partial last block would read dPos out of bounds.
//     Confirm callers enforce this.
__global__ void gpuLJForcesSmem(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize)
{

    int tid = threadIdx.x;
    int offset = blockDim.x * blockIdx.x;        // first particle of this block's tile
    int rowOffset = offset + blockDim.x;         // first particle after this tile
    int idx = offset + tid;

    // Stage this block's tile of positions.
    extern __shared__ FLOAT_ARRAY_TYPE sPos[];
    sPos[tid] = dPos[idx];


    __syncthreads();


    FLOAT_ARRAY_TYPE f;

    f.x = 0.0f;
    f.y = 0.0f;
    f.z = 0.0f;

    // Particles before this tile: partners read from global memory.
    for (int i = 0; i < offset; ++i)
    {
        gpuAddLJForce( &f, &sPos[tid], &dPos[i], &boxsize);
    }

    // Particles inside this tile: partners read from shared memory,
    // skipping the self-interaction.
    for (int i = 0; i < blockDim.x; ++i)
    {
        if ( i != tid)
        {
            gpuAddLJForce( &f, &sPos[tid], &sPos[i], &boxsize);
        }
    }

    // Particles after this tile: partners read from global memory again.
    for (int i = rowOffset; i < N; ++i)
    {
        gpuAddLJForce( &f, &sPos[tid], &dPos[i], &boxsize);
    }
    gpuCumulateVec(&dForce[idx], &f);
}
00131
00132
// All-pairs LJ forces with BOTH sides of every interaction staged in shared
// memory: the first half of sPos holds this block's own tile for the whole
// kernel, the second half is a streaming buffer that neighbour-block tiles
// are loaded into one at a time. One thread per particle.
//
// Launch requirements:
//   - dynamic shared memory: 2 * blockDim.x * sizeof(FLOAT_ARRAY_TYPE)
//   - NOTE(review): no idx < N guard, and every thread must reach each
//     __syncthreads(), so N MUST be an exact multiple of blockDim.x and the
//     grid must exactly cover N. Confirm callers enforce this.
__global__ void gpuLJForcesSmemSmem(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize)
{

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int bOffset;                                 // first particle of the streamed tile
    int bdim = blockDim.x;
    int offset = blockDim.x * blockIdx.x;        // first particle of this block's tile
    int idx = offset + tid;

    // First half of sPos: this block's resident tile.
    extern __shared__ FLOAT_ARRAY_TYPE sPos[];
    sPos[tid] = dPos[idx];


    __syncthreads();


    FLOAT_ARRAY_TYPE f;
    f.x = 0.0f;
    f.y = 0.0f;
    f.z = 0.0f;

    // Tiles of all blocks BEFORE this one, streamed through the second half
    // of sPos. Barriers bracket each tile: one after the load so every
    // thread sees the data, one after the compute so no thread overwrites a
    // tile a neighbour is still reading.
    for (int j = 0; j < bid; ++j)
    {
        bOffset = j * bdim;

        sPos[blockDim.x+tid] = dPos[bOffset+tid];
        __syncthreads();

        for (int i = 0; i < bdim; ++i)
        {
            gpuAddLJForce( &f, &sPos[tid], &sPos[bdim+i], &boxsize);
        }
        __syncthreads();
    }


    // This block's own tile (resident first half), skipping self-interaction.
    for (int i = 0; i < blockDim.x; ++i)
    {
        if ( i != tid )
        {
            gpuAddLJForce( &f, &sPos[tid], &sPos[i], &boxsize);
        }
    }
    __syncthreads();


    // Tiles of all blocks AFTER this one, same streaming scheme.
    for (int j = bid + 1; j < gridDim.x; ++j)
    {
        bOffset = j * bdim;

        sPos[bdim+tid] = dPos[bOffset+tid];
        __syncthreads();
        for (int i = 0; i < bdim; ++i)
        {
            gpuAddLJForce( &f, &sPos[tid], &sPos[bdim+i], &boxsize);
        }
        __syncthreads();
    }
    gpuCumulateVec(&dForce[idx], &f);
}
00194
00195
__global__ void gpuLJForcesCut(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize, float rCutSq)
{
    // Brute-force O(N^2) LJ forces with a squared-distance cutoff rCutSq,
    // one thread per particle. Excess threads exit via the bounds check.

    int tid = threadIdx.x;
    int idx = blockDim.x * blockIdx.x + tid;
    if (idx < N)
    {
        // Cache this thread's position in a register instead of re-reading
        // dPos[idx] from global memory on every pair (matches the pattern
        // already used by gpuLJForcesCutVar1).
        FLOAT_ARRAY_TYPE tPos = dPos[idx];
        FLOAT_ARRAY_TYPE f;
        f.x = 0.0f;
        f.y = 0.0f;
        f.z = 0.0f;
        for (int i = 0; i < N; ++i)
        {
            if ( i != idx)   // no self-interaction
            {
                gpuAddLJForceCut( &f, &tPos, &dPos[i], &boxsize, &rCutSq);
            }
        }
        gpuCumulateVec(&dForce[idx], &f);
    }
}
00217
00218
__global__ void gpuLJForcesCutVar1(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize, float rCutSq)
{
    // Brute-force O(N^2) LJ forces with squared-distance cutoff rCutSq.
    // One thread per particle; the thread's own position is cached in a
    // register before the pair loop.
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= N)
        return;

    FLOAT_ARRAY_TYPE myPos = dPos[idx];
    FLOAT_ARRAY_TYPE acc;
    acc.x = 0.0f;
    acc.y = 0.0f;
    acc.z = 0.0f;

    for (int j = 0; j < N; ++j)
    {
        if (j == idx)
            continue;   // skip self-interaction
        gpuAddLJForceCut(&acc, &myPos, &dPos[j], &boxsize, &rCutSq);
    }
    gpuCumulateVec(&dForce[idx], &acc);
}
00241
00242
__global__ void gpuLJForcesCutRSmem(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize, float rCutSq)
{
    // Cutoff LJ forces with this block's tile staged in shared memory; other
    // particles are read from global memory. One thread per particle.
    // Dynamic shared memory: blockDim.x * sizeof(FLOAT_ARRAY_TYPE).
    //
    // Fixes over the previous version: __syncthreads() was executed inside
    // the divergent `if (idx < N)` branch (undefined behaviour / hang when N
    // is not a multiple of blockDim.x), and the in-tile loop read sPos slots
    // that inactive threads never wrote. The barrier is now block-uniform and
    // the tile loop is clamped to the valid particle count.

    int tid = threadIdx.x;
    int offset = blockDim.x * blockIdx.x;
    int rowOffset = offset + blockDim.x;
    int idx = offset + tid;

    extern __shared__ FLOAT_ARRAY_TYPE sPos[];

    // Number of valid particles cached by this block (handles a partial
    // final tile).
    int tileN = N - offset;
    if (tileN > (int)blockDim.x)
        tileN = blockDim.x;

    // Guarded load, then a barrier reached by EVERY thread of the block.
    if (idx < N)
        sPos[tid] = dPos[idx];
    __syncthreads();

    if (idx < N)
    {
        FLOAT_ARRAY_TYPE f;
        f.x = 0.0f;
        f.y = 0.0f;
        f.z = 0.0f;

        // Particles before this tile (global memory).
        for (int i = 0; i < offset; ++i)
        {
            gpuAddLJForceCut( &f, &sPos[tid], &dPos[i], &boxsize, &rCutSq);
        }

        // Particles inside this tile (shared memory); only the tileN slots
        // actually written are visited, and self-interaction is skipped.
        for (int i = 0; i < tileN; ++i)
        {
            if ( i != tid)
            {
                gpuAddLJForceCut( &f, &sPos[tid], &sPos[i], &boxsize, &rCutSq);
            }
        }

        // Particles after this tile (global memory).
        for (int i = rowOffset; i < N; ++i)
        {
            gpuAddLJForceCut( &f, &sPos[tid], &dPos[i], &boxsize, &rCutSq);
        }
        gpuCumulateVec(&dForce[idx], &f);
    }
}
00286
00287
00288
__global__ void gpuLJForcesCutSmem(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize, float rCutSq)
{
    // Cutoff LJ forces with ALL partner positions streamed through shared
    // memory, one tile (blockDim.x particles) per iteration. One thread per
    // particle. Dynamic shared memory: blockDim.x * sizeof(FLOAT_ARRAY_TYPE).
    //
    // Fixes over the previous version: the tile loop (and its barriers) ran
    // inside the divergent `if (idx < N)` branch — undefined behaviour /
    // hang when N is not a multiple of blockDim.x — and the tile load
    // dPos[bOffset+tid] could read past the end of dPos on the last tile.
    // Barriers are now block-uniform and loads are bounds-guarded.

    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;
    extern __shared__ FLOAT_ARRAY_TYPE sPos[];

    // Own position in a register (only meaningful for active threads).
    FLOAT_ARRAY_TYPE tPos;
    if (idx < N)
        tPos = dPos[idx];

    FLOAT_ARRAY_TYPE f;
    f.x = 0.0f;
    f.y = 0.0f;
    f.z = 0.0f;

    for (int j = 0; j < gridDim.x; ++j)
    {
        int bOffset = j * blockDim.x;
        // Cooperative, bounds-guarded tile load.
        if (bOffset + tid < N)
            sPos[tid] = dPos[bOffset + tid];
        __syncthreads();   // uniform: every thread reaches this

        if (idx < N)
        {
            for (int i = 0; i < blockDim.x; ++i)
            {
                int id = bOffset + i;
                if (id != idx && id < N)   // skip self and tail padding
                {
                    gpuAddLJForceCut( &f, &tPos, &sPos[i], &boxsize, &rCutSq);
                }
            }
        }
        __syncthreads();   // uniform: protect sPos before the next load
    }

    if (idx < N)
        gpuCumulateVec(&dForce[idx], &f);
}
00320
00321
00322
__global__ void gpuLJForcesCutSmemSmem(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize, float rCutSq)
{
    // Cutoff LJ forces with both sides of every interaction in shared memory:
    // the first half of sPos permanently holds this block's tile, the second
    // half is a streaming buffer for neighbour-block tiles. One thread per
    // particle. Dynamic shared memory: 2 * blockDim.x * sizeof(FLOAT_ARRAY_TYPE).
    //
    // Fixes over the previous version: every __syncthreads() ran inside the
    // divergent `if (idx < N)` branch — undefined behaviour / hang when N is
    // not a multiple of blockDim.x — and tile loads could read past the end
    // of dPos. The tile loop below branches only on j == bid, which is
    // uniform across the block, so all barriers are reached by every thread;
    // loads are bounds-guarded and tile lengths clamped. The summation order
    // (lower tiles, own tile, upper tiles) matches the original.

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int bdim = blockDim.x;
    int offset = bdim * bid;
    int idx = offset + tid;

    extern __shared__ FLOAT_ARRAY_TYPE sPos[];

    // Resident tile: guarded load, block-uniform barrier.
    if (idx < N)
        sPos[tid] = dPos[idx];
    __syncthreads();

    FLOAT_ARRAY_TYPE f;
    f.x = 0.0f;
    f.y = 0.0f;
    f.z = 0.0f;

    // Valid particle count of this block's own tile.
    int selfTileN = N - offset;
    if (selfTileN > bdim)
        selfTileN = bdim;

    for (int j = 0; j < gridDim.x; ++j)
    {
        if (j == bid)
        {
            // Own tile (first half of sPos); never overwritten, so no
            // barriers are needed here. Skip the self-interaction.
            if (idx < N)
            {
                for (int i = 0; i < selfTileN; ++i)
                {
                    if (i != tid)
                    {
                        gpuAddLJForceCut( &f, &sPos[tid], &sPos[i], &boxsize, &rCutSq);
                    }
                }
            }
        }
        else
        {
            int bOffset = j * bdim;
            int tileN = N - bOffset;
            if (tileN > bdim)
                tileN = bdim;

            // Stream the neighbour tile into the second half of sPos.
            if (bOffset + tid < N)
                sPos[bdim + tid] = dPos[bOffset + tid];
            __syncthreads();

            if (idx < N)
            {
                for (int i = 0; i < tileN; ++i)
                {
                    gpuAddLJForceCut( &f, &sPos[tid], &sPos[bdim + i], &boxsize, &rCutSq);
                }
            }
            __syncthreads();   // protect the buffer before the next load
        }
    }

    if (idx < N)
        gpuCumulateVec(&dForce[idx], &f);
}
00387
__global__ void gpuResetForces(FLOAT_ARRAY_TYPE *dForces, int N)
{
    // Zero the x/y/z components of the first N force accumulators.
    // One thread per element; excess threads exit early.
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N)
        return;
    dForces[i].x = 0.0f;
    dForces[i].y = 0.0f;
    dForces[i].z = 0.0f;
}
00398
00399
__global__ void gpuLJForcesVerletList(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, int *dVlists, size_t vlists_pitch, int *dVcount, float boxsize, float rCutSq)
{
    // Cutoff LJ forces over precomputed Verlet neighbour lists.
    // dVlists is a pitched 2D array: row i holds, for every particle, the
    // index of its i-th neighbour; dVcount[idx] is the number of neighbours
    // of particle idx. One thread per particle.
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= N)
        return;

    FLOAT_ARRAY_TYPE myPos = dPos[idx];
    FLOAT_ARRAY_TYPE acc;
    acc.x = 0.0f;
    acc.y = 0.0f;
    acc.z = 0.0f;

    // dVcount is never written here, so the trip count can be hoisted.
    const int nNeighbours = dVcount[idx];
    for (int i = 0; i < nNeighbours; ++i)
    {
        // Pitched row access: advance by i * pitch bytes from the base.
        int *row = (int*)((char*)dVlists + i * vlists_pitch);
        gpuAddLJForceCut(&acc, &myPos, &dPos[row[idx]], &boxsize, &rCutSq);
    }
    gpuCumulateVec(&dForce[idx], &acc);
}
00420
00421
__global__ void gpuHarmonicBondForces(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int *cnn, size_t pitch, int N, int numNN, float k, float boxsize)
{
    // Harmonic bond forces from a pitched connectivity table: row c of cnn
    // holds, for every particle, the index of its c-th bonded partner, or -1
    // for "no partner". One thread per particle; up to numNN bonds each.
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= N)
        return;

    FLOAT_ARRAY_TYPE myPos = dPos[idx];
    for (int c = 0; c < numNN; ++c)
    {
        // Pitched row access: advance by c * pitch bytes from the base.
        int *row = (int*)((char*)cnn + c * pitch);
        const int partner = row[idx];
        if (partner == -1)
            continue;   // unused connectivity slot
        gpuAddHarmonicBondForce(&dForce[idx], &myPos, &dPos[partner], &k, &boxsize);
    }
}
00441
00442
__global__ void gpuHarmonicBondForcesSmem(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int *cnn, size_t pitch, int N, int numNN, float k, float boxsize)
{
    // Shared-memory variant of gpuHarmonicBondForces: each thread stages its
    // own position in sPos[tid] and passes that slot to the force helper.
    // Each thread reads back only the slot it wrote, so no barrier is needed
    // and the early return below is safe.
    // Dynamic shared memory: blockDim.x * sizeof(FLOAT_ARRAY_TYPE).
    extern __shared__ FLOAT_ARRAY_TYPE sPos[];
    const int tid = threadIdx.x;
    const int idx = blockDim.x * blockIdx.x + tid;
    if (idx >= N)
        return;

    sPos[tid] = dPos[idx];
    for (int c = 0; c < numNN; ++c)
    {
        // Pitched row access: advance by c * pitch bytes from the base.
        int *row = (int*)((char*)cnn + c * pitch);
        const int partner = row[idx];
        if (partner != -1)
        {
            gpuAddHarmonicBondForce(&dForce[idx], &sPos[tid], &dPos[partner], &k, &boxsize);
        }
    }
}