00001
00022 #include "gpu_forces.cuh"
00023
00024
00025
00026
__device__ void gpuAddLJForce(FLOAT_ARRAY_TYPE *dForce, FLOAT_ARRAY_TYPE *dA, FLOAT_ARRAY_TYPE *dB, float *boxsize)
{
    // Accumulate onto *dForce the Lennard-Jones pair force exerted on particle
    // A (at *dA) by particle B (at *dB), using the minimum-image separation d
    // returned by gpuDistPBC for a periodic box of edge *boxsize.
    // Reduced LJ units: fd = 48 * r^-8 * (r^-6 - 0.5), force = fd * d.
    // No cutoff and no r == 0 guard: callers must not pass coincident particles.
    float invR2, invR6, fd;
    FLOAT_ARRAY_TYPE d;
    gpuDistPBC( &d, dA, dB, boxsize);
    invR2 = 1.0f / ( d.x * d.x + d.y * d.y + d.z * d.z );
    // Exact integer cube by multiplication; powf(invR2, 3.0f) is a needless
    // transcendental call in the innermost loop of every force kernel.
    invR6 = invR2 * invR2 * invR2;
    fd = 48.0f * invR2 * invR6 * ( invR6 - 0.5f );
    (*dForce).x += fd * d.x;
    (*dForce).y += fd * d.y;
    (*dForce).z += fd * d.z;
}
00039
__device__ void gpuAddLJForceCut(FLOAT_ARRAY_TYPE *dForce, FLOAT_ARRAY_TYPE *dA, FLOAT_ARRAY_TYPE *dB, float *boxsize, float *rCutSq)
{
    // Same as gpuAddLJForce, but the interaction is skipped entirely when the
    // squared minimum-image distance is >= *rCutSq (truncated LJ potential).
    float invR2, invR6, fd;
    FLOAT_ARRAY_TYPE d;
    gpuDistPBC( &d, dA, dB, boxsize);
    // invR2 first holds r^2; only inverted once the pair passes the cutoff.
    invR2 = d.x * d.x + d.y * d.y + d.z * d.z;
    if ( (*rCutSq) > invR2 )
    {
        invR2 = 1.0f / invR2;
        // Exact integer cube by multiplication instead of powf(invR2, 3.0f):
        // avoids a transcendental call on the hot path.
        invR6 = invR2 * invR2 * invR2;
        fd = 48.0f * invR2 * invR6 * ( invR6 - 0.5f );
        (*dForce).x += fd * d.x;
        (*dForce).y += fd * d.y;
        (*dForce).z += fd * d.z;
    }
}
00056
00057
__device__ void gpuAddHarmonicBondForce(FLOAT_ARRAY_TYPE *dForce, FLOAT_ARRAY_TYPE *dA, FLOAT_ARRAY_TYPE *dB, float *dK, float *boxsize)
{
    // Accumulate onto *dForce the harmonic (zero rest length) bond force
    // F = -k * d acting on particle A, where d is the minimum-image
    // separation of A and B in a periodic box of edge *boxsize.
    FLOAT_ARRAY_TYPE sep;
    gpuDistPBC(&sep, dA, dB, boxsize);
    const float k = *dK;
    dForce->x -= k * sep.x;
    dForce->y -= k * sep.y;
    dForce->z -= k * sep.z;
}
00066
00067
__global__ void gpuLJForces(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize)
{
    // Brute-force O(N^2) all-pairs LJ forces, one thread per particle.
    // Launch with any 1D grid covering N threads; excess threads exit early.
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= N)
        return;

    // Own position cached in a register; partial force accumulated locally
    // and folded into dForce[idx] once at the end.
    FLOAT_ARRAY_TYPE myPos = dPos[idx];
    FLOAT_ARRAY_TYPE acc;
    acc.x = 0.0f;
    acc.y = 0.0f;
    acc.z = 0.0f;

    for (int j = 0; j < N; ++j)
    {
        if (j == idx)
            continue;   // no self-interaction
        gpuAddLJForce(&acc, &myPos, &dPos[j], &boxsize);
    }
    gpuCumulateVec(&dForce[idx], &acc);
}
00089
00090
// All-pairs LJ forces with the calling thread's own block tile staged in
// shared memory; positions of other blocks are read straight from global
// memory. One thread per particle.
//
// Launch requirements:
//   - dynamic shared memory: blockDim.x * sizeof(FLOAT_ARRAY_TYPE)
//   - NOTE(review): there is no idx < N guard, and __syncthreads() is
//     executed unconditionally, so N MUST be an exact multiple of
//     blockDim.x; a partial last block would read dPos out of bounds.
//     Confirm callers enforce this.
__global__ void gpuLJForcesSmem(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize)
{

    int tid = threadIdx.x;
    int offset = blockDim.x * blockIdx.x;        // first particle of this block's tile
    int rowOffset = offset + blockDim.x;         // first particle after this tile
    int idx = offset + tid;

    // Stage this block's tile of positions.
    extern __shared__ FLOAT_ARRAY_TYPE sPos[];
    sPos[tid] = dPos[idx];


    __syncthreads();


    FLOAT_ARRAY_TYPE f;

    f.x = 0.0f;
    f.y = 0.0f;
    f.z = 0.0f;

    // Particles before this tile: partners read from global memory.
    for (int i = 0; i < offset; ++i)
    {
        gpuAddLJForce( &f, &sPos[tid], &dPos[i], &boxsize);
    }

    // Particles inside this tile: partners read from shared memory,
    // skipping the self-interaction.
    for (int i = 0; i < blockDim.x; ++i)
    {
        if ( i != tid)
        {
            gpuAddLJForce( &f, &sPos[tid], &sPos[i], &boxsize);
        }
    }

    // Particles after this tile: partners read from global memory again.
    for (int i = rowOffset; i < N; ++i)
    {
        gpuAddLJForce( &f, &sPos[tid], &dPos[i], &boxsize);
    }
    gpuCumulateVec(&dForce[idx], &f);
}
00131
00132
// All-pairs LJ forces with BOTH sides of every interaction staged in shared
// memory: the first half of sPos holds this block's own tile for the whole
// kernel, the second half is a streaming buffer that neighbour-block tiles
// are loaded into one at a time. One thread per particle.
//
// Launch requirements:
//   - dynamic shared memory: 2 * blockDim.x * sizeof(FLOAT_ARRAY_TYPE)
//   - NOTE(review): no idx < N guard, and every thread must reach each
//     __syncthreads(), so N MUST be an exact multiple of blockDim.x and the
//     grid must exactly cover N. Confirm callers enforce this.
__global__ void gpuLJForcesSmemSmem(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize)
{

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int bOffset;                                 // first particle of the streamed tile
    int bdim = blockDim.x;
    int offset = blockDim.x * blockIdx.x;        // first particle of this block's tile
    int idx = offset + tid;

    // First half of sPos: this block's resident tile.
    extern __shared__ FLOAT_ARRAY_TYPE sPos[];
    sPos[tid] = dPos[idx];


    __syncthreads();


    FLOAT_ARRAY_TYPE f;
    f.x = 0.0f;
    f.y = 0.0f;
    f.z = 0.0f;

    // Tiles of all blocks BEFORE this one, streamed through the second half
    // of sPos. Barriers bracket each tile: one after the load so every
    // thread sees the data, one after the compute so no thread overwrites a
    // tile a neighbour is still reading.
    for (int j = 0; j < bid; ++j)
    {
        bOffset = j * bdim;

        sPos[blockDim.x+tid] = dPos[bOffset+tid];
        __syncthreads();

        for (int i = 0; i < bdim; ++i)
        {
            gpuAddLJForce( &f, &sPos[tid], &sPos[bdim+i], &boxsize);
        }
        __syncthreads();
    }


    // This block's own tile (resident first half), skipping self-interaction.
    for (int i = 0; i < blockDim.x; ++i)
    {
        if ( i != tid )
        {
            gpuAddLJForce( &f, &sPos[tid], &sPos[i], &boxsize);
        }
    }
    __syncthreads();


    // Tiles of all blocks AFTER this one, same streaming scheme.
    for (int j = bid + 1; j < gridDim.x; ++j)
    {
        bOffset = j * bdim;

        sPos[bdim+tid] = dPos[bOffset+tid];
        __syncthreads();
        for (int i = 0; i < bdim; ++i)
        {
            gpuAddLJForce( &f, &sPos[tid], &sPos[bdim+i], &boxsize);
        }
        __syncthreads();
    }
    gpuCumulateVec(&dForce[idx], &f);
}
00194
00195
__global__ void gpuLJForcesCut(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize, float rCutSq)
{
    // Brute-force O(N^2) LJ forces with a squared-distance cutoff rCutSq,
    // one thread per particle. Excess threads exit via the bounds check.

    int tid = threadIdx.x;
    int idx = blockDim.x * blockIdx.x + tid;
    if (idx < N)
    {
        // Cache this thread's position in a register instead of re-reading
        // dPos[idx] from global memory on every pair (matches the pattern
        // already used by gpuLJForcesCutVar1).
        FLOAT_ARRAY_TYPE tPos = dPos[idx];
        FLOAT_ARRAY_TYPE f;
        f.x = 0.0f;
        f.y = 0.0f;
        f.z = 0.0f;
        for (int i = 0; i < N; ++i)
        {
            if ( i != idx)   // no self-interaction
            {
                gpuAddLJForceCut( &f, &tPos, &dPos[i], &boxsize, &rCutSq);
            }
        }
        gpuCumulateVec(&dForce[idx], &f);
    }
}
00217
00218
__global__ void gpuLJForcesCutVar1(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize, float rCutSq)
{
    // Brute-force O(N^2) LJ forces with squared-distance cutoff rCutSq.
    // One thread per particle; the thread's own position is cached in a
    // register before the pair loop.
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= N)
        return;

    FLOAT_ARRAY_TYPE myPos = dPos[idx];
    FLOAT_ARRAY_TYPE acc;
    acc.x = 0.0f;
    acc.y = 0.0f;
    acc.z = 0.0f;

    for (int j = 0; j < N; ++j)
    {
        if (j == idx)
            continue;   // skip self-interaction
        gpuAddLJForceCut(&acc, &myPos, &dPos[j], &boxsize, &rCutSq);
    }
    gpuCumulateVec(&dForce[idx], &acc);
}
00241
00242
__global__ void gpuLJForcesCutRSmem(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize, float rCutSq)
{
    // Cutoff LJ forces with this block's tile staged in shared memory; other
    // particles are read from global memory. One thread per particle.
    // Dynamic shared memory: blockDim.x * sizeof(FLOAT_ARRAY_TYPE).
    //
    // Fixes over the previous version: __syncthreads() was executed inside
    // the divergent `if (idx < N)` branch (undefined behaviour / hang when N
    // is not a multiple of blockDim.x), and the in-tile loop read sPos slots
    // that inactive threads never wrote. The barrier is now block-uniform and
    // the tile loop is clamped to the valid particle count.

    int tid = threadIdx.x;
    int offset = blockDim.x * blockIdx.x;
    int rowOffset = offset + blockDim.x;
    int idx = offset + tid;

    extern __shared__ FLOAT_ARRAY_TYPE sPos[];

    // Number of valid particles cached by this block (handles a partial
    // final tile).
    int tileN = N - offset;
    if (tileN > (int)blockDim.x)
        tileN = blockDim.x;

    // Guarded load, then a barrier reached by EVERY thread of the block.
    if (idx < N)
        sPos[tid] = dPos[idx];
    __syncthreads();

    if (idx < N)
    {
        FLOAT_ARRAY_TYPE f;
        f.x = 0.0f;
        f.y = 0.0f;
        f.z = 0.0f;

        // Particles before this tile (global memory).
        for (int i = 0; i < offset; ++i)
        {
            gpuAddLJForceCut( &f, &sPos[tid], &dPos[i], &boxsize, &rCutSq);
        }

        // Particles inside this tile (shared memory); only the tileN slots
        // actually written are visited, and self-interaction is skipped.
        for (int i = 0; i < tileN; ++i)
        {
            if ( i != tid)
            {
                gpuAddLJForceCut( &f, &sPos[tid], &sPos[i], &boxsize, &rCutSq);
            }
        }

        // Particles after this tile (global memory).
        for (int i = rowOffset; i < N; ++i)
        {
            gpuAddLJForceCut( &f, &sPos[tid], &dPos[i], &boxsize, &rCutSq);
        }
        gpuCumulateVec(&dForce[idx], &f);
    }
}
00286
00287
00288
__global__ void gpuLJForcesCutSmem(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize, float rCutSq)
{
    // Cutoff LJ forces with ALL partner positions streamed through shared
    // memory, one tile (blockDim.x particles) per iteration. One thread per
    // particle. Dynamic shared memory: blockDim.x * sizeof(FLOAT_ARRAY_TYPE).
    //
    // Fixes over the previous version: the tile loop (and its barriers) ran
    // inside the divergent `if (idx < N)` branch — undefined behaviour /
    // hang when N is not a multiple of blockDim.x — and the tile load
    // dPos[bOffset+tid] could read past the end of dPos on the last tile.
    // Barriers are now block-uniform and loads are bounds-guarded.

    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;
    extern __shared__ FLOAT_ARRAY_TYPE sPos[];

    // Own position in a register (only meaningful for active threads).
    FLOAT_ARRAY_TYPE tPos;
    if (idx < N)
        tPos = dPos[idx];

    FLOAT_ARRAY_TYPE f;
    f.x = 0.0f;
    f.y = 0.0f;
    f.z = 0.0f;

    for (int j = 0; j < gridDim.x; ++j)
    {
        int bOffset = j * blockDim.x;
        // Cooperative, bounds-guarded tile load.
        if (bOffset + tid < N)
            sPos[tid] = dPos[bOffset + tid];
        __syncthreads();   // uniform: every thread reaches this

        if (idx < N)
        {
            for (int i = 0; i < blockDim.x; ++i)
            {
                int id = bOffset + i;
                if (id != idx && id < N)   // skip self and tail padding
                {
                    gpuAddLJForceCut( &f, &tPos, &sPos[i], &boxsize, &rCutSq);
                }
            }
        }
        __syncthreads();   // uniform: protect sPos before the next load
    }

    if (idx < N)
        gpuCumulateVec(&dForce[idx], &f);
}
00320
00321
00322
__global__ void gpuLJForcesCutSmemSmem(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, float boxsize, float rCutSq)
{
    // Cutoff LJ forces with both sides of every interaction in shared memory:
    // the first half of sPos permanently holds this block's tile, the second
    // half is a streaming buffer for neighbour-block tiles. One thread per
    // particle. Dynamic shared memory: 2 * blockDim.x * sizeof(FLOAT_ARRAY_TYPE).
    //
    // Fixes over the previous version: every __syncthreads() ran inside the
    // divergent `if (idx < N)` branch — undefined behaviour / hang when N is
    // not a multiple of blockDim.x — and tile loads could read past the end
    // of dPos. The tile loop below branches only on j == bid, which is
    // uniform across the block, so all barriers are reached by every thread;
    // loads are bounds-guarded and tile lengths clamped. The summation order
    // (lower tiles, own tile, upper tiles) matches the original.

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int bdim = blockDim.x;
    int offset = bdim * bid;
    int idx = offset + tid;

    extern __shared__ FLOAT_ARRAY_TYPE sPos[];

    // Resident tile: guarded load, block-uniform barrier.
    if (idx < N)
        sPos[tid] = dPos[idx];
    __syncthreads();

    FLOAT_ARRAY_TYPE f;
    f.x = 0.0f;
    f.y = 0.0f;
    f.z = 0.0f;

    // Valid particle count of this block's own tile.
    int selfTileN = N - offset;
    if (selfTileN > bdim)
        selfTileN = bdim;

    for (int j = 0; j < gridDim.x; ++j)
    {
        if (j == bid)
        {
            // Own tile (first half of sPos); never overwritten, so no
            // barriers are needed here. Skip the self-interaction.
            if (idx < N)
            {
                for (int i = 0; i < selfTileN; ++i)
                {
                    if (i != tid)
                    {
                        gpuAddLJForceCut( &f, &sPos[tid], &sPos[i], &boxsize, &rCutSq);
                    }
                }
            }
        }
        else
        {
            int bOffset = j * bdim;
            int tileN = N - bOffset;
            if (tileN > bdim)
                tileN = bdim;

            // Stream the neighbour tile into the second half of sPos.
            if (bOffset + tid < N)
                sPos[bdim + tid] = dPos[bOffset + tid];
            __syncthreads();

            if (idx < N)
            {
                for (int i = 0; i < tileN; ++i)
                {
                    gpuAddLJForceCut( &f, &sPos[tid], &sPos[bdim + i], &boxsize, &rCutSq);
                }
            }
            __syncthreads();   // protect the buffer before the next load
        }
    }

    if (idx < N)
        gpuCumulateVec(&dForce[idx], &f);
}
00387
__global__ void gpuResetForces(FLOAT_ARRAY_TYPE *dForces, int N)
{
    // Zero the x/y/z components of the first N force accumulators.
    // One thread per element; excess threads exit early.
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N)
        return;
    dForces[i].x = 0.0f;
    dForces[i].y = 0.0f;
    dForces[i].z = 0.0f;
}
00398
00399
__global__ void gpuLJForcesVerletList(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int N, int *dVlists, size_t vlists_pitch, int *dVcount, float boxsize, float rCutSq)
{
    // Cutoff LJ forces over precomputed Verlet neighbour lists.
    // dVlists is a pitched 2D array: row i holds, for every particle, the
    // index of its i-th neighbour; dVcount[idx] is the number of neighbours
    // of particle idx. One thread per particle.
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= N)
        return;

    FLOAT_ARRAY_TYPE myPos = dPos[idx];
    FLOAT_ARRAY_TYPE acc;
    acc.x = 0.0f;
    acc.y = 0.0f;
    acc.z = 0.0f;

    // dVcount is never written here, so the trip count can be hoisted.
    const int nNeighbours = dVcount[idx];
    for (int i = 0; i < nNeighbours; ++i)
    {
        // Pitched row access: advance by i * pitch bytes from the base.
        int *row = (int*)((char*)dVlists + i * vlists_pitch);
        gpuAddLJForceCut(&acc, &myPos, &dPos[row[idx]], &boxsize, &rCutSq);
    }
    gpuCumulateVec(&dForce[idx], &acc);
}
00420
00421
__global__ void gpuHarmonicBondForces(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int *cnn, size_t pitch, int N, int numNN, float k, float boxsize)
{
    // Harmonic bond forces from a pitched connectivity table: row c of cnn
    // holds, for every particle, the index of its c-th bonded partner, or -1
    // for "no partner". One thread per particle; up to numNN bonds each.
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= N)
        return;

    FLOAT_ARRAY_TYPE myPos = dPos[idx];
    for (int c = 0; c < numNN; ++c)
    {
        // Pitched row access: advance by c * pitch bytes from the base.
        int *row = (int*)((char*)cnn + c * pitch);
        const int partner = row[idx];
        if (partner == -1)
            continue;   // unused connectivity slot
        gpuAddHarmonicBondForce(&dForce[idx], &myPos, &dPos[partner], &k, &boxsize);
    }
}
00441
00442
__global__ void gpuHarmonicBondForcesSmem(FLOAT_ARRAY_TYPE *dPos, FLOAT_ARRAY_TYPE *dForce, int *cnn, size_t pitch, int N, int numNN, float k, float boxsize)
{
    // Shared-memory variant of gpuHarmonicBondForces: each thread stages its
    // own position in sPos[tid] and passes that slot to the force helper.
    // Each thread reads back only the slot it wrote, so no barrier is needed
    // and the early return below is safe.
    // Dynamic shared memory: blockDim.x * sizeof(FLOAT_ARRAY_TYPE).
    extern __shared__ FLOAT_ARRAY_TYPE sPos[];
    const int tid = threadIdx.x;
    const int idx = blockDim.x * blockIdx.x + tid;
    if (idx >= N)
        return;

    sPos[tid] = dPos[idx];
    for (int c = 0; c < numNN; ++c)
    {
        // Pitched row access: advance by c * pitch bytes from the base.
        int *row = (int*)((char*)cnn + c * pitch);
        const int partner = row[idx];
        if (partner != -1)
        {
            gpuAddHarmonicBondForce(&dForce[idx], &sPos[tid], &dPos[partner], &k, &boxsize);
        }
    }
}