00001
00022 #include "gpu_tools.cuh"
00023
00024
00025
00026
00027
00028 __global__ void gpuGenerateVerletList(FLOAT_ARRAY_TYPE *dPos, int *dVlists, size_t pitchDVlists, int *dVcount, float *dmax, float rListSq, int N, float boxSize, bool *doUpdate)
00029 {
00030 int idx = blockIdx.x * blockDim.x + threadIdx.x;
00031 if (idx < N)
00032 {
00033 FLOAT_ARRAY_TYPE d;
00034 dVcount[idx]=0;
00035 int* dVlist;
00036 for (int i=0; i<N; ++i)
00037 {
00038 if ( i != idx )
00039 {
00040 gpuDistPBC(&d, &dPos[idx], &dPos[i], &boxSize);
00041
00042 if ( rListSq > (d.x * d.x + d.y * d.y + d.z *d.z) )
00043 {
00044 dVlist = (int*)((char*)dVlists + dVcount[idx] * pitchDVlists);
00045 dVlist[idx] = i;
00046 dVcount[idx] = dVcount[idx] + 1;
00047 }
00048 }
00049 }
00050 dmax[idx]=0.0f;
00051 }
00052
00053 (*doUpdate)=false;
00054 }
00055
00056
00057 __global__ void gpuGenerateVerletListVar1(FLOAT_ARRAY_TYPE *dPos, int *dVlists, size_t pitchDVlists, int *dVcount, float *dmax, float rListSq, int N, float boxSize, bool *doUpdate)
00058 {
00059 int idx = blockIdx.x * blockDim.x + threadIdx.x;
00060 if (idx < N)
00061 {
00062 FLOAT_ARRAY_TYPE tPos = dPos[idx];
00063 FLOAT_ARRAY_TYPE d;
00064 dVcount[idx]=0;
00065 int* dVlist;
00066 for (int i=0; i<N; ++i)
00067 {
00068 if ( i != idx )
00069 {
00070 gpuDistPBC(&d, &tPos, &dPos[i], &boxSize);
00071 if ( rListSq > (d.x * d.x + d.y * d.y + d.z *d.z) )
00072 {
00073 dVlist = (int*)((char*)dVlists + dVcount[idx] * pitchDVlists);
00074 dVlist[idx] = i;
00075 dVcount[idx] = dVcount[idx] + 1;
00076 }
00077 }
00078 }
00079 dmax[idx]=0.0f;
00080 }
00081
00082 (*doUpdate)=false;
00083 }
00084
00085
00086 __global__ void gpuGenerateVerletListSmem(FLOAT_ARRAY_TYPE *dPos, int *dVlists, size_t pitchDVlists, int *dVcount, float *dmax, float rListSq, int N, float boxSize, bool *doUpdate)
00087 {
00088
00089 extern __shared__ FLOAT_ARRAY_TYPE sPos[];
00090 int tid = threadIdx.x;
00091 int idx = blockIdx.x * blockDim.x + tid;
00092 if (idx < N)
00093 {
00094 FLOAT_ARRAY_TYPE tPos = dPos[idx];
00095 FLOAT_ARRAY_TYPE d;
00096 dVcount[idx]=0;
00097 int* dVlist;
00098
00099 for(int j=0; j<gridDim.x; ++j)
00100 {
00101 int bOffset = j * blockDim.x;
00102 sPos[tid] = dPos[bOffset+tid];
00103 __syncthreads();
00104 for (int i=0; i<blockDim.x; ++i)
00105 {
00106 int id = bOffset + i;
00107 if ( id != idx && id < N)
00108 {
00109 gpuDistPBC(&d, &tPos, &sPos[i], &boxSize);
00110 if ( rListSq > (d.x * d.x + d.y * d.y + d.z *d.z) )
00111 {
00112 dVlist = (int*)((char*)dVlists + dVcount[idx] * pitchDVlists);
00113 dVlist[idx] = id;
00114 dVcount[idx] = dVcount[idx] + 1;
00115 }
00116 }
00117 }
00118 __syncthreads();
00119 }
00120 dmax[idx]=0.0f;
00121 }
00122
00123 (*doUpdate)=false;
00124 }
00125
00126
00127
00128 __global__ void gpuGenerateVerletListSmemVar1(FLOAT_ARRAY_TYPE *dPos, int *dVlists, size_t pitchDVlists, int *dVcount, float *dmax, float rListSq, int N, float boxSize, bool *doUpdate)
00129 {
00130 extern __shared__ FLOAT_ARRAY_TYPE sPos[];
00131 int tid = threadIdx.x;
00132 int offset = blockDim.x * blockIdx.x;
00133 int idx = offset + tid;
00134 if (idx < N)
00135 {
00136 sPos[tid] = dPos[idx];
00137 __syncthreads();
00138 FLOAT_ARRAY_TYPE d;
00139 dVcount[idx]=0;
00140 int* dVlist;
00141
00142
00143 for(int j=0; j< offset; ++j)
00144 {
00145 gpuDistPBC(&d, &sPos[tid], &dPos[j], &boxSize);
00146 if ( rListSq > (d.x * d.x + d.y * d.y + d.z *d.z) )
00147 {
00148 dVlist = (int*)((char*)dVlists + dVcount[idx] * pitchDVlists);
00149 dVlist[idx] = j;
00150 dVcount[idx] = dVcount[idx] + 1;
00151 }
00152 }
00153
00154 for(int j=0; j<blockDim.x; ++j)
00155 {
00156 if(j != tid)
00157 {
00158 gpuDistPBC(&d, &sPos[tid], &sPos[j], &boxSize);
00159 if ( rListSq > (d.x * d.x + d.y * d.y + d.z *d.z) )
00160 {
00161 dVlist = (int*)((char*)dVlists + dVcount[idx] * pitchDVlists);
00162 dVlist[idx] = offset + j;
00163 dVcount[idx] = dVcount[idx] + 1;
00164 }
00165 }
00166 }
00167
00168 for(int j=offset+blockDim.x; j<N; ++j)
00169 {
00170 gpuDistPBC(&d, &sPos[tid], &dPos[j], &boxSize);
00171 if ( rListSq > (d.x * d.x + d.y * d.y + d.z *d.z) )
00172 {
00173 dVlist = (int*)((char*)dVlists + dVcount[idx] * pitchDVlists);
00174 dVlist[idx] = j;
00175 dVcount[idx] = dVcount[idx] + 1;
00176 }
00177 }
00178 dmax[idx]=0.0f;
00179 }
00180
00181 (*doUpdate)=false;
00182 }
00183
00184
00185
00186
00187
00188 void rescaleT(int N, int dim, int numInterData, int numThreads, float T, int threads, int blocks, FLOAT_ARRAY_TYPE *dVel, float *sumSq)
00189 {
00190
00191 SqVecSumReduction(dVel, sumSq, numInterData, numThreads, N);
00192
00193
00194 gpuRescaleTKernel<<<blocks, threads>>>(N, dim, T, dVel, sumSq);
00195 checkCudaError("Kernel error: gpuRescaleTKernel");
00196 }
00197
00198 __global__ void gpuRescaleTKernel(int N, int dim, float T, FLOAT_ARRAY_TYPE *dVel, float *vSumSq)
00199 {
00200 int idx = blockIdx.x * blockDim.x + threadIdx.x;
00201 float factor = __fsqrt_rn(dim*N*T/vSumSq[0]);
00202 if (idx < N)
00203 {
00204 dVel[idx].x = dVel[idx].x * factor;
00205 dVel[idx].y = dVel[idx].y * factor;
00206 dVel[idx].z = dVel[idx].z * factor;
00207 }
00208 }
00209
00210
00211
00212
00213 void removeDrift(int N, int numInterData, int numThreads, int threads, int blocks, FLOAT_ARRAY_TYPE *dVel, FLOAT_ARRAY_TYPE *compSum)
00214 {
00215
00216 componentSumReduction(dVel, compSum, numInterData, numThreads, N);
00217
00218
00219 gpuRemoveDriftKernel<<<blocks, threads>>>(N, dVel, compSum);
00220 checkCudaError("Kernel error: gpuRemoveDriftKernel");
00221 }
00222
00223 __global__ void gpuRemoveDriftKernel(int N, FLOAT_ARRAY_TYPE *dVel, FLOAT_ARRAY_TYPE *compSum)
00224 {
00225 int id = blockIdx.x * blockDim.x + threadIdx.x;
00226 FLOAT_ARRAY_TYPE factor;
00227 factor.x = -compSum[0].x/N;
00228 factor.y = -compSum[0].y/N;
00229 factor.z = -compSum[0].z/N;
00230 if (id < N)
00231 {
00232 dVel[id].x += factor.x;
00233 dVel[id].y += factor.y;
00234 dVel[id].z += factor.z;
00235 }
00236 }
00237
00238 __global__ void gpuGetVelOnTime(FLOAT_ARRAY_TYPE *dVel, FLOAT_ARRAY_TYPE *dVt, FLOAT_ARRAY_TYPE *dForce, int N, float dt)
00239 {
00240 int idx = blockIdx.x * blockDim.x + threadIdx.x;
00241 if (idx < N)
00242 {
00243 dVt[idx].x = dVel[idx].x + 0.5f * dt * dForce[idx].x;
00244 dVt[idx].y = dVel[idx].y + 0.5f * dt * dForce[idx].y;
00245 dVt[idx].z = dVel[idx].z + 0.5f * dt * dForce[idx].z;
00246 }
00247 }
00248
00249
00250
00251
00252
00253
00254
00255 float getKineticEnergy(FLOAT_ARRAY_TYPE *dVel, int &N, float *sumSq, int &maxThreads, int &maxBlocks)
00256 {
00257
00258 SqVecSumReduction(dVel, sumSq, 64,64, N);
00259 float e;
00260 cudaMemcpy(&e, sumSq, sizeof(float), cudaMemcpyDeviceToHost);
00261 e *= 0.5f;
00262 return e;
00263 }
00264
00265
00266
00267
00268
00269 float getHarmonicPotentialEnergy(float *dEpot, FLOAT_ARRAY_TYPE *dPos, float *sum, int *cnn, size_t &pitch, int &N, int &numNN, float &k, float &boxSize, dim3 &dimGrid, dim3 &dimBlock, int &maxThreads, int &maxBlocks)
00270 {
00271 float e;
00272 gpuGetHarmonicPotentialEnergyKernel<<< dimGrid, dimBlock>>>(dEpot, dPos, cnn, pitch, N, numNN, k, boxSize);
00273 checkCudaError("Kernel error: gpu_get_harmonic_potential");
00274 sumReduction(dEpot, sum, 64, 64, N);
00275 cudaMemcpy(&e, sum, sizeof(float), cudaMemcpyDeviceToHost);
00276 return e;
00277 }
00278
00279
00280 __global__ void gpuGetHarmonicPotentialEnergyKernel(float *dEpot, FLOAT_ARRAY_TYPE *dPos, int *cnn, size_t pitch, int N, int numNN, float k, float boxSize)
00281 {
00282 int idx = blockDim.x * blockIdx.x + threadIdx.x;
00283 FLOAT_ARRAY_TYPE d;
00284 if (idx < N)
00285 {
00286 dEpot[idx] = 0.0f;
00287 int pid;
00288 int *row = (int*)((char*)cnn + idx * pitch);
00289 for (int c = 0; c < numNN; ++c)
00290 {
00291 pid = row[c];
00292 if (pid != -1)
00293 {
00294 gpuDistPBC(&d, &dPos[idx], &dPos[pid], &boxSize);
00295 dEpot[idx] += d.x * d.x + d.y * d.y + d.z * d.z;
00296 }
00297 }
00298 dEpot[idx] *= 0.25f*k;
00299 }
00300 }
00301
00302
00303 float getLJCutPotentialEnergy(FLOAT_ARRAY_TYPE *dPos, float *dEpot, float *sum, int &N, float &boxSize, float &rCutSq, dim3 &dimGrid, dim3 &dimBlock, int &maxThreads, int &maxBlocks)
00304 {
00305 float e;
00306 gpuLJCutPotentialEnergyKernel<<< dimGrid, dimBlock>>>(dPos, dEpot, N, boxSize, rCutSq);
00307 checkCudaError("Kernel error: gpuLJCutPotentialEnergyKernel");
00308 sumReduction(dEpot, sum, 64, 64, N);
00309 cudaMemcpy(&e, sum, sizeof(float), cudaMemcpyDeviceToHost);
00310 return e;
00311 }
00312
00313
00314 __global__ void gpuLJCutPotentialEnergyKernel(FLOAT_ARRAY_TYPE *dPos, float *dEpot, int N, float boxSize, float rCutSq)
00315 {
00316
00317 int idx = blockDim.x * blockIdx.x + threadIdx.x;
00318 if (idx < N)
00319 {
00320 FLOAT_ARRAY_TYPE d;
00321 float invR6;
00322 dEpot[idx]=0.0f;
00323 for (int i=0; i<N; ++i)
00324 {
00325 if ( i != idx)
00326 {
00327 gpuDistPBC( &d, &dPos[idx], &dPos[i], &boxSize);
00328 invR6 = d.x * d.x + d.y * d.y + d.z * d.z;
00329 if ( rCutSq > invR6 )
00330 {
00331 invR6 = 1.0f / invR6;
00332 invR6 = powf( invR6, 3.0f );
00333 dEpot[idx] += 2.0f * invR6 * (invR6 - 1.0f);
00334 }
00335 }
00336 }
00337 }
00338 }
00339
00340
00341
00342
00343 inline static unsigned long inKB(unsigned long bytes)
00344 {
00345 return bytes/1024;
00346 }
00347
00348 inline static unsigned long inMB(unsigned long bytes)
00349 {
00350 return bytes/(1048576);
00351 }
00352
00353
00354 bool memInfo()
00355 {
00356 bool success=true;
00357 unsigned int free, total;
00358 CUresult res;
00359 res = cuMemGetInfo(&free, &total);
00360 if (res != CUDA_SUCCESS)
00361 {
00362 std::cout<<"uMemGetInfo failed: "<<res<<"\n";
00363 success=false;
00364 }
00365 else
00366 {
00367 std::cout<<"________________ Memory usage ________________\n";
00368 std::cout<<"Free : "<<free<<" bytes ("<<inKB(free)<<" KB) ("<<inMB(free)<<" MB)\n";
00369 std::cout<<"Used : "<<total-free<<" bytes ("<<inKB(total-free)<<" KB) ("<<inMB(total-free)<<" MB)\n";
00370 std::cout<<"Total: "<<total<<" bytes ("<<inKB(total)<<" KB) ("<<inMB(total)<<" MB)\n";
00371 std::cout<<"-----> "<<100.0*free/(double)total<<"% free, "<<100.0*(total - free)/(double)total<<"% used\n";
00372 std::cout<<"______________________________________________\n";
00373 }
00374 return success;
00375 }
00376
00377
00378 void checkCudaError(const char *identifier)
00379 {
00380 cudaThreadSynchronize();
00381 cudaError_t err = cudaGetLastError();
00382 if ( cudaSuccess != err)
00383 {
00384 std::cout<<"Cuda error: "<<identifier<<" : "<< cudaGetErrorString( err) <<"\n";
00385 exit(EXIT_FAILURE);
00386 }
00387 }
00388
00389
00390
00391 __global__ void copyDevToDev(FLOAT_ARRAY_TYPE *dA, FLOAT_ARRAY_TYPE *dB, int N)
00392 {
00393 int idx = blockDim.x * blockIdx.x + threadIdx.x;
00394 if (idx < N)
00395 {
00396 dB[idx].x = dA[idx].x;
00397 dB[idx].y = dA[idx].y;
00398 dB[idx].z = dA[idx].z;
00399 }
00400 }
00401
00402
00403 __global__ void gpuSimpleSqSum(FLOAT_ARRAY_TYPE *dVal, float*sqSum, int N)
00404 {
00405 int idx=blockIdx.x*blockDim.x+threadIdx.x;
00406 if (idx==0)
00407 {
00408 (*sqSum)=0.0f;
00409 for (int i=0; i<N; ++i)
00410 {
00411 (*sqSum) += dVal[i].x * dVal[i].x + dVal[i].y * dVal[i].y + dVal[i].z * dVal[i].z;
00412 }
00413 }
00414 }
00415
00416 __global__ void gpuSimpleComponentSum(FLOAT_ARRAY_TYPE *dVal, FLOAT_ARRAY_TYPE *sum, int N)
00417 {
00418 int idx=blockIdx.x*blockDim.x+threadIdx.x;
00419 if (idx==0)
00420 {
00421 (*sum).x=0.0f;
00422 (*sum).y=0.0f;
00423 (*sum).z=0.0f;
00424 for (int i=0; i<N; ++i)
00425 {
00426 (*sum).x += dVal[i].x;
00427 (*sum).y += dVal[i].y;
00428 (*sum).z += dVal[i].z;
00429 }
00430 }
00431 }
00432
00433
00434
00435
00436
00437
00438 void showPositions(FLOAT_ARRAY_TYPE *hPos, int N)
00439 {
00440 for (int i=0; i<N; ++i)
00441 {
00442 std::cout<<"hPos["<<i<<"] = "<<hPos[i].x<<" , "<<hPos[i].y<<" , "<<hPos[i].z<<"\n";
00443 }
00444 }
00445
00446 void showVelocities(FLOAT_ARRAY_TYPE *hVel, int &N)
00447 {
00448 for (int i=0; i<N; ++i)
00449 {
00450 std::cout<<"hVel["<<i<<"] = "<<hVel[i].x<<" , "<<hVel[i].y<<" , "<<hVel[i].z<<"\n";
00451 }
00452 }
00453
00454 void showForces(FLOAT_ARRAY_TYPE *hForce, int &N)
00455 {
00456 for (int i=0; i<N; ++i)
00457 {
00458 std::cout<<"hForce["<<i<<"] = "<<hForce[i].x<<" , "<<hForce[i].y<<" , "<<hForce[i].z<<"\n";
00459 }
00460 }
00461
00462 void showDeviceArray(FLOAT_ARRAY_TYPE *dArray, int N)
00463 {
00464 FLOAT_ARRAY_TYPE *hArray = new FLOAT_ARRAY_TYPE[N];
00465 cudaMemcpy(hArray, dArray, sizeof(FLOAT_ARRAY_TYPE)*N, cudaMemcpyDeviceToHost);
00466 std::cout<<"---- Device array ----\n\n";
00467 for (int i=0; i<N; ++i)
00468 {
00469 std::cout<<i<<": "<<hArray[i].x<<" "<<hArray[i].y<<" "<<hArray[i].z<<"\n";
00470 }
00471 std::cout<<"----------------------\n\n";
00472 delete[] hArray;
00473 }
00474
00475 void showNeareastNeighborList(int *dNNListArray, int &N, int &numNN)
00476 {
00477 for (int i=0; i<N; ++i)
00478 {
00479 int id=numNN*i;
00480 std::cout<<"---------------------\nparticle: "<<i<<"\nneighbors:\n";
00481 for (int j=0; j<numNN; ++j)
00482 {
00483 std::cout<<dNNListArray[id+j]<<"\n";
00484 }
00485 }
00486 std::cout<<"------------------------\n";
00487 std::cout<<"------------------------\n";
00488 }
00489
00490 void showVerletList(int *dVlists, size_t pitchVlists, int *dVcount, int N, int NList)
00491 {
00492 int *hVerletList=new int[N*NList];
00493 int *hCount = new int[N];
00494 cudaMemcpy(hCount, dVcount, N * sizeof(int), cudaMemcpyDeviceToHost);
00495 cudaMemcpy2D(hVerletList, N*sizeof(int), dVlists, pitchVlists, N * sizeof(int), NList, cudaMemcpyDeviceToHost);
00496 for (int i=0; i<10; ++i)
00497 {
00498 std::cout<<"i: "<<i<<" ; vcount: "<<hCount[i]<<" -->";
00499 for (int j=0; j<hCount[i]; ++j)
00500 {
00501 std::cout<<" "<<hVerletList[i + j*N];
00502 }
00503 std::cout<<"\n_________________\n";
00504 }
00505 int maxentry=0;
00506 int indexmax;
00507 for (int i=0; i<N; ++i)
00508 {
00509 if (maxentry < hCount[i])
00510 {
00511 maxentry = hCount[i];
00512 indexmax=i;
00513 }
00514 }
00515 std::cout<<"max entry: "<<maxentry<<" @ index: "<<indexmax<<"\n";
00516 delete[] hVerletList;
00517 delete[] hCount;
00518 }
00519
00520
00521
00522 bool setCudaDevice(int gpuDevNo)
00523 {
00524
00525 int deviceCount;
00526 cudaGetDeviceCount(&deviceCount);
00527 if(deviceCount <= gpuDevNo)
00528 {
00529 std::cout<<"Device ID '"<<gpuDevNo<<"' not present!\n";
00530 return false;
00531 }
00532
00533 cudaSetDevice( gpuDevNo );
00534 cudaDeviceProp prop;
00535 cudaGetDevice(&gpuDevNo);
00536 cudaGetDeviceProperties(&prop, gpuDevNo);
00537 std::cout<<"_______________________________________________________\n";
00538 std::cout<<"Number of GPU devices: "<<deviceCount<<"\n";
00539 std::cout<<"Running on GPU device #: "<<gpuDevNo<<"\n";
00540 std::cout<<"GPU processor #: "<<8*prop.multiProcessorCount<<"\n";
00541 std::cout<<"Computing Capability: ["<<prop.major<<"."<< prop.minor<<"]\n";
00542 std::cout<<"_______________________________________________________\n";
00543 return true;
00544 }
00545
00546
00547
00548 bool generateStartScript(std::string exeName)
00549 {
00550 bool ok;
00551 std::string scriptname = "example.sh";
00552 std::string inputfile = "N16_T0.5_g2.0_k0.1_init.dat";
00553 std::string basename = "N16_T0.5_g2.0_k0.1_";
00554 int dim = 3;
00555 int N = 4096;
00556 int numNN = 6;
00557 float T = 0.5;
00558 float k = 0.1;
00559 float boxSize = 32;
00560 float dt = 0.004;
00561 int NOffset = 0;
00562 int NSim = 10000;
00563 int NOut = 1000;
00564 int NAdjust = 100;
00565 int gpuDevNo=0;
00566 ok = generateScript(scriptname, exeName, inputfile, basename, dim, N, numNN, T, k, boxSize, dt, NOffset, NSim, NOut, NAdjust, gpuDevNo);
00567 return ok;
00568 }
00569
00570
00571 bool generateRestartScript(std::string exeName, std::string basename, int dim, int N, int numNN, float T, float k, float boxSize, float dt, int NOffset, int NSim, int NOut, int NAdjust, int gpuDevNo)
00572 {
00573 std::ostringstream ssFile;
00574 std::string scriptname;
00575 ssFile<<basename<<"_continue.sh";
00576 scriptname = ssFile.str();
00577 ssFile.str("");
00578 ssFile.clear();
00579 ssFile.fill('0');
00580 ssFile<<basename<<"";
00581 ssFile.width(10);
00582 ssFile<<NOffset;
00583 ssFile.width(1);
00584 ssFile<<".dat";
00585 std::string inputfile = ssFile.str();
00586 bool ok;
00587 ok = generateScript(scriptname, exeName, inputfile, basename, dim, N, numNN, T, k, boxSize, dt, NOffset, NSim, NOut, NAdjust, gpuDevNo);
00588 return ok;
00589 }
00590
00591 bool generateScript(std::string scriptname, std::string exeName, std::string inputfile, std::string basename, int dim, int N, int numNN, float T, float k, float boxSize, float dt, int NOffset, int NSim, int NOut, int NAdjust, int gpuDevNo)
00592 {
00593 std::ofstream out(scriptname.c_str());
00594 if (!out.good())
00595 {
00596 std::cout<<"'"<<scriptname<<"' couldn't be created!\n";
00597 return 0;
00598 }
00599 out<<"#!/bin/bash\n\n";
00600 out<<"# input x-v-data file name\n";
00601 out<<"input="<<inputfile<<" \n\n";
00602 out<<"# output base filename\n";
00603 out<<"basename="<<basename<<"\n\n";
00604 out<<"# Dimension of the simulation \n";
00605 out<<"dim="<<dim<<"\n\n";
00606 out<<"# particle number \n";
00607 out<<"N="<<N<<"\n\n";
00608 out<<"# maximum number of coupled nearest neighbors\n";
00609 out<<"num_nn="<<numNN<<"\n\n";
00610 out<<"# temperature\n";
00611 out<<"temperature="<<T<<"\n\n";
00612 out<<"# spring constant:\n";
00613 out<<"k="<<k<<"\n\n";
00614 out<<"# edge length of the simulation box\n";
00615 out<<"box="<<boxSize<<"\n\n";
00616 out<<"# timestep\n";
00617 out<<"timestep="<<dt<<"\n\n";
00618 out<<"# offset for simulation steps\n";
00619 out<<"Noffset="<<NOffset<<"\n\n";
00620 out<<"# simulate till NSim steps are done\n";
00621 out<<"Nsim="<<NSim<<"\n\n";
00622 out<<"# output intervall (CONSTRAINT: must be a multiple of 'Nadjust')\n";
00623 out<<"Nout="<<NOut<<"\n\n";
00624 out<<"# NVT velocity scaling and drift removal intervall\n";
00625 out<<"Nadjust="<<NAdjust<<"\n\n";
00626 out<<"# number of GPU device to run the job on\n";
00627 out<<"gpuDevNo="<<gpuDevNo<<"\n\n##################\n\n";
00628 out<<exeName<<" $input $basename $dim $N $num_nn $temperature $k $box $timestep $Noffset $Nsim $Nout $Nadjust $gpuDevNo\n";
00629 return 0;
00630 }
00631
00632
00633
00634
00635
00636
00637 __global__ void gpuReduce1(float *dInData, float *dInterData, int numInterData, int numThreads, int N)
00638 {
00639 extern __shared__ float smem[];
00640 int idx=blockIdx.x*blockDim.x+threadIdx.x;
00641 float s=0.0f;
00642 for (int j=idx; j<N; j+=numInterData*numThreads)
00643 {
00644 s += dInData[j];
00645 }
00646 smem[threadIdx.x]=s;
00647 __syncthreads();
00648 if (threadIdx.x==0)
00649 {
00650 float s=0.0f;
00651 for (int i=0; i<numThreads; i++)
00652 {
00653 s+=smem[i];
00654 }
00655 dInterData[blockIdx.x]=s;
00656 }
00657 }
00658
00659
00660 __global__ void gpuReduce2(float *dInterData, int numInterData, int numThreads)
00661 {
00662 extern __shared__ float smem[];
00663 int idx=threadIdx.x;
00664 float s=0.0f;
00665 for (int j=idx; j<numInterData; j+=numThreads)
00666 {
00667 s+=dInterData[j];
00668 }
00669 smem[threadIdx.x]=s;
00670 __syncthreads();
00671 if (idx==0)
00672 {
00673 float s=0.0f;
00674 for (int i=0; i<numThreads; i++)
00675 {
00676 s+=smem[i];
00677 }
00678 dInterData[0]=s;
00679 }
00680 }
00681
00682
00683
00684 __global__ void gpuSqSumReduce1(FLOAT_ARRAY_TYPE *dInData, float *dInterData, int numInterData, int numThreads, int N)
00685 {
00686 extern __shared__ float smem[];
00687 int idx=blockIdx.x*blockDim.x+threadIdx.x;
00688 float s=0.0f;
00689 for (int j=idx; j<N; j+=numInterData*numThreads)
00690 {
00691 s += dInData[j].x * dInData[j].x + dInData[j].y * dInData[j].y + dInData[j].z * dInData[j].z;
00692 }
00693 smem[threadIdx.x]=s;
00694 __syncthreads();
00695 if (threadIdx.x==0)
00696 {
00697 float s=0.0f;
00698 for (int i=0; i<numThreads; i++)
00699 {
00700 s+=smem[i];
00701 }
00702 dInterData[blockIdx.x]=s;
00703 }
00704 }
00705
00706
00707 __global__ void gpuComponentSumReduce1(FLOAT_ARRAY_TYPE *dInData, FLOAT_ARRAY_TYPE *dInterData, int numInterData, int numThreads, int N)
00708 {
00709 extern __shared__ FLOAT_ARRAY_TYPE smema[];
00710 int tid = threadIdx.x;
00711 int bid = blockIdx.x;
00712 int idx=bid*blockDim.x+tid;
00713 FLOAT_ARRAY_TYPE s;
00714 s.x = 0.0f;
00715 s.y = 0.0f;
00716 s.z = 0.0f;
00717 for (int j=idx; j<N; j+=numInterData*numThreads)
00718 {
00719 s.x += dInData[j].x;
00720 s.y += dInData[j].y;
00721 s.z += dInData[j].z;
00722 }
00723 smema[tid].x = s.x;
00724 smema[tid].y = s.y;
00725 smema[tid].z = s.z;
00726 __syncthreads();
00727 if (tid==0)
00728 {
00729 s.x = 0.0f;
00730 s.y = 0.0f;
00731 s.z = 0.0f;
00732 for (int i=0; i<numThreads; i++)
00733 {
00734 s.x += smema[i].x;
00735 s.y += smema[i].y;
00736 s.z += smema[i].z;
00737 }
00738 dInterData[bid].x = s.x;
00739 dInterData[bid].y = s.y;
00740 dInterData[bid].z = s.z;
00741 }
00742 }
00743
00744
00745 __global__ void gpuComponentSumReduce2(FLOAT_ARRAY_TYPE *dInterData, int numInterData, int numThreads)
00746 {
00747 extern __shared__ FLOAT_ARRAY_TYPE smema[];
00748 int tid=threadIdx.x;
00749 FLOAT_ARRAY_TYPE s;
00750 s.x = 0.0f;
00751 s.y = 0.0f;
00752 s.z = 0.0f;
00753
00754 for (int j=tid; j<numInterData; j+=numThreads)
00755 {
00756 s.x += dInterData[j].x;
00757 s.y += dInterData[j].y;
00758 s.z += dInterData[j].z;
00759 }
00760
00761 smema[tid].x = s.x;
00762 smema[tid].y = s.y;
00763 smema[tid].z = s.z;
00764 __syncthreads();
00765 if (tid==0)
00766 {
00767 s.x = 0.0f;
00768 s.y = 0.0f;
00769 s.z = 0.0f;
00770 for (int i=0; i<numThreads; i++)
00771 {
00772 s.x += smema[i].x;
00773 s.y += smema[i].y;
00774 s.z += smema[i].z;
00775 }
00776 dInterData[0].x = s.x;
00777 dInterData[0].y = s.y;
00778 dInterData[0].z = s.z;
00779 }
00780 }
00781
00782
00783 void sumReduction(float *dInData, float *dInterData, int numInterData, int numThreads, int N)
00784 {
00785 int smem = numThreads * sizeof(float);
00786 gpuReduce1<<<numInterData, numThreads, smem>>>(dInData, dInterData, numInterData, numThreads, N);
00787 checkCudaError("Kernel error: reduce 1");
00788 gpuReduce2<<<1, numThreads, smem>>>(dInterData, numInterData, numThreads);
00789 checkCudaError("Kernel error: reduce 2");
00790 }
00791
00792
00793
00794 void SqVecSumReduction(FLOAT_ARRAY_TYPE *dInData, float *dInterData, int numInterData, int numThreads, int N)
00795 {
00796 int smem = numThreads * sizeof(float);
00797 gpuSqSumReduce1<<<numInterData,numThreads, smem>>>(dInData, dInterData, numInterData, numThreads, N);
00798 checkCudaError("Kernel error: gpuSqSumReductionKernel");
00799 gpuReduce2<<<1,numThreads, smem>>>(dInterData, numInterData, numThreads);
00800 checkCudaError("Kernel error: reduce2");
00801 }
00802
00803
00804 void componentSumReduction(FLOAT_ARRAY_TYPE *dInData, FLOAT_ARRAY_TYPE *dInterData, int numInterData, int numThreads, int N)
00805 {
00806 int smem = numThreads * sizeof(FLOAT_ARRAY_TYPE);
00807 gpuComponentSumReduce1<<<numInterData, numThreads, smem>>>(dInData, dInterData, numInterData, numThreads, N);
00808 checkCudaError("Kernel error: component reduction kernel");
00809 gpuComponentSumReduce2<<<1, numThreads, smem>>>(dInterData, numInterData, numThreads);
00810 checkCudaError("Kernel error: component reduce 2");
00811 }
00812