mdgpu - a molecular dynamic simulation program using GPUs: /home/julian/svn/gpucpumd/mdgpu2/gpu

00001 
00022 #include "gpu_tools.cuh"
00023 
00024 
00025 //####################################### Verlet List functions ####################################//
00026 
00027 
00028 __global__ void gpuGenerateVerletList(FLOAT_ARRAY_TYPE *dPos, int *dVlists, size_t pitchDVlists, int *dVcount, float *dmax, float rListSq, int N, float boxSize, bool *doUpdate)
00029 {
00030     int idx = blockIdx.x * blockDim.x + threadIdx.x;
00031     if (idx < N)
00032     {
00033         FLOAT_ARRAY_TYPE d;
00034         dVcount[idx]=0;
00035         int* dVlist;
00036         for (int i=0; i<N; ++i)
00037         {
00038             if ( i != idx )
00039             {
00040                 gpuDistPBC(&d, &dPos[idx], &dPos[i], &boxSize);
00041 
00042                 if ( rListSq > (d.x * d.x + d.y * d.y + d.z *d.z) )
00043                 {
00044                     dVlist = (int*)((char*)dVlists + dVcount[idx] * pitchDVlists);
00045                     dVlist[idx] = i;
00046                     dVcount[idx] = dVcount[idx] + 1;
00047                 }
00048             }
00049         }
00050         dmax[idx]=0.0f;
00051     }
00052     //reset update flag
00053     (*doUpdate)=false;
00054 }
00055 
00056 
00057 __global__ void gpuGenerateVerletListVar1(FLOAT_ARRAY_TYPE *dPos, int *dVlists, size_t pitchDVlists, int *dVcount, float *dmax, float rListSq, int N, float boxSize, bool *doUpdate)
00058 {
00059     int idx = blockIdx.x * blockDim.x + threadIdx.x;
00060     if (idx < N)
00061     {
00062         FLOAT_ARRAY_TYPE tPos = dPos[idx]; //save reference particle position in register
00063         FLOAT_ARRAY_TYPE d;
00064         dVcount[idx]=0;
00065         int* dVlist;
00066         for (int i=0; i<N; ++i)
00067         {
00068             if ( i != idx )
00069             {
00070                 gpuDistPBC(&d, &tPos, &dPos[i], &boxSize);
00071                 if ( rListSq > (d.x * d.x + d.y * d.y + d.z *d.z) )
00072                 {
00073                     dVlist = (int*)((char*)dVlists + dVcount[idx] * pitchDVlists);
00074                     dVlist[idx] = i;
00075                     dVcount[idx] = dVcount[idx] + 1;
00076                 }
00077             }
00078         }
00079         dmax[idx]=0.0f;
00080     }
00081     //reset update flag
00082     (*doUpdate)=false;
00083 }
00084 
00085 
00086 __global__ void gpuGenerateVerletListSmem(FLOAT_ARRAY_TYPE *dPos, int *dVlists, size_t pitchDVlists, int *dVcount, float *dmax, float rListSq, int N, float boxSize, bool *doUpdate)
00087 {
00088     //CONSTRAINT: numBlocks * numThreadsPerBlock == N, otherwise __synchthread() called can not work!
00089     extern __shared__ FLOAT_ARRAY_TYPE sPos[]; //allocate shared memory
00090     int tid = threadIdx.x;
00091     int idx = blockIdx.x * blockDim.x + tid;
00092     if (idx < N)
00093     {
00094         FLOAT_ARRAY_TYPE tPos = dPos[idx];//save reference particle position in register
00095         FLOAT_ARRAY_TYPE d;
00096         dVcount[idx]=0;
00097         int* dVlist;
00098 
00099         for(int j=0; j<gridDim.x; ++j)
00100         {
00101             int bOffset = j * blockDim.x;
00102             sPos[tid] = dPos[bOffset+tid]; //fetch block of particle positions and save in shared memory
00103             __syncthreads();
00104             for (int i=0; i<blockDim.x; ++i)
00105             {
00106                 int id = bOffset + i;
00107                 if ( id != idx && id < N)
00108                 {
00109                     gpuDistPBC(&d, &tPos, &sPos[i], &boxSize); //use broadcast from shared memory -> full coalescence
00110                     if ( rListSq > (d.x * d.x + d.y * d.y + d.z *d.z) )
00111                     {
00112                         dVlist = (int*)((char*)dVlists + dVcount[idx] * pitchDVlists);
00113                         dVlist[idx] = id;
00114                         dVcount[idx] = dVcount[idx] + 1;
00115                     }
00116                 }
00117             }
00118             __syncthreads();
00119         }
00120         dmax[idx]=0.0f;
00121     }
00122     //reset update flag
00123     (*doUpdate)=false;
00124 }
00125 
00126 
00127 
00128 __global__ void gpuGenerateVerletListSmemVar1(FLOAT_ARRAY_TYPE *dPos, int *dVlists, size_t pitchDVlists, int *dVcount, float *dmax, float rListSq, int N, float boxSize, bool *doUpdate)
00129 {
00130     extern __shared__ FLOAT_ARRAY_TYPE sPos[]; //allocate shared memory
00131     int tid = threadIdx.x;
00132     int offset = blockDim.x * blockIdx.x;
00133     int idx = offset + tid;
00134     if (idx < N)
00135     {
00136         sPos[tid] = dPos[idx];//save reference particle position in smem
00137         __syncthreads();
00138         FLOAT_ARRAY_TYPE d;
00139         dVcount[idx]=0;
00140         int* dVlist;
00141 
00142         //block before thread block in device memory
00143         for(int j=0; j< offset; ++j)
00144         {
00145             gpuDistPBC(&d, &sPos[tid], &dPos[j], &boxSize);
00146             if ( rListSq > (d.x * d.x + d.y * d.y + d.z *d.z) )
00147             {
00148                 dVlist = (int*)((char*)dVlists + dVcount[idx] * pitchDVlists);
00149                 dVlist[idx] = j;
00150                 dVcount[idx] = dVcount[idx] + 1;
00151             }
00152         }
00153         //block in shared memory
00154         for(int j=0; j<blockDim.x; ++j)
00155         {
00156             if(j != tid)//exclude self interaction
00157             {
00158                 gpuDistPBC(&d, &sPos[tid], &sPos[j], &boxSize);
00159                 if ( rListSq > (d.x * d.x + d.y * d.y + d.z *d.z) )
00160                 {
00161                     dVlist = (int*)((char*)dVlists + dVcount[idx] * pitchDVlists);
00162                     dVlist[idx] = offset + j;
00163                     dVcount[idx] = dVcount[idx] + 1;
00164                 }
00165             }
00166         }
00167         //block after threadblock in device memory
00168         for(int j=offset+blockDim.x; j<N; ++j)
00169         {
00170             gpuDistPBC(&d, &sPos[tid], &dPos[j], &boxSize);
00171             if ( rListSq > (d.x * d.x + d.y * d.y + d.z *d.z) )
00172             {
00173                 dVlist = (int*)((char*)dVlists + dVcount[idx] * pitchDVlists);
00174                 dVlist[idx] = j;
00175                 dVcount[idx] = dVcount[idx] + 1;
00176             }
00177         }
00178         dmax[idx]=0.0f;
00179     }
00180     //reset update flag
00181     (*doUpdate)=false;
00182 }
00183 
00184 
00185 //####################################### NVT ensemble methods ####################################//
00186 
00187 
00188 void rescaleT(int N, int dim, int numInterData, int numThreads, float T, int threads, int blocks, FLOAT_ARRAY_TYPE *dVel, float *sumSq)
00189 {
00190     //get sum of squares of velocities
00191     SqVecSumReduction(dVel, sumSq, numInterData, numThreads, N);
00192 
00193     //rescale velocities to desired temperatures
00194     gpuRescaleTKernel<<<blocks, threads>>>(N, dim, T, dVel, sumSq);
00195     checkCudaError("Kernel error: gpuRescaleTKernel");
00196 }
00197 
00198 __global__ void gpuRescaleTKernel(int N, int dim, float T, FLOAT_ARRAY_TYPE *dVel, float *vSumSq)
00199 {
00200     int idx = blockIdx.x * blockDim.x + threadIdx.x;
00201     float factor = __fsqrt_rn(dim*N*T/vSumSq[0]);
00202     if (idx < N)
00203     {
00204         dVel[idx].x = dVel[idx].x * factor;
00205         dVel[idx].y = dVel[idx].y * factor;
00206         dVel[idx].z = dVel[idx].z * factor;
00207     }
00208 }
00209 
00210 
00211 //####################################### general MD helper functions ####################################//
00212 
00213 void removeDrift(int N, int numInterData, int numThreads, int threads, int blocks, FLOAT_ARRAY_TYPE *dVel, FLOAT_ARRAY_TYPE *compSum)
00214 {
00215     //get component wise sum of positon
00216     componentSumReduction(dVel, compSum, numInterData, numThreads, N);
00217 
00218     //remove drift
00219     gpuRemoveDriftKernel<<<blocks, threads>>>(N, dVel, compSum);
00220     checkCudaError("Kernel error: gpuRemoveDriftKernel");
00221 }
00222 
00223 __global__ void gpuRemoveDriftKernel(int N, FLOAT_ARRAY_TYPE *dVel, FLOAT_ARRAY_TYPE *compSum)
00224 {
00225     int id = blockIdx.x * blockDim.x + threadIdx.x;
00226     FLOAT_ARRAY_TYPE factor;
00227     factor.x = -compSum[0].x/N;
00228     factor.y = -compSum[0].y/N;
00229     factor.z = -compSum[0].z/N;
00230     if (id < N)
00231     {
00232         dVel[id].x += factor.x;
00233         dVel[id].y += factor.y;
00234         dVel[id].z += factor.z;
00235     }
00236 }
00237 
00238 __global__ void gpuGetVelOnTime(FLOAT_ARRAY_TYPE *dVel, FLOAT_ARRAY_TYPE *dVt, FLOAT_ARRAY_TYPE *dForce, int N, float dt)
00239 {
00240     int idx = blockIdx.x * blockDim.x + threadIdx.x;
00241     if (idx < N)
00242     {
00243         dVt[idx].x = dVel[idx].x + 0.5f * dt * dForce[idx].x;
00244         dVt[idx].y = dVel[idx].y + 0.5f * dt * dForce[idx].y;
00245         dVt[idx].z = dVel[idx].z + 0.5f * dt * dForce[idx].z;
00246     }
00247 }
00248 
00249 
00250 
00251 //####################################### Observable functions ####################################//
00252 
00253 
00254 
00255 float getKineticEnergy(FLOAT_ARRAY_TYPE *dVel, int &N, float *sumSq, int &maxThreads, int &maxBlocks)
00256 {
00257     //get sum of squares
00258     SqVecSumReduction(dVel, sumSq, 64,64, N);
00259     float e;
00260     cudaMemcpy(&e, sumSq, sizeof(float), cudaMemcpyDeviceToHost);
00261     e *= 0.5f;
00262     return e;
00263 }
00264 
00265 
00266 
00267 
00268 
00269 float getHarmonicPotentialEnergy(float *dEpot, FLOAT_ARRAY_TYPE *dPos, float *sum, int *cnn, size_t &pitch, int &N, int &numNN, float &k, float &boxSize, dim3 &dimGrid, dim3 &dimBlock, int &maxThreads, int &maxBlocks)
00270 {
00271     float e;
00272     gpuGetHarmonicPotentialEnergyKernel<<< dimGrid, dimBlock>>>(dEpot, dPos, cnn, pitch, N, numNN, k, boxSize);
00273     checkCudaError("Kernel error: gpu_get_harmonic_potential");
00274     sumReduction(dEpot, sum, 64, 64, N);
00275     cudaMemcpy(&e, sum, sizeof(float), cudaMemcpyDeviceToHost);
00276     return e;
00277 }
00278 
00279 
00280 __global__ void gpuGetHarmonicPotentialEnergyKernel(float *dEpot, FLOAT_ARRAY_TYPE *dPos, int *cnn, size_t pitch, int N, int numNN, float k, float boxSize)
00281 {
00282     int idx = blockDim.x * blockIdx.x + threadIdx.x;
00283     FLOAT_ARRAY_TYPE d;
00284     if (idx < N)
00285     {
00286         dEpot[idx] = 0.0f;
00287         int pid;
00288         int *row = (int*)((char*)cnn + idx * pitch);
00289         for (int c = 0; c < numNN; ++c)
00290         {
00291             pid = row[c];
00292             if (pid != -1)
00293             {
00294                 gpuDistPBC(&d, &dPos[idx], &dPos[pid], &boxSize);
00295                 dEpot[idx] += d.x * d.x + d.y * d.y + d.z * d.z;
00296             }
00297         }
00298         dEpot[idx] *= 0.25f*k;
00299     }
00300 }
00301 
00302 
00303 float getLJCutPotentialEnergy(FLOAT_ARRAY_TYPE *dPos, float *dEpot, float *sum, int &N, float &boxSize, float &rCutSq, dim3 &dimGrid, dim3 &dimBlock, int &maxThreads, int &maxBlocks)
00304 {
00305     float e;
00306     gpuLJCutPotentialEnergyKernel<<< dimGrid, dimBlock>>>(dPos, dEpot, N, boxSize, rCutSq);
00307     checkCudaError("Kernel error: gpuLJCutPotentialEnergyKernel");
00308     sumReduction(dEpot, sum, 64, 64, N);
00309     cudaMemcpy(&e, sum, sizeof(float), cudaMemcpyDeviceToHost);
00310     return e;
00311 }
00312 
00313 
00314 __global__ void gpuLJCutPotentialEnergyKernel(FLOAT_ARRAY_TYPE *dPos, float *dEpot, int N, float boxSize, float rCutSq)
00315 {
00316     //Reduced units are used -> epsilon=sigma=1
00317     int idx = blockDim.x * blockIdx.x + threadIdx.x;
00318     if (idx < N)
00319     {
00320         FLOAT_ARRAY_TYPE d;
00321         float invR6;
00322         dEpot[idx]=0.0f;//clear
00323         for (int i=0; i<N; ++i)
00324         {
00325             if ( i != idx) //excude self interaction
00326             {
00327                 gpuDistPBC( &d, &dPos[idx],  &dPos[i], &boxSize);
00328                 invR6 = d.x * d.x + d.y * d.y + d.z * d.z;
00329                 if ( rCutSq > invR6 )
00330                 {
00331                     invR6 = 1.0f / invR6;
00332                     invR6 = powf( invR6, 3.0f );
00333                     dEpot[idx] += 2.0f * invR6 * (invR6 - 1.0f);
00334                 }
00335             }
00336         }
00337     }
00338 }
00339 
00340 //####################################### CUDA helper functions ####################################//
00341 
00342 
00343 inline static unsigned long inKB(unsigned long bytes)
00344 {
00345     return bytes/1024;
00346 }
00347 
00348 inline static unsigned long inMB(unsigned long bytes)
00349 {
00350     return bytes/(1048576);
00351 }
00352 
00353 
00354 bool memInfo()
00355 {
00356     bool success=true;
00357     unsigned int free, total;
00358     CUresult res;
00359     res = cuMemGetInfo(&free, &total);
00360     if (res != CUDA_SUCCESS)
00361     {
00362         std::cout<<"uMemGetInfo failed: "<<res<<"\n";
00363         success=false;
00364     }
00365     else
00366     {
00367         std::cout<<"________________ Memory usage ________________\n";
00368         std::cout<<"Free : "<<free<<" bytes ("<<inKB(free)<<" KB) ("<<inMB(free)<<" MB)\n";
00369         std::cout<<"Used : "<<total-free<<" bytes ("<<inKB(total-free)<<" KB) ("<<inMB(total-free)<<" MB)\n";
00370         std::cout<<"Total: "<<total<<" bytes ("<<inKB(total)<<" KB) ("<<inMB(total)<<" MB)\n";
00371         std::cout<<"-----> "<<100.0*free/(double)total<<"% free, "<<100.0*(total - free)/(double)total<<"% used\n";
00372         std::cout<<"______________________________________________\n";
00373     }
00374     return success;
00375 }
00376 
00377 
00378 void checkCudaError(const char *identifier)
00379 {
00380     cudaThreadSynchronize();
00381     cudaError_t err = cudaGetLastError();
00382     if ( cudaSuccess != err)
00383     {
00384         std::cout<<"Cuda error: "<<identifier<<" : "<< cudaGetErrorString( err) <<"\n";
00385         exit(EXIT_FAILURE);
00386     }
00387 }
00388 
00389 
00390 
00391 __global__ void copyDevToDev(FLOAT_ARRAY_TYPE *dA, FLOAT_ARRAY_TYPE *dB, int N)
00392 {
00393     int idx = blockDim.x * blockIdx.x + threadIdx.x;
00394     if (idx < N)
00395     {
00396         dB[idx].x = dA[idx].x;
00397         dB[idx].y = dA[idx].y;
00398         dB[idx].z = dA[idx].z;
00399     }
00400 }
00401 
00402 
00403 __global__ void gpuSimpleSqSum(FLOAT_ARRAY_TYPE *dVal, float*sqSum, int N)
00404 {
00405     int idx=blockIdx.x*blockDim.x+threadIdx.x;
00406     if (idx==0)
00407     {
00408         (*sqSum)=0.0f;
00409         for (int i=0; i<N; ++i)
00410         {
00411             (*sqSum) += dVal[i].x * dVal[i].x + dVal[i].y * dVal[i].y + dVal[i].z * dVal[i].z;
00412         }
00413     }
00414 }
00415 
00416 __global__ void gpuSimpleComponentSum(FLOAT_ARRAY_TYPE *dVal, FLOAT_ARRAY_TYPE *sum, int N)
00417 {
00418     int idx=blockIdx.x*blockDim.x+threadIdx.x;
00419     if (idx==0)
00420     {
00421         (*sum).x=0.0f;
00422         (*sum).y=0.0f;
00423         (*sum).z=0.0f;
00424         for (int i=0; i<N; ++i)
00425         {
00426             (*sum).x += dVal[i].x;
00427             (*sum).y += dVal[i].y;
00428             (*sum).z += dVal[i].z;
00429         }
00430     }
00431 }
00432 
00433 
00434 
00435 //####################################### Screen output functions ####################################//
00436 
00437 
00438 void showPositions(FLOAT_ARRAY_TYPE *hPos, int N)
00439 {
00440     for (int i=0; i<N; ++i)
00441     {
00442         std::cout<<"hPos["<<i<<"] = "<<hPos[i].x<<" , "<<hPos[i].y<<" , "<<hPos[i].z<<"\n";
00443     }
00444 }
00445 
00446 void showVelocities(FLOAT_ARRAY_TYPE *hVel, int &N)
00447 {
00448     for (int i=0; i<N; ++i)
00449     {
00450         std::cout<<"hVel["<<i<<"] = "<<hVel[i].x<<" , "<<hVel[i].y<<" , "<<hVel[i].z<<"\n";
00451     }
00452 }
00453 
00454 void showForces(FLOAT_ARRAY_TYPE *hForce, int &N)
00455 {
00456     for (int i=0; i<N; ++i)
00457     {
00458         std::cout<<"hForce["<<i<<"] = "<<hForce[i].x<<" , "<<hForce[i].y<<" , "<<hForce[i].z<<"\n";
00459     }
00460 }
00461 
00462 void showDeviceArray(FLOAT_ARRAY_TYPE *dArray, int N)
00463 {
00464     FLOAT_ARRAY_TYPE *hArray = new FLOAT_ARRAY_TYPE[N];
00465     cudaMemcpy(hArray, dArray, sizeof(FLOAT_ARRAY_TYPE)*N, cudaMemcpyDeviceToHost);
00466     std::cout<<"---- Device array ----\n\n";
00467     for (int i=0; i<N; ++i)
00468     {
00469         std::cout<<i<<": "<<hArray[i].x<<" "<<hArray[i].y<<" "<<hArray[i].z<<"\n";
00470     }
00471     std::cout<<"----------------------\n\n";
00472     delete[] hArray;
00473 }
00474 
00475 void showNeareastNeighborList(int *dNNListArray, int &N, int &numNN)
00476 {
00477     for (int i=0; i<N; ++i)
00478     {
00479         int id=numNN*i;
00480         std::cout<<"---------------------\nparticle: "<<i<<"\nneighbors:\n";
00481         for (int j=0; j<numNN; ++j)
00482         {
00483             std::cout<<dNNListArray[id+j]<<"\n";
00484         }
00485     }
00486     std::cout<<"------------------------\n";
00487     std::cout<<"------------------------\n";
00488 }
00489 
00490 void showVerletList(int *dVlists, size_t pitchVlists, int *dVcount, int N, int NList)
00491 {
00492     int *hVerletList=new int[N*NList];
00493     int *hCount = new int[N];
00494     cudaMemcpy(hCount, dVcount, N * sizeof(int), cudaMemcpyDeviceToHost);
00495     cudaMemcpy2D(hVerletList, N*sizeof(int), dVlists, pitchVlists, N * sizeof(int), NList, cudaMemcpyDeviceToHost);
00496     for (int i=0; i<10; ++i)
00497     {
00498         std::cout<<"i: "<<i<<" ; vcount: "<<hCount[i]<<" -->";
00499         for (int j=0; j<hCount[i]; ++j)
00500         {
00501             std::cout<<" "<<hVerletList[i + j*N];
00502         }
00503         std::cout<<"\n_________________\n";
00504     }
00505     int maxentry=0;
00506     int indexmax;
00507     for (int i=0; i<N; ++i)
00508     {
00509         if (maxentry < hCount[i])
00510         {
00511             maxentry = hCount[i];
00512             indexmax=i;
00513         }
00514     }
00515     std::cout<<"max entry: "<<maxentry<<" @ index: "<<indexmax<<"\n";
00516     delete[] hVerletList;
00517     delete[] hCount;
00518 }
00519 
00520 //####################################### Simulation parameter functions ####################################//
00521 
00522 bool setCudaDevice(int gpuDevNo)
00523 {
00524     //check if desired device ID is there
00525     int deviceCount;
00526     cudaGetDeviceCount(&deviceCount);
00527     if(deviceCount <= gpuDevNo)
00528     {
00529         std::cout<<"Device ID '"<<gpuDevNo<<"' not present!\n";
00530         return false;
00531     }
00532 
00533     cudaSetDevice( gpuDevNo );
00534     cudaDeviceProp prop;         // get device properties
00535     cudaGetDevice(&gpuDevNo);
00536     cudaGetDeviceProperties(&prop, gpuDevNo);
00537     std::cout<<"_______________________________________________________\n";
00538     std::cout<<"Number of GPU devices: "<<deviceCount<<"\n";
00539     std::cout<<"Running on GPU device #: "<<gpuDevNo<<"\n";
00540     std::cout<<"GPU processor #: "<<8*prop.multiProcessorCount<<"\n";
00541     std::cout<<"Computing Capability: ["<<prop.major<<"."<< prop.minor<<"]\n";
00542     std::cout<<"_______________________________________________________\n";
00543     return true;
00544 }
00545 
00546 
00547 
00548 bool generateStartScript(std::string exeName)
00549 {
00550     bool ok;
00551     std::string scriptname = "example.sh";
00552     std::string inputfile = "N16_T0.5_g2.0_k0.1_init.dat";
00553     std::string basename = "N16_T0.5_g2.0_k0.1_";
00554     int dim = 3;
00555     int N = 4096;
00556     int numNN = 6;
00557     float T = 0.5;
00558     float k = 0.1;
00559     float boxSize = 32;
00560     float dt = 0.004;
00561     int NOffset = 0;
00562     int NSim = 10000;
00563     int NOut = 1000;
00564     int NAdjust = 100;
00565     int gpuDevNo=0;
00566     ok =        generateScript(scriptname, exeName, inputfile, basename, dim, N, numNN, T, k, boxSize, dt, NOffset, NSim, NOut, NAdjust, gpuDevNo);
00567     return ok;
00568 }
00569 
00570 
00571 bool generateRestartScript(std::string exeName, std::string basename, int dim, int N, int numNN, float T, float k, float boxSize, float dt, int NOffset, int NSim, int NOut, int NAdjust, int gpuDevNo)
00572 {
00573     std::ostringstream ssFile;
00574     std::string scriptname;
00575     ssFile<<basename<<"_continue.sh";
00576     scriptname = ssFile.str();
00577     ssFile.str("");
00578     ssFile.clear();
00579     ssFile.fill('0');
00580     ssFile<<basename<<"";
00581     ssFile.width(10);
00582     ssFile<<NOffset;
00583     ssFile.width(1);
00584     ssFile<<".dat";
00585     std::string inputfile = ssFile.str();
00586     bool ok;
00587     ok =        generateScript(scriptname, exeName, inputfile, basename, dim, N, numNN, T, k, boxSize, dt, NOffset, NSim, NOut, NAdjust, gpuDevNo);
00588     return ok;
00589 }
00590 
00591 bool generateScript(std::string scriptname, std::string exeName, std::string inputfile, std::string basename, int dim, int N, int numNN, float T, float k, float boxSize, float dt, int NOffset, int NSim, int NOut, int NAdjust, int gpuDevNo)
00592 {
00593     std::ofstream out(scriptname.c_str());
00594     if (!out.good())
00595     {
00596         std::cout<<"'"<<scriptname<<"' couldn't be created!\n";
00597         return 0;
00598     }
00599     out<<"#!/bin/bash\n\n";
00600     out<<"# input x-v-data file name\n";
00601     out<<"input="<<inputfile<<" \n\n";
00602     out<<"# output base filename\n";
00603     out<<"basename="<<basename<<"\n\n";
00604     out<<"# Dimension of the simulation \n";
00605     out<<"dim="<<dim<<"\n\n";
00606     out<<"# particle number \n";
00607     out<<"N="<<N<<"\n\n";
00608     out<<"# maximum number of coupled nearest neighbors\n";
00609     out<<"num_nn="<<numNN<<"\n\n";
00610     out<<"# temperature\n";
00611     out<<"temperature="<<T<<"\n\n";
00612     out<<"# spring constant:\n";
00613     out<<"k="<<k<<"\n\n";
00614     out<<"# edge length of the simulation box\n";
00615     out<<"box="<<boxSize<<"\n\n";
00616     out<<"# timestep\n";
00617     out<<"timestep="<<dt<<"\n\n";
00618     out<<"# offset for simulation steps\n";
00619     out<<"Noffset="<<NOffset<<"\n\n";
00620     out<<"# simulate till NSim steps are done\n";
00621     out<<"Nsim="<<NSim<<"\n\n";
00622     out<<"# output intervall (CONSTRAINT: must be a multiple of 'Nadjust')\n";
00623     out<<"Nout="<<NOut<<"\n\n";
00624     out<<"# NVT velocity scaling and drift removal intervall\n";
00625     out<<"Nadjust="<<NAdjust<<"\n\n";
00626     out<<"# number of GPU device to run the job on\n";
00627     out<<"gpuDevNo="<<gpuDevNo<<"\n\n##################\n\n";
00628     out<<exeName<<" $input $basename $dim $N $num_nn $temperature $k $box $timestep $Noffset $Nsim $Nout $Nadjust $gpuDevNo\n";
00629     return 0;
00630 }
00631 
00632 
00633 //####################################### Parallel Primitives ####################################//
00634 
00635 
00636 
00637 __global__ void gpuReduce1(float *dInData, float *dInterData, int numInterData, int numThreads, int N)
00638 {
00639     extern __shared__ float smem[];
00640     int idx=blockIdx.x*blockDim.x+threadIdx.x;
00641     float s=0.0f;
00642     for (int j=idx; j<N; j+=numInterData*numThreads)
00643     {
00644         s += dInData[j];
00645     }
00646     smem[threadIdx.x]=s;
00647     __syncthreads();
00648     if (threadIdx.x==0)
00649     {
00650         float s=0.0f;
00651         for (int i=0; i<numThreads; i++)
00652         {
00653             s+=smem[i];
00654         }
00655         dInterData[blockIdx.x]=s;
00656     }
00657 }
00658 
00659 
00660 __global__ void gpuReduce2(float *dInterData, int numInterData, int numThreads)
00661 {
00662     extern __shared__ float smem[];
00663     int idx=threadIdx.x;
00664     float s=0.0f;
00665     for (int j=idx; j<numInterData; j+=numThreads)
00666     {
00667         s+=dInterData[j];
00668     }
00669     smem[threadIdx.x]=s;
00670     __syncthreads();
00671     if (idx==0)
00672     {
00673         float s=0.0f;
00674         for (int i=0; i<numThreads; i++)
00675         {
00676             s+=smem[i];
00677         }
00678         dInterData[0]=s;
00679     }
00680 }
00681 
00682 
00683 
00684 __global__ void gpuSqSumReduce1(FLOAT_ARRAY_TYPE *dInData, float *dInterData, int numInterData, int numThreads, int N)
00685 {
00686     extern __shared__ float smem[];
00687     int idx=blockIdx.x*blockDim.x+threadIdx.x;
00688     float s=0.0f;
00689     for (int j=idx; j<N; j+=numInterData*numThreads)
00690     {
00691         s += dInData[j].x * dInData[j].x + dInData[j].y * dInData[j].y + dInData[j].z * dInData[j].z;
00692     }
00693     smem[threadIdx.x]=s;
00694     __syncthreads();
00695     if (threadIdx.x==0)
00696     {
00697         float s=0.0f;
00698         for (int i=0; i<numThreads; i++)
00699         {
00700             s+=smem[i];
00701         }
00702         dInterData[blockIdx.x]=s;
00703     }
00704 }
00705 
00706 
00707 __global__ void gpuComponentSumReduce1(FLOAT_ARRAY_TYPE *dInData, FLOAT_ARRAY_TYPE *dInterData, int numInterData, int numThreads, int N)
00708 {
00709     extern __shared__ FLOAT_ARRAY_TYPE smema[];
00710     int tid = threadIdx.x;
00711     int bid = blockIdx.x;
00712     int idx=bid*blockDim.x+tid;
00713     FLOAT_ARRAY_TYPE s;
00714     s.x = 0.0f;
00715     s.y = 0.0f;
00716     s.z = 0.0f;
00717     for (int j=idx; j<N; j+=numInterData*numThreads)
00718     {
00719         s.x += dInData[j].x;
00720         s.y += dInData[j].y;
00721         s.z += dInData[j].z;
00722     }
00723     smema[tid].x = s.x;
00724     smema[tid].y = s.y;
00725     smema[tid].z = s.z;
00726     __syncthreads();
00727     if (tid==0)
00728     {
00729         s.x = 0.0f;
00730         s.y = 0.0f;
00731         s.z = 0.0f;
00732         for (int i=0; i<numThreads; i++)
00733         {
00734             s.x += smema[i].x;
00735             s.y += smema[i].y;
00736             s.z += smema[i].z;
00737         }
00738         dInterData[bid].x = s.x;
00739         dInterData[bid].y = s.y;
00740         dInterData[bid].z = s.z;
00741     }
00742 }
00743 
00744 
00745 __global__ void gpuComponentSumReduce2(FLOAT_ARRAY_TYPE *dInterData, int numInterData, int numThreads)
00746 {
00747     extern __shared__ FLOAT_ARRAY_TYPE smema[];
00748     int tid=threadIdx.x;
00749     FLOAT_ARRAY_TYPE s;
00750     s.x = 0.0f;
00751     s.y = 0.0f;
00752     s.z = 0.0f;
00753 
00754     for (int j=tid; j<numInterData; j+=numThreads)
00755     {
00756         s.x += dInterData[j].x;
00757         s.y += dInterData[j].y;
00758         s.z += dInterData[j].z;
00759     }
00760 
00761     smema[tid].x = s.x;
00762     smema[tid].y = s.y;
00763     smema[tid].z = s.z;
00764     __syncthreads();
00765     if (tid==0)
00766     {
00767         s.x = 0.0f;
00768         s.y = 0.0f;
00769         s.z = 0.0f;
00770         for (int i=0; i<numThreads; i++)
00771         {
00772             s.x += smema[i].x;
00773             s.y += smema[i].y;
00774             s.z += smema[i].z;
00775         }
00776         dInterData[0].x = s.x;
00777         dInterData[0].y = s.y;
00778         dInterData[0].z = s.z;
00779     }
00780 }
00781 
00782 
00783 void sumReduction(float *dInData, float *dInterData, int numInterData, int numThreads, int N)
00784 {
00785     int smem = numThreads * sizeof(float);
00786     gpuReduce1<<<numInterData, numThreads, smem>>>(dInData, dInterData, numInterData, numThreads, N);
00787     checkCudaError("Kernel error: reduce 1");
00788     gpuReduce2<<<1, numThreads, smem>>>(dInterData, numInterData, numThreads);
00789     checkCudaError("Kernel error: reduce 2");
00790 }
00791 
00792 
00793 
00794 void SqVecSumReduction(FLOAT_ARRAY_TYPE *dInData, float *dInterData, int numInterData, int numThreads, int N)
00795 {
00796     int smem = numThreads * sizeof(float);
00797     gpuSqSumReduce1<<<numInterData,numThreads, smem>>>(dInData, dInterData, numInterData, numThreads, N);
00798     checkCudaError("Kernel error: gpuSqSumReductionKernel");
00799     gpuReduce2<<<1,numThreads, smem>>>(dInterData, numInterData, numThreads);
00800     checkCudaError("Kernel error: reduce2");
00801 }
00802 
00803 
00804 void componentSumReduction(FLOAT_ARRAY_TYPE *dInData, FLOAT_ARRAY_TYPE *dInterData, int numInterData, int numThreads, int N)
00805 {
00806     int smem = numThreads * sizeof(FLOAT_ARRAY_TYPE);
00807     gpuComponentSumReduce1<<<numInterData, numThreads, smem>>>(dInData, dInterData, numInterData, numThreads, N);
00808     checkCudaError("Kernel error: component reduction kernel");
00809     gpuComponentSumReduce2<<<1, numThreads, smem>>>(dInterData, numInterData, numThreads);
00810     checkCudaError("Kernel error: component reduce 2");
00811 }
00812
/home/julian/svn/gpucpumd/mdgpu2/gpu_tools.cu