00001
00022 #include <iostream>
00023 #include <fstream>
00024 #include <string>
00025 #include <cmath>
00026 #include <assert.h>
00027 #include <cstdlib>
00028 #include <limits>
00029 #include <cuda.h>
00030 #include <ctime>
00031 #include "gpu_forces.cuh"
00032 #include "gpu_integrator.cuh"
00033 #include "gpu_tools.cuh"
00034 #include "define.h"
00035 #include "data.h"
00036 #include <cstring>
00037 #include "tests.cuh"
00038
00039
// Entry point for the GPU molecular-dynamics driver: Lennard-Jones particles
// with harmonic bonds, leap-frog integration, Verlet neighbour lists, periodic
// temperature rescaling / drift removal, and both linear (backup) and
// logarithmically spaced trajectory output.
//
// Expects exactly 14 user arguments (argc == 15):
//   1  inputfile  - initial configuration file
//   2  basename   - base name for output files
//   3  dim        - spatial dimensionality (used in temperature rescaling)
//   4  N          - number of particles
//   5  numNN      - bonded nearest neighbours per particle
//   6  T          - target temperature
//   7  k          - harmonic bond spring constant
//   8  boxSize    - periodic box edge length
//   9  dt         - integration time step
//   10 NOffset    - step index of the initial configuration
//   11 NSim       - last step to simulate
//   12 NOut       - interval between linear (backup) outputs
//   13 NAdjust    - interval between thermostat/drift adjustments
//   14 gpuDevNo   - CUDA device index to run on
//
// Returns 0 on success, 1 on argument/device/input errors.
int main(int argc, char *argv[])
{
    // ---- command-line parsing --------------------------------------------
    if (argc != 15)
    {
        std::cout << "Argument parsing failed!\nPlease edit 'example.sh' for your needs!\n";
        std::string exeName = argv[0];
        std::cout << exeName << "\n";
        std::cout << "Number of given arguments: " << argc << "\n";
        // Fix: dropped the unused 'tmp' local; only the side effect of
        // emitting a template start script is wanted here.
        (void)generateStartScript(exeName);
        return 1;
    }
    std::string inputfile = argv[1];
    std::string basename = argv[2];
    int dim = atoi(argv[3]);
    int N = atoi(argv[4]);
    int numNN = atoi(argv[5]);
    float T = atof(argv[6]);
    float k = atof(argv[7]);
    float boxSize = atof(argv[8]);
    float dt = atof(argv[9]);
    int NOffset = atoi(argv[10]);
    int NSim = atoi(argv[11]);
    int NOut = atoi(argv[12]);
    int NAdjust = atoi(argv[13]);
    int gpuDevNo = atoi(argv[14]);

    bool success = true;
    bool linLogEqual = false;  // set when a log-scale output coincides with a linear one
    bool NthreadEqN = true;    // true iff blocks*threads == N (enables the smem kernel)

    success = setCudaDevice(gpuDevNo);
    if (!success)
    {
        return 1;
    }

    // ---- interaction and neighbour-list radii ----------------------------
    float rCut = 3. * pow(2., 1. / 6.);  // LJ cutoff: 3 * 2^(1/6)
    float rCutSq = rCut * rCut;
    float rVer = 1.5 * rCut;             // Verlet radius = cutoff + skin
    float rVerSq = rVer * rVer;
    int numPointsPerDecade = 20;         // density of logarithmic output points

    // ---- kernel launch configuration -------------------------------------
    int threads = 128;
    int blocks = N / threads + (N % threads == 0 ? 0 : 1);  // ceil(N/threads)
    if (blocks >= 65536)
    {
        // Fix: corrected "exeeded" -> "exceeded" in the error message.
        std::cout << "Maximum block number (65536) exceeded: " << blocks << "\n";
        return 1;
    }
    if (blocks * threads != N)
    {
        NthreadEqN = false;  // tail block is partially filled; use guarded kernel variant
    }
    dim3 dimGrid(blocks);
    dim3 dimBlock(threads);

    // dimensions of the two-stage reduction scratch buffers
    int redM = 128;
    int redN = 128;

    // ---- logarithmic output schedule -------------------------------------
    // Output steps are spaced at equal ratios: numPointsPerDecade points per
    // factor of 10 in step number, starting from NOffset (or 1 if NOffset==0).
    // Fix: removed the unused 'numLogOutputPoints' local, whose computation
    // evaluated log(NOffset) = -inf for NOffset == 0 (UB on the int cast).
    double logInc = (log(10) - log(1)) / static_cast<double>(numPointsPerDecade);
    int LogOutputCounter = 1;
    int LogOutputOffset = NOffset == 0 ? 1 : NOffset;
    int nextLogOutput = static_cast<int>(exp(logInc * LogOutputCounter) * LogOutputOffset);
    std::string backup = "_bkp_" + basename;

    // ---- buffers ----------------------------------------------------------
    FLOAT_ARRAY_TYPE *hPos;        // host positions
    FLOAT_ARRAY_TYPE *hVel;        // host velocities
    FLOAT_ARRAY_TYPE *dPos;        // device positions
    FLOAT_ARRAY_TYPE *dVel;        // device velocities (leap-frog, half-step offset)
    FLOAT_ARRAY_TYPE *velOnDt;     // device velocities synchronized to on-step time
    FLOAT_ARRAY_TYPE *dForce;      // device forces
    float *dEPot;                  // potential-energy scratch (allocated but not used below)
    float *reduction;              // scalar-reduction scratch (temperature)
    FLOAT_ARRAY_TYPE *compReduction;  // component-wise reduction scratch (drift)
    int *hCnn;                     // host bond table: numNN rows of N entries
    int *dCnn;                     // device bond table (pitched)

    // Verlet neighbour lists
    int *vCount;                   // neighbour count per particle
    int *vLists;                   // pitched neighbour lists: NVList rows of N
    size_t vListsPitch;
    // Capacity estimate: particles in a cube of edge 2*rVer at density 0.75.
    int NVList = static_cast<int>(0.75 * pow(2.0 * rVer, 3.));

    float *dMax;                   // per-particle displacement since last list build
    bool *dListUpdate;             // device flag: neighbour-list rebuild required
    bool hListUpdate = false;
    // Rebuild is triggered before any particle can have crossed half the skin.
    float maxDisplacement = (rVer - rCut) * 0.5;

    hPos = new FLOAT_ARRAY_TYPE[N];
    hVel = new FLOAT_ARRAY_TYPE[N];
    hCnn = new int[numNN * N];

    // ---- initial configuration -------------------------------------------
    success = loadInitData(inputfile, hPos, hVel, hCnn, N, numNN);
    if (!success)
    {
        std::cout << "Data could not be read!\n";
        delete[] hPos;
        delete[] hVel;
        delete[] hCnn;
        return 1;
    }

    // Echo the initial configuration as the first output frame.
    success = writeData(basename, hPos, hVel, hCnn, N, numNN, NOffset);
    // Fix: report (previously ignored) failure to write the initial frame.
    if (!success)
    {
        std::cout << "Warning: writing the initial configuration failed!\n";
    }

    // ---- device allocations ------------------------------------------------
    size_t size = N * sizeof(FLOAT_ARRAY_TYPE);

    cudaMalloc((void **)&dPos, size);
    checkCudaError("Mem Alloc dPos");
    success = memInfo();
    cudaMalloc((void **)&dVel, size);
    checkCudaError("Mem Alloc dVel");
    cudaMalloc((void **)&velOnDt, size);
    checkCudaError("Mem Alloc velOnDt");
    cudaMalloc((void **)&dForce, size);
    checkCudaError("Mem Alloc dForce");
    cudaMalloc((void **)&dEPot, N * sizeof(float));
    checkCudaError("Mem Alloc dEPot");
    cudaMalloc((void **)&reduction, redM * sizeof(float));
    checkCudaError("Mem Alloc reduction");
    cudaMalloc((void **)&compReduction, redM * sizeof(float3));
    checkCudaError("Mem Alloc compReduction");

    size_t hCnn_pitch = N * sizeof(int);  // host rows are tightly packed
    size_t dCnnPitch = 0;
    cudaMallocPitch((void **)&dCnn, &dCnnPitch, N * sizeof(int), numNN);
    checkCudaError("Pitch Mem Alloc dCnn");

    cudaMalloc((void **)&vCount, N * sizeof(int));
    checkCudaError("Mem Alloc vCount");
    cudaMallocPitch((void **)&vLists, &vListsPitch, N * sizeof(int), NVList);
    checkCudaError("Pitch Mem Alloc vLists");
    cudaMalloc((void **)&dListUpdate, sizeof(bool));
    checkCudaError("Mem Alloc dListUpdate");
    cudaMalloc((void **)&dMax, N * sizeof(float));
    checkCudaError("Mem Alloc dMax");
    success = memInfo();

    // ---- host -> device upload ---------------------------------------------
    cudaMemcpy(dPos, hPos, sizeof(FLOAT_ARRAY_TYPE) * N, cudaMemcpyHostToDevice);
    checkCudaError("Mem Cpy: dPos, hPos");
    cudaMemcpy(dVel, hVel, sizeof(FLOAT_ARRAY_TYPE) * N, cudaMemcpyHostToDevice);
    checkCudaError("Mem Cpy: dVel, hVel");
    cudaMemcpy2D(dCnn, dCnnPitch, hCnn, hCnn_pitch, N * sizeof(int), numNN, cudaMemcpyHostToDevice);
    checkCudaError("Mem Cpy 2D: dCnn, hCnn");
    cudaMemcpy(dListUpdate, &hListUpdate, sizeof(bool), cudaMemcpyHostToDevice);
    // Fix: this copy was the only one missing an error check.
    checkCudaError("Mem Cpy: dListUpdate, &hListUpdate");

    gpuResetForces<<<dimGrid, dimBlock>>>(dForce, N);
    checkCudaError("Kernel error: gpuResetForces");

    // ---- timing -------------------------------------------------------------
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float time;

    clock_t cStart, cEnd;
    cudaEventRecord(start, 0);
    cStart = clock();

    // Initial neighbour-list build. The smem variant assumes a full last block,
    // so it is only valid when N is an exact multiple of the block size.
    if (NthreadEqN)
    {
        gpuGenerateVerletListSmem<<<dimGrid, dimBlock, dimBlock.x * sizeof(FLOAT_ARRAY_TYPE)>>>(dPos, vLists, vListsPitch, vCount, dMax, rVerSq, N, boxSize, dListUpdate);
        checkCudaError("Kernel error: gpuGenerateVerletListSmem");
    }
    else
    {
        gpuGenerateVerletListVar1<<<dimGrid, dimBlock>>>(dPos, vLists, vListsPitch, vCount, dMax, rVerSq, N, boxSize, dListUpdate);
        checkCudaError("Kernel error: gpuGenerateVerletList");
    }

    // Bring the start configuration to the target temperature and zero net drift.
    rescaleT(N, dim, redM, redN, T, threads, blocks, dVel, reduction);
    removeDrift(N, redM, redN, threads, blocks, dVel, compReduction);

    gpuResetForces<<<dimGrid, dimBlock>>>(dForce, N);
    checkCudaError("Kernel error: gpuResetForces");

    gpuLJForcesVerletList<<<dimGrid, dimBlock>>>(dPos, dForce, N, vLists, vListsPitch, vCount, boxSize, rCutSq);
    checkCudaError("Kernel error: gpuLJForcesVerletList");

    gpuHarmonicBondForces<<<dimGrid, dimBlock>>>(dPos, dForce, dCnn, dCnnPitch, N, numNN, k, boxSize);
    checkCudaError("Kernel error: gpuHarmonicBondForces");
    // Step velocities back half a step to set up the leap-frog scheme.
    gpuInitialBackstep<<<dimGrid, dimBlock>>>(dVel, dForce, N, dt);
    checkCudaError("Kernel error: gpuInitialBackstep");

    std::cout.setf(std::ios_base::scientific);
    std::cout.precision(std::numeric_limits<float>::digits10);

    // ---- main simulation loop ----------------------------------------------
    // Outer loop: one iteration per linear-output (backup) interval.
    // Middle loop: one iteration per thermostat interval.
    // Inner loop: individual integration steps.
    int counter = NOffset + 1;
    for (int i = NOffset + NOut; i <= NSim; i = i + NOut)
    {
        for (int j = NAdjust; j <= NOut; j = j + NAdjust)
        {
            rescaleT(N, dim, redM, redN, T, threads, blocks, dVel, reduction);
            removeDrift(N, redM, redN, threads, blocks, dVel, compReduction);

            for (int mAdj = 1; mAdj <= NAdjust; ++mAdj)
            {
                // Rebuild the Verlet lists when the integrator flagged that a
                // particle may have moved more than half the skin distance.
                cudaMemcpy(&hListUpdate, dListUpdate, sizeof(bool), cudaMemcpyDeviceToHost);
                checkCudaError("Mem Cpy: &hListUpdate, dListUpdate");
                if (hListUpdate == true)
                {
                    if (NthreadEqN)
                    {
                        gpuGenerateVerletListSmem<<<dimGrid, dimBlock, dimBlock.x * sizeof(FLOAT_ARRAY_TYPE)>>>(dPos, vLists, vListsPitch, vCount, dMax, rVerSq, N, boxSize, dListUpdate);
                        checkCudaError("Kernel error: gpuGenerateVerletListSmem");
                    }
                    else
                    {
                        gpuGenerateVerletListVar1<<<dimGrid, dimBlock>>>(dPos, vLists, vListsPitch, vCount, dMax, rVerSq, N, boxSize, dListUpdate);
                        checkCudaError("Kernel error: gpuGenerateVerletList");
                    }
                    hListUpdate = false;
                }

                // One leap-frog step: recompute forces, then advance.
                gpuResetForces<<<dimGrid, dimBlock>>>(dForce, N);
                checkCudaError("Kernel error: gpuResetForces");

                gpuLJForcesVerletList<<<dimGrid, dimBlock>>>(dPos, dForce, N, vLists, vListsPitch, vCount, boxSize, rCutSq);
                checkCudaError("Kernel error: gpuLJForcesVerletList");
                gpuHarmonicBondForces<<<dimGrid, dimBlock>>>(dPos, dForce, dCnn, dCnnPitch, N, numNN, k, boxSize);
                checkCudaError("Kernel error: gpuHarmonicBondForces");

                gpuIntegratorLeapFrog<<<dimGrid, dimBlock>>>(dPos, dVel, dForce, N, dt, boxSize, dMax, maxDisplacement, dListUpdate);
                checkCudaError("Kernel error: gpuIntegratorLeapFrog");

                // Logarithmically spaced trajectory output.
                if (nextLogOutput == counter)
                {
                    if (i == nextLogOutput)
                    {
                        // Coincides with the linear output below; skip the
                        // duplicate velocity synchronization there.
                        linLogEqual = true;
                    }
                    // Recompute forces so velocities can be synchronized to
                    // the on-step time for output.
                    gpuResetForces<<<dimGrid, dimBlock>>>(dForce, N);
                    checkCudaError("Kernel error: gpuResetForces");

                    gpuLJForcesVerletList<<<dimGrid, dimBlock>>>(dPos, dForce, N, vLists, vListsPitch, vCount, boxSize, rCutSq);
                    checkCudaError("Kernel error: gpuLJForcesVerletList");
                    gpuHarmonicBondForces<<<dimGrid, dimBlock>>>(dPos, dForce, dCnn, dCnnPitch, N, numNN, k, boxSize);
                    checkCudaError("Kernel error: gpuHarmonicBondForces");
                    gpuGetVelOnTime<<<dimGrid, dimBlock>>>(dVel, velOnDt, dForce, N, dt);
                    checkCudaError("Kernel error: gpuGetVelOnTime");

                    cudaMemcpy(hPos, dPos, sizeof(FLOAT_ARRAY_TYPE) * N, cudaMemcpyDeviceToHost);
                    checkCudaError("Mem Cpy: hPos, dPos");
                    cudaMemcpy(hVel, velOnDt, sizeof(FLOAT_ARRAY_TYPE) * N, cudaMemcpyDeviceToHost);
                    checkCudaError("Mem Cpy: hVel, velOnDt");

                    success = writeData(basename, hPos, hVel, hCnn, N, numNN, counter);
                    // Advance the log schedule, skipping duplicate integer
                    // step numbers produced by rounding at small counts.
                    int tmp;
                    do
                    {
                        LogOutputCounter += 1;
                        tmp = static_cast<int>(exp(logInc * LogOutputCounter) * LogOutputOffset);
                    } while (tmp == nextLogOutput);
                    nextLogOutput = tmp;
                }
                counter++;
            }
        }

        // Linear-interval backup output (skip the velocity sync if the log
        // output just above already produced it for this very step).
        if (linLogEqual == false)
        {
            gpuResetForces<<<dimGrid, dimBlock>>>(dForce, N);
            checkCudaError("Kernel error: gpuResetForces");

            gpuLJForcesVerletList<<<dimGrid, dimBlock>>>(dPos, dForce, N, vLists, vListsPitch, vCount, boxSize, rCutSq);
            checkCudaError("Kernel error: gpuLJForcesVerletList");
            gpuHarmonicBondForces<<<dimGrid, dimBlock>>>(dPos, dForce, dCnn, dCnnPitch, N, numNN, k, boxSize);
            checkCudaError("Kernel error: gpuHarmonicBondForces");
            gpuGetVelOnTime<<<dimGrid, dimBlock>>>(dVel, velOnDt, dForce, N, dt);
            checkCudaError("Kernel error: gpuGetVelOnTime");

            cudaMemcpy(hPos, dPos, sizeof(FLOAT_ARRAY_TYPE) * N, cudaMemcpyDeviceToHost);
            checkCudaError("Mem Cpy: hPos, dPos");
            cudaMemcpy(hVel, velOnDt, sizeof(FLOAT_ARRAY_TYPE) * N, cudaMemcpyDeviceToHost);
            checkCudaError("Mem Cpy: hVel, velOnDt");
        }

        success = writeData(backup, hPos, hVel, hCnn, N, numNN, i);
        linLogEqual = false;
    }

    // ---- timing report -------------------------------------------------------
    // Fix: cudaThreadSynchronize() is deprecated (and removed in CUDA 12);
    // cudaDeviceSynchronize() is the direct replacement.
    cudaDeviceSynchronize();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cEnd = clock();
    std::cout << "########### CPU TIME ###########\n";
    std::cout << "start: " << cStart << "\nend: " << cEnd << "\n";
    std::cout << "clocks per second: " << CLOCKS_PER_SEC << "\n";
    std::cout << "cpu time: " << (cEnd - cStart) / static_cast<double>(CLOCKS_PER_SEC) << "s\n\n";
    cudaEventElapsedTime(&time, start, stop);
    std::cout << "########## CUDA TIMER #########\n";
    std::cout << "elapsed time: " << time << "ms\n";

    // ---- cleanup --------------------------------------------------------------
    // Fix: events were created but never destroyed.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    delete[] hPos;
    delete[] hVel;
    delete[] hCnn;
    cudaFree(dPos);
    cudaFree(dVel);
    cudaFree(velOnDt);
    cudaFree(dForce);
    cudaFree(dEPot);
    cudaFree(reduction);
    cudaFree(compReduction);
    cudaFree(dCnn);
    cudaFree(vCount);
    cudaFree(vLists);
    cudaFree(dMax);
    cudaFree(dListUpdate);
    success = memInfo();

    return 0;
}