// NOTE(review): the original header names were lost in extraction (three bare
// `#include` tokens remained). The headers below are the ones required by the
// calls visible in this file (printf, malloc/exit/srand48/drand48, clock,
// fabsf) -- confirm against the original file.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Macro to check a CUDA function call for runtime errors.
// Prints the file/line and the CUDA error string, then terminates the process.
#define CHECK(call)                                                   \
do {                                                                  \
    cudaError_t err_ = (call);                                        \
    if (err_ != cudaSuccess) {                                        \
        printf("CUDA error at %s %d: %s\n", __FILE__, __LINE__,       \
               cudaGetErrorString(err_));                             \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while(0)

// Computes one output sample h[i] = sum_j f[j] * g[i-j], restricting j so that
// both f[j] and g[i-j] are in range. Shared by the kernel and the CPU
// reference so both sides evaluate the same expression in the same order.
// NOTE(review): the original inner loop was lost in extraction; this is the
// textbook direct convolution implied by the (f, g, h, nf, ng, nh) names.
static __host__ __device__ inline float convolveSample(const float* f,
                                                       const float* g,
                                                       int nf, int ng, int i)
{
    int jlo = i - ng + 1;
    if (jlo < 0) jlo = 0;
    int jhi = (i < nf - 1) ? i : nf - 1;

    float sum = 0.0f;
    for (int j = jlo; j <= jhi; ++j) {
        sum += f[j] * g[i - j];
    }
    return sum;
}

// Kernel function runs on device.
//
// Each thread computes several samples of hdevice separated by nthreads.
// Launch layout: 1-D grid of 1-D blocks; no shared memory is used.
//   fdevice, gdevice : input sequences of nf and ng samples (device memory)
//   hdevice          : output sequence of nh samples (device memory)
//   nthreads         : stride between the samples handled by one thread; the
//                      caller passes the total number of launched threads.
__global__ void mykernel(float* fdevice, float* gdevice, float* hdevice,
                         int nf, int ng, int nh, int nthreads)
{
    // Get unique thread id for each thread.
    int blocksize   = blockDim.x;
    int blockindex  = blockIdx.x;
    int threadindex = threadIdx.x;
    int threadid    = blocksize*blockindex + threadindex;

    // The loop bound doubles as the bounds check for surplus threads.
    for (int i = threadid; i < nh; i += nthreads) {
        hdevice[i] = convolveSample(fdevice, gdevice, nf, ng, i);
    }
}

// Allocates n floats on the host, terminating the process on failure.
static float* hostAlloc(size_t n)
{
    float* p = (float*)malloc(n * sizeof(float));
    if (p == NULL) {
        printf("Host allocation of %lu floats failed\n", (unsigned long)n);
        exit(EXIT_FAILURE);
    }
    return p;
}

// Runs the convolution on the GPU with the given launch configuration,
// recomputes it on the CPU, and exits with status -1 if the two disagree.
//   gridsize  : number of blocks in the launch
//   blocksize : number of threads per block
void convolution(int gridsize, int blocksize)
{
    // NOTE(review): the original problem sizes were lost in extraction; these
    // are placeholders chosen so that nh exceeds the 30*128 threads launched
    // by main() and every thread handles several samples.
    const int nf = 10000;
    const int ng = 100;
    const int nh = nf + ng - 1;
    const int nthreads = gridsize * blocksize;

    const size_t sizef = (size_t)nf * sizeof(float);
    const size_t sizeg = (size_t)ng * sizeof(float);
    const size_t sizeh = (size_t)nh * sizeof(float);

    // Host buffers: inputs, GPU result (hhost) and CPU reference (chost).
    float* fhost = hostAlloc(nf);
    float* ghost = hostAlloc(ng);
    float* hhost = hostAlloc(nh);
    float* chost = hostAlloc(nh);

    // NOTE(review): original initialisation lost; drand48() is assumed
    // because main() seeds it with srand48().
    for (int i = 0; i < nf; ++i) fhost[i] = (float)drand48();
    for (int i = 0; i < ng; ++i) ghost[i] = (float)drand48();

    // CPU reference result.
    for (int i = 0; i < nh; ++i) {
        chost[i] = convolveSample(fhost, ghost, nf, ng, i);
    }

    // Allocate device memory and copy the inputs over.
    float* fdevice = NULL;
    float* gdevice = NULL;
    float* hdevice = NULL;
    CHECK(cudaMalloc((void**)&fdevice, sizef));
    CHECK(cudaMalloc((void**)&gdevice, sizeg));
    CHECK(cudaMalloc((void**)&hdevice, sizeh));
    CHECK(cudaMemcpy(fdevice, fhost, sizef, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(gdevice, ghost, sizeg, cudaMemcpyHostToDevice));

    mykernel<<<gridsize, blocksize>>>(fdevice,gdevice,hdevice,nf,ng,nh,nthreads);
    // A launch returns no status itself: bad configurations surface here...
    CHECK(cudaGetLastError());
    // ...and faults raised while the kernel runs surface at the sync.
    // Wait for GPU to finish.
    CHECK(cudaDeviceSynchronize());

    // Copy memory from device to host.
    CHECK(cudaMemcpy(hhost, hdevice, sizeh, cudaMemcpyDeviceToHost));

    // Compare results of CPU and GPU: find the largest absolute difference.
    float maxerr = 0.0f;
    int imax = 0;
    for (int i = 0; i < nh; ++i) {
        float err = fabsf(hhost[i] - chost[i]);
        if (err > maxerr) {
            maxerr = err;
            imax = i;
        }
    }

    // Host and device may round differently (e.g. fused multiply-add on the
    // device), so bit-exact equality is not a valid pass criterion.
    // NOTE(review): the original test was `maxerr != 0.0`; tolerances are
    // a judgement call -- tighten if bit-exactness is really required.
    const float rtol = 1.0e-4f;
    const float atol = 1.0e-5f;
    if (maxerr > atol + rtol * fabsf(chost[imax])) {
        printf("\n\n ----------------------ERROR---------------------------\n\n");
        // chost is the CPU reference, hhost was copied back from the device;
        // report the index of the worst sample (imax), not a loop variable.
        printf("CPU: %f GPU: %f at %d\n\n\n",chost[imax],hhost[imax],imax);
        exit(-1);
    }

    free(fhost);
    free(ghost);
    free(hhost);
    free(chost);
    CHECK(cudaFree(fdevice));
    CHECK(cudaFree(gdevice));
    CHECK(cudaFree(hdevice));
}

// Entry point: seeds the host RNG and runs one convolution check with a fixed
// launch configuration of 30 blocks of 128 threads.
int main()
{
    int gridsize = 30;
    int blocksize = 128;
    srand48(clock());
    convolution(gridsize,blocksize);
    return 0;
}