#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Macro to check a CUDA function call for runtime errors.
#define CHECK(call) \
do { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        printf("CUDA error at %s %d: %s\n", __FILE__, __LINE__, \
               cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

// Kernel function runs on device: each thread writes its own id into the array.
__global__ void kernelwrite(float* array)
{
    // Compute a unique thread id for each thread.
    int blocksize = blockDim.x;
    int blockindex = blockIdx.x;
    int threadindex = threadIdx.x;
    int threadid = blocksize*blockindex + threadindex;

    array[threadid] = (float)threadid;
}

// Kernel function runs on device: each thread reads back and checks its value.
__global__ void kernelread(float* array)
{
    // Compute a unique thread id for each thread.
    int blocksize = blockDim.x;
    int blockindex = blockIdx.x;
    int threadindex = threadIdx.x;
    int threadid = blocksize*blockindex + threadindex;

    float x = array[threadid];
    if(x != (float)threadid)
    {
        printf("error: threadid = %d, found %f\n", threadid, x);
    }
    else
    {
        printf("thread %d reads %f\n", threadid, x);
    }
}

// Host function.
void persistent(int gridsize, int blocksize)
{
    int nthreads = gridsize * blocksize;    // total number of threads
    float* arrayhost;
    float* arraydevice;
    int size = nthreads*sizeof(float);      // array size in bytes

    // Allocate separate memory on host and device.
    arrayhost = (float*)malloc(size);
    CHECK(cudaMalloc(&arraydevice, size));

    // Host copies memory to device (the count is given in bytes).
    CHECK(cudaMemcpy(arraydevice, arrayhost, size, cudaMemcpyHostToDevice));

    // Start write threads.
    kernelwrite<<<gridsize, blocksize>>>(arraydevice);

    // Wait for GPU to finish.
    cudaDeviceSynchronize();

    // Start read threads.
    kernelread<<<gridsize, blocksize>>>(arraydevice);

    // Wait for GPU to finish.
    cudaDeviceSynchronize();

    free(arrayhost);
    cudaFree(arraydevice);
}

int main()
{
    int gridsize = 2;
    int blocksize = 3;
    persistent(gridsize, blocksize);
    return 0;
}