#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Macro to check a CUDA function call for runtime errors.
#define CHECK(call) \
do { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        printf("CUDA error at %s %d: %s\n", __FILE__, __LINE__, \
               cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

// Kernel function runs on device: each thread writes its own id into the array.
__global__ void kernelwrite(float* array)
{
    // Compute a unique thread id for each thread.
    int blocksize = blockDim.x;
    int blockindex = blockIdx.x;
    int threadindex = threadIdx.x;
    int threadid = blocksize*blockindex + threadindex;

    array[threadid] = (float)threadid;
}

// Kernel function runs on device: each thread reads back and checks its value.
__global__ void kernelread(float* array)
{
    // Compute a unique thread id for each thread.
    int blocksize = blockDim.x;
    int blockindex = blockIdx.x;
    int threadindex = threadIdx.x;
    int threadid = blocksize*blockindex + threadindex;

    float x = array[threadid];
    if(x != (float)threadid)
    {
        printf("error: threadid = %d, found %f\n", threadid, x);
    }
    else
    {
        printf("thread %d reads %f\n", threadid, x);
    }
}

// Host function.
void persistent(int gridsize, int blocksize)
{
    int nthreads = gridsize * blocksize;    // total number of threads
    float* arrayhost;
    float* arraydevice;
    int size = nthreads*sizeof(float);      // array size in bytes

    // Allocate separate memory on host and device.
    arrayhost = (float*)malloc(size);
    CHECK(cudaMalloc(&arraydevice, size));

    // Host copies memory to device (the count is given in bytes).
    CHECK(cudaMemcpy(arraydevice, arrayhost, size, cudaMemcpyHostToDevice));

    // Start write threads.
    kernelwrite<<<gridsize, blocksize>>>(arraydevice);

    // Wait for GPU to finish.
    cudaDeviceSynchronize();

    // Start read threads.
    kernelread<<<gridsize, blocksize>>>(arraydevice);

    // Wait for GPU to finish.
    cudaDeviceSynchronize();

    free(arrayhost);
    cudaFree(arraydevice);
}

int main()
{
    int gridsize = 2;
    int blocksize = 3;
    persistent(gridsize, blocksize);
    return 0;
}