#include <stdio.h>
#include <stdlib.h>

#include "driver_types.h"

// Macro to check a CUDA runtime call for errors.
// Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
// source file, line number and the CUDA error string to stderr and terminates
// the process with EXIT_FAILURE.
// The argument is parenthesized and the internal temporary has a name that is
// unlikely to collide with caller variables, so CHECK(err) on a caller's own
// `err` variable does not turn into a self-initialization.
#define CHECK(call)                                                    \
  do {                                                                 \
    const cudaError_t check_err_ = (call);                             \
    if (check_err_ != cudaSuccess) {                                   \
      fprintf(stderr, "CUDA error at %s %d: %s\n", __FILE__, __LINE__, \
              cudaGetErrorString(check_err_));                         \
      exit(EXIT_FAILURE);                                              \
    }                                                                  \
  } while(0)


// Device kernel: each thread computes its global index from its block number
// and its position inside the block, prints a trace line, and adds that index
// to the element of arraydevice at the same position.
// The index is not range-checked, so the launch must not create more threads
// than arraydevice has elements. Only the x dimension of the launch is used.
__global__ void mykernel(int * arraydevice)
{
  // Global index = threads per block * block number + position in the block.
  const int gid = int(blockDim.x)*int(blockIdx.x) + int(threadIdx.x);

  // Trace what this thread is about to do (device printf; debugging aid).
  printf("thread %d adds %d to arraydevice[%d]\n",gid,gid,gid);

  // Accumulate the thread's own index into its slot.
  int* slot = arraydevice + gid;
  *slot = *slot + gid;
}


// Host entry point.
// Fills a host array with 0..numthreads-1, copies it to the device, launches
// mykernel with one thread per element (each thread adds its global index to
// its element), copies the result back and prints it. Since element i starts
// as i and thread i adds i, the printed values are 2*i.
// Returns 0 on success; returns EXIT_FAILURE if the host allocation fails.
// Any failing CUDA call terminates the process through CHECK.
int main(void)
{
  const int numblocks = 2;
  const int numthreadsperblock = 3;
  const int numthreads = numblocks*numthreadsperblock;
  int* arrayhost = NULL;
  int* arraydevice = NULL;
  // Byte count kept as size_t: it comes from sizeof and feeds size_t parameters.
  const size_t size = numthreads*sizeof(int);

  // Allocate separate memory on host and device.
  arrayhost = (int*)malloc(size);
  if (arrayhost == NULL)
  {
    // Without this check a failed malloc would be dereferenced below.
    fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
    return EXIT_FAILURE;
  }
  CHECK(cudaMalloc(&arraydevice,size));

  // Host writes memory.
  for(int i=0; i<numthreads; i++)
  {
    printf("Host sets arrayhost[%d] to %d\n",i,i);
    arrayhost[i] = i;
  }

  // Host copies memory to device.
  printf("\nCopy array from host to device\n\n");
  CHECK(cudaMemcpy(arraydevice, arrayhost, size, cudaMemcpyHostToDevice));

  // Start threads: numblocks blocks of numthreadsperblock threads, which is
  // exactly one thread per array element (the kernel does no bounds check).
  mykernel<<<numblocks,numthreadsperblock>>>(arraydevice);

  // A kernel launch returns no status; launch-time errors (e.g. an invalid
  // configuration) must be fetched explicitly.
  CHECK(cudaGetLastError());

  // Wait for GPU to finish; also surfaces errors raised while the kernel ran.
  CHECK(cudaDeviceSynchronize());

  // Copy memory from device to host.
  printf("\nCopy array from device to host\n\n");
  CHECK(cudaMemcpy(arrayhost, arraydevice, size, cudaMemcpyDeviceToHost));

  // Print array.
  for(int i=0; i<numthreads; i++)
    printf("Host reads arrayhost[%d] = %d\n",i,arrayhost[i]);

  // Release host and device buffers.
  free(arrayhost);
  CHECK(cudaFree(arraydevice));

  return 0;
}