#include <stdio.h>
#include <stdlib.h>

// Device kernel: each thread adds its own global id to one array element.
//
// Only the x dimension of the launch configuration is used. There is no
// bounds check, so arraydevice must hold at least gridDim.x * blockDim.x
// ints for the launch in use.
__global__ void mykernel(int * arraydevice)
{
  // Flatten the (block, thread) pair into a single global index.
  const int gid = static_cast<int>(blockDim.x)*static_cast<int>(blockIdx.x)
                + static_cast<int>(threadIdx.x);

  // Report the update first, then apply it to this thread's own slot.
  printf("thread %d adds %d to arraydevice[%d]\n",gid,gid,gid);
  arraydevice[gid] += gid;
}


// Reports a failed CUDA call on stderr.
// status: return code of the call being checked.
// what:   short label naming the call, used in the message.
// Returns 0 when status is cudaSuccess, 1 otherwise.
static int cudafailed(cudaError_t status, const char* what)
{
  if(status == cudaSuccess)
    return 0;
  fprintf(stderr,"Cuda error (%s): %s\n",what,cudaGetErrorString(status));
  return 1;
}

// Host function.
// Fills a host array with 0..numthreads-1, copies it to the device, launches
// mykernel with one thread per element, copies the result back and prints it.
// Returns EXIT_SUCCESS when every step succeeded, EXIT_FAILURE otherwise.
int main(void)
{
  // All locals are declared before the first goto so that no jump to the
  // cleanup label crosses an initialization.
  const int numblocks = 2;
  const int numthreadsperblock = 3;
  const int numthreads = numblocks*numthreadsperblock;
  const size_t size = numthreads*sizeof(int);
  int* arrayhost = NULL;
  int* arraydevice = NULL;
  int exitcode = EXIT_FAILURE;

  // Allocate separate memory on host and device.
  arrayhost = (int*)malloc(size);
  if(arrayhost == NULL)
  {
    fprintf(stderr,"Host allocation of %lu bytes failed\n",(unsigned long)size);
    goto cleanup;
  }
  if(cudafailed(cudaMalloc(&arraydevice,size),"cudaMalloc"))
    goto cleanup;

  // Host writes memory.
  for(int i=0; i<numthreads; i++)
  {
    printf("Host sets arrayhost[%d] to %d\n",i,i);
    arrayhost[i] = i;
  }

  // Host copies memory to device.
  printf("\nCopy array from host to device\n\n");
  if(cudafailed(cudaMemcpy(arraydevice, arrayhost, size, cudaMemcpyHostToDevice),
                "cudaMemcpy host to device"))
    goto cleanup;

  // Start threads. The launch itself returns nothing, so an invalid launch
  // configuration has to be picked up with cudaGetLastError right after it.
  mykernel<<<numblocks,numthreadsperblock>>>(arraydevice);
  if(cudafailed(cudaGetLastError(),"kernel launch"))
    goto cleanup;

  // Wait for GPU to finish; errors raised while the kernel runs surface here.
  if(cudafailed(cudaDeviceSynchronize(),"cudaDeviceSynchronize"))
    goto cleanup;

  // Copy memory from device to host.
  printf("\nCopy array from device to host\n\n");
  if(cudafailed(cudaMemcpy(arrayhost, arraydevice, size, cudaMemcpyDeviceToHost),
                "cudaMemcpy device to host"))
    goto cleanup;

  // Print array. Reached only when every previous step succeeded, so the
  // values shown are the ones the device actually produced.
  for(int i=0; i<numthreads; i++)
    printf("Host reads arrayhost[%d] = %d\n",i,arrayhost[i]);

  exitcode = EXIT_SUCCESS;

cleanup:
  // Both release calls accept a null pointer, so this path is safe no matter
  // how far the function got before jumping here.
  free(arrayhost);
  if(cudafailed(cudaFree(arraydevice),"cudaFree"))
    exitcode = EXIT_FAILURE;

  return exitcode;
}