#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Kernel: element-wise addition of two arrays, z[i] = x[i] + y[i] for 0 <= i < n.
//
// Parameters:
//   n    - number of elements to process; if n <= 0 the kernel writes nothing.
//   x, y - input arrays, each holding at least n floats readable from device code.
//   z    - output array with room for at least n floats, writable from device code.
//
// Launch layout: only the x dimension of the grid and block is used. Each
// thread starts at its flat global index and then advances by the total
// number of launched threads, so every element is covered regardless of how
// many blocks/threads are launched.
__global__ void add(int n, float* x, float* y, float* z)
{
  // Index arithmetic is done in 64 bits: the product blockIdx.x * blockDim.x
  // and the running sum i + stride can both exceed the range of int for large
  // grids or n close to INT_MAX, which would otherwise wrap to a bogus
  // (possibly negative) index.
  const long long first  = (long long)blockIdx.x * blockDim.x + threadIdx.x;
  const long long stride = (long long)blockDim.x * gridDim.x;

  for (long long i = first; i < n; i += stride)
    z[i] = x[i] + y[i];
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Adds two arrays of 2^20 floats on the GPU and reports the largest absolute
// deviation from the same addition performed on the CPU.
// Returns 0 on completion; exits with EXIT_FAILURE if a host allocation or any
// CUDA call fails.
int main(void)
{
  const int n = 1<<20;
  // Byte count kept in size_t, the type expected by malloc/cudaMalloc/cudaMemcpy.
  const size_t size = (size_t)n * sizeof(float);
  float *xd = NULL, *yd = NULL, *zd = NULL;
  float *xh = NULL, *yh = NULL, *zh = NULL;

  // Allocate memory on host and device.
  xh = (float*)malloc(size);
  yh = (float*)malloc(size);
  zh = (float*)malloc(size);
  if (xh == NULL || yh == NULL || zh == NULL)
  {
    fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
    free(xh); free(yh); free(zh);  // free(NULL) is a no-op
    return EXIT_FAILURE;
  }
  checkCuda(cudaMalloc(&xd, size), "cudaMalloc(x)");
  checkCuda(cudaMalloc(&yd, size), "cudaMalloc(y)");
  checkCuda(cudaMalloc(&zd, size), "cudaMalloc(z)");

  // Initialize x and y arrays on the host
  for(int i=0; i<n; i++)
  {
    xh[i] = 1.0f;
    yh[i] = (float)i;
  }
  // Copy arrays from host to device.
  checkCuda(cudaMemcpy(xd, xh, size, cudaMemcpyHostToDevice), "copy x to device");
  checkCuda(cudaMemcpy(yd, yh, size, cudaMemcpyHostToDevice), "copy y to device");

  // Start 20 blocks with 32 threads per block.
  add<<<20, 32>>>(n, xd, yd, zd);
  // A kernel launch returns no status itself; an invalid launch configuration
  // is only visible through cudaGetLastError().
  checkCuda(cudaGetLastError(), "kernel launch");

  // Wait for GPU to finish; faults raised while the kernel ran surface here.
  checkCuda(cudaDeviceSynchronize(), "kernel execution");

  // Copy result array from device to host.
  checkCuda(cudaMemcpy(zh, zd, size, cudaMemcpyDeviceToHost), "copy z to host");

  // Compare addition on GPU with addition on CPU.
  // The absolute value is required: without it a GPU result that is larger
  // than the CPU reference gives a negative difference and goes unnoticed.
  float maxError = 0.0f;
  for (int i=0; i<n; i++)
    maxError = fmaxf(maxError, fabsf(xh[i]+yh[i]-zh[i]));
  printf("Max error: %f\n",maxError);

  // Free memory
  checkCuda(cudaFree(xd), "cudaFree(x)");
  checkCuda(cudaFree(yd), "cudaFree(y)");
  checkCuda(cudaFree(zd), "cudaFree(z)");
  free(xh); free(yh); free(zh);

  return 0;
}