#include <stdio.h>
#include <unistd.h>
#include <math.h>

// Macro to check a CUDA function call for runtime errors.
//
// Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
// source file, line number and the CUDA error string, then terminates the
// process with EXIT_FAILURE. The do { ... } while(0) wrapper makes the macro
// behave as a single statement (safe inside an unbraced if/else).
//
// The argument is parenthesized and the temporary has a name unlikely to
// appear in user code, so that an argument which itself refers to a variable
// called `err` is not accidentally captured by the macro's own local.
#define CHECK(call)                                              \
  do {                                                           \
    cudaError_t check_status_ = (call);                          \
    if (check_status_ != cudaSuccess) {                          \
      printf("CUDA error at %s %d: %s\n", __FILE__, __LINE__,    \
             cudaGetErrorString(check_status_));                 \
      exit(EXIT_FAILURE);                                        \
    }                                                            \
  } while(0)


// Kernel function runs on device.
//
// Computes nh output samples of the convolution of fdevice with gdevice:
//
//   hdevice[i] = sum over j in [0, ng) of fdevice[ng-1+i-j] * gdevice[j]
//
// for 0 <= i < nh.
//
// Parameters:
//   fdevice  - input signal; indices 0 .. ng-1+nh-1 are read.
//   gdevice  - impulse response; indices 0 .. ng-1 are read.
//   hdevice  - output; indices 0 .. nh-1 are written.
//   nf       - number of samples in the signal (not used by the kernel body).
//   ng       - number of samples in the impulse response.
//   nh       - number of output samples to compute.
//   nthreads - stride between consecutive output samples handled by one
//              thread. The kernel indexes threads in x only.
__global__ void mykernel(float* fdevice, float* gdevice, float* hdevice, int nf, int ng, int nh, int nthreads)
{
  // A non-positive stride would either never advance the outer loop or walk
  // it towards negative indices, so there is nothing sensible to compute.
  if (nthreads <= 0) return;

  // Get unique thread id for each thread.
  const int threadid = (int)blockDim.x * (int)blockIdx.x + (int)threadIdx.x;

  // Each thread computes several samples of hdevice separated by nthreads.
  for (int i = threadid; i < nh; i += nthreads)
  {
    float sum = 0.0f;
    int k = ng-1+i;  // index into fdevice; decreases while j increases
    for (int j = 0; j < ng; j++)
    {
      // Explicit fused multiply-add: the rounding of each step is then fixed
      // by the source instead of depending on whether the compiler chooses
      // to contract a separate multiply and add.
      sum = fmaf(fdevice[k], gdevice[j], sum);
      --k;
    }
    hdevice[i] = sum;
  }
}


// Host function.
//
// Fills a signal (nf = 1<<12 samples) and an impulse response (ng = 1<<10
// samples) with drand48() values, computes nh = nf-ng+1 convolution samples
// on the CPU, computes the same samples on the GPU with mykernel, and
// compares the two results element by element.
//
// Parameters:
//   gridsize  - number of thread blocks used for the kernel launch.
//   blocksize - number of threads per block used for the kernel launch.
//
// The process is terminated if a host allocation fails, if any CUDA call or
// the kernel launch reports an error (via CHECK), or if the CPU and GPU
// results differ in any sample.
void convolution(int gridsize, int blocksize)
{
  const int nf = 1<<12;   // Signal
  const int ng = 1<<10;   // Impulse response
  const int nh = nf-ng+1; // Convolved signal

  const int nthreads = gridsize * blocksize; // total number of threads

  const size_t sizef = nf*sizeof(float);
  const size_t sizeg = ng*sizeof(float);
  const size_t sizeh = nh*sizeof(float);

  // Allocate separate memory on host and device.
  float* fhost = (float*)malloc(sizef);  // signal
  float* ghost = (float*)malloc(sizeg);  // impulse response
  float* hhost = (float*)malloc(sizeh);  // result copied back from the GPU
  float* chost = (float*)malloc(sizeh);  // reference result computed on the CPU
  if (fhost == NULL || ghost == NULL || hhost == NULL || chost == NULL)
  {
    printf("Host memory allocation failed\n");
    exit(EXIT_FAILURE);
  }

  float* fdevice = NULL;
  float* gdevice = NULL;
  float* hdevice = NULL;
  CHECK(cudaMalloc(&fdevice,sizef));
  CHECK(cudaMalloc(&gdevice,sizeg));
  CHECK(cudaMalloc(&hdevice,sizeh));

  // Host writes memory.
  for (int i=0; i<nf; i++) fhost[i] = (float)drand48();
  for (int i=0; i<ng; i++) ghost[i] = (float)drand48();

  // CPU reference, accumulated in the same order as the kernel.
  for (int i=0; i<nh; i++)
  {
    float sum = 0.0f;
    int k = ng-1+i;
    for (int j=0; j<ng; j++)
    {
      sum = fmaf(fhost[k], ghost[j], sum);  // fused multiply-add on host.
      --k;
    }
    chost[i] = sum;
  }

  // Host copies memory to device.
  CHECK(cudaMemcpy(fdevice, fhost, sizef, cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(gdevice, ghost, sizeg, cudaMemcpyHostToDevice));

  // Start threads.
  mykernel<<<gridsize,blocksize>>>(fdevice,gdevice,hdevice,nf,ng,nh,nthreads);

  // A kernel launch returns no status itself; an invalid launch
  // configuration is only visible through cudaGetLastError().
  CHECK(cudaGetLastError());

  // Wait for GPU to finish; errors raised while the kernel ran show up here.
  CHECK(cudaDeviceSynchronize());

  // Copy memory from device to host.
  CHECK(cudaMemcpy(hhost, hdevice, sizeh, cudaMemcpyDeviceToHost));

  // Compare results of CPU and GPU: find the sample with the largest
  // absolute difference.
  float maxerr = 0.0f;
  int imax = 0;
  for (int i=0; i<nh; i++)
  {
    const float err = fabsf(hhost[i] - chost[i]);
    if (err > maxerr)
    {
      maxerr = err;
      imax = i;
    }
  }

  // The comparison is exact: any nonzero difference is reported as an error.
  if (maxerr != 0.0f)
  {
    printf("\n\n ----------------------ERROR---------------------------\n\n");
    // chost holds the CPU result, hhost the GPU result; report the index of
    // the worst sample.
    printf("CPU: %f  GPU: %f at %d\n\n\n",chost[imax],hhost[imax],imax);
    exit(-1);
  }

  free(fhost);
  free(ghost);
  free(hhost);
  free(chost);
  CHECK(cudaFree(fdevice));
  CHECK(cudaFree(gdevice));
  CHECK(cudaFree(hdevice));
}

// Program entry point: seeds the drand48 generator and runs one convolution
// check with a fixed launch configuration.
int main()
{
  // Launch configuration: 30 blocks of 128 threads each.
  const int threadsPerBlock = 128;
  const int numBlocks = 30;

  // Seed the generator with the value returned by clock().
  srand48(clock());

  convolution(numBlocks, threadsPerBlock);

  return 0;
}