// NOTE(review): as presented here, this file looks damaged and cannot compile.
//   * The whole program sits on a single physical line, so everything after the
//     first "//" (kernel and host code included) is read as one line comment.
//   * The #include directive carries no header name.
//   * The kernel's loop header stops at "for(int i=index; i" and is followed
//     directly by ">>(n, xd, yd, zd);". The loop condition and body, the end of
//     the kernel, the beginning of the host code and the launch configuration
//     are not present.
//   * The final comparison loop is cut off after "for (int i=0; i".
// Presumably the text between '<' and '>' characters was stripped by whatever
// produced this copy -- TODO confirm against the original source and restore
// the missing text and line breaks before building.
//
// What the visible fragments show:
//   * add(int n, float* x, float* y, float* z) is declared __global__. Each
//     thread computes a start index blockIdx.x * blockDim.x + threadIdx.x and a
//     thread count blockDim.x * gridDim.x, then enters a loop starting at that
//     index (the loop's condition, step and body are missing).
//   * Something is invoked with the arguments (n, xd, yd, zd); presumably this
//     is the launch of add() -- verify once the launch syntax is restored.
//   * The host code then calls cudaDeviceSynchronize(), copies `size` bytes
//     from zd to zh with cudaMemcpyDeviceToHost, and initialises maxError to
//     0.0f ahead of a comparison loop.
//   * None of the visible CUDA runtime calls has its return value checked.
#include // Kernel function to add the elements of two arrays __global__ void add(int n, float* x, float* y, float* z) { int index = blockIdx.x * blockDim.x + threadIdx.x; int numthreads = blockDim.x * gridDim.x; for(int i=index; i>>(n, xd, yd, zd); // Wait for GPU to finish. cudaDeviceSynchronize(); // Copy result array from device to host. cudaMemcpy(zh, zd, size, cudaMemcpyDeviceToHost); // Compare addition on GPU with addition on CPU float maxError = 0.0f; for (int i=0; i