#include <stdio.h>
#include <stdlib.h>
#include "driver_types.h"

// Macro to check a CUDA function call for runtime errors.
// Evaluates `call` exactly once. On failure it prints the file, the line and
// the CUDA error string to stderr and terminates the process.
#define CHECK(call)                                                        \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error at %s %d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(err_));                   \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Kernel function runs on device.
//
// Every thread computes a unique id from its block and thread index
// (blockDim.x * blockIdx.x + threadIdx.x), prints a trace line and adds that
// id to the array element with the same index.
//
// Precondition: arraydevice must hold at least gridDim.x * blockDim.x ints.
// The signature carries no length, so the kernel cannot bounds-check itself;
// the caller has to launch exactly as many threads as there are elements.
__global__ void mykernel(int * arraydevice)
{
    // Get unique thread id for each thread.
    const int threadid = blockDim.x * blockIdx.x + threadIdx.x;

    // Each thread adds its id in the array.
    printf("thread %d adds %d to arraydevice[%d]\n", threadid, threadid, threadid);
    arraydevice[threadid] += threadid;
}

// Host function.
//
// Allocates one int per launched thread on host and device, initialises the
// host copy, runs mykernel over it and prints the result copied back.
// Returns 0 on success; any CUDA failure terminates through CHECK.
int main(void)
{
    const int numblocks = 2;
    const int numthreadsperblock = 3;
    const int numthreads = numblocks * numthreadsperblock;
    const size_t size = numthreads * sizeof(int);

    // Allocate separate memory on host and device.
    int* arrayhost = (int*)malloc(size);
    if (arrayhost == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        return EXIT_FAILURE;
    }
    int* arraydevice = NULL;
    CHECK(cudaMalloc(&arraydevice, size));

    // Host writes memory.
    // NOTE(review): the loop body was lost in the damaged source; an initial
    // value of 0 is assumed here - confirm against the original file.
    for (int i = 0; i < numthreads; i++) {
        arrayhost[i] = 0;
    }

    // Copy memory from host to device.
    // NOTE(review): reconstructed step - the kernel does a read-modify-write,
    // so the device array must be initialised before the launch.
    CHECK(cudaMemcpy(arraydevice, arrayhost, size, cudaMemcpyHostToDevice));

    // Run kernel on device.
    // NOTE(review): launch configuration reconstructed from the variables
    // declared above; only ">>>(arraydevice);" survived in the source.
    mykernel<<<numblocks, numthreadsperblock>>>(arraydevice);
    // A launch does not return an error itself; pick up bad-configuration
    // failures here instead of at some later, unrelated call.
    CHECK(cudaGetLastError());

    // Wait for GPU to finish.
    CHECK(cudaDeviceSynchronize());

    // Copy memory from device to host.
    printf("\nCopy array from device to host\n\n");
    CHECK(cudaMemcpy(arrayhost, arraydevice, size, cudaMemcpyDeviceToHost));

    // Print array.
    // NOTE(review): the loop bound and format string were truncated in the
    // source; this output format is an assumption.
    for (int i = 0; i < numthreads; i++) {
        printf("arrayhost[%d] = %d\n", i, arrayhost[i]);
    }

    // Release device and host memory.
    CHECK(cudaFree(arraydevice));
    free(arrayhost);

    return 0;
}