#include <stdio.h>
#include <stdlib.h>

// Abort with a readable message if a CUDA runtime call fails.
// Without this, a failed allocation or copy is silently ignored and the
// program prints garbage later on.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Kernel function runs on device.
//
// Each thread computes a flat 1D id from its block and thread index and adds
// that id to the array element with the same index.
//
// Launch layout: 1D grid of 1D blocks. There is no bounds check, so
// arraydevice must hold at least gridDim.x * blockDim.x ints.
// The device-side printf is for demonstration/debugging only.
__global__ void mykernel(int* arraydevice)
{
    // Get unique thread id for each thread.
    int numthreadsperblock = blockDim.x;
    int blockindex = blockIdx.x;
    int threadindex = threadIdx.x;
    int threadid = numthreadsperblock * blockindex + threadindex;

    // Each thread adds its id in the array.
    printf("thread %d adds %d to arraydevice[%d]\n", threadid, threadid, threadid);
    arraydevice[threadid] += threadid;
}

// Host function.
//
// Allocates one int per GPU thread on host and device, initialises the host
// copy, runs mykernel over it and prints the result copied back from the
// device. Returns 0 on success; exits with EXIT_FAILURE on any error.
int main(void)
{
    const int numblocks = 2;
    const int numthreadsperblock = 3;
    const int numthreads = numblocks * numthreadsperblock;
    const size_t size = numthreads * sizeof(int);

    int* arrayhost = NULL;
    int* arraydevice = NULL;

    // Allocate separate memory on host and device.
    arrayhost = (int*)malloc(size);
    if (arrayhost == NULL) {
        fprintf(stderr, "malloc of %zu bytes failed\n", size);
        return EXIT_FAILURE;
    }
    CUDA_CHECK(cudaMalloc(&arraydevice, size));

    // Host writes memory.
    // NOTE(review): the loop body was lost from the source under review; the
    // initial value 0 is an assumption -- confirm against the original file.
    for (int i = 0; i < numthreads; i++) {
        arrayhost[i] = 0;
    }

    // Copy memory from host to device.
    // NOTE(review): this copy and the launch configuration below were also
    // lost from the source and are reconstructed from the surrounding
    // variables (numblocks, numthreadsperblock) -- confirm.
    CUDA_CHECK(cudaMemcpy(arraydevice, arrayhost, size, cudaMemcpyHostToDevice));

    // Run kernel: one thread per array element.
    mykernel<<<numblocks, numthreadsperblock>>>(arraydevice);
    CUDA_CHECK(cudaGetLastError());  // Launch-configuration errors surface here.

    // Wait for GPU to finish.
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy memory from device to host.
    printf("\nCopy array from device to host\n\n");
    CUDA_CHECK(cudaMemcpy(arrayhost, arraydevice, size, cudaMemcpyDeviceToHost));

    // Print array.
    // NOTE(review): the original loop was truncated after "for(int i=0; i";
    // the output format below is an assumption -- confirm.
    for (int i = 0; i < numthreads; i++) {
        printf("arrayhost[%d] = %d\n", i, arrayhost[i]);
    }

    // Release device and host memory.
    CUDA_CHECK(cudaFree(arraydevice));
    free(arrayhost);

    return 0;
}