#include <stdio.h>
#include <stdlib.h>

// Abort with a diagnostic if a CUDA runtime call fails. Kernel launches are
// checked separately via cudaGetLastError() since they return no status.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Kernel: each thread writes its flat global id into array[id].
// Expects a 1-D grid of 1-D blocks; n is the element count of array.
// The bounds guard makes the kernel correct for any launch configuration,
// not only one whose thread count exactly matches n.
__global__ void mykernel(int *array, int n)
{
    // Flat global thread id across the 1-D grid.
    int threadid = blockDim.x * blockIdx.x + threadIdx.x;
    if (threadid < n) {
        // Device-side printf is for demonstration/debugging only (serialized, slow).
        printf("thread %d writes %d in array[%d]\n", threadid, threadid, threadid);
        array[threadid] = threadid;
    }
}

// Host function: allocates unified memory, launches the kernel, and prints
// the result from the host after synchronizing.
int main(void)
{
    int numblocks = 2;
    int numthreadsperblock = 3;
    int numthreads = numblocks * numthreadsperblock;

    int *array;
    // Allocate Unified Memory -- accessible from CPU or GPU.
    CUDA_CHECK(cudaMallocManaged(&array, numthreads * sizeof(int)));

    // Optional: prefetch to device 0 so the kernel does not page-fault on
    // first touch. (Assumes device 0 is the target; stream 0 is used here.)
    CUDA_CHECK(cudaMemPrefetchAsync(array, numthreads * sizeof(int), 0, 0));

    // Start threads: one thread per array element.
    printf("Device writes memory.\n");
    mykernel<<<numblocks, numthreadsperblock>>>(array, numthreads);
    CUDA_CHECK(cudaGetLastError());  // catch launch-configuration errors

    // Wait for GPU to finish before the host reads unified memory.
    CUDA_CHECK(cudaDeviceSynchronize());

    // Print array from the host.
    printf("\nHost reads memory.\n");
    for (int i = 0; i < numthreads; i++) {
        printf("array[%d] = %d\n", i, array[i]);
    }

    // Release the unified allocation.
    CUDA_CHECK(cudaFree(array));
    return 0;
}