#include <stdio.h>

// Kernel function runs on device.
//
// Takes no arguments and touches no memory; each thread prints the grid and
// block sizes together with its own block and thread index. Only the .x
// components are reported, so the output is meaningful for a 1D launch.
// Device-side printf is a debugging aid, not something for production kernels.
__global__ void mykernel()
{
    // Every thread prints its block and thread ID.
    printf("no. blocks in grid %d, no. threads in block %d, block/thread index %d/%d\n",
           gridDim.x, blockDim.x, blockIdx.x, threadIdx.x);
}

// Host function.
//
// Launches mykernel on a small 1D grid, waits for it to complete and reports
// any CUDA error. Returns 0 on success and 1 if the launch or the execution
// of the kernel failed.
int main(void)
{
    const int griddim = 2;   // number of blocks in the grid
    const int blockdim = 3;  // number of threads per block

    // Start threads: griddim blocks of blockdim threads each.
    mykernel<<<griddim, blockdim>>>();

    // A kernel launch returns nothing itself; an invalid launch configuration
    // is reported through the runtime's "last error" state, so query it now.
    cudaError_t err = cudaGetLastError();

    // Wait for GPU to finish. The return value carries errors raised while
    // the kernel was executing, and the synchronization point is also where
    // buffered device-side printf output gets flushed to the host.
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();

    // Error check.
    if (err != cudaSuccess) {
        printf("Cuda error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}