%%writefile test.cu
#include <stdio.h>

// Kernel function runs on GPU.
// Each thread derives its flat global index from its block and thread
// coordinates along x and prints it with device-side printf.
// Takes no arguments and touches no memory, so it needs no bounds guard.
__global__ void mykernel()
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    printf("%d\n", index);
}

// Launches mykernel on a 2 x 3 configuration (6 threads, indices 0..5),
// waits for it to finish, and reports any CUDA error.
// Returns 0 on success and 1 if the launch or the execution failed.
int main(void)
{
    mykernel<<<2, 3>>>();  // 2 blocks, 3 threads per block

    // A kernel launch returns nothing itself; configuration/launch errors
    // are picked up here, right after the launch.
    cudaError_t err = cudaGetLastError();

    // Wait for GPU to finish. The return value carries errors raised while
    // the kernel was executing, so it must not be discarded.
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }

    if (err != cudaSuccess) {
        // Report on stderr so the message is not mixed into the kernel's
        // stdout output, and signal the failure through the exit status.
        fprintf(stderr, "%s: %s\n", cudaGetErrorName(err), cudaGetErrorString(err));
        return 1;
    }

    return 0;
}