#include <stdio.h>

/*
 * Demonstrates communication between threads of one block through a
 * __shared__ variable.
 *
 * Launch layout: 1D block. Thread 0 is the writer and thread 1 is the
 * reader, so the block must contain at least 2 threads for any output to
 * be produced. Shared memory: one statically declared int per block.
 */
__global__ void mykernel()
{
    // One copy of x exists per block and is visible to every thread in it.
    __shared__ int x;

    const int tid = threadIdx.x;

    // Every thread stores the same initial value; shared memory starts
    // uninitialized, so it must be written before anyone reads it.
    x = 17;

    // Barrier: all initial stores are complete before thread 0 overwrites x.
    __syncthreads();

    // Thread 0 writes the shared variable.
    if (tid == 0) {
        x = 42;
    }

    // Barrier: thread 0's store is visible to the whole block before the
    // read below. Placed outside the branch so every thread reaches it.
    __syncthreads();

    // Thread 1 reads the value written by thread 0.
    if (tid == 1) {
        printf("thread 1 reads shared variable: %d\n", x);
    }
}

/*
 * Launches mykernel with 1 block of 2 threads and waits for it to finish.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution reported
 * a CUDA error (the error string is printed to stderr).
 */
int main()
{
    // Launch kernel with 1 block and 2 threads.
    mykernel<<<1, 2>>>();

    // A kernel launch returns nothing; configuration errors are only
    // observable through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Wait for the kernel to finish; this also surfaces errors raised
    // while it was executing.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}