#include <stdio.h>

#define ThreadsPerBlock 32
#define BlocksPerSM 20

#define SIZE 200

// The compiler has to limit the number of registers assigned to each thread
// so that at least BlocksPerSM blocks can be active on an SM.

// Example:
// With 20 blocks per SM, 32 threads per block and 64K registers per SM,
// each thread can have at most 102 registers (65536 / (20 * 32) = 102.4).

// spill: register-pressure demo kernel.
//
// Each thread fills a private array of SIZE ints with clock64() samples,
// adds them up and stores the total in array[threadIdx.x].
//
// Parameters:
//   array - device buffer that receives one int per thread; it must hold at
//           least blockDim.x elements.
//
// Launch layout: the output index is threadIdx.x only (blockIdx is not used),
// so every block of a multi-block launch would write the same elements.
//
// __launch_bounds__(ThreadsPerBlock, BlocksPerSM) declares the maximum block
// size and the minimum number of resident blocks per SM the kernel is built
// for, which is what bounds the registers available to each thread.
// NOTE(review): whether regs actually ends up in registers or is spilled is
// decided by the compiler - presumably meant to be inspected in the compiler's
// resource-usage report; confirm.
__launch_bounds__(ThreadsPerBlock, BlocksPerSM)
__global__ void spill(int *array) 
{
  // If SIZE is small enough, all elements of regs can be kept in registers.
  int regs[SIZE];
  int tid = threadIdx.x;
  int sum = 0;

  // Fill the array with clock samples narrowed to int.
  // Presumably run-time values are used so the contents cannot be folded
  // into constants at compile time - TODO confirm.
  for(int i=0; i<SIZE; i++) 
    regs[i] = (int)clock64();

  // Unrolling the loop makes each regs[i] index a compile-time constant,
  // which lets the compiler read the elements from registers instead of
  // addressing them in memory. The requested factor (1000) is larger than
  // SIZE, so the loop is presumably unrolled completely.
  #pragma unroll 1000
  for (int i=0; i<SIZE; i++) 
    sum += regs[i];

  // One result per thread, indexed by the thread's position in its block.
  array[tid] = sum;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;

  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Host driver: allocates one int per thread on the device, launches a single
// block of the spill kernel and copies the per-thread sums back to the host.
//
// Returns 0 on success and 1 if any CUDA call or the kernel launch failed
// (the failure is reported on stderr).
int main()
{
  int *arraydevice = NULL;
  int arrayhost[ThreadsPerBlock];
  const size_t bytes = ThreadsPerBlock * sizeof(int);

  // Nothing to clean up yet if the allocation itself fails.
  if (!cudaOk(cudaMalloc(&arraydevice, bytes), "cudaMalloc"))
    return 1;

  spill<<<1, ThreadsPerBlock>>>(arraydevice);

  // A kernel launch returns no status; launch errors must be queried explicitly.
  bool ok = cudaOk(cudaGetLastError(), "spill launch");

  // The blocking copy waits for the kernel to finish, so errors raised while
  // the kernel was running are reported here as well.
  ok = ok && cudaOk(cudaMemcpy(arrayhost, arraydevice, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy");

  // Always release the device buffer, even after an earlier failure.
  ok = cudaOk(cudaFree(arraydevice), "cudaFree") && ok;

  return ok ? 0 : 1;
}