// NOTE(review): the header name after `#include` was lost in the original
// text; <stdio.h> is assumed. <stdlib.h> is needed for exit() in CUDA_CHECK.
#include <stdio.h>
#include <stdlib.h>

#define ThreadsPerBlock 32
#define BlocksPerSM 20
#define SIZE 200

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Register-pressure demo kernel.
//
// The compiler has to assign registers to threads such that at least
// BlocksPerSM blocks can be active on an SM.
// Example:
//   With 20 blocks per SM, 32 threads per block and 64K registers per SM,
//   each thread can have at most 102 registers (65536 / (20 * 32)).
//
// Launch layout: indexes with threadIdx.x only, so it expects a 1D block of
// at most ThreadsPerBlock threads. `array` must hold at least blockDim.x ints.
// No shared memory is used.
//
// NOTE(review): everything after `for(int i=0; i` was lost from the original
// kernel body. The two loops, the read of array[tid] and the final store are
// a minimal reconstruction, not the original statements. Confirm against the
// original source, in particular whether `regs` was indexed in a way that
// prevents the compiler from eliminating it.
__launch_bounds__(ThreadsPerBlock, BlocksPerSM)
__global__ void spill(int *array)
{
    // If SIZE is small, all elements of regs can be stored in registers.
    int regs[SIZE];
    int tid = threadIdx.x;
    int sum = 0;

    // Each thread touches only its own element, so there is no cross-thread
    // read/write hazard on `array`.
    int seed = array[tid];

    for (int i = 0; i < SIZE; i++) {
        regs[i] = seed + i;
    }
    for (int i = 0; i < SIZE; i++) {
        sum += regs[i];
    }

    array[tid] = sum;
}

// Host driver: runs one block of ThreadsPerBlock threads and copies the
// per-thread results back to the host.
//
// NOTE(review): the declarations of arrayhost/arraydevice, the allocation,
// the input initialisation/upload and the <<<grid, block>>> configuration
// were lost from the original; what follows up to the kernel launch is
// reconstructed. The device-to-host copy, cudaFree and return are original.
int main()
{
    int arrayhost[ThreadsPerBlock];
    int *arraydevice = NULL;
    const size_t bytes = ThreadsPerBlock * sizeof(int);

    for (int i = 0; i < ThreadsPerBlock; i++) {
        arrayhost[i] = i;
    }

    CUDA_CHECK(cudaMalloc(&arraydevice, bytes));
    CUDA_CHECK(cudaMemcpy(arraydevice, arrayhost, bytes,
                          cudaMemcpyHostToDevice));

    spill<<<1, ThreadsPerBlock>>>(arraydevice);
    // Kernel launches return no status; query launch-configuration errors here.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: also waits for the kernel and reports execution errors.
    CUDA_CHECK(cudaMemcpy(arrayhost, arraydevice, ThreadsPerBlock*sizeof(int),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(arraydevice));
    return 0;
}