#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Evaluate a CUDA runtime call; on failure print the location and the CUDA
// error string to stderr and terminate the process with EXIT_FAILURE.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cudaCheckErr_ = (call);                                  \
        if (cudaCheckErr_ != cudaSuccess) {                                  \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(cudaCheckErr_));                      \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Busy-wait kernel: spins until the per-SM clock64() counter has advanced by
// at least `loops` ticks since the thread entered the kernel.
//
// Launched below as <<<1, 1>>>; it uses no shared memory and touches no
// global memory. It exists only to occupy the device for a measurable time.
//
// loops: number of clock64() ticks to spin for.
__global__ void mykernel(unsigned long loops) {
    // clock64() returns a 64-bit value; keep the arithmetic in 64-bit unsigned
    // so the start tick is not truncated where `unsigned long` is 32 bits.
    const unsigned long long start = static_cast<unsigned long long>(clock64());
    while (static_cast<unsigned long long>(clock64()) - start < loops) {
        // spin
    }
}

// Launches the busy-wait kernel once in each of two streams, then reports the
// elapsed time of each kernel and how long the two event-delimited intervals
// overlapped, all measured with CUDA events.
//
// Returns 0 on success; any CUDA failure terminates the process via
// CUDA_CHECK.
int main() {
    const unsigned long loops = 100000000;

    cudaStream_t streamA, streamB;
    CUDA_CHECK(cudaStreamCreate(&streamA));
    CUDA_CHECK(cudaStreamCreate(&streamB));

    cudaEvent_t startA, startB, endA, endB;
    CUDA_CHECK(cudaEventCreate(&startA));
    CUDA_CHECK(cudaEventCreate(&endA));
    CUDA_CHECK(cudaEventCreate(&startB));
    CUDA_CHECK(cudaEventCreate(&endB));

    // Create a "zero" event as a common reference time for both streams.
    cudaEvent_t zeroEvent;
    CUDA_CHECK(cudaEventCreate(&zeroEvent));
    CUDA_CHECK(cudaEventRecord(zeroEvent, 0));
    CUDA_CHECK(cudaEventSynchronize(zeroEvent));

    // Launch the first kernel in streamA, bracketed by its start/end events.
    CUDA_CHECK(cudaEventRecord(startA, streamA));
    mykernel<<<1, 1, 0, streamA>>>(loops);
    CUDA_CHECK(cudaGetLastError());  // surfaces launch-configuration errors
    CUDA_CHECK(cudaEventRecord(endA, streamA));

    // Launch the second kernel in streamB, bracketed by its start/end events.
    CUDA_CHECK(cudaEventRecord(startB, streamB));
    mykernel<<<1, 1, 0, streamB>>>(loops);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(endB, streamB));

    // Wait for both streams to finish so every recorded event has completed
    // before cudaEventElapsedTime is queried.
    CUDA_CHECK(cudaStreamSynchronize(streamA));
    CUDA_CHECK(cudaStreamSynchronize(streamB));

    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, startA, endA));
    printf("time for stream A: %f\n", ms);
    CUDA_CHECK(cudaEventElapsedTime(&ms, startB, endB));
    printf("time for stream B: %f\n", ms);

    // Express every event as an offset (in ms) from the zero event, then
    // intersect the intervals [tA_start, tA_end] and [tB_start, tB_end].
    float tA_start = 0.0f, tA_end = 0.0f, tB_start = 0.0f, tB_end = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&tA_start, zeroEvent, startA));
    CUDA_CHECK(cudaEventElapsedTime(&tA_end, zeroEvent, endA));
    CUDA_CHECK(cudaEventElapsedTime(&tB_start, zeroEvent, startB));
    CUDA_CHECK(cudaEventElapsedTime(&tB_end, zeroEvent, endB));

    // Clamp at zero: disjoint intervals yield a negative difference.
    ms = fmaxf(0.0f, fminf(tA_end, tB_end) - fmaxf(tA_start, tB_start));
    printf("overlap time = %f\n", ms);

    // Release every event and stream created above.
    CUDA_CHECK(cudaEventDestroy(zeroEvent));
    CUDA_CHECK(cudaEventDestroy(startA));
    CUDA_CHECK(cudaEventDestroy(endA));
    CUDA_CHECK(cudaEventDestroy(startB));
    CUDA_CHECK(cudaEventDestroy(endB));
    CUDA_CHECK(cudaStreamDestroy(streamA));
    CUDA_CHECK(cudaStreamDestroy(streamB));
    return 0;
}