#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

// Aborts the program with a file/line diagnostic when a CUDA runtime call
// does not return cudaSuccess. Wrapping every call matters because an
// unnoticed earlier failure makes later calls fail in confusing ways.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cudaCheckErr_ = (call);                                  \
        if (cudaCheckErr_ != cudaSuccess) {                                  \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,         \
                         __LINE__, cudaGetErrorString(cudaCheckErr_));       \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)

// Placeholder kernels: each has an empty body and takes no arguments, so any
// launch configuration is acceptable and no shared memory is required.
__global__ void kernel1() {}

__global__ void kernel2() {}

__global__ void kernel3() {}

// Demonstrates issuing work on two non-default streams:
//   - kernel1 then kernel2 are queued on streamA, so they run in issue order;
//   - kernel3 is queued on streamB and is allowed to overlap with the
//     streamA work (whether it actually does depends on the device).
// Returns 0 on success; exits with EXIT_FAILURE on the first CUDA error.
int main() {
    // Launch configuration shared by all three kernels.
    constexpr int kBlocks = 32;
    constexpr int kThreadsPerBlock = 256;
    constexpr size_t kDynamicSmemBytes = 0;

    cudaStream_t streamA;
    cudaStream_t streamB;
    CUDA_CHECK(cudaStreamCreate(&streamA));
    CUDA_CHECK(cudaStreamCreate(&streamB));

    // Launch kernel1 and kernel2 in streamA. Kernel launches do not return a
    // status themselves, so query cudaGetLastError() after each one to catch
    // launch-configuration errors.
    kernel1<<<kBlocks, kThreadsPerBlock, kDynamicSmemBytes, streamA>>>();
    CUDA_CHECK(cudaGetLastError());
    kernel2<<<kBlocks, kThreadsPerBlock, kDynamicSmemBytes, streamA>>>();
    CUDA_CHECK(cudaGetLastError());

    // Launch kernel3 in streamB.
    kernel3<<<kBlocks, kThreadsPerBlock, kDynamicSmemBytes, streamB>>>();
    CUDA_CHECK(cudaGetLastError());

    // Wait for both streams to finish. Errors raised while the kernels were
    // executing are reported by these synchronizing calls.
    CUDA_CHECK(cudaStreamSynchronize(streamA));
    CUDA_CHECK(cudaStreamSynchronize(streamB));

    CUDA_CHECK(cudaStreamDestroy(streamA));
    CUDA_CHECK(cudaStreamDestroy(streamB));
    return 0;
}