// NOTE(review): this file appears to be extraction-damaged and is not compilable as it stands.
//   * The #include directive has no header name.
//   * The text between `for(int i=index; i` and `>>(arraydeviceA, length);` is missing, so the
//     kernel's loop condition/body, the definition of check(), the start of the host function,
//     the buffer/stream setup and the first kernel launch configuration are not visible here.
//   * Everything is on one physical line, so the first `//` comment below comments out the
//     rest of the line. Restore the original line breaks before building.
//
// What the visible code shows:
//   kernel(arraydevice, length): each thread computes its global index
//     (blockIdx.x * blockDim.x + threadIdx.x) and the total number of launched threads
//     (blockDim.x * gridDim.x), then starts a loop at that index. Presumably the loop advances
//     by numthreads while i < length -- TODO confirm against the original source.
//   Host side (tail only): for streamA and streamB the code enqueues a kernel launch followed by
//     an asynchronous device-to-host copy of `size` bytes on the same stream (the streamB
//     sequence also shows the preceding host-to-device copy and a launch of 3 blocks of 128
//     threads). It then synchronizes streamA and calls check(arrayhostA, length), synchronizes
//     streamB and calls check(arrayhostB, length), destroys both streams and returns 0.
//   NOTE(review): none of the visible CUDA runtime calls have their return codes checked, and no
//     cudaGetLastError() follows the launches -- worth adding once the file is restored.
#include __global__ void kernel(int * arraydevice, int length) { int index = blockIdx.x * blockDim.x + threadIdx.x; int numthreads = blockDim.x * gridDim.x; for(int i=index; i>>(arraydeviceA, length); cudaMemcpyAsync(arrayhostA, arraydeviceA, size, cudaMemcpyDeviceToHost, streamA); // Launch streamB. cudaMemcpyAsync(arraydeviceB, arrayhostB, size, cudaMemcpyHostToDevice, streamB); kernel<<<3, 128, 0, streamB>>>(arraydeviceB, length); cudaMemcpyAsync(arrayhostB, arraydeviceB, size, cudaMemcpyDeviceToHost, streamB); // Wait for streamA to finish. cudaStreamSynchronize(streamA); check(arrayhostA,length); // Wait for streamB to finish. cudaStreamSynchronize(streamB); check(arrayhostB,length); cudaStreamDestroy(streamA); cudaStreamDestroy(streamB); return 0; }