diff --git a/unified-runtime/source/adapters/cuda/usm.cpp b/unified-runtime/source/adapters/cuda/usm.cpp
index c805c1084ec0f..04e052377336c 100644
--- a/unified-runtime/source/adapters/cuda/usm.cpp
+++ b/unified-runtime/source/adapters/cuda/usm.cpp
@@ -574,10 +574,22 @@ urUSMPoolTrimToExp(ur_context_handle_t hContext, ur_device_handle_t hDevice,
   return UR_RESULT_SUCCESS;
 }
 
-UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t,
-                                                          void *pDst,
-                                                          const void *pSrc,
-                                                          size_t Size) {
+UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(
+    ur_context_handle_t hContext, void *pDst, const void *pSrc, size_t Size) {
+  // cuMemcpy is not ordered against work already submitted to device
+  // streams, so drain outstanding work before copying. cuCtxSynchronize only
+  // waits on the context that is current on this thread, hence every device
+  // of the UR context is activated in turn rather than just the first one.
+  for (const auto &hDevice : hContext->getDevices()) {
+    ScopedContext Active(hDevice);
+    UR_CHECK_ERROR(cuCtxSynchronize());
+  }
+
+  // cuMemcpy needs a current context; activate the first device's context
+  // explicitly instead of relying on whatever the loop above left current.
+  ScopedContext Active(hContext->getDevices().front());
+
+  // Now perform the copy
   UR_CHECK_ERROR(cuMemcpy((CUdeviceptr)pDst, (CUdeviceptr)pSrc, Size));
   return UR_RESULT_SUCCESS;
 }
diff --git a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp
index 0c41b4fd9dff6..ec776bc0cbc7d 100644
--- a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp
+++ b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp
@@ -81,8 +81,6 @@ struct urUSMContextMemcpyExpTestDevice : urUSMContextMemcpyExpTest {
 UUR_INSTANTIATE_DEVICE_TEST_SUITE_MULTI_QUEUE(urUSMContextMemcpyExpTestDevice);
 
 TEST_P(urUSMContextMemcpyExpTestDevice, Success) {
-  // https://github.com/intel/llvm/issues/19688
-  UUR_KNOWN_FAILURE_ON(uur::CUDA{});
   ASSERT_SUCCESS(
       urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size));
   verifyData();