From b0569a440b0cf45b567ba21baf67cfe89178a535 Mon Sep 17 00:00:00 2001 From: Katarzyna Kaczmarska Date: Mon, 23 Mar 2026 21:55:19 +0100 Subject: [PATCH] [UR][CUDA] Fix urUSMContextMemcpyExp synchronization issue cuMemcpy is synchronous with respect to the host, but it does not synchronize with device operations in other streams. This can lead to race conditions where urUSMContextMemcpyExp reads stale data if there are pending operations on the source or destination buffers. The issue manifests as sporadic test failures in CI where host_mem reads as 0 instead of the expected value (42), indicating the copy happened before the fill operations completed. Fix: Make the CUDA context of the UR context's first device current and call cuCtxSynchronize() before cuMemcpy, so that all pending operations in that CUDA context have completed before the copy starts. This guarantees data consistency for work submitted to that device at the cost of a context-wide synchronization; work still pending on other devices of a multi-device UR context is not waited on. Since urUSMContextMemcpyExp is not performance-critical and should provide strong consistency guarantees, this trade-off is acceptable. Fixes #19688 Test: exp_usm_context_memcpy/urUSMContextMemcpyExpTestDevice.Success now passes consistently on CUDA. 
--- unified-runtime/source/adapters/cuda/usm.cpp | 16 ++++++++++++---- .../urUSMContextMemcpyExp.cpp | 2 -- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/unified-runtime/source/adapters/cuda/usm.cpp b/unified-runtime/source/adapters/cuda/usm.cpp index c805c1084ec0..04e052377336 100644 --- a/unified-runtime/source/adapters/cuda/usm.cpp +++ b/unified-runtime/source/adapters/cuda/usm.cpp @@ -574,10 +574,18 @@ urUSMPoolTrimToExp(ur_context_handle_t hContext, ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t, - void *pDst, - const void *pSrc, - size_t Size) { +UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp( + ur_context_handle_t hContext, void *pDst, const void *pSrc, size_t Size) { + // cuMemcpy is synchronous with respect to the host, but it does not + // synchronize with any device streams. We need to synchronize all streams + // in the context before performing the copy to ensure all previous + // operations have completed. 
+ // + // Set the context and synchronize all streams + ScopedContext Active(hContext->getDevices().front()); + UR_CHECK_ERROR(cuCtxSynchronize()); + + // Now perform the synchronous copy UR_CHECK_ERROR(cuMemcpy((CUdeviceptr)pDst, (CUdeviceptr)pSrc, Size)); return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp index 0c41b4fd9dff..ec776bc0cbc7 100644 --- a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp +++ b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp @@ -81,8 +81,6 @@ struct urUSMContextMemcpyExpTestDevice : urUSMContextMemcpyExpTest { UUR_INSTANTIATE_DEVICE_TEST_SUITE_MULTI_QUEUE(urUSMContextMemcpyExpTestDevice); TEST_P(urUSMContextMemcpyExpTestDevice, Success) { - // https://github.com/intel/llvm/issues/19688 - UUR_KNOWN_FAILURE_ON(uur::CUDA{}); ASSERT_SUCCESS( urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size)); verifyData();