From ae554e7873f1e2516341300db5c5bf8ff843209d Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Wed, 4 Feb 2026 10:08:45 -0700 Subject: [PATCH 01/24] datadeps: Fix KeyError on key ainfo in AliasedObjectCache --- src/datadeps/aliasing.jl | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/datadeps/aliasing.jl b/src/datadeps/aliasing.jl index 64ce11be5..2f26fa3b0 100644 --- a/src/datadeps/aliasing.jl +++ b/src/datadeps/aliasing.jl @@ -263,8 +263,11 @@ function is_stored(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::Ab key = cache.derived[ainfo] return key in cache.stored[space] end -function is_key_present(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing) - return haskey(cache.derived, ainfo) +function is_key_stored(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing) + if !haskey(cache.stored, space) + return false + end + return ainfo in cache.stored[space] end function get_stored(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing) @assert is_stored(cache, space, ainfo) "Cache does not have derived ainfo $ainfo" @@ -302,13 +305,13 @@ function is_stored(cache::AliasedObjectCache, ainfo::AbstractAliasing) cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore return is_stored(cache_raw, cache.space, ainfo) end -function is_key_present(cache::AliasedObjectCache, space::MemorySpace, ainfo::AbstractAliasing) +function is_key_stored(cache::AliasedObjectCache, space::MemorySpace, ainfo::AbstractAliasing) wid = root_worker_id(cache.chunk) if wid != myid() - return remotecall_fetch(is_key_present, wid, cache, space, ainfo) + return remotecall_fetch(is_key_stored, wid, cache, space, ainfo) end cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore - return is_key_present(cache_raw, space, ainfo) + return is_key_stored(cache_raw, space, ainfo) end function get_stored(cache::AliasedObjectCache, ainfo::AbstractAliasing) wid = 
root_worker_id(cache.chunk) @@ -337,12 +340,8 @@ function set_key_stored!(cache::AliasedObjectCache, space::MemorySpace, ainfo::A end function aliased_object!(f, cache::AliasedObjectCache, x; ainfo=aliasing(x, identity)) x_space = memory_space(x) - if !is_key_present(cache, x_space, ainfo) - # Preserve the object's memory-space/processor pairing when inserting - # the source key. Using bare `tochunk(x)` defaults to OSProc, which can - # incorrectly wrap GPU-backed objects as CPU chunks. - x_chunk = x isa Chunk ? x : tochunk(x, first(processors(x_space))) - set_key_stored!(cache, x_space, ainfo, x_chunk) + if !is_key_stored(cache, x_space, ainfo) + set_key_stored!(cache, x_space, ainfo, x isa Chunk ? x : tochunk(x)) end if is_stored(cache, ainfo) return get_stored(cache, ainfo) From ce75198cd7734e09bf40e3fb34ec6d24fb1f0075 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Thu, 12 Feb 2026 10:29:08 -0700 Subject: [PATCH 02/24] fixup! datadeps: span_end must be inclusive --- src/utils/interval_tree.jl | 53 ++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/src/utils/interval_tree.jl b/src/utils/interval_tree.jl index 7dc59532e..1c2b3a7f6 100644 --- a/src/utils/interval_tree.jl +++ b/src/utils/interval_tree.jl @@ -195,11 +195,44 @@ function Base.delete!(tree::IntervalTree{M,E}, span::M) where {M,E} parent_of_succ.right = replacement end - # Update max_end bottom-up for the successor's original path - update_max_end!(parent_of_succ) - for i in length(succ_path)-1:-1:1 - update_max_end!(succ_path[i]) + target.span = successor.span + replacement = target + end + + # Phase 3: Handle overlap case - add remaining portions + if target_type == :overlap + original_start = span_start(original_span) + original_end = span_end(original_span) + del_start = span_start(span) + del_end = span_end(span) + verify_span(span) + + # Left portion: exists if original starts before deleted span + if original_start < del_start + left_end = 
min(original_end, del_start - _span_one(del_start)) + if left_end >= original_start + left_span = M(original_start, left_end - original_start + _span_one(left_end)) + if !isempty(left_span) + replacement = insert_node!(replacement, left_span) + end + end end + + # Right portion: exists if original extends beyond deleted span + if original_end > del_end + right_start = max(original_start, del_end + _span_one(del_end)) + if original_end >= right_start + right_span = M(right_start, original_end - right_start + _span_one(original_end)) + if !isempty(right_span) + replacement = insert_node!(replacement, right_span) + end + end + end + end + + # Phase 4: Update parent's child pointer + if isempty(path) + root = replacement else # Zero or one child replacement = target.left !== nothing ? target.left : target.right @@ -259,16 +292,14 @@ function find_overlapping!(node::IntervalNode{M,E}, query::M, result::Vector{M}; end end - # Search left subtree if its max_end is at least query_start + # Enqueue left subtree if it might contain overlapping intervals if current.left !== nothing && current.left.max_end >= span_start(query) - push!(stack, current.left) + push!(queue, current.left) end - # Search right subtree if it could contain an overlap - if current.right !== nothing && - span_start(current.span) <= span_end(query) && - current.right.max_end >= span_start(query) - push!(stack, current.right) + # Enqueue right subtree if query extends beyond current node's start + if current.right !== nothing && span_end(query) >= span_start(current.span) + push!(queue, current.right) end end end From bec6280893bee1ba5c42da3b21411352a5cd23ca Mon Sep 17 00:00:00 2001 From: Akhil Akkapelli Date: Mon, 11 Aug 2025 03:11:48 +0530 Subject: [PATCH 03/24] Add linear algebra functions for matrix inversion 'inv' and triangular solve `ldiv!` --- src/Dagger.jl | 2 +- src/array/linalg.jl | 70 ++++++--------------------------------------- src/array/trsm.jl | 34 +++++++++++----------- 3 files changed, 
27 insertions(+), 79 deletions(-) diff --git a/src/Dagger.jl b/src/Dagger.jl index b29254d5d..5fd1a929e 100644 --- a/src/Dagger.jl +++ b/src/Dagger.jl @@ -10,7 +10,7 @@ import MemPool: DRef, FileRef, poolget, poolset import Base: collect, reduce, view import NextLA import LinearAlgebra -import LinearAlgebra: Adjoint, BLAS, Diagonal, Bidiagonal, Tridiagonal, LAPACK, LU, LowerTriangular, PosDefException, Transpose, UpperTriangular, UnitLowerTriangular, UnitUpperTriangular, Cholesky, diagind, ishermitian, issymmetric, I +import LinearAlgebra: Adjoint, BLAS, Diagonal, Bidiagonal, Tridiagonal, LAPACK, LU, LowerTriangular, PosDefException, Transpose, UpperTriangular, UnitLowerTriangular, UnitUpperTriangular, diagind, ishermitian, issymmetric, I import Random import Random: AbstractRNG diff --git a/src/array/linalg.jl b/src/array/linalg.jl index 3bf8e20e0..bddc781df 100644 --- a/src/array/linalg.jl +++ b/src/array/linalg.jl @@ -92,23 +92,9 @@ function LinearAlgebra.ishermitian(A::DArray{T,2}) where T return all(fetch, to_check) end -function LinearAlgebra.LAPACK.chkfinite(A::DArray) - Ac = A.chunks - chunk_finite = [Ref(true) for _ in Ac] - chkfinite!(finite, A) = finite[] = LinearAlgebra.LAPACK.chkfinite(A) - Dagger.spawn_datadeps() do - for idx in eachindex(Ac) - Dagger.@spawn chkfinite!(Out(chunk_finite[idx]), In(Ac[idx])) - end - end - return all(getindex, chunk_finite) -end - DMatrix{T}(::LinearAlgebra.UniformScaling, m::Int, n::Int, IBlocks::Blocks) where T = DMatrix(Matrix{T}(I, m, n), IBlocks) -DMatrix(::LinearAlgebra.UniformScaling{T}, m::Int, n::Int, IBlocks::Blocks) where T = DMatrix(Matrix{T}(I, m, n), IBlocks) DMatrix{T}(::LinearAlgebra.UniformScaling, size::Tuple, IBlocks::Blocks) where T = DMatrix(Matrix{T}(I, size), IBlocks) -DMatrix(::LinearAlgebra.UniformScaling{T}, size::Tuple, IBlocks::Blocks) where T = DMatrix(Matrix{T}(I, size), IBlocks) function LinearAlgebra.inv(F::LU{T,<:DMatrix}) where T n = size(F, 1) @@ -161,9 +147,7 @@ end function 
LinearAlgebra.ldiv!(A::LU{<:Any,<:DMatrix}, B::AbstractVecOrMat) - allowscalar(true) do - LinearAlgebra._apply_ipiv_rows!(A, B) - end + LinearAlgebra._apply_ipiv_rows!(A, B) LinearAlgebra.ldiv!(UnitLowerTriangular(A.factors), B) LinearAlgebra.ldiv!(UpperTriangular(A.factors), B) end @@ -179,64 +163,26 @@ function LinearAlgebra.ldiv!(A::Union{LowerTriangular{<:Any,<:DMatrix},UnitLower uplo = 'L' end - dB = B isa DVecOrMat ? B : (B isa AbstractMatrix ? view(B, A.data.partitioning) : view(B, AutoBlocks())) + dB = B isa DVecOrMat ? B : view(B, A.data.partitioning) - parent_A = parent(A) if isa(B, AbstractVector) - min_bsa = min(min(parent_A.partitioning.blocksize...), dB.partitioning.blocksize[1]) - Dagger.maybe_copy_buffered(parent_A => Blocks(min_bsa, min_bsa), dB=>Blocks(min_bsa)) do parent_A, dB - Dagger.trsv!(uplo, trans, diag, alpha, parent_A, dB) - end + Dagger.trsv!(uplo, trans, diag, alpha, A.data, dB) elseif isa(B, AbstractMatrix) - min_bsa = min(parent_A.partitioning.blocksize...) - Dagger.maybe_copy_buffered(parent_A => Blocks(min_bsa, min_bsa), dB=>Blocks(min_bsa, min_bsa)) do parent_A, dB - Dagger.trsm!('L', uplo, trans, diag, alpha, parent_A, dB) + min_bsa = min(A.data.partitioning.blocksize...) 
+ Dagger.maybe_copy_buffered(A.data => Blocks(min_bsa, min_bsa), dB=>Blocks(min_bsa, min_bsa)) do A, dB + Dagger.trsm!('L', uplo, trans, diag, alpha, A, dB) end end end -function LinearAlgebra.ldiv!(Y::DArray, A::DMatrix, B::DArray) +function LinearAlgebra.ldiv!(Y::DArray, A::DMatrix, B::DArray) LinearAlgebra.ldiv!(A, copyto!(Y, B)) end -function LinearAlgebra.ldiv!(A::DMatrix, B::DArray) +function LinearAlgebra.ldiv!(A::DMatrix, B::DArray) LinearAlgebra.ldiv!(LinearAlgebra.lu(A), B) end function LinearAlgebra.ldiv!(C::DVecOrMat, A::Union{LowerTriangular{<:Any,<:DMatrix},UnitLowerTriangular{<:Any,<:DMatrix},UpperTriangular{<:Any,<:DMatrix},UnitUpperTriangular{<:Any,<:DMatrix}}, B::DVecOrMat) LinearAlgebra.ldiv!(A, copyto!(C, B)) -end - -function LinearAlgebra.ldiv!(C::Cholesky{T,<:DMatrix}, B::DVecOrMat) where T - # Solve directly with C.factors and the trans parameter to avoid - # C.L / C.U which use copy(adjoint(factors)) — that creates a DMatrix - # with inconsistent block metadata vs chunk layout, breaking darray_copyto!. - factors = C.factors - alpha = one(T) - iscomplex = T <: Complex - trans = iscomplex ? 'C' : 'T' # conjugate transpose for complex, plain transpose for real - - parent_A = factors - dB = B isa DVecOrMat ? B : (B isa AbstractMatrix ? view(B, factors.partitioning) : view(B, AutoBlocks())) - min_bsa = min(parent_A.partitioning.blocksize...) 
- - if C.uplo == 'U' - # A = U'U → solve U'y = B, then Ux = y - maybe_copy_buffered(parent_A => Blocks(min_bsa, min_bsa), dB => Blocks(min_bsa, min_bsa)) do pA, pB - Dagger.trsm!('L', 'U', trans, 'N', alpha, pA, pB) - end - maybe_copy_buffered(parent_A => Blocks(min_bsa, min_bsa), dB => Blocks(min_bsa, min_bsa)) do pA, pB - Dagger.trsm!('L', 'U', 'N', 'N', alpha, pA, pB) - end - else - # A = LL' → solve Ly = B, then L'x = y - maybe_copy_buffered(parent_A => Blocks(min_bsa, min_bsa), dB => Blocks(min_bsa, min_bsa)) do pA, pB - Dagger.trsm!('L', 'L', 'N', 'N', alpha, pA, pB) - end - maybe_copy_buffered(parent_A => Blocks(min_bsa, min_bsa), dB => Blocks(min_bsa, min_bsa)) do pA, pB - Dagger.trsm!('L', 'L', trans, 'N', alpha, pA, pB) - end - end - - return B end \ No newline at end of file diff --git a/src/array/trsm.jl b/src/array/trsm.jl index 4535a3cb6..6044d9045 100644 --- a/src/array/trsm.jl +++ b/src/array/trsm.jl @@ -1,4 +1,5 @@ -function trsv!(uplo::Char, trans::Char, diag::Char, alpha::T, A::DMatrix{T}, B::DVector{T}) where T +function trsv!(uplo::Char, trans::Char, diag::Char, alpha::T, A::DArray{T,2}, B::AbstractArray{T,1}) where T + zone = one(T) mzone = -one(T) @@ -23,12 +24,12 @@ function trsv!(uplo::Char, trans::Char, diag::Char, alpha::T, A::DMatrix{T}, B:: Dagger.@spawn BLAS.gemv!('N', mzone, In(Ac[i, k]), In(Bc[k]), lalpha, InOut(Bc[i])) end end - elseif trans == 'T' || trans == 'C' + elseif trans == 'T' for k in 1:Bnt lalpha = (k == 1) ? 
alpha : zone - Dagger.@spawn BLAS.trsv!('U', trans, diag, In(Ac[k, k]), InOut(Bc[k])) + Dagger.@spawn BLAS.trsv!('U', 'T', diag, In(Ac[k, k]), InOut(Bc[k])) for i in k+1:Bnt - Dagger.@spawn BLAS.gemv!(trans, mzone, In(Ac[k, i]), In(Bc[i]), lalpha, InOut(Bc[k])) + Dagger.@spawn BLAS.gemv!('T', mzone, In(Ac[k, i]), In(Bc[i]), lalpha, InOut(Bc[k])) end end end @@ -41,12 +42,12 @@ function trsv!(uplo::Char, trans::Char, diag::Char, alpha::T, A::DMatrix{T}, B:: Dagger.@spawn BLAS.gemv!('N', mzone, In(Ac[i, k]), In(Bc[k]), lalpha, InOut(Bc[i])) end end - elseif trans == 'T' || trans == 'C' + elseif trans == 'T' for k in reverse(1:Bnt) lalpha = (k == Bnt) ? alpha : zone - Dagger.@spawn BLAS.trsv!('L', trans, diag, In(Ac[k, k]), InOut(Bc[k])) + Dagger.@spawn BLAS.trsv!('L', 'T', diag, In(Ac[k, k]), InOut(Bc[k])) for i in 1:k-1 - Dagger.@spawn BLAS.gemv!(trans, mzone, In(Ac[k, i]), In(Bc[i]), lalpha, InOut(Bc[k])) + Dagger.@spawn BLAS.gemv!('T', mzone, In(Ac[k, i]), In(Bc[i]), lalpha, InOut(Bc[k])) end end end @@ -56,7 +57,8 @@ function trsv!(uplo::Char, trans::Char, diag::Char, alpha::T, A::DMatrix{T}, B:: end -function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DMatrix{T}, B::DVecOrMat{T}) where T +function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DArray{T,2}, B::DArray{T,2}) where T + zone = one(T) mzone = -one(T) @@ -100,7 +102,7 @@ function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DMa end end end - elseif trans == 'T' || trans == 'C' + elseif trans == 'T' for k in range(1, Bmt) lalpha = k == 1 ? 
alpha : zone; for n in range(1, Bnt) @@ -108,7 +110,7 @@ function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DMa end for m in range(k+1, Bmt) for n in range(1, Bnt) - Dagger.@spawn BLAS.gemm!(trans, 'N', mzone, In(Ac[k, m]), In(Bc[k, n]), lalpha, InOut(Bc[m, n])) + Dagger.@spawn BLAS.gemm!('T', 'N', mzone, In(Ac[k, m]), In(Bc[k, n]), lalpha, InOut(Bc[m, n])) end end end @@ -126,7 +128,7 @@ function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DMa end end end - elseif trans == 'T' || trans == 'C' + elseif trans == 'T' for k in range(1, Bmt) lalpha = k == 1 ? alpha : zone; for n in range(1, Bnt) @@ -134,7 +136,7 @@ function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DMa end for m in range(k+1, Bmt) for n in range(1, Bnt) - Dagger.@spawn BLAS.gemm!(trans, 'N', mzone, In(Ac[(Bmt-k)+1, (Bmt-m)+1]), In(Bc[(Bmt-k)+1, n]), lalpha, InOut(Bc[(Bmt-m)+1, n])) + Dagger.@spawn BLAS.gemm!('T', 'N', mzone, In(Ac[(Bmt-k)+1, (Bmt-m)+1]), In(Bc[(Bmt-k)+1, n]), lalpha, InOut(Bc[(Bmt-m)+1, n])) end end end @@ -154,12 +156,12 @@ function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DMa end end end - elseif trans == 'T' || trans == 'C' + elseif trans == 'T' for k in range(1, Bnt) for m in range(1, Bmt) Dagger.@spawn BLAS.trsm!(side, uplo, trans, diag, alpha, In(Ac[(Bnt-k)+1, (Bnt-k)+1]), InOut(Bc[m, (Bnt-k)+1])) for n in range(k+1, Bnt) - Dagger.@spawn BLAS.gemm!('N', trans, mzone, In(Bc[m, (Bnt-k)+1]), In(Ac[(Bnt-n)+1, (Bnt-k)+1]), zone, InOut(Bc[m, (Bnt-n)+1])) + Dagger.@spawn BLAS.gemm!('N', 'T', minvalpha, In(B[m, (Bnt-k)+1]), In(Ac[(Bnt-n)+1, (Bnt-k)+1]), zone, InOut(Bc[m, (Bnt-n)+1])) end end end @@ -175,12 +177,12 @@ function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DMa end end end - elseif trans == 'T' || trans == 'C' + elseif trans == 'T' for k in range(1, Bnt) for m in range(1, Bmt) Dagger.@spawn BLAS.trsm!(side, uplo, trans, diag, alpha, In(Ac[k, k]), 
InOut(Bc[m, k])) for n in range(k+1, Bnt) - Dagger.@spawn BLAS.gemm!('N', trans, mzone, In(Bc[m, k]), In(Ac[n, k]), zone, InOut(Bc[m, n])) + Dagger.@spawn BLAS.gemm!('N', 'T', minvalpha, In(Bc[m, k]), In(Ac[n, k]), zone, InOut(Bc[m, n])) end end end From 9a6b0973e337004b217c9e458506b122abb12310 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Thu, 14 Aug 2025 21:27:16 +0000 Subject: [PATCH 04/24] TEMP: Add demo/trace stuff --- demo.jl | 55 ++++++++++++++++ filter-traces.jl | 168 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 223 insertions(+) create mode 100644 demo.jl create mode 100644 filter-traces.jl diff --git a/demo.jl b/demo.jl new file mode 100644 index 000000000..0c9ef9e0c --- /dev/null +++ b/demo.jl @@ -0,0 +1,55 @@ +begin +using Revise +using Dagger +using LinearAlgebra + +using Profile +include("filter-traces.jl") +end + +function demo(pivot=RowMaximum()) + fetch(Dagger.@spawn 1+1) + + N = 2000 + nt = Threads.nthreads() + #bs = cld(N, np) + bs = div(N, 4) + println("OpenBLAS Initialization:") + GC.enable(false) + A = @time rand(N, N) + GC.enable(true) + println("Dagger Initialization:") + GC.enable(false) + @time begin + DA = DArray(A, Blocks(bs, bs)) + wait.(DA.chunks) + end + GC.enable(true) + + println("OpenBLAS:") + BLAS.set_num_threads(nt) + lu_A = @time lu(A, pivot; check=false) + println("Dagger:") + BLAS.set_num_threads(1) + GC.enable(false) + lu_DA = @time lu(DA, pivot; check=false) + GC.enable(true) + + Profile.@profile 1+1 + Profile.clear() + println("Dagger (profiler):") + GC.enable(false) + Profile.@profile @time lu(DA, pivot; check=false) + GC.enable(true) + + @show norm(lu_A.U - UpperTriangular(collect(lu_DA.factors))) + + return +end + +demo(); + +begin + samples, lidata = Profile.retrieve() + validate_and_filter_traces!(samples, lidata) +end \ No newline at end of file diff --git a/filter-traces.jl b/filter-traces.jl new file mode 100644 index 000000000..0e6ab49c4 --- /dev/null +++ b/filter-traces.jl @@ -0,0 
+1,168 @@ +""" +Filter out traces from the Julia Profile.jl buffer. + +Each trace in the buffer has the following structure (all UInt64): +- Stack frames (variable number) +- Thread ID +- Task ID +- CPU cycle clock +- Thread state (1 = awake, 2 = sleeping) +- Null word +- Null word + +The trace ends are marked by two consecutive null words (0x0). +""" +function filter_traces!(f, buffer::Vector{UInt64}, lidata) + if length(buffer) < 6 + return 0 # Buffer too small to contain even one complete trace + end + + filtered_count = 0 + i = 1 + + while i <= length(buffer) + # Find the end of the current trace by looking for two consecutive nulls + trace_start = i + trace_end = find_trace_end(buffer, i) + + if trace_end == -1 + # No complete trace found from this position + error("Failed to find trace end for $i") + break + end + + # Extract trace metadata (last 6 elements before the two nulls) + if trace_end - trace_start < 5 + # Trace too short to have proper metadata + i = trace_end + 1 + continue + end + + # Check if the trace should be filtered + do_filter = f(buffer, lidata, trace_start, trace_end)::Bool + + # If the trace should be filtered, null out the entire trace + if do_filter + for j in trace_start:trace_end + buffer[j] = 0x0 + end + filtered_count += 1 + end + + # Move to the next trace + i = trace_end + 1 + end + + return filtered_count +end +""" +Find the end of a trace starting from start_idx. +Returns the index of the second null word, or -1 if not found. +""" +function find_trace_end(buffer::Vector{UInt64}, start_idx::Int) + i = start_idx + while i < length(buffer) + if buffer[i] == 0x0 && buffer[i + 1] == 0x0 + return i + 1 # Return index of second null + end + i += 1 + end + + return -1 # No complete trace found +end + +"Count total number of trace entries in the buffer." 
+function count_traces(buffer::Vector{UInt64}) + count = 0 + + filter_traces!(buffer, lidata) do buffer, lidata, trace_start, trace_end + count += 1 + return false + end + + return count +end + +"Parse profile buffer and null out traces from sleeping threads." +function filter_sleeping_traces!(buffer::Vector{UInt64}, lidata) + return filter_traces!(buffer, lidata) do buffer, lidata, trace_start, trace_end + # The structure before the two nulls is: + # [...stack frames...][thread_id][task_id][cpu_cycles][thread_state][null][null] + thread_state_idx = trace_end - 2 # thread_state is 4th from end (before 2 nulls + 1 other field) + thread_state = buffer[thread_state_idx] + return thread_state == 2 + end +end + +"Parse profile buffer and null out traces without calls to a slowlock path." +function filter_for_slowlock_traces!(buffer::Vector{UInt64}, lidata) + return filter_traces!(buffer, lidata) do buffer, lidata, trace_start, trace_end + # Find slowlock frames + slowlock = false + frames_end = trace_end - 6 + for j in trace_start:frames_end + slowlock && break + ptr = buffer[j] + for frame in lidata[ptr] + if occursin("slowlock", string(frame)) + slowlock = true + break + end + end + end + return !slowlock + end +end + +"Parse profile buffer and keep only traces from a specific thread." +function filter_for_thread!(buffer::Vector{UInt64}, lidata, thread) + return filter_traces!(buffer, lidata) do buffer, lidata, trace_start, trace_end + # The structure before the two nulls is: + # [...stack frames...][thread_id][task_id][cpu_cycles][thread_state][null][null] + thread_id_idx = trace_end - 5 # thread_id is 5th from end (before 2 nulls + 1 other field) + thread_id = buffer[thread_id_idx] + return thread_id+1 == thread + end +end + +""" +Filters out traces from the Julia Profile.jl buffer. 
Performs: +- Removal of sleeping thread traces +- If slocklock is true, also remove traces that do not call into a slowlock path + +Args: + buffer: Vector{UInt64} containing profile trace data + +Returns: + (filtered_count, total_traces) tuple +""" +function filter_traces_multi!(buffer::Vector{UInt64}, lidata; + slocklock::Bool=false, thread=nothing) + total_traces = count_traces(buffer) + sleeping_count = filter_sleeping_traces!(buffer, lidata) + if slocklock + slowlock_count = filter_for_slowlock_traces!(buffer, lidata) + else + slowlock_count = 0 + end + if thread !== nothing + thread_count = filter_for_thread!(buffer, lidata, thread) + else + thread_count = 0 + end + + #= Find the last double-zero in the buffer and truncate the buffer there + last_zero = 1 + idx = 1 + while idx < length(buffer) + if buffer[idx] == 0x0 && buffer[idx + 1] == 0x0 + last_zero = idx + break + end + idx += 1 + end + deleteat!(buffer, last_zero:length(buffer)) + =# + + return (;total_traces, sleeping_count, slowlock_count, thread_count) +end \ No newline at end of file From 5479ae51438c42802b421d190c8ef7136d80589e Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Tue, 19 Aug 2025 19:52:11 +0000 Subject: [PATCH 05/24] TEMP: Attempt to implement apply_ipiv_rows --- src/array/linalg.jl | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/src/array/linalg.jl b/src/array/linalg.jl index bddc781df..bfec27aea 100644 --- a/src/array/linalg.jl +++ b/src/array/linalg.jl @@ -147,10 +147,40 @@ end function LinearAlgebra.ldiv!(A::LU{<:Any,<:DMatrix}, B::AbstractVecOrMat) - LinearAlgebra._apply_ipiv_rows!(A, B) + # FIXME: Don't apply pivots for NoPivot + LinearAlgebra._apply_ipiv_rows!(A, B) #apply_ipiv_rows!(A.ipiv, B) LinearAlgebra.ldiv!(UnitLowerTriangular(A.factors), B) LinearAlgebra.ldiv!(UpperTriangular(A.factors), B) end +#= Adapted from LinearAlgebra.jl +function apply_ipiv_rows!(ipiv::DVector{Int}, B::AbstractVecOrMat) + ipivc = 
ipiv.chunks + offset = 0 + incr = ipiv.partitioning.blocksize[1] + Dagger.spawn_datadeps() do + for ic in ipivc + Dagger.@spawn swap_ipiv_rows!(InOut(B), In(ic), offset) + offset += incr + end + end +end +function swap_ipiv_rows!(B::AbstractVecOrMat, ic::AbstractVector, offset::Int) + for raw_i in 1:length(ic) + i = raw_i + offset + if i != ic[i] + _swap_rows!(B, i, ic[i]) + end + end +end +function swap_ipiv_rows!(B::AbstractVector, i::Integer, j::Integer) + B[i], B[j] = B[j], B[i] +end +function swap_ipiv_rows!(B::AbstractMatrix, i::Integer, j::Integer) + for col in 1:size(B, 2) + B[i,col], B[j,col] = B[j,col], B[i,col] + end +end=# + function LinearAlgebra.ldiv!(A::Union{LowerTriangular{<:Any,<:DMatrix},UnitLowerTriangular{<:Any,<:DMatrix},UpperTriangular{<:Any,<:DMatrix},UnitUpperTriangular{<:Any,<:DMatrix}}, B::AbstractVecOrMat) alpha = one(eltype(A)) @@ -170,7 +200,7 @@ function LinearAlgebra.ldiv!(A::Union{LowerTriangular{<:Any,<:DMatrix},UnitLower elseif isa(B, AbstractMatrix) min_bsa = min(A.data.partitioning.blocksize...) 
Dagger.maybe_copy_buffered(A.data => Blocks(min_bsa, min_bsa), dB=>Blocks(min_bsa, min_bsa)) do A, dB - Dagger.trsm!('L', uplo, trans, diag, alpha, A, dB) + Dagger.trsm!('L', uplo, trans, diag, alpha, A, dB) end end end From e398e831f8af3193944933d03e28053be36be4f6 Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Fri, 10 Jan 2025 11:33:22 -0300 Subject: [PATCH 06/24] DaggerMPI: Initial implementation Co-authored-by: Julian P Samaroo Co-authored-by: yanzin00 --- Project.toml | 2 + src/Dagger.jl | 18 +- src/affinity.jl | 32 ++ src/array/alloc.jl | 17 +- src/array/darray.jl | 124 +++--- src/chunks.jl | 102 +---- src/datadeps/aliasing.jl | 641 ++++++++++----------------- src/datadeps/chunkview.jl | 50 +-- src/datadeps/queue.jl | 614 ++++++++++++++++---------- src/datadeps/remainders.jl | 300 ++++--------- src/dtask.jl | 30 +- src/memory-spaces.jl | 417 +++++++----------- src/mpi.jl | 870 +++++++++++++++++++++++++++++++++++++ src/mutable.jl | 41 ++ src/options.jl | 3 + src/processor.jl | 12 - src/queue.jl | 2 +- src/sch/Sch.jl | 129 +++--- src/sch/eager.jl | 3 +- src/sch/util.jl | 13 +- src/scopes.jl | 11 +- src/shard.jl | 89 ++++ src/thunk.jl | 9 +- src/tochunk.jl | 106 +++++ src/types/acceleration.jl | 1 + src/types/chunk.jl | 27 ++ src/types/memory-space.jl | 1 + src/types/processor.jl | 11 + src/types/scope.jl | 1 + src/utils/chunks.jl | 189 -------- src/utils/scopes.jl | 25 +- src/weakchunk.jl | 23 + test/mpi.jl | 33 ++ 33 files changed, 2332 insertions(+), 1614 deletions(-) create mode 100644 src/affinity.jl create mode 100644 src/mpi.jl create mode 100644 src/mutable.jl create mode 100644 src/shard.jl create mode 100644 src/tochunk.jl create mode 100644 src/types/acceleration.jl create mode 100644 src/types/chunk.jl create mode 100644 src/types/memory-space.jl create mode 100644 src/types/processor.jl create mode 100644 src/types/scope.jl delete mode 100644 src/utils/chunks.jl create mode 100644 src/weakchunk.jl create mode 100644 test/mpi.jl diff --git 
a/Project.toml b/Project.toml index ce49bf6d7..b6d03531d 100644 --- a/Project.toml +++ b/Project.toml @@ -12,6 +12,7 @@ GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" NextLA = "d37ed344-79c4-486d-9307-6d11355a15a3" @@ -76,6 +77,7 @@ GraphViz = "0.2" Graphs = "1" JSON3 = "1" KernelAbstractions = "0.9" +MPI = "0.20.22" MacroTools = "0.5" MemPool = "0.4.12" Metal = "1.1" diff --git a/src/Dagger.jl b/src/Dagger.jl index 5fd1a929e..411d184e1 100644 --- a/src/Dagger.jl +++ b/src/Dagger.jl @@ -53,6 +53,13 @@ import Adapt include("lib/util.jl") include("utils/dagdebug.jl") +# Type definitions +include("types/processor.jl") +include("types/scope.jl") +include("types/memory-space.jl") +include("types/chunk.jl") +include("types/acceleration.jl") + # Distributed data include("utils/locked-object.jl") include("utils/tasks.jl") @@ -64,7 +71,6 @@ include("context.jl") include("utils/processors.jl") include("scopes.jl") include("utils/scopes.jl") -include("chunks.jl") include("utils/signature.jl") include("thunkid.jl") include("utils/lfucache.jl") @@ -76,7 +82,12 @@ include("argument.jl") include("queue.jl") include("thunk.jl") include("utils/fetch.jl") -include("utils/chunks.jl") +include("chunks.jl") +include("affinity.jl") +include("tochunk.jl") +include("mutable.jl") +include("shard.jl") +include("weakchunk.jl") include("utils/logging.jl") include("submission.jl") abstract type MemorySpace end @@ -156,6 +167,9 @@ function set_distributed_package!(value) @info "Dagger.jl preference has been set, restart your Julia session for this change to take effect!" 
end +# MPI +include("mpi.jl") + # Precompilation import PrecompileTools: @compile_workload include("precompile.jl") diff --git a/src/affinity.jl b/src/affinity.jl new file mode 100644 index 000000000..aab663a51 --- /dev/null +++ b/src/affinity.jl @@ -0,0 +1,32 @@ +export domain, UnitDomain, project, alignfirst, ArrayDomain + +import Base: isempty, getindex, intersect, ==, size, length, ndims + +""" + domain(x::T) + +Returns metadata about `x`. This metadata will be in the `domain` +field of a Chunk object when an object of type `T` is created as +the result of evaluating a Thunk. +""" +function domain end + +""" + UnitDomain + +Default domain -- has no information about the value +""" +struct UnitDomain end + +""" +If no `domain` method is defined on an object, then +we use the `UnitDomain` on it. A `UnitDomain` is indivisible. +""" +domain(x::Any) = UnitDomain() + +### ChunkIO +affinity(r::DRef) = OSProc(r.owner)=>r.size +# this previously returned a vector with all machines that had the file cached +# but now only returns the owner and size, for consistency with affinity(::DRef), +# see #295 +affinity(r::FileRef) = OSProc(1)=>r.size diff --git a/src/array/alloc.jl b/src/array/alloc.jl index fe92ae1e1..e67ca593c 100644 --- a/src/array/alloc.jl +++ b/src/array/alloc.jl @@ -184,24 +184,13 @@ function Base.zero(x::DArray{T,N}) where {T,N} return _to_darray(a) end -# Weird LinearAlgebra dispatch in `\` needs this -function LinearAlgebra._zeros(::Type{T}, B::DVector, n::Integer) where T - m = max(size(B, 1), n) - sz = (m,) - return zeros(auto_blocks(sz), T, sz) -end -function LinearAlgebra._zeros(::Type{T}, B::DMatrix, n::Integer) where T - m = max(size(B, 1), n) - sz = (m, size(B, 2)) - return zeros(auto_blocks(sz), T, sz) -end - -function Base.view(A::AbstractArray{T,N}, p::Blocks{N}) where {T,N} +@warn "Consider a better way to provide a unique ID for each chunk" maxlog=1 +function Base.view(A::AbstractArray{T,N}, p::Blocks{N}; 
space=default_memory_space(current_acceleration(), A)) where {T,N} d = ArrayDomain(Base.index_shape(A)) dc = partition(p, d) # N.B. We use `tochunk` because we only want to take the view locally, and # taking views should be very fast - chunks = [tochunk(view(A, x.indexes...)) for x in dc] + chunks = [@with(MPI_UID => eager_next_id(), tochunk(view(A, x.indexes...), space)) for x in dc] return DArray(T, d, dc, chunks, p) end Base.view(A::AbstractArray, ::AutoBlocks) = diff --git a/src/array/darray.jl b/src/array/darray.jl index 20722b8ed..e04bcf065 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -176,28 +176,46 @@ domainchunks(d::DArray) = d.subdomains size(x::DArray) = size(domain(x)) stage(ctx, c::DArray) = c -function Base.collect(d::DArray{T,N}; tree=false, copyto=false) where {T,N} - a = fetch(d) - if isempty(d.chunks) - return Array{eltype(d)}(undef, size(d)...) +@warn "Dispatch uniform on acceleration" maxlog=1 +@warn "Take D.concat into account" maxlog=1 +function Base.collect(D::DArray{T,N}; tree=false, copyto=false, uniform::Bool=true) where {T,N} + if isempty(D.chunks) + return Array{eltype(D)}(undef, size(D)...) end - if ndims(d) == 0 - return fetch(a.chunks[1]) + # Return a scalar, as required by Julia's array interface + if ndims(D) == 0 + return fetch(D.chunks[1]; unwrap=true) end - if copyto - C = Array{T,N}(undef, size(a)) - DC = view(C, Blocks(size(a)...)) - copyto!(DC, a) - return C - end + if uniform + @assert D.concat === cat "FIXME: Handle non-cat" + A = Array{eltype(D)}(undef, size(D)...) + DA = view(A, D.partitioning; space=CPURAMMemorySpace()) + + # Perform the equivalent of `copyto!(DA, D)`, but force local updates + # FIXME: Be more parallel? + for idx in eachindex(DA.chunks) + dest = fetch(DA.chunks[idx]; move_value=false, unwrap=true, uniform=true)::AbstractArray + src = fetch(D.chunks[idx]; move_value=true, unwrap=true, uniform=true)::AbstractArray + copyto!(dest, src) + end - dimcatfuncs = [(x...) 
-> d.concat(x..., dims=i) for i in 1:ndims(d)] - if tree - collect(fetch(treereduce_nd(map(x -> ((args...,) -> Dagger.@spawn x(args...)) , dimcatfuncs), a.chunks))) + return A else - collect(treereduce_nd(dimcatfuncs, asyncmap(fetch, a.chunks))) + if copyto + C = Array{T,N}(undef, size(D)) + DC = view(C, Blocks(size(D)...)) + copyto!(DC, D) + return C + end + + dimcatfuncs = [(x...) -> D.concat(x..., dims=i) for i in 1:ndims(D)] + if tree + collect(fetch(treereduce_nd(map(x -> ((args...,) -> Dagger.@spawn x(args...)) , dimcatfuncs), D.chunks))) + else + treereduce_nd(dimcatfuncs, asyncmap(fetch, D.chunks)) + end end end Array{T,N}(A::DArray{S,N}) where {T,N,S} = convert(Array{T,N}, collect(A)) @@ -321,8 +339,8 @@ function Base.isequal(x::ArrayOp, y::ArrayOp) x === y end -Base.similar(D::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S,N} = - DArray{S,N}(undef, D.partitioning, dims) +Base.similar(::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S,N} = + DArray{S,N}(undef, dims) Base.copy(x::DArray{T,N,B,F}) where {T,N,B,F} = map(identity, x)::DArray{T,N,B,F} @@ -388,23 +406,18 @@ function lookup_parts(A::DArray, ps::AbstractArray, subdmns::DomainBlocks{N}, d: end """ - Base.fetch(c::DArray) + Base.fetch(A::DArray; unwrap::Bool=false, kwargs...) -> DArray -If a `DArray` tree has a `Thunk` in it, make the whole thing a big thunk. +Returns a new `DArray` with the same data as `A`, but where all values are +fully computed. """ -function Base.fetch(c::DArray{T}) where T - if any(istask, chunks(c)) - thunks = chunks(c) - sz = size(thunks) - dmn = domain(c) - dmnchunks = domainchunks(c) - return fetch(Dagger.spawn(Options(meta=true), thunks...) do results... - t = eltype(fetch(results[1])) - DArray(t, dmn, dmnchunks, reshape(Any[results...], sz), - c.partitioning, c.concat) - end) +function Base.fetch(A::DArray{T}; unwrap::Bool=false, kwargs...) where T + if any(unwrappable, chunks(A)) + tasks = map(t->unwrappable(t) ? fetch(t; unwrap, kwargs...) 
: t, chunks(A)) + B = DArray(T, A.domain, A.subdomains, tasks, A.partitioning, A.concat) + return B else - return c + return A end end @@ -505,7 +518,6 @@ auto_blocks(A::AbstractArray{T,N}) where {T,N} = auto_blocks(size(A)) const AssignmentType{N} = Union{Symbol, AbstractArray{<:Int, N}, AbstractArray{<:Processor, N}} -distribute(A::AbstractArray, assignment::AssignmentType = :arbitrary) = distribute(A, AutoBlocks(), assignment) function distribute(A::AbstractArray{T,N}, dist::Blocks{N}, assignment::AssignmentType{N} = :arbitrary) where {T,N} procgrid = nothing availprocs = collect(Dagger.compatible_processors()) @@ -546,8 +558,10 @@ function distribute(A::AbstractArray{T,N}, dist::Blocks{N}, assignment::Assignme procgrid = assignment end - return _to_darray(Distribute(dist, A, procgrid)) + return _distribute(current_acceleration(), A, dist, procgrid) end +_distribute(::DistributedAcceleration, A::AbstractArray{T,N}, dist::Blocks{N}, procgrid) where {T,N} = + _to_darray(Distribute(dist, A, procgrid)) distribute(A::AbstractArray, ::AutoBlocks, assignment::AssignmentType = :arbitrary) = distribute(A, auto_blocks(A), assignment) function distribute(x::AbstractArray{T,N}, n::NTuple{N}, assignment::AssignmentType{N} = :arbitrary) where {T,N} @@ -556,7 +570,6 @@ function distribute(x::AbstractArray{T,N}, n::NTuple{N}, assignment::AssignmentT end distribute(x::AbstractVector, n::Int, assignment::AssignmentType{1} = :arbitrary) = distribute(x, (n,), assignment) - DVector(A::AbstractVector{T}, part::Blocks{1}, assignment::AssignmentType{1} = :arbitrary) where T = distribute(A, part, assignment) DMatrix(A::AbstractMatrix{T}, part::Blocks{2}, assignment::AssignmentType{2} = :arbitrary) where T = distribute(A, part, assignment) DArray(A::AbstractArray{T,N}, part::Blocks{N}, assignment::AssignmentType{N} = :arbitrary) where {T,N} = distribute(A, part, assignment) @@ -569,29 +582,26 @@ DVector(A::AbstractVector{T}, ::AutoBlocks, assignment::AssignmentType{1} = :arb 
DMatrix(A::AbstractMatrix{T}, ::AutoBlocks, assignment::AssignmentType{2} = :arbitrary) where T = DMatrix(A, auto_blocks(A), assignment) DArray(A::AbstractArray, ::AutoBlocks, assignment::AssignmentType = :arbitrary) = DArray(A, auto_blocks(A), assignment) -struct AllocateUndef{S} end -(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = Array{S,N}(undef, dims) -function DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} - domain = ArrayDomain(map(x->1:x, dims)) +@warn "Add assignment to undef initializer" maxlog=1 +function DArray{T,N}(::UndefInitializer, dims::NTuple{N,Int}) where {T,N} + dist = auto_blocks(dims) + return DArray{T,N}(undef, dist, dims...) +end +function DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}) where {T,N} + domain = ArrayDomain(ntuple(i->1:dims[i], N)) subdomains = partition(dist, domain) - a = AllocateArray(T, AllocateUndef{T}(), false, domain, subdomains, dist, assignment) - return _to_darray(a) -end -DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = - DArray{T,N}(undef, dist, (dims...,); assignment) -DArray{T,N}(::UndefInitializer, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = - DArray{T,N}(undef, auto_blocks(dims), dims; assignment) -DArray{T,N}(::UndefInitializer, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = - DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,); assignment) - -DArray{T}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = - DArray{T,N}(undef, dist, dims; assignment) -DArray{T}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = - DArray{T,N}(undef, dist, (dims...,); assignment) -DArray{T}(::UndefInitializer, dims::NTuple{N,Int}; 
assignment::AssignmentType{N} = :arbitrary) where {T,N} = - DArray{T,N}(undef, auto_blocks(dims), dims; assignment) -DArray{T}(::UndefInitializer, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = - DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,); assignment) + tasks = Array{DTask,N}(undef, size(subdomains)...) + Dagger.spawn_datadeps() do + for (i, x) in enumerate(subdomains) + tasks[i] = Dagger.@spawn allocate_array_undef(T, size(x)) + end + end + return DArray(T, domain, subdomains, tasks, dist) +end +DArray{T,N}(::UndefInitializer, dims::Vararg{Int,N}) where {T,N} = + DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,)) +DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}) where {T,N} = + DArray{T,N}(undef, dist, (dims...,)) function Base.:(==)(x::ArrayOp{T,N}, y::AbstractArray{S,N}) where {T,S,N} collect(x) == y diff --git a/src/chunks.jl b/src/chunks.jl index 03bdfb65d..0defc1ff6 100644 --- a/src/chunks.jl +++ b/src/chunks.jl @@ -1,56 +1,4 @@ -export domain, UnitDomain, project, alignfirst, ArrayDomain - -import Base: isempty, getindex, intersect, ==, size, length, ndims - -""" - domain(x::T) - -Returns metadata about `x`. This metadata will be in the `domain` -field of a Chunk object when an object of type `T` is created as -the result of evaluating a Thunk. -""" -function domain end - -""" - UnitDomain - -Default domain -- has no information about the value -""" -struct UnitDomain end - -""" -If no `domain` method is defined on an object, then -we use the `UnitDomain` on it. A `UnitDomain` is indivisible. -""" -domain(x::Any) = UnitDomain() - -###### Chunk ###### - -""" - Chunk - -A reference to a piece of data located on a remote worker. `Chunk`s are -typically created with `Dagger.tochunk(data)`, and the data can then be -accessed from any worker with `collect(::Chunk)`. 
`Chunk`s are -serialization-safe, and use distributed refcounting (provided by -`MemPool.DRef`) to ensure that the data referenced by a `Chunk` won't be GC'd, -as long as a reference exists on some worker. - -Each `Chunk` is associated with a given `Dagger.Processor`, which is (in a -sense) the processor that "owns" or contains the data. Calling -`collect(::Chunk)` will perform data movement and conversions defined by that -processor to safely serialize the data to the calling worker. - -## Constructors -See [`tochunk`](@ref). -""" -mutable struct Chunk{T, H, P<:Processor, S<:AbstractScope} - chunktype::Type{T} - domain - handle::H - processor::P - scope::S -end +###### Chunk Methods ###### domain(c::Chunk) = c.domain chunktype(c::Chunk) = c.chunktype @@ -72,20 +20,27 @@ function collect(ctx::Context, chunk::Chunk; options=nothing) elseif chunk.handle isa FileRef return poolget(chunk.handle) else - return move(chunk.processor, OSProc(), chunk.handle) + return move(chunk.processor, default_processor(), chunk.handle) end end collect(ctx::Context, ref::DRef; options=nothing) = move(OSProc(ref.owner), OSProc(), ref) collect(ctx::Context, ref::FileRef; options=nothing) = poolget(ref) # FIXME: Do move call -function Base.fetch(chunk::Chunk; raw=false) - if raw - poolget(chunk.handle) - else - collect(chunk) +@warn "Fix semantics of collect" maxlog=1 +function Base.fetch(chunk::Chunk{T}; unwrap::Bool=false, uniform::Bool=false, kwargs...) where T + value = fetch_handle(chunk.handle; uniform)::T + if unwrap && unwrappable(value) + return fetch(value; unwrap, uniform, kwargs...) 
end + return value end +fetch_handle(ref::DRef; uniform::Bool=false) = poolget(ref) +fetch_handle(ref::FileRef; uniform::Bool=false) = poolget(ref) +unwrappable(x::Chunk) = true +unwrappable(x::DRef) = true +unwrappable(x::FileRef) = true +unwrappable(x) = false # Unwrap Chunk, DRef, and FileRef by default move(from_proc::Processor, to_proc::Processor, x::Chunk) = @@ -100,32 +55,3 @@ move(to_proc::Processor, d::DRef) = move(OSProc(d.owner), to_proc, d) move(to_proc::Processor, x) = move(OSProc(), to_proc, x) - -### ChunkIO -affinity(r::DRef) = OSProc(r.owner)=>r.size -# this previously returned a vector with all machines that had the file cached -# but now only returns the owner and size, for consistency with affinity(::DRef), -# see #295 -affinity(r::FileRef) = OSProc(1)=>r.size - -struct WeakChunk - wid::Int - id::Int - x::WeakRef - function WeakChunk(c::Chunk) - return new(c.handle.owner, c.handle.id, WeakRef(c)) - end -end -unwrap_weak(c::WeakChunk) = c.x.value -function unwrap_weak_checked(c::WeakChunk) - cw = unwrap_weak(c) - @assert cw !== nothing "WeakChunk expired: ($(c.wid), $(c.id))" - return cw -end -wrap_weak(c::Chunk) = WeakChunk(c) -isweak(c::WeakChunk) = true -isweak(c::Chunk) = false -is_task_or_chunk(c::WeakChunk) = true -Serialization.serialize(io::AbstractSerializer, wc::WeakChunk) = - error("Cannot serialize a WeakChunk") -chunktype(c::WeakChunk) = chunktype(unwrap_weak_checked(c)) diff --git a/src/datadeps/aliasing.jl b/src/datadeps/aliasing.jl index 2f26fa3b0..57ebee404 100644 --- a/src/datadeps/aliasing.jl +++ b/src/datadeps/aliasing.jl @@ -8,7 +8,7 @@ export In, Out, InOut, Deps, spawn_datadeps ============================================================================== This file implements the data dependencies system for Dagger tasks, which allows -tasks to access their arguments in a controlled manner. The system maintains +tasks to write to their arguments in a controlled manner. 
The system maintains data coherency across distributed workers by tracking aliasing relationships and orchestrating data movement operations. @@ -25,59 +25,26 @@ KEY CONCEPTS: 1. ALIASING ANALYSIS: - Every mutable argument is analyzed for its memory access pattern - Memory spans are computed to determine which bytes in memory are accessed - - Arguments that access overlapping memory spans are considered "aliasing" + - Objects that access overlapping memory spans are considered "aliasing" - Examples: An array A and view(A, 2:3, 2:3) alias each other 2. DATA LOCALITY TRACKING: - The system tracks where the "source of truth" for each piece of data lives - As tasks execute and modify data, the source of truth may move between workers - - Each argument can have its own independent source of truth location + - Each aliasing region can have its own independent source of truth location 3. ALIASED OBJECT MANAGEMENT: - When copying arguments between workers, the system tracks "aliased objects" - This ensures that if both an array and its view need to be copied to a worker, only one copy of the underlying array is made, with the view pointing to it - - The aliased_object!() and move_rewrap() functions manage this sharing - -ALIASING INFO: --------------- - -The system uses different types of aliasing info to represent different types of -aliasing relationships: - -- ContiguousAliasing: Single contiguous memory region (e.g., full array) -- StridedAliasing: Multiple non-contiguous regions (e.g., SubArray) -- DiagonalAliasing: Diagonal elements only (e.g., Diagonal(A)) -- TriangularAliasing: Triangular regions (e.g., UpperTriangular(A)) - -Any two aliasing objects can be compared using the will_alias function to -determine if they overlap. Additionally, any aliasing object can be converted to -a vector of memory spans, which represents the contiguous regions of memory that -the aliasing object covers. 
- -DATA MOVEMENT FUNCTIONS: ------------------------- - -move!(dep_mod, to_space, from_space, to, from): -- The core in-place data movement function -- dep_mod specifies which part of the data to copy (identity, UpperTriangular, etc.) -- Supports partial copies via RemainderAliasing dependency modifiers - -move_rewrap(...): -- Handles copying of wrapped objects (SubArrays, ChunkViews) -- Ensures aliased objects are reused on destination worker - -read/write_remainder!(...): -- Read/write a span of memory from an object to/from a buffer -- Used by move! to copy the remainder of an aliased object + - The aliased_object!() functions manage this sharing THE DISTRIBUTED ALIASING PROBLEM: --------------------------------- In a multithreaded environment, aliasing "just works" because all tasks operate -on the user-provided memory. However, in a distributed environment, arguments -must be copied between workers, which breaks aliasing relationships if care is -not taken. +on the same memory. However, in a distributed environment, arguments must be +copied between workers, which breaks aliasing relationships. Consider this scenario: ```julia @@ -96,9 +63,11 @@ MULTITHREADED BEHAVIOR (WORKS): - Task dependencies ensure correct ordering (e.g., Task 1 then Task 2) DISTRIBUTED BEHAVIOR (THE PROBLEM): +- Tasks may be scheduled on different workers - Each argument must be copied to the destination worker -- Without special handling, we would copy A and vA independently to another worker -- This creates two separate arrays, breaking the aliasing relationship between A and vA +- Without special handling, we would copy A to worker1 and vA to worker2 +- This creates two separate arrays, breaking the aliasing relationship +- Updates to the view on worker2 don't affect the array on worker1 THE SOLUTION - PARTIAL DATA MOVEMENT: ------------------------------------- @@ -112,13 +81,12 @@ The datadeps system solves this by: 2. 
PARTIAL DATA TRANSFER: - Instead of copying entire objects, only transfer the "dirty" regions - - This prevents overwrites of data that has already been updated by another task - - This also minimizes network traffic and overall copy time - - Uses the move!(dep_mod, ...) function with RemainderAliasing dependency modifiers + - This minimizes network traffic and maximizes parallelism + - Uses the move!(dep_mod, ...) function with dependency modifiers 3. REMAINDER TRACKING: - - When a task needs the full object, copy partial regions as needed - When a partial region is updated, track what parts still need updating + - Before a task needs the full object, copy the remaining "clean" regions - This preserves all updates while avoiding overwrites EXAMPLE EXECUTION FLOW: @@ -140,24 +108,69 @@ Tasks: T1 modifies InOut(A), T2 modifies InOut(vA) - T2 needs vA, but vA aliases with A (which was modified by T1) - Copy vA-region of A from worker1 to worker2 - This is a PARTIAL copy - only the 2:3, 2:3 region - - Create vA on worker2 pointing to the appropriate region of A + - Create vA on worker2 pointing to the appropriate region - T2 executes, modifying vA region on worker2 - Update: vA's data_locality = worker2 4. 
FINAL SYNCHRONIZATION: - - Need to copy-back A and vA to worker0 - - A needs to be assembled from: worker1 (non-vA regions of A) + worker2 (vA region of A) - - REMAINDER COPY: Copy non-vA regions from worker1 to worker0 - - REMAINDER COPY: Copy vA region from worker2 to worker0 + - Some future task needs the complete A + - A needs to be assembled from: worker1 (non-vA regions) + worker2 (vA region) + - REMAINDER COPY: Copy non-vA regions from worker1 to worker2 + - OR INVERSE: Copy vA-region from worker2 to worker1, then copy full A -REMAINDER COMPUTATION: ----------------------- +MEMORY SPAN COMPUTATION: +------------------------ + +The system uses memory spans to determine aliasing and compute remainders: + +- ContiguousAliasing: Single contiguous memory region (e.g., full array) +- StridedAliasing: Multiple non-contiguous regions (e.g., SubArray) +- DiagonalAliasing: Diagonal elements only (e.g., Diagonal(A)) +- TriangularAliasing: Triangular regions (e.g., UpperTriangular(A)) Remainder computation involves: 1. Computing memory spans for all overlapping aliasing objects 2. Finding the set difference: full_object_spans - updated_spans -3. Creating a RemainderAliasing object representing the difference between spans -4. Performing one or more move! calls with this RemainderAliasing object to copy only needed data +3. Creating a "remainder aliasing" object representing the not-yet-updated regions +4. Performing move! with this remainder object to copy only needed data + +DATA MOVEMENT FUNCTIONS: +------------------------ + +move!(dep_mod, to_space, from_space, to, from): +- The core in-place data movement function +- dep_mod specifies which part of the data to copy (identity, UpperTriangular, etc.) 
+- Supports partial copies via dependency modifiers + +move_rewrap(): +- Handles copying of wrapped objects (SubArrays, ChunkViews) +- Ensures aliased objects are reused on destination worker + +enqueue_copy_to!(): +- Schedules data movement tasks before user tasks +- Ensures data is up-to-date on the worker where a task will run + +CURRENT LIMITATIONS AND TODOS: +------------------------------- + +1. REMAINDER COMPUTATION: + - The system currently handles simple overlaps but needs sophisticated + remainder calculation for complex aliasing patterns + - Need functions to compute span set differences + +2. ORDERING DEPENDENCIES: + - Need to ensure remainder copies happen in correct order + - Must not overwrite more recent updates with stale data + +3. COMPLEX ALIASING PATTERNS: + - Multiple overlapping views of the same array + - Nested aliasing structures (views of views) + - Mixed aliasing types (diagonal + triangular regions) + +4. PERFORMANCE OPTIMIZATION: + - Minimize number of copy operations + - Batch compatible transfers + - Optimize for common access patterns =# "Specifies a read-only dependency." @@ -179,11 +192,6 @@ struct Deps{T,DT<:Tuple} end Deps(x, deps...) = Deps(x, deps) -chunktype(::In{T}) where T = T -chunktype(::Out{T}) where T = T -chunktype(::InOut{T}) where T = T -chunktype(::Deps{T,DT}) where {T,DT} = T - function unwrap_inout(arg) readdep = false writedep = false @@ -214,7 +222,6 @@ function unwrap_inout(arg) end _identity_hash(arg, h::UInt=UInt(0)) = ismutable(arg) ? 
objectid(arg) : hash(arg, h) -_identity_hash(arg::Chunk, h::UInt=UInt(0)) = hash(arg.handle, hash(Chunk, h)) _identity_hash(arg::SubArray, h::UInt=UInt(0)) = hash(arg.indices, hash(arg.offset1, hash(arg.stride1, _identity_hash(arg.parent, h)))) _identity_hash(arg::CartesianIndices, h::UInt=UInt(0)) = hash(arg.indices, hash(typeof(arg), h)) @@ -232,135 +239,9 @@ end Base.hash(aw::ArgumentWrapper) = hash(ArgumentWrapper, aw.hash) Base.:(==)(aw1::ArgumentWrapper, aw2::ArgumentWrapper) = aw1.hash == aw2.hash -Base.isequal(aw1::ArgumentWrapper, aw2::ArgumentWrapper) = - aw1.hash == aw2.hash - -struct HistoryEntry - ainfo::AliasingWrapper - space::MemorySpace - write_num::Int -end - -struct AliasedObjectCacheStore - keys::Vector{AbstractAliasing} - derived::Dict{AbstractAliasing,AbstractAliasing} - stored::Dict{MemorySpace,Set{AbstractAliasing}} - values::Dict{MemorySpace,Dict{AbstractAliasing,Chunk}} -end -AliasedObjectCacheStore() = - AliasedObjectCacheStore(Vector{AbstractAliasing}(), - Dict{AbstractAliasing,AbstractAliasing}(), - Dict{MemorySpace,Set{AbstractAliasing}}(), - Dict{MemorySpace,Dict{AbstractAliasing,Chunk}}()) - -function is_stored(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing) - if !haskey(cache.stored, space) - return false - end - if !haskey(cache.derived, ainfo) - return false - end - key = cache.derived[ainfo] - return key in cache.stored[space] -end -function is_key_stored(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing) - if !haskey(cache.stored, space) - return false - end - return ainfo in cache.stored[space] -end -function get_stored(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing) - @assert is_stored(cache, space, ainfo) "Cache does not have derived ainfo $ainfo" - key = cache.derived[ainfo] - return cache.values[space][key] -end -function set_stored!(cache::AliasedObjectCacheStore, dest_space::MemorySpace, value::Chunk, ainfo::AbstractAliasing) - @assert 
!is_stored(cache, dest_space, ainfo) "Cache already has derived ainfo $ainfo" - key = cache.derived[ainfo] - value_ainfo = aliasing(value, identity) - cache.derived[value_ainfo] = key - push!(get!(Set{AbstractAliasing}, cache.stored, dest_space), key) - values_dict = get!(Dict{AbstractAliasing,Chunk}, cache.values, dest_space) - values_dict[key] = value - return -end -function set_key_stored!(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing, value::Chunk) - push!(cache.keys, ainfo) - cache.derived[ainfo] = ainfo - push!(get!(Set{AbstractAliasing}, cache.stored, space), ainfo) - values_dict = get!(Dict{AbstractAliasing,Chunk}, cache.values, space) - values_dict[ainfo] = value - return -end - -struct AliasedObjectCache - space::MemorySpace - chunk::Chunk -end -function is_stored(cache::AliasedObjectCache, ainfo::AbstractAliasing) - wid = root_worker_id(cache.chunk) - if wid != myid() - return remotecall_fetch(is_stored, wid, cache, ainfo) - end - cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore - return is_stored(cache_raw, cache.space, ainfo) -end -function is_key_stored(cache::AliasedObjectCache, space::MemorySpace, ainfo::AbstractAliasing) - wid = root_worker_id(cache.chunk) - if wid != myid() - return remotecall_fetch(is_key_stored, wid, cache, space, ainfo) - end - cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore - return is_key_stored(cache_raw, space, ainfo) -end -function get_stored(cache::AliasedObjectCache, ainfo::AbstractAliasing) - wid = root_worker_id(cache.chunk) - if wid != myid() - return remotecall_fetch(get_stored, wid, cache, ainfo) - end - cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore - return get_stored(cache_raw, cache.space, ainfo) -end -function set_stored!(cache::AliasedObjectCache, value::Chunk, ainfo::AbstractAliasing) - wid = root_worker_id(cache.chunk) - if wid != myid() - return remotecall_fetch(set_stored!, wid, cache, value, ainfo) - end - cache_raw = 
unwrap(cache.chunk)::AliasedObjectCacheStore - set_stored!(cache_raw, cache.space, value, ainfo) - return -end -function set_key_stored!(cache::AliasedObjectCache, space::MemorySpace, ainfo::AbstractAliasing, value::Chunk) - wid = root_worker_id(cache.chunk) - if wid != myid() - return remotecall_fetch(set_key_stored!, wid, cache, space, ainfo, value) - end - cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore - set_key_stored!(cache_raw, space, ainfo, value) -end -function aliased_object!(f, cache::AliasedObjectCache, x; ainfo=aliasing(x, identity)) - x_space = memory_space(x) - if !is_key_stored(cache, x_space, ainfo) - set_key_stored!(cache, x_space, ainfo, x isa Chunk ? x : tochunk(x)) - end - if is_stored(cache, ainfo) - return get_stored(cache, ainfo) - else - y = f(x) - @assert y isa Chunk "Didn't get a Chunk from functor" - @assert memory_space(y) == cache.space "Space mismatch! $(memory_space(y)) != $(cache.space)" - if memory_space(x) != cache.space - @assert ainfo != aliasing(y, identity) "Aliasing mismatch! $ainfo == $(aliasing(y, identity))" - end - set_stored!(cache, y, ainfo) - return y - end -end +@warn "Switch ArgumentWrapper to contain just the argument, and add DependencyWrapper" maxlog=1 struct DataDepsState - # The mapping of original raw argument to its Chunk - raw_arg_to_chunk::IdDict{Any,Chunk} - # The origin memory space of each argument # Used to track the original location of an argument, for final copy-from arg_origin::IdDict{Any,MemorySpace} @@ -372,18 +253,15 @@ struct DataDepsState # The mapping of remote argument to original argument remote_arg_to_original::IdDict{Any,Any} - # The mapping of original argument wrapper to remote argument wrapper - remote_arg_w::Dict{ArgumentWrapper,Dict{MemorySpace,ArgumentWrapper}} - # The mapping of ainfo to argument and dep_mod # Used to lookup which argument and dep_mod a given ainfo is generated from # N.B. 
This is a mapping for remote argument copies - ainfo_arg::Dict{AliasingWrapper,Set{ArgumentWrapper}} + ainfo_arg::Dict{AliasingWrapper,ArgumentWrapper} # The history of writes (direct or indirect) to each argument and dep_mod, in terms of ainfos directly written to, and the memory space they were written to # Updated when a new write happens on an overlapping ainfo # Used by remainder copies to track which portions of an argument and dep_mod were written to elsewhere, through another argument - arg_history::Dict{ArgumentWrapper,Vector{HistoryEntry}} + arg_history::Dict{ArgumentWrapper,Vector{Tuple{AliasingWrapper,MemorySpace,Int}}} # The mapping of memory space and argument to the memory space of the last direct write # Used by remainder copies to lookup the "backstop" if any portion of the target ainfo is not updated by the remainder @@ -396,7 +274,7 @@ struct DataDepsState # The mapping of, for a given memory space, the backing Chunks that an ainfo references # Used by slot generation to replace the backing Chunks during move - ainfo_backing_chunk::Chunk{AliasedObjectCacheStore} + ainfo_backing_chunk::Dict{MemorySpace,Dict{AbstractAliasing,Chunk}} # Cache of argument's supports_inplace_move query result supports_inplace_cache::IdDict{Any,Bool} @@ -405,10 +283,6 @@ struct DataDepsState # N.B. 
This is a mapping for remote argument copies ainfo_cache::Dict{ArgumentWrapper,AliasingWrapper} - # The oracle for aliasing lookups - # Used to populate ainfos_overlaps efficiently - ainfos_lookup::AliasingLookup - # The overlapping ainfos for each ainfo # Incrementally updated as new ainfos are created # Used for fast will_alias lookups @@ -420,32 +294,59 @@ struct DataDepsState ainfos_owner::Dict{AliasingWrapper,Union{Pair{DTask,Int},Nothing}} ainfos_readers::Dict{AliasingWrapper,Vector{Pair{DTask,Int}}} - function DataDepsState() - arg_to_chunk = IdDict{Any,Chunk}() + function DataDepsState(aliasing::Bool) + if !aliasing + @warn "aliasing=false is no longer supported, aliasing is now always enabled" maxlog=1 + end + arg_origin = IdDict{Any,MemorySpace}() remote_args = Dict{MemorySpace,IdDict{Any,Any}}() remote_arg_to_original = IdDict{Any,Any}() - remote_arg_w = Dict{ArgumentWrapper,Dict{MemorySpace,ArgumentWrapper}}() - ainfo_arg = Dict{AliasingWrapper,Set{ArgumentWrapper}}() - arg_history = Dict{ArgumentWrapper,Vector{HistoryEntry}}() + ainfo_arg = Dict{AliasingWrapper,ArgumentWrapper}() arg_owner = Dict{ArgumentWrapper,MemorySpace}() arg_overlaps = Dict{ArgumentWrapper,Set{ArgumentWrapper}}() - ainfo_backing_chunk = tochunk(AliasedObjectCacheStore()) + ainfo_backing_chunk = Dict{MemorySpace,Dict{AbstractAliasing,Chunk}}() + arg_history = Dict{ArgumentWrapper,Vector{Tuple{AliasingWrapper,MemorySpace,Int}}}() supports_inplace_cache = IdDict{Any,Bool}() ainfo_cache = Dict{ArgumentWrapper,AliasingWrapper}() - ainfos_lookup = AliasingLookup() ainfos_overlaps = Dict{AliasingWrapper,Set{AliasingWrapper}}() ainfos_owner = Dict{AliasingWrapper,Union{Pair{DTask,Int},Nothing}}() ainfos_readers = Dict{AliasingWrapper,Vector{Pair{DTask,Int}}}() - return new(arg_to_chunk, arg_origin, remote_args, remote_arg_to_original, remote_arg_w, ainfo_arg, arg_history, arg_owner, arg_overlaps, ainfo_backing_chunk, - supports_inplace_cache, ainfo_cache, ainfos_lookup, ainfos_overlaps, 
ainfos_owner, ainfos_readers) + return new(arg_origin, remote_args, remote_arg_to_original, ainfo_arg, arg_owner, arg_overlaps, ainfo_backing_chunk, arg_history, + supports_inplace_cache, ainfo_cache, ainfos_overlaps, ainfos_owner, ainfos_readers) end end +# N.B. arg_w must be the original argument wrapper, not a remote copy +function aliasing!(state::DataDepsState, target_space::MemorySpace, arg_w::ArgumentWrapper) + # Grab the remote copy of the argument, and calculate the ainfo + remote_arg = get_or_generate_slot!(state, target_space, arg_w.arg) + remote_arg_w = ArgumentWrapper(remote_arg, arg_w.dep_mod) + + # Check if we already have the result cached + if haskey(state.ainfo_cache, remote_arg_w) + return state.ainfo_cache[remote_arg_w] + end + + # Calculate the ainfo + ainfo = AliasingWrapper(aliasing(current_acceleration(), remote_arg, arg_w.dep_mod)) + + # Cache the result + state.ainfo_cache[remote_arg_w] = ainfo + + # Update the mapping of ainfo to argument and dep_mod + state.ainfo_arg[ainfo] = remote_arg_w + + # Populate info for the new ainfo + populate_ainfo!(state, arg_w, ainfo, target_space) + + return ainfo +end + function supports_inplace_move(state::DataDepsState, arg) return get!(state.supports_inplace_cache, arg) do return supports_inplace_move(arg) @@ -459,69 +360,29 @@ function is_writedep(arg, deps, task::DTask) end # Aliasing state setup -function populate_task_info!(state::DataDepsState, task_args, spec::DTaskSpec, task::DTask) +function populate_task_info!(state::DataDepsState, spec::DTaskSpec, task::DTask) # Track the task's arguments and access patterns - return map_or_ntuple(task_args) do idx - _arg = task_args[idx] - - # Unwrap the argument - _arg_with_deps = value(_arg) - pos = _arg.pos + for (idx, _arg) in enumerate(spec.fargs) + arg = value(_arg) # Unwrap In/InOut/Out wrappers and record dependencies - arg_pre_unwrap, deps = unwrap_inout(_arg_with_deps) + arg, deps = unwrap_inout(arg) # Unwrap the Chunk underlying any DTask arguments 
- arg = arg_pre_unwrap isa DTask ? fetch(arg_pre_unwrap; raw=true) : arg_pre_unwrap - - # Skip non-aliasing arguments or arguments that don't support in-place move - may_alias = type_may_alias(typeof(arg)) - inplace_move = may_alias && supports_inplace_move(state, arg) - if !may_alias || !inplace_move - arg_w = ArgumentWrapper(arg, identity) - if is_typed(spec) - return TypedDataDepsTaskArgument(arg, pos, may_alias, inplace_move, (DataDepsTaskDependency(arg_w, false, false),)) - else - return DataDepsTaskArgument(arg, pos, may_alias, inplace_move, [DataDepsTaskDependency(arg_w, false, false)]) - end - end + arg = arg isa DTask ? fetch(arg; move_value=false, unwrap=false) : arg - # Generate a Chunk for the argument if necessary - if haskey(state.raw_arg_to_chunk, arg) - arg_chunk = state.raw_arg_to_chunk[arg] - else - if !(arg isa Chunk) - arg_chunk = tochunk(arg) - state.raw_arg_to_chunk[arg] = arg_chunk - else - state.raw_arg_to_chunk[arg] = arg - arg_chunk = arg - end - end + # Skip non-aliasing arguments + type_may_alias(typeof(arg)) || continue # Track the origin space of the argument - origin_space = memory_space(arg_chunk) - state.arg_origin[arg_chunk] = origin_space - state.remote_arg_to_original[arg_chunk] = arg_chunk + origin_space = memory_space(arg) + state.arg_origin[arg] = origin_space + state.remote_arg_to_original[arg] = arg # Populate argument info for all aliasing dependencies - # And return the argument, dependencies, and ArgumentWrappers - if is_typed(spec) - deps = Tuple(DataDepsTaskDependency(arg_chunk, dep) for dep in deps) - map_or_ntuple(deps) do dep_idx - dep = deps[dep_idx] - # Populate argument info - populate_argument_info!(state, dep.arg_w, origin_space) - end - return TypedDataDepsTaskArgument(arg_chunk, pos, may_alias, inplace_move, deps) - else - deps = [DataDepsTaskDependency(arg_chunk, dep) for dep in deps] - map_or_ntuple(deps) do dep_idx - dep = deps[dep_idx] - # Populate argument info - populate_argument_info!(state, dep.arg_w, 
origin_space) - end - return DataDepsTaskArgument(arg_chunk, pos, may_alias, inplace_move, deps) + for (dep_mod, _, _) in deps + aw = ArgumentWrapper(arg, dep_mod) + populate_argument_info!(state, aw, origin_space) end end end @@ -538,62 +399,26 @@ function populate_argument_info!(state::DataDepsState, arg_w::ArgumentWrapper, o state.arg_overlaps[arg_w] = Set{ArgumentWrapper}() end if !haskey(state.arg_history, arg_w) - state.arg_history[arg_w] = Vector{HistoryEntry}() + state.arg_history[arg_w] = Vector{Tuple{AliasingWrapper,MemorySpace,Int}}() end # Calculate the ainfo (which will populate ainfo structures and merge history) aliasing!(state, origin_space, arg_w) end -# N.B. arg_w must be the original argument wrapper, not a remote copy -function aliasing!(state::DataDepsState, target_space::MemorySpace, arg_w::ArgumentWrapper) - if haskey(state.remote_arg_w, arg_w) && haskey(state.remote_arg_w[arg_w], target_space) - remote_arg_w = @inbounds state.remote_arg_w[arg_w][target_space] - remote_arg = remote_arg_w.arg - else - # Grab the remote copy of the argument, and calculate the ainfo - remote_arg = get_or_generate_slot!(state, target_space, arg_w.arg) - remote_arg_w = ArgumentWrapper(remote_arg, arg_w.dep_mod) - get!(Dict{MemorySpace,ArgumentWrapper}, state.remote_arg_w, arg_w)[target_space] = remote_arg_w - end - - # Check if we already have the result cached - if haskey(state.ainfo_cache, remote_arg_w) - return state.ainfo_cache[remote_arg_w] - end - - # Calculate the ainfo - ainfo = AliasingWrapper(aliasing(remote_arg, arg_w.dep_mod)) - - # Cache the result - state.ainfo_cache[remote_arg_w] = ainfo - - # Update the mapping of ainfo to argument and dep_mod - if !haskey(state.ainfo_arg, ainfo) - state.ainfo_arg[ainfo] = Set{ArgumentWrapper}([remote_arg_w]) - end - push!(state.ainfo_arg[ainfo], remote_arg_w) - - # Populate info for the new ainfo - populate_ainfo!(state, arg_w, ainfo, target_space) - - return ainfo -end function 
populate_ainfo!(state::DataDepsState, original_arg_w::ArgumentWrapper, target_ainfo::AliasingWrapper, target_space::MemorySpace) + # Initialize owner and readers if !haskey(state.ainfos_owner, target_ainfo) - # Add ourselves to the lookup oracle - ainfo_idx = push!(state.ainfos_lookup, target_ainfo) - - # Find overlapping ainfos overlaps = Set{AliasingWrapper}() push!(overlaps, target_ainfo) - for other_ainfo in intersect(state.ainfos_lookup, target_ainfo; ainfo_idx) + for other_ainfo in keys(state.ainfos_owner) target_ainfo == other_ainfo && continue - # Mark us and them as overlapping - push!(overlaps, other_ainfo) - push!(state.ainfos_overlaps[other_ainfo], target_ainfo) + if will_alias(target_ainfo, other_ainfo) + # Mark us and them as overlapping + push!(overlaps, other_ainfo) + push!(state.ainfos_overlaps[other_ainfo], target_ainfo) - # Add overlapping history to our own - for other_remote_arg_w in state.ainfo_arg[other_ainfo] + # Add overlapping history to our own + other_remote_arg_w = state.ainfo_arg[other_ainfo] other_arg = state.remote_arg_to_original[other_remote_arg_w.arg] other_arg_w = ArgumentWrapper(other_arg, other_remote_arg_w.dep_mod) push!(state.arg_overlaps[original_arg_w], other_arg_w) @@ -602,53 +427,22 @@ function populate_ainfo!(state::DataDepsState, original_arg_w::ArgumentWrapper, end end state.ainfos_overlaps[target_ainfo] = overlaps - - # Initialize owner and readers state.ainfos_owner[target_ainfo] = nothing state.ainfos_readers[target_ainfo] = Pair{DTask,Int}[] end end function merge_history!(state::DataDepsState, arg_w::ArgumentWrapper, other_arg_w::ArgumentWrapper) history = state.arg_history[arg_w] - @opcounter :merge_history - @opcounter :merge_history_complexity length(history) - origin_space = state.arg_origin[other_arg_w.arg] - for other_entry in state.arg_history[other_arg_w] - write_num_tuple = HistoryEntry(AliasingWrapper(NoAliasing()), origin_space, other_entry.write_num) - range = searchsorted(history, write_num_tuple; 
by=x->x.write_num) - if !isempty(range) - # Find and skip duplicates - match = false - for source_idx in range - source_entry = history[source_idx] - if source_entry.ainfo == other_entry.ainfo && - source_entry.space == other_entry.space && - source_entry.write_num == other_entry.write_num - match = true - break - end + for (other_ainfo, other_space, write_num) in state.arg_history[other_arg_w] + idx = findfirst(h->h[3] > write_num, history) + if idx === nothing + if isempty(history) + idx = 1 + else + idx = length(history) + 1 end - match && continue - - # Insert at the first position - idx = first(range) - else - # Insert at the last position - idx = length(history) + 1 - end - insert!(history, idx, other_entry) - end -end -function truncate_history!(state::DataDepsState, arg_w::ArgumentWrapper) - # FIXME: Do this continuously if possible - if haskey(state.arg_history, arg_w) && length(state.arg_history[arg_w]) > 100000 - origin_space = state.arg_origin[arg_w.arg] - @opcounter :truncate_history - _, last_idx = compute_remainder_for_arg!(state, origin_space, arg_w, 0; compute_syncdeps=false) - if last_idx > 0 - @opcounter :truncate_history_removed last_idx - deleteat!(state.arg_history[arg_w], 1:last_idx) end + insert!(history, idx, (other_ainfo, other_space, write_num)) end end @@ -664,8 +458,11 @@ use of `x`, and the data in `x` will not be updated when the `spawn_datadeps` region returns. 
""" supports_inplace_move(x) = true -supports_inplace_move(t::DTask) = supports_inplace_move(fetch(t; raw=true)) +supports_inplace_move(t::DTask) = supports_inplace_move(fetch(t; move_value=false, unwrap=false)) +@warn "Fix this to work with MPI (can't call poolget on the wrong rank)" maxlog=1 function supports_inplace_move(c::Chunk) + # FIXME + return true # FIXME: Use MemPool.access_ref pid = root_worker_id(c.processor) if pid == myid() @@ -719,12 +516,12 @@ function add_writer!(state::DataDepsState, arg_w::ArgumentWrapper, dest_space::M empty!(state.arg_history[arg_w]) # Add our own history - push!(state.arg_history[arg_w], HistoryEntry(ainfo, dest_space, write_num)) + push!(state.arg_history[arg_w], (ainfo, dest_space, write_num)) # Find overlapping arguments and update their history for other_arg_w in state.arg_overlaps[arg_w] other_arg_w == arg_w && continue - push!(state.arg_history[other_arg_w], HistoryEntry(ainfo, dest_space, write_num)) + push!(state.arg_history[other_arg_w], (ainfo, dest_space, write_num)) end # Record the last place we were fully written to @@ -737,12 +534,19 @@ function add_reader!(state::DataDepsState, arg_w::ArgumentWrapper, dest_space::M push!(state.ainfos_readers[ainfo], task=>write_num) end +# FIXME: These should go in MPIExt.jl +const MPI_TID = ScopedValue{Int64}(0) +const MPI_UID = ScopedValue{Int64}(0) + # Make a copy of each piece of data on each worker # memory_space => {arg => copy_of_arg} isremotehandle(x) = false isremotehandle(x::DTask) = true isremotehandle(x::Chunk) = true function generate_slot!(state::DataDepsState, dest_space, data) + if data isa DTask + data = fetch(data; move_value=false, unwrap=false) + end # N.B. We do not perform any sync/copy with the current owner of the data, # because all we want here is to make a copy of some version of the data, # even if the data is not up to date. 
@@ -750,16 +554,30 @@ function generate_slot!(state::DataDepsState, dest_space, data) to_proc = first(processors(dest_space)) from_proc = first(processors(orig_space)) dest_space_args = get!(IdDict{Any,Any}, state.remote_args, dest_space) - aliased_object_cache = AliasedObjectCache(dest_space, state.ainfo_backing_chunk) - ctx = Sch.eager_context() - id = rand(Int) - @maybelog ctx timespan_start(ctx, :move, (;thunk_id=0, id, position=ArgPosition(), processor=to_proc), (;f=nothing, data)) - data_chunk = move_rewrap(aliased_object_cache, from_proc, to_proc, orig_space, dest_space, data) - @maybelog ctx timespan_finish(ctx, :move, (;thunk_id=0, id, position=ArgPosition(), processor=to_proc), (;f=nothing, data=data_chunk)) + ALIASED_OBJECT_CACHE[] = get!(Dict{AbstractAliasing,Chunk}, state.ainfo_backing_chunk, dest_space) + if orig_space == dest_space && (data isa Chunk || !isremotehandle(data)) + # Fast path for local data that's already in a Chunk or not a remote handle needing rewrapping + task = DATADEPS_CURRENT_TASK[] + data_chunk = with(MPI_UID=>task.uid) do + tochunk(data, from_proc) + end + else + ctx = Sch.eager_context() + id = rand(Int) + @maybelog ctx timespan_start(ctx, :move, (;thunk_id=0, id, position=ArgPosition(), processor=to_proc), (;f=nothing, data)) + data_chunk = move_rewrap(from_proc, to_proc, orig_space, dest_space, data) + @maybelog ctx timespan_finish(ctx, :move, (;thunk_id=0, id, position=ArgPosition(), processor=to_proc), (;f=nothing, data=data_chunk)) + end @assert memory_space(data_chunk) == dest_space "space mismatch! $dest_space (dest) != $(memory_space(data_chunk)) (actual) ($(typeof(data)) (data) vs. 
$(typeof(data_chunk)) (chunk)), spaces ($orig_space -> $dest_space)" dest_space_args[data] = data_chunk state.remote_arg_to_original[data_chunk] = data + ALIASED_OBJECT_CACHE[] = nothing + + check_uniform(memory_space(dest_space_args[data])) + check_uniform(processor(dest_space_args[data])) + check_uniform(dest_space_args[data].handle) + return dest_space_args[data] end function get_or_generate_slot!(state, dest_space, data) @@ -772,82 +590,67 @@ function get_or_generate_slot!(state, dest_space, data) end return state.remote_args[dest_space][data] end -function remotecall_endpoint(f, from_proc, to_proc, from_space, to_space, data) - to_w = root_worker_id(to_proc) - if to_w == myid() - data_converted = f(move(from_proc, to_proc, data)) - return tochunk(data_converted, to_proc) - end - return remotecall_fetch(to_w, from_proc, to_proc, to_space, data) do from_proc, to_proc, to_space, data - data_converted = f(move(from_proc, to_proc, data)) - return tochunk(data_converted, to_proc) - end -end -function rewrap_aliased_object!(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, x) - return aliased_object!(cache, x) do x - return remotecall_endpoint(identity, from_proc, to_proc, from_space, to_space, x) +function move_rewrap(from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, data) + return aliased_object!(data) do data + return remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, data) end end -function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, data::Chunk) - # Unwrap so that we hit the right dispatch - wid = root_worker_id(data) - if wid != myid() - return remotecall_fetch(move_rewrap, wid, cache, from_proc, to_proc, from_space, to_space, data) +function remotecall_endpoint(f, ::Dagger.DistributedAcceleration, from_proc, to_proc, 
orig_space, dest_space, data) + to_w = root_worker_id(to_proc) + return remotecall_fetch(to_w, from_proc, to_proc, dest_space, data) do from_proc, to_proc, dest_space, data + data_converted = f(move(from_proc, to_proc, data)) + return tochunk(data_converted, to_proc, dest_space) end - data_raw = unwrap(data) - return move_rewrap(cache, from_proc, to_proc, from_space, to_space, data_raw) end -function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, data) - # For generic data - return aliased_object!(cache, data) do data - return remotecall_endpoint(identity, from_proc, to_proc, from_space, to_space, data) - end +const ALIASED_OBJECT_CACHE = TaskLocalValue{Union{Dict{AbstractAliasing,Chunk}, Nothing}}(()->nothing) +@warn "Document these public methods" maxlog=1 +# TODO: Use state to cache aliasing() results +function declare_aliased_object!(x; ainfo=aliasing(current_acceleration(), x, identity)) + cache = ALIASED_OBJECT_CACHE[] + cache[ainfo] = x end -function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, v::SubArray) - to_w = root_worker_id(to_proc) - p_chunk = rewrap_aliased_object!(cache, from_proc, to_proc, from_space, to_space, parent(v)) - inds = parentindices(v) - return remotecall_fetch(to_w, from_proc, to_proc, from_space, to_space, p_chunk, inds) do from_proc, to_proc, from_space, to_space, p_chunk, inds - p_new = move(from_proc, to_proc, p_chunk) - v_new = view(p_new, inds...) - return tochunk(v_new, to_proc) +function aliased_object!(x; ainfo=aliasing(current_acceleration(), x, identity)) + cache = ALIASED_OBJECT_CACHE[] + if haskey(cache, ainfo) + y = cache[ainfo] + else + @assert x isa Chunk "x must be a Chunk\nUse functor form of aliased_object!" 
+ cache[ainfo] = x + y = x end + return y end -# FIXME: Do this programmatically via recursive dispatch -for wrapper in (UpperTriangular, LowerTriangular, UnitUpperTriangular, UnitLowerTriangular) - @eval function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, v::$(wrapper)) - to_w = root_worker_id(to_proc) - p_chunk = rewrap_aliased_object!(cache, from_proc, to_proc, from_space, to_space, parent(v)) - return remotecall_fetch(to_w, from_proc, to_proc, from_space, to_space, p_chunk) do from_proc, to_proc, from_space, to_space, p_chunk - p_new = move(from_proc, to_proc, p_chunk) - v_new = $(wrapper)(p_new) - return tochunk(v_new, to_proc) - end +function aliased_object!(f, x; ainfo=aliasing(current_acceleration(), x, identity)) + cache = ALIASED_OBJECT_CACHE[] + if haskey(cache, ainfo) + y = cache[ainfo] + else + y = f(x) + @assert y isa Chunk "Didn't get a Chunk from functor" + cache[ainfo] = y end + return y end -function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, v::Base.RefValue) - return aliased_object!(cache, v) do v - return remotecall_endpoint(identity, from_proc, to_proc, from_space, to_space, v) - end +function aliased_object_unwrap!(x::Chunk) + y = unwrap(x) + ainfo = aliasing(current_acceleration(), y, identity) + return unwrap(aliased_object!(x; ainfo)) end -#= FIXME: Make this work so we can automatically move-rewrap recursive objects -function move_rewrap_recursive(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, x::T) where T - if isstructtype(T) - # Check all object fields (recursive) - for field in fieldnames(T) - value = getfield(x, field) - new_value = aliased_object!(cache, value) do value - return move_rewrap_recursive(cache, from_proc, to_proc, from_space, to_space, value) - end - setfield!(x, field, new_value) - 
end - return x - else - @warn "Cannot move-rewrap object of type $T" - return x + +struct DataDepsSchedulerState + task_to_spec::Dict{DTask,DTaskSpec} + assignments::Dict{DTask,MemorySpace} + dependencies::Dict{DTask,Set{DTask}} + task_completions::Dict{DTask,UInt64} + space_completions::Dict{MemorySpace,UInt64} + capacities::Dict{MemorySpace,Int} + + function DataDepsSchedulerState() + return new(Dict{DTask,DTaskSpec}(), + Dict{DTask,MemorySpace}(), + Dict{DTask,Set{DTask}}(), + Dict{DTask,UInt64}(), + Dict{MemorySpace,UInt64}(), + Dict{MemorySpace,Int}()) end end -move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, x::String) = x # FIXME: Not necessarily true -move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, x::Symbol) = x -move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, x::Type) = x -=# diff --git a/src/datadeps/chunkview.jl b/src/datadeps/chunkview.jl index 1c2aa600f..6e2a21dfd 100644 --- a/src/datadeps/chunkview.jl +++ b/src/datadeps/chunkview.jl @@ -3,10 +3,6 @@ struct ChunkView{N} slices::NTuple{N, Union{Int, AbstractRange{Int}, Colon}} end -function _identity_hash(arg::ChunkView, h::UInt=UInt(0)) - return hash(arg.slices, _identity_hash(arg.chunk, h)) -end - function Base.view(c::Chunk, slices...) if c.domain isa ArrayDomain nd, sz = ndims(c.domain), size(c.domain) @@ -29,39 +25,31 @@ function Base.view(c::Chunk, slices...) return ChunkView(c, slices) end -Base.view(c::DTask, slices...) = view(fetch(c; raw=true), slices...) +Base.view(c::DTask, slices...) = view(fetch(c; move_value=false, unwrap=false), slices...) -function aliasing(x::ChunkView{N}) where N - return remotecall_fetch(root_worker_id(x.chunk.processor), x.chunk, x.slices) do x, slices - x = unwrap(x) - v = view(x, slices...) 
- return aliasing(v) - end -end +aliasing(x::ChunkView) = + throw(ConcurrencyViolationError("Cannot query aliasing of a ChunkView directly")) memory_space(x::ChunkView) = memory_space(x.chunk) isremotehandle(x::ChunkView) = true -function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, slice::ChunkView) - to_w = root_worker_id(to_proc) - # N.B. We use move_rewrap (not rewrap_aliased_object!) so that if the inner - # chunk is a SubArray, it goes through the SubArray-aware path which shares - # the parent array via the aliased object cache. Using rewrap_aliased_object! - # would simply serialize the entire SubArray, creating a new parent copy on - # the destination, breaking aliasing with other views of the same parent. - p_chunk = move_rewrap(cache, from_proc, to_proc, from_space, to_space, slice.chunk) - return remotecall_fetch(to_w, from_proc, to_proc, from_space, to_space, p_chunk, slice.slices) do from_proc, to_proc, from_space, to_space, p_chunk, inds - p_new = move(from_proc, to_proc, p_chunk) - v_new = view(p_new, inds...) - return tochunk(v_new, to_proc) +# This definition is here because it's so similar to ChunkView +function move_rewrap(from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, v::SubArray) + p_chunk = aliased_object!(parent(v)) do p_chunk + return remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, p_chunk) + end + inds = parentindices(v) + return remotecall_endpoint(current_acceleration(), from_proc, to_proc, from_space, to_space, p_chunk) do p_new + return view(p_new, inds...) 
end end -function move(from_proc::Processor, to_proc::Processor, slice::ChunkView) - to_w = root_worker_id(to_proc) - return remotecall_fetch(to_w, from_proc, to_proc, slice.chunk, slice.slices) do from_proc, to_proc, chunk, slices - chunk_new = move(from_proc, to_proc, chunk) - v_new = view(chunk_new, slices...) - return tochunk(v_new, to_proc) +function move_rewrap(from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, slice::ChunkView) + p_chunk = aliased_object!(slice.chunk) do p_chunk + return remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, p_chunk) + end + inds = slice.slices + return remotecall_endpoint(current_acceleration(), from_proc, to_proc, from_space, to_space, p_chunk) do p_new + return view(p_new, inds...) end end -Base.fetch(slice::ChunkView) = view(fetch(slice.chunk), slice.slices...) \ No newline at end of file +Base.fetch(slice::ChunkView) = view(fetch(slice.chunk), slice.slices...) diff --git a/src/datadeps/queue.jl b/src/datadeps/queue.jl index 3e8d89d50..4e68ecbca 100644 --- a/src/datadeps/queue.jl +++ b/src/datadeps/queue.jl @@ -1,32 +1,44 @@ -struct DataDepsTaskQueue{Scheduler<:DataDepsScheduler} <: AbstractTaskQueue +struct DataDepsTaskQueue <: AbstractTaskQueue # The queue above us upper_queue::AbstractTaskQueue # The set of tasks that have already been seen - seen_tasks::Union{Vector{DTaskPair},Nothing} + seen_tasks::Union{Vector{Pair{DTaskSpec,DTask}},Nothing} # The data-dependency graph of all tasks g::Union{SimpleDiGraph{Int},Nothing} # The mapping from task to graph ID task_to_id::Union{Dict{DTask,Int},Nothing} + # How to traverse the dependency graph when launching tasks + traversal::Symbol # Which scheduler to use to assign tasks to processors - scheduler::Scheduler + scheduler::Symbol - function DataDepsTaskQueue(upper_queue; scheduler::DataDepsScheduler) - seen_tasks = DTaskPair[] + # Whether aliasing across arguments is possible + # The fields 
following only apply when aliasing==true + aliasing::Bool + + function DataDepsTaskQueue(upper_queue; + traversal::Symbol=:inorder, + scheduler::Symbol=:naive, + aliasing::Bool=true) + seen_tasks = Pair{DTaskSpec,DTask}[] g = SimpleDiGraph() task_to_id = Dict{DTask,Int}() - return new{typeof(scheduler)}(upper_queue, seen_tasks, g, task_to_id, scheduler) + return new(upper_queue, seen_tasks, g, task_to_id, traversal, scheduler, + aliasing) end end -function enqueue!(queue::DataDepsTaskQueue, pair::DTaskPair) - push!(queue.seen_tasks, pair) +function enqueue!(queue::DataDepsTaskQueue, spec::Pair{DTaskSpec,DTask}) + push!(queue.seen_tasks, spec) end -function enqueue!(queue::DataDepsTaskQueue, pairs::Vector{DTaskPair}) - append!(queue.seen_tasks, pairs) +function enqueue!(queue::DataDepsTaskQueue, specs::Vector{Pair{DTaskSpec,DTask}}) + append!(queue.seen_tasks, specs) end +const DATADEPS_CURRENT_TASK = TaskLocalValue{Union{DTask,Nothing}}(Returns(nothing)) + """ - spawn_datadeps(f::Base.Callable) + spawn_datadeps(f::Base.Callable; traversal::Symbol=:inorder) Constructs a "datadeps" (data dependencies) region and calls `f` within it. Dagger tasks launched within `f` may wrap their arguments with `In`, `Out`, or @@ -53,41 +65,46 @@ appropriately. At the end of executing `f`, `spawn_datadeps` will wait for all launched tasks to complete, rethrowing the first error, if any. The result of `f` will be returned from `spawn_datadeps`. + +The keyword argument `traversal` controls the order that tasks are launched by +the scheduler, and may be set to `:bfs` or `:dfs` for Breadth-First Scheduling +or Depth-First Scheduling, respectively. All traversal orders respect the +dependencies and ordering of the launched tasks, but may provide better or +worse performance for a given set of datadeps tasks. This argument is +experimental and subject to change. 
""" function spawn_datadeps(f::Base.Callable; static::Bool=true, traversal::Symbol=:inorder, - scheduler::Union{DataDepsScheduler,Nothing}=nothing, + scheduler::Union{Symbol,Nothing}=nothing, aliasing::Bool=true, launch_wait::Union{Bool,Nothing}=nothing) if !static throw(ArgumentError("Dynamic scheduling is no longer available")) end - if traversal != :inorder - throw(ArgumentError("Traversal order is no longer configurable, and always :inorder")) - end - if !aliasing - throw(ArgumentError("Aliasing analysis is no longer optional")) - end wait_all(; check_errors=true) do - scheduler = something(scheduler, DATADEPS_SCHEDULER[], RoundRobinScheduler()) + scheduler = something(scheduler, DATADEPS_SCHEDULER[], :roundrobin)::Symbol launch_wait = something(launch_wait, DATADEPS_LAUNCH_WAIT[], false)::Bool if launch_wait result = spawn_bulk() do - queue = DataDepsTaskQueue(get_options(:task_queue); scheduler) + queue = DataDepsTaskQueue(get_options(:task_queue); + traversal, scheduler, aliasing) with_options(f; task_queue=queue) distribute_tasks!(queue) end else - queue = DataDepsTaskQueue(get_options(:task_queue); scheduler) + queue = DataDepsTaskQueue(get_options(:task_queue); + traversal, scheduler, aliasing) result = with_options(f; task_queue=queue) distribute_tasks!(queue) end + DATADEPS_CURRENT_TASK[] = nothing return result end end -const DATADEPS_SCHEDULER = ScopedValue{Union{DataDepsScheduler,Nothing}}(nothing) +const DATADEPS_SCHEDULER = ScopedValue{Union{Symbol,Nothing}}(nothing) const DATADEPS_LAUNCH_WAIT = ScopedValue{Union{Bool,Nothing}}(nothing) +@warn "Don't blindly set occupancy=0, only do for MPI" maxlog=1 function distribute_tasks!(queue::DataDepsTaskQueue) #= TODO: Improvements to be made: # - Support for copying non-AbstractArray arguments @@ -98,258 +115,377 @@ function distribute_tasks!(queue::DataDepsTaskQueue) =# # Get the set of all processors to be scheduled on - all_procs = Processor[] scope = get_compute_scope() - for w in procs() - 
append!(all_procs, get_processors(OSProc(w))) + accel = current_acceleration() + accel_procs = filter(procs(Dagger.Sch.eager_context())) do proc + Dagger.accel_matches_proc(accel, proc) end + all_procs = unique(vcat([collect(Dagger.get_processors(gp)) for gp in accel_procs]...)) + # FIXME: This is an unreliable way to ensure processor uniformity + sort!(all_procs, by=short_name) filter!(proc->proc_in_scope(proc, scope), all_procs) if isempty(all_procs) throw(Sch.SchedulingException("No processors available, try widening scope")) end - all_scope = UnionScope(map(ExactScope, all_procs)) exec_spaces = unique(vcat(map(proc->collect(memory_spaces(proc)), all_procs)...)) - if !all(space->space isa CPURAMMemorySpace, exec_spaces) && !all(space->root_worker_id(space) == myid(), exec_spaces) + #=if !all(space->space isa CPURAMMemorySpace, exec_spaces) && !all(space->root_worker_id(space) == myid(), exec_spaces) @warn "Datadeps support for multi-GPU, multi-worker is currently broken\nPlease be prepared for incorrect results or errors" maxlog=1 + end=# + for proc in all_procs + check_uniform(proc) end # Round-robin assign tasks to processors upper_queue = get_options(:task_queue) - # Start launching tasks and necessary copies - state = DataDepsState() - write_num = 1 - proc_to_scope_lfu = BasicLFUCache{Processor,AbstractScope}(1024) - for pair in queue.seen_tasks - spec = pair.spec - task = pair.task - write_num = distribute_task!(queue, state, all_procs, all_scope, spec, task, spec.fargs, proc_to_scope_lfu, write_num) - end - - # Copy args from remote to local - # N.B. We sort the keys to ensure a deterministic order for uniformity - for arg_w in sort(collect(keys(state.arg_owner)); by=arg_w->arg_w.hash) - arg = arg_w.arg - origin_space = state.arg_origin[arg] - remainder, _ = compute_remainder_for_arg!(state, origin_space, arg_w, write_num) - if remainder isa MultiRemainderAliasing - origin_scope = UnionScope(map(ExactScope, collect(processors(origin_space)))...) 
- enqueue_remainder_copy_from!(state, origin_space, arg_w, remainder, origin_scope, write_num) - elseif remainder isa FullCopy - origin_scope = UnionScope(map(ExactScope, collect(processors(origin_space)))...) - enqueue_copy_from!(state, origin_space, arg_w, origin_scope, write_num) - else - @assert remainder isa NoAliasing "Expected NoAliasing, got $(typeof(remainder))" - @dagdebug nothing :spawn_datadeps "Skipped copy-from (up-to-date): $origin_space" - ctx = Sch.eager_context() - id = rand(UInt) - @maybelog ctx timespan_start(ctx, :datadeps_copy_skip, (;id), (;)) - @maybelog ctx timespan_finish(ctx, :datadeps_copy_skip, (;id), (;thunk_id=0, from_space=origin_space, to_space=origin_space, arg_w, from_arg=arg, to_arg=arg)) + traversal = queue.traversal + if traversal == :inorder + # As-is + task_order = Colon() + elseif traversal == :bfs + # BFS + task_order = Int[1] + to_walk = Int[1] + seen = Set{Int}([1]) + while !isempty(to_walk) + # N.B. next_root has already been seen + next_root = popfirst!(to_walk) + for v in outneighbors(queue.g, next_root) + if !(v in seen) + push!(task_order, v) + push!(seen, v) + push!(to_walk, v) + end + end end - end -end -struct DataDepsTaskDependency - arg_w::ArgumentWrapper - readdep::Bool - writedep::Bool -end -DataDepsTaskDependency(arg, dep) = - DataDepsTaskDependency(ArgumentWrapper(arg, dep[1]), dep[2], dep[3]) -struct DataDepsTaskArgument - arg - pos::ArgPosition - may_alias::Bool - inplace_move::Bool - deps::Vector{DataDepsTaskDependency} -end -struct TypedDataDepsTaskArgument{T,N} - arg::T - pos::ArgPosition - may_alias::Bool - inplace_move::Bool - deps::NTuple{N,DataDepsTaskDependency} -end -map_or_ntuple(f, xs::Vector) = map(f, 1:length(xs)) -@inline map_or_ntuple(@specialize(f), xs::NTuple{N,T}) where {N,T} = ntuple(f, Val(N)) -function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_procs, all_scope, spec::DTaskSpec{typed}, task::DTask, fargs, proc_to_scope_lfu, write_num::Int) where typed - 
@specialize spec fargs - - if typed - fargs::Tuple - else - fargs::Vector{Argument} - end - - task_scope = @something(spec.options.compute_scope, spec.options.scope, DefaultScope()) - scheduler = queue.scheduler - our_proc = datadeps_schedule_task(scheduler, state, all_procs, all_scope, task_scope, spec, task) - @assert our_proc in all_procs - our_space = only(memory_spaces(our_proc)) - - # Find the scope for this task (and its copies) - task_scope = @something(spec.options.compute_scope, spec.options.scope, DefaultScope()) - if task_scope == all_scope - # Optimize for the common case, cache the proc=>scope mapping - our_scope = get!(proc_to_scope_lfu, our_proc) do - our_procs = filter(proc->proc in all_procs, collect(processors(our_space))) - return constrain(UnionScope(map(ExactScope, our_procs)...), all_scope) + elseif traversal == :dfs + # DFS (modified with backtracking) + task_order = Int[] + to_walk = Int[1] + seen = Set{Int}() + while length(task_order) < length(queue.seen_tasks) && !isempty(to_walk) + next_root = popfirst!(to_walk) + if !(next_root in seen) + iv = inneighbors(queue.g, next_root) + if all(v->v in seen, iv) + push!(task_order, next_root) + push!(seen, next_root) + ov = outneighbors(queue.g, next_root) + prepend!(to_walk, ov) + else + push!(to_walk, next_root) + end + end end else - # Use the provided scope and constrain it to the available processors - our_procs = filter(proc->proc in all_procs, collect(processors(our_space))) - our_scope = constrain(UnionScope(map(ExactScope, our_procs)...), task_scope) - end - if our_scope isa InvalidScope - throw(Sch.SchedulingException("Scopes are not compatible: $(our_scope.x), $(our_scope.y)")) + throw(ArgumentError("Invalid traversal mode: $traversal")) end - f = spec.fargs[1] - tid = task.uid - # FIXME: May not be correct to move this under uniformity - #f.value = move(default_processor(), our_proc, value(f)) - @dagdebug tid :spawn_datadeps "($(repr(value(f)))) Scheduling: $our_proc ($our_space)" - - 
# Copy raw task arguments for analysis - # N.B. Used later for checking dependencies - task_args = map_or_ntuple(idx->copy(spec.fargs[idx]), spec.fargs) - - # Populate all task dependencies - task_arg_ws = populate_task_info!(state, task_args, spec, task) - - # Truncate the history for each argument - map_or_ntuple(task_arg_ws) do idx - arg_ws = task_arg_ws[idx] - map_or_ntuple(arg_ws.deps) do dep_idx - dep = arg_ws.deps[dep_idx] - truncate_history!(state, dep.arg_w) - end - return + state = DataDepsState(queue.aliasing) + sstate = DataDepsSchedulerState() + for proc in all_procs + space = only(memory_spaces(proc)) + get!(()->0, sstate.capacities, space) + sstate.capacities[space] += 1 end - # Copy args from local to remote - remote_args = map_or_ntuple(task_arg_ws) do idx - arg_ws = task_arg_ws[idx] - arg = arg_ws.arg - pos = raw_position(arg_ws.pos) + # Start launching tasks and necessary copies + write_num = 1 + proc_idx = 1 + pressures = Dict{Processor,Int}() + proc_to_scope_lfu = BasicLFUCache{Processor,AbstractScope}(1024) + for (spec, task) in queue.seen_tasks[task_order] + DATADEPS_CURRENT_TASK[] = task + + # Populate all task dependencies + populate_task_info!(state, spec, task) + + scheduler = queue.scheduler + if scheduler == :naive + raw_args = map(arg->tochunk(value(arg)), spec.fargs) + our_proc = remotecall_fetch(1, all_procs, raw_args) do all_procs, raw_args + Sch.init_eager() + sch_state = Sch.EAGER_STATE[] + + @lock sch_state.lock begin + # Calculate costs per processor and select the most optimal + # FIXME: This should consider any already-allocated slots, + # whether they are up-to-date, and if not, the cost of moving + # data to them + procs, costs = Sch.estimate_task_costs(sch_state, all_procs, nothing, raw_args) + return first(procs) + end + end + elseif scheduler == :smart + raw_args = map(filter(arg->haskey(state.data_locality, value(arg)), spec.fargs)) do arg + arg_chunk = tochunk(value(arg)) + # Only the owned slot is valid + # FIXME: 
Track up-to-date copies and pass all of those + return arg_chunk => data_locality[arg] + end + f_chunk = tochunk(value(spec.fargs[1])) + our_proc, task_pressure = remotecall_fetch(1, all_procs, pressures, f_chunk, raw_args) do all_procs, pressures, f, chunks_locality + Sch.init_eager() + sch_state = Sch.EAGER_STATE[] + + @lock sch_state.lock begin + tx_rate = sch_state.transfer_rate[] + + costs = Dict{Processor,Float64}() + for proc in all_procs + # Filter out chunks that are already local + chunks_filt = Iterators.filter(((chunk, space)=chunk_locality)->!(proc in processors(space)), chunks_locality) + + # Estimate network transfer costs based on data size + # N.B. `affinity(x)` really means "data size of `x`" + # N.B. We treat same-worker transfers as having zero transfer cost + tx_cost = Sch.impute_sum(affinity(chunk)[2] for chunk in chunks_filt) + + # Estimate total cost to move data and get task running after currently-scheduled tasks + est_time_util = get(pressures, proc, UInt64(0)) + costs[proc] = est_time_util + (tx_cost/tx_rate) + end + + # Look up estimated task cost + sig = Sch.signature(sch_state, f, map(first, chunks_locality)) + task_pressure = get(sch_state.signature_time_cost, sig, 1000^3) + + # Shuffle procs around, so equally-costly procs are equally considered + P = randperm(length(all_procs)) + procs = getindex.(Ref(all_procs), P) + + # Sort by lowest cost first + sort!(procs, by=p->costs[p]) + + best_proc = first(procs) + return best_proc, task_pressure + end + end + # FIXME: Pressure should be decreased by pressure of syncdeps on same processor + pressures[our_proc] = get(pressures, our_proc, UInt64(0)) + task_pressure + elseif scheduler == :ultra + args = Base.mapany(spec.fargs) do arg + pos, data = arg + data, _ = unwrap_inout(data) + if data isa DTask + data = fetch(data; move_value=false, unwrap=false) + end + return pos => tochunk(data) + end + f_chunk = tochunk(value(spec.fargs[1])) + task_time = remotecall_fetch(1, f_chunk, args) do f, 
args + Sch.init_eager() + sch_state = Sch.EAGER_STATE[] + return @lock sch_state.lock begin + sig = Sch.signature(sch_state, f, args) + return get(sch_state.signature_time_cost, sig, 1000^3) + end + end - # Is the data written previously or now? - if !arg_ws.may_alias - @dagdebug tid :spawn_datadeps "($(repr(value(f))))[$(idx-1)] Skipped copy-to (immutable)" - return arg - end + # FIXME: Copy deps are computed eagerly + deps = @something(spec.options.syncdeps, Set{Any}()) - # Is the data writeable? - if !arg_ws.inplace_move - @dagdebug tid :spawn_datadeps "($(repr(value(f))))[$(idx-1)] Skipped copy-to (non-writeable)" - return arg - end + # Find latest time-to-completion of all syncdeps + deps_completed = UInt64(0) + for dep in deps + haskey(sstate.task_completions, dep) || continue # copy deps aren't recorded + deps_completed = max(deps_completed, sstate.task_completions[dep]) + end - # Is the source of truth elsewhere? - arg_remote = get_or_generate_slot!(state, our_space, arg) - map_or_ntuple(arg_ws.deps) do dep_idx - dep = arg_ws.deps[dep_idx] - arg_w = dep.arg_w - dep_mod = arg_w.dep_mod - remainder, _ = compute_remainder_for_arg!(state, our_space, arg_w, write_num) - if remainder isa MultiRemainderAliasing - enqueue_remainder_copy_to!(state, our_space, arg_w, remainder, value(f), idx, our_scope, task, write_num) - elseif remainder isa FullCopy - enqueue_copy_to!(state, our_space, arg_w, value(f), idx, our_scope, task, write_num) - else - @assert remainder isa NoAliasing "Expected NoAliasing, got $(typeof(remainder))" - @dagdebug tid :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Skipped copy-to (up-to-date): $our_space" + # Find latest time-to-completion of each memory space + # FIXME: Figure out space completions based on optimal packing + spaces_completed = Dict{MemorySpace,UInt64}() + for space in exec_spaces + completed = UInt64(0) + for (task, other_space) in sstate.assignments + space == other_space || continue + completed = max(completed, 
sstate.task_completions[task]) + end + spaces_completed[space] = completed end - end - return arg_remote - end - write_num += 1 - # Validate that we're not accidentally performing a copy - map_or_ntuple(task_arg_ws) do idx - arg_ws = task_arg_ws[idx] - arg = remote_args[idx] + # Choose the earliest-available memory space and processor + # FIXME: Consider move time + move_time = UInt64(0) + local our_space_completed + while true + our_space_completed, our_space = findmin(spaces_completed) + our_space_procs = filter(proc->proc in all_procs, processors(our_space)) + if isempty(our_space_procs) + delete!(spaces_completed, our_space) + continue + end + our_proc = rand(our_space_procs) + break + end - # Get the dependencies again as (dep_mod, readdep, writedep) - deps = map_or_ntuple(arg_ws.deps) do dep_idx - dep = arg_ws.deps[dep_idx] - (dep.arg_w.dep_mod, dep.readdep, dep.writedep) + sstate.task_to_spec[task] = spec + sstate.assignments[task] = our_space + sstate.task_completions[task] = our_space_completed + move_time + task_time + elseif scheduler == :roundrobin + our_proc = all_procs[proc_idx] + else + error("Invalid scheduler: $sched") end - - # Check that any mutable and written arguments are already in the correct space - # N.B. 
We only do this check when the argument supports in-place - # moves, because for the moment, we are not guaranteeing updates or - # write-back of results - if is_writedep(arg, deps, task) && arg_ws.may_alias && arg_ws.inplace_move - arg_space = memory_space(arg) - @assert arg_space == our_space "($(repr(value(f))))[$(idx-1)] Tried to pass $(typeof(arg)) from $arg_space to $our_space" + @assert our_proc in all_procs + our_space = only(memory_spaces(our_proc)) + + # Find the scope for this task (and its copies) + task_scope = @something(spec.options.compute_scope, spec.options.scope, DefaultScope()) + if task_scope == scope + # Optimize for the common case, cache the proc=>scope mapping + our_scope = get!(proc_to_scope_lfu, our_proc) do + our_procs = filter(proc->proc in all_procs, collect(processors(our_space))) + return constrain(UnionScope(map(ExactScope, our_procs)...), scope) + end + else + # Use the provided scope and constrain it to the available processors + our_procs = filter(proc->proc in all_procs, collect(processors(our_space))) + our_scope = constrain(UnionScope(map(ExactScope, our_procs)...), task_scope) end - end + if our_scope isa InvalidScope + throw(Sch.SchedulingException("Scopes are not compatible: $(our_scope.x), $(our_scope.y)")) + end + check_uniform(our_proc) + check_uniform(our_space) + + f = spec.fargs[1] + # FIXME: May not be correct to move this under uniformity + f.value = move(default_processor(), our_proc, value(f)) + @dagdebug nothing :spawn_datadeps "($(repr(value(f)))) Scheduling: $our_proc ($our_space)" + + # Copy raw task arguments for analysis + task_args = map(copy, spec.fargs) + + # Generate a list of ArgumentWrappers for each task argument + task_arg_ws = map(task_args) do _arg + arg = value(_arg) + arg, deps = unwrap_inout(arg) + arg = arg isa DTask ? 
fetch(arg; move_value=false, unwrap=false) : arg + if !type_may_alias(typeof(arg)) || !supports_inplace_move(state, arg) + return [(ArgumentWrapper(arg, identity), false, false)] + end + arg_ws = Tuple{ArgumentWrapper,Bool,Bool}[] + for (dep_mod, readdep, writedep) in deps + push!(arg_ws, (ArgumentWrapper(arg, dep_mod), readdep, writedep)) + end + return arg_ws + end + task_arg_ws = task_arg_ws::Vector{Vector{Tuple{ArgumentWrapper,Bool,Bool}}} + + # Copy args from local to remote + for (idx, arg_ws) in enumerate(task_arg_ws) + arg = first(arg_ws)[1].arg + pos = raw_position(task_args[idx]) + + # Is the data written previously or now? + if !type_may_alias(typeof(arg)) + @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)] Skipped copy-to (immutable)" + spec.fargs[idx].value = arg + continue + end - # Calculate this task's syncdeps - if spec.options.syncdeps === nothing - spec.options.syncdeps = Set{ThunkSyncdep}() - end - syncdeps = spec.options.syncdeps - map_or_ntuple(task_arg_ws) do idx - arg_ws = task_arg_ws[idx] - arg = arg_ws.arg - arg_ws.may_alias || return - arg_ws.inplace_move || return - map_or_ntuple(arg_ws.deps) do dep_idx - dep = arg_ws.deps[dep_idx] - arg_w = dep.arg_w - ainfo = aliasing!(state, our_space, arg_w) - dep_mod = arg_w.dep_mod - if dep.writedep - @dagdebug tid :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Syncing as writer" - get_write_deps!(state, our_space, ainfo, write_num, syncdeps) - else - @dagdebug tid :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Syncing as reader" - get_read_deps!(state, our_space, ainfo, write_num, syncdeps) + # Is the data writeable? + if !supports_inplace_move(state, arg) + @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)] Skipped copy-to (non-writeable)" + spec.fargs[idx].value = arg + continue + end + + # Is the source of truth elsewhere? 
+ arg_remote = get_or_generate_slot!(state, our_space, arg) + for (arg_w, _, _) in arg_ws + dep_mod = arg_w.dep_mod + remainder = compute_remainder_for_arg!(state, our_space, arg_w, write_num) + if remainder isa MultiRemainderAliasing + enqueue_remainder_copy_to!(state, our_space, arg_w, remainder, value(f), idx, our_scope, task, write_num) + elseif remainder isa FullCopy + enqueue_copy_to!(state, our_space, arg_w, value(f), idx, our_scope, task, write_num) + else + @assert remainder isa NoAliasing + @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Skipped copy-to (up-to-date): $our_space" + end + end + spec.fargs[idx].value = arg_remote + end + write_num += 1 + + # Validate that we're not accidentally performing a copy + for (idx, _arg) in enumerate(spec.fargs) + arg = value(_arg) + _, deps = unwrap_inout(value(task_args[idx])) + # N.B. We only do this check when the argument supports in-place + # moves, because for the moment, we are not guaranteeing updates or + # write-back of results + if is_writedep(arg, deps, task) && supports_inplace_move(state, arg) + arg_space = memory_space(arg) + @assert arg_space == our_space "($(repr(value(f))))[$(idx-1)] Tried to pass $(typeof(arg)) from $arg_space to $our_space" end end - return - end - @dagdebug tid :spawn_datadeps "($(repr(value(f)))) Task has $(length(syncdeps)) syncdeps" - # Launch user's task - new_fargs = map_or_ntuple(task_arg_ws) do idx - if is_typed(spec) - return TypedArgument(task_arg_ws[idx].pos, remote_args[idx]) - else - return Argument(task_arg_ws[idx].pos, remote_args[idx]) + # Calculate this task's syncdeps + if spec.options.syncdeps === nothing + spec.options.syncdeps = Set{Any}() end - end - new_spec = DTaskSpec(new_fargs, spec.options) - new_spec.options.scope = our_scope - new_spec.options.exec_scope = our_scope - new_spec.options.occupancy = Dict(Any=>0) - ctx = Sch.eager_context() - @maybelog ctx timespan_start(ctx, :datadeps_execute, (;thunk_id=task.uid), (;)) - 
enqueue!(queue.upper_queue, DTaskPair(new_spec, task)) - @maybelog ctx timespan_finish(ctx, :datadeps_execute, (;thunk_id=task.uid), (;space=our_space, deps=task_arg_ws, args=remote_args)) - - # Update read/write tracking for arguments - map_or_ntuple(task_arg_ws) do idx - arg_ws = task_arg_ws[idx] - arg = arg_ws.arg - arg_ws.may_alias || return - arg_ws.inplace_move || return - for dep in arg_ws.deps - arg_w = dep.arg_w - ainfo = aliasing!(state, our_space, arg_w) - dep_mod = arg_w.dep_mod - if dep.writedep - @dagdebug tid :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Task set as writer" - add_writer!(state, arg_w, our_space, ainfo, task, write_num) - else - add_reader!(state, arg_w, our_space, ainfo, task, write_num) + syncdeps = spec.options.syncdeps + for (idx, arg_ws) in enumerate(task_arg_ws) + arg = first(arg_ws)[1].arg + type_may_alias(typeof(arg)) || continue + supports_inplace_move(state, arg) || continue + for (arg_w, _, writedep) in arg_ws + ainfo = aliasing!(state, our_space, arg_w) + dep_mod = arg_w.dep_mod + if writedep + @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Syncing as writer" + get_write_deps!(state, our_space, ainfo, write_num, syncdeps) + else + @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Syncing as reader" + get_read_deps!(state, our_space, ainfo, write_num, syncdeps) + end + end + end + @dagdebug nothing :spawn_datadeps "($(repr(value(f)))) Task has $(length(syncdeps)) syncdeps" + + # Launch user's task + spec.options.scope = our_scope + spec.options.exec_scope = our_scope + spec.options.occupancy = Dict(Any=>0) + enqueue!(upper_queue, spec=>task) + + # Update read/write tracking for arguments + for (idx, arg_ws) in enumerate(task_arg_ws) + arg = first(arg_ws)[1].arg + type_may_alias(typeof(arg)) || continue + for (arg_w, _, writedep) in arg_ws + ainfo = aliasing!(state, our_space, arg_w) + dep_mod = arg_w.dep_mod + if writedep + @dagdebug nothing :spawn_datadeps 
"($(repr(value(f))))[$(idx-1)][$dep_mod] Task set as writer" + add_writer!(state, arg_w, our_space, ainfo, task, write_num) + else + add_reader!(state, arg_w, our_space, ainfo, task, write_num) + end end end - return - end - write_num += 1 + write_num += 1 + proc_idx = mod1(proc_idx + 1, length(all_procs)) + end - return write_num + # Copy args from remote to local + for arg_w in keys(state.arg_owner) + arg = arg_w.arg + origin_space = state.arg_origin[arg] + remainder = compute_remainder_for_arg!(state, origin_space, arg_w, write_num) + if remainder isa MultiRemainderAliasing + origin_scope = UnionScope(map(ExactScope, collect(processors(origin_space)))...) + enqueue_remainder_copy_from!(state, origin_space, arg_w, remainder, origin_scope, write_num) + elseif remainder isa FullCopy + origin_scope = UnionScope(map(ExactScope, collect(processors(origin_space)))...) + enqueue_copy_from!(state, origin_space, arg_w, origin_scope, write_num) + else + @assert remainder isa NoAliasing + @dagdebug nothing :spawn_datadeps "Skipped copy-from (up-to-date): $origin_space" + end + end end diff --git a/src/datadeps/remainders.jl b/src/datadeps/remainders.jl index 2c2c49920..0ac90aa78 100644 --- a/src/datadeps/remainders.jl +++ b/src/datadeps/remainders.jl @@ -9,11 +9,10 @@ This is used to perform partial data copies that only update the "remainder" reg struct RemainderAliasing{S<:MemorySpace} <: AbstractAliasing space::S spans::Vector{Tuple{LocalMemorySpan,LocalMemorySpan}} - ainfos::Vector{AliasingWrapper} syncdeps::Set{ThunkSyncdep} end -RemainderAliasing(space::S, spans::Vector{Tuple{LocalMemorySpan,LocalMemorySpan}}, ainfos::Vector{AliasingWrapper}, syncdeps::Set{ThunkSyncdep}) where S = - RemainderAliasing{S}(space, spans, ainfos, syncdeps) +RemainderAliasing(space::S, spans::Vector{Tuple{LocalMemorySpan,LocalMemorySpan}}, syncdeps::Set{ThunkSyncdep}) where S = + RemainderAliasing{S}(space, spans, syncdeps) memory_spans(ra::RemainderAliasing) = ra.spans @@ -43,6 +42,42 @@ 
memory_spans(mra::MultiRemainderAliasing) = vcat(memory_spans.(mra.remainders).. Base.hash(mra::MultiRemainderAliasing, h::UInt) = hash(mra.remainders, hash(MultiRemainderAliasing, h)) Base.:(==)(mra1::MultiRemainderAliasing, mra2::MultiRemainderAliasing) = mra1.remainders == mra2.remainders +#= FIXME: Integrate with main documentation +Problem statement: + +Remainder copy calculation needs to ensure that, for a given argument and +dependency modifier, and for a given target memory space, any data not yet +updated (whether through this arg or through another that aliases) is added to +the remainder, while any data that has been updated is not in the remainder. +Remainder copies may be multi-part, as data may be spread across multiple other +memory spaces. + +Ainfo is not alone sufficient to identify the combination of argument and +dependency modifier, as ainfo is specific to an allocation in a given memory +space. Thus, this combination needs to be tracked together, and separately from +memory space. However, information may span multiple memory spaces (and thus +multiple ainfos), so we should try to make queries of cross-memory space +information fast, as they will need to be performed for every task, for every +combination. 
+ +Game Plan: + +- Use ArgumentWrapper to track this combination throughout the codebase, ideally generated just once +- Maintain the keying of remote_args only on argument, as the dependency modifier doesn’t affect the argument being passed into the task, so it should not factor into generating and tracking remote argument copies +- Add a structure to track the mapping from ArgumentWrapper to memory space to ainfo, as a quick way to lookup all ainfos needing to be considered +- When considering a remainder copy, only look at a single memory space’s ainfos at a time, as the ainfos should overlap exactly the same way on any memory space, and this allows us to use ainfo_overlaps to track overlaps +- Remainder copies will need to separately consider the source memory space, and the destination memory space when acquiring spans to copy to/from +- Memory spans for ainfos generated from the same ArgumentWrapper should be assumed to be paired in the same order, regardless of memory space, to ensure we can perform the translation from source to destination span address + - Alternatively, we might provide an API to take source and destination ainfos, and desired remainder memory spans, which then performs the copy for us +- When a task or copy writes to arguments, we should record this happening for all overlapping ainfos, in a manner that will be efficient to query from another memory space. We can probably walk backwards and attach this to a structure keyed on ArgumentWrapper, as that will be very efficient for later queries (because the history will now be linearized in one vector). +- Remainder copies will need to know, for all overlapping ainfos of the ArgumentWrapper ainfo at the target memory space, how recently that ainfo was updated relative to other ainfos, and relative to how recently the target ainfo was written. 
+ - The last time the target ainfo was written is the furthest back we need to consider, as the target data must have been fully up-to-date when that write completed. + - Consideration of updates should start at most recent first, walking backwards in time, as the most recent updates contain the up-to-date data. + - For each span under consideration, we should subtract from it the current remainder set, to ensure we only copy up-to-date data. + - We must add that span portion to the remainder set no matter what, but if it was updated on the target memory space, we don’t need to schedule a copy for it, since it’s already where it needs to be. + - Even before the last target write is seen, we are allowed to stop searching if we find that our target ainfo is fully covered (because this implies that the target ainfo is fully out-of-date). +=# + struct FullCopy end """ @@ -86,17 +121,16 @@ and returned. function compute_remainder_for_arg!(state::DataDepsState, target_space::MemorySpace, arg_w::ArgumentWrapper, - write_num::Int; compute_syncdeps::Bool=true) + write_num::Int) + @label restart + + # Determine all memory spaces of the history spaces_set = Set{MemorySpace}() push!(spaces_set, target_space) owner_space = state.arg_owner[arg_w] push!(spaces_set, owner_space) - - @label restart - - # Determine all memory spaces of the history - for entry in state.arg_history[arg_w] - push!(spaces_set, entry.space) + for (_, space, _) in state.arg_history[arg_w] + push!(spaces_set, space) end spaces = collect(spaces_set) N = length(spaces) @@ -109,12 +143,10 @@ function compute_remainder_for_arg!(state::DataDepsState, push!(target_ainfos, LocalMemorySpan.(spans)) end nspans = length(first(target_ainfos)) - @assert all(==(nspans), length.(target_ainfos)) "Aliasing info for $(typeof(arg_w.arg))[$(arg_w.dep_mod)] has different number of spans in different memory spaces" # FIXME: This is a hack to ensure that we don't miss any history generated by aliasing(...) 
- for entry in state.arg_history[arg_w] - if !in(entry.space, spaces) - @opcounter :compute_remainder_for_arg_restart + for (_, space, _) in state.arg_history[arg_w] + if !in(space, spaces) @goto restart end end @@ -123,37 +155,29 @@ function compute_remainder_for_arg!(state::DataDepsState, # target space if this is the first time we've written to `arg_w` if isempty(state.arg_history[arg_w]) if owner_space != target_space - return FullCopy(), 0 + return FullCopy() else - return NoAliasing(), 0 + return NoAliasing() end end # Create our remainder as an interval tree over all target ainfos - VERIFY_SPAN_CURRENT_OBJECT[] = arg_w.arg remainder = IntervalTree{ManyMemorySpan{N}}(ManyMemorySpan{N}(ntuple(i -> target_ainfos[i][j], N)) for j in 1:nspans) - for span in remainder - verify_span(span) - end # Create our tracker - tracker = Dict{MemorySpace,Tuple{Vector{Tuple{LocalMemorySpan,LocalMemorySpan}},Vector{AliasingWrapper},Set{ThunkSyncdep}}}() + tracker = Dict{MemorySpace,Tuple{Vector{Tuple{LocalMemorySpan,LocalMemorySpan}},Set{ThunkSyncdep}}}() # Walk backwards through the history of writes to this target # other_ainfo is the overlapping ainfo that was written to # other_space is the memory space of the overlapping ainfo - last_idx = length(state.arg_history[arg_w]) for idx in length(state.arg_history[arg_w]):-1:0 if isempty(remainder) # All done! 
- last_idx = idx break end if idx > 0 - other_entry = state.arg_history[arg_w][idx] - other_ainfo = other_entry.ainfo - other_space = other_entry.space + (other_ainfo, other_space, _) = state.arg_history[arg_w][idx] else # If we've reached the end of the history, evaluate ourselves other_ainfo = aliasing!(state, owner_space, arg_w) @@ -161,7 +185,7 @@ function compute_remainder_for_arg!(state::DataDepsState, end # Lookup all memory spans for arg_w in these spaces - other_remote_arg_w = first(collect(state.ainfo_arg[other_ainfo])) + other_remote_arg_w = state.ainfo_arg[other_ainfo] other_arg_w = ArgumentWrapper(state.remote_arg_to_original[other_remote_arg_w.arg], other_remote_arg_w.dep_mod) other_ainfos = Vector{Vector{LocalMemorySpan}}() for space in spaces @@ -171,15 +195,11 @@ function compute_remainder_for_arg!(state::DataDepsState, end nspans = length(first(other_ainfos)) other_many_spans = [ManyMemorySpan{N}(ntuple(i -> other_ainfos[i][j], N)) for j in 1:nspans] - foreach(other_many_spans) do span - verify_span(span) - end if other_space == target_space # Only subtract, this data is already up-to-date in target_space # N.B. We don't add to syncdeps here, because we'll see this ainfo # in get_write_deps! 
- @opcounter :compute_remainder_for_arg_subtract subtract_spans!(remainder, other_many_spans) continue end @@ -188,34 +208,22 @@ function compute_remainder_for_arg!(state::DataDepsState, other_space_idx = something(findfirst(==(other_space), spaces)) target_space_idx = something(findfirst(==(target_space), spaces)) tracker_other_space = get!(tracker, other_space) do - (Vector{Tuple{LocalMemorySpan,LocalMemorySpan}}(), Vector{AliasingWrapper}(), Set{ThunkSyncdep}()) - end - @opcounter :compute_remainder_for_arg_schedule - has_overlap = schedule_remainder!(tracker_other_space[1], other_space_idx, target_space_idx, remainder, other_many_spans) - if compute_syncdeps && has_overlap - @assert haskey(state.ainfos_owner, other_ainfo) "[idx $idx] ainfo $(typeof(other_ainfo)) has no owner" - get_read_deps!(state, other_space, other_ainfo, write_num, tracker_other_space[3]) - push!(tracker_other_space[2], other_ainfo) + (Vector{Tuple{LocalMemorySpan,LocalMemorySpan}}(), Set{ThunkSyncdep}()) end + schedule_remainder!(tracker_other_space[1], other_space_idx, target_space_idx, remainder, other_many_spans) + get_read_deps!(state, other_space, other_ainfo, write_num, tracker_other_space[2]) end - VERIFY_SPAN_CURRENT_OBJECT[] = nothing - if isempty(tracker) || all(tracked->isempty(tracked[1]), values(tracker)) - return NoAliasing(), 0 + if isempty(tracker) + return NoAliasing() end - # Return scheduled copies and the index of the last ainfo we considered + # Return scheduled copies mra = MultiRemainderAliasing() - for space in spaces - if haskey(tracker, space) - spans, ainfos, syncdeps = tracker[space] - if !isempty(spans) - push!(mra.remainders, RemainderAliasing(space, spans, ainfos, syncdeps)) - end - end + for (space, (spans, syncdeps)) in tracker + push!(mra.remainders, RemainderAliasing(space, spans, syncdeps)) end - @assert !isempty(mra.remainders) "Expected at least one remainder (spaces: $spaces, tracker spaces: $(collect(keys(tracker))))" - return mra, last_idx + return 
mra end ### Memory Span Set Operations for Remainder Computation @@ -230,13 +238,12 @@ copy from `other_many_spans` to the subtraced portion of `remainder`. function schedule_remainder!(tracker::Vector, source_space_idx::Int, dest_space_idx::Int, remainder::IntervalTree, other_many_spans::Vector{ManyMemorySpan{N}}) where N diff = Vector{ManyMemorySpan{N}}() subtract_spans!(remainder, other_many_spans, diff) + for span in diff source_span = span.spans[source_space_idx] dest_span = span.spans[dest_space_idx] - @assert span_len(source_span) == span_len(dest_span) "Source and dest spans are not the same size: $(span_len(source_span)) != $(span_len(dest_span))" push!(tracker, (source_span, dest_span)) end - return !isempty(diff) end ### Remainder copy functions @@ -250,7 +257,6 @@ Enqueues a copy operation to update the remainder regions of an object before a function enqueue_remainder_copy_to!(state::DataDepsState, dest_space::MemorySpace, arg_w::ArgumentWrapper, remainder_aliasing::MultiRemainderAliasing, f, idx, dest_scope, task, write_num::Int) for remainder in remainder_aliasing.remainders - @assert !isempty(remainder.spans) enqueue_remainder_copy_to!(state, dest_space, arg_w, remainder, f, idx, dest_scope, task, write_num) end end @@ -263,7 +269,7 @@ function enqueue_remainder_copy_to!(state::DataDepsState, dest_space::MemorySpac # overwritten by more recent partial updates source_space = remainder_aliasing.space - @dagdebug task.uid :spawn_datadeps "($(repr(f)))[$(idx-1)][$dep_mod] Enqueueing remainder copy-to for $(typeof(arg_w.arg))[$(arg_w.dep_mod)]: $source_space => $dest_space" + @dagdebug nothing :spawn_datadeps "($(repr(f)))[$(idx-1)][$dep_mod] Enqueueing remainder copy-to for $(typeof(arg_w.arg))[$(arg_w.dep_mod)]: $source_space => $dest_space" # Get the source and destination arguments arg_dest = state.remote_args[dest_space][arg_w.arg] @@ -276,23 +282,14 @@ function enqueue_remainder_copy_to!(state::DataDepsState, dest_space::MemorySpac 
push!(remainder_syncdeps, syncdep) end empty!(remainder_aliasing.syncdeps) # We can't bring these to move! - source_ainfos = copy(remainder_aliasing.ainfos) - empty!(remainder_aliasing.ainfos) get_write_deps!(state, dest_space, target_ainfo, write_num, remainder_syncdeps) - @dagdebug task.uid :spawn_datadeps "($(repr(f)))[$(idx-1)][$dep_mod] Remainder copy-to has $(length(remainder_syncdeps)) syncdeps" + @dagdebug nothing :spawn_datadeps "($(repr(f)))[$(idx-1)][$dep_mod] Remainder copy-to has $(length(remainder_syncdeps)) syncdeps" # Launch the remainder copy task - ctx = Sch.eager_context() - id = rand(UInt) - @maybelog ctx timespan_start(ctx, :datadeps_copy, (;id), (;)) - copy_task = Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=remainder_syncdeps meta=true Dagger.move!(remainder_aliasing, dest_space, source_space, arg_dest, arg_source) - @maybelog ctx timespan_finish(ctx, :datadeps_copy, (;id), (;thunk_id=copy_task.uid, from_space=source_space, to_space=dest_space, arg_w, from_arg=arg_source, to_arg=arg_dest)) - - # This copy task reads the sources and writes to the target - for ainfo in source_ainfos - add_reader!(state, arg_w, source_space, ainfo, copy_task, write_num) - end + copy_task = Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=remainder_syncdeps occupancy=Dict(Any=>0) meta=true Dagger.move!(remainder_aliasing, dest_space, source_space, arg_dest, arg_source) + + # This copy task becomes a new writer for the target region add_writer!(state, arg_w, dest_space, target_ainfo, copy_task, write_num) end """ @@ -304,7 +301,6 @@ Enqueues a copy operation to update the remainder regions of an object back to t function enqueue_remainder_copy_from!(state::DataDepsState, dest_space::MemorySpace, arg_w::ArgumentWrapper, remainder_aliasing::MultiRemainderAliasing, dest_scope, write_num::Int) for remainder in remainder_aliasing.remainders - @assert !isempty(remainder.spans) enqueue_remainder_copy_from!(state, dest_space, arg_w, 
remainder, dest_scope, write_num) end end @@ -330,23 +326,14 @@ function enqueue_remainder_copy_from!(state::DataDepsState, dest_space::MemorySp push!(remainder_syncdeps, syncdep) end empty!(remainder_aliasing.syncdeps) # We can't bring these to move! - source_ainfos = copy(remainder_aliasing.ainfos) - empty!(remainder_aliasing.ainfos) get_write_deps!(state, dest_space, target_ainfo, write_num, remainder_syncdeps) @dagdebug nothing :spawn_datadeps "($(typeof(arg_w.arg)))[$dep_mod] Remainder copy-from has $(length(remainder_syncdeps)) syncdeps" # Launch the remainder copy task - ctx = Sch.eager_context() - id = rand(UInt) - @maybelog ctx timespan_start(ctx, :datadeps_copy, (;id), (;)) - copy_task = Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=remainder_syncdeps meta=true Dagger.move!(remainder_aliasing, dest_space, source_space, arg_dest, arg_source) - @maybelog ctx timespan_finish(ctx, :datadeps_copy, (;id), (;thunk_id=copy_task.uid, from_space=source_space, to_space=dest_space, arg_w, from_arg=arg_source, to_arg=arg_dest)) - - # This copy task reads the sources and writes to the target - for ainfo in source_ainfos - add_reader!(state, arg_w, source_space, ainfo, copy_task, write_num) - end + copy_task = Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=remainder_syncdeps occupancy=Dict(Any=>0) meta=true Dagger.move!(remainder_aliasing, dest_space, source_space, arg_dest, arg_source) + + # This copy task becomes a new writer for the target region add_writer!(state, arg_w, dest_space, target_ainfo, copy_task, write_num) end @@ -357,7 +344,7 @@ function enqueue_copy_to!(state::DataDepsState, dest_space::MemorySpace, arg_w:: source_space = state.arg_owner[arg_w] target_ainfo = aliasing!(state, dest_space, arg_w) - @dagdebug task.uid :spawn_datadeps "($(repr(f)))[$(idx-1)][$dep_mod] Enqueueing full copy-to for $(typeof(arg_w.arg))[$(arg_w.dep_mod)]: $source_space => $dest_space" + @dagdebug nothing :spawn_datadeps 
"($(repr(f)))[$(idx-1)][$dep_mod] Enqueueing full copy-to for $(typeof(arg_w.arg))[$(arg_w.dep_mod)]: $source_space => $dest_space" # Get the source and destination arguments arg_dest = state.remote_args[dest_space][arg_w.arg] @@ -370,17 +357,12 @@ function enqueue_copy_to!(state::DataDepsState, dest_space::MemorySpace, arg_w:: get_read_deps!(state, source_space, source_ainfo, write_num, copy_syncdeps) get_write_deps!(state, dest_space, target_ainfo, write_num, copy_syncdeps) - @dagdebug task.uid :spawn_datadeps "($(repr(f)))[$(idx-1)][$dep_mod] Full copy-to has $(length(copy_syncdeps)) syncdeps" + @dagdebug nothing :spawn_datadeps "($(repr(f)))[$(idx-1)][$dep_mod] Full copy-to has $(length(copy_syncdeps)) syncdeps" # Launch the remainder copy task - ctx = Sch.eager_context() - id = rand(UInt) - @maybelog ctx timespan_start(ctx, :datadeps_copy, (;id), (;)) - copy_task = Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=copy_syncdeps meta=true Dagger.move!(dep_mod, dest_space, source_space, arg_dest, arg_source) - @maybelog ctx timespan_finish(ctx, :datadeps_copy, (;id), (;thunk_id=copy_task.uid, from_space=source_space, to_space=dest_space, arg_w, from_arg=arg_source, to_arg=arg_dest)) - - # This copy task reads the source and writes to the target - add_reader!(state, arg_w, source_space, source_ainfo, copy_task, write_num) + copy_task = Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=copy_syncdeps occupancy=Dict(Any=>0) meta=true Dagger.move!(dep_mod, dest_space, source_space, arg_dest, arg_source) + + # This copy task becomes a new writer for the target region add_writer!(state, arg_w, dest_space, target_ainfo, copy_task, write_num) end function enqueue_copy_from!(state::DataDepsState, dest_space::MemorySpace, arg_w::ArgumentWrapper, @@ -405,47 +387,36 @@ function enqueue_copy_from!(state::DataDepsState, dest_space::MemorySpace, arg_w @dagdebug nothing :spawn_datadeps "($(typeof(arg_w.arg)))[$dep_mod] Full copy-from has 
$(length(copy_syncdeps)) syncdeps" # Launch the remainder copy task - ctx = Sch.eager_context() - id = rand(UInt) - @maybelog ctx timespan_start(ctx, :datadeps_copy, (;id), (;)) - copy_task = Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=copy_syncdeps meta=true Dagger.move!(dep_mod, dest_space, source_space, arg_dest, arg_source) - @maybelog ctx timespan_finish(ctx, :datadeps_copy, (;id), (;thunk_id=copy_task.uid, from_space=source_space, to_space=dest_space, arg_w, from_arg=arg_source, to_arg=arg_dest)) - - # This copy task reads the source and writes to the target - add_reader!(state, arg_w, source_space, source_ainfo, copy_task, write_num) + copy_task = Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=copy_syncdeps occupancy=Dict(Any=>0) meta=true Dagger.move!(dep_mod, dest_space, source_space, arg_dest, arg_source) + + # This copy task becomes a new writer for the target region add_writer!(state, arg_w, dest_space, target_ainfo, copy_task, write_num) end # Main copy function for RemainderAliasing -function move!(dep_mod::RemainderAliasing{S}, to_space::MemorySpace, from_space::MemorySpace, to::Chunk, from::Chunk) where S - # TODO: Support direct copy between GPU memory spaces - - # Copy the data from the source object - copies = remotecall_fetch(root_worker_id(from_space), from_space, dep_mod, from) do from_space, dep_mod, from - len = sum(span_tuple->span_len(span_tuple[1]), dep_mod.spans) - copies = Vector{UInt8}(undef, len) - from_raw = unwrap(from) - offset = UInt64(1) - with_context!(from_space) - GC.@preserve copies begin - for (from_span, _) in dep_mod.spans - read_remainder!(copies, offset, from_raw, from_span.ptr, from_span.len) - offset += from_span.len +function move!(dep_mod::RemainderAliasing, to_space::MemorySpace, from_space::MemorySpace, to::Chunk, from::Chunk) + # Get the source data for each span + copies = remotecall_fetch(root_worker_id(from_space), dep_mod) do dep_mod + copies = Vector{UInt8}[] + for (from_span, 
_) in dep_mod.spans + copy = Vector{UInt8}(undef, from_span.len) + GC.@preserve copy begin + from_ptr = Ptr{UInt8}(from_span.ptr) + to_ptr = Ptr{UInt8}(pointer(copy)) + unsafe_copyto!(to_ptr, from_ptr, from_span.len) end + push!(copies, copy) end - @assert offset == len+UInt64(1) return copies end # Copy the data into the destination object - offset = UInt64(1) - to_raw = unwrap(to) - GC.@preserve copies begin - for (_, to_span) in dep_mod.spans - write_remainder!(copies, offset, to_raw, to_span.ptr, to_span.len) - offset += to_span.len + for (copy, (_, to_span)) in zip(copies, dep_mod.spans) + GC.@preserve copy begin + from_ptr = Ptr{UInt8}(pointer(copy)) + to_ptr = Ptr{UInt8}(to_span.ptr) + unsafe_copyto!(to_ptr, from_ptr, to_span.len) end - @assert offset == length(copies)+UInt64(1) end # Ensure that the data is visible @@ -453,88 +424,3 @@ function move!(dep_mod::RemainderAliasing{S}, to_space::MemorySpace, from_space: return end - -function read_remainder!(copies::Vector{UInt8}, copies_offset::UInt64, from::Array, from_ptr::UInt64, len::UInt64) - elsize = sizeof(eltype(from)) - @assert len / elsize == round(UInt64, len / elsize) "Span length is not an integer multiple of the element size: $(len) / $(elsize) = $(len / elsize) (elsize: $elsize)" - n = UInt64(len / elsize) - from_offset_n = UInt64((from_ptr - UInt64(pointer(from))) / elsize) + UInt64(1) - from_vec = reshape(from, prod(size(from)))::DenseVector{eltype(from)} - # unsafe_wrap(Array, ...) 
doesn't like unaligned memory - unsafe_copyto!(Ptr{eltype(from)}(pointer(copies, copies_offset)), pointer(from_vec, from_offset_n), n) -end -function read_remainder!(copies::Vector{UInt8}, copies_offset::UInt64, from::DenseArray, from_ptr::UInt64, len::UInt64) - elsize = sizeof(eltype(from)) - @assert len / elsize == round(UInt64, len / elsize) "Span length is not an integer multiple of the element size: $(len) / $(elsize) = $(len / elsize) (elsize: $elsize)" - n = UInt64(len / elsize) - from_offset_n = UInt64((from_ptr - UInt64(pointer(from))) / elsize) + UInt64(1) - from_vec = reshape(from, prod(size(from)))::DenseVector{eltype(from)} - copies_typed = unsafe_wrap(Vector{eltype(from)}, Ptr{eltype(from)}(pointer(copies, copies_offset)), n) - copyto!(copies_typed, 1, from_vec, Int(from_offset_n), Int(n)) -end -function read_remainder!(copies::Vector{UInt8}, copies_offset::UInt64, from, from_ptr::UInt64, n::UInt64) - real_from = find_object_holding_ptr(from, from_ptr) - return read_remainder!(copies, copies_offset, real_from, from_ptr, n) -end - -function write_remainder!(copies::Vector{UInt8}, copies_offset::UInt64, to::Array, to_ptr::UInt64, len::UInt64) - elsize = sizeof(eltype(to)) - @assert len / elsize == round(UInt64, len / elsize) "Span length is not an integer multiple of the element size: $(len) / $(elsize) = $(len / elsize) (elsize: $elsize)" - n = UInt64(len / elsize) - to_offset_n = UInt64((to_ptr - UInt64(pointer(to))) / elsize) + UInt64(1) - to_vec = reshape(to, prod(size(to)))::DenseVector{eltype(to)} - # unsafe_wrap(Array, ...) 
doesn't like unaligned memory - unsafe_copyto!(pointer(to_vec, to_offset_n), Ptr{eltype(to)}(pointer(copies, copies_offset)), n) -end -function write_remainder!(copies::Vector{UInt8}, copies_offset::UInt64, to::DenseArray, to_ptr::UInt64, len::UInt64) - elsize = sizeof(eltype(to)) - @assert len / elsize == round(UInt64, len / elsize) "Span length is not an integer multiple of the element size: $(len) / $(elsize) = $(len / elsize) (elsize: $elsize)" - n = UInt64(len / elsize) - to_offset_n = UInt64((to_ptr - UInt64(pointer(to))) / elsize) + UInt64(1) - to_vec = reshape(to, prod(size(to)))::DenseVector{eltype(to)} - copies_typed = unsafe_wrap(Vector{eltype(to)}, Ptr{eltype(to)}(pointer(copies, copies_offset)), n) - copyto!(to_vec, Int(to_offset_n), copies_typed, 1, Int(n)) -end -function write_remainder!(copies::Vector{UInt8}, copies_offset::UInt64, to, to_ptr::UInt64, n::UInt64) - real_to = find_object_holding_ptr(to, to_ptr) - return write_remainder!(copies, copies_offset, real_to, to_ptr, n) -end - -# Remainder copies for common objects -for wrapper in (UpperTriangular, LowerTriangular, UnitUpperTriangular, UnitLowerTriangular, SubArray) - @eval function read_remainder!(copies::Vector{UInt8}, copies_offset::UInt64, from::$wrapper, from_ptr::UInt64, n::UInt64) - read_remainder!(copies, copies_offset, parent(from), from_ptr, n) - end - @eval function write_remainder!(copies::Vector{UInt8}, copies_offset::UInt64, to::$wrapper, to_ptr::UInt64, n::UInt64) - write_remainder!(copies, copies_offset, parent(to), to_ptr, n) - end -end - -function read_remainder!(copies::Vector{UInt8}, copies_offset::UInt64, from::Base.RefValue, from_ptr::UInt64, n::UInt64) - if from_ptr == UInt64(Base.pointer_from_objref(from) + fieldoffset(typeof(from), 1)) - unsafe_copyto!(pointer(copies, copies_offset), Ptr{UInt8}(from_ptr), n) - else - read_remainder!(copies, copies_offset, from[], from_ptr, n) - end -end -function write_remainder!(copies::Vector{UInt8}, copies_offset::UInt64, 
to::Base.RefValue, to_ptr::UInt64, n::UInt64) - if to_ptr == UInt64(Base.pointer_from_objref(to) + fieldoffset(typeof(to), 1)) - unsafe_copyto!(Ptr{UInt8}(to_ptr), pointer(copies, copies_offset), n) - else - write_remainder!(copies, copies_offset, to[], to_ptr, n) - end -end - -function find_object_holding_ptr(A::SparseMatrixCSC, ptr::UInt64) - span = LocalMemorySpan(pointer(A.nzval), length(A.nzval)*sizeof(eltype(A.nzval))) - if span_start(span) <= ptr <= span_end(span) - return A.nzval - end - span = LocalMemorySpan(pointer(A.colptr), length(A.colptr)*sizeof(eltype(A.colptr))) - if span_start(span) <= ptr <= span_end(span) - return A.colptr - end - span = LocalMemorySpan(pointer(A.rowval), length(A.rowval)*sizeof(eltype(A.rowval))) - @assert span_start(span) <= ptr <= span_end(span) "Pointer $ptr not found in SparseMatrixCSC" - return A.rowval -end \ No newline at end of file diff --git a/src/dtask.jl b/src/dtask.jl index e94803502..f24cd1027 100644 --- a/src/dtask.jl +++ b/src/dtask.jl @@ -11,18 +11,31 @@ Base.wait(t::ThunkFuture) = Dagger.Sch.thunk_yield() do wait(t.future) return end -function Base.fetch(t::ThunkFuture; proc=OSProc(), raw=false) +const FETCH_UNIFORM = ScopedValue{Bool}(false) +@warn "Docstrings" maxlog=1 +# uniform: Asserts that this is a uniform call +# move_value: Moves the value to the specified processor +# unwrap: Unwraps the value if it is unwrappable +function Base.fetch(t::ThunkFuture; proc::Processor=OSProc(), + throw_on_error::Bool=true, + uniform::Bool=false, + move_value::Bool=true, + unwrap::Bool=false) error, value = Dagger.Sch.thunk_yield() do fetch(t.future) end - if error + if throw_on_error && error throw(value) end - if raw - return value - else - return move(proc, value) + if move_value + value = @with FETCH_UNIFORM => uniform begin + move(proc, value) + end + end + if unwrap && unwrappable(value) + return fetch(value; proc, throw_on_error, uniform, move_value, unwrap) end + return value end Base.put!(t::ThunkFuture, x; 
error=false) = put!(t.future, (error, x)) @@ -65,12 +78,13 @@ function Base.wait(t::DTask) wait(t.future) return end -function Base.fetch(t::DTask; raw=false) +function Base.fetch(t::DTask; kwargs...) if !istaskstarted(t) throw(ConcurrencyViolationError("Cannot `fetch` an unlaunched `DTask`")) end - return fetch(t.future; raw) + return fetch(t.future; kwargs...) end +unwrappable(x::DTask) = true function waitany(tasks::Vector{DTask}) if isempty(tasks) return diff --git a/src/memory-spaces.jl b/src/memory-spaces.jl index 1184f34dd..dd9b8dc3f 100644 --- a/src/memory-spaces.jl +++ b/src/memory-spaces.jl @@ -1,24 +1,56 @@ +struct DistributedAcceleration <: Acceleration end + +const ACCELERATION = TaskLocalValue{Acceleration}(() -> DistributedAcceleration()) + +current_acceleration() = ACCELERATION[] + +default_processor(::DistributedAcceleration) = OSProc(myid()) +default_processor(accel::DistributedAcceleration, x) = default_processor(accel) +default_processor() = default_processor(current_acceleration()) + +accelerate!(accel::Symbol) = accelerate!(Val{accel}()) +accelerate!(::Val{:distributed}) = accelerate!(DistributedAcceleration()) + +initialize_acceleration!(a::DistributedAcceleration) = nothing +function accelerate!(accel::Acceleration) + initialize_acceleration!(accel) + ACCELERATION[] = accel +end + +accel_matches_proc(accel::DistributedAcceleration, proc::OSProc) = true +accel_matches_proc(accel::DistributedAcceleration, proc) = true + struct CPURAMMemorySpace <: MemorySpace owner::Int end -CPURAMMemorySpace() = CPURAMMemorySpace(myid()) root_worker_id(space::CPURAMMemorySpace) = space.owner -memory_space(x) = CPURAMMemorySpace(myid()) -function memory_space(x::Chunk) - proc = processor(x) - if proc isa OSProc - # TODO: This should probably be programmable - return CPURAMMemorySpace(proc.pid) - else - return only(memory_spaces(proc)) - end -end -memory_space(x::DTask) = - memory_space(fetch(x; raw=true)) +CPURAMMemorySpace() = CPURAMMemorySpace(myid()) + 
+default_processor(space::CPURAMMemorySpace) = OSProc(space.owner) +default_memory_space(accel::DistributedAcceleration) = CPURAMMemorySpace(myid()) +default_memory_space(accel::DistributedAcceleration, x) = default_memory_space(accel) +default_memory_space(x) = default_memory_space(current_acceleration(), x) +default_memory_space() = default_memory_space(current_acceleration()) + +memory_space(x, proc::Processor=default_processor()) = first(memory_spaces(proc)) +memory_space(x::Processor) = first(memory_spaces(x)) +memory_space(x::Chunk) = x.space +memory_space(x::DTask) = memory_space(fetch(x; move_value=false, unwrap=false)) memory_spaces(::P) where {P<:Processor} = throw(ArgumentError("Must define `memory_spaces` for `$P`")) + +function memory_spaces(proc::OSProc) + children = get_processors(proc) + spaces = Set{MemorySpace}() + for proc in children + for space in memory_spaces(proc) + push!(spaces, space) + end + end + return spaces +end memory_spaces(proc::ThreadProc) = Set([CPURAMMemorySpace(proc.owner)]) processors(::S) where {S<:MemorySpace} = @@ -28,9 +60,12 @@ processors(space::CPURAMMemorySpace) = ### In-place Data Movement -function unwrap(x::Chunk) - @assert x.handle.owner == myid() - MemPool.poolget(x.handle) +function unwrap(x::Chunk; uniform::Bool=false) + @assert root_worker_id(x.handle) == myid() "Chunk $x is not owned by this process: $(root_worker_id(x.handle)) != $(myid())" + if x.handle isa DRef + return MemPool.poolget(x.handle) + end + return MemPool.poolget(x.handle; uniform) end move!(dep_mod, to_space::MemorySpace, from_space::MemorySpace, to::T, from::F) where {T,F} = throw(ArgumentError("No `move!` implementation defined for $F -> $T")) @@ -69,6 +104,16 @@ function move!(::Type{<:Tridiagonal}, to_space::MemorySpace, from_space::MemoryS return end +# FIXME: Take MemorySpace instead +function move_type(from_proc::Processor, to_proc::Processor, ::Type{T}) where T + if from_proc == to_proc + return T + end + return Base._return_type(move, 
Tuple{typeof(from_proc), typeof(to_proc), T}) +end +move_type(from_proc::Processor, to_proc::Processor, ::Type{<:Chunk{T}}) where T = + move_type(from_proc, to_proc, T) + ### Aliasing and Memory Spans type_may_alias(::Type{String}) = false @@ -88,20 +133,49 @@ function type_may_alias(::Type{T}) where T return false end -may_alias(::MemorySpace, ::MemorySpace) = false -may_alias(space1::M, space2::M) where M<:MemorySpace = space1 == space2 +may_alias(::MemorySpace, ::MemorySpace) = true may_alias(space1::CPURAMMemorySpace, space2::CPURAMMemorySpace) = space1.owner == space2.owner +struct RemotePtr{T,S<:MemorySpace} <: Ref{T} + addr::UInt + space::S +end +RemotePtr{T}(addr::UInt, space::S) where {T,S} = RemotePtr{T,S}(addr, space) +RemotePtr{T}(ptr::Ptr{V}, space::S) where {T,V,S} = RemotePtr{T,S}(UInt(ptr), space) +RemotePtr{T}(ptr::Ptr{V}) where {T,V} = RemotePtr{T}(UInt(ptr), CPURAMMemorySpace(myid())) +# FIXME: Don't hardcode CPURAMMemorySpace +RemotePtr(addr::UInt) = RemotePtr{Cvoid}(addr, CPURAMMemorySpace(myid())) +Base.convert(::Type{RemotePtr}, x::Ptr{T}) where T = + RemotePtr(UInt(x), CPURAMMemorySpace(myid())) +Base.convert(::Type{<:RemotePtr{V}}, x::Ptr{T}) where {V,T} = + RemotePtr{V}(UInt(x), CPURAMMemorySpace(myid())) +Base.convert(::Type{UInt}, ptr::RemotePtr) = ptr.addr +Base.:+(ptr::RemotePtr{T}, offset::Integer) where T = RemotePtr{T}(ptr.addr + offset, ptr.space) +Base.:-(ptr::RemotePtr{T}, offset::Integer) where T = RemotePtr{T}(ptr.addr - offset, ptr.space) +function Base.isless(ptr1::RemotePtr, ptr2::RemotePtr) + @assert ptr1.space == ptr2.space + return ptr1.addr < ptr2.addr +end + +struct MemorySpan{S} + ptr::RemotePtr{Cvoid,S} + len::UInt +end +MemorySpan(ptr::RemotePtr{Cvoid,S}, len::Integer) where S = + MemorySpan{S}(ptr, UInt(len)) +MemorySpan{S}(addr::UInt, len::Integer) where S = + MemorySpan{S}(RemotePtr{Cvoid,S}(addr), UInt(len)) +Base.isless(a::MemorySpan, b::MemorySpan) = a.ptr < b.ptr +Base.isempty(x::MemorySpan) = x.len == 0 
abstract type AbstractAliasing end memory_spans(::T) where T<:AbstractAliasing = throw(ArgumentError("Must define `memory_spans` for `$T`")) memory_spans(x) = memory_spans(aliasing(x)) memory_spans(x, T) = memory_spans(aliasing(x, T)) -### Type-generic aliasing info wrapper - -mutable struct AliasingWrapper <: AbstractAliasing +struct AliasingWrapper <: AbstractAliasing inner::AbstractAliasing hash::UInt64 + AliasingWrapper(inner::AbstractAliasing) = new(inner, hash(inner)) end memory_spans(x::AliasingWrapper) = memory_spans(x.inner) @@ -110,204 +184,8 @@ equivalent_structure(x::AliasingWrapper, y::AliasingWrapper) = Base.hash(x::AliasingWrapper, h::UInt64) = hash(x.hash, h) Base.isequal(x::AliasingWrapper, y::AliasingWrapper) = x.hash == y.hash Base.:(==)(x::AliasingWrapper, y::AliasingWrapper) = x.hash == y.hash -will_alias(x::AliasingWrapper, y::AliasingWrapper) = will_alias(x.inner, y.inner) - -### Small dictionary type - -struct SmallDict{K,V} <: AbstractDict{K,V} - keys::Vector{K} - vals::Vector{V} -end -SmallDict{K,V}() where {K,V} = SmallDict{K,V}(Vector{K}(), Vector{V}()) -function Base.getindex(d::SmallDict{K,V}, key) where {K,V} - key_idx = findfirst(==(convert(K, key)), d.keys) - if key_idx === nothing - throw(KeyError(key)) - end - return @inbounds d.vals[key_idx] -end -function Base.setindex!(d::SmallDict{K,V}, val, key) where {K,V} - key_conv = convert(K, key) - key_idx = findfirst(==(key_conv), d.keys) - if key_idx === nothing - push!(d.keys, key_conv) - push!(d.vals, convert(V, val)) - else - d.vals[key_idx] = convert(V, val) - end - return val -end -Base.haskey(d::SmallDict{K,V}, key) where {K,V} = in(convert(K, key), d.keys) -Base.keys(d::SmallDict) = d.keys -Base.length(d::SmallDict) = length(d.keys) -Base.iterate(d::SmallDict) = iterate(d, 1) -Base.iterate(d::SmallDict, state) = state > length(d.keys) ? 
nothing : (d.keys[state] => d.vals[state], state+1) - -### Type-stable lookup structure for AliasingWrappers - -struct AliasingLookup - # The set of memory spaces that are being tracked - spaces::Vector{MemorySpace} - # The set of AliasingWrappers that are being tracked - # One entry for each AliasingWrapper - ainfos::Vector{AliasingWrapper} - # The memory spaces for each AliasingWrapper - # One entry for each AliasingWrapper - ainfos_spaces::Vector{Vector{Int}} - # The spans for each AliasingWrapper in each memory space - # One entry for each AliasingWrapper - spans::Vector{SmallDict{Int,Vector{LocalMemorySpan}}} - # The set of AliasingWrappers that only exist in a single memory space - # One entry for each AliasingWrapper - ainfos_only_space::Vector{Int} - # The bounding span for each AliasingWrapper in each memory space - # One entry for each AliasingWrapper - bounding_spans::Vector{SmallDict{Int,LocalMemorySpan}} - # The interval tree of the bounding spans for each AliasingWrapper - # One entry for each MemorySpace - bounding_spans_tree::Vector{IntervalTree{LocatorMemorySpan{Int},UInt64}} - - AliasingLookup() = new(MemorySpace[], - AliasingWrapper[], - Vector{Int}[], - SmallDict{Int,Vector{LocalMemorySpan}}[], - Int[], - SmallDict{Int,LocalMemorySpan}[], - IntervalTree{LocatorMemorySpan{Int},UInt64}[]) -end -function Base.push!(lookup::AliasingLookup, ainfo::AliasingWrapper) - # Update the set of memory spaces and spans, - # and find the bounding spans for this AliasingWrapper - spaces_set = Set{MemorySpace}(lookup.spaces) - self_spaces_set = Set{Int}() - spans = SmallDict{Int,Vector{LocalMemorySpan}}() - for span in memory_spans(ainfo) - space = span.ptr.space - if !in(space, spaces_set) - push!(spaces_set, space) - push!(lookup.spaces, space) - push!(lookup.bounding_spans_tree, IntervalTree{LocatorMemorySpan{Int}}()) - end - space_idx = findfirst(==(space), lookup.spaces) - push!(self_spaces_set, space_idx) - spans_in_space = get!(Vector{LocalMemorySpan}, 
spans, space_idx) - push!(spans_in_space, LocalMemorySpan(span)) - end - push!(lookup.ainfos_spaces, collect(self_spaces_set)) - push!(lookup.spans, spans) - - # Update the set of AliasingWrappers - push!(lookup.ainfos, ainfo) - ainfo_idx = length(lookup.ainfos) - - # Check if the AliasingWrapper only exists in a single memory space - if length(self_spaces_set) == 1 - space_idx = only(self_spaces_set) - push!(lookup.ainfos_only_space, space_idx) - else - push!(lookup.ainfos_only_space, 0) - end - - # Add the bounding spans for this AliasingWrapper - bounding_spans = SmallDict{Int,LocalMemorySpan}() - for space_idx in keys(spans) - space_spans = spans[space_idx] - bound_start = minimum(span_start, space_spans) - bound_end = maximum(span_end, space_spans) - bounding_span = LocalMemorySpan(bound_start, bound_end - bound_start) - bounding_spans[space_idx] = bounding_span - insert!(lookup.bounding_spans_tree[space_idx], LocatorMemorySpan(bounding_span, ainfo_idx)) - end - push!(lookup.bounding_spans, bounding_spans) - - return ainfo_idx -end -struct AliasingLookupFinder - lookup::AliasingLookup - ainfo::AliasingWrapper - ainfo_idx::Int - spaces_idx::Vector{Int} - to_consider::Vector{Int} -end -Base.eltype(::AliasingLookupFinder) = AliasingWrapper -Base.IteratorSize(::AliasingLookupFinder) = Base.SizeUnknown() -# FIXME: We should use a Dict{UInt,Int} to find the ainfo_idx instead of linear search -function Base.intersect(lookup::AliasingLookup, ainfo::AliasingWrapper; ainfo_idx=nothing) - if ainfo_idx === nothing - ainfo_idx = something(findfirst(==(ainfo), lookup.ainfos)) - end - spaces_idx = lookup.ainfos_spaces[ainfo_idx] - to_consider_spans = LocatorMemorySpan{Int}[] - for space_idx in spaces_idx - bounding_spans_tree = lookup.bounding_spans_tree[space_idx] - self_bounding_span = LocatorMemorySpan(lookup.bounding_spans[ainfo_idx][space_idx], 0) - find_overlapping!(bounding_spans_tree, self_bounding_span, to_consider_spans; exact=false) - end - to_consider = 
Int[locator.owner for locator in to_consider_spans] - @assert all(to_consider .> 0) - return AliasingLookupFinder(lookup, ainfo, ainfo_idx, spaces_idx, to_consider) -end -Base.iterate(finder::AliasingLookupFinder) = iterate(finder, 1) -function Base.iterate(finder::AliasingLookupFinder, cursor_ainfo_idx) - ainfo_spaces = nothing - cursor_space_idx = 1 - - # New ainfos enter here - @label ainfo_restart - - # Check if we've exhausted all ainfos - if cursor_ainfo_idx > length(finder.to_consider) - return nothing - end - ainfo_idx = finder.to_consider[cursor_ainfo_idx] - - # Find the appropriate memory spaces for this ainfo - if ainfo_spaces === nothing - ainfo_spaces = finder.lookup.ainfos_spaces[ainfo_idx] - end - - # New memory spaces (for the same ainfo) enter here - @label space_restart - - # Check if we've exhausted all memory spaces for this ainfo, and need to move to the next ainfo - if cursor_space_idx > length(ainfo_spaces) - cursor_ainfo_idx += 1 - ainfo_spaces = nothing - cursor_space_idx = 1 - @goto ainfo_restart - end - - # Find the currently considered memory space for this ainfo - space_idx = ainfo_spaces[cursor_space_idx] - - # Check if this memory space is part of our target ainfo's spaces - if !(space_idx in finder.spaces_idx) - cursor_space_idx += 1 - @goto space_restart - end - - # Check if this ainfo's bounding span is part of our target ainfo's bounding span in this memory space - other_ainfo_bounding_span = finder.lookup.bounding_spans[ainfo_idx][space_idx] - self_bounding_span = finder.lookup.bounding_spans[finder.ainfo_idx][space_idx] - if !spans_overlap(other_ainfo_bounding_span, self_bounding_span) - cursor_space_idx += 1 - @goto space_restart - end - - # We have a overlapping bounds in the same memory space, so check if the ainfos are aliasing - # This is the slow path! 
- other_ainfo = finder.lookup.ainfos[ainfo_idx] - aliasing = will_alias(finder.ainfo, other_ainfo) - if !aliasing - cursor_ainfo_idx += 1 - ainfo_spaces = nothing - cursor_space_idx = 1 - @goto ainfo_restart - end - - # We overlap, so return the ainfo and the next ainfo index - return other_ainfo, cursor_ainfo_idx+1 -end +will_alias(x::AliasingWrapper, y::AliasingWrapper) = + will_alias(x.inner, y.inner) struct NoAliasing <: AbstractAliasing end memory_spans(::NoAliasing) = MemorySpan{CPURAMMemorySpace}[] @@ -322,11 +200,8 @@ struct CombinedAliasing <: AbstractAliasing end function memory_spans(ca::CombinedAliasing) # FIXME: Don't hardcode CPURAMMemorySpace - if length(ca.sub_ainfos) == 0 - return MemorySpan{CPURAMMemorySpace}[] - end - all_spans = memory_spans(ca.sub_ainfos[1]) - for sub_a in ca.sub_ainfos[2:end] + all_spans = MemorySpan{CPURAMMemorySpace}[] + for sub_a in ca.sub_ainfos append!(all_spans, memory_spans(sub_a)) end return all_spans @@ -336,23 +211,23 @@ Base.:(==)(ca1::CombinedAliasing, ca2::CombinedAliasing) = Base.hash(ca1::CombinedAliasing, h::UInt) = hash(ca1.sub_ainfos, hash(CombinedAliasing, h)) -struct ObjectAliasing{S<:MemorySpace} <: AbstractAliasing - ptr::RemotePtr{Cvoid,S} +struct ObjectAliasing <: AbstractAliasing + ptr::Ptr{Cvoid} sz::UInt end -ObjectAliasing(ptr::RemotePtr{Cvoid,S}, sz::Integer) where {S<:MemorySpace} = - ObjectAliasing{S}(ptr, UInt(sz)) function ObjectAliasing(x::T) where T @nospecialize x - ptr = RemotePtr{Cvoid}(pointer_from_objref(x)) + ptr = pointer_from_objref(x) sz = sizeof(T) return ObjectAliasing(ptr, sz) end -function memory_spans(oa::ObjectAliasing{S}) where S - span = MemorySpan{S}(oa.ptr, oa.sz) +function memory_spans(oa::ObjectAliasing) + rptr = RemotePtr{Cvoid}(oa.ptr) + span = MemorySpan{CPURAMMemorySpace}(rptr, oa.sz) return [span] end +aliasing(accel::Acceleration, x, T) = aliasing(x, T) function aliasing(x, dep_mod) if dep_mod isa Symbol return aliasing(getfield(x, dep_mod)) @@ -388,31 +263,16 @@ 
end aliasing(::String) = NoAliasing() # FIXME: Not necessarily true aliasing(::Symbol) = NoAliasing() aliasing(::Type) = NoAliasing() -function aliasing(x::Chunk, T) +aliasing(x::DTask, T) = aliasing(fetch(x; move_value=false, unwrap=false), T) +aliasing(x::DTask) = aliasing(fetch(x; move_value=false, unwrap=false)) +function aliasing(accel::DistributedAcceleration, x::Chunk, T) @assert x.handle isa DRef - if root_worker_id(x.processor) == myid() - return aliasing(unwrap(x), T) - end return remotecall_fetch(root_worker_id(x.processor), x, T) do x, T aliasing(unwrap(x), T) end end -aliasing(x::Chunk) = remotecall_fetch(root_worker_id(x.processor), x) do x - aliasing(unwrap(x)) -end -aliasing(x::DTask, T) = aliasing(fetch(x; raw=true), T) -aliasing(x::DTask) = aliasing(fetch(x; raw=true)) - -function aliasing(x::Base.RefValue{T}) where T - addr = UInt(Base.pointer_from_objref(x) + fieldoffset(typeof(x), 1)) - ptr = RemotePtr{Cvoid}(addr, CPURAMMemorySpace(myid())) - ainfo = ObjectAliasing(ptr, sizeof(x)) - if isassigned(x) && type_may_alias(T) && type_may_alias(typeof(x[])) - return CombinedAliasing([ainfo, aliasing(x[])]) - else - return CombinedAliasing([ainfo]) - end -end +aliasing(x::Chunk, T) = aliasing(unwrap(x), T) +aliasing(x::Chunk) = aliasing(unwrap(x)) struct ContiguousAliasing{S} <: AbstractAliasing span::MemorySpan{S} @@ -465,22 +325,13 @@ function _memory_spans(a::StridedAliasing{T,N,S}, spans, ptr, dim) where {T,N,S} return spans end -function aliasing(x::SubArray{T,N}) where {T,N} +function aliasing(x::SubArray{T,N,A}) where {T,N,A<:Array} if isbitstype(T) - p = parent(x) - space = memory_space(p) - S = typeof(space) - parent_ptr = RemotePtr{Cvoid}(UInt64(pointer(p)), space) - ptr = RemotePtr{Cvoid}(UInt64(pointer(x)), space) - NA = ndims(p) - raw_inds = parentindices(x) - inds = ntuple(i->raw_inds[i] isa Integer ? 
(raw_inds[i]:raw_inds[i]) : UnitRange(raw_inds[i]), NA) - sz = ntuple(i->length(inds[i]), NA) - return StridedAliasing{T,NA,S}(parent_ptr, - ptr, - inds, - sz, - strides(p)) + S = CPURAMMemorySpace + return StridedAliasing{T,ndims(x),S}(RemotePtr{Cvoid}(pointer(parent(x))), + RemotePtr{Cvoid}(pointer(x)), + parentindices(x), + size(x), strides(x)) else # FIXME: Also ContiguousAliasing of container #return IteratedAliasing(x) @@ -597,8 +448,40 @@ end function will_alias(x_span::MemorySpan, y_span::MemorySpan) may_alias(x_span.ptr.space, y_span.ptr.space) || return false # FIXME: Allow pointer conversion instead of just failing - @assert x_span.ptr.space == y_span.ptr.space "Memory spans are in different spaces: $(x_span.ptr.space) vs. $(y_span.ptr.space)" + @assert x_span.ptr.space == y_span.ptr.space x_end = x_span.ptr + x_span.len - 1 y_end = y_span.ptr + y_span.len - 1 return x_span.ptr <= y_end && y_span.ptr <= x_end end + +### More space-efficient memory spans + +struct LocalMemorySpan + ptr::UInt + len::UInt +end +LocalMemorySpan(span::MemorySpan) = LocalMemorySpan(span.ptr.addr, span.len) +Base.isempty(x::LocalMemorySpan) = x.len == 0 + +# FIXME: Store the length separately, since it's shared by all spans +struct ManyMemorySpan{N} + spans::NTuple{N,LocalMemorySpan} +end +Base.isempty(x::ManyMemorySpan) = all(isempty, x.spans) + +struct ManyPair{N} <: Unsigned + pairs::NTuple{N,UInt} +end +Base.promote_rule(::Type{ManyPair}, ::Type{T}) where {T<:Integer} = ManyPair +Base.convert(::Type{ManyPair{N}}, x::T) where {T<:Integer,N} = ManyPair(ntuple(i -> x, N)) +Base.convert(::Type{ManyPair}, x::ManyPair) = x +Base.:+(x::ManyPair{N}, y::ManyPair{N}) where N = ManyPair(ntuple(i -> x.pairs[i] + y.pairs[i], N)) +Base.:-(x::ManyPair{N}, y::ManyPair{N}) where N = ManyPair(ntuple(i -> x.pairs[i] - y.pairs[i], N)) +Base.:-(x::ManyPair) = error("Can't negate a ManyPair") +Base.:(==)(x::ManyPair, y::ManyPair) = x.pairs == y.pairs +Base.isless(x::ManyPair, y::ManyPair) = 
x.pairs[1] < y.pairs[1] +Base.:(<)(x::ManyPair, y::ManyPair) = x.pairs[1] < y.pairs[1] +Base.string(x::ManyPair) = "ManyPair($(x.pairs))" + +ManyMemorySpan{N}(start::ManyPair{N}, len::ManyPair{N}) where N = + ManyMemorySpan{N}(ntuple(i -> LocalMemorySpan(start.pairs[i], len.pairs[i]), N)) diff --git a/src/mpi.jl b/src/mpi.jl new file mode 100644 index 000000000..c3eaf0652 --- /dev/null +++ b/src/mpi.jl @@ -0,0 +1,870 @@ +using MPI + +const CHECK_UNIFORMITY = TaskLocalValue{Bool}(()->false) +function check_uniformity!(check::Bool=true) + CHECK_UNIFORMITY[] = check +end +function check_uniform(value::Integer) + CHECK_UNIFORMITY[] || return + comm = MPI.COMM_WORLD + rank = MPI.Comm_rank(comm) + matched = compare_all(value, comm) + if !matched + if rank == 0 + Core.print("[$rank] Found non-uniform value!\n") + end + Core.print("[$rank] value=$value\n") + throw(ArgumentError("Non-uniform value")) + end + MPI.Barrier(comm) +end +function check_uniform(value) + CHECK_UNIFORMITY[] || return + check_uniform(hash(value)) +end + +function compare_all(value, comm) + rank = MPI.Comm_rank(comm) + size = MPI.Comm_size(comm) + for i in 0:(size-1) + if i != rank + send_yield(value, comm, i, UInt32(0); check_seen=false) + end + end + match = true + for i in 0:(size-1) + if i != rank + other_value = recv_yield(comm, i, UInt32(0)) + if value != other_value + match = false + end + end + end + return match +end + +struct MPIAcceleration <: Acceleration + comm::MPI.Comm +end +MPIAcceleration() = MPIAcceleration(MPI.COMM_WORLD) + +function aliasing(accel::MPIAcceleration, x::Chunk, T) + handle = x.handle::MPIRef + @assert accel.comm == handle.comm "MPIAcceleration comm mismatch" + tag = to_tag(hash(handle.id, hash(:aliasing))) + check_uniform(tag) + rank = MPI.Comm_rank(accel.comm) + if handle.rank == rank + ainfo = aliasing(x, T) + #Core.print("[$rank] aliasing: $ainfo, sending\n") + bcast_send_yield(ainfo, accel.comm, handle.rank, tag) + else + #Core.print("[$rank] aliasing: receiving 
from $(handle.rank)\n") + ainfo = recv_yield(accel.comm, handle.rank, tag) + #Core.print("[$rank] aliasing: received $ainfo\n") + end + check_uniform(ainfo) + return ainfo +end +default_processor(accel::MPIAcceleration) = MPIOSProc(accel.comm, 0) +default_processor(accel::MPIAcceleration, x) = MPIOSProc(accel.comm, 0) +default_processor(accel::MPIAcceleration, x::Chunk) = MPIOSProc(x.handle.comm, x.handle.rank) +default_processor(accel::MPIAcceleration, x::Function) = MPIOSProc(accel.comm, MPI.Comm_rank(accel.comm)) +default_processor(accel::MPIAcceleration, T::Type) = MPIOSProc(accel.comm, MPI.Comm_rank(accel.comm)) + +#TODO: Add a lock +const MPIClusterProcChildren = Dict{MPI.Comm, Set{Processor}}() + +struct MPIClusterProc <: Processor + comm::MPI.Comm + function MPIClusterProc(comm::MPI.Comm) + populate_children(comm) + return new(comm) + end +end + +Sch.init_proc(state, proc::MPIClusterProc, log_sink) = Sch.init_proc(state, MPIOSProc(proc.comm), log_sink) + +MPIClusterProc() = MPIClusterProc(MPI.COMM_WORLD) + +function populate_children(comm::MPI.Comm) + children = get_processors(OSProc()) + MPIClusterProcChildren[comm] = children +end + +struct MPIOSProc <: Processor + comm::MPI.Comm + rank::Int +end + +function MPIOSProc(comm::MPI.Comm) + rank = MPI.Comm_rank(comm) + return MPIOSProc(comm, rank) +end + +function MPIOSProc() + return MPIOSProc(MPI.COMM_WORLD) +end + +ProcessScope(p::MPIOSProc) = ProcessScope(myid()) + +function check_uniform(proc::MPIOSProc) + check_uniform(hash(MPIOSProc)) + check_uniform(proc.rank) +end + +function memory_spaces(proc::MPIOSProc) + children = get_processors(proc) + spaces = Set{MemorySpace}() + for proc in children + for space in memory_spaces(proc) + push!(spaces, space) + end + end + return spaces +end + +struct MPIProcessScope <: AbstractScope + comm::MPI.Comm + rank::Int +end + +Base.isless(::MPIProcessScope, ::MPIProcessScope) = false +Base.isless(::MPIProcessScope, ::NodeScope) = true +Base.isless(::MPIProcessScope, 
::UnionScope) = true
+Base.isless(::MPIProcessScope, ::TaintScope) = true
+Base.isless(::MPIProcessScope, ::AnyScope) = true
+constrain(x::MPIProcessScope, y::MPIProcessScope) =
+    x == y ? y : InvalidScope(x, y)
+constrain(x::NodeScope, y::MPIProcessScope) =
+    x == y.parent ? y : InvalidScope(x, y)
+
+Base.isless(::ExactScope, ::MPIProcessScope) = true
+constrain(x::MPIProcessScope, y::ExactScope) =
+    x == y.parent ? y : InvalidScope(x, y)
+
+function enclosing_scope(proc::MPIOSProc)
+    return MPIProcessScope(proc.comm, proc.rank)
+end
+
+function Dagger.to_scope(::Val{:mpi_rank}, sc::NamedTuple)
+    if sc.mpi_rank == Colon()
+        return Dagger.to_scope(Val{:mpi_ranks}(), merge(sc, (;mpi_ranks=Colon())))
+    else
+        @assert sc.mpi_rank isa Integer "Expected a single MPI rank for :mpi_rank, got $(sc.mpi_rank)\nConsider using :mpi_ranks instead."
+        return Dagger.to_scope(Val{:mpi_ranks}(), merge(sc, (;mpi_ranks=[sc.mpi_rank])))
+    end
+end
+Dagger.scope_key_precedence(::Val{:mpi_rank}) = 2
+function Dagger.to_scope(::Val{:mpi_ranks}, sc::NamedTuple)
+    comm = get(sc, :mpi_comm, MPI.COMM_WORLD)
+    if sc.mpi_ranks != Colon()
+        ranks = sc.mpi_ranks
+    else
+        ranks = 0:(MPI.Comm_size(comm)-1)
+    end
+    inner_sc = NamedTuple(filter(kv->kv[1] != :mpi_ranks, Base.pairs(sc))...)
+    # FIXME: What to do here? 
+ inner_scope = Dagger.to_scope(inner_sc) + scopes = Dagger.ExactScope[] + for rank in ranks + procs = Dagger.get_processors(Dagger.MPIOSProc(comm, rank)) + rank_scope = MPIProcessScope(comm, rank) + for proc in procs + proc_scope = Dagger.ExactScope(proc) + constrain(proc_scope, rank_scope) isa Dagger.InvalidScope && continue + push!(scopes, proc_scope) + end + end + return Dagger.UnionScope(scopes) +end +Dagger.scope_key_precedence(::Val{:mpi_ranks}) = 2 + +struct MPIProcessor{P<:Processor} <: Processor + innerProc::P + comm::MPI.Comm + rank::Int +end +proc_in_scope(proc::Processor, scope::MPIProcessScope) = false +proc_in_scope(proc::MPIProcessor, scope::MPIProcessScope) = + proc.comm == scope.comm && proc.rank == scope.rank + +function check_uniform(proc::MPIProcessor) + check_uniform(hash(MPIProcessor)) + check_uniform(proc.rank) + # TODO: Not always valid (if pointer is embedded, say for GPUs) + check_uniform(hash(proc.innerProc)) +end + +Dagger.iscompatible_func(::MPIProcessor, opts, ::Any) = true +Dagger.iscompatible_arg(::MPIProcessor, opts, ::Any) = true + +default_enabled(proc::MPIProcessor) = default_enabled(proc.innerProc) + +root_worker_id(proc::MPIProcessor) = myid() +root_worker_id(proc::MPIOSProc) = myid() +root_worker_id(proc::MPIClusterProc) = myid() + +get_parent(proc::MPIClusterProc) = proc +get_parent(proc::MPIOSProc) = MPIClusterProc(proc.comm) +get_parent(proc::MPIProcessor) = MPIOSProc(proc.comm, proc.rank) + +short_name(proc::MPIProcessor) = "(MPI: $(proc.rank), $(short_name(proc.innerProc)))" + +function get_processors(mosProc::MPIOSProc) + populate_children(mosProc.comm) + children = MPIClusterProcChildren[mosProc.comm] + mpiProcs = Set{Processor}() + for proc in children + push!(mpiProcs, MPIProcessor(proc, mosProc.comm, mosProc.rank)) + end + return mpiProcs +end + +#TODO: non-uniform ranking through MPI groups +#TODO: use a lazy iterator +function get_processors(proc::MPIClusterProc) + children = Set{Processor}() + for i in 
0:(MPI.Comm_size(proc.comm)-1) + for innerProc in MPIClusterProcChildren[proc.comm] + push!(children, MPIProcessor(innerProc, proc.comm, i)) + end + end + return children +end + +struct MPIMemorySpace{S<:MemorySpace} <: MemorySpace + innerSpace::S + comm::MPI.Comm + rank::Int +end + +function check_uniform(space::MPIMemorySpace) + check_uniform(space.rank) + # TODO: Not always valid (if pointer is embedded, say for GPUs) + check_uniform(hash(space.innerSpace)) +end + +default_processor(space::MPIMemorySpace) = MPIOSProc(space.comm, space.rank) +default_memory_space(accel::MPIAcceleration) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, 0) + +default_memory_space(accel::MPIAcceleration, x) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, 0) +default_memory_space(accel::MPIAcceleration, x::Chunk) = MPIMemorySpace(CPURAMMemorySpace(myid()), x.handle.comm, x.handle.rank) +default_memory_space(accel::MPIAcceleration, x::Function) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, MPI.Comm_rank(accel.comm)) +default_memory_space(accel::MPIAcceleration, T::Type) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, MPI.Comm_rank(accel.comm)) + +function memory_spaces(proc::MPIClusterProc) + rawMemSpace = Set{MemorySpace}() + for rnk in 0:(MPI.Comm_size(proc.comm) - 1) + for innerSpace in memory_spaces(OSProc()) + push!(rawMemSpace, MPIMemorySpace(innerSpace, proc.comm, rnk)) + end + end + return rawMemSpace +end + +function memory_spaces(proc::MPIProcessor) + rawMemSpace = Set{MemorySpace}() + for innerSpace in memory_spaces(proc.innerProc) + push!(rawMemSpace, MPIMemorySpace(innerSpace, proc.comm, proc.rank)) + end + return rawMemSpace +end + +root_worker_id(mem_space::MPIMemorySpace) = myid() + +function processors(memSpace::MPIMemorySpace) + rawProc = Set{Processor}() + for innerProc in processors(memSpace.innerSpace) + push!(rawProc, MPIProcessor(innerProc, memSpace.comm, memSpace.rank)) + end + return rawProc +end + +struct MPIRefID + tid::Int + 
uid::UInt + id::Int + function MPIRefID(tid, uid, id) + @assert tid > 0 || uid > 0 "Invalid MPIRefID: tid=$tid, uid=$uid, id=$id" + return new(tid, uid, id) + end +end + +function check_uniform(ref::MPIRefID) + check_uniform(ref.tid) + check_uniform(ref.uid) + check_uniform(ref.id) +end + +const MPIREF_TID = Dict{Int, Threads.Atomic{Int}}() +const MPIREF_UID = Dict{Int, Threads.Atomic{Int}}() + +mutable struct MPIRef + comm::MPI.Comm + rank::Int + size::Int + innerRef::Union{DRef, Nothing} + id::MPIRefID +end +root_worker_id(ref::MPIRef) = myid() +@warn "Move this definition somewhere else" maxlog=1 +root_worker_id(ref::DRef) = ref.owner + +function check_uniform(ref::MPIRef) + check_uniform(ref.rank) + check_uniform(ref.id) +end + +move(from_proc::Processor, to_proc::Processor, x::MPIRef) = + move(from_proc, to_proc, poolget(x; uniform=FETCH_UNIFORM[])) + +function affinity(x::MPIRef) + if x.innerRef === nothing + return MPIOSProc(x.comm, x.rank)=>0 + else + return MPIOSProc(x.comm, x.rank)=>x.innerRef.size + end +end + +function take_ref_id!() + tid = 0 + uid = 0 + id = 0 + if Dagger.in_task() + tid = sch_handle().thunk_id.id + uid = 0 + counter = get!(MPIREF_TID, tid, Threads.Atomic{Int}(1)) + id = Threads.atomic_add!(counter, 1) + elseif MPI_TID[] != 0 + tid = MPI_TID[] + uid = 0 + counter = get!(MPIREF_TID, tid, Threads.Atomic{Int}(1)) + id = Threads.atomic_add!(counter, 1) + elseif MPI_UID[] != 0 + tid = 0 + uid = MPI_UID[] + counter = get!(MPIREF_UID, uid, Threads.Atomic{Int}(1)) + id = Threads.atomic_add!(counter, 1) + end + return MPIRefID(tid, uid, id) +end + +function to_tag(h::UInt) + # FIXME: Use some kind of bounded re-hashing + # FIXME: Re-hash with upper and lower + bound = MPI.tag_ub() + tag = abs(Base.unsafe_trunc(Int32, h)) + while tag > bound + tag = tag - bound + end + return tag +end + +#TODO: partitioned scheduling with comm bifurcation +function tochunk_pset(x, space::MPIMemorySpace; device=nothing, kwargs...) 
+ @assert space.comm == MPI.COMM_WORLD "$(space.comm) != $(MPI.COMM_WORLD)" + local_rank = MPI.Comm_rank(space.comm) + Mid = take_ref_id!() + if local_rank != space.rank + return MPIRef(space.comm, space.rank, 0, nothing, Mid) + else + return MPIRef(space.comm, space.rank, sizeof(x), poolset(x; device, kwargs...), Mid) + end +end + +const DEADLOCK_DETECT = TaskLocalValue{Bool}(()->true) +const DEADLOCK_WARN_PERIOD = TaskLocalValue{Float64}(()->10.0) +const DEADLOCK_TIMEOUT_PERIOD = TaskLocalValue{Float64}(()->60.0) +const RECV_WAITING = Base.Lockable(Dict{Tuple{MPI.Comm, Int, Int}, Base.Event}()) + +function supports_inplace_mpi(value) + if value isa DenseArray && isbitstype(eltype(value)) + return true + else + return false + end +end +function recv_yield!(buffer, comm, src, tag) + #Core.println("buffer recv: $buffer, type of buffer: $(typeof(buffer)), is in place? $(supports_inplace_mpi(buffer))") + if !supports_inplace_mpi(buffer) + return recv_yield(comm, src, tag), false + end + time_start = time_ns() + detect = DEADLOCK_DETECT[] + warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) + timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) + rank = MPI.Comm_rank(comm) + #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Starting recv! 
from [$src]")
+
+    # Ensure no other receiver is waiting
+    our_event = Base.Event()
+    @label retry
+    other_event = lock(RECV_WAITING) do waiting
+        if haskey(waiting, (comm, src, tag))
+            waiting[(comm, src, tag)]
+        else
+            waiting[(comm, src, tag)] = our_event
+            nothing
+        end
+    end
+    if other_event !== nothing
+        #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Waiting for other receiver...")
+        wait(other_event)
+        @goto retry
+    end
+    while true
+        (got, msg, stat) = MPI.Improbe(src, tag, comm, MPI.Status)
+        if got
+            if MPI.Get_error(stat) != MPI.SUCCESS
+                error("recv_yield (Improbe) failed with error $(MPI.Get_error(stat))")
+            end
+
+            req = MPI.Imrecv!(MPI.Buffer(buffer), msg)
+            while true
+                finish, stat = MPI.Test(req, MPI.Status)
+                if finish
+                    if MPI.Get_error(stat) != MPI.SUCCESS
+                        error("recv_yield (Test) failed with error $(MPI.Get_error(stat))")
+                    end
+
+                    #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Received value")
+                    lock(RECV_WAITING) do waiting
+                        delete!(waiting, (comm, src, tag))
+                        notify(our_event)
+                    end
+                    #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Released lock")
+                    return buffer, true
+                end
+                warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, rank, tag, "recv", src)
+                yield()
+            end
+        end
+        warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, rank, tag, "recv", src)
+        yield()
+    end
+end
+struct InplaceInfo
+    type::DataType
+    shape::Tuple
+end
+function recv_yield(comm, src, tag)
+    time_start = time_ns()
+    detect = DEADLOCK_DETECT[]
+    warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9)
+    timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9)
+    rank = MPI.Comm_rank(comm)
+    #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Starting recv from [$src]")
+
+    # Ensure no other receiver is waiting
+    our_event = Base.Event()
+    @label retry
+    other_event = lock(RECV_WAITING) do waiting
+        if haskey(waiting, (comm, src, tag))
+            waiting[(comm, src, tag)]
+        else
+            
waiting[(comm, src, tag)] = our_event + nothing + end + end + if other_event !== nothing + #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Waiting for other receiver...") + wait(other_event) + @goto retry + end + #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Receiving...") + + type = nothing + @label receive + value = recv_yield_serialized(comm, rank, src, tag) + if value isa InplaceInfo + value = recv_yield_inplace(value, comm, rank, src, tag) + end + lock(RECV_WAITING) do waiting + delete!(waiting, (comm, src, tag)) + notify(our_event) + end + return value +end +function recv_yield_serialized(comm, my_rank, their_rank, tag) + time_start = time_ns() + detect = DEADLOCK_DETECT[] + warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) + timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) + while true + (got, msg, stat) = MPI.Improbe(their_rank, tag, comm, MPI.Status) + if got + if MPI.Get_error(stat) != MPI.SUCCESS + error("recv_yield failed with error $(MPI.Get_error(stat))") + end + count = MPI.Get_count(stat, UInt8) + buf = Array{UInt8}(undef, count) + req = MPI.Imrecv!(MPI.Buffer(buf), msg) + __wait_for_request(req, comm, my_rank, their_rank, tag, "recv_yield", "recv") + return MPI.deserialize(buf) + end + warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, my_rank, tag, "recv", their_rank) + yield() + end +end +function recv_yield_inplace(_value::InplaceInfo, comm, my_rank, their_rank, tag) + time_start = time_ns() + detect = DEADLOCK_DETECT[] + warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) + timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) + + T = _value.type + @assert T <: Array && isbitstype(eltype(T)) "recv_yield_inplace only supports inplace MPI transfers of bitstype dense arrays" + array = Array{eltype(T)}(undef, _value.shape) + + while true + (got, msg, stat) = MPI.Improbe(their_rank, tag, comm, MPI.Status) + if got + if MPI.Get_error(stat) != MPI.SUCCESS + 
error("recv_yield failed with error $(MPI.Get_error(stat))") + end + count = MPI.Get_count(stat, UInt8) + @assert count == sizeof(array) "recv_yield_inplace: expected $(sizeof(array)) bytes, got $count" + buf = MPI.Buffer(array) + req = MPI.Imrecv!(buf, msg) + __wait_for_request(req, comm, my_rank, their_rank, tag, "recv_yield", "recv") + break + end + warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, my_rank, tag, "recv", their_rank) + yield() + end + + return array +end + +const SEEN_TAGS = Dict{Int32, Type}() +send_yield!(value, comm, dest, tag; check_seen::Bool=true) = + _send_yield(value, comm, dest, tag; check_seen, inplace=true) +send_yield(value, comm, dest, tag; check_seen::Bool=true) = + _send_yield(value, comm, dest, tag; check_seen, inplace=false) +function _send_yield(value, comm, dest, tag; check_seen::Bool=true, inplace::Bool) + rank = MPI.Comm_rank(comm) + + if check_seen && haskey(SEEN_TAGS, tag) && SEEN_TAGS[tag] !== typeof(value) + @error "[rank $(MPI.Comm_rank(comm))][tag $tag] Already seen tag (previous type: $(SEEN_TAGS[tag]), new type: $(typeof(value)))" exception=(InterruptException(),backtrace()) + end + if check_seen + SEEN_TAGS[tag] = typeof(value) + end + #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Starting send to [$dest]: $(typeof(value)), is support inplace? 
$(supports_inplace_mpi(value))") + if inplace && supports_inplace_mpi(value) + send_yield_inplace(value, comm, rank, dest, tag) + else + send_yield_serialized(value, comm, rank, dest, tag) + end +end +function send_yield_inplace(value, comm, my_rank, their_rank, tag) + req = MPI.Isend(value, comm; dest=their_rank, tag) + __wait_for_request(req, comm, my_rank, their_rank, tag, "send_yield", "send") +end +function send_yield_serialized(value, comm, my_rank, their_rank, tag) + if value isa Array && isbitstype(eltype(value)) + send_yield_serialized(InplaceInfo(typeof(value), size(value)), comm, my_rank, their_rank, tag) + send_yield_inplace(value, comm, my_rank, their_rank, tag) + else + req = MPI.isend(value, comm; dest=their_rank, tag) + __wait_for_request(req, comm, my_rank, their_rank, tag, "send_yield", "send") + end +end +function __wait_for_request(req, comm, my_rank, their_rank, tag, fn::String, kind::String) + time_start = time_ns() + detect = DEADLOCK_DETECT[] + warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) + timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) + while true + finish, status = MPI.Test(req, MPI.Status) + if finish + if MPI.Get_error(status) != MPI.SUCCESS + error("$fn failed with error $(MPI.Get_error(status))") + end + return + end + warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, my_rank, tag, kind, their_rank) + yield() + end +end + +function bcast_send_yield(value, comm, root, tag) + sz = MPI.Comm_size(comm) + rank = MPI.Comm_rank(comm) + for other_rank in 0:(sz-1) + rank == other_rank && continue + send_yield(value, comm, other_rank, tag) + end +end +function mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, rank, tag, kind, srcdest) + time_elapsed = (time_ns() - time_start) + if detect && time_elapsed > warn_period + @warn "[rank $rank][tag $tag] Hit probable hang on $kind (dest: $srcdest)" + return typemax(UInt64) + end + if detect && time_elapsed > 
timeout_period + error("[rank $rank][tag $tag] Hit hang on $kind (dest: $srcdest)") + end + return warn_period +end + +#discuss this with julian +WeakChunk(c::Chunk{T,H}) where {T,H<:MPIRef} = WeakChunk(c.handle.rank, c.handle.id.id, WeakRef(c)) + +function MemPool.poolget(ref::MPIRef; uniform::Bool=false) + @assert uniform || ref.rank == MPI.Comm_rank(ref.comm) "MPIRef rank mismatch: $(ref.rank) != $(MPI.Comm_rank(ref.comm))" + if uniform + tag = to_tag(hash(ref.id, hash(:poolget))) + if ref.rank == MPI.Comm_rank(ref.comm) + value = poolget(ref.innerRef) + bcast_send_yield(value, ref.comm, ref.rank, tag) + return value + else + return recv_yield(ref.comm, ref.rank, tag) + end + else + return poolget(ref.innerRef) + end +end +fetch_handle(ref::MPIRef; uniform::Bool=false) = poolget(ref; uniform) + +function move!(dep_mod, to_space::MPIMemorySpace, from_space::MPIMemorySpace, to::Chunk, from::Chunk) + @assert to.handle isa MPIRef && from.handle isa MPIRef "MPIRef expected" + @assert to.handle.comm == from.handle.comm "MPIRef comm mismatch" + @assert to.handle.rank == to_space.rank && from.handle.rank == from_space.rank "MPIRef rank mismatch" + local_rank = MPI.Comm_rank(from.handle.comm) + if to_space.rank == from_space.rank == local_rank + move!(dep_mod, to_space.innerSpace, from_space.innerSpace, to, from) + else + tag = to_tag(hash(dep_mod, hash(to.handle.id, hash(from.handle.id, hash(:move!))))) + @dagdebug nothing :mpi "[$local_rank][$tag] Moving from $(from_space.rank) to $(to_space.rank)\n" + if local_rank == from_space.rank + send_yield!(poolget(from.handle; uniform=false), to_space.comm, to_space.rank, tag) + elseif local_rank == to_space.rank + #@dagdebug nothing :mpi "[$local_rank][$tag] Receiving from rank $(from_space.rank) with tag $tag, type of buffer: $(typeof(poolget(to.handle; uniform=false)))" + to_val = poolget(to.handle; uniform=false) + val, inplace = recv_yield!(to_val, from_space.comm, from_space.rank, tag) + if !inplace + move!(dep_mod, 
to_space.innerSpace, from_space.innerSpace, to_val, val) + end + end + end + @dagdebug nothing :mpi "[$local_rank][$tag] Finished moving from $(from_space.rank) to $(to_space.rank) successfuly\n" +end +function move!(dep_mod::RemainderAliasing{<:MPIMemorySpace}, to_space::MPIMemorySpace, from_space::MPIMemorySpace, to::Chunk, from::Chunk) + @assert to.handle isa MPIRef && from.handle isa MPIRef "MPIRef expected" + @assert to.handle.comm == from.handle.comm "MPIRef comm mismatch" + @assert to.handle.rank == to_space.rank && from.handle.rank == from_space.rank "MPIRef rank mismatch" + local_rank = MPI.Comm_rank(from.handle.comm) + if to_space.rank == from_space.rank == local_rank + move!(dep_mod, to_space.innerSpace, from_space.innerSpace, to, from) + else + tag = to_tag(hash(dep_mod, hash(to.handle.id, hash(from.handle.id, hash(:move!))))) + @dagdebug nothing :mpi "[$local_rank][$tag] Moving from $(from_space.rank) to $(to_space.rank)\n" + if local_rank == from_space.rank + # Get the source data for each span + len = sum(span_tuple->span_len(span_tuple[1]), dep_mod.spans) + copies = Vector{UInt8}(undef, len) + offset = 1 + for (from_span, _) in dep_mod.spans + #GC.@preserve copy begin + from_ptr = Ptr{UInt8}(from_span.ptr) + to_ptr = Ptr{UInt8}(pointer(copies, offset)) + unsafe_copyto!(to_ptr, from_ptr, from_span.len) + offset += from_span.len + #end + end + + # Send the spans + send_yield(len, to_space.comm, to_space.rank, tag) + send_yield!(copies, to_space.comm, to_space.rank, tag; check_seen=false) + #send_yield(copies, to_space.comm, to_space.rank, tag) + elseif local_rank == to_space.rank + # Receive the spans + len = recv_yield(from_space.comm, from_space.rank, tag) + copies = Vector{UInt8}(undef, len) + recv_yield!(copies, from_space.comm, from_space.rank, tag) + #copies = recv_yield(from_space.comm, from_space.rank, tag) + + # Copy the data into the destination object + #for (copy, (_, to_span)) in zip(copies, dep_mod.spans) + offset = 1 + for (_, to_span) 
in dep_mod.spans + #GC.@preserve copy begin + from_ptr = Ptr{UInt8}(pointer(copies, offset)) + to_ptr = Ptr{UInt8}(to_span.ptr) + unsafe_copyto!(to_ptr, from_ptr, to_span.len) + offset += to_span.len + #end + end + + # Ensure that the data is visible + Core.Intrinsics.atomic_fence(:release) + end + end + + return +end + + +move(::MPIOSProc, ::MPIProcessor, x::Union{Function,Type}) = x +move(::MPIOSProc, ::MPIProcessor, x::Chunk{<:Union{Function,Type}}) = poolget(x.handle) + +#TODO: out of place MPI move +function move(src::MPIOSProc, dst::MPIProcessor, x::Chunk) + @assert src.comm == dst.comm "Multi comm move not supported" + if Sch.SCHED_MOVE[] + if dst.rank == MPI.Comm_rank(dst.comm) + return poolget(x.handle) + end + else + @assert src.rank == MPI.Comm_rank(src.comm) "Unwrapping not permited" + @assert src.rank == x.handle.rank == dst.rank + return poolget(x.handle) + end +end + +const MPI_UNIFORM = ScopedValue{Bool}(false) + +@warn "bcast T if return type is not concrete" maxlog=1 +function remotecall_endpoint(f, accel::Dagger.MPIAcceleration, from_proc, to_proc, from_space, to_space, data) + loc_rank = MPI.Comm_rank(accel.comm) + task = DATADEPS_CURRENT_TASK[] + return with(MPI_UID=>task.uid, MPI_UNIFORM=>true) do + if data isa Chunk + tag = to_tag(hash(data.handle.id)) + space = memory_space(data) + if space.rank != from_proc.rank + # If the data is already where it needs to be + @assert space.rank == to_proc.rank + if space.rank == loc_rank + value = poolget(data.handle) + data_converted = f(move(from_proc.innerProc, to_proc.innerProc, value)) + return tochunk(data_converted, to_proc, to_space) + else + T = move_type(from_proc.innerProc, to_proc.innerProc, chunktype(data)) + T_new = f !== identity ? 
Base._return_type(f, Tuple{T}) : T + @assert isconcretetype(T_new) "Return type inference failed, expected concrete type, got $T -> $T_new" + return tochunk(nothing, to_proc, to_space; type=T_new) + end + end + + # The data is on the source rank + @assert space.rank == from_proc.rank + if loc_rank == from_proc.rank == to_proc.rank + value = poolget(data.handle) + data_converted = f(move(from_proc.innerProc, to_proc.innerProc, value)) + return tochunk(data_converted, to_proc, to_space) + elseif loc_rank == from_proc.rank + value = poolget(data.handle) + data_moved = move(from_proc.innerProc, to_proc.innerProc, value) + Dagger.send_yield(data_moved, accel.comm, to_proc.rank, tag) + # FIXME: This is wrong to take typeof(data_moved), because the type may change + return tochunk(nothing, to_proc, to_space; type=typeof(data_moved)) + elseif loc_rank == to_proc.rank + data_moved = Dagger.recv_yield(accel.comm, from_space.rank, tag) + data_converted = f(move(from_proc.innerProc, to_proc.innerProc, data_moved)) + return tochunk(data_converted, to_proc, to_space) + else + T = move_type(from_proc.innerProc, to_proc.innerProc, chunktype(data)) + T_new = f !== identity ? 
Base._return_type(f, Tuple{T}) : T + @assert isconcretetype(T_new) "Return type inference failed, expected concrete type, got $T -> $T_new" + return tochunk(nothing, to_proc, to_space; type=T_new) + end + else + error("We shouldn't call f here, if we're not the destination rank") + data_converted = f(move(from_proc, to_proc, data)) + return tochunk(data_converted, to_proc, to_space) + end + end +end + +move(src::Processor, dst::MPIProcessor, x::Chunk) = error("MPI move not supported") +move(to_proc::MPIProcessor, chunk::Chunk) = + move(chunk.processor, to_proc, chunk) +move(to_proc::Processor, d::MPIRef) = + move(MPIOSProc(d.rank), to_proc, d) +move(to_proc::MPIProcessor, x) = + move(MPIOSProc(), to_proc, x) + +move(::MPIProcessor, ::MPIProcessor, x::Union{Function,Type}) = x +move(::MPIProcessor, ::MPIProcessor, x::Chunk{<:Union{Function,Type}}) = poolget(x.handle) + +@warn "Is this uniform logic valuable to have?" maxlog=1 +function move(src::MPIProcessor, dst::MPIProcessor, x::Chunk) + uniform = false #uniform = MPI_UNIFORM[] + @assert uniform || src.rank == dst.rank "Unwrapping not permitted" + if Sch.SCHED_MOVE[] + # We can either unwrap locally, or return nothing + if dst.rank == MPI.Comm_rank(dst.comm) + return poolget(x.handle) + end + else + # Either we're uniform (so everyone cooperates), or we're unwrapping locally + if !uniform + @assert src.rank == MPI.Comm_rank(src.comm) "Unwrapping not permitted" + @assert src.rank == x.handle.rank == dst.rank + end + return poolget(x.handle; uniform) + end +end + +#FIXME:try to think of a better move! scheme +function execute!(proc::MPIProcessor, world::UInt64, f, args...; kwargs...) + local_rank = MPI.Comm_rank(proc.comm) + tag = to_tag(hash(sch_handle().thunk_id.id, hash(:execute!, UInt(0)))) + islocal = local_rank == proc.rank + inplace_move = f === move! + result = nothing + if islocal || inplace_move + result = execute!(proc.innerProc, world, f, args...; kwargs...) + end + if inplace_move + # move! 
already handles communication + space = memory_space(nothing, proc)::MPIMemorySpace + return tochunk(nothing, proc, space) + else + # Handle communication ourselves + if islocal + T = typeof(result) + space = memory_space(result, proc)::MPIMemorySpace + bcast_send_yield((T, space.innerSpace), proc.comm, proc.rank, tag) + return tochunk(result, proc, space) + else + T, innerSpace = recv_yield(proc.comm, proc.rank, tag) + space = MPIMemorySpace(innerSpace, proc.comm, proc.rank) + return tochunk(nothing, proc, space; type=T) + end + end +end + +accelerate!(::Val{:mpi}) = accelerate!(MPIAcceleration()) + +function initialize_acceleration!(a::MPIAcceleration) + if !MPI.Initialized() + MPI.Init(;threadlevel=:multiple) + end + ctx = Dagger.Sch.eager_context() + sz = MPI.Comm_size(a.comm) + for i in 0:(sz-1) + push!(ctx.procs, MPIOSProc(a.comm, i)) + end + unique!(ctx.procs) +end + +accel_matches_proc(accel::MPIAcceleration, proc::MPIOSProc) = true +accel_matches_proc(accel::MPIAcceleration, proc::MPIClusterProc) = true +accel_matches_proc(accel::MPIAcceleration, proc::MPIProcessor) = true +accel_matches_proc(accel::MPIAcceleration, proc) = false + +function distribute(accel::MPIAcceleration, A::AbstractArray{T,N}, dist::Blocks{N}) where {T,N} + comm = accel.comm + rank = MPI.Comm_rank(comm) + + DA = view(A, dist) + DB = DArray{T,N}(undef, dist, size(A)) + copyto!(DB, DA) + + return DB +end diff --git a/src/mutable.jl b/src/mutable.jl new file mode 100644 index 000000000..1f48ead53 --- /dev/null +++ b/src/mutable.jl @@ -0,0 +1,41 @@ +function _mutable_inner(@nospecialize(f), proc, scope) + result = f() + return Ref(Dagger.tochunk(result, proc, scope)) +end + +""" + mutable(f::Base.Callable; worker, processor, scope) -> Chunk + +Calls `f()` on the specified worker or processor, returning a `Chunk` +referencing the result with the specified scope `scope`. 
+""" +function mutable(@nospecialize(f); worker=nothing, processor=nothing, scope=nothing) + if processor === nothing + if worker === nothing + processor = OSProc() + else + processor = OSProc(worker) + end + else + @assert worker === nothing "mutable: Can't mix worker and processor" + end + if scope === nothing + scope = processor isa OSProc ? ProcessScope(processor) : ExactScope(processor) + end + return fetch(Dagger.@spawn scope=scope _mutable_inner(f, processor, scope))[] +end + +""" + @mutable [worker=1] [processor=OSProc()] [scope=ProcessorScope()] f() + +Helper macro for [`mutable()`](@ref). +""" +macro mutable(exs...) + opts = esc.(exs[1:end-1]) + ex = exs[end] + quote + let f = @noinline ()->$(esc(ex)) + $mutable(f; $(opts...)) + end + end +end diff --git a/src/options.jl b/src/options.jl index eca59fbc9..ee53faa04 100644 --- a/src/options.jl +++ b/src/options.jl @@ -7,6 +7,7 @@ Stores per-task options to be passed to the scheduler. # Arguments - `propagates::Vector{Symbol}`: The set of option names that will be propagated by this task to tasks that it spawns. +- `acceleration::Acceleration`: The acceleration (cluster/network) type to use for this task. - `processor::Processor`: The processor associated with this task's function. Generally ignored by the scheduler. - `compute_scope::AbstractScope`: The execution scope of the task, which determines where the task can be scheduled and executed. `scope` is another name for this option. - `result_scope::AbstractScope`: The data scope of the task's result, which determines where the task's result can be accessed from. @@ -34,6 +35,7 @@ Stores per-task options to be passed to the scheduler. 
Base.@kwdef mutable struct Options propagates::Union{Vector{Symbol},Nothing} = nothing + acceleration::Union{Acceleration,Nothing} = nothing processor::Union{Processor,Nothing} = nothing scope::Union{AbstractScope,Nothing} = nothing compute_scope::Union{AbstractScope,Nothing} = scope @@ -121,6 +123,7 @@ signature `sig`, if the option was previously unspecified in `opts`. """ function populate_defaults!(opts::Options, sig) maybe_default!(opts, Val{:propagates}(), sig) + maybe_default!(opts, Val{:acceleration}(), sig) maybe_default!(opts, Val{:processor}(), sig) maybe_default!(opts, Val{:compute_scope}(), sig) maybe_default!(opts, Val{:result_scope}(), sig) diff --git a/src/processor.jl b/src/processor.jl index ac2e74f14..75e19094d 100644 --- a/src/processor.jl +++ b/src/processor.jl @@ -2,18 +2,6 @@ export OSProc, Context, addprocs!, rmprocs! import Base: @invokelatest -""" - Processor - -An abstract type representing a processing device and associated memory, where -data can be stored and operated on. Subtypes should be immutable, and -instances should compare equal if they represent the same logical processing -device/memory. Subtype instances should be serializable between different -nodes. Subtype instances may contain a "parent" `Processor` to make it easy to -transfer data to/from other types of `Processor` at runtime. 
-""" -abstract type Processor end - const PROCESSOR_CALLBACKS = Dict{Symbol,Any}() const OSPROC_PROCESSOR_CACHE = LockedObject(Dict{Int,Set{Processor}}()) diff --git a/src/queue.jl b/src/queue.jl index 37947a0ac..d55b31e6a 100644 --- a/src/queue.jl +++ b/src/queue.jl @@ -125,7 +125,7 @@ function wait_all(f; check_errors::Bool=false) result = with_options(f; task_queue=queue) for task in queue.tasks if check_errors - fetch(task; raw=true) + fetch(task; move_value=false, unwrap=false, throw_on_error=true) else wait(task) end diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index f0bed125e..a6d252575 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -15,7 +15,7 @@ import Base: @invokelatest import ..Dagger import ..Dagger: Context, Processor, SchedulerOptions, Options, Thunk, WeakThunk, ThunkFuture, ThunkID, DTaskFailedException, Chunk, WeakChunk, OSProc, AnyScope, DefaultScope, InvalidScope, LockedObject, Argument, Signature -import ..Dagger: order, dependents, noffspring, istask, inputs, unwrap_weak_checked, wrap_weak, affinity, tochunk, timespan_start, timespan_finish, procs, move, chunktype, default_enabled, processor, get_processors, get_parent, execute!, rmprocs!, task_processor, constrain, cputhreadtime, maybe_take_or_alloc! +import ..Dagger: order, dependents, noffspring, istask, inputs, unwrap_weak_checked, wrap_weak, affinity, tochunk, timespan_start, timespan_finish, procs, move, chunktype, default_enabled, processor, get_processors, get_parent, root_worker_id, execute!, rmprocs!, task_processor, constrain, cputhreadtime, maybe_take_or_alloc! import ..Dagger: @dagdebug, @safe_lock_spin1, @maybelog, @take_or_alloc! 
import DataStructures: PriorityQueue, enqueue!, dequeue_pair!, peek @@ -25,7 +25,7 @@ import ..Dagger: @reusable, @reusable_dict, @reusable_vector, @reusable_tasks, @ import TimespanLogging import TaskLocalValues: TaskLocalValue -import ScopedValues: @with +import ScopedValues: ScopedValue, @with, with const OneToMany = Dict{Thunk, Set{Thunk}} @@ -56,7 +56,7 @@ Fields: - `cache::WeakKeyDict{Thunk, Any}` - Maps from a finished `Thunk` to it's cached result, often a DRef - `valid::WeakKeyDict{Thunk, Nothing}` - Tracks all `Thunk`s that are in a valid scheduling state - `running::Set{Thunk}` - The set of currently-running `Thunk`s -- `running_on::Dict{Thunk,OSProc}` - Map from `Thunk` to the OS process executing it +- `running_on::Dict{Thunk,Processor}` - Map from `Thunk` to the OS process executing it - `thunk_dict::Dict{Int, WeakThunk}` - Maps from thunk IDs to a `Thunk` - `node_order::Any` - Function that returns the order of a thunk - `equiv_chunks::WeakKeyDict{DRef,Chunk}` - Cache mapping from `DRef` to a `Chunk` which contains it @@ -82,15 +82,15 @@ struct ComputeState ready::Vector{Thunk} valid::Dict{Thunk, Nothing} running::Set{Thunk} - running_on::Dict{Thunk,OSProc} + running_on::Dict{Thunk,Processor} thunk_dict::Dict{Int, WeakThunk} node_order::Any - equiv_chunks::WeakKeyDict{DRef,Chunk} - worker_time_pressure::Dict{Int,Dict{Processor,UInt64}} - worker_storage_pressure::Dict{Int,Dict{Union{StorageResource,Nothing},UInt64}} - worker_storage_capacity::Dict{Int,Dict{Union{StorageResource,Nothing},UInt64}} - worker_loadavg::Dict{Int,NTuple{3,Float64}} - worker_chans::Dict{Int, Tuple{RemoteChannel,RemoteChannel}} + equiv_chunks::WeakKeyDict{Any,Chunk} + worker_time_pressure::Dict{Processor,Dict{Processor,UInt64}} + worker_storage_pressure::Dict{Processor,Dict{Union{StorageResource,Nothing},UInt64}} + worker_storage_capacity::Dict{Processor,Dict{Union{StorageResource,Nothing},UInt64}} + worker_loadavg::Dict{Processor,NTuple{3,Float64}} + 
worker_chans::Dict{Int,Tuple{RemoteChannel,RemoteChannel}} signature_time_cost::Dict{Signature,UInt64} signature_alloc_cost::Dict{Signature,UInt64} transfer_rate::Ref{UInt64} @@ -111,10 +111,10 @@ function start_state(deps::Dict, node_order, chan) Vector{Thunk}(undef, 0), Dict{Thunk, Nothing}(), Set{Thunk}(), - Dict{Thunk,OSProc}(), + Dict{Thunk,Processor}(), Dict{Int, WeakThunk}(), node_order, - WeakKeyDict{DRef,Chunk}(), + WeakKeyDict{Any,Chunk}(), Dict{Int,Dict{Processor,UInt64}}(), Dict{Int,Dict{Union{StorageResource,Nothing},UInt64}}(), Dict{Int,Dict{Union{StorageResource,Nothing},UInt64}}(), @@ -152,29 +152,29 @@ const WORKER_MONITOR_TASKS = Dict{Int,Task}() const WORKER_MONITOR_CHANS = Dict{Int,Dict{UInt64,RemoteChannel}}() function init_proc(state, p, log_sink) ctx = Context(Int[]; log_sink) - @maybelog ctx timespan_start(ctx, :init_proc, (;uid=state.uid, worker=p.pid), nothing) + pid = Dagger.root_worker_id(p) + @maybelog ctx timespan_start(ctx, :init_proc, (;uid=state.uid, worker=pid), nothing) # Initialize pressure and capacity - gproc = OSProc(p.pid) lock(state.lock) do - state.worker_time_pressure[p.pid] = Dict{Processor,UInt64}() + state.worker_time_pressure[p] = Dict{Processor,UInt64}() - state.worker_storage_pressure[p.pid] = Dict{Union{StorageResource,Nothing},UInt64}() - state.worker_storage_capacity[p.pid] = Dict{Union{StorageResource,Nothing},UInt64}() + state.worker_storage_pressure[p] = Dict{Union{StorageResource,Nothing},UInt64}() + state.worker_storage_capacity[p] = Dict{Union{StorageResource,Nothing},UInt64}() #= FIXME for storage in get_storage_resources(gproc) - pressure, capacity = remotecall_fetch(gproc.pid, storage) do storage + pressure, capacity = remotecall_fetch(root_worker_id(gproc), storage) do storage storage_pressure(storage), storage_capacity(storage) end - state.worker_storage_pressure[p.pid][storage] = pressure - state.worker_storage_capacity[p.pid][storage] = capacity + state.worker_storage_pressure[p][storage] = pressure + 
state.worker_storage_capacity[p][storage] = capacity end =# - state.worker_loadavg[p.pid] = (0.0, 0.0, 0.0) + state.worker_loadavg[p] = (0.0, 0.0, 0.0) end - if p.pid != 1 + if pid != 1 lock(WORKER_MONITOR_LOCK) do - wid = p.pid + wid = pid if !haskey(WORKER_MONITOR_TASKS, wid) t = Threads.@spawn begin try @@ -208,16 +208,16 @@ function init_proc(state, p, log_sink) end # Setup worker-to-scheduler channels - inp_chan = RemoteChannel(p.pid) - out_chan = RemoteChannel(p.pid) + inp_chan = RemoteChannel(pid) + out_chan = RemoteChannel(pid) lock(state.lock) do - state.worker_chans[p.pid] = (inp_chan, out_chan) + state.worker_chans[pid] = (inp_chan, out_chan) end # Setup dynamic listener - dynamic_listener!(ctx, state, p.pid) + dynamic_listener!(ctx, state, pid) - @maybelog ctx timespan_finish(ctx, :init_proc, (;uid=state.uid, worker=p.pid), nothing) + @maybelog ctx timespan_finish(ctx, :init_proc, (;uid=state.uid, worker=pid), nothing) end function _cleanup_proc(uid, log_sink) empty!(CHUNK_CACHE) # FIXME: Should be keyed on uid! @@ -235,7 +235,7 @@ function _cleanup_proc(uid, log_sink) end function cleanup_proc(state, p, log_sink) ctx = Context(Int[]; log_sink) - wid = p.pid + wid = root_worker_id(p) @maybelog ctx timespan_start(ctx, :cleanup_proc, (;uid=state.uid, worker=wid), nothing) lock(WORKER_MONITOR_LOCK) do if haskey(WORKER_MONITOR_CHANS, wid) @@ -298,7 +298,7 @@ function compute_dag(ctx::Context, d::Thunk, options=SchedulerOptions()) node_order = x -> -get(ord, x, 0) state = start_state(deps, node_order, chan) - master = OSProc(myid()) + master = Dagger.default_processor() @maybelog ctx timespan_start(ctx, :scheduler_init, (;uid=state.uid), master) try @@ -393,8 +393,8 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options::SchedulerOpt res = tresult.result @dagdebug thunk_id :take "Got finished task" - gproc = OSProc(pid) safepoint(state) + gproc = proc != nothing ? 
get_parent(proc) : OSProc(pid) lock(state.lock) do thunk_failed = false if res isa Exception @@ -421,11 +421,11 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options::SchedulerOpt node = unwrap_weak_checked(state.thunk_dict[thunk_id])::Thunk metadata = tresult.metadata if metadata !== nothing - state.worker_time_pressure[pid][proc] = metadata.time_pressure + state.worker_time_pressure[gproc][proc] = metadata.time_pressure #to_storage = fetch(node.options.storage) #state.worker_storage_pressure[pid][to_storage] = metadata.storage_pressure #state.worker_storage_capacity[pid][to_storage] = metadata.storage_capacity - #state.worker_loadavg[pid] = metadata.loadavg + #state.worker_loadavg[gproc] = metadata.loadavg sig = signature(state, node) state.signature_time_cost[sig] = (metadata.threadtime + get(state.signature_time_cost, sig, 0)) ÷ 2 state.signature_alloc_cost[sig] = (metadata.gc_allocd + get(state.signature_alloc_cost, sig, 0)) ÷ 2 @@ -434,8 +434,8 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options::SchedulerOpt end end if res isa Chunk - if !haskey(state.equiv_chunks, res) - state.equiv_chunks[res.handle::DRef] = res + if !haskey(state.equiv_chunks, res.handle) + state.equiv_chunks[res.handle] = res end end store_result!(state, node, res; error=thunk_failed) @@ -522,7 +522,7 @@ end const CHUNK_CACHE = Dict{Chunk,Dict{Processor,Any}}() struct ScheduleTaskLocation - gproc::OSProc + gproc::Processor proc::Processor end struct ScheduleTaskSpec @@ -546,6 +546,7 @@ end to_fire_cleanup = @reuse_defer_cleanup empty!(to_fire) failed_scheduling = @reusable_vector :schedule!_failed_scheduling Union{Thunk,Nothing} nothing 32 failed_scheduling_cleanup = @reuse_defer_cleanup empty!(failed_scheduling) + # Select a new task and get its options task = nothing @label pop_task @@ -622,9 +623,9 @@ end end @label scope_computed - input_procs = @reusable_vector :schedule!_input_procs Processor OSProc() 32 + input_procs = @reusable_vector 
:schedule!_input_procs Union{Processor,Nothing} nothing 32 input_procs_cleanup = @reuse_defer_cleanup empty!(input_procs) - for proc in Dagger.compatible_processors(scope, procs) + for proc in Dagger.compatible_processors(options.acceleration, scope, procs) if !(proc in input_procs) push!(input_procs, proc) end @@ -656,7 +657,7 @@ end can_use, scope = can_use_proc(state, task, gproc, proc, options, scope) if can_use has_cap, est_time_util, est_alloc_util, est_occupancy = - has_capacity(state, proc, gproc.pid, options.time_util, options.alloc_util, options.occupancy, sig) + has_capacity(state, proc, gproc, options.time_util, options.alloc_util, options.occupancy, sig) if has_cap # Schedule task onto proc # FIXME: est_time_util = est_time_util isa MaxUtilization ? cap : est_time_util @@ -665,10 +666,10 @@ end Vector{ScheduleTaskSpec}() end push!(proc_tasks, ScheduleTaskSpec(task, scope, est_time_util, est_alloc_util, est_occupancy)) - state.worker_time_pressure[gproc.pid][proc] = - get(state.worker_time_pressure[gproc.pid], proc, 0) + + state.worker_time_pressure[gproc][proc] = + get(state.worker_time_pressure[gproc], proc, 0) + est_time_util - @dagdebug task :schedule "Scheduling to $gproc -> $proc (cost: $(costs[proc]), pressure: $(state.worker_time_pressure[gproc.pid][proc]))" + @dagdebug task :schedule "Scheduling to $gproc -> $proc (cost: $(costs[proc]), pressure: $(state.worker_time_pressure[gproc][proc]))" sorted_procs_cleanup() costs_cleanup() @goto pop_task @@ -736,13 +737,13 @@ function monitor_procs_changed!(ctx, state, options) end function remove_dead_proc!(ctx, state, proc, options) - @assert options.single !== proc.pid "Single worker failed, cannot continue." + @assert options.single !== root_worker_id(proc) "Single worker failed, cannot continue." 
rmprocs!(ctx, [proc]) - delete!(state.worker_time_pressure, proc.pid) - delete!(state.worker_storage_pressure, proc.pid) - delete!(state.worker_storage_capacity, proc.pid) - delete!(state.worker_loadavg, proc.pid) - delete!(state.worker_chans, proc.pid) + delete!(state.worker_time_pressure, proc) + delete!(state.worker_storage_pressure, proc) + delete!(state.worker_storage_capacity, proc) + delete!(state.worker_loadavg, proc) + delete!(state.worker_chans, root_worker_id(proc)) end function finish_task!(ctx, state, node, thunk_failed) @@ -785,7 +786,7 @@ end function evict_all_chunks!(ctx, options, to_evict) if !isempty(to_evict) - @sync for w in map(p->p.pid, procs_to_use(ctx, options)) + @sync for w in map(p->root_worker_id(p), procs_to_use(ctx, options)) Threads.@spawn remote_do(evict_chunks!, w, ctx.log_sink, to_evict) end end @@ -856,9 +857,10 @@ Base.hash(task::TaskSpec, h::UInt) = hash(task.thunk_id, hash(TaskSpec, h)) end Tf = chunktype(first(args)) - @assert (options.single === nothing) || (gproc.pid == options.single) + pid = root_worker_id(gproc) + @assert (options.single === nothing) || (pid == options.single) # TODO: Set `sch_handle.tid.ref` to the right `DRef` - sch_handle = SchedulerHandle(ThunkID(thunk.id, nothing), state.worker_chans[gproc.pid]...) + sch_handle = SchedulerHandle(ThunkID(thunk.id, nothing), state.worker_chans[pid]...) # TODO: De-dup common fields (log_sink, uid, etc.) push!(to_send, TaskSpec( @@ -870,7 +872,7 @@ Base.hash(task::TaskSpec, h::UInt) = hash(task.thunk_id, hash(TaskSpec, h)) end if !isempty(to_send) - if Dagger.root_worker_id(gproc) == myid() + if root_worker_id(gproc) == myid() @reusable_tasks :fire_tasks!_task_cache 32 _->nothing "fire_tasks!" FireTaskSpec(proc, state.chan, to_send) else # N.B. 
We don't batch these because we might get a deserialization @@ -1076,7 +1078,7 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re proc_occupancy = istate.proc_occupancy time_pressure = istate.time_pressure - wid = get_parent(to_proc).pid + wid = root_worker_id(to_proc) work_to_do = false while isopen(return_queue) # Wait for new tasks @@ -1151,7 +1153,8 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re end task, occupancy = peek(queue) scope = task.scope - if Dagger.proc_in_scope(to_proc, scope) + accel = task.options.acceleration + if Dagger.proc_in_scope(to_proc, scope) && Dagger.accel_matches_proc(accel, to_proc) typemax(UInt32) - proc_occupancy_cached >= occupancy # Compatible, steal this task return dequeue_pair!(queue) @@ -1357,6 +1360,8 @@ function do_tasks(to_proc, return_queue, tasks) end @dagdebug nothing :processor "Kicked processors" end + +const SCHED_MOVE = ScopedValue{Bool}(false) """ do_task(to_proc, task::TaskSpec) -> Any @@ -1369,13 +1374,15 @@ Executes a single task specified by `task` on `to_proc`. ctx_vars = task.ctx_vars ctx = Context(Processor[]; log_sink=ctx_vars.log_sink, profile=ctx_vars.profile) - from_proc = OSProc() + options = task.options + Dagger.accelerate!(options.acceleration) + + from_proc = Dagger.default_processor() data = task.data Tf = task.Tf f = isdefined(Tf, :instance) ? Tf.instance : nothing # Wait for required resources to become available - options = task.options propagated = get_propagated_options(options) to_storage = options.storage !== nothing ? fetch(options.storage) : MemPool.GLOBAL_DEVICE[] #to_storage_name = nameof(typeof(to_storage)) @@ -1443,7 +1450,7 @@ Executes a single task specified by `task` on `to_proc`. 
@maybelog ctx timespan_finish(ctx, :storage_wait, (;thunk_id, processor=to_proc), (;f, device=typeof(to_storage))) =# - @dagdebug thunk_id :execute "Moving data" + @dagdebug thunk_id :execute "Moving data for $Tf" # Initiate data transfers for function and arguments transfer_time = Threads.Atomic{UInt64}(0) @@ -1501,7 +1508,9 @@ Executes a single task specified by `task` on `to_proc`. end else =# - new_value = @invokelatest move(to_proc, value) + new_value = with(SCHED_MOVE=>true) do + @invokelatest move(to_proc, value) + end #end if new_value !== value @dagdebug thunk_id :move "Moved argument @ $position to $to_proc: $(typeof(value)) -> $(typeof(new_value))" @@ -1546,7 +1555,7 @@ Executes a single task specified by `task` on `to_proc`. # FIXME #gcnum_start = Base.gc_num() - @dagdebug thunk_id :execute "Executing $(typeof(f))" + @dagdebug thunk_id :execute "Executing $Tf" logging_enabled = !(ctx.log_sink isa TimespanLogging.NoOpLog) @@ -1609,7 +1618,7 @@ Executes a single task specified by `task` on `to_proc`. 
notify(TASK_SYNC) end - @dagdebug thunk_id :execute "Returning" + @dagdebug thunk_id :execute "Returning $Tf with $(typeof(result_meta))" # TODO: debug_storage("Releasing $to_storage_name") metadata = ( diff --git a/src/sch/eager.jl b/src/sch/eager.jl index 67e895815..3478863da 100644 --- a/src/sch/eager.jl +++ b/src/sch/eager.jl @@ -31,7 +31,8 @@ function init_eager() sopts = SchedulerOptions(;allow_errors=true) opts = Dagger.Options((;scope=Dagger.ExactScope(Dagger.ThreadProc(1, 1)), occupancy=Dict(Dagger.ThreadProc=>0), - time_util=Dict(Dagger.ThreadProc=>0))) + time_util=Dict(Dagger.ThreadProc=>0), + acceleration=Dagger.DistributedAcceleration())) Dagger.compute(ctx, Dagger._delayed(eager_thunk, opts)(); options=sopts) catch err diff --git a/src/sch/util.jl b/src/sch/util.jl index d3b7a4804..514604b11 100644 --- a/src/sch/util.jl +++ b/src/sch/util.jl @@ -373,7 +373,7 @@ function signature(f, args) value = Dagger.value(arg) if value isa Dagger.DTask # Only occurs via manual usage of signature - value = fetch(value; raw=true) + value = fetch(value; move_value=false, unwrap=false) end if istask(value) throw(ConcurrencyViolationError("Must call `collect_task_inputs!(state, task)` before calling `signature`")) @@ -403,6 +403,7 @@ end function can_use_proc(state, task, gproc, proc, opts, scope) # Check against proclist + pid = Dagger.root_worker_id(gproc) if opts.proclist !== nothing @warn "The `proclist` option is deprecated, please use scopes instead\nSee https://juliaparallel.org/Dagger.jl/stable/scopes/ for details" maxlog=1 if opts.proclist isa Function @@ -421,6 +422,10 @@ function can_use_proc(state, task, gproc, proc, opts, scope) else throw(SchedulingException("proclist must be a Function, Vector, or nothing")) end + if !Dagger.accel_matches_proc(opts.acceleration, proc) + @dagdebug task :scope "Rejected $proc: Not compatible with acceleration ($(opts.acceleration))" + return false, scope + end if scope isa Dagger.InvalidScope @dagdebug task :scope "Rejected
$proc: Not contained in task scope ($scope)" return false, scope @@ -430,8 +435,8 @@ function can_use_proc(state, task, gproc, proc, opts, scope) # Check against single if opts.single !== nothing @warn "The `single` option is deprecated, please use scopes instead\nSee https://juliaparallel.org/Dagger.jl/stable/scopes/ for details" maxlog=1 - if gproc.pid != opts.single - @dagdebug task :scope "Rejected $proc: gproc.pid ($(gproc.pid)) != single ($(opts.single))" + if pid != opts.single + @dagdebug task :scope "Rejected $proc: pid ($(pid)) != single ($(opts.single))" return false, scope end scope = constrain(scope, Dagger.ProcessScope(opts.single)) @@ -583,7 +588,7 @@ end # Add fixed cost for cross-worker task transfer (esimated at 1ms) # TODO: Actually estimate/benchmark this - task_xfer_cost = gproc.pid != myid() ? 1_000_000 : 0 # 1ms + task_xfer_cost = root_worker_id(gproc) != myid() ? 1_000_000 : 0 # 1ms # Compute final cost costs[proc] = est_time_util + (tx_cost/tx_rate) + task_xfer_cost diff --git a/src/scopes.jl b/src/scopes.jl index 79190c292..ff76e121c 100644 --- a/src/scopes.jl +++ b/src/scopes.jl @@ -1,7 +1,5 @@ export AnyScope, DefaultScope, UnionScope, NodeScope, ProcessScope, ExactScope, ProcessorTypeScope -abstract type AbstractScope end - "Widest scope that contains all processors." struct AnyScope <: AbstractScope end proc_in_scope(::Processor, ::AnyScope) = true @@ -97,11 +95,12 @@ ProcessorTypeScope(T, inner_scope=AnyScope()) = Set{AbstractScopeTaint}([ProcessorTypeTaint{T}()])) "Scoped to a specific processor." 
-struct ExactScope <: AbstractScope - parent::ProcessScope +struct ExactScope{P<:AbstractScope} <: AbstractScope + parent::P processor::Processor end -ExactScope(proc) = ExactScope(ProcessScope(get_parent(proc).pid), proc) +ExactScope(proc) = ExactScope(enclosing_scope(get_parent(proc)), proc) +enclosing_scope(proc::OSProc) = ProcessScope(proc.pid) proc_in_scope(proc::Processor, scope::ExactScope) = proc == scope.processor "Indicates that the applied scopes `x` and `y` are incompatible." @@ -457,4 +456,4 @@ function Base.issubset(scope1::AbstractScope, scope2::AbstractScope) proc in scope2_procs || return false end return true -end \ No newline at end of file +end diff --git a/src/shard.jl b/src/shard.jl new file mode 100644 index 000000000..ecd0ee570 --- /dev/null +++ b/src/shard.jl @@ -0,0 +1,89 @@ +""" +Maps a value to one of multiple distributed "mirror" values automatically when +used as a thunk argument. Construct using `@shard` or `shard`. +""" +struct Shard + chunks::Dict{Processor,Chunk} +end + +""" + shard(f; kwargs...) -> Chunk{Shard} + +Executes `f` on all workers in `workers`, wrapping the result in a +process-scoped `Chunk`, and constructs a `Chunk{Shard}` containing all of these +`Chunk`s on the current worker. + +Keyword arguments: +- `procs` -- The list of processors to create pieces on. May be any iterable container of `Processor`s. +- `workers` -- The list of workers to create pieces on. May be any iterable container of `Integer`s. +- `per_thread::Bool=false` -- If `true`, creates a piece per each thread, rather than a piece per each worker. 
+""" +function shard(@nospecialize(f); procs=nothing, workers=nothing, per_thread=false) + if procs === nothing + if workers !== nothing + procs = [OSProc(w) for w in workers] + else + procs = lock(Sch.eager_context()) do + copy(Sch.eager_context().procs) + end + end + if per_thread + _procs = ThreadProc[] + for p in procs + append!(_procs, filter(p->p isa ThreadProc, get_processors(p))) + end + procs = _procs + end + else + if workers !== nothing + throw(ArgumentError("Cannot combine `procs` and `workers`")) + elseif per_thread + throw(ArgumentError("Cannot combine `procs` and `per_thread=true`")) + end + end + isempty(procs) && throw(ArgumentError("Cannot create empty Shard")) + shard_running_dict = Dict{Processor,DTask}() + for proc in procs + scope = proc isa OSProc ? ProcessScope(proc) : ExactScope(proc) + thunk = Dagger.@spawn scope=scope _mutable_inner(f, proc, scope) + shard_running_dict[proc] = thunk + end + shard_dict = Dict{Processor,Chunk}() + for proc in procs + shard_dict[proc] = fetch(shard_running_dict[proc])[] + end + return Shard(shard_dict) +end + +"Creates a `Shard`. See [`Dagger.shard`](@ref) for details." +macro shard(exs...) + opts = esc.(exs[1:end-1]) + ex = exs[end] + quote + let f = @noinline ()->$(esc(ex)) + $shard(f; $(opts...)) + end + end +end + +function move(from_proc::Processor, to_proc::Processor, shard::Shard) + # Match either this proc or some ancestor + # N.B. 
This behavior may bypass the piece's scope restriction + proc = to_proc + if haskey(shard.chunks, proc) + return move(from_proc, to_proc, shard.chunks[proc]) + end + parent = Dagger.get_parent(proc) + while parent != proc + proc = parent + parent = Dagger.get_parent(proc) + if haskey(shard.chunks, proc) + return move(from_proc, to_proc, shard.chunks[proc]) + end + end + + throw(KeyError(to_proc)) +end +Base.iterate(s::Shard) = iterate(values(s.chunks)) +Base.iterate(s::Shard, state) = iterate(values(s.chunks), state) +Base.length(s::Shard) = length(s.chunks) diff --git a/src/thunk.jl b/src/thunk.jl index e13e299f0..c4783762a 100644 --- a/src/thunk.jl +++ b/src/thunk.jl @@ -493,7 +493,7 @@ function _par(mod, ex::Expr; lazy=true, recur=true, opts=()) $spawn($f, $Options(;$(opts...)), $(args...); $(kwargs...)) end if $(Expr(:islocal, sync_var)) - put!($sync_var, schedule(Task(()->fetch($result; raw=true)))) + put!($sync_var, schedule(Task(()->fetch($result; move_value=false, unwrap=false)))) end $result end @@ -578,6 +578,8 @@ function _spawn(args_kwargs, task_options) # Get task queue, and don't let it propagate task_queue = get(scoped_options, :task_queue, DefaultTaskQueue())::AbstractTaskQueue filter!(prop -> prop != :task_queue, propagates) + + # Update propagates from scoped options propagates if task_options.propagates !== nothing append!(task_options.propagates, propagates) else @@ -585,6 +587,11 @@ function _spawn(args_kwargs, task_options) end unique!(task_options.propagates) + # Read task-local acceleration into options + if task_options.acceleration === nothing + task_options.acceleration = current_acceleration() + end + # Construct task spec and handle spec = DTaskSpec(args_kwargs, task_options) task = eager_spawn(spec) diff --git a/src/tochunk.jl b/src/tochunk.jl new file mode 100644 index 000000000..25ae9a965 --- /dev/null +++ b/src/tochunk.jl @@ -0,0 +1,106 @@ +@warn "Update tochunk docstring" maxlog=1 +""" + tochunk(x, proc::Processor, 
scope::AbstractScope; device=nothing, rewrap=false, kwargs...) -> Chunk + +Create a chunk from data `x` which resides on `proc` and which has scope +`scope`. + +`device` specifies a `MemPool.StorageDevice` (which is itself wrapped in a +`Chunk`) which will be used to manage the reference contained in the `Chunk` +generated by this function. If `device` is `nothing` (the default), the data +will be inspected to determine if it's safe to serialize; if so, the default +MemPool storage device will be used; if not, then a `MemPool.CPURAMDevice` will +be used. + +`type` can be specified manually to force the type to be `Chunk{type}`. + +If `rewrap==true` and `x isa Chunk`, then the `Chunk` will be rewrapped in a +new `Chunk`. + +All other kwargs are passed directly to `MemPool.poolset`. +""" +tochunk(x::X, proc::P, space::M; kwargs...) where {X,P<:Processor,M<:MemorySpace} = + tochunk(x, proc, space, AnyScope(); kwargs...) +function tochunk(x::X, proc::P, space::M, scope::S; device=nothing, type=X, rewrap=false, kwargs...) where {X,P<:Processor,S,M<:MemorySpace} + if x isa Chunk + check_proc_space(x, proc, space) + return maybe_rewrap(x, proc, space, scope; type, rewrap) + end + if device === nothing + device = if Sch.walk_storage_safe(x) + MemPool.GLOBAL_DEVICE[] + else + MemPool.CPURAMDevice() + end + end + ref = tochunk_pset(x, space; device, kwargs...) + return Chunk{type,typeof(ref),P,S,typeof(space)}(type, domain(x), ref, proc, scope, space) +end +function tochunk(x::X, proc::P, scope::S; device=nothing, type=X, rewrap=false, kwargs...) where {X,P<:Processor,S} + if device === nothing + device = if Sch.walk_storage_safe(x) + MemPool.GLOBAL_DEVICE[] + else + MemPool.CPURAMDevice() + end + end + if x isa Chunk + space = x.space + check_proc_space(x, proc, space) + return maybe_rewrap(x, proc, space, scope; type, rewrap) + end + space = default_memory_space(current_acceleration(), x) + ref = tochunk_pset(x, space; device, kwargs...) 
+ return Chunk{type,typeof(ref),P,S,typeof(space)}(type, domain(x), ref, proc, scope, space) +end +function tochunk(x::X, space::M, scope::S; device=nothing, type=X, rewrap=false, kwargs...) where {X,M<:MemorySpace,S} + if device === nothing + device = if Sch.walk_storage_safe(x) + MemPool.GLOBAL_DEVICE[] + else + MemPool.CPURAMDevice() + end + end + if x isa Chunk + proc = x.processor + check_proc_space(x, proc, space) + return maybe_rewrap(x, proc, space, scope; type, rewrap) + end + proc = default_processor(current_acceleration(), x) + ref = tochunk_pset(x, space; device, kwargs...) + return Chunk{type,typeof(ref),typeof(proc),S,M}(type, domain(x), ref, proc, scope, space) +end +tochunk(x, procOrSpace; kwargs...) = tochunk(x, procOrSpace, AnyScope(); kwargs...) +tochunk(x; kwargs...) = tochunk(x, default_memory_space(current_acceleration(), x), AnyScope(); kwargs...) + +check_proc_space(x, proc, space) = nothing +function check_proc_space(x::Chunk, proc, space) + if x.space !== space + throw(ArgumentError("Memory space mismatch: Chunk=$(x.space) != Requested=$space")) + end +end +function check_proc_space(x::Thunk, proc, space) + # FIXME: Validate +end +function maybe_rewrap(x, proc, space, scope; type, rewrap) + if rewrap + return remotecall_fetch(x.handle.owner) do + tochunk(MemPool.poolget(x.handle), proc, scope; kwargs...) + end + else + return x + end +end + +tochunk_pset(x, space::MemorySpace; device=nothing, kwargs...) = poolset(x; device, kwargs...) 
+ +function savechunk(data, dir, f) + sz = open(joinpath(dir, f), "w") do io + serialize(io, MemPool.MMWrap(data)) + return position(io) + end + fr = FileRef(f, sz) + proc = OSProc() + scope = AnyScope() # FIXME: Scoped to this node + return Chunk{typeof(data),typeof(fr),typeof(proc),typeof(scope)}(typeof(data), domain(data), fr, proc, scope, true) +end diff --git a/src/types/acceleration.jl b/src/types/acceleration.jl new file mode 100644 index 000000000..b647dd303 --- /dev/null +++ b/src/types/acceleration.jl @@ -0,0 +1 @@ +abstract type Acceleration end \ No newline at end of file diff --git a/src/types/chunk.jl b/src/types/chunk.jl new file mode 100644 index 000000000..9b8102a6d --- /dev/null +++ b/src/types/chunk.jl @@ -0,0 +1,27 @@ +""" + Chunk + +A reference to a piece of data located on a remote worker. `Chunk`s are +typically created with `Dagger.tochunk(data)`, and the data can then be +accessed from any worker with `collect(::Chunk)`. `Chunk`s are +serialization-safe, and use distributed refcounting (provided by +`MemPool.DRef`) to ensure that the data referenced by a `Chunk` won't be GC'd, +as long as a reference exists on some worker. + +Each `Chunk` is associated with a given `Dagger.Processor`, which is (in a +sense) the processor that "owns" or contains the data. Calling +`collect(::Chunk)` will perform data movement and conversions defined by that +processor to safely serialize the data to the calling worker. + +## Constructors +See [`tochunk`](@ref). 
+""" + +mutable struct Chunk{T, H, P<:Processor, S<:AbstractScope, M<:MemorySpace} + chunktype::Type{T} + domain + handle::H + processor::P + scope::S + space::M +end diff --git a/src/types/memory-space.jl b/src/types/memory-space.jl new file mode 100644 index 000000000..247ceccb0 --- /dev/null +++ b/src/types/memory-space.jl @@ -0,0 +1 @@ +abstract type MemorySpace end \ No newline at end of file diff --git a/src/types/processor.jl b/src/types/processor.jl new file mode 100644 index 000000000..e70600b24 --- /dev/null +++ b/src/types/processor.jl @@ -0,0 +1,11 @@ +""" + Processor + +An abstract type representing a processing device and associated memory, where +data can be stored and operated on. Subtypes should be immutable, and +instances should compare equal if they represent the same logical processing +device/memory. Subtype instances should be serializable between different +nodes. Subtype instances may contain a "parent" `Processor` to make it easy to +transfer data to/from other types of `Processor` at runtime. +""" +abstract type Processor end \ No newline at end of file diff --git a/src/types/scope.jl b/src/types/scope.jl new file mode 100644 index 000000000..0197fddf9 --- /dev/null +++ b/src/types/scope.jl @@ -0,0 +1 @@ +abstract type AbstractScope end \ No newline at end of file diff --git a/src/utils/chunks.jl b/src/utils/chunks.jl deleted file mode 100644 index 9f0c3b487..000000000 --- a/src/utils/chunks.jl +++ /dev/null @@ -1,189 +0,0 @@ -### Mutation - -function _mutable_inner(@nospecialize(f), proc, scope) - result = f() - return Ref(Dagger.tochunk(result, proc, scope)) -end - -""" - mutable(f::Base.Callable; worker, processor, scope) -> Chunk - -Calls `f()` on the specified worker or processor, returning a `Chunk` -referencing the result with the specified scope `scope`. 
-""" -function mutable(@nospecialize(f); worker=nothing, processor=nothing, scope=nothing) - if processor === nothing - if worker === nothing - processor = OSProc() - else - processor = OSProc(worker) - end - else - @assert worker === nothing "mutable: Can't mix worker and processor" - end - if scope === nothing - scope = processor isa OSProc ? ProcessScope(processor) : ExactScope(processor) - end - return fetch(Dagger.@spawn scope=scope _mutable_inner(f, processor, scope))[] -end - -""" - @mutable [worker=1] [processor=OSProc()] [scope=ProcessorScope()] f() - -Helper macro for [`mutable()`](@ref). -""" -macro mutable(exs...) - opts = esc.(exs[1:end-1]) - ex = exs[end] - quote - let f = @noinline ()->$(esc(ex)) - $mutable(f; $(opts...)) - end - end -end - -""" -Maps a value to one of multiple distributed "mirror" values automatically when -used as a thunk argument. Construct using `@shard` or `shard`. -""" -struct Shard - chunks::Dict{Processor,Chunk} -end - -""" - shard(f; kwargs...) -> Chunk{Shard} - -Executes `f` on all workers in `workers`, wrapping the result in a -process-scoped `Chunk`, and constructs a `Chunk{Shard}` containing all of these -`Chunk`s on the current worker. - -Keyword arguments: -- `procs` -- The list of processors to create pieces on. May be any iterable container of `Processor`s. -- `workers` -- The list of workers to create pieces on. May be any iterable container of `Integer`s. -- `per_thread::Bool=false` -- If `true`, creates a piece per each thread, rather than a piece per each worker. 
-""" -function shard(@nospecialize(f); procs=nothing, workers=nothing, per_thread=false) - if procs === nothing - if workers !== nothing - procs = [OSProc(w) for w in workers] - else - procs = lock(Sch.eager_context()) do - copy(Sch.eager_context().procs) - end - end - if per_thread - _procs = ThreadProc[] - for p in procs - append!(_procs, filter(p->p isa ThreadProc, get_processors(p))) - end - procs = _procs - end - else - if workers !== nothing - throw(ArgumentError("Cannot combine `procs` and `workers`")) - elseif per_thread - throw(ArgumentError("Cannot combine `procs` and `per_thread=true`")) - end - end - isempty(procs) && throw(ArgumentError("Cannot create empty Shard")) - shard_running_dict = Dict{Processor,DTask}() - for proc in procs - scope = proc isa OSProc ? ProcessScope(proc) : ExactScope(proc) - thunk = Dagger.@spawn scope=scope _mutable_inner(f, proc, scope) - shard_running_dict[proc] = thunk - end - shard_dict = Dict{Processor,Chunk}() - for proc in procs - shard_dict[proc] = fetch(shard_running_dict[proc])[] - end - return Shard(shard_dict) -end - -"Creates a `Shard`. See [`Dagger.shard`](@ref) for details." -macro shard(exs...) - opts = esc.(exs[1:end-1]) - ex = exs[end] - quote - let f = @noinline ()->$(esc(ex)) - $shard(f; $(opts...)) - end - end -end - -function move(from_proc::Processor, to_proc::Processor, shard::Shard) - # Match either this proc or some ancestor - # N.B. 
This behavior may bypass the piece's scope restriction - proc = to_proc - if haskey(shard.chunks, proc) - return move(from_proc, to_proc, shard.chunks[proc]) - end - parent = Dagger.get_parent(proc) - while parent != proc - proc = parent - parent = Dagger.get_parent(proc) - if haskey(shard.chunks, proc) - return move(from_proc, to_proc, shard.chunks[proc]) - end - end - - throw(KeyError(to_proc)) -end -Base.iterate(s::Shard) = iterate(values(s.chunks)) -Base.iterate(s::Shard, state) = iterate(values(s.chunks), state) -Base.length(s::Shard) = length(s.chunks) - -### Core Stuff - -""" - tochunk(x, proc::Processor, scope::AbstractScope; device=nothing, rewrap=false, kwargs...) -> Chunk - -Create a chunk from data `x` which resides on `proc` and which has scope -`scope`. - -`device` specifies a `MemPool.StorageDevice` (which is itself wrapped in a -`Chunk`) which will be used to manage the reference contained in the `Chunk` -generated by this function. If `device` is `nothing` (the default), the data -will be inspected to determine if it's safe to serialize; if so, the default -MemPool storage device will be used; if not, then a `MemPool.CPURAMDevice` will -be used. - -If `rewrap==true` and `x isa Chunk`, then the `Chunk` will be rewrapped in a -new `Chunk`. - -All other kwargs are passed directly to `MemPool.poolset`. -""" -function tochunk(x::X, proc::P=OSProc(), scope::S=AnyScope(); device=nothing, rewrap=false, kwargs...) where {X,P,S} - if device === nothing - device = if Sch.walk_storage_safe(x) - MemPool.GLOBAL_DEVICE[] - else - MemPool.CPURAMDevice() - end - end - ref = poolset(x; device, kwargs...) - Chunk{X,typeof(ref),P,S}(X, domain(x), ref, proc, scope) -end -function tochunk(x::Chunk, proc=nothing, scope=nothing; rewrap=false, kwargs...) - if rewrap - return remotecall_fetch(x.handle.owner) do - tochunk(MemPool.poolget(x.handle), proc, scope; kwargs...) - end - else - return x - end -end -tochunk(x::Thunk, proc=nothing, scope=nothing; kwargs...) 
= x - -root_worker_id(chunk::Chunk) = root_worker_id(chunk.handle) -root_worker_id(dref::DRef) = dref.owner # FIXME: Migration - -function savechunk(data, dir, f) - sz = open(joinpath(dir, f), "w") do io - serialize(io, MemPool.MMWrap(data)) - return position(io) - end - fr = FileRef(f, sz) - proc = OSProc() - scope = AnyScope() # FIXME: Scoped to this node - Chunk{typeof(data),typeof(fr),typeof(proc),typeof(scope)}(typeof(data), domain(data), fr, proc, scope, true) -end diff --git a/src/utils/scopes.jl b/src/utils/scopes.jl index 949ae2276..84aecc179 100644 --- a/src/utils/scopes.jl +++ b/src/utils/scopes.jl @@ -29,14 +29,23 @@ compatible_processors(scope::AbstractScope=get_compute_scope(), ctx::Context=Sch function compatible_processors(scope::AbstractScope, procs::Vector{<:Processor}) compat_procs = Set{Processor}() for gproc in procs - # Fast-path in case entire process is incompatible - gproc_scope = ProcessScope(gproc) - if !isa(constrain(scope, gproc_scope), InvalidScope) - for proc in get_processors(gproc) - if proc_in_scope(proc, scope) - push!(compat_procs, proc) - end - end + for proc in get_processors(gproc) + proc_in_scope(proc, scope) || continue + push!(compat_procs, proc) + end + end + return compat_procs +end +compatible_processors(acceleration::Acceleration, scope::AbstractScope=get_compute_scope(), ctx::Context=Sch.eager_context()) = + compatible_processors(acceleration, scope, procs(ctx)) +function compatible_processors(acceleration::Acceleration, scope::AbstractScope, procs::Vector{<:Processor}) + compat_procs = Set{Processor}() + for gproc in procs + accel_matches_proc(acceleration, gproc) || continue + for proc in get_processors(gproc) + accel_matches_proc(acceleration, proc) || continue + proc_in_scope(proc, scope) || continue + push!(compat_procs, proc) end end return compat_procs diff --git a/src/weakchunk.jl b/src/weakchunk.jl new file mode 100644 index 000000000..e31070536 --- /dev/null +++ b/src/weakchunk.jl @@ -0,0 +1,23 @@ +struct 
WeakChunk + wid::Int + id::Int + x::WeakRef +end + +function WeakChunk(c::Chunk) + return WeakChunk(c.handle.owner, c.handle.id, WeakRef(c)) +end + +unwrap_weak(c::WeakChunk) = c.x.value +function unwrap_weak_checked(c::WeakChunk) + cw = unwrap_weak(c) + @assert cw !== nothing "WeakChunk expired: ($(c.wid), $(c.id))" + return cw +end +wrap_weak(c::Chunk) = WeakChunk(c) +isweak(c::WeakChunk) = true +isweak(c::Chunk) = false +is_task_or_chunk(c::WeakChunk) = true +Serialization.serialize(io::AbstractSerializer, wc::WeakChunk) = + error("Cannot serialize a WeakChunk") +chunktype(c::WeakChunk) = chunktype(unwrap_weak_checked(c)) diff --git a/test/mpi.jl b/test/mpi.jl new file mode 100644 index 000000000..a428e4256 --- /dev/null +++ b/test/mpi.jl @@ -0,0 +1,33 @@ +using Dagger +using MPI + +Dagger.accelerate!(:mpi) +#= +if MPI.Comm_rank(MPI.COMM_WORLD) == 0 + B = rand(4, 4) + Dagger.send_yield(B, MPI.COMM_WORLD, 1, 0) + println("rank $(MPI.Comm_rank(MPI.COMM_WORLD)) B: $B") +else + B = zeros(4, 4) + Dagger.recv_yield!(B, MPI.COMM_WORLD, 0, 0) + println("rank $(MPI.Comm_rank(MPI.COMM_WORLD)) B: $B") +end + +if MPI.Comm_rank(MPI.COMM_WORLD) == 0 + B = "hello" + Dagger.send_yield(B, MPI.COMM_WORLD, 1, 1) + println("rank $(MPI.Comm_rank(MPI.COMM_WORLD)) B: $B") +else + B = "Goodbye" + B1, _ = Dagger.recv_yield!(B, MPI.COMM_WORLD, 0, 1) + println("rank $(MPI.Comm_rank(MPI.COMM_WORLD)) B1: $B1") +end +=# +A = rand(Blocks(2,2), 4, 4) +Ac = collect(A) +println(Ac) + + +#move!(identity, Ac[1].space , Ac[2].space, Ac[1], Ac[2]) + + From 2f72b1e464019913326912c8547cb85fc583e2cc Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Thu, 4 Sep 2025 08:17:10 -0700 Subject: [PATCH 07/24] linalg: Make norm2 work for non-matrices --- src/array/linalg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array/linalg.jl b/src/array/linalg.jl index bfec27aea..553303032 100644 --- a/src/array/linalg.jl +++ b/src/array/linalg.jl @@ -215,4 +215,4 @@ end function 
LinearAlgebra.ldiv!(C::DVecOrMat, A::Union{LowerTriangular{<:Any,<:DMatrix},UnitLowerTriangular{<:Any,<:DMatrix},UpperTriangular{<:Any,<:DMatrix},UnitUpperTriangular{<:Any,<:DMatrix}}, B::DVecOrMat) LinearAlgebra.ldiv!(A, copyto!(C, B)) -end \ No newline at end of file +end From 4b1b185c30aaca6a1f21c4b74eec1a5ff307d543 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Thu, 4 Sep 2025 08:18:29 -0700 Subject: [PATCH 08/24] TEMP DArray: Add GMRES --- src/Dagger.jl | 4 +- src/array/gmres.jl | 152 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+), 2 deletions(-) create mode 100644 src/array/gmres.jl diff --git a/src/Dagger.jl b/src/Dagger.jl index 411d184e1..5e07f9ca8 100644 --- a/src/Dagger.jl +++ b/src/Dagger.jl @@ -10,7 +10,7 @@ import MemPool: DRef, FileRef, poolget, poolset import Base: collect, reduce, view import NextLA import LinearAlgebra -import LinearAlgebra: Adjoint, BLAS, Diagonal, Bidiagonal, Tridiagonal, LAPACK, LU, LowerTriangular, PosDefException, Transpose, UpperTriangular, UnitLowerTriangular, UnitUpperTriangular, diagind, ishermitian, issymmetric, I +import LinearAlgebra: Adjoint, BLAS, Diagonal, Bidiagonal, Tridiagonal, LAPACK, LU, LowerTriangular, PosDefException, Transpose, UpperTriangular, UnitLowerTriangular, UnitUpperTriangular, diagind, ishermitian, issymmetric, I, norm, dot import Random import Random: AbstractRNG @@ -138,7 +138,7 @@ include("array/mul.jl") include("array/cholesky.jl") include("array/trsm.jl") include("array/lu.jl") -include("array/qr.jl") +include("array/gmres.jl") # GPU include("gpu.jl") diff --git a/src/array/gmres.jl b/src/array/gmres.jl new file mode 100644 index 000000000..5127eb314 --- /dev/null +++ b/src/array/gmres.jl @@ -0,0 +1,152 @@ +function gmres(A::DArray, b::DVector; x0=nothing, m=length(b), tol=1e-6, maxiter=100) + """ + GMRES algorithm for solving Ax = b + + Args: + A: coefficient matrix (or function that computes A*v) + b: right-hand side vector + x0: initial guess 
(default: zero vector) + m: restart parameter (default: no restart) + tol: convergence tolerance + maxiter: maximum number of restarts + + Returns: + x: solution vector + residual_norm: final residual norm + iterations: number of iterations + """ + n = length(b) + x = x0 === nothing ? zeros(AutoBlocks(), n) : DArray(copy(x0)) + + # Initial residual + r = b - A * x + β = norm(r) + + if β < tol + return x, β, 0 + end + + for restart in 1:maxiter + # Krylov subspace basis vectors + V = zeros(AutoBlocks(), n, m + 1) + V[:, 1] = r / β + + # Upper Hessenberg matrix + H = zeros(m + 1, m) + + # Givens rotation matrices (store cos and sin) + cs = zeros(m) + sn = zeros(m) + + # RHS for least squares problem + e1 = zeros(AutoBlocks(), m + 1) + e1[1] = β + + # Arnoldi iteration + for j in 1:m + # Apply matrix to current basis vector + w = A * V[:, j] + + # Modified Gram-Schmidt orthogonalization + for i in 1:j + H[i, j] = dot(w, V[:, i]) + w -= H[i, j] * V[:, i] + end + + H[j + 1, j] = norm(w) + + # Check for breakdown + if abs(H[j + 1, j]) < eps() + m = j + break + end + + V[:, j + 1] = w / H[j + 1, j] + + # Apply previous Givens rotations to new column of H + for i in 1:(j-1) + temp = cs[i] * H[i, j] + sn[i] * H[i + 1, j] + H[i + 1, j] = -sn[i] * H[i, j] + cs[i] * H[i + 1, j] + H[i, j] = temp + end + + # Compute new Givens rotation + if abs(H[j + 1, j]) < eps() + cs[j] = 1.0 + sn[j] = 0.0 + else + if abs(H[j + 1, j]) > abs(H[j, j]) + τ = H[j, j] / H[j + 1, j] + sn[j] = 1.0 / sqrt(1 + τ^2) + cs[j] = sn[j] * τ + else + τ = H[j + 1, j] / H[j, j] + cs[j] = 1.0 / sqrt(1 + τ^2) + sn[j] = cs[j] * τ + end + end + + # Apply new Givens rotation + temp = cs[j] * H[j, j] + sn[j] * H[j + 1, j] + H[j + 1, j] = -sn[j] * H[j, j] + cs[j] * H[j + 1, j] + H[j, j] = temp + + # Apply rotation to RHS + temp = cs[j] * e1[j] + sn[j] * e1[j + 1] + e1[j + 1] = -sn[j] * e1[j] + cs[j] * e1[j + 1] + e1[j] = temp + + # Check convergence + residual_norm = abs(e1[j + 1]) + if residual_norm < tol + m = j + 
break + end + end + + # Solve upper triangular system H[1:m, 1:m] * y = e1[1:m] + y = zeros(m) + for i in m:-1:1 + y[i] = e1[i] + for k in (i+1):m + y[i] -= H[i, k] * y[k] + end + y[i] /= H[i, i] + end + + # Update solution + for i in 1:m + x += y[i] * V[:, i] + end + + # Check final convergence + r = b - A * x + β = norm(r) + + if β < tol + return x, β, restart + end + end + + return x, β, maxiter +end + +# Example usage +function example_usage() + # Create test problem + n = 100 + A = DArray(randn(n, n) + 5*I) # Well-conditioned matrix + x_true = randn(AutoBlocks(), n) + b = A * x_true + + # Solve with GMRES + allowscalar(false) do + x_gmres, res_norm, iters = gmres(A, b, tol=1e-10) + end + + println("GMRES converged in $iters iterations") + println("Final residual norm: $res_norm") + println("Solution error: $(norm(x_gmres - x_true))") + + return x_gmres +end From e34a3637f1388028553dee89643a2f13f3cc6b7e Mon Sep 17 00:00:00 2001 From: yanzin00 Date: Tue, 17 Jun 2025 16:22:23 -0300 Subject: [PATCH 09/24] sparse array in-place send/recv --- src/mpi.jl | 154 +++++++++++++++++++++++++++++++--------------------- test/mpi.jl | 75 ++++++++++++++++++------- 2 files changed, 149 insertions(+), 80 deletions(-) diff --git a/src/mpi.jl b/src/mpi.jl index c3eaf0652..26a1ce9f0 100644 --- a/src/mpi.jl +++ b/src/mpi.jl @@ -54,6 +54,7 @@ function aliasing(accel::MPIAcceleration, x::Chunk, T) tag = to_tag(hash(handle.id, hash(:aliasing))) check_uniform(tag) rank = MPI.Comm_rank(accel.comm) + if handle.rank == rank ainfo = aliasing(x, T) #Core.print("[$rank] aliasing: $ainfo, sending\n") @@ -378,6 +379,19 @@ const DEADLOCK_WARN_PERIOD = TaskLocalValue{Float64}(()->10.0) const DEADLOCK_TIMEOUT_PERIOD = TaskLocalValue{Float64}(()->60.0) const RECV_WAITING = Base.Lockable(Dict{Tuple{MPI.Comm, Int, Int}, Base.Event}()) +struct InplaceInfo + type::DataType + shape::Tuple +end +struct InplaceSparseInfo + type::DataType + m::Int + n::Int + colptr::Int + rowval::Int + nzval::Int +end + 
function supports_inplace_mpi(value) if value isa DenseArray && isbitstype(eltype(value)) return true @@ -386,15 +400,11 @@ function supports_inplace_mpi(value) end end function recv_yield!(buffer, comm, src, tag) + rank = MPI.Comm_rank(comm) #Core.println("buffer recv: $buffer, type of buffer: $(typeof(buffer)), is in place? $(supports_inplace_mpi(buffer))") if !supports_inplace_mpi(buffer) return recv_yield(comm, src, tag), false end - time_start = time_ns() - detect = DEADLOCK_DETECT[] - warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) - timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) - rank = MPI.Comm_rank(comm) #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Starting recv! from [$src]") # Ensure no other receiver is waiting @@ -413,46 +423,19 @@ function recv_yield!(buffer, comm, src, tag) wait(other_event) @goto retry end - while true - (got, msg, stat) = MPI.Improbe(src, tag, comm, MPI.Status) - if got - if MPI.Get_error(stat) != MPI.SUCCESS - error("recv_yield (Improbe) failed with error $(MPI.Get_error(stat))") - end - - req = MPI.Imrecv!(MPI.Buffer(buffer), msg) - while true - finish, stat = MPI.Test(req, MPI.Status) - if finish - if MPI.Get_error(stat) != MPI.SUCCESS - error("recv_yield (Test) failed with error $(MPI.Get_error(stat))") - end - - #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Received value") - lock(RECV_WAITING) do waiting - delete!(waiting, (comm, src, tag)) - notify(our_event) - end - #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Released lock") - return value, true - end - warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, rank, tag, "recv", src) - yield() - end - end - warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, rank, tag, "recv", src) - yield() + + buffer = recv_yield_inplace!(buffer, comm, rank, src, tag) + + lock(RECV_WAITING) do waiting + delete!(waiting, (comm, src, tag)) + notify(our_event) end + + return 
buffer, true + end -struct InplaceInfo - type::DataType - shape::Tuple -end + function recv_yield(comm, src, tag) - time_start = time_ns() - detect = DEADLOCK_DETECT[] - warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) - timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) rank = MPI.Comm_rank(comm) #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Starting recv from [$src]") @@ -468,7 +451,7 @@ function recv_yield(comm, src, tag) end end if other_event !== nothing - #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Waiting for other receiver...") + Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Waitingg for other receiver...") wait(other_event) @goto retry end @@ -477,20 +460,23 @@ function recv_yield(comm, src, tag) type = nothing @label receive value = recv_yield_serialized(comm, rank, src, tag) - if value isa InplaceInfo + if value isa InplaceInfo || value isa InplaceSparseInfo value = recv_yield_inplace(value, comm, rank, src, tag) end + lock(RECV_WAITING) do waiting delete!(waiting, (comm, src, tag)) notify(our_event) end return value end -function recv_yield_serialized(comm, my_rank, their_rank, tag) + +function recv_yield_inplace!(array, comm, my_rank, their_rank, tag) time_start = time_ns() detect = DEADLOCK_DETECT[] warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) + while true (got, msg, stat) = MPI.Improbe(their_rank, tag, comm, MPI.Status) if got @@ -498,25 +484,42 @@ function recv_yield_serialized(comm, my_rank, their_rank, tag) error("recv_yield failed with error $(MPI.Get_error(stat))") end count = MPI.Get_count(stat, UInt8) - buf = Array{UInt8}(undef, count) - req = MPI.Imrecv!(MPI.Buffer(buf), msg) + @assert count == sizeof(array) "recv_yield_inplace: expected $(sizeof(array)) bytes, got $count" + buf = MPI.Buffer(array) + req = MPI.Imrecv!(buf, msg) __wait_for_request(req, comm, my_rank, their_rank, tag, "recv_yield", "recv") - return 
MPI.deserialize(buf) + return array end warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, my_rank, tag, "recv", their_rank) yield() end end + function recv_yield_inplace(_value::InplaceInfo, comm, my_rank, their_rank, tag) + T = _value.type + @assert T <: Array && isbitstype(eltype(T)) "recv_yield_inplace only supports inplace MPI transfers of bitstype dense arrays" + array = Array{eltype(T)}(undef, _value.shape) + return recv_yield_inplace!(array, comm, my_rank, their_rank, tag) +end + +function recv_yield_inplace(_value::InplaceSparseInfo, comm, my_rank, their_rank, tag) + T = _value.type + @assert T <: SparseMatrixCSC "recv_yield_inplace only supports inplace MPI transfers of SparseMatrixCSC" + + colptr = recv_yield_inplace!(Vector{Int64}(undef, _value.colptr), comm, my_rank, their_rank, tag) + rowval = recv_yield_inplace!(Vector{Int64}(undef, _value.rowval), comm, my_rank, their_rank, tag) + nzval = recv_yield_inplace!(Vector{eltype(T)}(undef, _value.nzval), comm, my_rank, their_rank, tag) + + return SparseMatrixCSC{eltype(T), Int64}(_value.m, _value.n, colptr, rowval, nzval) + +end + +function recv_yield_serialized(comm, my_rank, their_rank, tag) time_start = time_ns() detect = DEADLOCK_DETECT[] warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) - T = _value.type - @assert T <: Array && isbitstype(eltype(T)) "recv_yield_inplace only supports inplace MPI transfers of bitstype dense arrays" - array = Array{eltype(T)}(undef, _value.shape) - while true (got, msg, stat) = MPI.Improbe(their_rank, tag, comm, MPI.Status) if got @@ -524,17 +527,14 @@ function recv_yield_inplace(_value::InplaceInfo, comm, my_rank, their_rank, tag) error("recv_yield failed with error $(MPI.Get_error(stat))") end count = MPI.Get_count(stat, UInt8) - @assert count == sizeof(array) "recv_yield_inplace: expected $(sizeof(array)) bytes, got $count" - buf = MPI.Buffer(array) - req = 
MPI.Imrecv!(buf, msg) + buf = Array{UInt8}(undef, count) + req = MPI.Imrecv!(MPI.Buffer(buf), msg) __wait_for_request(req, comm, my_rank, their_rank, tag, "recv_yield", "recv") - break + return MPI.deserialize(buf) end warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, my_rank, tag, "recv", their_rank) yield() end - - return array end const SEEN_TAGS = Dict{Int32, Type}() @@ -558,19 +558,27 @@ function _send_yield(value, comm, dest, tag; check_seen::Bool=true, inplace::Boo send_yield_serialized(value, comm, rank, dest, tag) end end + function send_yield_inplace(value, comm, my_rank, their_rank, tag) req = MPI.Isend(value, comm; dest=their_rank, tag) __wait_for_request(req, comm, my_rank, their_rank, tag, "send_yield", "send") end + function send_yield_serialized(value, comm, my_rank, their_rank, tag) if value isa Array && isbitstype(eltype(value)) send_yield_serialized(InplaceInfo(typeof(value), size(value)), comm, my_rank, their_rank, tag) send_yield_inplace(value, comm, my_rank, their_rank, tag) + elseif value isa SparseMatrixCSC && isbitstype(eltype(value)) + send_yield_serialized(InplaceSparseInfo(typeof(value), value.m, value.n, length(value.colptr), length(value.rowval), length(value.nzval)), comm, my_rank, their_rank, tag) + send_yield_inplace(value.colptr, comm, my_rank, their_rank, tag) + send_yield_inplace(value.rowval, comm, my_rank, their_rank, tag) + send_yield_inplace(value.nzval, comm, my_rank, their_rank, tag) else req = MPI.isend(value, comm; dest=their_rank, tag) __wait_for_request(req, comm, my_rank, their_rank, tag, "send_yield", "send") end end + function __wait_for_request(req, comm, my_rank, their_rank, tag, fn::String, kind::String) time_start = time_ns() detect = DEADLOCK_DETECT[] @@ -597,6 +605,26 @@ function bcast_send_yield(value, comm, root, tag) send_yield(value, comm, other_rank, tag) end end + +#= Maybe can be worth it to implement this +function bcast_send_yield!(value, comm, root, tag) + sz = 
MPI.Comm_size(comm) + rank = MPI.Comm_rank(comm) + + for other_rank in 0:(sz-1) + rank == other_rank && continue + #println("[rank $rank] Sending to rank $other_rank") + send_yield!(value, comm, other_rank, tag) + end +end + +function bcast_recv_yield!(value, comm, root, tag) + sz = MPI.Comm_size(comm) + rank = MPI.Comm_rank(comm) + #println("[rank $rank] receive from rank $root") + recv_yield!(value, comm, root, tag) +end +=# function mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, rank, tag, kind, srcdest) time_elapsed = (time_ns() - time_start) if detect && time_elapsed > warn_period @@ -813,7 +841,8 @@ end #FIXME:try to think of a better move! scheme function execute!(proc::MPIProcessor, world::UInt64, f, args...; kwargs...) local_rank = MPI.Comm_rank(proc.comm) - tag = to_tag(hash(sch_handle().thunk_id.id, hash(:execute!, UInt(0)))) + #tag_T = to_tag(hash(sch_handle().thunk_id.id, hash(:execute!, UInt(0)))) + tag_space = to_tag(hash(sch_handle().thunk_id.id, hash(:execute!, UInt(1)))) islocal = local_rank == proc.rank inplace_move = f === move! result = nothing @@ -829,10 +858,13 @@ function execute!(proc::MPIProcessor, world::UInt64, f, args...; kwargs...) 
if islocal T = typeof(result) space = memory_space(result, proc)::MPIMemorySpace - bcast_send_yield((T, space.innerSpace), proc.comm, proc.rank, tag) + T_space = (T, space) + bcast_send_yield(T_space, proc.comm, proc.rank, tag_space) return tochunk(result, proc, space) else - T, innerSpace = recv_yield(proc.comm, proc.rank, tag) + #T = recv_yield(proc.comm, proc.rank, tag_T) + #innerSpace = recv_yield(proc.comm, proc.rank, tag_space) + T, innerSpace = recv_yield(proc.comm, proc.rank, tag_space) space = MPIMemorySpace(innerSpace, proc.comm, proc.rank) return tochunk(nothing, proc, space; type=T) end diff --git a/test/mpi.jl b/test/mpi.jl index a428e4256..a84ffdce1 100644 --- a/test/mpi.jl +++ b/test/mpi.jl @@ -1,33 +1,70 @@ using Dagger using MPI +using LinearAlgebra +using SparseArrays Dagger.accelerate!(:mpi) -#= -if MPI.Comm_rank(MPI.COMM_WORLD) == 0 - B = rand(4, 4) - Dagger.send_yield(B, MPI.COMM_WORLD, 1, 0) - println("rank $(MPI.Comm_rank(MPI.COMM_WORLD)) B: $B") + +comm = MPI.COMM_WORLD +rank = MPI.Comm_rank(comm) +size = MPI.Comm_size(comm) + +# Use a large array (adjust size as needed for your RAM) +N = 100 +tag = 123 + +if rank == 0 + arr = sprand(N, N, 0.6) else - B = zeros(4, 4) - Dagger.recv_yield!(B, MPI.COMM_WORLD, 0, 0) - println("rank $(MPI.Comm_rank(MPI.COMM_WORLD)) B: $B") + arr = spzeros(N, N) end -if MPI.Comm_rank(MPI.COMM_WORLD) == 0 - B = "hello" - Dagger.send_yield(B, MPI.COMM_WORLD, 1, 1) - println("rank $(MPI.Comm_rank(MPI.COMM_WORLD)) B: $B") -else - B = "Goodbye" - B1, _ = Dagger.recv_yield!(B, MPI.COMM_WORLD, 0, 1) - println("rank $(MPI.Comm_rank(MPI.COMM_WORLD)) B1: $B1") +# --- Out-of-place broadcast --- +function bcast_outofplace() + MPI.Barrier(comm) + if rank == 0 + Dagger.bcast_send_yield(arr, comm, 0, tag+1) + else + Dagger.bcast_recv_yield(comm, 0, tag+1) + end + MPI.Barrier(comm) end -=# +# --- In-place broadcast --- + +function bcast_inplace() + MPI.Barrier(comm) + if rank == 0 + Dagger.bcast_send_yield!(arr, comm, 0, tag) + 
else + Dagger.bcast_recv_yield!(arr, comm, 0, tag) + end + MPI.Barrier(comm) +end + +function bcast_inplace_metadata() + MPI.Barrier(comm) + if rank == 0 + Dagger.bcast_send_yield_metadata(arr, comm, 0) + end + MPI.Barrier(comm) +end + + +inplace = @time bcast_inplace() + + +MPI.Barrier(comm) +MPI.Finalize() + + + + +#= A = rand(Blocks(2,2), 4, 4) Ac = collect(A) println(Ac) -#move!(identity, Ac[1].space , Ac[2].space, Ac[1], Ac[2]) - +move!(identity, Ac[1].space , Ac[2].space, Ac[1], Ac[2]) +=# From 353335dd3c2575e1efdb56e85ce63c2df45674ab Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Sat, 4 Oct 2025 08:46:07 -0700 Subject: [PATCH 10/24] fixup! sparse array in-place send/recv --- src/mpi.jl | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/mpi.jl b/src/mpi.jl index 26a1ce9f0..49a96f99b 100644 --- a/src/mpi.jl +++ b/src/mpi.jl @@ -451,7 +451,7 @@ function recv_yield(comm, src, tag) end end if other_event !== nothing - Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Waitingg for other receiver...") + #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Waiting for other receiver...") wait(other_event) @goto retry end @@ -511,7 +511,6 @@ function recv_yield_inplace(_value::InplaceSparseInfo, comm, my_rank, their_rank nzval = recv_yield_inplace!(Vector{eltype(T)}(undef, _value.nzval), comm, my_rank, their_rank, tag) return SparseMatrixCSC{eltype(T), Int64}(_value.m, _value.n, colptr, rowval, nzval) - end function recv_yield_serialized(comm, my_rank, their_rank, tag) @@ -841,8 +840,7 @@ end #FIXME:try to think of a better move! scheme function execute!(proc::MPIProcessor, world::UInt64, f, args...; kwargs...) 
local_rank = MPI.Comm_rank(proc.comm) - #tag_T = to_tag(hash(sch_handle().thunk_id.id, hash(:execute!, UInt(0)))) - tag_space = to_tag(hash(sch_handle().thunk_id.id, hash(:execute!, UInt(1)))) + tag = to_tag(hash(sch_handle().thunk_id.id, hash(:execute!, UInt(0)))) islocal = local_rank == proc.rank inplace_move = f === move! result = nothing @@ -858,13 +856,11 @@ function execute!(proc::MPIProcessor, world::UInt64, f, args...; kwargs...) if islocal T = typeof(result) space = memory_space(result, proc)::MPIMemorySpace - T_space = (T, space) - bcast_send_yield(T_space, proc.comm, proc.rank, tag_space) + T_space = (T, space.innerSpace) + bcast_send_yield(T_space, proc.comm, proc.rank, tag) return tochunk(result, proc, space) else - #T = recv_yield(proc.comm, proc.rank, tag_T) - #innerSpace = recv_yield(proc.comm, proc.rank, tag_space) - T, innerSpace = recv_yield(proc.comm, proc.rank, tag_space) + T, innerSpace = recv_yield(proc.comm, proc.rank, tag) space = MPIMemorySpace(innerSpace, proc.comm, proc.rank) return tochunk(nothing, proc, space; type=T) end From 2ac8cb359c2d2b104fca9d14d07cc50f3ff22132 Mon Sep 17 00:00:00 2001 From: yanzin00 Date: Mon, 29 Sep 2025 14:59:44 +0000 Subject: [PATCH 11/24] DArray: Restrict copyto! scope to destination --- src/array/copy.jl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/array/copy.jl b/src/array/copy.jl index 7ed815daf..d032525f9 100644 --- a/src/array/copy.jl +++ b/src/array/copy.jl @@ -119,7 +119,14 @@ function darray_copyto!(B::DArray{TB,NB}, A::DArray{TA,NA}, Binds=parentindices( Arange_local = Arange_global_clamped .- CartesianIndex(Arange_start) .+ CartesianIndex{Nmax}(1) # Perform local view copy - Dagger.@spawn copyto_view!(Out(Bpart), Brange_local, In(Apart), Arange_local) + space = (Bpart isa DTask ? 
fetch(Bpart; move_value=false, unwrap=false) : Bpart).space + procs = processors(space) + scope = UnionScope([ExactScope(proc) for proc in procs]) + check_uniform(space) + for proc in procs + check_uniform(proc) + end + Dagger.@spawn scope = scope copyto_view!(Out(Bpart), Brange_local, In(Apart), Arange_local) end end end From 77003cf52f1809caf806cff9102d3202e8090dcd Mon Sep 17 00:00:00 2001 From: yanzin00 Date: Mon, 29 Sep 2025 15:07:40 +0000 Subject: [PATCH 12/24] Add opcounter debugging tool --- src/datadeps/aliasing.jl | 2 ++ src/datadeps/remainders.jl | 3 +++ src/utils/dagdebug.jl | 19 ++++++++----------- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/datadeps/aliasing.jl b/src/datadeps/aliasing.jl index 57ebee404..425d9efcd 100644 --- a/src/datadeps/aliasing.jl +++ b/src/datadeps/aliasing.jl @@ -434,6 +434,8 @@ end function merge_history!(state::DataDepsState, arg_w::ArgumentWrapper, other_arg_w::ArgumentWrapper) history = state.arg_history[arg_w] for (other_ainfo, other_space, write_num) in state.arg_history[other_arg_w] + @opcounter :merge_history + @opcounter :merge_history_complexity length(history) idx = findfirst(h->h[3] > write_num, history) if idx === nothing if isempty(history) diff --git a/src/datadeps/remainders.jl b/src/datadeps/remainders.jl index 0ac90aa78..5cf8a2621 100644 --- a/src/datadeps/remainders.jl +++ b/src/datadeps/remainders.jl @@ -147,6 +147,7 @@ function compute_remainder_for_arg!(state::DataDepsState, # FIXME: This is a hack to ensure that we don't miss any history generated by aliasing(...) for (_, space, _) in state.arg_history[arg_w] if !in(space, spaces) + @opcounter :compute_remainder_for_arg_restart @goto restart end end @@ -200,6 +201,7 @@ function compute_remainder_for_arg!(state::DataDepsState, # Only subtract, this data is already up-to-date in target_space # N.B. We don't add to syncdeps here, because we'll see this ainfo # in get_write_deps! 
+ @opcounter :compute_remainder_for_arg_subtract subtract_spans!(remainder, other_many_spans) continue end @@ -210,6 +212,7 @@ function compute_remainder_for_arg!(state::DataDepsState, tracker_other_space = get!(tracker, other_space) do (Vector{Tuple{LocalMemorySpan,LocalMemorySpan}}(), Set{ThunkSyncdep}()) end + @opcounter :compute_remainder_for_arg_schedule schedule_remainder!(tracker_other_space[1], other_space_idx, target_space_idx, remainder, other_many_spans) get_read_deps!(state, other_space, other_ainfo, write_num, tracker_other_space[2]) end diff --git a/src/utils/dagdebug.jl b/src/utils/dagdebug.jl index 873e47e79..a4d9bba1d 100644 --- a/src/utils/dagdebug.jl +++ b/src/utils/dagdebug.jl @@ -36,27 +36,24 @@ macro dagdebug(thunk, category, msg, args...) end) end -# FIXME: Calculate fast-growth based on clock time, not iteration +@warn "Make this threadsafe by putting counter into Module" maxlog=1 +@warn "Calculate fast-growth based on clock time, not iteration" maxlog=1 const OPCOUNTER_CATEGORIES = Symbol[] const OPCOUNTER_FAST_GROWTH_THRESHOLD = Ref(10_000_000) -struct OpCounter - value::Threads.Atomic{Int} -end -OpCounter() = OpCounter(Threads.Atomic{Int}(0)) +const OPCOUNTERS = Dict{Symbol,Threads.Atomic{Int}}() macro opcounter(category, count=1) cat_sym = category.value @gensym old - opcounter_sym = Symbol(:OPCOUNTER_, cat_sym) - if !isdefined(__module__, opcounter_sym) - __module__.eval(:(#=const=# $opcounter_sym = OpCounter())) - end esc(quote if $(QuoteNode(cat_sym)) in $OPCOUNTER_CATEGORIES - $old = Threads.atomic_add!($opcounter_sym.value, Int($count)) + if !haskey($OPCOUNTERS, $(QuoteNode(cat_sym))) + $OPCOUNTERS[$(QuoteNode(cat_sym))] = Threads.Atomic{Int}(0) + end + $old = Threads.atomic_add!($OPCOUNTERS[$(QuoteNode(cat_sym))], Int($count)) if $old > 1 && (mod1($old, $OPCOUNTER_FAST_GROWTH_THRESHOLD[]) == 1 || $count > $OPCOUNTER_FAST_GROWTH_THRESHOLD[]) println("Fast-growing counter: $($(QuoteNode(cat_sym))) = $($old)") end end end) end 
-opcounter(mod::Module, category::Symbol) = getfield(mod, Symbol(:OPCOUNTER_, category)).value[] \ No newline at end of file +opcounters() = Dict(cat=>OPCOUNTERS[cat][] for cat in keys(OPCOUNTERS)) \ No newline at end of file From f2ff6f2fd1362d19bfe19fbcab4ad27abe8c72e0 Mon Sep 17 00:00:00 2001 From: yanzin00 Date: Sat, 4 Oct 2025 15:17:01 +0000 Subject: [PATCH 13/24] Add largest value tracker tool --- src/utils/dagdebug.jl | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/utils/dagdebug.jl b/src/utils/dagdebug.jl index a4d9bba1d..b305f85ce 100644 --- a/src/utils/dagdebug.jl +++ b/src/utils/dagdebug.jl @@ -56,4 +56,16 @@ macro opcounter(category, count=1) end end) end -opcounters() = Dict(cat=>OPCOUNTERS[cat][] for cat in keys(OPCOUNTERS)) \ No newline at end of file +opcounters() = Dict(cat=>OPCOUNTERS[cat][] for cat in keys(OPCOUNTERS)) + +const LARGEST_VALUE_COUNTER = Ref(0) +function largest_value_update!(value) + prev = LARGEST_VALUE_COUNTER[] + if value > prev + LARGEST_VALUE_COUNTER[] = value + if value - prev > 10_000 || value > 1_000_000 + println("Largest value growing: $value") + end + end +end +largest_value_counter() = LARGEST_VALUE_COUNTER[] \ No newline at end of file From 5ffeb88e84cbe492e38b01844f07c647974940ae Mon Sep 17 00:00:00 2001 From: yanzin00 Date: Mon, 29 Sep 2025 15:09:18 +0000 Subject: [PATCH 14/24] MPI: Make check_uniform more useful --- src/mpi.jl | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/src/mpi.jl b/src/mpi.jl index 49a96f99b..81f436e0a 100644 --- a/src/mpi.jl +++ b/src/mpi.jl @@ -1,11 +1,11 @@ using MPI -const CHECK_UNIFORMITY = TaskLocalValue{Bool}(()->false) +const CHECK_UNIFORMITY = Ref{Bool}(false) function check_uniformity!(check::Bool=true) CHECK_UNIFORMITY[] = check end -function check_uniform(value::Integer) - CHECK_UNIFORMITY[] || return +function check_uniform(value::Integer, original=value) + 
CHECK_UNIFORMITY[] || return true comm = MPI.COMM_WORLD rank = MPI.Comm_rank(comm) matched = compare_all(value, comm) @@ -13,14 +13,15 @@ function check_uniform(value::Integer) if rank == 0 Core.print("[$rank] Found non-uniform value!\n") end - Core.print("[$rank] value=$value\n") + Core.print("[$rank] value=$value, original=$original") throw(ArgumentError("Non-uniform value")) end MPI.Barrier(comm) + return matched end -function check_uniform(value) - CHECK_UNIFORMITY[] || return - check_uniform(hash(value)) +function check_uniform(value, original=value) + CHECK_UNIFORMITY[] || return true + return check_uniform(hash(value), original) end function compare_all(value, comm) @@ -109,9 +110,9 @@ end ProcessScope(p::MPIOSProc) = ProcessScope(myid()) -function check_uniform(proc::MPIOSProc) - check_uniform(hash(MPIOSProc)) - check_uniform(proc.rank) +function check_uniform(proc::MPIOSProc, original=proc) + return check_uniform(hash(MPIOSProc), original) && + check_uniform(proc.rank, original) end function memory_spaces(proc::MPIOSProc) @@ -190,11 +191,11 @@ proc_in_scope(proc::Processor, scope::MPIProcessScope) = false proc_in_scope(proc::MPIProcessor, scope::MPIProcessScope) = proc.comm == scope.comm && proc.rank == scope.rank -function check_uniform(proc::MPIProcessor) - check_uniform(hash(MPIProcessor)) - check_uniform(proc.rank) - # TODO: Not always valid (if pointer is embedded, say for GPUs) - check_uniform(hash(proc.innerProc)) +function check_uniform(proc::MPIProcessor, original=proc) + return check_uniform(hash(MPIProcessor), original) && + check_uniform(proc.rank, original) && + # TODO: Not always valid (if pointer is embedded, say for GPUs) + check_uniform(hash(proc.innerProc), original) end Dagger.iscompatible_func(::MPIProcessor, opts, ::Any) = true @@ -240,10 +241,10 @@ struct MPIMemorySpace{S<:MemorySpace} <: MemorySpace rank::Int end -function check_uniform(space::MPIMemorySpace) - check_uniform(space.rank) - # TODO: Not always valid (if pointer is 
embedded, say for GPUs) - check_uniform(hash(space.innerSpace)) +function check_uniform(space::MPIMemorySpace, original=space) + return check_uniform(space.rank, original) && + # TODO: Not always valid (if pointer is embedded, say for GPUs) + check_uniform(hash(space.innerSpace), original) end default_processor(space::MPIMemorySpace) = MPIOSProc(space.comm, space.rank) @@ -292,10 +293,10 @@ struct MPIRefID end end -function check_uniform(ref::MPIRefID) - check_uniform(ref.tid) - check_uniform(ref.uid) - check_uniform(ref.id) +function check_uniform(ref::MPIRefID, original=ref) + return check_uniform(ref.tid, original) && + check_uniform(ref.uid, original) && + check_uniform(ref.id, original) end const MPIREF_TID = Dict{Int, Threads.Atomic{Int}}() @@ -312,9 +313,9 @@ root_worker_id(ref::MPIRef) = myid() @warn "Move this definition somewhere else" maxlog=1 root_worker_id(ref::DRef) = ref.owner -function check_uniform(ref::MPIRef) - check_uniform(ref.rank) - check_uniform(ref.id) +function check_uniform(ref::MPIRef, original=ref) + return check_uniform(ref.rank, original) && + check_uniform(ref.id, original) end move(from_proc::Processor, to_proc::Processor, x::MPIRef) = From 85e0b801ae8dc8e2ba7df8e784af7ec10c951d1d Mon Sep 17 00:00:00 2001 From: yanzin00 Date: Mon, 29 Sep 2025 15:05:35 +0000 Subject: [PATCH 15/24] MPI: Optimizations and fix some uniformity issues --- src/datadeps/aliasing.jl | 96 +++++++++++++++++++++++++++++++------- src/datadeps/queue.jl | 24 ++++++++-- src/datadeps/remainders.jl | 53 +++++++++++++++------ src/mpi.jl | 76 ++++++++++++++++-------------- 4 files changed, 178 insertions(+), 71 deletions(-) diff --git a/src/datadeps/aliasing.jl b/src/datadeps/aliasing.jl index 425d9efcd..aec83d039 100644 --- a/src/datadeps/aliasing.jl +++ b/src/datadeps/aliasing.jl @@ -222,9 +222,11 @@ function unwrap_inout(arg) end _identity_hash(arg, h::UInt=UInt(0)) = ismutable(arg) ? 
objectid(arg) : hash(arg, h) +_identity_hash(arg::Chunk, h::UInt=UInt(0)) = hash(arg.handle, hash(Chunk, h)) _identity_hash(arg::SubArray, h::UInt=UInt(0)) = hash(arg.indices, hash(arg.offset1, hash(arg.stride1, _identity_hash(arg.parent, h)))) _identity_hash(arg::CartesianIndices, h::UInt=UInt(0)) = hash(arg.indices, hash(typeof(arg), h)) +@warn "Dispatch bcast behavior on acceleration" maxlog=1 struct ArgumentWrapper arg dep_mod @@ -233,15 +235,27 @@ struct ArgumentWrapper function ArgumentWrapper(arg, dep_mod) h = hash(dep_mod) h = _identity_hash(arg, h) + check_uniform(h, arg) return new(arg, dep_mod, h) end end Base.hash(aw::ArgumentWrapper) = hash(ArgumentWrapper, aw.hash) Base.:(==)(aw1::ArgumentWrapper, aw2::ArgumentWrapper) = aw1.hash == aw2.hash +Base.isequal(aw1::ArgumentWrapper, aw2::ArgumentWrapper) = + aw1.hash == aw2.hash + +struct HistoryEntry + ainfo::AliasingWrapper + space::MemorySpace + write_num::Int +end @warn "Switch ArgumentWrapper to contain just the argument, and add DependencyWrapper" maxlog=1 struct DataDepsState + # The mapping of original raw argument to its Chunk + raw_arg_to_chunk::IdDict{Any,Chunk} + # The origin memory space of each argument # Used to track the original location of an argument, for final copy-from arg_origin::IdDict{Any,MemorySpace} @@ -261,7 +275,7 @@ struct DataDepsState # The history of writes (direct or indirect) to each argument and dep_mod, in terms of ainfos directly written to, and the memory space they were written to # Updated when a new write happens on an overlapping ainfo # Used by remainder copies to track which portions of an argument and dep_mod were written to elsewhere, through another argument - arg_history::Dict{ArgumentWrapper,Vector{Tuple{AliasingWrapper,MemorySpace,Int}}} + arg_history::Dict{ArgumentWrapper,Vector{HistoryEntry}} # The mapping of memory space and argument to the memory space of the last direct write # Used by remainder copies to lookup the "backstop" if any portion of the 
target ainfo is not updated by the remainder @@ -299,6 +313,7 @@ struct DataDepsState @warn "aliasing=false is no longer supported, aliasing is now always enabled" maxlog=1 end + arg_to_chunk = IdDict{Any,Chunk}() arg_origin = IdDict{Any,MemorySpace}() remote_args = Dict{MemorySpace,IdDict{Any,Any}}() remote_arg_to_original = IdDict{Any,Any}() @@ -306,7 +321,7 @@ struct DataDepsState arg_owner = Dict{ArgumentWrapper,MemorySpace}() arg_overlaps = Dict{ArgumentWrapper,Set{ArgumentWrapper}}() ainfo_backing_chunk = Dict{MemorySpace,Dict{AbstractAliasing,Chunk}}() - arg_history = Dict{ArgumentWrapper,Vector{Tuple{AliasingWrapper,MemorySpace,Int}}}() + arg_history = Dict{ArgumentWrapper,Vector{HistoryEntry}}() supports_inplace_cache = IdDict{Any,Bool}() ainfo_cache = Dict{ArgumentWrapper,AliasingWrapper}() @@ -316,7 +331,7 @@ struct DataDepsState ainfos_owner = Dict{AliasingWrapper,Union{Pair{DTask,Int},Nothing}}() ainfos_readers = Dict{AliasingWrapper,Vector{Pair{DTask,Int}}}() - return new(arg_origin, remote_args, remote_arg_to_original, ainfo_arg, arg_owner, arg_overlaps, ainfo_backing_chunk, arg_history, + return new(arg_to_chunk, arg_origin, remote_args, remote_arg_to_original, ainfo_arg, arg_owner, arg_overlaps, ainfo_backing_chunk, arg_history, supports_inplace_cache, ainfo_cache, ainfos_overlaps, ainfos_owner, ainfos_readers) end end @@ -374,14 +389,36 @@ function populate_task_info!(state::DataDepsState, spec::DTaskSpec, task::DTask) # Skip non-aliasing arguments type_may_alias(typeof(arg)) || continue + # Skip arguments not supporting in-place move + supports_inplace_move(state, arg) || continue + + # Generate a Chunk for the argument if necessary + if haskey(state.raw_arg_to_chunk, arg) + arg = state.raw_arg_to_chunk[arg] + else + if !(arg isa Chunk) + new_arg = with(MPI_UID=>task.uid) do + tochunk(arg) + end + state.raw_arg_to_chunk[arg] = new_arg + arg = new_arg + else + state.raw_arg_to_chunk[arg] = arg + end + end + # Track the origin space of the argument 
origin_space = memory_space(arg) + check_uniform(origin_space) state.arg_origin[arg] = origin_space state.remote_arg_to_original[arg] = arg # Populate argument info for all aliasing dependencies for (dep_mod, _, _) in deps + # Generate an ArgumentWrapper for the argument aw = ArgumentWrapper(arg, dep_mod) + + # Populate argument info populate_argument_info!(state, aw, origin_space) end end @@ -399,7 +436,7 @@ function populate_argument_info!(state::DataDepsState, arg_w::ArgumentWrapper, o state.arg_overlaps[arg_w] = Set{ArgumentWrapper}() end if !haskey(state.arg_history, arg_w) - state.arg_history[arg_w] = Vector{Tuple{AliasingWrapper,MemorySpace,Int}}() + state.arg_history[arg_w] = Vector{HistoryEntry}() end # Calculate the ainfo (which will populate ainfo structures and merge history) @@ -433,18 +470,45 @@ function populate_ainfo!(state::DataDepsState, original_arg_w::ArgumentWrapper, end function merge_history!(state::DataDepsState, arg_w::ArgumentWrapper, other_arg_w::ArgumentWrapper) history = state.arg_history[arg_w] - for (other_ainfo, other_space, write_num) in state.arg_history[other_arg_w] - @opcounter :merge_history - @opcounter :merge_history_complexity length(history) - idx = findfirst(h->h[3] > write_num, history) - if idx === nothing - if isempty(history) - idx = 1 - else - idx = length(history) + 1 + @opcounter :merge_history + @opcounter :merge_history_complexity length(history) + largest_value_update!(length(history)) + origin_space = state.arg_origin[other_arg_w.arg] + for other_entry in state.arg_history[other_arg_w] + write_num_tuple = HistoryEntry(AliasingWrapper(NoAliasing()), origin_space, other_entry.write_num) + range = searchsorted(history, write_num_tuple; by=x->x.write_num) + if !isempty(range) + # Find and skip duplicates + match = false + for source_idx in range + source_entry = history[source_idx] + if source_entry.ainfo == other_entry.ainfo && + source_entry.space == other_entry.space && + source_entry.write_num == 
other_entry.write_num + match = true + break + end end + match && continue + + # Insert at the first position + idx = first(range) + else + # Insert at the last position + idx = length(history) + 1 + end + insert!(history, idx, other_entry) + end +end +function truncate_history!(state::DataDepsState, arg_w::ArgumentWrapper) + if haskey(state.arg_history, arg_w) && length(state.arg_history[arg_w]) > 100000 + origin_space = state.arg_origin[arg_w.arg] + @opcounter :truncate_history + _, last_idx = compute_remainder_for_arg!(state, origin_space, arg_w, 0; compute_syncdeps=false) + if last_idx > 0 + @opcounter :truncate_history_removed last_idx + deleteat!(state.arg_history[arg_w], 1:last_idx) end - insert!(history, idx, (other_ainfo, other_space, write_num)) end end @@ -518,12 +582,12 @@ function add_writer!(state::DataDepsState, arg_w::ArgumentWrapper, dest_space::M empty!(state.arg_history[arg_w]) # Add our own history - push!(state.arg_history[arg_w], (ainfo, dest_space, write_num)) + push!(state.arg_history[arg_w], HistoryEntry(ainfo, dest_space, write_num)) # Find overlapping arguments and update their history for other_arg_w in state.arg_overlaps[arg_w] other_arg_w == arg_w && continue - push!(state.arg_history[other_arg_w], (ainfo, dest_space, write_num)) + push!(state.arg_history[other_arg_w], HistoryEntry(ainfo, dest_space, write_num)) end # Record the last place we were fully written to diff --git a/src/datadeps/queue.jl b/src/datadeps/queue.jl index 4e68ecbca..38b9c9c69 100644 --- a/src/datadeps/queue.jl +++ b/src/datadeps/queue.jl @@ -365,6 +365,10 @@ function distribute_tasks!(queue::DataDepsTaskQueue) if !type_may_alias(typeof(arg)) || !supports_inplace_move(state, arg) return [(ArgumentWrapper(arg, identity), false, false)] end + + # Get the Chunk for the argument + arg = state.raw_arg_to_chunk[arg] + arg_ws = Tuple{ArgumentWrapper,Bool,Bool}[] for (dep_mod, readdep, writedep) in deps push!(arg_ws, (ArgumentWrapper(arg, dep_mod), readdep, writedep)) @@ 
-373,6 +377,13 @@ function distribute_tasks!(queue::DataDepsTaskQueue) end task_arg_ws = task_arg_ws::Vector{Vector{Tuple{ArgumentWrapper,Bool,Bool}}} + # Truncate the history for each argument + for arg_ws in task_arg_ws + for (arg_w, _, _) in arg_ws + truncate_history!(state, arg_w) + end + end + # Copy args from local to remote for (idx, arg_ws) in enumerate(task_arg_ws) arg = first(arg_ws)[1].arg @@ -396,13 +407,13 @@ function distribute_tasks!(queue::DataDepsTaskQueue) arg_remote = get_or_generate_slot!(state, our_space, arg) for (arg_w, _, _) in arg_ws dep_mod = arg_w.dep_mod - remainder = compute_remainder_for_arg!(state, our_space, arg_w, write_num) + remainder, _ = compute_remainder_for_arg!(state, our_space, arg_w, write_num) if remainder isa MultiRemainderAliasing enqueue_remainder_copy_to!(state, our_space, arg_w, remainder, value(f), idx, our_scope, task, write_num) elseif remainder isa FullCopy enqueue_copy_to!(state, our_space, arg_w, value(f), idx, our_scope, task, write_num) else - @assert remainder isa NoAliasing + @assert remainder isa NoAliasing "Expected NoAliasing, got $(typeof(remainder))" @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Skipped copy-to (up-to-date): $our_space" end end @@ -473,10 +484,13 @@ function distribute_tasks!(queue::DataDepsTaskQueue) end # Copy args from remote to local - for arg_w in keys(state.arg_owner) + # N.B. We sort the keys to ensure a deterministic order for uniformity + check_uniform(length(state.arg_owner)) + for arg_w in sort(collect(keys(state.arg_owner)); by=arg_w->arg_w.hash) + check_uniform(arg_w) arg = arg_w.arg origin_space = state.arg_origin[arg] - remainder = compute_remainder_for_arg!(state, origin_space, arg_w, write_num) + remainder, _ = compute_remainder_for_arg!(state, origin_space, arg_w, write_num) if remainder isa MultiRemainderAliasing origin_scope = UnionScope(map(ExactScope, collect(processors(origin_space)))...) 
enqueue_remainder_copy_from!(state, origin_space, arg_w, remainder, origin_scope, write_num) @@ -484,7 +498,7 @@ function distribute_tasks!(queue::DataDepsTaskQueue) origin_scope = UnionScope(map(ExactScope, collect(processors(origin_space)))...) enqueue_copy_from!(state, origin_space, arg_w, origin_scope, write_num) else - @assert remainder isa NoAliasing + @assert remainder isa NoAliasing "Expected NoAliasing, got $(typeof(remainder))" @dagdebug nothing :spawn_datadeps "Skipped copy-from (up-to-date): $origin_space" end end diff --git a/src/datadeps/remainders.jl b/src/datadeps/remainders.jl index 5cf8a2621..0d1c65f6f 100644 --- a/src/datadeps/remainders.jl +++ b/src/datadeps/remainders.jl @@ -121,7 +121,7 @@ and returned. function compute_remainder_for_arg!(state::DataDepsState, target_space::MemorySpace, arg_w::ArgumentWrapper, - write_num::Int) + write_num::Int; compute_syncdeps::Bool=true) @label restart # Determine all memory spaces of the history @@ -129,8 +129,8 @@ function compute_remainder_for_arg!(state::DataDepsState, push!(spaces_set, target_space) owner_space = state.arg_owner[arg_w] push!(spaces_set, owner_space) - for (_, space, _) in state.arg_history[arg_w] - push!(spaces_set, space) + for entry in state.arg_history[arg_w] + push!(spaces_set, entry.space) end spaces = collect(spaces_set) N = length(spaces) @@ -145,20 +145,22 @@ function compute_remainder_for_arg!(state::DataDepsState, nspans = length(first(target_ainfos)) # FIXME: This is a hack to ensure that we don't miss any history generated by aliasing(...) 
- for (_, space, _) in state.arg_history[arg_w] - if !in(space, spaces) + for entry in state.arg_history[arg_w] + if !in(entry.space, spaces) @opcounter :compute_remainder_for_arg_restart @goto restart end end + check_uniform(spaces) + check_uniform(target_ainfos) # We may only need to schedule a full copy from the origin space to the # target space if this is the first time we've written to `arg_w` if isempty(state.arg_history[arg_w]) if owner_space != target_space - return FullCopy() + return FullCopy(), 0 else - return NoAliasing() + return NoAliasing(), 0 end end @@ -171,19 +173,25 @@ function compute_remainder_for_arg!(state::DataDepsState, # Walk backwards through the history of writes to this target # other_ainfo is the overlapping ainfo that was written to # other_space is the memory space of the overlapping ainfo + last_idx = length(state.arg_history[arg_w]) for idx in length(state.arg_history[arg_w]):-1:0 if isempty(remainder) # All done! + last_idx = idx break end if idx > 0 - (other_ainfo, other_space, _) = state.arg_history[arg_w][idx] + other_entry = state.arg_history[arg_w][idx] + other_ainfo = other_entry.ainfo + other_space = other_entry.space else # If we've reached the end of the history, evaluate ourselves other_ainfo = aliasing!(state, owner_space, arg_w) other_space = owner_space end + check_uniform(other_ainfo) + check_uniform(other_space) # Lookup all memory spans for arg_w in these spaces other_remote_arg_w = state.ainfo_arg[other_ainfo] @@ -197,6 +205,9 @@ function compute_remainder_for_arg!(state::DataDepsState, nspans = length(first(other_ainfos)) other_many_spans = [ManyMemorySpan{N}(ntuple(i -> other_ainfos[i][j], N)) for j in 1:nspans] + check_uniform(other_many_spans) + check_uniform(spaces) + if other_space == target_space # Only subtract, this data is already up-to-date in target_space # N.B. 
We don't add to syncdeps here, because we'll see this ainfo @@ -214,19 +225,27 @@ function compute_remainder_for_arg!(state::DataDepsState, end @opcounter :compute_remainder_for_arg_schedule schedule_remainder!(tracker_other_space[1], other_space_idx, target_space_idx, remainder, other_many_spans) - get_read_deps!(state, other_space, other_ainfo, write_num, tracker_other_space[2]) + if compute_syncdeps + @assert haskey(state.ainfos_owner, other_ainfo) "[idx $idx] ainfo $(typeof(other_ainfo)) has no owner" + get_read_deps!(state, other_space, other_ainfo, write_num, tracker_other_space[2]) + end end if isempty(tracker) - return NoAliasing() + return NoAliasing(), 0 end - # Return scheduled copies + # Return scheduled copies and the index of the last ainfo we considered mra = MultiRemainderAliasing() - for (space, (spans, syncdeps)) in tracker - push!(mra.remainders, RemainderAliasing(space, spans, syncdeps)) + for space in spaces + if haskey(tracker, space) + spans, syncdeps = tracker[space] + if !isempty(spans) + push!(mra.remainders, RemainderAliasing(space, spans, syncdeps)) + end + end end - return mra + return mra, last_idx end ### Memory Span Set Operations for Remainder Computation @@ -260,6 +279,9 @@ Enqueues a copy operation to update the remainder regions of an object before a function enqueue_remainder_copy_to!(state::DataDepsState, dest_space::MemorySpace, arg_w::ArgumentWrapper, remainder_aliasing::MultiRemainderAliasing, f, idx, dest_scope, task, write_num::Int) for remainder in remainder_aliasing.remainders + check_uniform(remainder.space) + @assert !isempty(remainder.spans) + check_uniform(remainder.spans) enqueue_remainder_copy_to!(state, dest_space, arg_w, remainder, f, idx, dest_scope, task, write_num) end end @@ -304,6 +326,9 @@ Enqueues a copy operation to update the remainder regions of an object back to t function enqueue_remainder_copy_from!(state::DataDepsState, dest_space::MemorySpace, arg_w::ArgumentWrapper, 
remainder_aliasing::MultiRemainderAliasing, dest_scope, write_num::Int) for remainder in remainder_aliasing.remainders + check_uniform(remainder.space) + @assert !isempty(remainder.spans) + check_uniform(remainder.spans) enqueue_remainder_copy_from!(state, dest_space, arg_w, remainder, dest_scope, write_num) end end diff --git a/src/mpi.jl b/src/mpi.jl index 81f436e0a..a0750599e 100644 --- a/src/mpi.jl +++ b/src/mpi.jl @@ -59,6 +59,7 @@ function aliasing(accel::MPIAcceleration, x::Chunk, T) if handle.rank == rank ainfo = aliasing(x, T) #Core.print("[$rank] aliasing: $ainfo, sending\n") + @opcounter :aliasing_bcast_send_yield bcast_send_yield(ainfo, accel.comm, handle.rank, tag) else #Core.print("[$rank] aliasing: receiving from $(handle.rank)\n") @@ -292,6 +293,8 @@ struct MPIRefID return new(tid, uid, id) end end +Base.hash(id::MPIRefID, h::UInt=UInt(0)) = + hash(id.tid, hash(id.uid, hash(id.id, hash(MPIRefID, h)))) function check_uniform(ref::MPIRefID, original=ref) return check_uniform(ref.tid, original) && @@ -309,6 +312,7 @@ mutable struct MPIRef innerRef::Union{DRef, Nothing} id::MPIRefID end +Base.hash(ref::MPIRef, h::UInt=UInt(0)) = hash(ref.id, hash(MPIRef, h)) root_worker_id(ref::MPIRef) = myid() @warn "Move this definition somewhere else" maxlog=1 root_worker_id(ref::DRef) = ref.owner @@ -560,11 +564,13 @@ function _send_yield(value, comm, dest, tag; check_seen::Bool=true, inplace::Boo end function send_yield_inplace(value, comm, my_rank, their_rank, tag) + @opcounter :send_yield_inplace req = MPI.Isend(value, comm; dest=their_rank, tag) __wait_for_request(req, comm, my_rank, their_rank, tag, "send_yield", "send") end function send_yield_serialized(value, comm, my_rank, their_rank, tag) + @opcounter :send_yield_serialized if value isa Array && isbitstype(eltype(value)) send_yield_serialized(InplaceInfo(typeof(value), size(value)), comm, my_rank, their_rank, tag) send_yield_inplace(value, comm, my_rank, their_rank, tag) @@ -598,6 +604,7 @@ function 
__wait_for_request(req, comm, my_rank, their_rank, tag, fn::String, kin end function bcast_send_yield(value, comm, root, tag) + @opcounter :bcast_send_yield sz = MPI.Comm_size(comm) rank = MPI.Comm_rank(comm) for other_rank in 0:(sz-1) @@ -646,6 +653,7 @@ function MemPool.poolget(ref::MPIRef; uniform::Bool=false) tag = to_tag(hash(ref.id, hash(:poolget))) if ref.rank == MPI.Comm_rank(ref.comm) value = poolget(ref.innerRef) + @opcounter :poolget_bcast_send_yield bcast_send_yield(value, ref.comm, ref.rank, tag) return value else @@ -705,12 +713,12 @@ function move!(dep_mod::RemainderAliasing{<:MPIMemorySpace}, to_space::MPIMemory end # Send the spans - send_yield(len, to_space.comm, to_space.rank, tag) + #send_yield(len, to_space.comm, to_space.rank, tag) send_yield!(copies, to_space.comm, to_space.rank, tag; check_seen=false) #send_yield(copies, to_space.comm, to_space.rank, tag) elseif local_rank == to_space.rank # Receive the spans - len = recv_yield(from_space.comm, from_space.rank, tag) + len = sum(span_tuple->span_len(span_tuple[1]), dep_mod.spans) copies = Vector{UInt8}(undef, len) recv_yield!(copies, from_space.comm, from_space.rank, tag) #copies = recv_yield(from_space.comm, from_space.rank, tag) @@ -760,50 +768,45 @@ function remotecall_endpoint(f, accel::Dagger.MPIAcceleration, from_proc, to_pro loc_rank = MPI.Comm_rank(accel.comm) task = DATADEPS_CURRENT_TASK[] return with(MPI_UID=>task.uid, MPI_UNIFORM=>true) do - if data isa Chunk - tag = to_tag(hash(data.handle.id)) - space = memory_space(data) - if space.rank != from_proc.rank - # If the data is already where it needs to be - @assert space.rank == to_proc.rank - if space.rank == loc_rank - value = poolget(data.handle) - data_converted = f(move(from_proc.innerProc, to_proc.innerProc, value)) - return tochunk(data_converted, to_proc, to_space) - else - T = move_type(from_proc.innerProc, to_proc.innerProc, chunktype(data)) - T_new = f !== identity ? 
Base._return_type(f, Tuple{T}) : T - @assert isconcretetype(T_new) "Return type inference failed, expected concrete type, got $T -> $T_new" - return tochunk(nothing, to_proc, to_space; type=T_new) - end - end - - # The data is on the source rank - @assert space.rank == from_proc.rank - if loc_rank == from_proc.rank == to_proc.rank + @assert data isa Chunk "Expected Chunk, got $(typeof(data))" + tag = to_tag(hash(data.handle.id)) + space = memory_space(data) + if space.rank != from_proc.rank + # If the data is already where it needs to be + @assert space.rank == to_proc.rank + if space.rank == loc_rank value = poolget(data.handle) data_converted = f(move(from_proc.innerProc, to_proc.innerProc, value)) return tochunk(data_converted, to_proc, to_space) - elseif loc_rank == from_proc.rank - value = poolget(data.handle) - data_moved = move(from_proc.innerProc, to_proc.innerProc, value) - Dagger.send_yield(data_moved, accel.comm, to_proc.rank, tag) - # FIXME: This is wrong to take typeof(data_moved), because the type may change - return tochunk(nothing, to_proc, to_space; type=typeof(data_moved)) - elseif loc_rank == to_proc.rank - data_moved = Dagger.recv_yield(accel.comm, from_space.rank, tag) - data_converted = f(move(from_proc.innerProc, to_proc.innerProc, data_moved)) - return tochunk(data_converted, to_proc, to_space) else T = move_type(from_proc.innerProc, to_proc.innerProc, chunktype(data)) T_new = f !== identity ? 
Base._return_type(f, Tuple{T}) : T @assert isconcretetype(T_new) "Return type inference failed, expected concrete type, got $T -> $T_new" return tochunk(nothing, to_proc, to_space; type=T_new) end - else - error("We shouldn't call f here, if we're not the destination rank") - data_converted = f(move(from_proc, to_proc, data)) + end + + # The data is on the source rank + @assert space.rank == from_proc.rank + if loc_rank == from_proc.rank == to_proc.rank + value = poolget(data.handle) + data_converted = f(move(from_proc.innerProc, to_proc.innerProc, value)) return tochunk(data_converted, to_proc, to_space) + elseif loc_rank == from_proc.rank + value = poolget(data.handle) + data_moved = move(from_proc.innerProc, to_proc.innerProc, value) + Dagger.send_yield(data_moved, accel.comm, to_proc.rank, tag) + # FIXME: This is wrong to take typeof(data_moved), because the type may change + return tochunk(nothing, to_proc, to_space; type=typeof(data_moved)) + elseif loc_rank == to_proc.rank + data_moved = Dagger.recv_yield(accel.comm, from_space.rank, tag) + data_converted = f(move(from_proc.innerProc, to_proc.innerProc, data_moved)) + return tochunk(data_converted, to_proc, to_space) + else + T = move_type(from_proc.innerProc, to_proc.innerProc, chunktype(data)) + T_new = f !== identity ? Base._return_type(f, Tuple{T}) : T + @assert isconcretetype(T_new) "Return type inference failed, expected concrete type, got $T -> $T_new" + return tochunk(nothing, to_proc, to_space; type=T_new) end end end @@ -858,6 +861,7 @@ function execute!(proc::MPIProcessor, world::UInt64, f, args...; kwargs...) 
T = typeof(result) space = memory_space(result, proc)::MPIMemorySpace T_space = (T, space.innerSpace) + @opcounter :execute_bcast_send_yield bcast_send_yield(T_space, proc.comm, proc.rank, tag) return tochunk(result, proc, space) else From e8e7eb82ee0172943e82e10357b950183230337e Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Thu, 30 Oct 2025 17:36:25 -0300 Subject: [PATCH 16/24] MPI: de-hashing the tags based on uniformity --- src/datadeps/queue.jl | 623 ++++++++++++++++++++----------------- src/datadeps/remainders.jl | 18 +- src/mpi.jl | 51 ++- src/options.jl | 2 + 4 files changed, 377 insertions(+), 317 deletions(-) diff --git a/src/datadeps/queue.jl b/src/datadeps/queue.jl index 38b9c9c69..47e405d68 100644 --- a/src/datadeps/queue.jl +++ b/src/datadeps/queue.jl @@ -1,8 +1,23 @@ + +const TAG_WAITING = Base.Lockable(Ref{UInt32}(1)) +function to_tag() + intask = Dagger.in_task() + opts = Dagger.get_options() + if intask + return Dagger.get_tls().task_spec.options.tag::UInt32 + end + lock(TAG_WAITING) do counter_ref + tag = counter_ref[] + counter_ref[] = tag + 1 > MPI.tag_ub() ? 
1 : tag + 1 + return tag + end +end + struct DataDepsTaskQueue <: AbstractTaskQueue # The queue above us upper_queue::AbstractTaskQueue # The set of tasks that have already been seen - seen_tasks::Union{Vector{Pair{DTaskSpec,DTask}},Nothing} + seen_tasks::Union{Vector{DTaskPair},Nothing} # The data-dependency graph of all tasks g::Union{SimpleDiGraph{Int},Nothing} # The mapping from task to graph ID @@ -20,7 +35,7 @@ struct DataDepsTaskQueue <: AbstractTaskQueue traversal::Symbol=:inorder, scheduler::Symbol=:naive, aliasing::Bool=true) - seen_tasks = Pair{DTaskSpec,DTask}[] + seen_tasks = DTaskPair[] g = SimpleDiGraph() task_to_id = Dict{DTask,Int}() return new(upper_queue, seen_tasks, g, task_to_id, traversal, scheduler, @@ -28,11 +43,11 @@ struct DataDepsTaskQueue <: AbstractTaskQueue end end -function enqueue!(queue::DataDepsTaskQueue, spec::Pair{DTaskSpec,DTask}) - push!(queue.seen_tasks, spec) +function enqueue!(queue::DataDepsTaskQueue, pair::DTaskPair) + push!(queue.seen_tasks, pair) end -function enqueue!(queue::DataDepsTaskQueue, specs::Vector{Pair{DTaskSpec,DTask}}) - append!(queue.seen_tasks, specs) +function enqueue!(queue::DataDepsTaskQueue, pairs::Vector{DTaskPair}) + append!(queue.seen_tasks, pairs) end const DATADEPS_CURRENT_TASK = TaskLocalValue{Union{DTask,Nothing}}(Returns(nothing)) @@ -192,314 +207,362 @@ function distribute_tasks!(queue::DataDepsTaskQueue) # Start launching tasks and necessary copies write_num = 1 proc_idx = 1 - pressures = Dict{Processor,Int}() + #pressures = Dict{Processor,Int}() proc_to_scope_lfu = BasicLFUCache{Processor,AbstractScope}(1024) - for (spec, task) in queue.seen_tasks[task_order] - DATADEPS_CURRENT_TASK[] = task - - # Populate all task dependencies - populate_task_info!(state, spec, task) - - scheduler = queue.scheduler - if scheduler == :naive - raw_args = map(arg->tochunk(value(arg)), spec.fargs) - our_proc = remotecall_fetch(1, all_procs, raw_args) do all_procs, raw_args - Sch.init_eager() - sch_state = 
Sch.EAGER_STATE[] - - @lock sch_state.lock begin - # Calculate costs per processor and select the most optimal - # FIXME: This should consider any already-allocated slots, - # whether they are up-to-date, and if not, the cost of moving - # data to them - procs, costs = Sch.estimate_task_costs(sch_state, all_procs, nothing, raw_args) - return first(procs) - end - end - elseif scheduler == :smart - raw_args = map(filter(arg->haskey(state.data_locality, value(arg)), spec.fargs)) do arg - arg_chunk = tochunk(value(arg)) - # Only the owned slot is valid - # FIXME: Track up-to-date copies and pass all of those - return arg_chunk => data_locality[arg] - end - f_chunk = tochunk(value(spec.fargs[1])) - our_proc, task_pressure = remotecall_fetch(1, all_procs, pressures, f_chunk, raw_args) do all_procs, pressures, f, chunks_locality - Sch.init_eager() - sch_state = Sch.EAGER_STATE[] - - @lock sch_state.lock begin - tx_rate = sch_state.transfer_rate[] - - costs = Dict{Processor,Float64}() - for proc in all_procs - # Filter out chunks that are already local - chunks_filt = Iterators.filter(((chunk, space)=chunk_locality)->!(proc in processors(space)), chunks_locality) - - # Estimate network transfer costs based on data size - # N.B. `affinity(x)` really means "data size of `x`" - # N.B. 
We treat same-worker transfers as having zero transfer cost - tx_cost = Sch.impute_sum(affinity(chunk)[2] for chunk in chunks_filt) - - # Estimate total cost to move data and get task running after currently-scheduled tasks - est_time_util = get(pressures, proc, UInt64(0)) - costs[proc] = est_time_util + (tx_cost/tx_rate) - end - - # Look up estimated task cost - sig = Sch.signature(sch_state, f, map(first, chunks_locality)) - task_pressure = get(sch_state.signature_time_cost, sig, 1000^3) - - # Shuffle procs around, so equally-costly procs are equally considered - P = randperm(length(all_procs)) - procs = getindex.(Ref(all_procs), P) - - # Sort by lowest cost first - sort!(procs, by=p->costs[p]) - - best_proc = first(procs) - return best_proc, task_pressure - end - end - # FIXME: Pressure should be decreased by pressure of syncdeps on same processor - pressures[our_proc] = get(pressures, our_proc, UInt64(0)) + task_pressure - elseif scheduler == :ultra - args = Base.mapany(spec.fargs) do arg - pos, data = arg - data, _ = unwrap_inout(data) - if data isa DTask - data = fetch(data; move_value=false, unwrap=false) - end - return pos => tochunk(data) + for pair in queue.seen_tasks[task_order] + spec = pair.spec + task = pair.task + write_num, proc_idx = distribute_task!(queue, state, all_procs, spec, task, spec.fargs, proc_to_scope_lfu, write_num, proc_idx) + end + + # Copy args from remote to local + # N.B. We sort the keys to ensure a deterministic order for uniformity + check_uniform(length(state.arg_owner)) + for arg_w in sort(collect(keys(state.arg_owner)); by=arg_w->arg_w.hash) + check_uniform(arg_w) + arg = arg_w.arg + origin_space = state.arg_origin[arg] + remainder, _ = compute_remainder_for_arg!(state, origin_space, arg_w, write_num) + if remainder isa MultiRemainderAliasing + origin_scope = UnionScope(map(ExactScope, collect(processors(origin_space)))...) 
+ enqueue_remainder_copy_from!(state, origin_space, arg_w, remainder, origin_scope, write_num) + elseif remainder isa FullCopy + origin_scope = UnionScope(map(ExactScope, collect(processors(origin_space)))...) + enqueue_copy_from!(state, origin_space, arg_w, origin_scope, write_num) + else + @assert remainder isa NoAliasing "Expected NoAliasing, got $(typeof(remainder))" + @dagdebug nothing :spawn_datadeps "Skipped copy-from (up-to-date): $origin_space" + end + end +end +struct DataDepsTaskDependency + arg_w::ArgumentWrapper + readdep::Bool + writedep::Bool +end +DataDepsTaskDependency(arg, dep) = + DataDepsTaskDependency(ArgumentWrapper(arg, dep[1]), dep[2], dep[3]) +struct DataDepsTaskArgument + arg + pos::ArgPosition + may_alias::Bool + inplace_move::Bool + deps::Vector{DataDepsTaskDependency} +end +struct TypedDataDepsTaskArgument{T,N} + arg::T + pos::ArgPosition + may_alias::Bool + inplace_move::Bool + deps::NTuple{N,DataDepsTaskDependency} +end +map_or_ntuple(f, xs::Vector) = map(f, 1:length(xs)) +map_or_ntuple(f, xs::Tuple) = ntuple(f, length(xs)) +function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_procs, spec::DTaskSpec{typed}, task::DTask, fargs, proc_to_scope_lfu, write_num::Int, proc_idx::Int) where typed + @specialize spec fargs + + DATADEPS_CURRENT_TASK[] = task + + if typed + fargs::Tuple + else + fargs::Vector{Argument} + end + + scheduler = queue.scheduler + if scheduler == :naive + raw_args = map(arg->tochunk(value(arg)), spec.fargs) + our_proc = remotecall_fetch(1, all_procs, raw_args) do all_procs, raw_args + Sch.init_eager() + sch_state = Sch.EAGER_STATE[] + + @lock sch_state.lock begin + # Calculate costs per processor and select the most optimal + # FIXME: This should consider any already-allocated slots, + # whether they are up-to-date, and if not, the cost of moving + # data to them + procs, costs = Sch.estimate_task_costs(sch_state, all_procs, nothing, raw_args) + return first(procs) end - f_chunk = 
tochunk(value(spec.fargs[1])) - task_time = remotecall_fetch(1, f_chunk, args) do f, args - Sch.init_eager() - sch_state = Sch.EAGER_STATE[] - return @lock sch_state.lock begin - sig = Sch.signature(sch_state, f, args) - return get(sch_state.signature_time_cost, sig, 1000^3) + end + elseif scheduler == :smart + raw_args = map(filter(arg->haskey(state.data_locality, value(arg)), spec.fargs)) do arg + arg_chunk = tochunk(value(arg)) + # Only the owned slot is valid + # FIXME: Track up-to-date copies and pass all of those + return arg_chunk => data_locality[arg] + end + f_chunk = tochunk(value(spec.fargs[1])) + our_proc, task_pressure = remotecall_fetch(1, all_procs, pressures, f_chunk, raw_args) do all_procs, pressures, f, chunks_locality + Sch.init_eager() + sch_state = Sch.EAGER_STATE[] + + @lock sch_state.lock begin + tx_rate = sch_state.transfer_rate[] + + costs = Dict{Processor,Float64}() + for proc in all_procs + # Filter out chunks that are already local + chunks_filt = Iterators.filter(((chunk, space)=chunk_locality)->!(proc in processors(space)), chunks_locality) + + # Estimate network transfer costs based on data size + # N.B. `affinity(x)` really means "data size of `x`" + # N.B. 
We treat same-worker transfers as having zero transfer cost + tx_cost = Sch.impute_sum(affinity(chunk)[2] for chunk in chunks_filt) + + # Estimate total cost to move data and get task running after currently-scheduled tasks + est_time_util = get(pressures, proc, UInt64(0)) + costs[proc] = est_time_util + (tx_cost/tx_rate) end - end - # FIXME: Copy deps are computed eagerly - deps = @something(spec.options.syncdeps, Set{Any}()) + # Look up estimated task cost + sig = Sch.signature(sch_state, f, map(first, chunks_locality)) + task_pressure = get(sch_state.signature_time_cost, sig, 1000^3) - # Find latest time-to-completion of all syncdeps - deps_completed = UInt64(0) - for dep in deps - haskey(sstate.task_completions, dep) || continue # copy deps aren't recorded - deps_completed = max(deps_completed, sstate.task_completions[dep]) - end + # Shuffle procs around, so equally-costly procs are equally considered + P = randperm(length(all_procs)) + procs = getindex.(Ref(all_procs), P) - # Find latest time-to-completion of each memory space - # FIXME: Figure out space completions based on optimal packing - spaces_completed = Dict{MemorySpace,UInt64}() - for space in exec_spaces - completed = UInt64(0) - for (task, other_space) in sstate.assignments - space == other_space || continue - completed = max(completed, sstate.task_completions[task]) - end - spaces_completed[space] = completed - end + # Sort by lowest cost first + sort!(procs, by=p->costs[p]) - # Choose the earliest-available memory space and processor - # FIXME: Consider move time - move_time = UInt64(0) - local our_space_completed - while true - our_space_completed, our_space = findmin(spaces_completed) - our_space_procs = filter(proc->proc in all_procs, processors(our_space)) - if isempty(our_space_procs) - delete!(spaces_completed, our_space) - continue - end - our_proc = rand(our_space_procs) - break + best_proc = first(procs) + return best_proc, task_pressure end - - sstate.task_to_spec[task] = spec - 
sstate.assignments[task] = our_space - sstate.task_completions[task] = our_space_completed + move_time + task_time - elseif scheduler == :roundrobin - our_proc = all_procs[proc_idx] - else - error("Invalid scheduler: $sched") end - @assert our_proc in all_procs - our_space = only(memory_spaces(our_proc)) - - # Find the scope for this task (and its copies) - task_scope = @something(spec.options.compute_scope, spec.options.scope, DefaultScope()) - if task_scope == scope - # Optimize for the common case, cache the proc=>scope mapping - our_scope = get!(proc_to_scope_lfu, our_proc) do - our_procs = filter(proc->proc in all_procs, collect(processors(our_space))) - return constrain(UnionScope(map(ExactScope, our_procs)...), scope) + # FIXME: Pressure should be decreased by pressure of syncdeps on same processor + pressures[our_proc] = get(pressures, our_proc, UInt64(0)) + task_pressure + elseif scheduler == :ultra + args = Base.mapany(spec.fargs) do arg + pos, data = arg + data, _ = unwrap_inout(data) + if data isa DTask + data = fetch(data; move_value=false, unwrap=false) end - else - # Use the provided scope and constrain it to the available processors - our_procs = filter(proc->proc in all_procs, collect(processors(our_space))) - our_scope = constrain(UnionScope(map(ExactScope, our_procs)...), task_scope) + return pos => tochunk(data) end - if our_scope isa InvalidScope - throw(Sch.SchedulingException("Scopes are not compatible: $(our_scope.x), $(our_scope.y)")) - end - check_uniform(our_proc) - check_uniform(our_space) - - f = spec.fargs[1] - # FIXME: May not be correct to move this under uniformity - f.value = move(default_processor(), our_proc, value(f)) - @dagdebug nothing :spawn_datadeps "($(repr(value(f)))) Scheduling: $our_proc ($our_space)" - - # Copy raw task arguments for analysis - task_args = map(copy, spec.fargs) - - # Generate a list of ArgumentWrappers for each task argument - task_arg_ws = map(task_args) do _arg - arg = value(_arg) - arg, deps = 
unwrap_inout(arg) - arg = arg isa DTask ? fetch(arg; move_value=false, unwrap=false) : arg - if !type_may_alias(typeof(arg)) || !supports_inplace_move(state, arg) - return [(ArgumentWrapper(arg, identity), false, false)] + f_chunk = tochunk(value(spec.fargs[1])) + task_time = remotecall_fetch(1, f_chunk, args) do f, args + Sch.init_eager() + sch_state = Sch.EAGER_STATE[] + return @lock sch_state.lock begin + sig = Sch.signature(sch_state, f, args) + return get(sch_state.signature_time_cost, sig, 1000^3) end + end - # Get the Chunk for the argument - arg = state.raw_arg_to_chunk[arg] + # FIXME: Copy deps are computed eagerly + deps = @something(spec.options.syncdeps, Set{Any}()) - arg_ws = Tuple{ArgumentWrapper,Bool,Bool}[] - for (dep_mod, readdep, writedep) in deps - push!(arg_ws, (ArgumentWrapper(arg, dep_mod), readdep, writedep)) - end - return arg_ws + # Find latest time-to-completion of all syncdeps + deps_completed = UInt64(0) + for dep in deps + haskey(sstate.task_completions, dep) || continue # copy deps aren't recorded + deps_completed = max(deps_completed, sstate.task_completions[dep]) end - task_arg_ws = task_arg_ws::Vector{Vector{Tuple{ArgumentWrapper,Bool,Bool}}} - # Truncate the history for each argument - for arg_ws in task_arg_ws - for (arg_w, _, _) in arg_ws - truncate_history!(state, arg_w) + # Find latest time-to-completion of each memory space + # FIXME: Figure out space completions based on optimal packing + spaces_completed = Dict{MemorySpace,UInt64}() + for space in exec_spaces + completed = UInt64(0) + for (task, other_space) in sstate.assignments + space == other_space || continue + completed = max(completed, sstate.task_completions[task]) end + spaces_completed[space] = completed end - # Copy args from local to remote - for (idx, arg_ws) in enumerate(task_arg_ws) - arg = first(arg_ws)[1].arg - pos = raw_position(task_args[idx]) - - # Is the data written previously or now? 
- if !type_may_alias(typeof(arg)) - @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)] Skipped copy-to (immutable)" - spec.fargs[idx].value = arg + # Choose the earliest-available memory space and processor + # FIXME: Consider move time + move_time = UInt64(0) + local our_space_completed + while true + our_space_completed, our_space = findmin(spaces_completed) + our_space_procs = filter(proc->proc in all_procs, processors(our_space)) + if isempty(our_space_procs) + delete!(spaces_completed, our_space) continue end + our_proc = rand(our_space_procs) + break + end - # Is the data writeable? - if !supports_inplace_move(state, arg) - @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)] Skipped copy-to (non-writeable)" - spec.fargs[idx].value = arg - continue - end + sstate.task_to_spec[task] = spec + sstate.assignments[task] = our_space + sstate.task_completions[task] = our_space_completed + move_time + task_time + elseif scheduler == :roundrobin + our_proc = all_procs[proc_idx] + else + error("Invalid scheduler: $sched") + end + @assert our_proc in all_procs + our_space = only(memory_spaces(our_proc)) + + # Find the scope for this task (and its copies) + task_scope = @something(spec.options.compute_scope, spec.options.scope, DefaultScope()) + if task_scope == scope + # Optimize for the common case, cache the proc=>scope mapping + our_scope = get!(proc_to_scope_lfu, our_proc) do + our_procs = filter(proc->proc in all_procs, collect(processors(our_space))) + return constrain(UnionScope(map(ExactScope, our_procs)...), scope) + end + else + # Use the provided scope and constrain it to the available processors + our_procs = filter(proc->proc in all_procs, collect(processors(our_space))) + our_scope = constrain(UnionScope(map(ExactScope, our_procs)...), task_scope) + end + if our_scope isa InvalidScope + throw(Sch.SchedulingException("Scopes are not compatible: $(our_scope.x), $(our_scope.y)")) + end + check_uniform(our_proc) + 
check_uniform(our_space) + + f = spec.fargs[1] + # FIXME: May not be correct to move this under uniformity + #f.value = move(default_processor(), our_proc, value(f)) + @dagdebug nothing :spawn_datadeps "($(repr(value(f)))) Scheduling: $our_proc ($our_space)" + + # Copy raw task arguments for analysis + # N.B. Used later for checking dependencies + task_args = map_or_ntuple(idx->copy(spec.fargs[idx]), spec.fargs) + + # Populate all task dependencies + task_arg_ws = populate_task_info!(state, task_args, spec, task) + + # Truncate the history for each argument + map_or_ntuple(task_arg_ws) do idx + arg_ws = task_arg_ws[idx] + map_or_ntuple(arg_ws.deps) do dep_idx + dep = arg_ws.deps[dep_idx] + truncate_history!(state, dep.arg_w) + end + return + end - # Is the source of truth elsewhere? - arg_remote = get_or_generate_slot!(state, our_space, arg) - for (arg_w, _, _) in arg_ws - dep_mod = arg_w.dep_mod - remainder, _ = compute_remainder_for_arg!(state, our_space, arg_w, write_num) - if remainder isa MultiRemainderAliasing - enqueue_remainder_copy_to!(state, our_space, arg_w, remainder, value(f), idx, our_scope, task, write_num) - elseif remainder isa FullCopy - enqueue_copy_to!(state, our_space, arg_w, value(f), idx, our_scope, task, write_num) - else - @assert remainder isa NoAliasing "Expected NoAliasing, got $(typeof(remainder))" - @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Skipped copy-to (up-to-date): $our_space" - end - end - spec.fargs[idx].value = arg_remote + # Copy args from local to remote + remote_args = map_or_ntuple(task_arg_ws) do idx + arg_ws = task_arg_ws[idx] + arg = arg_ws.arg + pos = raw_position(arg_ws.pos) + + # Is the data written previously or now? 
+ if !arg_ws.may_alias + @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)] Skipped copy-to (immutable)" + return arg end - write_num += 1 - - # Validate that we're not accidentally performing a copy - for (idx, _arg) in enumerate(spec.fargs) - arg = value(_arg) - _, deps = unwrap_inout(value(task_args[idx])) - # N.B. We only do this check when the argument supports in-place - # moves, because for the moment, we are not guaranteeing updates or - # write-back of results - if is_writedep(arg, deps, task) && supports_inplace_move(state, arg) - arg_space = memory_space(arg) - @assert arg_space == our_space "($(repr(value(f))))[$(idx-1)] Tried to pass $(typeof(arg)) from $arg_space to $our_space" + + # Is the data writeable? + if !arg_ws.inplace_move + @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)] Skipped copy-to (non-writeable)" + return arg + end + + # Is the source of truth elsewhere? + arg_remote = get_or_generate_slot!(state, our_space, arg) + map_or_ntuple(arg_ws.deps) do dep_idx + dep = arg_ws.deps[dep_idx] + arg_w = dep.arg_w + dep_mod = arg_w.dep_mod + remainder, _ = compute_remainder_for_arg!(state, our_space, arg_w, write_num) + if remainder isa MultiRemainderAliasing + enqueue_remainder_copy_to!(state, our_space, arg_w, remainder, value(f), idx, our_scope, task, write_num) + elseif remainder isa FullCopy + enqueue_copy_to!(state, our_space, arg_w, value(f), idx, our_scope, task, write_num) + else + @assert remainder isa NoAliasing "Expected NoAliasing, got $(typeof(remainder))" + @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Skipped copy-to (up-to-date): $our_space" end end + return arg_remote + end + write_num += 1 - # Calculate this task's syncdeps - if spec.options.syncdeps === nothing - spec.options.syncdeps = Set{Any}() + # Validate that we're not accidentally performing a copy + map_or_ntuple(task_arg_ws) do idx + arg_ws = task_arg_ws[idx] + arg = remote_args[idx] + + # Get the dependencies 
again as (dep_mod, readdep, writedep) + deps = map_or_ntuple(arg_ws.deps) do dep_idx + dep = arg_ws.deps[dep_idx] + (dep.arg_w.dep_mod, dep.readdep, dep.writedep) end - syncdeps = spec.options.syncdeps - for (idx, arg_ws) in enumerate(task_arg_ws) - arg = first(arg_ws)[1].arg - type_may_alias(typeof(arg)) || continue - supports_inplace_move(state, arg) || continue - for (arg_w, _, writedep) in arg_ws - ainfo = aliasing!(state, our_space, arg_w) - dep_mod = arg_w.dep_mod - if writedep - @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Syncing as writer" - get_write_deps!(state, our_space, ainfo, write_num, syncdeps) - else - @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Syncing as reader" - get_read_deps!(state, our_space, ainfo, write_num, syncdeps) - end - end + + # Check that any mutable and written arguments are already in the correct space + if is_writedep(arg, deps, task) && arg_ws.may_alias && arg_ws.inplace_move + arg_space = memory_space(arg) + @assert arg_space == our_space "($(repr(value(f))))[$(idx-1)] Tried to pass $(typeof(arg)) from $arg_space to $our_space" end - @dagdebug nothing :spawn_datadeps "($(repr(value(f)))) Task has $(length(syncdeps)) syncdeps" - - # Launch user's task - spec.options.scope = our_scope - spec.options.exec_scope = our_scope - spec.options.occupancy = Dict(Any=>0) - enqueue!(upper_queue, spec=>task) - - # Update read/write tracking for arguments - for (idx, arg_ws) in enumerate(task_arg_ws) - arg = first(arg_ws)[1].arg - type_may_alias(typeof(arg)) || continue - for (arg_w, _, writedep) in arg_ws - ainfo = aliasing!(state, our_space, arg_w) - dep_mod = arg_w.dep_mod - if writedep - @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Task set as writer" - add_writer!(state, arg_w, our_space, ainfo, task, write_num) - else - add_reader!(state, arg_w, our_space, ainfo, task, write_num) - end + end + + # Calculate this task's syncdeps + if 
spec.options.syncdeps === nothing + spec.options.syncdeps = Set{Any}() + end + if spec.options.tag === nothing + spec.options.tag = to_tag() + end + + syncdeps = spec.options.syncdeps + map_or_ntuple(task_arg_ws) do idx + arg_ws = task_arg_ws[idx] + arg = arg_ws.arg + arg_ws.may_alias || return + arg_ws.inplace_move || return + map_or_ntuple(arg_ws.deps) do dep_idx + dep = arg_ws.deps[dep_idx] + arg_w = dep.arg_w + ainfo = aliasing!(state, our_space, arg_w) + dep_mod = arg_w.dep_mod + if dep.writedep + @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Syncing as writer" + get_write_deps!(state, our_space, ainfo, write_num, syncdeps) + else + @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Syncing as reader" + get_read_deps!(state, our_space, ainfo, write_num, syncdeps) end end - - write_num += 1 - proc_idx = mod1(proc_idx + 1, length(all_procs)) + return end + @dagdebug nothing :spawn_datadeps "($(repr(value(f)))) Task has $(length(syncdeps)) syncdeps" - # Copy args from remote to local - # N.B. We sort the keys to ensure a deterministic order for uniformity - check_uniform(length(state.arg_owner)) - for arg_w in sort(collect(keys(state.arg_owner)); by=arg_w->arg_w.hash) - check_uniform(arg_w) - arg = arg_w.arg - origin_space = state.arg_origin[arg] - remainder, _ = compute_remainder_for_arg!(state, origin_space, arg_w, write_num) - if remainder isa MultiRemainderAliasing - origin_scope = UnionScope(map(ExactScope, collect(processors(origin_space)))...) - enqueue_remainder_copy_from!(state, origin_space, arg_w, remainder, origin_scope, write_num) - elseif remainder isa FullCopy - origin_scope = UnionScope(map(ExactScope, collect(processors(origin_space)))...) 
- enqueue_copy_from!(state, origin_space, arg_w, origin_scope, write_num) + # Launch user's task + new_fargs = map_or_ntuple(task_arg_ws) do idx + if is_typed(spec) + return TypedArgument(task_arg_ws[idx].pos, remote_args[idx]) else - @assert remainder isa NoAliasing "Expected NoAliasing, got $(typeof(remainder))" - @dagdebug nothing :spawn_datadeps "Skipped copy-from (up-to-date): $origin_space" + return Argument(task_arg_ws[idx].pos, remote_args[idx]) + end + end + new_spec = DTaskSpec(new_fargs, spec.options) + new_spec.options.scope = our_scope + new_spec.options.exec_scope = our_scope + new_spec.options.occupancy = Dict(Any=>0) + enqueue!(queue.upper_queue, DTaskPair(new_spec, task)) + + # Update read/write tracking for arguments + map_or_ntuple(task_arg_ws) do idx + arg_ws = task_arg_ws[idx] + arg = arg_ws.arg + arg_ws.may_alias || return + arg_ws.inplace_move || return + for dep in arg_ws.deps + arg_w = dep.arg_w + ainfo = aliasing!(state, our_space, arg_w) + dep_mod = arg_w.dep_mod + if dep.writedep + @dagdebug nothing :spawn_datadeps "($(repr(value(f))))[$(idx-1)][$dep_mod] Task set as writer" + add_writer!(state, arg_w, our_space, ainfo, task, write_num) + else + add_reader!(state, arg_w, our_space, ainfo, task, write_num) + end end + return end + + write_num += 1 + proc_idx = mod1(proc_idx + 1, length(all_procs)) + + return write_num, proc_idx end diff --git a/src/datadeps/remainders.jl b/src/datadeps/remainders.jl index 0d1c65f6f..671365793 100644 --- a/src/datadeps/remainders.jl +++ b/src/datadeps/remainders.jl @@ -312,7 +312,9 @@ function enqueue_remainder_copy_to!(state::DataDepsState, dest_space::MemorySpac @dagdebug nothing :spawn_datadeps "($(repr(f)))[$(idx-1)][$dep_mod] Remainder copy-to has $(length(remainder_syncdeps)) syncdeps" # Launch the remainder copy task - copy_task = Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=remainder_syncdeps occupancy=Dict(Any=>0) meta=true Dagger.move!(remainder_aliasing, dest_space, 
source_space, arg_dest, arg_source) + copy_task = Dagger.with_options(; tag=to_tag()) do + Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=remainder_syncdeps occupancy=Dict(Any=>0) meta=true Dagger.move!(remainder_aliasing, dest_space, source_space, arg_dest, arg_source) + end # This copy task becomes a new writer for the target region add_writer!(state, arg_w, dest_space, target_ainfo, copy_task, write_num) @@ -359,7 +361,9 @@ function enqueue_remainder_copy_from!(state::DataDepsState, dest_space::MemorySp @dagdebug nothing :spawn_datadeps "($(typeof(arg_w.arg)))[$dep_mod] Remainder copy-from has $(length(remainder_syncdeps)) syncdeps" # Launch the remainder copy task - copy_task = Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=remainder_syncdeps occupancy=Dict(Any=>0) meta=true Dagger.move!(remainder_aliasing, dest_space, source_space, arg_dest, arg_source) + copy_task = Dagger.with_options(; tag=to_tag()) do + Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=remainder_syncdeps occupancy=Dict(Any=>0) meta=true Dagger.move!(remainder_aliasing, dest_space, source_space, arg_dest, arg_source) + end # This copy task becomes a new writer for the target region add_writer!(state, arg_w, dest_space, target_ainfo, copy_task, write_num) @@ -388,9 +392,9 @@ function enqueue_copy_to!(state::DataDepsState, dest_space::MemorySpace, arg_w:: @dagdebug nothing :spawn_datadeps "($(repr(f)))[$(idx-1)][$dep_mod] Full copy-to has $(length(copy_syncdeps)) syncdeps" # Launch the remainder copy task - copy_task = Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=copy_syncdeps occupancy=Dict(Any=>0) meta=true Dagger.move!(dep_mod, dest_space, source_space, arg_dest, arg_source) - - # This copy task becomes a new writer for the target region + copy_task = Dagger.with_options(; tag=to_tag()) do + Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=copy_syncdeps occupancy=Dict(Any=>0) meta=true Dagger.move!(dep_mod, dest_space, 
source_space, arg_dest, arg_source) + end add_writer!(state, arg_w, dest_space, target_ainfo, copy_task, write_num) end function enqueue_copy_from!(state::DataDepsState, dest_space::MemorySpace, arg_w::ArgumentWrapper, @@ -415,7 +419,9 @@ function enqueue_copy_from!(state::DataDepsState, dest_space::MemorySpace, arg_w @dagdebug nothing :spawn_datadeps "($(typeof(arg_w.arg)))[$dep_mod] Full copy-from has $(length(copy_syncdeps)) syncdeps" # Launch the remainder copy task - copy_task = Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=copy_syncdeps occupancy=Dict(Any=>0) meta=true Dagger.move!(dep_mod, dest_space, source_space, arg_dest, arg_source) + copy_task = Dagger.with_options(; tag=to_tag()) do + Dagger.@spawn scope=dest_scope exec_scope=dest_scope syncdeps=copy_syncdeps occupancy=Dict(Any=>0) meta=true Dagger.move!(dep_mod, dest_space, source_space, arg_dest, arg_source) + end # This copy task becomes a new writer for the target region add_writer!(state, arg_w, dest_space, target_ainfo, copy_task, write_num) diff --git a/src/mpi.jl b/src/mpi.jl index a0750599e..c0b5392f7 100644 --- a/src/mpi.jl +++ b/src/mpi.jl @@ -55,7 +55,6 @@ function aliasing(accel::MPIAcceleration, x::Chunk, T) tag = to_tag(hash(handle.id, hash(:aliasing))) check_uniform(tag) rank = MPI.Comm_rank(accel.comm) - if handle.rank == rank ainfo = aliasing(x, T) #Core.print("[$rank] aliasing: $ainfo, sending\n") @@ -356,17 +355,6 @@ function take_ref_id!() return MPIRefID(tid, uid, id) end -function to_tag(h::UInt) - # FIXME: Use some kind of bounded re-hashing - # FIXME: Re-hash with upper and lower - bound = MPI.tag_ub() - tag = abs(Base.unsafe_trunc(Int32, h)) - while tag > bound - tag = tag - bound - end - return tag -end - #TODO: partitioned scheduling with comm bifurcation function tochunk_pset(x, space::MPIMemorySpace; device=nothing, kwargs...) 
@assert space.comm == MPI.COMM_WORLD "$(space.comm) != $(MPI.COMM_WORLD)" @@ -673,8 +661,8 @@ function move!(dep_mod, to_space::MPIMemorySpace, from_space::MPIMemorySpace, to if to_space.rank == from_space.rank == local_rank move!(dep_mod, to_space.innerSpace, from_space.innerSpace, to, from) else - tag = to_tag(hash(dep_mod, hash(to.handle.id, hash(from.handle.id, hash(:move!))))) @dagdebug nothing :mpi "[$local_rank][$tag] Moving from $(from_space.rank) to $(to_space.rank)\n" + tag = to_tag() if local_rank == from_space.rank send_yield!(poolget(from.handle; uniform=false), to_space.comm, to_space.rank, tag) elseif local_rank == to_space.rank @@ -769,8 +757,8 @@ function remotecall_endpoint(f, accel::Dagger.MPIAcceleration, from_proc, to_pro task = DATADEPS_CURRENT_TASK[] return with(MPI_UID=>task.uid, MPI_UNIFORM=>true) do @assert data isa Chunk "Expected Chunk, got $(typeof(data))" - tag = to_tag(hash(data.handle.id)) space = memory_space(data) + tag = to_tag() if space.rank != from_proc.rank # If the data is already where it needs to be @assert space.rank == to_proc.rank @@ -792,21 +780,23 @@ function remotecall_endpoint(f, accel::Dagger.MPIAcceleration, from_proc, to_pro value = poolget(data.handle) data_converted = f(move(from_proc.innerProc, to_proc.innerProc, value)) return tochunk(data_converted, to_proc, to_space) - elseif loc_rank == from_proc.rank - value = poolget(data.handle) - data_moved = move(from_proc.innerProc, to_proc.innerProc, value) - Dagger.send_yield(data_moved, accel.comm, to_proc.rank, tag) - # FIXME: This is wrong to take typeof(data_moved), because the type may change - return tochunk(nothing, to_proc, to_space; type=typeof(data_moved)) - elseif loc_rank == to_proc.rank - data_moved = Dagger.recv_yield(accel.comm, from_space.rank, tag) - data_converted = f(move(from_proc.innerProc, to_proc.innerProc, data_moved)) - return tochunk(data_converted, to_proc, to_space) else - T = move_type(from_proc.innerProc, to_proc.innerProc, 
chunktype(data)) - T_new = f !== identity ? Base._return_type(f, Tuple{T}) : T - @assert isconcretetype(T_new) "Return type inference failed, expected concrete type, got $T -> $T_new" - return tochunk(nothing, to_proc, to_space; type=T_new) + if loc_rank == from_proc.rank + value = poolget(data.handle) + data_moved = move(from_proc.innerProc, to_proc.innerProc, value) + Dagger.send_yield(data_moved, accel.comm, to_proc.rank, tag) + # FIXME: This is wrong to take typeof(data_moved), because the type may change + return tochunk(nothing, to_proc, to_space; type=typeof(data_moved)) + elseif loc_rank == to_proc.rank + data_moved = Dagger.recv_yield(accel.comm, from_space.rank, tag) + data_converted = f(move(from_proc.innerProc, to_proc.innerProc, data_moved)) + return tochunk(data_converted, to_proc, to_space) + else + T = move_type(from_proc.innerProc, to_proc.innerProc, chunktype(data)) + T_new = f !== identity ? Base._return_type(f, Tuple{T}) : T + @assert isconcretetype(T_new) "Return type inference failed, expected concrete type, got $T -> $T_new" + return tochunk(nothing, to_proc, to_space; type=T_new) + end end end end @@ -844,19 +834,18 @@ end #FIXME:try to think of a better move! scheme function execute!(proc::MPIProcessor, world::UInt64, f, args...; kwargs...) local_rank = MPI.Comm_rank(proc.comm) - tag = to_tag(hash(sch_handle().thunk_id.id, hash(:execute!, UInt(0)))) islocal = local_rank == proc.rank inplace_move = f === move! result = nothing + tag_space = to_tag() if islocal || inplace_move result = execute!(proc.innerProc, world, f, args...; kwargs...) end if inplace_move - # move! 
already handles communication space = memory_space(nothing, proc)::MPIMemorySpace return tochunk(nothing, proc, space) else - # Handle communication ourselves + # Handle communication ourselves if islocal T = typeof(result) space = memory_space(result, proc)::MPIMemorySpace diff --git a/src/options.jl b/src/options.jl index ee53faa04..580ddef53 100644 --- a/src/options.jl +++ b/src/options.jl @@ -35,6 +35,7 @@ Stores per-task options to be passed to the scheduler. Base.@kwdef mutable struct Options propagates::Union{Vector{Symbol},Nothing} = nothing + tag::Union{UInt32,Nothing} = nothing acceleration::Union{Acceleration,Nothing} = nothing processor::Union{Processor,Nothing} = nothing scope::Union{AbstractScope,Nothing} = nothing @@ -123,6 +124,7 @@ signature `sig`, if the option was previously unspecified in `opts`. """ function populate_defaults!(opts::Options, sig) maybe_default!(opts, Val{:propagates}(), sig) + maybe_default!(opts, Val{:tag}(), sig) maybe_default!(opts, Val{:acceleration}(), sig) maybe_default!(opts, Val{:processor}(), sig) maybe_default!(opts, Val{:compute_scope}(), sig) From fc1ae09c169bc3336ab4f7d46dd8c85d84725ac2 Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Mon, 10 Nov 2025 19:50:03 -0300 Subject: [PATCH 17/24] MPI: final re-work of tags --- src/datadeps/queue.jl | 1 + src/mpi.jl | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/datadeps/queue.jl b/src/datadeps/queue.jl index 47e405d68..ebf9f8fa6 100644 --- a/src/datadeps/queue.jl +++ b/src/datadeps/queue.jl @@ -7,6 +7,7 @@ function to_tag() return Dagger.get_tls().task_spec.options.tag::UInt32 end lock(TAG_WAITING) do counter_ref + @assert Sch.SCHED_MOVE[] == false "We should not create a tag on the scheduler unwrap move" tag = counter_ref[] counter_ref[] = tag + 1 > MPI.tag_ub() ?
1 : tag + 1 return tag diff --git a/src/mpi.jl b/src/mpi.jl index c0b5392f7..4b85122b9 100644 --- a/src/mpi.jl +++ b/src/mpi.jl @@ -52,7 +52,7 @@ MPIAcceleration() = MPIAcceleration(MPI.COMM_WORLD) function aliasing(accel::MPIAcceleration, x::Chunk, T) handle = x.handle::MPIRef @assert accel.comm == handle.comm "MPIAcceleration comm mismatch" - tag = to_tag(hash(handle.id, hash(:aliasing))) + tag = to_tag() check_uniform(tag) rank = MPI.Comm_rank(accel.comm) if handle.rank == rank @@ -638,7 +638,7 @@ WeakChunk(c::Chunk{T,H}) where {T,H<:MPIRef} = WeakChunk(c.handle.rank, c.handle function MemPool.poolget(ref::MPIRef; uniform::Bool=false) @assert uniform || ref.rank == MPI.Comm_rank(ref.comm) "MPIRef rank mismatch: $(ref.rank) != $(MPI.Comm_rank(ref.comm))" if uniform - tag = to_tag(hash(ref.id, hash(:poolget))) + tag = to_tag() if ref.rank == MPI.Comm_rank(ref.comm) value = poolget(ref.innerRef) @opcounter :poolget_bcast_send_yield @@ -684,7 +684,7 @@ function move!(dep_mod::RemainderAliasing{<:MPIMemorySpace}, to_space::MPIMemory if to_space.rank == from_space.rank == local_rank move!(dep_mod, to_space.innerSpace, from_space.innerSpace, to, from) else - tag = to_tag(hash(dep_mod, hash(to.handle.id, hash(from.handle.id, hash(:move!))))) + tag = to_tag() @dagdebug nothing :mpi "[$local_rank][$tag] Moving from $(from_space.rank) to $(to_space.rank)\n" if local_rank == from_space.rank # Get the source data for each span From 4f97dc24f0414211975eb44523aa9cac34e26b36 Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Fri, 27 Feb 2026 06:06:39 +0000 Subject: [PATCH 18/24] MPI: recv pool+queue, malloc send buffers, MemPool completion queue, length validation - Recv pool with Irecv(ANY_SOURCE, ANY_TAG), completion queue by (comm, source, tag), recv_yield uses queue - Send serialization lock + non-GC (malloc) buffers for serialized path to avoid internal_Isend segfault on system MPICH - Completion queue stores poolset(ref), recv_yield poolget(ref) - MAX_SERIALIZED_RECV_LENGTH 
validation to avoid InexactError in MPI.Buffer - LocalPreferences.toml: system MPI (abi=MPICH, libmpi, mpiexec) - memory-spaces: remove duplicate LocalMemorySpan block Made-with: Cursor --- LocalPreferences.toml | 10 + Project.toml | 2 + src/array/alloc.jl | 15 +- src/array/copy.jl | 4 + src/array/darray.jl | 41 ++- src/datadeps/aliasing.jl | 259 +++++++++++--- src/datadeps/chunkview.jl | 23 +- src/datadeps/queue.jl | 220 +++++------- src/datadeps/scheduling.jl | 9 +- src/memory-spaces.jl | 65 +--- src/mpi.jl | 714 ++++++++++++++++++++++++++++++++----- src/utils/interval_tree.jl | 45 +-- 12 files changed, 987 insertions(+), 420 deletions(-) create mode 100644 LocalPreferences.toml diff --git a/LocalPreferences.toml b/LocalPreferences.toml new file mode 100644 index 000000000..3a11c113f --- /dev/null +++ b/LocalPreferences.toml @@ -0,0 +1,10 @@ +# When using system MPI, run once in the environment where you run MPI jobs (with MPI module loaded): +# julia --project=Dagger.jl -e 'using MPIPreferences; MPIPreferences.use_system_binary()' +# That populates abi, libmpi, mpiexec and avoids "Unknown MPI ABI nothing". 
+[MPIPreferences] +_format = "1.0" +abi = "MPICH" +binary = "system" +libmpi = "libmpi" +mpiexec = "mpiexec" +preloads = [] diff --git a/Project.toml b/Project.toml index b6d03531d..687094335 100644 --- a/Project.toml +++ b/Project.toml @@ -13,6 +13,7 @@ Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" +MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" NextLA = "d37ed344-79c4-486d-9307-6d11355a15a3" @@ -78,6 +79,7 @@ Graphs = "1" JSON3 = "1" KernelAbstractions = "0.9" MPI = "0.20.22" +MPIPreferences = "0.1.11" MacroTools = "0.5" MemPool = "0.4.12" Metal = "1.1" diff --git a/src/array/alloc.jl b/src/array/alloc.jl index e67ca593c..862f48421 100644 --- a/src/array/alloc.jl +++ b/src/array/alloc.jl @@ -70,11 +70,11 @@ function partition(p::AbstractBlocks, dom::ArrayDomain) map(_cumlength, map(length, indexes(dom)), p.blocksize)) end -function allocate_array(f, T, idx, sz) +function allocate_array(f, ::Type{T}, idx, sz::NTuple{N,Int})::Array{T,N} where {T,N} new_f = allocate_array_func(task_processor(), f) return new_f(idx, T, sz) end -function allocate_array(f, T, sz) +function allocate_array(f, ::Type{T}, sz::NTuple{N,Int})::Array{T,N} where {T,N} new_f = allocate_array_func(task_processor(), f) return new_f(T, sz) end @@ -189,8 +189,15 @@ function Base.view(A::AbstractArray{T,N}, p::Blocks{N}; space=default_memory_spa d = ArrayDomain(Base.index_shape(A)) dc = partition(p, d) # N.B. We use `tochunk` because we only want to take the view locally, and - # taking views should be very fast - chunks = [@with(MPI_UID => eager_next_id(), tochunk(view(A, x.indexes...), space)) for x in dc] + # taking views should be very fast. 
+ # Per-chunk space for DArray: use each chunk's owner so tochunk on owner uses + # local_rank == space.rank and registers refs correctly (fixes MPI aliasing). + if A isa DArray && size(A.chunks) == size(dc) + chunks = [@with(MPI_UID => eager_next_id(), tochunk(view(A, x.indexes...), + (c = A.chunks[I]; c isa Chunk ? memory_space(c) : space))) for (I, x) in pairs(IndexCartesian(), dc)] + else + chunks = [@with(MPI_UID => eager_next_id(), tochunk(view(A, x.indexes...), space)) for x in dc] + end return DArray(T, d, dc, chunks, p) end Base.view(A::AbstractArray, ::AutoBlocks) = diff --git a/src/array/copy.jl b/src/array/copy.jl index d032525f9..647305fee 100644 --- a/src/array/copy.jl +++ b/src/array/copy.jl @@ -84,6 +84,10 @@ function darray_copyto!(B::DArray{TB,NB}, A::DArray{TA,NA}, Binds=parentindices( Dagger.spawn_datadeps() do for Bidx in Bci + accel = current_acceleration() + if accel isa MPIAcceleration + service_aliasing_requests(accel.comm) + end Bpart = B.chunks[Bidx] Bsd_global_raw = padNmax(Bsd_all[Bidx]) Bsd_global_shifted = shift_ranges(Bsd_global_raw, Binds_offset) diff --git a/src/array/darray.jl b/src/array/darray.jl index e04bcf065..406494d40 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -582,26 +582,29 @@ DVector(A::AbstractVector{T}, ::AutoBlocks, assignment::AssignmentType{1} = :arb DMatrix(A::AbstractMatrix{T}, ::AutoBlocks, assignment::AssignmentType{2} = :arbitrary) where T = DMatrix(A, auto_blocks(A), assignment) DArray(A::AbstractArray, ::AutoBlocks, assignment::AssignmentType = :arbitrary) = DArray(A, auto_blocks(A), assignment) -@warn "Add assignment to undef initializer" maxlog=1 -function DArray{T,N}(::UndefInitializer, dims::NTuple{N,Int}) where {T,N} - dist = auto_blocks(dims) - return DArray{T,N}(undef, dist, dims...) 
-end -function DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}) where {T,N} - domain = ArrayDomain(ntuple(i->1:dims[i], N)) +struct AllocateUndef{S} end +(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = Array{S,N}(undef, dims) + +function DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} + domain = ArrayDomain(map(x->1:x, dims)) subdomains = partition(dist, domain) - tasks = Array{DTask,N}(undef, size(subdomains)...) - Dagger.spawn_datadeps() do - for (i, x) in enumerate(subdomains) - tasks[i] = Dagger.@spawn allocate_array_undef(T, size(x)) - end - end - return DArray(T, domain, subdomains, tasks, dist) -end -DArray{T,N}(::UndefInitializer, dims::Vararg{Int,N}) where {T,N} = - DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,)) -DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}) where {T,N} = - DArray{T,N}(undef, dist, (dims...,)) + a = AllocateArray(T, AllocateUndef{T}(), false, domain, subdomains, dist, assignment) + return _to_darray(a) +end +DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = + DArray{T,N}(undef, dist, (dims...,); assignment) +DArray{T,N}(::UndefInitializer, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = + DArray{T,N}(undef, auto_blocks(dims), dims; assignment) +DArray{T,N}(::UndefInitializer, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = + DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,); assignment) +DArray{T}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = + DArray{T,N}(undef, dist, dims; assignment) +DArray{T}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = + DArray{T,N}(undef, dist, (dims...,); assignment) +DArray{T}(::UndefInitializer, 
dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = + DArray{T,N}(undef, auto_blocks(dims), dims; assignment) +DArray{T}(::UndefInitializer, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = + DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,); assignment) function Base.:(==)(x::ArrayOp{T,N}, y::AbstractArray{S,N}) where {T,S,N} collect(x) == y diff --git a/src/datadeps/aliasing.jl b/src/datadeps/aliasing.jl index aec83d039..39dbaf541 100644 --- a/src/datadeps/aliasing.jl +++ b/src/datadeps/aliasing.jl @@ -251,6 +251,120 @@ struct HistoryEntry write_num::Int end +struct AliasedObjectCacheStore + keys::Vector{AbstractAliasing} + derived::Dict{AbstractAliasing,AbstractAliasing} + stored::Dict{MemorySpace,Set{AbstractAliasing}} + values::Dict{MemorySpace,Dict{AbstractAliasing,Chunk}} +end +AliasedObjectCacheStore() = + AliasedObjectCacheStore(Vector{AbstractAliasing}(), + Dict{AbstractAliasing,AbstractAliasing}(), + Dict{MemorySpace,Set{AbstractAliasing}}(), + Dict{MemorySpace,Dict{AbstractAliasing,Chunk}}()) + +function is_stored(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing) + if !haskey(cache.stored, space) + return false + end + if !haskey(cache.derived, ainfo) + return false + end + key = cache.derived[ainfo] + return key in cache.stored[space] +end +function is_key_present(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing) + return haskey(cache.derived, ainfo) +end +function get_stored(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing) + @assert is_stored(cache, space, ainfo) "Cache does not have derived ainfo $ainfo" + key = cache.derived[ainfo] + return cache.values[space][key] +end +function set_stored!(cache::AliasedObjectCacheStore, dest_space::MemorySpace, value::Chunk, ainfo::AbstractAliasing) + @assert !is_stored(cache, dest_space, ainfo) "Cache already has derived ainfo $ainfo" + key = cache.derived[ainfo] + 
value_ainfo = aliasing(value, identity) + cache.derived[value_ainfo] = key + push!(get!(Set{AbstractAliasing}, cache.stored, dest_space), key) + values_dict = get!(Dict{AbstractAliasing,Chunk}, cache.values, dest_space) + values_dict[key] = value + return +end +function set_key_stored!(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing, value::Chunk) + push!(cache.keys, ainfo) + cache.derived[ainfo] = ainfo + push!(get!(Set{AbstractAliasing}, cache.stored, space), ainfo) + values_dict = get!(Dict{AbstractAliasing,Chunk}, cache.values, space) + values_dict[ainfo] = value + return +end + +struct AliasedObjectCache + space::MemorySpace + chunk::Chunk +end +function is_stored(cache::AliasedObjectCache, ainfo::AbstractAliasing) + wid = root_worker_id(cache.chunk) + if wid != myid() + return remotecall_fetch(is_stored, wid, cache, ainfo) + end + cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore + return is_stored(cache_raw, cache.space, ainfo) +end +function is_key_present(cache::AliasedObjectCache, space::MemorySpace, ainfo::AbstractAliasing) + wid = root_worker_id(cache.chunk) + if wid != myid() + return remotecall_fetch(is_key_present, wid, cache, space, ainfo) + end + cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore + return is_key_present(cache_raw, space, ainfo) +end +function get_stored(cache::AliasedObjectCache, ainfo::AbstractAliasing) + wid = root_worker_id(cache.chunk) + if wid != myid() + return remotecall_fetch(get_stored, wid, cache, ainfo) + end + cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore + return get_stored(cache_raw, cache.space, ainfo) +end +function set_stored!(cache::AliasedObjectCache, value::Chunk, ainfo::AbstractAliasing) + wid = root_worker_id(cache.chunk) + if wid != myid() + return remotecall_fetch(set_stored!, wid, cache, value, ainfo) + end + cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore + set_stored!(cache_raw, cache.space, value, ainfo) + return +end +function 
set_key_stored!(cache::AliasedObjectCache, space::MemorySpace, ainfo::AbstractAliasing, value::Chunk) + wid = root_worker_id(cache.chunk) + if wid != myid() + return remotecall_fetch(set_key_stored!, wid, cache, space, ainfo, value) + end + cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore + set_key_stored!(cache_raw, space, ainfo, value) +end +function aliased_object!(f, cache::AliasedObjectCache, x; ainfo=aliasing(current_acceleration(), x, identity)) + x_space = memory_space(x) + if !is_key_present(cache, x_space, ainfo) + x_chunk = x isa Chunk ? x : tochunk(x, first(processors(x_space))) + set_key_stored!(cache, x_space, ainfo, x_chunk) + end + if is_stored(cache, ainfo) + return get_stored(cache, ainfo) + else + y = f(x) + @assert y isa Chunk "Didn't get a Chunk from functor" + @assert memory_space(y) == cache.space "Space mismatch! $(memory_space(y)) != $(cache.space)" + if memory_space(x) != cache.space + @assert ainfo != aliasing(current_acceleration(), y, identity) "Aliasing mismatch! 
$ainfo == $(aliasing(current_acceleration(), y, identity))" + end + set_stored!(cache, y, ainfo) + return y + end +end + @warn "Switch ArgumentWrapper to contain just the argument, and add DependencyWrapper" maxlog=1 struct DataDepsState # The mapping of original raw argument to its Chunk @@ -288,7 +402,7 @@ struct DataDepsState # The mapping of, for a given memory space, the backing Chunks that an ainfo references # Used by slot generation to replace the backing Chunks during move - ainfo_backing_chunk::Dict{MemorySpace,Dict{AbstractAliasing,Chunk}} + ainfo_backing_chunk::Chunk{AliasedObjectCacheStore} # Cache of argument's supports_inplace_move query result supports_inplace_cache::IdDict{Any,Bool} @@ -320,7 +434,7 @@ struct DataDepsState ainfo_arg = Dict{AliasingWrapper,ArgumentWrapper}() arg_owner = Dict{ArgumentWrapper,MemorySpace}() arg_overlaps = Dict{ArgumentWrapper,Set{ArgumentWrapper}}() - ainfo_backing_chunk = Dict{MemorySpace,Dict{AbstractAliasing,Chunk}}() + ainfo_backing_chunk = tochunk(AliasedObjectCacheStore()) arg_history = Dict{ArgumentWrapper,Vector{HistoryEntry}}() supports_inplace_cache = IdDict{Any,Bool}() @@ -331,13 +445,17 @@ struct DataDepsState ainfos_owner = Dict{AliasingWrapper,Union{Pair{DTask,Int},Nothing}}() ainfos_readers = Dict{AliasingWrapper,Vector{Pair{DTask,Int}}}() - return new(arg_to_chunk, arg_origin, remote_args, remote_arg_to_original, ainfo_arg, arg_owner, arg_overlaps, ainfo_backing_chunk, arg_history, + return new(arg_to_chunk, arg_origin, remote_args, remote_arg_to_original, ainfo_arg, arg_history, arg_owner, arg_overlaps, ainfo_backing_chunk, supports_inplace_cache, ainfo_cache, ainfos_overlaps, ainfos_owner, ainfos_readers) end end # N.B. 
arg_w must be the original argument wrapper, not a remote copy function aliasing!(state::DataDepsState, target_space::MemorySpace, arg_w::ArgumentWrapper) + accel = current_acceleration() + if accel isa MPIAcceleration + service_aliasing_requests(accel.comm) + end # Grab the remote copy of the argument, and calculate the ainfo remote_arg = get_or_generate_slot!(state, target_space, arg_w.arg) remote_arg_w = ArgumentWrapper(remote_arg, arg_w.dep_mod) @@ -348,7 +466,7 @@ function aliasing!(state::DataDepsState, target_space::MemorySpace, arg_w::Argum end # Calculate the ainfo - ainfo = AliasingWrapper(aliasing(current_acceleration(), remote_arg, arg_w.dep_mod)) + ainfo = AliasingWrapper(aliasing(accel, remote_arg, arg_w.dep_mod)) # Cache the result state.ainfo_cache[remote_arg_w] = ainfo @@ -375,8 +493,9 @@ function is_writedep(arg, deps, task::DTask) end # Aliasing state setup -function populate_task_info!(state::DataDepsState, spec::DTaskSpec, task::DTask) - # Track the task's arguments and access patterns +# Returns Vector/Tuple of DataDepsTaskArgument for consumption by distribute_task! +function populate_task_info!(state::DataDepsState, task_args, spec::DTaskSpec, task::DTask) + result = DataDepsTaskArgument[] for (idx, _arg) in enumerate(spec.fargs) arg = value(_arg) @@ -386,42 +505,49 @@ function populate_task_info!(state::DataDepsState, spec::DTaskSpec, task::DTask) # Unwrap the Chunk underlying any DTask arguments arg = arg isa DTask ? 
fetch(arg; move_value=false, unwrap=false) : arg - # Skip non-aliasing arguments - type_may_alias(typeof(arg)) || continue + may_alias = type_may_alias(typeof(arg)) + inplace_move = supports_inplace_move(state, arg) - # Skip arguments not supporting in-place move - supports_inplace_move(state, arg) || continue - - # Generate a Chunk for the argument if necessary - if haskey(state.raw_arg_to_chunk, arg) - arg = state.raw_arg_to_chunk[arg] - else - if !(arg isa Chunk) - new_arg = with(MPI_UID=>task.uid) do - tochunk(arg) - end - state.raw_arg_to_chunk[arg] = new_arg - arg = new_arg + if may_alias && inplace_move + # Generate a Chunk for the argument if necessary + if haskey(state.raw_arg_to_chunk, arg) + arg = state.raw_arg_to_chunk[arg] else - state.raw_arg_to_chunk[arg] = arg + if !(arg isa Chunk) + new_arg = with(MPI_UID=>task.uid) do + tochunk(arg) + end + state.raw_arg_to_chunk[arg] = new_arg + arg = new_arg + else + state.raw_arg_to_chunk[arg] = arg + end end - end - - # Track the origin space of the argument - origin_space = memory_space(arg) - check_uniform(origin_space) - state.arg_origin[arg] = origin_space - state.remote_arg_to_original[arg] = arg - # Populate argument info for all aliasing dependencies - for (dep_mod, _, _) in deps - # Generate an ArgumentWrapper for the argument - aw = ArgumentWrapper(arg, dep_mod) + # Track the origin space of the argument + origin_space = memory_space(arg) + check_uniform(origin_space) + state.arg_origin[arg] = origin_space + state.remote_arg_to_original[arg] = arg + + # Populate argument info for all aliasing dependencies + dep_infos = DataDepsTaskDependency[ + DataDepsTaskDependency(ArgumentWrapper(arg, dep_mod), readdep, writedep) + for (dep_mod, readdep, writedep) in deps + ] + + # Populate argument info for all aliasing dependencies + for (dep_mod, _, _) in deps + aw = ArgumentWrapper(arg, dep_mod) + populate_argument_info!(state, aw, origin_space) + end - # Populate argument info - populate_argument_info!(state, 
aw, origin_space) + push!(result, DataDepsTaskArgument(arg, ArgPosition(_arg.pos), true, true, dep_infos)) + else + push!(result, DataDepsTaskArgument(arg, ArgPosition(_arg.pos), may_alias, inplace_move, DataDepsTaskDependency[])) end end + return spec.fargs isa Tuple ? (result...,) : result end function populate_argument_info!(state::DataDepsState, arg_w::ArgumentWrapper, origin_space::MemorySpace) # Initialize ownership and history @@ -620,7 +746,6 @@ function generate_slot!(state::DataDepsState, dest_space, data) to_proc = first(processors(dest_space)) from_proc = first(processors(orig_space)) dest_space_args = get!(IdDict{Any,Any}, state.remote_args, dest_space) - ALIASED_OBJECT_CACHE[] = get!(Dict{AbstractAliasing,Chunk}, state.ainfo_backing_chunk, dest_space) if orig_space == dest_space && (data isa Chunk || !isremotehandle(data)) # Fast path for local data that's already in a Chunk or not a remote handle needing rewrapping task = DATADEPS_CURRENT_TASK[] @@ -628,18 +753,17 @@ function generate_slot!(state::DataDepsState, dest_space, data) tochunk(data, from_proc) end else + aliased_object_cache = AliasedObjectCache(dest_space, state.ainfo_backing_chunk) ctx = Sch.eager_context() id = rand(Int) @maybelog ctx timespan_start(ctx, :move, (;thunk_id=0, id, position=ArgPosition(), processor=to_proc), (;f=nothing, data)) - data_chunk = move_rewrap(from_proc, to_proc, orig_space, dest_space, data) + data_chunk = move_rewrap(aliased_object_cache, from_proc, to_proc, orig_space, dest_space, data) @maybelog ctx timespan_finish(ctx, :move, (;thunk_id=0, id, position=ArgPosition(), processor=to_proc), (;f=nothing, data=data_chunk)) end @assert memory_space(data_chunk) == dest_space "space mismatch! $dest_space (dest) != $(memory_space(data_chunk)) (actual) ($(typeof(data)) (data) vs. 
$(typeof(data_chunk)) (chunk)), spaces ($orig_space -> $dest_space)" dest_space_args[data] = data_chunk state.remote_arg_to_original[data_chunk] = data - ALIASED_OBJECT_CACHE[] = nothing - check_uniform(memory_space(dest_space_args[data])) check_uniform(processor(dest_space_args[data])) check_uniform(dest_space_args[data].handle) @@ -656,11 +780,64 @@ function get_or_generate_slot!(state, dest_space, data) end return state.remote_args[dest_space][data] end -function move_rewrap(from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, data) - return aliased_object!(data) do data +function rewrap_aliased_object!(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, x) + return aliased_object!(cache, x) do x + return remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, x) + end +end +function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, data::Chunk) + # MPI: Chunk with MPIRef - data may live on another rank; root_worker_id(MPIRef)=myid() is wrong + if data.handle isa MPIRef + if data.handle.rank != MPI.Comm_rank(data.handle.comm) + # Data is on a different MPI rank; use MPI transfer via remotecall_endpoint + return aliased_object!(cache, data) do data + return remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, data) + end + end + else + wid = root_worker_id(data) + if wid != myid() + return remotecall_fetch(move_rewrap, wid, cache, from_proc, to_proc, from_space, to_space, data) + end + end + data_raw = unwrap(data) + return move_rewrap(cache, from_proc, to_proc, from_space, to_space, data_raw) +end +function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, data) + return aliased_object!(cache, data) do data return 
remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, data) end end +function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, v::SubArray) + to_w = root_worker_id(to_proc) + p_chunk = rewrap_aliased_object!(cache, from_proc, to_proc, from_space, to_space, parent(v)) + inds = parentindices(v) + return remotecall_fetch(to_w, from_proc, to_proc, from_space, to_space, p_chunk, inds) do from_proc, to_proc, from_space, to_space, p_chunk, inds + p_new = move(from_proc, to_proc, p_chunk) + v_new = view(p_new, inds...) + return tochunk(v_new, to_proc, to_space) + end +end +for wrapper in (UpperTriangular, LowerTriangular, UnitUpperTriangular, UnitLowerTriangular) + @eval function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, v::$(wrapper)) + to_w = root_worker_id(to_proc) + p_chunk = rewrap_aliased_object!(cache, from_proc, to_proc, from_space, to_space, parent(v)) + return remotecall_fetch(to_w, from_proc, to_proc, from_space, to_space, p_chunk) do from_proc, to_proc, from_space, to_space, p_chunk + p_new = move(from_proc, to_proc, p_chunk) + v_new = $(wrapper)(p_new) + return tochunk(v_new, to_proc, to_space) + end + end +end +function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, v::Base.RefValue) + return aliased_object!(cache, v) do v + return remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, v) + end +end +move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, x::String) = x +move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, x::Symbol) = x +move_rewrap(cache::AliasedObjectCache, 
from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, x::Type) = x + function remotecall_endpoint(f, ::Dagger.DistributedAcceleration, from_proc, to_proc, orig_space, dest_space, data) to_w = root_worker_id(to_proc) return remotecall_fetch(to_w, from_proc, to_proc, dest_space, data) do from_proc, to_proc, dest_space, data diff --git a/src/datadeps/chunkview.jl b/src/datadeps/chunkview.jl index 6e2a21dfd..2040b8d52 100644 --- a/src/datadeps/chunkview.jl +++ b/src/datadeps/chunkview.jl @@ -32,23 +32,14 @@ aliasing(x::ChunkView) = memory_space(x::ChunkView) = memory_space(x.chunk) isremotehandle(x::ChunkView) = true -# This definition is here because it's so similar to ChunkView -function move_rewrap(from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, v::SubArray) - p_chunk = aliased_object!(parent(v)) do p_chunk - return remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, p_chunk) - end - inds = parentindices(v) - return remotecall_endpoint(current_acceleration(), from_proc, to_proc, from_space, to_space, p_chunk) do p_new - return view(p_new, inds...) - end -end -function move_rewrap(from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, slice::ChunkView) - p_chunk = aliased_object!(slice.chunk) do p_chunk - return remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, p_chunk) - end +function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, slice::ChunkView) + to_w = root_worker_id(to_proc) + p_chunk = move_rewrap(cache, from_proc, to_proc, from_space, to_space, slice.chunk) inds = slice.slices - return remotecall_endpoint(current_acceleration(), from_proc, to_proc, from_space, to_space, p_chunk) do p_new - return view(p_new, inds...) 
+ return remotecall_fetch(to_w, from_proc, to_proc, from_space, to_space, p_chunk, inds) do from_proc, to_proc, from_space, to_space, p_chunk, inds + p_new = move(from_proc, to_proc, p_chunk) + v_new = view(p_new, inds...) + return tochunk(v_new, to_proc, to_space) end end diff --git a/src/datadeps/queue.jl b/src/datadeps/queue.jl index ebf9f8fa6..5de3fdcb2 100644 --- a/src/datadeps/queue.jl +++ b/src/datadeps/queue.jl @@ -10,6 +10,9 @@ function to_tag() @assert Sch.SCHED_MOVE[] == false "We should not create a tag on the scheduler unwrap move" tag = counter_ref[] counter_ref[] = tag + 1 > MPI.tag_ub() ? 1 : tag + 1 + # #region agent log + if tag >= 598 && tag <= 612; _r = MPI.Comm_rank(MPI.COMM_WORLD); if _r <= 1; try; _bt = String[]; for s in stacktrace(backtrace()); push!(_bt, "$(s.func)@$(basename(string(s.file))):$(s.line)"); length(_bt) >= 6 && break; end; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H16_totag\",\"location\":\"queue.jl:to_tag\",\"message\":\"to_tag critical range\",\"data\":{\"rank\":$_r,\"tag\":$tag,\"stack\":$(repr(join(_bt, " > ")))},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end; end; end + # #endregion return tag end end @@ -26,7 +29,9 @@ struct DataDepsTaskQueue <: AbstractTaskQueue # How to traverse the dependency graph when launching tasks traversal::Symbol # Which scheduler to use to assign tasks to processors - scheduler::Symbol + # DataDepsScheduler objects use datadeps_schedule_task (master API); + # :smart/:ultra Symbols use legacy inline logic + scheduler::Union{DataDepsScheduler,Symbol} # Whether aliasing across arguments is possible # The fields following only apply when aliasing==true @@ -34,12 +39,18 @@ struct DataDepsTaskQueue <: AbstractTaskQueue function DataDepsTaskQueue(upper_queue; traversal::Symbol=:inorder, - scheduler::Symbol=:naive, + scheduler::Union{DataDepsScheduler,Symbol}=RoundRobinScheduler(), aliasing::Bool=true) + 
# Convert Symbol to scheduler object for master API compatibility + sched = scheduler isa Symbol ? (scheduler == :roundrobin ? RoundRobinScheduler() : + scheduler == :naive ? NaiveScheduler() : + scheduler == :smart ? NaiveScheduler() : # closest equivalent + scheduler == :ultra ? UltraScheduler() : + scheduler) : scheduler seen_tasks = DTaskPair[] g = SimpleDiGraph() task_to_id = Dict{DTask,Int}() - return new(upper_queue, seen_tasks, g, task_to_id, traversal, scheduler, + return new(upper_queue, seen_tasks, g, task_to_id, traversal, sched, aliasing) end end @@ -91,25 +102,33 @@ experimental and subject to change. """ function spawn_datadeps(f::Base.Callable; static::Bool=true, traversal::Symbol=:inorder, - scheduler::Union{Symbol,Nothing}=nothing, + scheduler::Union{DataDepsScheduler,Symbol,Nothing}=nothing, aliasing::Bool=true, launch_wait::Union{Bool,Nothing}=nothing) if !static throw(ArgumentError("Dynamic scheduling is no longer available")) end wait_all(; check_errors=true) do - scheduler = something(scheduler, DATADEPS_SCHEDULER[], :roundrobin)::Symbol + scheduler = something(scheduler, DATADEPS_SCHEDULER[], RoundRobinScheduler()) launch_wait = something(launch_wait, DATADEPS_LAUNCH_WAIT[], false)::Bool if launch_wait result = spawn_bulk() do queue = DataDepsTaskQueue(get_options(:task_queue); traversal, scheduler, aliasing) + accel = current_acceleration() + if accel isa MPIAcceleration + service_aliasing_requests(accel.comm) + end with_options(f; task_queue=queue) distribute_tasks!(queue) end else queue = DataDepsTaskQueue(get_options(:task_queue); traversal, scheduler, aliasing) + accel = current_acceleration() + if accel isa MPIAcceleration + service_aliasing_requests(accel.comm) + end result = with_options(f; task_queue=queue) distribute_tasks!(queue) end @@ -117,7 +136,7 @@ function spawn_datadeps(f::Base.Callable; static::Bool=true, return result end end -const DATADEPS_SCHEDULER = ScopedValue{Union{Symbol,Nothing}}(nothing) +const DATADEPS_SCHEDULER 
= ScopedValue{Union{DataDepsScheduler,Symbol,Nothing}}(nothing) const DATADEPS_LAUNCH_WAIT = ScopedValue{Union{Bool,Nothing}}(nothing) @warn "Don't blindly set occupancy=0, only do for MPI" maxlog=1 @@ -133,6 +152,9 @@ function distribute_tasks!(queue::DataDepsTaskQueue) # Get the set of all processors to be scheduled on scope = get_compute_scope() accel = current_acceleration() + if accel isa MPIAcceleration + service_aliasing_requests(accel.comm) + end accel_procs = filter(procs(Dagger.Sch.eager_context())) do proc Dagger.accel_matches_proc(accel, proc) end @@ -143,7 +165,9 @@ function distribute_tasks!(queue::DataDepsTaskQueue) if isempty(all_procs) throw(Sch.SchedulingException("No processors available, try widening scope")) end + all_scope = UnionScope(map(ExactScope, all_procs)...) exec_spaces = unique(vcat(map(proc->collect(memory_spaces(proc)), all_procs)...)) + DATADEPS_EXEC_SPACES[] = exec_spaces #=if !all(space->space isa CPURAMMemorySpace, exec_spaces) && !all(space->root_worker_id(space) == myid(), exec_spaces) @warn "Datadeps support for multi-GPU, multi-worker is currently broken\nPlease be prepared for incorrect results or errors" maxlog=1 end=# @@ -207,19 +231,29 @@ function distribute_tasks!(queue::DataDepsTaskQueue) # Start launching tasks and necessary copies write_num = 1 - proc_idx = 1 - #pressures = Dict{Processor,Int}() proc_to_scope_lfu = BasicLFUCache{Processor,AbstractScope}(1024) for pair in queue.seen_tasks[task_order] spec = pair.spec task = pair.task - write_num, proc_idx = distribute_task!(queue, state, all_procs, spec, task, spec.fargs, proc_to_scope_lfu, write_num, proc_idx) + write_num = distribute_task!(queue, state, all_procs, all_scope, spec, task, spec.fargs, proc_to_scope_lfu, write_num) end # Copy args from remote to local # N.B. 
We sort the keys to ensure a deterministic order for uniformity check_uniform(length(state.arg_owner)) + # #region agent log + if accel isa MPIAcceleration + try + open("/flare/dagger/fdadagger/.cursor/debug-757b3d.log", "a") do io + println(io, "{\"sessionId\":\"757b3d\",\"hypothesisId\":\"H7\",\"location\":\"queue.jl:copy_from_phase\",\"message\":\"Starting copy-from phase\",\"data\":{\"rank\":$(MPI.Comm_rank(accel.comm)),\"arg_owner_count\":$(length(state.arg_owner))},\"timestamp\":$(round(Int,time()*1000))}") + end + catch; end + end + # #endregion for arg_w in sort(collect(keys(state.arg_owner)); by=arg_w->arg_w.hash) + if accel isa MPIAcceleration + service_aliasing_requests(accel.comm) + end check_uniform(arg_w) arg = arg_w.arg origin_space = state.arg_origin[arg] @@ -259,9 +293,25 @@ struct TypedDataDepsTaskArgument{T,N} end map_or_ntuple(f, xs::Vector) = map(f, 1:length(xs)) map_or_ntuple(f, xs::Tuple) = ntuple(f, length(xs)) -function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_procs, spec::DTaskSpec{typed}, task::DTask, fargs, proc_to_scope_lfu, write_num::Int, proc_idx::Int) where typed +function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_procs, all_scope, spec::DTaskSpec{typed}, task::DTask, fargs, proc_to_scope_lfu, write_num::Int) where typed @specialize spec fargs + accel = current_acceleration() + if accel isa MPIAcceleration + service_aliasing_requests(accel.comm) + end + + # #region agent log + r = accel isa MPIAcceleration ? 
MPI.Comm_rank(accel.comm) : -1 + if accel isa MPIAcceleration + try + open("/flare/dagger/fdadagger/.cursor/debug-757b3d.log", "a") do io + println(io, "{\"sessionId\":\"757b3d\",\"hypothesisId\":\"H7\",\"location\":\"queue.jl:distribute_task_entry\",\"message\":\"distribute_task entry\",\"data\":{\"rank\":$r,\"task_id\":$(task.id)},\"timestamp\":$(round(Int,time()*1000))}") + end + catch; end + end + # #endregion + DATADEPS_CURRENT_TASK[] = task if typed @@ -270,143 +320,30 @@ function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_pr fargs::Vector{Argument} end + task_scope = @something(spec.options.compute_scope, spec.options.scope, DefaultScope()) scheduler = queue.scheduler - if scheduler == :naive - raw_args = map(arg->tochunk(value(arg)), spec.fargs) - our_proc = remotecall_fetch(1, all_procs, raw_args) do all_procs, raw_args - Sch.init_eager() - sch_state = Sch.EAGER_STATE[] - - @lock sch_state.lock begin - # Calculate costs per processor and select the most optimal - # FIXME: This should consider any already-allocated slots, - # whether they are up-to-date, and if not, the cost of moving - # data to them - procs, costs = Sch.estimate_task_costs(sch_state, all_procs, nothing, raw_args) - return first(procs) - end - end - elseif scheduler == :smart - raw_args = map(filter(arg->haskey(state.data_locality, value(arg)), spec.fargs)) do arg - arg_chunk = tochunk(value(arg)) - # Only the owned slot is valid - # FIXME: Track up-to-date copies and pass all of those - return arg_chunk => data_locality[arg] - end - f_chunk = tochunk(value(spec.fargs[1])) - our_proc, task_pressure = remotecall_fetch(1, all_procs, pressures, f_chunk, raw_args) do all_procs, pressures, f, chunks_locality - Sch.init_eager() - sch_state = Sch.EAGER_STATE[] - - @lock sch_state.lock begin - tx_rate = sch_state.transfer_rate[] - - costs = Dict{Processor,Float64}() - for proc in all_procs - # Filter out chunks that are already local - chunks_filt = 
Iterators.filter(((chunk, space)=chunk_locality)->!(proc in processors(space)), chunks_locality) - - # Estimate network transfer costs based on data size - # N.B. `affinity(x)` really means "data size of `x`" - # N.B. We treat same-worker transfers as having zero transfer cost - tx_cost = Sch.impute_sum(affinity(chunk)[2] for chunk in chunks_filt) - - # Estimate total cost to move data and get task running after currently-scheduled tasks - est_time_util = get(pressures, proc, UInt64(0)) - costs[proc] = est_time_util + (tx_cost/tx_rate) - end - # Look up estimated task cost - sig = Sch.signature(sch_state, f, map(first, chunks_locality)) - task_pressure = get(sch_state.signature_time_cost, sig, 1000^3) - - # Shuffle procs around, so equally-costly procs are equally considered - P = randperm(length(all_procs)) - procs = getindex.(Ref(all_procs), P) - - # Sort by lowest cost first - sort!(procs, by=p->costs[p]) - - best_proc = first(procs) - return best_proc, task_pressure - end - end - # FIXME: Pressure should be decreased by pressure of syncdeps on same processor - pressures[our_proc] = get(pressures, our_proc, UInt64(0)) + task_pressure - elseif scheduler == :ultra - args = Base.mapany(spec.fargs) do arg - pos, data = arg - data, _ = unwrap_inout(data) - if data isa DTask - data = fetch(data; move_value=false, unwrap=false) - end - return pos => tochunk(data) - end - f_chunk = tochunk(value(spec.fargs[1])) - task_time = remotecall_fetch(1, f_chunk, args) do f, args - Sch.init_eager() - sch_state = Sch.EAGER_STATE[] - return @lock sch_state.lock begin - sig = Sch.signature(sch_state, f, args) - return get(sch_state.signature_time_cost, sig, 1000^3) - end - end - - # FIXME: Copy deps are computed eagerly - deps = @something(spec.options.syncdeps, Set{Any}()) - - # Find latest time-to-completion of all syncdeps - deps_completed = UInt64(0) - for dep in deps - haskey(sstate.task_completions, dep) || continue # copy deps aren't recorded - deps_completed = 
max(deps_completed, sstate.task_completions[dep]) - end - - # Find latest time-to-completion of each memory space - # FIXME: Figure out space completions based on optimal packing - spaces_completed = Dict{MemorySpace,UInt64}() - for space in exec_spaces - completed = UInt64(0) - for (task, other_space) in sstate.assignments - space == other_space || continue - completed = max(completed, sstate.task_completions[task]) - end - spaces_completed[space] = completed - end - - # Choose the earliest-available memory space and processor - # FIXME: Consider move time - move_time = UInt64(0) - local our_space_completed - while true - our_space_completed, our_space = findmin(spaces_completed) - our_space_procs = filter(proc->proc in all_procs, processors(our_space)) - if isempty(our_space_procs) - delete!(spaces_completed, our_space) - continue - end - our_proc = rand(our_space_procs) - break - end - - sstate.task_to_spec[task] = spec - sstate.assignments[task] = our_space - sstate.task_completions[task] = our_space_completed + move_time + task_time - elseif scheduler == :roundrobin - our_proc = all_procs[proc_idx] - else - error("Invalid scheduler: $sched") - end + # Use datadeps_schedule_task (master API) + our_proc = datadeps_schedule_task(scheduler, state, all_procs, all_scope, task_scope, spec, task) @assert our_proc in all_procs our_space = only(memory_spaces(our_proc)) + # #region agent log + if accel isa MPIAcceleration + proc_rank = our_proc isa Dagger.MPIProcessor ? our_proc.rank : (our_proc isa Dagger.MPIOSProc ? 
our_proc.rank : -1) + try + open("/flare/dagger/fdadagger/.cursor/debug-757b3d.log", "a") do io + println(io, "{\"sessionId\":\"757b3d\",\"hypothesisId\":\"H7\",\"location\":\"queue.jl:distribute_task_scheduled\",\"message\":\"task scheduled\",\"data\":{\"rank\":$r,\"task_id\":$(task.id),\"our_proc_rank\":$proc_rank},\"timestamp\":$(round(Int,time()*1000))}") + end + catch; end + end + # #endregion # Find the scope for this task (and its copies) - task_scope = @something(spec.options.compute_scope, spec.options.scope, DefaultScope()) - if task_scope == scope + if task_scope == all_scope # Optimize for the common case, cache the proc=>scope mapping our_scope = get!(proc_to_scope_lfu, our_proc) do our_procs = filter(proc->proc in all_procs, collect(processors(our_space))) - return constrain(UnionScope(map(ExactScope, our_procs)...), scope) + return constrain(UnionScope(map(ExactScope, our_procs)...), all_scope) end else # Use the provided scope and constrain it to the available processors @@ -443,6 +380,10 @@ function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_pr # Copy args from local to remote remote_args = map_or_ntuple(task_arg_ws) do idx + if accel isa MPIAcceleration + service_aliasing_requests(accel.comm) + end + arg_ws = task_arg_ws[idx] arg = arg_ws.arg pos = raw_position(arg_ws.pos) @@ -563,7 +504,6 @@ function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_pr end write_num += 1 - proc_idx = mod1(proc_idx + 1, length(all_procs)) - return write_num, proc_idx + return write_num end diff --git a/src/datadeps/scheduling.jl b/src/datadeps/scheduling.jl index 0bf9818f6..d883c687c 100644 --- a/src/datadeps/scheduling.jl +++ b/src/datadeps/scheduling.jl @@ -1,5 +1,10 @@ +export DataDepsScheduler, RoundRobinScheduler, NaiveScheduler, UltraScheduler + abstract type DataDepsScheduler end +# Set by distribute_tasks! 
before the task loop for UltraScheduler +const DATADEPS_EXEC_SPACES = Ref{Union{Vector{<:MemorySpace},Nothing}}(nothing) + mutable struct RoundRobinScheduler <: DataDepsScheduler proc_idx::Int RoundRobinScheduler() = new(1) @@ -79,7 +84,7 @@ function datadeps_schedule_task(sched::UltraScheduler, state::DataDepsState, all end # FIXME: Copy deps are computed eagerly - deps = @something(spec.options.syncdeps, Set{ThunkSyncdep}()) + deps = @something(spec.options.syncdeps, Set{Any}()) # Find latest time-to-completion of all syncdeps deps_completed = UInt64(0) @@ -90,6 +95,8 @@ function datadeps_schedule_task(sched::UltraScheduler, state::DataDepsState, all # Find latest time-to-completion of each memory space # FIXME: Figure out space completions based on optimal packing + # exec_spaces is set by distribute_tasks! before the task loop + exec_spaces = something(DATADEPS_EXEC_SPACES[], unique(vcat(map(proc->collect(memory_spaces(proc)), all_procs)...))) spaces_completed = Dict{MemorySpace,UInt64}() for space in exec_spaces completed = UInt64(0) diff --git a/src/memory-spaces.jl b/src/memory-spaces.jl index dd9b8dc3f..a41e91d4b 100644 --- a/src/memory-spaces.jl +++ b/src/memory-spaces.jl @@ -24,6 +24,7 @@ struct CPURAMMemorySpace <: MemorySpace owner::Int end root_worker_id(space::CPURAMMemorySpace) = space.owner +root_worker_id(c::Chunk) = root_worker_id(c.handle) CPURAMMemorySpace() = CPURAMMemorySpace(myid()) @@ -136,37 +137,7 @@ end may_alias(::MemorySpace, ::MemorySpace) = true may_alias(space1::CPURAMMemorySpace, space2::CPURAMMemorySpace) = space1.owner == space2.owner -struct RemotePtr{T,S<:MemorySpace} <: Ref{T} - addr::UInt - space::S -end -RemotePtr{T}(addr::UInt, space::S) where {T,S} = RemotePtr{T,S}(addr, space) -RemotePtr{T}(ptr::Ptr{V}, space::S) where {T,V,S} = RemotePtr{T,S}(UInt(ptr), space) -RemotePtr{T}(ptr::Ptr{V}) where {T,V} = RemotePtr{T}(UInt(ptr), CPURAMMemorySpace(myid())) -# FIXME: Don't hardcode CPURAMMemorySpace -RemotePtr(addr::UInt) = 
RemotePtr{Cvoid}(addr, CPURAMMemorySpace(myid())) -Base.convert(::Type{RemotePtr}, x::Ptr{T}) where T = - RemotePtr(UInt(x), CPURAMMemorySpace(myid())) -Base.convert(::Type{<:RemotePtr{V}}, x::Ptr{T}) where {V,T} = - RemotePtr{V}(UInt(x), CPURAMMemorySpace(myid())) -Base.convert(::Type{UInt}, ptr::RemotePtr) = ptr.addr -Base.:+(ptr::RemotePtr{T}, offset::Integer) where T = RemotePtr{T}(ptr.addr + offset, ptr.space) -Base.:-(ptr::RemotePtr{T}, offset::Integer) where T = RemotePtr{T}(ptr.addr - offset, ptr.space) -function Base.isless(ptr1::RemotePtr, ptr2::RemotePtr) - @assert ptr1.space == ptr2.space - return ptr1.addr < ptr2.addr -end - -struct MemorySpan{S} - ptr::RemotePtr{Cvoid,S} - len::UInt -end -MemorySpan(ptr::RemotePtr{Cvoid,S}, len::Integer) where S = - MemorySpan{S}(ptr, UInt(len)) -MemorySpan{S}(addr::UInt, len::Integer) where S = - MemorySpan{S}(RemotePtr{Cvoid,S}(addr), UInt(len)) -Base.isless(a::MemorySpan, b::MemorySpan) = a.ptr < b.ptr -Base.isempty(x::MemorySpan) = x.len == 0 +# RemotePtr and MemorySpan are defined in utils/memory-span.jl (included earlier) abstract type AbstractAliasing end memory_spans(::T) where T<:AbstractAliasing = throw(ArgumentError("Must define `memory_spans` for `$T`")) memory_spans(x) = memory_spans(aliasing(x)) @@ -454,34 +425,4 @@ function will_alias(x_span::MemorySpan, y_span::MemorySpan) return x_span.ptr <= y_end && y_span.ptr <= x_end end -### More space-efficient memory spans - -struct LocalMemorySpan - ptr::UInt - len::UInt -end -LocalMemorySpan(span::MemorySpan) = LocalMemorySpan(span.ptr.addr, span.len) -Base.isempty(x::LocalMemorySpan) = x.len == 0 - -# FIXME: Store the length separately, since it's shared by all spans -struct ManyMemorySpan{N} - spans::NTuple{N,LocalMemorySpan} -end -Base.isempty(x::ManyMemorySpan) = all(isempty, x.spans) - -struct ManyPair{N} <: Unsigned - pairs::NTuple{N,UInt} -end -Base.promote_rule(::Type{ManyPair}, ::Type{T}) where {T<:Integer} = ManyPair 
-Base.convert(::Type{ManyPair{N}}, x::T) where {T<:Integer,N} = ManyPair(ntuple(i -> x, N)) -Base.convert(::Type{ManyPair}, x::ManyPair) = x -Base.:+(x::ManyPair{N}, y::ManyPair{N}) where N = ManyPair(ntuple(i -> x.pairs[i] + y.pairs[i], N)) -Base.:-(x::ManyPair{N}, y::ManyPair{N}) where N = ManyPair(ntuple(i -> x.pairs[i] - y.pairs[i], N)) -Base.:-(x::ManyPair) = error("Can't negate a ManyPair") -Base.:(==)(x::ManyPair, y::ManyPair) = x.pairs == y.pairs -Base.isless(x::ManyPair, y::ManyPair) = x.pairs[1] < y.pairs[1] -Base.:(<)(x::ManyPair, y::ManyPair) = x.pairs[1] < y.pairs[1] -Base.string(x::ManyPair) = "ManyPair($(x.pairs))" - -ManyMemorySpan{N}(start::ManyPair{N}, len::ManyPair{N}) where N = - ManyMemorySpan{N}(ntuple(i -> LocalMemorySpan(start.pairs[i], len.pairs[i]), N)) +# LocalMemorySpan, ManyMemorySpan, ManyPair are defined in utils/memory-span.jl (included earlier) diff --git a/src/mpi.jl b/src/mpi.jl index 4b85122b9..bb4d55ab9 100644 --- a/src/mpi.jl +++ b/src/mpi.jl @@ -49,21 +49,38 @@ struct MPIAcceleration <: Acceleration end MPIAcceleration() = MPIAcceleration(MPI.COMM_WORLD) +const ALIASING_REQUEST_TAG = UInt32(0xFF00) + +# Deterministic tag for data movement (remotecall_endpoint). Uses a separate tag space +# so it cannot collide with the global to_tag() counter used by execute! / aliasing. +# Prevents symmetric deadlock when one rank blocks in remotecall_endpoint recv while +# another rank consumes the same counter value in execute! recv. 
+const REMOTECALL_TAG_BASE = 100_000 +const REMOTECALL_TAG_RANGE = 424_287 # so base+range-1 <= typical tag_ub +function remotecall_tag(comm::MPI.Comm, uid, from_rank::Int, to_rank::Int, ref_id) + tag_ub = Int(MPI.tag_ub()) + range = min(REMOTECALL_TAG_RANGE, max(1, tag_ub - REMOTECALL_TAG_BASE + 1)) + h = hash((uid, from_rank, to_rank, ref_id)) + tag = REMOTECALL_TAG_BASE + Int(rem(h, UInt(range))) + return UInt32(tag) +end + function aliasing(accel::MPIAcceleration, x::Chunk, T) handle = x.handle::MPIRef @assert accel.comm == handle.comm "MPIAcceleration comm mismatch" - tag = to_tag() - check_uniform(tag) rank = MPI.Comm_rank(accel.comm) if handle.rank == rank ainfo = aliasing(x, T) - #Core.print("[$rank] aliasing: $ainfo, sending\n") - @opcounter :aliasing_bcast_send_yield - bcast_send_yield(ainfo, accel.comm, handle.rank, tag) - else - #Core.print("[$rank] aliasing: receiving from $(handle.rank)\n") - ainfo = recv_yield(accel.comm, handle.rank, tag) - #Core.print("[$rank] aliasing: received $ainfo\n") + check_uniform(ainfo) + return ainfo + end + response_tag = to_tag() + check_uniform(response_tag) + request_payload = (handle, T, response_tag) + _send_yield_raw(request_payload, accel.comm, handle.rank, Int(ALIASING_REQUEST_TAG)) + ainfo = recv_yield(accel.comm, handle.rank, response_tag) + if ainfo isa Exception + throw(ainfo) end check_uniform(ainfo) return ainfo @@ -251,6 +268,8 @@ default_processor(space::MPIMemorySpace) = MPIOSProc(space.comm, space.rank) default_memory_space(accel::MPIAcceleration) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, 0) default_memory_space(accel::MPIAcceleration, x) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, 0) +default_memory_space(accel::MPIAcceleration, x::Array) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, MPI.Comm_rank(accel.comm)) +default_memory_space(accel::MPIAcceleration, x::AliasedObjectCacheStore) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, MPI.Comm_rank(accel.comm)) 
default_memory_space(accel::MPIAcceleration, x::Chunk) = MPIMemorySpace(CPURAMMemorySpace(myid()), x.handle.comm, x.handle.rank) default_memory_space(accel::MPIAcceleration, x::Function) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, MPI.Comm_rank(accel.comm)) default_memory_space(accel::MPIAcceleration, T::Type) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, MPI.Comm_rank(accel.comm)) @@ -332,29 +351,166 @@ function affinity(x::MPIRef) end end +const MPIREF_ORPHAN = Threads.Atomic{Int}(1) + function take_ref_id!() tid = 0 uid = 0 id = 0 + _branch = "" if Dagger.in_task() tid = sch_handle().thunk_id.id uid = 0 counter = get!(MPIREF_TID, tid, Threads.Atomic{Int}(1)) id = Threads.atomic_add!(counter, 1) + _branch = "in_task" elseif MPI_TID[] != 0 tid = MPI_TID[] uid = 0 counter = get!(MPIREF_TID, tid, Threads.Atomic{Int}(1)) id = Threads.atomic_add!(counter, 1) + _branch = "MPI_TID" elseif MPI_UID[] != 0 tid = 0 uid = MPI_UID[] counter = get!(MPIREF_UID, uid, Threads.Atomic{Int}(1)) id = Threads.atomic_add!(counter, 1) + _branch = "MPI_UID" + else + tid = 0 + uid = Int(Threads.atomic_add!(MPIREF_ORPHAN, 1)) + counter = get!(MPIREF_UID, uid, Threads.Atomic{Int}(1)) + id = Threads.atomic_add!(counter, 1) + _branch = "orphan" end return MPIRefID(tid, uid, id) end +const MPIREF_REGISTRY = Base.Lockable(Dict{MPIRefID, DRef}()) + +const ALIASING_PENDING = Vector{Tuple{MPIRef, Any, UInt32, Int}}() + +""" +Service any pending aliasing requests where we are the owner. +Called from recv_yield loops to avoid deadlock when a requester is blocking +waiting for aliasing from us while we're blocked waiting for someone else. 
+""" +# #region agent log +const _SAR_ACTIVE = Threads.Atomic{Int}(0) +# #endregion +const _SAR_OUTBOX = Vector{Tuple{Any, MPI.Comm, Int, UInt32}}() + +const _SAR_STATE = Ref{Symbol}(:idle) +const _SAR_LEN_BUF = Int64[0] +const _SAR_REQ = Ref{Union{Nothing, MPI.Request}}(nothing) +const _SAR_DATA_BUF = Ref{Vector{UInt8}}(UInt8[]) +const _SAR_SRC = Ref{Int}(0) + +function service_aliasing_requests(comm::MPI.Comm) + _prev = Threads.atomic_add!(_SAR_ACTIVE, 1) + if _prev > 0 + Threads.atomic_sub!(_SAR_ACTIVE, 1) + return + end + + rank = MPI.Comm_rank(comm) + + if !isempty(ALIASING_PENDING) + still_pending = Tuple{MPIRef, Any, UInt32, Int}[] + for (handle, dep_mod, response_tag, src) in ALIASING_PENDING + inner_ref = lock(MPIREF_REGISTRY) do reg + get(reg, handle.id, nothing) + end + if inner_ref !== nothing + value = poolget(inner_ref) + ainfo = aliasing(value, dep_mod) + push!(_SAR_OUTBOX, (ainfo, comm, src, response_tag)) + else + push!(still_pending, (handle, dep_mod, response_tag, src)) + end + end + empty!(ALIASING_PENDING) + append!(ALIASING_PENDING, still_pending) + end + + while true + if _SAR_STATE[] == :idle + _SAR_LEN_BUF[1] = 0 + _SAR_REQ[] = MPI.Irecv!(MPI.Buffer(_SAR_LEN_BUF), comm; + source=Int(MPI.API.MPI_ANY_SOURCE[]), tag=Int(ALIASING_REQUEST_TAG)) + _SAR_STATE[] = :wait_len + end + + if _SAR_STATE[] == :wait_len + done, status = MPI.Test(_SAR_REQ[], MPI.Status) + if !done + break + end + _SAR_SRC[] = MPI.Get_source(status) + nbytes = _SAR_LEN_BUF[1] + _SAR_DATA_BUF[] = Array{UInt8}(undef, nbytes) + _SAR_REQ[] = MPI.Irecv!(MPI.Buffer(_SAR_DATA_BUF[]), comm; + source=_SAR_SRC[], tag=Int(ALIASING_REQUEST_TAG)) + _SAR_STATE[] = :wait_data + end + + if _SAR_STATE[] == :wait_data + done, status = MPI.Test(_SAR_REQ[], MPI.Status) + if !done + break + end + payload = MPI.deserialize(_SAR_DATA_BUF[]) + # (SAR recv log removed to reduce noise) + (handle::MPIRef, dep_mod, response_tag::UInt32) = payload + if handle.rank == rank + inner_ref = handle.innerRef + 
if inner_ref === nothing + inner_ref = lock(MPIREF_REGISTRY) do reg + get(reg, handle.id, nothing) + end + end + if inner_ref !== nothing + value = poolget(inner_ref) + ainfo = aliasing(value, dep_mod) + push!(_SAR_OUTBOX, (ainfo, comm, _SAR_SRC[], response_tag)) + else + push!(ALIASING_PENDING, (handle, dep_mod, response_tag, _SAR_SRC[])) + end + end + _SAR_STATE[] = :idle + continue + end + end + + while !isempty(_SAR_OUTBOX) + (ainfo, _comm, dest, rtag) = popfirst!(_SAR_OUTBOX) + _send_outbox_response(ainfo, _comm, dest, Int(rtag)) + end + + Threads.atomic_sub!(_SAR_ACTIVE, 1) +end + +function _send_outbox_response(value, comm, dest, tag) + buf = MPI.serialize(value) + len_buf = Int64[length(buf)] + lock(SEND_SERIALIZE_LOCK) do + GC.@preserve buf len_buf begin + req_len = MPI.Isend(len_buf, comm; dest, tag) + while true + finish, _ = MPI.Test(req_len, MPI.Status) + finish && break + yield() + end + req_data = MPI.Isend(buf, comm; dest, tag) + while true + finish, _ = MPI.Test(req_data, MPI.Status) + finish && break + yield() + end + end + end +end + #TODO: partitioned scheduling with comm bifurcation function tochunk_pset(x, space::MPIMemorySpace; device=nothing, kwargs...) @assert space.comm == MPI.COMM_WORLD "$(space.comm) != $(MPI.COMM_WORLD)" @@ -363,15 +519,29 @@ function tochunk_pset(x, space::MPIMemorySpace; device=nothing, kwargs...) if local_rank != space.rank return MPIRef(space.comm, space.rank, 0, nothing, Mid) else - return MPIRef(space.comm, space.rank, sizeof(x), poolset(x; device, kwargs...), Mid) + innerRef = poolset(x; device, kwargs...) 
+ lock(MPIREF_REGISTRY) do reg + reg[Mid] = innerRef + end + return MPIRef(space.comm, space.rank, sizeof(x), innerRef, Mid) end end const DEADLOCK_DETECT = TaskLocalValue{Bool}(()->true) const DEADLOCK_WARN_PERIOD = TaskLocalValue{Float64}(()->10.0) const DEADLOCK_TIMEOUT_PERIOD = TaskLocalValue{Float64}(()->60.0) +# When true, __wait_for_request spins without yield so remotecall sender completes before other tasks run. +const REMOTECALL_SENDER_NO_YIELD = TaskLocalValue{Bool}(()->false) const RECV_WAITING = Base.Lockable(Dict{Tuple{MPI.Comm, Int, Int}, Base.Event}()) +# MPI_ANY_TAG + queue: pool of Irecv(ANY_SOURCE, ANY_TAG) and completion queue keyed by (comm, source, tag). +const RECV_POOL_SIZE = 64 +const _RECV_POOL = Dict{MPI.Comm, Any}() # comm -> RecvPoolState +const _RECV_POOL_LOCK = ReentrantLock() + +# Completion queue: (comm, source, tag) -> list of received values (one waiter per key at a time). +const _RECV_COMPLETION_QUEUE = Base.Lockable(Dict{Tuple{MPI.Comm, Int, Int}, Vector{Any}}()) + struct InplaceInfo type::DataType shape::Tuple @@ -385,6 +555,52 @@ struct InplaceSparseInfo nzval::Int end +# MPI.Buffer uses Int32 for count; reject corrupt or oversized length to avoid InexactError. +const MAX_SERIALIZED_RECV_LENGTH = Int64(typemax(Int32)) + +# Per-slot state for the recv pool. Phase: :waiting_length | :waiting_data | :waiting_inplace_* | :idle (slot free). +mutable struct RecvPoolSlot + phase::Symbol + comm::MPI.Comm + source::Int + tag::Int + len_buf::Vector{Int64} + req::Union{MPI.Request, Nothing} + data_buf::Vector{UInt8} + # Inplace: for InplaceInfo we store the array buffer; for InplaceSparseInfo we store (colptr, rowval, nzval) as we go. 
+ inplace_info::Union{InplaceInfo, InplaceSparseInfo, Nothing} + inplace_bufs::Vector{Any} # accumulated inplace arrays +end + +mutable struct RecvPoolState + slots::Vector{RecvPoolSlot} + initialized::Bool +end + +function _recv_pool_for_comm(comm::MPI.Comm) + lock(_RECV_POOL_LOCK) do + if !haskey(_RECV_POOL, comm) + _RECV_POOL[comm] = RecvPoolState(RecvPoolSlot[], false) + end + return _RECV_POOL[comm] + end +end + +const _MPI_ANY_SOURCE = Int(MPI.API.MPI_ANY_SOURCE[]) +const _MPI_ANY_TAG = Int(MPI.API.MPI_ANY_TAG[]) + +function _recv_pool_init!(pool::RecvPoolState, comm::MPI.Comm) + pool.initialized && return + rank = MPI.Comm_rank(comm) + for i in 1:RECV_POOL_SIZE + len_buf = Int64[0] + req = MPI.Irecv!(MPI.Buffer(len_buf), comm; source=_MPI_ANY_SOURCE, tag=_MPI_ANY_TAG) + slot = RecvPoolSlot(:waiting_length, comm, -1, -1, len_buf, req, UInt8[], nothing, Any[]) + push!(pool.slots, slot) + end + pool.initialized = true +end + function supports_inplace_mpi(value) if value isa DenseArray && isbitstype(eltype(value)) return true @@ -394,74 +610,74 @@ function supports_inplace_mpi(value) end function recv_yield!(buffer, comm, src, tag) rank = MPI.Comm_rank(comm) - #Core.println("buffer recv: $buffer, type of buffer: $(typeof(buffer)), is in place? $(supports_inplace_mpi(buffer))") if !supports_inplace_mpi(buffer) return recv_yield(comm, src, tag), false end - #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Starting recv! 
from [$src]") - - # Ensure no other receiver is waiting - our_event = Base.Event() - @label retry - other_event = lock(RECV_WAITING) do waiting - if haskey(waiting, (comm, src, tag)) - waiting[(comm, src, tag)] - else - waiting[(comm, src, tag)] = our_event - nothing - end - end - if other_event !== nothing - #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Waiting for other receiver...") - wait(other_event) - @goto retry - end - - buffer = recv_yield_inplace!(buffer, comm, rank, src, tag) - - lock(RECV_WAITING) do waiting - delete!(waiting, (comm, src, tag)) - notify(our_event) - end - + # Inplace: sender uses InplaceInfo+array (length-prefix), pool assembles and queues the array; copy to user buffer. + value = recv_yield(comm, src, tag) + copy!(buffer, value) return buffer, true - end function recv_yield(comm, src, tag) rank = MPI.Comm_rank(comm) - #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Starting recv from [$src]") + key = (comm, src, tag) + + # Check completion queue first (message may already have been received by the pool). + value = lock(_RECV_COMPLETION_QUEUE) do q + if haskey(q, key) && !isempty(q[key]) + ref = popfirst!(q[key]) + isempty(q[key]) && delete!(q, key) + return poolget(ref) + end + return nothing + end + if value !== nothing + return value + end - # Ensure no other receiver is waiting + # Ensure no other receiver is waiting for this (comm, src, tag). 
our_event = Base.Event() @label retry other_event = lock(RECV_WAITING) do waiting - if haskey(waiting, (comm, src, tag)) - waiting[(comm, src, tag)] + if haskey(waiting, key) + waiting[key] else - waiting[(comm, src, tag)] = our_event + waiting[key] = our_event nothing end end if other_event !== nothing - #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Waiting for other receiver...") wait(other_event) @goto retry end - #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Receiving...") - type = nothing - @label receive - value = recv_yield_serialized(comm, rank, src, tag) - if value isa InplaceInfo || value isa InplaceSparseInfo - value = recv_yield_inplace(value, comm, rank, src, tag) + # Loop: drain pool, check queue, service aliasing, deadlock detect, yield. + time_start = time_ns() + detect = DEADLOCK_DETECT[] + warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) + timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) + @label wait_loop + service_recv_pool(comm) + value = lock(_RECV_COMPLETION_QUEUE) do q + if haskey(q, key) && !isempty(q[key]) + ref = popfirst!(q[key]) + isempty(q[key]) && delete!(q, key) + return poolget(ref) + end + return nothing end - - lock(RECV_WAITING) do waiting - delete!(waiting, (comm, src, tag)) - notify(our_event) + if value !== nothing + lock(RECV_WAITING) do waiting + delete!(waiting, key) + notify(our_event) + end + return value end - return value + service_aliasing_requests(comm) + warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, rank, tag, "recv", src) + yield() + @goto wait_loop end function recv_yield_inplace!(array, comm, my_rank, their_rank, tag) @@ -470,19 +686,17 @@ function recv_yield_inplace!(array, comm, my_rank, their_rank, tag) warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) + req = MPI.Irecv!(MPI.Buffer(array), comm; source=their_rank, tag=tag) while true - (got, msg, stat) = 
MPI.Improbe(their_rank, tag, comm, MPI.Status) - if got - if MPI.Get_error(stat) != MPI.SUCCESS - error("recv_yield failed with error $(MPI.Get_error(stat))") + finish, status = MPI.Test(req, MPI.Status) + if finish + if MPI.Get_error(status) != MPI.SUCCESS + error("recv_yield failed with error $(MPI.Get_error(status))") end - count = MPI.Get_count(stat, UInt8) - @assert count == sizeof(array) "recv_yield_inplace: expected $(sizeof(array)) bytes, got $count" - buf = MPI.Buffer(array) - req = MPI.Imrecv!(buf, msg) - __wait_for_request(req, comm, my_rank, their_rank, tag, "recv_yield", "recv") return array end + service_recv_pool(comm) + service_aliasing_requests(comm) warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, my_rank, tag, "recv", their_rank) yield() end @@ -506,30 +720,228 @@ function recv_yield_inplace(_value::InplaceSparseInfo, comm, my_rank, their_rank return SparseMatrixCSC{eltype(T), Int64}(_value.m, _value.n, colptr, rowval, nzval) end +""" +Drain the recv pool: Test each slot's request; when complete, advance the state machine +(length -> data -> optional inplace -> ready). Push completed messages to the completion +queue and notify waiters. Replenish slots with new Irecv(ANY_SOURCE, ANY_TAG) for length. +Called from recv_yield's wait loop and from __wait_for_request (and recv_yield_inplace!). 
+""" +function service_recv_pool(comm::MPI.Comm) + pool = _recv_pool_for_comm(comm) + _recv_pool_init!(pool, comm) + rank = MPI.Comm_rank(comm) + + for slot in pool.slots + slot.req === nothing && continue + done, status = MPI.Test(slot.req, MPI.Status) + if !done + continue + end + if MPI.Get_error(status) != MPI.SUCCESS + error("recv pool slot failed with error $(MPI.Get_error(status))") + end + + if slot.phase == :waiting_length + slot.source = MPI.Get_source(status) + slot.tag = MPI.Get_tag(status) + count = slot.len_buf[1] + if count < 0 || count > MAX_SERIALIZED_RECV_LENGTH + error("recv pool: invalid or corrupt length $count (max $(MAX_SERIALIZED_RECV_LENGTH)); source=$(slot.source), tag=$(slot.tag)") + end + slot.data_buf = Array{UInt8}(undef, count) + slot.req = MPI.Irecv!(MPI.Buffer(slot.data_buf), comm; source=slot.source, tag=slot.tag) + slot.phase = :waiting_data + continue + end + + if slot.phase == :waiting_data + value = MPI.deserialize(slot.data_buf) + if slot.tag == Int(ALIASING_REQUEST_TAG) + # Hand off to aliasing path (same as service_aliasing_requests). 
+ (handle::MPIRef, dep_mod, response_tag::UInt32) = value + if handle.rank == rank + inner_ref = handle.innerRef + if inner_ref === nothing + inner_ref = lock(MPIREF_REGISTRY) do reg + get(reg, handle.id, nothing) + end + end + if inner_ref !== nothing + v = poolget(inner_ref) + ainfo = aliasing(v, dep_mod) + push!(_SAR_OUTBOX, (ainfo, comm, slot.source, response_tag)) + else + push!(ALIASING_PENDING, (handle, dep_mod, response_tag, slot.source)) + end + end + _recv_pool_slot_reset!(slot, comm) + continue + end + + if value isa InplaceInfo + T = value.type + @assert T <: Array && isbitstype(eltype(T)) + arr = Array{eltype(T)}(undef, value.shape) + slot.inplace_info = value + slot.inplace_bufs = [arr] + slot.req = MPI.Irecv!(MPI.Buffer(arr), comm; source=slot.source, tag=slot.tag) + slot.phase = :waiting_inplace_dense + continue + end + + if value isa InplaceSparseInfo + slot.inplace_info = value + colptr_buf = Vector{Int64}(undef, value.colptr) + slot.inplace_bufs = [colptr_buf] + slot.req = MPI.Irecv!(MPI.Buffer(colptr_buf), comm; source=slot.source, tag=slot.tag) + slot.phase = :waiting_inplace_colptr + continue + end + + # Serialized value complete. 
+ _recv_pool_push_and_reset!(slot, comm, value) + continue + end + + if slot.phase == :waiting_inplace_dense + arr = slot.inplace_bufs[1] + _recv_pool_push_and_reset!(slot, comm, arr) + continue + end + + if slot.phase == :waiting_inplace_colptr + sp = slot.inplace_info::InplaceSparseInfo + rowval_buf = Vector{Int64}(undef, sp.rowval) + push!(slot.inplace_bufs, rowval_buf) + slot.req = MPI.Irecv!(MPI.Buffer(rowval_buf), comm; source=slot.source, tag=slot.tag) + slot.phase = :waiting_inplace_rowval + continue + end + + if slot.phase == :waiting_inplace_rowval + sp = slot.inplace_info::InplaceSparseInfo + nzval_buf = Vector{eltype(sp.type)}(undef, sp.nzval) + push!(slot.inplace_bufs, nzval_buf) + slot.req = MPI.Irecv!(MPI.Buffer(nzval_buf), comm; source=slot.source, tag=slot.tag) + slot.phase = :waiting_inplace_nzval + continue + end + + if slot.phase == :waiting_inplace_nzval + sp = slot.inplace_info::InplaceSparseInfo + colptr = slot.inplace_bufs[1]::Vector{Int64} + rowval = slot.inplace_bufs[2]::Vector{Int64} + nzval = slot.inplace_bufs[3] + mat = SparseMatrixCSC{eltype(sp.type), Int64}(sp.m, sp.n, colptr, rowval, nzval) + _recv_pool_push_and_reset!(slot, comm, mat) + continue + end + end +end + +function _recv_pool_slot_reset!(slot::RecvPoolSlot, comm::MPI.Comm) + slot.phase = :waiting_length + slot.source = -1 + slot.tag = -1 + slot.len_buf[1] = 0 + slot.req = MPI.Irecv!(MPI.Buffer(slot.len_buf), comm; source=_MPI_ANY_SOURCE, tag=_MPI_ANY_TAG) + slot.data_buf = UInt8[] + slot.inplace_info = nothing + slot.inplace_bufs = Any[] +end + +function _recv_pool_push_and_reset!(slot::RecvPoolSlot, comm::MPI.Comm, value::Any) + key = (comm, slot.source, slot.tag) + ref = poolset(value) + lock(_RECV_COMPLETION_QUEUE) do q + if !haskey(q, key) + q[key] = Any[] + end + push!(q[key], ref) + end + lock(RECV_WAITING) do waiting + if haskey(waiting, key) + notify(waiting[key]) + end + end + _recv_pool_slot_reset!(slot, comm) +end + function recv_yield_serialized(comm, my_rank, 
their_rank, tag) time_start = time_ns() detect = DEADLOCK_DETECT[] warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) + len_buf = Int64[0] + local req_len + try + req_len = MPI.Irecv!(MPI.Buffer(len_buf), comm; source=their_rank, tag=tag) + catch e + # #region agent log + try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H15_mpi_err\",\"location\":\"mpi.jl:recv_ser:Irecv_len\",\"message\":\"Irecv! len threw\",\"data\":{\"rank\":$my_rank,\"tag\":$tag,\"src\":$their_rank,\"error\":\"$(sprint(showerror, e))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end + # #endregion + rethrow() + end + while true + local finish, status + try + finish, status = MPI.Test(req_len, MPI.Status) + catch e + # #region agent log + try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H15_mpi_err\",\"location\":\"mpi.jl:recv_ser:Test_len\",\"message\":\"Test len threw\",\"data\":{\"rank\":$my_rank,\"tag\":$tag,\"src\":$their_rank,\"error\":\"$(sprint(showerror, e))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end + # #endregion + rethrow() + end + if finish + if MPI.Get_error(status) != MPI.SUCCESS + error("recv_yield_serialized len failed with error $(MPI.Get_error(status))") + end + break + end + service_aliasing_requests(comm) + warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, my_rank, tag, "recv", their_rank) + yield() + end + count = len_buf[1] + if count < 0 || count > MAX_SERIALIZED_RECV_LENGTH + error("recv_yield_serialized: invalid or corrupt length $count (max $(MAX_SERIALIZED_RECV_LENGTH)); src=$their_rank, tag=$tag") + end + buf = Array{UInt8}(undef, count) + local req_data + try + req_data = MPI.Irecv!(MPI.Buffer(buf), comm; source=their_rank, tag=tag) + catch e + # #region agent log + 
try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H15_mpi_err\",\"location\":\"mpi.jl:recv_ser:Irecv_data\",\"message\":\"Irecv! data threw\",\"data\":{\"rank\":$my_rank,\"tag\":$tag,\"src\":$their_rank,\"count\":$count,\"error\":\"$(sprint(showerror, e))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end + # #endregion + rethrow() + end while true - (got, msg, stat) = MPI.Improbe(their_rank, tag, comm, MPI.Status) - if got - if MPI.Get_error(stat) != MPI.SUCCESS - error("recv_yield failed with error $(MPI.Get_error(stat))") + local finish, status + try + finish, status = MPI.Test(req_data, MPI.Status) + catch e + # #region agent log + try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H15_mpi_err\",\"location\":\"mpi.jl:recv_ser:Test_data\",\"message\":\"Test data threw\",\"data\":{\"rank\":$my_rank,\"tag\":$tag,\"src\":$their_rank,\"error\":\"$(sprint(showerror, e))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end + # #endregion + rethrow() + end + if finish + if MPI.Get_error(status) != MPI.SUCCESS + error("recv_yield_serialized data failed with error $(MPI.Get_error(status))") end - count = MPI.Get_count(stat, UInt8) - buf = Array{UInt8}(undef, count) - req = MPI.Imrecv!(MPI.Buffer(buf), msg) - __wait_for_request(req, comm, my_rank, their_rank, tag, "recv_yield", "recv") return MPI.deserialize(buf) end + service_aliasing_requests(comm) warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, my_rank, tag, "recv", their_rank) yield() end end const SEEN_TAGS = Dict{Int32, Type}() +# Serialize nonblocking sends so only one Isend+wait is in flight at a time; avoids MPICH internal_Isend segfault with many concurrent requests. 
+const SEND_SERIALIZE_LOCK = ReentrantLock() send_yield!(value, comm, dest, tag; check_seen::Bool=true) = _send_yield(value, comm, dest, tag; check_seen, inplace=true) send_yield(value, comm, dest, tag; check_seen::Bool=true) = @@ -543,8 +955,9 @@ function _send_yield(value, comm, dest, tag; check_seen::Bool=true, inplace::Boo if check_seen SEEN_TAGS[tag] = typeof(value) end - #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Starting send to [$dest]: $(typeof(value)), is support inplace? $(supports_inplace_mpi(value))") + # Inplace sends use InplaceInfo+array so the recv pool (ANY_TAG) can receive them; never send raw array only. if inplace && supports_inplace_mpi(value) + send_yield_serialized(InplaceInfo(typeof(value), size(value)), comm, rank, dest, tag) send_yield_inplace(value, comm, rank, dest, tag) else send_yield_serialized(value, comm, rank, dest, tag) @@ -553,8 +966,20 @@ end function send_yield_inplace(value, comm, my_rank, their_rank, tag) @opcounter :send_yield_inplace - req = MPI.Isend(value, comm; dest=their_rank, tag) - __wait_for_request(req, comm, my_rank, their_rank, tag, "send_yield", "send") + lock(SEND_SERIALIZE_LOCK) do + GC.@preserve value begin + local req + try + req = MPI.Isend(value, comm; dest=their_rank, tag) + catch e + # #region agent log + try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H15_mpi_err\",\"location\":\"mpi.jl:send_inplace:Isend\",\"message\":\"Isend inplace threw\",\"data\":{\"rank\":$my_rank,\"tag\":$tag,\"dest\":$their_rank,\"error\":\"$(sprint(showerror, e))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end + # #endregion + rethrow() + end + __wait_for_request(req, comm, my_rank, their_rank, tag, "send_yield", "send") + end + end end function send_yield_serialized(value, comm, my_rank, their_rank, tag) @@ -568,8 +993,59 @@ function send_yield_serialized(value, comm, my_rank, their_rank, tag) 
send_yield_inplace(value.rowval, comm, my_rank, their_rank, tag) send_yield_inplace(value.nzval, comm, my_rank, their_rank, tag) else - req = MPI.isend(value, comm; dest=their_rank, tag) - __wait_for_request(req, comm, my_rank, their_rank, tag, "send_yield", "send") + buf = MPI.serialize(value) + n = length(buf) + lock(SEND_SERIALIZE_LOCK) do + # Non-GC buffers so MPICH gets a stable pointer. Still Isend + yielding wait (no blocking). + ptr_len = Base.Libc.malloc(8) + ptr_len === C_NULL && throw(OutOfMemoryError()) + try + arr_len = Base.unsafe_wrap(Array, Ptr{Int64}(ptr_len), (1,); own=false) + arr_len[1] = n + req_len = MPI.Isend(arr_len, comm; dest=their_rank, tag) + __wait_for_request(req_len, comm, my_rank, their_rank, tag, "send_yield", "send") + finally + Base.Libc.free(ptr_len) + end + ptr = Base.Libc.malloc(n) + ptr === C_NULL && throw(OutOfMemoryError()) + try + arr = Base.unsafe_wrap(Array, Ptr{UInt8}(ptr), (n,); own=false) + copyto!(arr, buf) + req_data = MPI.Isend(arr, comm; dest=their_rank, tag) + __wait_for_request(req_data, comm, my_rank, their_rank, tag, "send_yield", "send") + finally + Base.Libc.free(ptr) + end + end + end +end + +function _send_yield_raw(value, comm, dest, tag) + rank = MPI.Comm_rank(comm) + buf = MPI.serialize(value) + n = length(buf) + lock(SEND_SERIALIZE_LOCK) do + ptr_len = Base.Libc.malloc(8) + ptr_len === C_NULL && throw(OutOfMemoryError()) + try + arr_len = Base.unsafe_wrap(Array, Ptr{Int64}(ptr_len), (1,); own=false) + arr_len[1] = n + req_len = MPI.Isend(arr_len, comm; dest, tag) + __wait_for_request(req_len, comm, rank, dest, tag, "send_yield_raw_len", "send") + finally + Base.Libc.free(ptr_len) + end + ptr = Base.Libc.malloc(n) + ptr === C_NULL && throw(OutOfMemoryError()) + try + arr = Base.unsafe_wrap(Array, Ptr{UInt8}(ptr), (n,); own=false) + copyto!(arr, buf) + req_data = MPI.Isend(arr, comm; dest, tag) + __wait_for_request(req_data, comm, rank, dest, tag, "send_yield_raw_data", "send") + finally + 
Base.Libc.free(ptr) + end end end @@ -579,13 +1055,30 @@ function __wait_for_request(req, comm, my_rank, their_rank, tag, fn::String, kin warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) while true - finish, status = MPI.Test(req, MPI.Status) + local finish, status + try + finish, status = MPI.Test(req, MPI.Status) + catch e + # #region agent log + try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H15_mpi_err\",\"location\":\"mpi.jl:__wait_for_request:Test\",\"message\":\"MPI.Test threw\",\"data\":{\"rank\":$my_rank,\"tag\":$tag,\"fn\":\"$fn\",\"kind\":\"$kind\",\"dest\":$their_rank,\"error\":\"$(sprint(showerror, e))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end + # #endregion + rethrow() + end if finish if MPI.Get_error(status) != MPI.SUCCESS error("$fn failed with error $(MPI.Get_error(status))") end return end + if REMOTECALL_SENDER_NO_YIELD[] + # Sender in remotecall_endpoint: spin until send completes so we don't yield to other tasks. 
+ if detect && (time_ns() - time_start) > timeout_period + error("[rank $my_rank][tag $tag] Hit hang on $kind (dest: $their_rank) [remotecall sender spin]") + end + continue + end + service_recv_pool(comm) + service_aliasing_requests(comm) warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, my_rank, tag, kind, their_rank) yield() end @@ -623,10 +1116,13 @@ end function mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, rank, tag, kind, srcdest) time_elapsed = (time_ns() - time_start) if detect && time_elapsed > warn_period - @warn "[rank $rank][tag $tag] Hit probable hang on $kind (dest: $srcdest)" + @warn "[rank $rank][tag $tag] Hit probable hang on $kind (dest: $srcdest) [$(round(time_elapsed/1e9, digits=1))s]" return typemax(UInt64) end if detect && time_elapsed > timeout_period + # #region agent log + try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H11_timeout\",\"location\":\"mpi.jl:deadlock_detect:TIMEOUT\",\"message\":\"deadlock TIMEOUT - will throw\",\"data\":{\"rank\":$rank,\"tag\":$tag,\"kind\":\"$kind\",\"srcdest\":$srcdest,\"elapsed_s\":$(time_elapsed/1e9)},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end + # #endregion error("[rank $rank][tag $tag] Hit hang on $kind (dest: $srcdest)") end return warn_period @@ -636,6 +1132,11 @@ end WeakChunk(c::Chunk{T,H}) where {T,H<:MPIRef} = WeakChunk(c.handle.rank, c.handle.id.id, WeakRef(c)) function MemPool.poolget(ref::MPIRef; uniform::Bool=false) + if !uniform && ref.rank != MPI.Comm_rank(ref.comm) + # #region agent log + _r = MPI.Comm_rank(ref.comm); try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H12_poolget\",\"location\":\"mpi.jl:poolget\",\"message\":\"MPIRef rank mismatch about to 
assert\",\"data\":{\"local_rank\":$_r,\"ref_rank\":$(ref.rank),\"ref_id\":\"$(ref.id)\",\"uniform\":$uniform,\"backtrace\":\"$(replace(sprint(Base.show_backtrace, backtrace()), '\"'=>'\'', '\n'=>' '))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end + # #endregion + end @assert uniform || ref.rank == MPI.Comm_rank(ref.comm) "MPIRef rank mismatch: $(ref.rank) != $(MPI.Comm_rank(ref.comm))" if uniform tag = to_tag() @@ -758,7 +1259,10 @@ function remotecall_endpoint(f, accel::Dagger.MPIAcceleration, from_proc, to_pro return with(MPI_UID=>task.uid, MPI_UNIFORM=>true) do @assert data isa Chunk "Expected Chunk, got $(typeof(data))" space = memory_space(data) - tag = to_tag() + tag = remotecall_tag(accel.comm, task.uid, from_proc.rank, to_proc.rank, data.handle.id) + # #region agent log + if loc_rank <= 1 && tag >= 598 && tag <= 612; try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H16_tag_op\",\"location\":\"mpi.jl:remotecall_endpoint\",\"message\":\"remotecall tag assigned\",\"data\":{\"rank\":$loc_rank,\"tag\":$tag,\"from_rank\":$(from_proc.rank),\"to_rank\":$(to_proc.rank),\"space_rank\":$(space.rank),\"task_uid\":$(task.uid),\"task_id\":$(task.id)},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end; end + # #endregion if space.rank != from_proc.rank # If the data is already where it needs to be @assert space.rank == to_proc.rank @@ -784,7 +1288,12 @@ function remotecall_endpoint(f, accel::Dagger.MPIAcceleration, from_proc, to_pro if loc_rank == from_proc.rank value = poolget(data.handle) data_moved = move(from_proc.innerProc, to_proc.innerProc, value) - Dagger.send_yield(data_moved, accel.comm, to_proc.rank, tag) + try + REMOTECALL_SENDER_NO_YIELD[] = true + Dagger.send_yield(data_moved, accel.comm, to_proc.rank, tag) + finally + REMOTECALL_SENDER_NO_YIELD[] = false + end # FIXME: This is wrong to take typeof(data_moved), because the type may change return 
tochunk(nothing, to_proc, to_space; type=typeof(data_moved)) elseif loc_rank == to_proc.rank @@ -831,33 +1340,39 @@ function move(src::MPIProcessor, dst::MPIProcessor, x::Chunk) end end -#FIXME:try to think of a better move! scheme -function execute!(proc::MPIProcessor, world::UInt64, f, args...; kwargs...) +_precise_typeof(x) = typeof(x) +_precise_typeof(::Type{T}) where {T} = Type{T} + +function execute!(proc::MPIProcessor, f, args...; kwargs...) local_rank = MPI.Comm_rank(proc.comm) islocal = local_rank == proc.rank inplace_move = f === move! result = nothing - tag_space = to_tag() + if islocal || inplace_move - result = execute!(proc.innerProc, world, f, args...; kwargs...) + result = execute!(proc.innerProc, f, args...; kwargs...) end + if inplace_move space = memory_space(nothing, proc)::MPIMemorySpace return tochunk(nothing, proc, space) + end + + tag = to_tag() + # #region agent log + if local_rank <= 1 && tag >= 598 && tag <= 612; try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H16_tag_op\",\"location\":\"mpi.jl:execute!\",\"message\":\"execute! 
tag assigned\",\"data\":{\"rank\":$local_rank,\"tag\":$tag,\"proc_rank\":$(proc.rank),\"islocal\":$islocal,\"f\":\"$(nameof(f))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end; end + # #endregion + if islocal + T = typeof(result) + space = memory_space(result, proc)::MPIMemorySpace + T_space = (T, space.innerSpace) + @opcounter :execute_bcast_send_yield + bcast_send_yield(T_space, proc.comm, proc.rank, tag) + return tochunk(result, proc, space) else - # Handle commun1ication ourselves - if islocal - T = typeof(result) - space = memory_space(result, proc)::MPIMemorySpace - T_space = (T, space.innerSpace) - @opcounter :execute_bcast_send_yield - bcast_send_yield(T_space, proc.comm, proc.rank, tag) - return tochunk(result, proc, space) - else - T, innerSpace = recv_yield(proc.comm, proc.rank, tag) - space = MPIMemorySpace(innerSpace, proc.comm, proc.rank) - return tochunk(nothing, proc, space; type=T) - end + T, innerSpace = recv_yield(proc.comm, proc.rank, tag) + space = MPIMemorySpace(innerSpace, proc.comm, proc.rank) + return tochunk(nothing, proc, space; type=T) end end @@ -867,6 +1382,9 @@ function initialize_acceleration!(a::MPIAcceleration) if !MPI.Initialized() MPI.Init(;threadlevel=:multiple) end + # #region agent log + _r = MPI.Comm_rank(a.comm); _tl = MPI.Query_thread(); if _r <= 1; try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H2_init\",\"location\":\"mpi.jl:initialize_acceleration!\",\"message\":\"MPI init\",\"data\":{\"rank\":$_r,\"nthreads\":$(Threads.nthreads()),\"mpi_thread_level\":$_tl,\"tag_ub\":$(MPI.tag_ub())},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end; end + # #endregion ctx = Dagger.Sch.eager_context() sz = MPI.Comm_size(a.comm) for i in 0:(sz-1) diff --git a/src/utils/interval_tree.jl b/src/utils/interval_tree.jl index 1c2b3a7f6..8046fbb3b 100644 --- a/src/utils/interval_tree.jl +++ b/src/utils/interval_tree.jl @@ -195,44 
+195,11 @@ function Base.delete!(tree::IntervalTree{M,E}, span::M) where {M,E} parent_of_succ.right = replacement end - target.span = successor.span - replacement = target - end - - # Phase 3: Handle overlap case - add remaining portions - if target_type == :overlap - original_start = span_start(original_span) - original_end = span_end(original_span) - del_start = span_start(span) - del_end = span_end(span) - verify_span(span) - - # Left portion: exists if original starts before deleted span - if original_start < del_start - left_end = min(original_end, del_start - _span_one(del_start)) - if left_end >= original_start - left_span = M(original_start, left_end - original_start + _span_one(left_end)) - if !isempty(left_span) - replacement = insert_node!(replacement, left_span) - end - end + # Update max_end bottom-up for the successor's original path + update_max_end!(parent_of_succ) + for i in length(succ_path)-1:-1:1 + update_max_end!(succ_path[i]) end - - # Right portion: exists if original extends beyond deleted span - if original_end > del_end - right_start = max(original_start, del_end + _span_one(del_end)) - if original_end >= right_start - right_span = M(right_start, original_end - right_start + _span_one(original_end)) - if !isempty(right_span) - replacement = insert_node!(replacement, right_span) - end - end - end - end - - # Phase 4: Update parent's child pointer - if isempty(path) - root = replacement else # Zero or one child replacement = target.left !== nothing ? 
target.left : target.right @@ -294,12 +261,12 @@ function find_overlapping!(node::IntervalNode{M,E}, query::M, result::Vector{M}; # Enqueue left subtree if it might contain overlapping intervals if current.left !== nothing && current.left.max_end >= span_start(query) - push!(queue, current.left) + push!(stack, current.left) end # Enqueue right subtree if query extends beyond current node's start if current.right !== nothing && span_end(query) >= span_start(current.span) - push!(queue, current.right) + push!(stack, current.right) end end end From cc700c6eafadd5e12f53e076ff2f768ba37bbfab Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Wed, 4 Mar 2026 21:14:53 +0000 Subject: [PATCH 19/24] Revert to fc1ae09c, keep Sch.jl changes --- Project.toml | 2 - src/array/alloc.jl | 15 +- src/array/copy.jl | 4 - src/array/darray.jl | 41 +-- src/datadeps/aliasing.jl | 259 +++----------- src/datadeps/chunkview.jl | 23 +- src/datadeps/queue.jl | 220 +++++++----- src/datadeps/scheduling.jl | 9 +- src/memory-spaces.jl | 65 +++- src/mpi.jl | 714 +++++-------------------------------- src/mpi_mempool.jl | 36 ++ src/sch/Sch.jl | 10 +- src/utils/interval_tree.jl | 45 ++- 13 files changed, 462 insertions(+), 981 deletions(-) create mode 100644 src/mpi_mempool.jl diff --git a/Project.toml b/Project.toml index 687094335..b6d03531d 100644 --- a/Project.toml +++ b/Project.toml @@ -13,7 +13,6 @@ Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" -MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" NextLA = "d37ed344-79c4-486d-9307-6d11355a15a3" @@ -79,7 +78,6 @@ Graphs = "1" JSON3 = "1" KernelAbstractions = "0.9" MPI = "0.20.22" -MPIPreferences = "0.1.11" MacroTools = "0.5" MemPool = "0.4.12" Metal = "1.1" diff --git a/src/array/alloc.jl 
b/src/array/alloc.jl index 862f48421..e67ca593c 100644 --- a/src/array/alloc.jl +++ b/src/array/alloc.jl @@ -70,11 +70,11 @@ function partition(p::AbstractBlocks, dom::ArrayDomain) map(_cumlength, map(length, indexes(dom)), p.blocksize)) end -function allocate_array(f, ::Type{T}, idx, sz::NTuple{N,Int})::Array{T,N} where {T,N} +function allocate_array(f, T, idx, sz) new_f = allocate_array_func(task_processor(), f) return new_f(idx, T, sz) end -function allocate_array(f, ::Type{T}, sz::NTuple{N,Int})::Array{T,N} where {T,N} +function allocate_array(f, T, sz) new_f = allocate_array_func(task_processor(), f) return new_f(T, sz) end @@ -189,15 +189,8 @@ function Base.view(A::AbstractArray{T,N}, p::Blocks{N}; space=default_memory_spa d = ArrayDomain(Base.index_shape(A)) dc = partition(p, d) # N.B. We use `tochunk` because we only want to take the view locally, and - # taking views should be very fast. - # Per-chunk space for DArray: use each chunk's owner so tochunk on owner uses - # local_rank == space.rank and registers refs correctly (fixes MPI aliasing). - if A isa DArray && size(A.chunks) == size(dc) - chunks = [@with(MPI_UID => eager_next_id(), tochunk(view(A, x.indexes...), - (c = A.chunks[I]; c isa Chunk ? 
memory_space(c) : space))) for (I, x) in pairs(IndexCartesian(), dc)] - else - chunks = [@with(MPI_UID => eager_next_id(), tochunk(view(A, x.indexes...), space)) for x in dc] - end + # taking views should be very fast + chunks = [@with(MPI_UID => eager_next_id(), tochunk(view(A, x.indexes...), space)) for x in dc] return DArray(T, d, dc, chunks, p) end Base.view(A::AbstractArray, ::AutoBlocks) = diff --git a/src/array/copy.jl b/src/array/copy.jl index 647305fee..d032525f9 100644 --- a/src/array/copy.jl +++ b/src/array/copy.jl @@ -84,10 +84,6 @@ function darray_copyto!(B::DArray{TB,NB}, A::DArray{TA,NA}, Binds=parentindices( Dagger.spawn_datadeps() do for Bidx in Bci - accel = current_acceleration() - if accel isa MPIAcceleration - service_aliasing_requests(accel.comm) - end Bpart = B.chunks[Bidx] Bsd_global_raw = padNmax(Bsd_all[Bidx]) Bsd_global_shifted = shift_ranges(Bsd_global_raw, Binds_offset) diff --git a/src/array/darray.jl b/src/array/darray.jl index 406494d40..e04bcf065 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -582,29 +582,26 @@ DVector(A::AbstractVector{T}, ::AutoBlocks, assignment::AssignmentType{1} = :arb DMatrix(A::AbstractMatrix{T}, ::AutoBlocks, assignment::AssignmentType{2} = :arbitrary) where T = DMatrix(A, auto_blocks(A), assignment) DArray(A::AbstractArray, ::AutoBlocks, assignment::AssignmentType = :arbitrary) = DArray(A, auto_blocks(A), assignment) -struct AllocateUndef{S} end -(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = Array{S,N}(undef, dims) - -function DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} - domain = ArrayDomain(map(x->1:x, dims)) +@warn "Add assignment to undef initializer" maxlog=1 +function DArray{T,N}(::UndefInitializer, dims::NTuple{N,Int}) where {T,N} + dist = auto_blocks(dims) + return DArray{T,N}(undef, dist, dims...) 
+end +function DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}) where {T,N} + domain = ArrayDomain(ntuple(i->1:dims[i], N)) subdomains = partition(dist, domain) - a = AllocateArray(T, AllocateUndef{T}(), false, domain, subdomains, dist, assignment) - return _to_darray(a) -end -DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = - DArray{T,N}(undef, dist, (dims...,); assignment) -DArray{T,N}(::UndefInitializer, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = - DArray{T,N}(undef, auto_blocks(dims), dims; assignment) -DArray{T,N}(::UndefInitializer, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = - DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,); assignment) -DArray{T}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = - DArray{T,N}(undef, dist, dims; assignment) -DArray{T}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = - DArray{T,N}(undef, dist, (dims...,); assignment) -DArray{T}(::UndefInitializer, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = - DArray{T,N}(undef, auto_blocks(dims), dims; assignment) -DArray{T}(::UndefInitializer, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = - DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,); assignment) + tasks = Array{DTask,N}(undef, size(subdomains)...) 
+ Dagger.spawn_datadeps() do + for (i, x) in enumerate(subdomains) + tasks[i] = Dagger.@spawn allocate_array_undef(T, size(x)) + end + end + return DArray(T, domain, subdomains, tasks, dist) +end +DArray{T,N}(::UndefInitializer, dims::Vararg{Int,N}) where {T,N} = + DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,)) +DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}) where {T,N} = + DArray{T,N}(undef, dist, (dims...,)) function Base.:(==)(x::ArrayOp{T,N}, y::AbstractArray{S,N}) where {T,S,N} collect(x) == y diff --git a/src/datadeps/aliasing.jl b/src/datadeps/aliasing.jl index 39dbaf541..aec83d039 100644 --- a/src/datadeps/aliasing.jl +++ b/src/datadeps/aliasing.jl @@ -251,120 +251,6 @@ struct HistoryEntry write_num::Int end -struct AliasedObjectCacheStore - keys::Vector{AbstractAliasing} - derived::Dict{AbstractAliasing,AbstractAliasing} - stored::Dict{MemorySpace,Set{AbstractAliasing}} - values::Dict{MemorySpace,Dict{AbstractAliasing,Chunk}} -end -AliasedObjectCacheStore() = - AliasedObjectCacheStore(Vector{AbstractAliasing}(), - Dict{AbstractAliasing,AbstractAliasing}(), - Dict{MemorySpace,Set{AbstractAliasing}}(), - Dict{MemorySpace,Dict{AbstractAliasing,Chunk}}()) - -function is_stored(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing) - if !haskey(cache.stored, space) - return false - end - if !haskey(cache.derived, ainfo) - return false - end - key = cache.derived[ainfo] - return key in cache.stored[space] -end -function is_key_present(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing) - return haskey(cache.derived, ainfo) -end -function get_stored(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing) - @assert is_stored(cache, space, ainfo) "Cache does not have derived ainfo $ainfo" - key = cache.derived[ainfo] - return cache.values[space][key] -end -function set_stored!(cache::AliasedObjectCacheStore, dest_space::MemorySpace, value::Chunk, 
ainfo::AbstractAliasing) - @assert !is_stored(cache, dest_space, ainfo) "Cache already has derived ainfo $ainfo" - key = cache.derived[ainfo] - value_ainfo = aliasing(value, identity) - cache.derived[value_ainfo] = key - push!(get!(Set{AbstractAliasing}, cache.stored, dest_space), key) - values_dict = get!(Dict{AbstractAliasing,Chunk}, cache.values, dest_space) - values_dict[key] = value - return -end -function set_key_stored!(cache::AliasedObjectCacheStore, space::MemorySpace, ainfo::AbstractAliasing, value::Chunk) - push!(cache.keys, ainfo) - cache.derived[ainfo] = ainfo - push!(get!(Set{AbstractAliasing}, cache.stored, space), ainfo) - values_dict = get!(Dict{AbstractAliasing,Chunk}, cache.values, space) - values_dict[ainfo] = value - return -end - -struct AliasedObjectCache - space::MemorySpace - chunk::Chunk -end -function is_stored(cache::AliasedObjectCache, ainfo::AbstractAliasing) - wid = root_worker_id(cache.chunk) - if wid != myid() - return remotecall_fetch(is_stored, wid, cache, ainfo) - end - cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore - return is_stored(cache_raw, cache.space, ainfo) -end -function is_key_present(cache::AliasedObjectCache, space::MemorySpace, ainfo::AbstractAliasing) - wid = root_worker_id(cache.chunk) - if wid != myid() - return remotecall_fetch(is_key_present, wid, cache, space, ainfo) - end - cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore - return is_key_present(cache_raw, space, ainfo) -end -function get_stored(cache::AliasedObjectCache, ainfo::AbstractAliasing) - wid = root_worker_id(cache.chunk) - if wid != myid() - return remotecall_fetch(get_stored, wid, cache, ainfo) - end - cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore - return get_stored(cache_raw, cache.space, ainfo) -end -function set_stored!(cache::AliasedObjectCache, value::Chunk, ainfo::AbstractAliasing) - wid = root_worker_id(cache.chunk) - if wid != myid() - return remotecall_fetch(set_stored!, wid, cache, value, ainfo) - end - 
cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore - set_stored!(cache_raw, cache.space, value, ainfo) - return -end -function set_key_stored!(cache::AliasedObjectCache, space::MemorySpace, ainfo::AbstractAliasing, value::Chunk) - wid = root_worker_id(cache.chunk) - if wid != myid() - return remotecall_fetch(set_key_stored!, wid, cache, space, ainfo, value) - end - cache_raw = unwrap(cache.chunk)::AliasedObjectCacheStore - set_key_stored!(cache_raw, space, ainfo, value) -end -function aliased_object!(f, cache::AliasedObjectCache, x; ainfo=aliasing(current_acceleration(), x, identity)) - x_space = memory_space(x) - if !is_key_present(cache, x_space, ainfo) - x_chunk = x isa Chunk ? x : tochunk(x, first(processors(x_space))) - set_key_stored!(cache, x_space, ainfo, x_chunk) - end - if is_stored(cache, ainfo) - return get_stored(cache, ainfo) - else - y = f(x) - @assert y isa Chunk "Didn't get a Chunk from functor" - @assert memory_space(y) == cache.space "Space mismatch! $(memory_space(y)) != $(cache.space)" - if memory_space(x) != cache.space - @assert ainfo != aliasing(current_acceleration(), y, identity) "Aliasing mismatch! 
$ainfo == $(aliasing(current_acceleration(), y, identity))" - end - set_stored!(cache, y, ainfo) - return y - end -end - @warn "Switch ArgumentWrapper to contain just the argument, and add DependencyWrapper" maxlog=1 struct DataDepsState # The mapping of original raw argument to its Chunk @@ -402,7 +288,7 @@ struct DataDepsState # The mapping of, for a given memory space, the backing Chunks that an ainfo references # Used by slot generation to replace the backing Chunks during move - ainfo_backing_chunk::Chunk{AliasedObjectCacheStore} + ainfo_backing_chunk::Dict{MemorySpace,Dict{AbstractAliasing,Chunk}} # Cache of argument's supports_inplace_move query result supports_inplace_cache::IdDict{Any,Bool} @@ -434,7 +320,7 @@ struct DataDepsState ainfo_arg = Dict{AliasingWrapper,ArgumentWrapper}() arg_owner = Dict{ArgumentWrapper,MemorySpace}() arg_overlaps = Dict{ArgumentWrapper,Set{ArgumentWrapper}}() - ainfo_backing_chunk = tochunk(AliasedObjectCacheStore()) + ainfo_backing_chunk = Dict{MemorySpace,Dict{AbstractAliasing,Chunk}}() arg_history = Dict{ArgumentWrapper,Vector{HistoryEntry}}() supports_inplace_cache = IdDict{Any,Bool}() @@ -445,17 +331,13 @@ struct DataDepsState ainfos_owner = Dict{AliasingWrapper,Union{Pair{DTask,Int},Nothing}}() ainfos_readers = Dict{AliasingWrapper,Vector{Pair{DTask,Int}}}() - return new(arg_to_chunk, arg_origin, remote_args, remote_arg_to_original, ainfo_arg, arg_history, arg_owner, arg_overlaps, ainfo_backing_chunk, + return new(arg_to_chunk, arg_origin, remote_args, remote_arg_to_original, ainfo_arg, arg_owner, arg_overlaps, ainfo_backing_chunk, arg_history, supports_inplace_cache, ainfo_cache, ainfos_overlaps, ainfos_owner, ainfos_readers) end end # N.B. 
arg_w must be the original argument wrapper, not a remote copy function aliasing!(state::DataDepsState, target_space::MemorySpace, arg_w::ArgumentWrapper) - accel = current_acceleration() - if accel isa MPIAcceleration - service_aliasing_requests(accel.comm) - end # Grab the remote copy of the argument, and calculate the ainfo remote_arg = get_or_generate_slot!(state, target_space, arg_w.arg) remote_arg_w = ArgumentWrapper(remote_arg, arg_w.dep_mod) @@ -466,7 +348,7 @@ function aliasing!(state::DataDepsState, target_space::MemorySpace, arg_w::Argum end # Calculate the ainfo - ainfo = AliasingWrapper(aliasing(accel, remote_arg, arg_w.dep_mod)) + ainfo = AliasingWrapper(aliasing(current_acceleration(), remote_arg, arg_w.dep_mod)) # Cache the result state.ainfo_cache[remote_arg_w] = ainfo @@ -493,9 +375,8 @@ function is_writedep(arg, deps, task::DTask) end # Aliasing state setup -# Returns Vector/Tuple of DataDepsTaskArgument for consumption by distribute_task! -function populate_task_info!(state::DataDepsState, task_args, spec::DTaskSpec, task::DTask) - result = DataDepsTaskArgument[] +function populate_task_info!(state::DataDepsState, spec::DTaskSpec, task::DTask) + # Track the task's arguments and access patterns for (idx, _arg) in enumerate(spec.fargs) arg = value(_arg) @@ -505,49 +386,42 @@ function populate_task_info!(state::DataDepsState, task_args, spec::DTaskSpec, t # Unwrap the Chunk underlying any DTask arguments arg = arg isa DTask ? 
fetch(arg; move_value=false, unwrap=false) : arg - may_alias = type_may_alias(typeof(arg)) - inplace_move = supports_inplace_move(state, arg) + # Skip non-aliasing arguments + type_may_alias(typeof(arg)) || continue - if may_alias && inplace_move - # Generate a Chunk for the argument if necessary - if haskey(state.raw_arg_to_chunk, arg) - arg = state.raw_arg_to_chunk[arg] - else - if !(arg isa Chunk) - new_arg = with(MPI_UID=>task.uid) do - tochunk(arg) - end - state.raw_arg_to_chunk[arg] = new_arg - arg = new_arg - else - state.raw_arg_to_chunk[arg] = arg + # Skip arguments not supporting in-place move + supports_inplace_move(state, arg) || continue + + # Generate a Chunk for the argument if necessary + if haskey(state.raw_arg_to_chunk, arg) + arg = state.raw_arg_to_chunk[arg] + else + if !(arg isa Chunk) + new_arg = with(MPI_UID=>task.uid) do + tochunk(arg) end + state.raw_arg_to_chunk[arg] = new_arg + arg = new_arg + else + state.raw_arg_to_chunk[arg] = arg end + end - # Track the origin space of the argument - origin_space = memory_space(arg) - check_uniform(origin_space) - state.arg_origin[arg] = origin_space - state.remote_arg_to_original[arg] = arg - - # Populate argument info for all aliasing dependencies - dep_infos = DataDepsTaskDependency[ - DataDepsTaskDependency(ArgumentWrapper(arg, dep_mod), readdep, writedep) - for (dep_mod, readdep, writedep) in deps - ] - - # Populate argument info for all aliasing dependencies - for (dep_mod, _, _) in deps - aw = ArgumentWrapper(arg, dep_mod) - populate_argument_info!(state, aw, origin_space) - end + # Track the origin space of the argument + origin_space = memory_space(arg) + check_uniform(origin_space) + state.arg_origin[arg] = origin_space + state.remote_arg_to_original[arg] = arg - push!(result, DataDepsTaskArgument(arg, ArgPosition(_arg.pos), true, true, dep_infos)) - else - push!(result, DataDepsTaskArgument(arg, ArgPosition(_arg.pos), may_alias, inplace_move, DataDepsTaskDependency[])) + # Populate argument 
info for all aliasing dependencies + for (dep_mod, _, _) in deps + # Generate an ArgumentWrapper for the argument + aw = ArgumentWrapper(arg, dep_mod) + + # Populate argument info + populate_argument_info!(state, aw, origin_space) end end - return spec.fargs isa Tuple ? (result...,) : result end function populate_argument_info!(state::DataDepsState, arg_w::ArgumentWrapper, origin_space::MemorySpace) # Initialize ownership and history @@ -746,6 +620,7 @@ function generate_slot!(state::DataDepsState, dest_space, data) to_proc = first(processors(dest_space)) from_proc = first(processors(orig_space)) dest_space_args = get!(IdDict{Any,Any}, state.remote_args, dest_space) + ALIASED_OBJECT_CACHE[] = get!(Dict{AbstractAliasing,Chunk}, state.ainfo_backing_chunk, dest_space) if orig_space == dest_space && (data isa Chunk || !isremotehandle(data)) # Fast path for local data that's already in a Chunk or not a remote handle needing rewrapping task = DATADEPS_CURRENT_TASK[] @@ -753,17 +628,18 @@ function generate_slot!(state::DataDepsState, dest_space, data) tochunk(data, from_proc) end else - aliased_object_cache = AliasedObjectCache(dest_space, state.ainfo_backing_chunk) ctx = Sch.eager_context() id = rand(Int) @maybelog ctx timespan_start(ctx, :move, (;thunk_id=0, id, position=ArgPosition(), processor=to_proc), (;f=nothing, data)) - data_chunk = move_rewrap(aliased_object_cache, from_proc, to_proc, orig_space, dest_space, data) + data_chunk = move_rewrap(from_proc, to_proc, orig_space, dest_space, data) @maybelog ctx timespan_finish(ctx, :move, (;thunk_id=0, id, position=ArgPosition(), processor=to_proc), (;f=nothing, data=data_chunk)) end @assert memory_space(data_chunk) == dest_space "space mismatch! $dest_space (dest) != $(memory_space(data_chunk)) (actual) ($(typeof(data)) (data) vs. 
$(typeof(data_chunk)) (chunk)), spaces ($orig_space -> $dest_space)" dest_space_args[data] = data_chunk state.remote_arg_to_original[data_chunk] = data + ALIASED_OBJECT_CACHE[] = nothing + check_uniform(memory_space(dest_space_args[data])) check_uniform(processor(dest_space_args[data])) check_uniform(dest_space_args[data].handle) @@ -780,64 +656,11 @@ function get_or_generate_slot!(state, dest_space, data) end return state.remote_args[dest_space][data] end -function rewrap_aliased_object!(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, x) - return aliased_object!(cache, x) do x - return remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, x) - end -end -function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, data::Chunk) - # MPI: Chunk with MPIRef - data may live on another rank; root_worker_id(MPIRef)=myid() is wrong - if data.handle isa MPIRef - if data.handle.rank != MPI.Comm_rank(data.handle.comm) - # Data is on a different MPI rank; use MPI transfer via remotecall_endpoint - return aliased_object!(cache, data) do data - return remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, data) - end - end - else - wid = root_worker_id(data) - if wid != myid() - return remotecall_fetch(move_rewrap, wid, cache, from_proc, to_proc, from_space, to_space, data) - end - end - data_raw = unwrap(data) - return move_rewrap(cache, from_proc, to_proc, from_space, to_space, data_raw) -end -function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, data) - return aliased_object!(cache, data) do data +function move_rewrap(from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, data) + return aliased_object!(data) do data return 
remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, data) end end -function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, v::SubArray) - to_w = root_worker_id(to_proc) - p_chunk = rewrap_aliased_object!(cache, from_proc, to_proc, from_space, to_space, parent(v)) - inds = parentindices(v) - return remotecall_fetch(to_w, from_proc, to_proc, from_space, to_space, p_chunk, inds) do from_proc, to_proc, from_space, to_space, p_chunk, inds - p_new = move(from_proc, to_proc, p_chunk) - v_new = view(p_new, inds...) - return tochunk(v_new, to_proc, to_space) - end -end -for wrapper in (UpperTriangular, LowerTriangular, UnitUpperTriangular, UnitLowerTriangular) - @eval function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, v::$(wrapper)) - to_w = root_worker_id(to_proc) - p_chunk = rewrap_aliased_object!(cache, from_proc, to_proc, from_space, to_space, parent(v)) - return remotecall_fetch(to_w, from_proc, to_proc, from_space, to_space, p_chunk) do from_proc, to_proc, from_space, to_space, p_chunk - p_new = move(from_proc, to_proc, p_chunk) - v_new = $(wrapper)(p_new) - return tochunk(v_new, to_proc, to_space) - end - end -end -function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, v::Base.RefValue) - return aliased_object!(cache, v) do v - return remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, v) - end -end -move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, x::String) = x -move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, x::Symbol) = x -move_rewrap(cache::AliasedObjectCache, 
from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, x::Type) = x - function remotecall_endpoint(f, ::Dagger.DistributedAcceleration, from_proc, to_proc, orig_space, dest_space, data) to_w = root_worker_id(to_proc) return remotecall_fetch(to_w, from_proc, to_proc, dest_space, data) do from_proc, to_proc, dest_space, data diff --git a/src/datadeps/chunkview.jl b/src/datadeps/chunkview.jl index 2040b8d52..6e2a21dfd 100644 --- a/src/datadeps/chunkview.jl +++ b/src/datadeps/chunkview.jl @@ -32,14 +32,23 @@ aliasing(x::ChunkView) = memory_space(x::ChunkView) = memory_space(x.chunk) isremotehandle(x::ChunkView) = true -function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, slice::ChunkView) - to_w = root_worker_id(to_proc) - p_chunk = move_rewrap(cache, from_proc, to_proc, from_space, to_space, slice.chunk) +# This definition is here because it's so similar to ChunkView +function move_rewrap(from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, v::SubArray) + p_chunk = aliased_object!(parent(v)) do p_chunk + return remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, p_chunk) + end + inds = parentindices(v) + return remotecall_endpoint(current_acceleration(), from_proc, to_proc, from_space, to_space, p_chunk) do p_new + return view(p_new, inds...) 
+ end +end +function move_rewrap(from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, slice::ChunkView) + p_chunk = aliased_object!(slice.chunk) do p_chunk + return remotecall_endpoint(identity, current_acceleration(), from_proc, to_proc, from_space, to_space, p_chunk) + end inds = slice.slices - return remotecall_fetch(to_w, from_proc, to_proc, from_space, to_space, p_chunk, inds) do from_proc, to_proc, from_space, to_space, p_chunk, inds - p_new = move(from_proc, to_proc, p_chunk) - v_new = view(p_new, inds...) - return tochunk(v_new, to_proc, to_space) + return remotecall_endpoint(current_acceleration(), from_proc, to_proc, from_space, to_space, p_chunk) do p_new + return view(p_new, inds...) end end diff --git a/src/datadeps/queue.jl b/src/datadeps/queue.jl index 5de3fdcb2..ebf9f8fa6 100644 --- a/src/datadeps/queue.jl +++ b/src/datadeps/queue.jl @@ -10,9 +10,6 @@ function to_tag() @assert Sch.SCHED_MOVE[] == false "We should not create a tag on the scheduler unwrap move" tag = counter_ref[] counter_ref[] = tag + 1 > MPI.tag_ub() ? 
1 : tag + 1 - # #region agent log - if tag >= 598 && tag <= 612; _r = MPI.Comm_rank(MPI.COMM_WORLD); if _r <= 1; try; _bt = String[]; for s in stacktrace(backtrace()); push!(_bt, "$(s.func)@$(basename(string(s.file))):$(s.line)"); length(_bt) >= 6 && break; end; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H16_totag\",\"location\":\"queue.jl:to_tag\",\"message\":\"to_tag critical range\",\"data\":{\"rank\":$_r,\"tag\":$tag,\"stack\":$(repr(join(_bt, " > ")))},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end; end; end - # #endregion return tag end end @@ -29,9 +26,7 @@ struct DataDepsTaskQueue <: AbstractTaskQueue # How to traverse the dependency graph when launching tasks traversal::Symbol # Which scheduler to use to assign tasks to processors - # DataDepsScheduler objects use datadeps_schedule_task (master API); - # :smart/:ultra Symbols use legacy inline logic - scheduler::Union{DataDepsScheduler,Symbol} + scheduler::Symbol # Whether aliasing across arguments is possible # The fields following only apply when aliasing==true @@ -39,18 +34,12 @@ struct DataDepsTaskQueue <: AbstractTaskQueue function DataDepsTaskQueue(upper_queue; traversal::Symbol=:inorder, - scheduler::Union{DataDepsScheduler,Symbol}=RoundRobinScheduler(), + scheduler::Symbol=:naive, aliasing::Bool=true) - # Convert Symbol to scheduler object for master API compatibility - sched = scheduler isa Symbol ? (scheduler == :roundrobin ? RoundRobinScheduler() : - scheduler == :naive ? NaiveScheduler() : - scheduler == :smart ? NaiveScheduler() : # closest equivalent - scheduler == :ultra ? 
UltraScheduler() : - scheduler) : scheduler seen_tasks = DTaskPair[] g = SimpleDiGraph() task_to_id = Dict{DTask,Int}() - return new(upper_queue, seen_tasks, g, task_to_id, traversal, sched, + return new(upper_queue, seen_tasks, g, task_to_id, traversal, scheduler, aliasing) end end @@ -102,33 +91,25 @@ experimental and subject to change. """ function spawn_datadeps(f::Base.Callable; static::Bool=true, traversal::Symbol=:inorder, - scheduler::Union{DataDepsScheduler,Symbol,Nothing}=nothing, + scheduler::Union{Symbol,Nothing}=nothing, aliasing::Bool=true, launch_wait::Union{Bool,Nothing}=nothing) if !static throw(ArgumentError("Dynamic scheduling is no longer available")) end wait_all(; check_errors=true) do - scheduler = something(scheduler, DATADEPS_SCHEDULER[], RoundRobinScheduler()) + scheduler = something(scheduler, DATADEPS_SCHEDULER[], :roundrobin)::Symbol launch_wait = something(launch_wait, DATADEPS_LAUNCH_WAIT[], false)::Bool if launch_wait result = spawn_bulk() do queue = DataDepsTaskQueue(get_options(:task_queue); traversal, scheduler, aliasing) - accel = current_acceleration() - if accel isa MPIAcceleration - service_aliasing_requests(accel.comm) - end with_options(f; task_queue=queue) distribute_tasks!(queue) end else queue = DataDepsTaskQueue(get_options(:task_queue); traversal, scheduler, aliasing) - accel = current_acceleration() - if accel isa MPIAcceleration - service_aliasing_requests(accel.comm) - end result = with_options(f; task_queue=queue) distribute_tasks!(queue) end @@ -136,7 +117,7 @@ function spawn_datadeps(f::Base.Callable; static::Bool=true, return result end end -const DATADEPS_SCHEDULER = ScopedValue{Union{DataDepsScheduler,Symbol,Nothing}}(nothing) +const DATADEPS_SCHEDULER = ScopedValue{Union{Symbol,Nothing}}(nothing) const DATADEPS_LAUNCH_WAIT = ScopedValue{Union{Bool,Nothing}}(nothing) @warn "Don't blindly set occupancy=0, only do for MPI" maxlog=1 @@ -152,9 +133,6 @@ function distribute_tasks!(queue::DataDepsTaskQueue) # Get the 
set of all processors to be scheduled on scope = get_compute_scope() accel = current_acceleration() - if accel isa MPIAcceleration - service_aliasing_requests(accel.comm) - end accel_procs = filter(procs(Dagger.Sch.eager_context())) do proc Dagger.accel_matches_proc(accel, proc) end @@ -165,9 +143,7 @@ function distribute_tasks!(queue::DataDepsTaskQueue) if isempty(all_procs) throw(Sch.SchedulingException("No processors available, try widening scope")) end - all_scope = UnionScope(map(ExactScope, all_procs)...) exec_spaces = unique(vcat(map(proc->collect(memory_spaces(proc)), all_procs)...)) - DATADEPS_EXEC_SPACES[] = exec_spaces #=if !all(space->space isa CPURAMMemorySpace, exec_spaces) && !all(space->root_worker_id(space) == myid(), exec_spaces) @warn "Datadeps support for multi-GPU, multi-worker is currently broken\nPlease be prepared for incorrect results or errors" maxlog=1 end=# @@ -231,29 +207,19 @@ function distribute_tasks!(queue::DataDepsTaskQueue) # Start launching tasks and necessary copies write_num = 1 + proc_idx = 1 + #pressures = Dict{Processor,Int}() proc_to_scope_lfu = BasicLFUCache{Processor,AbstractScope}(1024) for pair in queue.seen_tasks[task_order] spec = pair.spec task = pair.task - write_num = distribute_task!(queue, state, all_procs, all_scope, spec, task, spec.fargs, proc_to_scope_lfu, write_num) + write_num, proc_idx = distribute_task!(queue, state, all_procs, spec, task, spec.fargs, proc_to_scope_lfu, write_num, proc_idx) end # Copy args from remote to local # N.B. 
We sort the keys to ensure a deterministic order for uniformity check_uniform(length(state.arg_owner)) - # #region agent log - if accel isa MPIAcceleration - try - open("/flare/dagger/fdadagger/.cursor/debug-757b3d.log", "a") do io - println(io, "{\"sessionId\":\"757b3d\",\"hypothesisId\":\"H7\",\"location\":\"queue.jl:copy_from_phase\",\"message\":\"Starting copy-from phase\",\"data\":{\"rank\":$(MPI.Comm_rank(accel.comm)),\"arg_owner_count\":$(length(state.arg_owner))},\"timestamp\":$(round(Int,time()*1000))}") - end - catch; end - end - # #endregion for arg_w in sort(collect(keys(state.arg_owner)); by=arg_w->arg_w.hash) - if accel isa MPIAcceleration - service_aliasing_requests(accel.comm) - end check_uniform(arg_w) arg = arg_w.arg origin_space = state.arg_origin[arg] @@ -293,25 +259,9 @@ struct TypedDataDepsTaskArgument{T,N} end map_or_ntuple(f, xs::Vector) = map(f, 1:length(xs)) map_or_ntuple(f, xs::Tuple) = ntuple(f, length(xs)) -function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_procs, all_scope, spec::DTaskSpec{typed}, task::DTask, fargs, proc_to_scope_lfu, write_num::Int) where typed +function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_procs, spec::DTaskSpec{typed}, task::DTask, fargs, proc_to_scope_lfu, write_num::Int, proc_idx::Int) where typed @specialize spec fargs - accel = current_acceleration() - if accel isa MPIAcceleration - service_aliasing_requests(accel.comm) - end - - # #region agent log - r = accel isa MPIAcceleration ? 
MPI.Comm_rank(accel.comm) : -1 - if accel isa MPIAcceleration - try - open("/flare/dagger/fdadagger/.cursor/debug-757b3d.log", "a") do io - println(io, "{\"sessionId\":\"757b3d\",\"hypothesisId\":\"H7\",\"location\":\"queue.jl:distribute_task_entry\",\"message\":\"distribute_task entry\",\"data\":{\"rank\":$r,\"task_id\":$(task.id)},\"timestamp\":$(round(Int,time()*1000))}") - end - catch; end - end - # #endregion - DATADEPS_CURRENT_TASK[] = task if typed @@ -320,30 +270,143 @@ function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_pr fargs::Vector{Argument} end - task_scope = @something(spec.options.compute_scope, spec.options.scope, DefaultScope()) scheduler = queue.scheduler + if scheduler == :naive + raw_args = map(arg->tochunk(value(arg)), spec.fargs) + our_proc = remotecall_fetch(1, all_procs, raw_args) do all_procs, raw_args + Sch.init_eager() + sch_state = Sch.EAGER_STATE[] + + @lock sch_state.lock begin + # Calculate costs per processor and select the most optimal + # FIXME: This should consider any already-allocated slots, + # whether they are up-to-date, and if not, the cost of moving + # data to them + procs, costs = Sch.estimate_task_costs(sch_state, all_procs, nothing, raw_args) + return first(procs) + end + end + elseif scheduler == :smart + raw_args = map(filter(arg->haskey(state.data_locality, value(arg)), spec.fargs)) do arg + arg_chunk = tochunk(value(arg)) + # Only the owned slot is valid + # FIXME: Track up-to-date copies and pass all of those + return arg_chunk => data_locality[arg] + end + f_chunk = tochunk(value(spec.fargs[1])) + our_proc, task_pressure = remotecall_fetch(1, all_procs, pressures, f_chunk, raw_args) do all_procs, pressures, f, chunks_locality + Sch.init_eager() + sch_state = Sch.EAGER_STATE[] + + @lock sch_state.lock begin + tx_rate = sch_state.transfer_rate[] + + costs = Dict{Processor,Float64}() + for proc in all_procs + # Filter out chunks that are already local + chunks_filt = 
Iterators.filter(((chunk, space)=chunk_locality)->!(proc in processors(space)), chunks_locality) + + # Estimate network transfer costs based on data size + # N.B. `affinity(x)` really means "data size of `x`" + # N.B. We treat same-worker transfers as having zero transfer cost + tx_cost = Sch.impute_sum(affinity(chunk)[2] for chunk in chunks_filt) + + # Estimate total cost to move data and get task running after currently-scheduled tasks + est_time_util = get(pressures, proc, UInt64(0)) + costs[proc] = est_time_util + (tx_cost/tx_rate) + end - # Use datadeps_schedule_task (master API) - our_proc = datadeps_schedule_task(scheduler, state, all_procs, all_scope, task_scope, spec, task) - @assert our_proc in all_procs - our_space = only(memory_spaces(our_proc)) - # #region agent log - if accel isa MPIAcceleration - proc_rank = our_proc isa Dagger.MPIProcessor ? our_proc.rank : (our_proc isa Dagger.MPIOSProc ? our_proc.rank : -1) - try - open("/flare/dagger/fdadagger/.cursor/debug-757b3d.log", "a") do io - println(io, "{\"sessionId\":\"757b3d\",\"hypothesisId\":\"H7\",\"location\":\"queue.jl:distribute_task_scheduled\",\"message\":\"task scheduled\",\"data\":{\"rank\":$r,\"task_id\":$(task.id),\"our_proc_rank\":$proc_rank},\"timestamp\":$(round(Int,time()*1000))}") + # Look up estimated task cost + sig = Sch.signature(sch_state, f, map(first, chunks_locality)) + task_pressure = get(sch_state.signature_time_cost, sig, 1000^3) + + # Shuffle procs around, so equally-costly procs are equally considered + P = randperm(length(all_procs)) + procs = getindex.(Ref(all_procs), P) + + # Sort by lowest cost first + sort!(procs, by=p->costs[p]) + + best_proc = first(procs) + return best_proc, task_pressure end - catch; end + end + # FIXME: Pressure should be decreased by pressure of syncdeps on same processor + pressures[our_proc] = get(pressures, our_proc, UInt64(0)) + task_pressure + elseif scheduler == :ultra + args = Base.mapany(spec.fargs) do arg + pos, data = arg + data, _ = 
unwrap_inout(data) + if data isa DTask + data = fetch(data; move_value=false, unwrap=false) + end + return pos => tochunk(data) + end + f_chunk = tochunk(value(spec.fargs[1])) + task_time = remotecall_fetch(1, f_chunk, args) do f, args + Sch.init_eager() + sch_state = Sch.EAGER_STATE[] + return @lock sch_state.lock begin + sig = Sch.signature(sch_state, f, args) + return get(sch_state.signature_time_cost, sig, 1000^3) + end + end + + # FIXME: Copy deps are computed eagerly + deps = @something(spec.options.syncdeps, Set{Any}()) + + # Find latest time-to-completion of all syncdeps + deps_completed = UInt64(0) + for dep in deps + haskey(sstate.task_completions, dep) || continue # copy deps aren't recorded + deps_completed = max(deps_completed, sstate.task_completions[dep]) + end + + # Find latest time-to-completion of each memory space + # FIXME: Figure out space completions based on optimal packing + spaces_completed = Dict{MemorySpace,UInt64}() + for space in exec_spaces + completed = UInt64(0) + for (task, other_space) in sstate.assignments + space == other_space || continue + completed = max(completed, sstate.task_completions[task]) + end + spaces_completed[space] = completed + end + + # Choose the earliest-available memory space and processor + # FIXME: Consider move time + move_time = UInt64(0) + local our_space_completed + while true + our_space_completed, our_space = findmin(spaces_completed) + our_space_procs = filter(proc->proc in all_procs, processors(our_space)) + if isempty(our_space_procs) + delete!(spaces_completed, our_space) + continue + end + our_proc = rand(our_space_procs) + break + end + + sstate.task_to_spec[task] = spec + sstate.assignments[task] = our_space + sstate.task_completions[task] = our_space_completed + move_time + task_time + elseif scheduler == :roundrobin + our_proc = all_procs[proc_idx] + else + error("Invalid scheduler: $sched") end - # #endregion + @assert our_proc in all_procs + our_space = only(memory_spaces(our_proc)) # Find 
the scope for this task (and its copies) - if task_scope == all_scope + task_scope = @something(spec.options.compute_scope, spec.options.scope, DefaultScope()) + if task_scope == scope # Optimize for the common case, cache the proc=>scope mapping our_scope = get!(proc_to_scope_lfu, our_proc) do our_procs = filter(proc->proc in all_procs, collect(processors(our_space))) - return constrain(UnionScope(map(ExactScope, our_procs)...), all_scope) + return constrain(UnionScope(map(ExactScope, our_procs)...), scope) end else # Use the provided scope and constrain it to the available processors @@ -380,10 +443,6 @@ function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_pr # Copy args from local to remote remote_args = map_or_ntuple(task_arg_ws) do idx - if accel isa MPIAcceleration - service_aliasing_requests(accel.comm) - end - arg_ws = task_arg_ws[idx] arg = arg_ws.arg pos = raw_position(arg_ws.pos) @@ -504,6 +563,7 @@ function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_pr end write_num += 1 + proc_idx = mod1(proc_idx + 1, length(all_procs)) - return write_num + return write_num, proc_idx end diff --git a/src/datadeps/scheduling.jl b/src/datadeps/scheduling.jl index d883c687c..0bf9818f6 100644 --- a/src/datadeps/scheduling.jl +++ b/src/datadeps/scheduling.jl @@ -1,10 +1,5 @@ -export DataDepsScheduler, RoundRobinScheduler, NaiveScheduler, UltraScheduler - abstract type DataDepsScheduler end -# Set by distribute_tasks! 
before the task loop for UltraScheduler -const DATADEPS_EXEC_SPACES = Ref{Union{Vector{<:MemorySpace},Nothing}}(nothing) - mutable struct RoundRobinScheduler <: DataDepsScheduler proc_idx::Int RoundRobinScheduler() = new(1) @@ -84,7 +79,7 @@ function datadeps_schedule_task(sched::UltraScheduler, state::DataDepsState, all end # FIXME: Copy deps are computed eagerly - deps = @something(spec.options.syncdeps, Set{Any}()) + deps = @something(spec.options.syncdeps, Set{ThunkSyncdep}()) # Find latest time-to-completion of all syncdeps deps_completed = UInt64(0) @@ -95,8 +90,6 @@ function datadeps_schedule_task(sched::UltraScheduler, state::DataDepsState, all # Find latest time-to-completion of each memory space # FIXME: Figure out space completions based on optimal packing - # exec_spaces is set by distribute_tasks! before the task loop - exec_spaces = something(DATADEPS_EXEC_SPACES[], unique(vcat(map(proc->collect(memory_spaces(proc)), all_procs)...))) spaces_completed = Dict{MemorySpace,UInt64}() for space in exec_spaces completed = UInt64(0) diff --git a/src/memory-spaces.jl b/src/memory-spaces.jl index a41e91d4b..dd9b8dc3f 100644 --- a/src/memory-spaces.jl +++ b/src/memory-spaces.jl @@ -24,7 +24,6 @@ struct CPURAMMemorySpace <: MemorySpace owner::Int end root_worker_id(space::CPURAMMemorySpace) = space.owner -root_worker_id(c::Chunk) = root_worker_id(c.handle) CPURAMMemorySpace() = CPURAMMemorySpace(myid()) @@ -137,7 +136,37 @@ end may_alias(::MemorySpace, ::MemorySpace) = true may_alias(space1::CPURAMMemorySpace, space2::CPURAMMemorySpace) = space1.owner == space2.owner -# RemotePtr and MemorySpan are defined in utils/memory-span.jl (included earlier) +struct RemotePtr{T,S<:MemorySpace} <: Ref{T} + addr::UInt + space::S +end +RemotePtr{T}(addr::UInt, space::S) where {T,S} = RemotePtr{T,S}(addr, space) +RemotePtr{T}(ptr::Ptr{V}, space::S) where {T,V,S} = RemotePtr{T,S}(UInt(ptr), space) +RemotePtr{T}(ptr::Ptr{V}) where {T,V} = RemotePtr{T}(UInt(ptr), 
CPURAMMemorySpace(myid())) +# FIXME: Don't hardcode CPURAMMemorySpace +RemotePtr(addr::UInt) = RemotePtr{Cvoid}(addr, CPURAMMemorySpace(myid())) +Base.convert(::Type{RemotePtr}, x::Ptr{T}) where T = + RemotePtr(UInt(x), CPURAMMemorySpace(myid())) +Base.convert(::Type{<:RemotePtr{V}}, x::Ptr{T}) where {V,T} = + RemotePtr{V}(UInt(x), CPURAMMemorySpace(myid())) +Base.convert(::Type{UInt}, ptr::RemotePtr) = ptr.addr +Base.:+(ptr::RemotePtr{T}, offset::Integer) where T = RemotePtr{T}(ptr.addr + offset, ptr.space) +Base.:-(ptr::RemotePtr{T}, offset::Integer) where T = RemotePtr{T}(ptr.addr - offset, ptr.space) +function Base.isless(ptr1::RemotePtr, ptr2::RemotePtr) + @assert ptr1.space == ptr2.space + return ptr1.addr < ptr2.addr +end + +struct MemorySpan{S} + ptr::RemotePtr{Cvoid,S} + len::UInt +end +MemorySpan(ptr::RemotePtr{Cvoid,S}, len::Integer) where S = + MemorySpan{S}(ptr, UInt(len)) +MemorySpan{S}(addr::UInt, len::Integer) where S = + MemorySpan{S}(RemotePtr{Cvoid,S}(addr), UInt(len)) +Base.isless(a::MemorySpan, b::MemorySpan) = a.ptr < b.ptr +Base.isempty(x::MemorySpan) = x.len == 0 abstract type AbstractAliasing end memory_spans(::T) where T<:AbstractAliasing = throw(ArgumentError("Must define `memory_spans` for `$T`")) memory_spans(x) = memory_spans(aliasing(x)) @@ -425,4 +454,34 @@ function will_alias(x_span::MemorySpan, y_span::MemorySpan) return x_span.ptr <= y_end && y_span.ptr <= x_end end -# LocalMemorySpan, ManyMemorySpan, ManyPair are defined in utils/memory-span.jl (included earlier) +### More space-efficient memory spans + +struct LocalMemorySpan + ptr::UInt + len::UInt +end +LocalMemorySpan(span::MemorySpan) = LocalMemorySpan(span.ptr.addr, span.len) +Base.isempty(x::LocalMemorySpan) = x.len == 0 + +# FIXME: Store the length separately, since it's shared by all spans +struct ManyMemorySpan{N} + spans::NTuple{N,LocalMemorySpan} +end +Base.isempty(x::ManyMemorySpan) = all(isempty, x.spans) + +struct ManyPair{N} <: Unsigned + pairs::NTuple{N,UInt} 
+end +Base.promote_rule(::Type{ManyPair}, ::Type{T}) where {T<:Integer} = ManyPair +Base.convert(::Type{ManyPair{N}}, x::T) where {T<:Integer,N} = ManyPair(ntuple(i -> x, N)) +Base.convert(::Type{ManyPair}, x::ManyPair) = x +Base.:+(x::ManyPair{N}, y::ManyPair{N}) where N = ManyPair(ntuple(i -> x.pairs[i] + y.pairs[i], N)) +Base.:-(x::ManyPair{N}, y::ManyPair{N}) where N = ManyPair(ntuple(i -> x.pairs[i] - y.pairs[i], N)) +Base.:-(x::ManyPair) = error("Can't negate a ManyPair") +Base.:(==)(x::ManyPair, y::ManyPair) = x.pairs == y.pairs +Base.isless(x::ManyPair, y::ManyPair) = x.pairs[1] < y.pairs[1] +Base.:(<)(x::ManyPair, y::ManyPair) = x.pairs[1] < y.pairs[1] +Base.string(x::ManyPair) = "ManyPair($(x.pairs))" + +ManyMemorySpan{N}(start::ManyPair{N}, len::ManyPair{N}) where N = + ManyMemorySpan{N}(ntuple(i -> LocalMemorySpan(start.pairs[i], len.pairs[i]), N)) diff --git a/src/mpi.jl b/src/mpi.jl index bb4d55ab9..4b85122b9 100644 --- a/src/mpi.jl +++ b/src/mpi.jl @@ -49,38 +49,21 @@ struct MPIAcceleration <: Acceleration end MPIAcceleration() = MPIAcceleration(MPI.COMM_WORLD) -const ALIASING_REQUEST_TAG = UInt32(0xFF00) - -# Deterministic tag for data movement (remotecall_endpoint). Uses a separate tag space -# so it cannot collide with the global to_tag() counter used by execute! / aliasing. -# Prevents symmetric deadlock when one rank blocks in remotecall_endpoint recv while -# another rank consumes the same counter value in execute! recv. 
-const REMOTECALL_TAG_BASE = 100_000 -const REMOTECALL_TAG_RANGE = 424_287 # so base+range-1 <= typical tag_ub -function remotecall_tag(comm::MPI.Comm, uid, from_rank::Int, to_rank::Int, ref_id) - tag_ub = Int(MPI.tag_ub()) - range = min(REMOTECALL_TAG_RANGE, max(1, tag_ub - REMOTECALL_TAG_BASE + 1)) - h = hash((uid, from_rank, to_rank, ref_id)) - tag = REMOTECALL_TAG_BASE + Int(rem(h, UInt(range))) - return UInt32(tag) -end - function aliasing(accel::MPIAcceleration, x::Chunk, T) handle = x.handle::MPIRef @assert accel.comm == handle.comm "MPIAcceleration comm mismatch" + tag = to_tag() + check_uniform(tag) rank = MPI.Comm_rank(accel.comm) if handle.rank == rank ainfo = aliasing(x, T) - check_uniform(ainfo) - return ainfo - end - response_tag = to_tag() - check_uniform(response_tag) - request_payload = (handle, T, response_tag) - _send_yield_raw(request_payload, accel.comm, handle.rank, Int(ALIASING_REQUEST_TAG)) - ainfo = recv_yield(accel.comm, handle.rank, response_tag) - if ainfo isa Exception - throw(ainfo) + #Core.print("[$rank] aliasing: $ainfo, sending\n") + @opcounter :aliasing_bcast_send_yield + bcast_send_yield(ainfo, accel.comm, handle.rank, tag) + else + #Core.print("[$rank] aliasing: receiving from $(handle.rank)\n") + ainfo = recv_yield(accel.comm, handle.rank, tag) + #Core.print("[$rank] aliasing: received $ainfo\n") end check_uniform(ainfo) return ainfo @@ -268,8 +251,6 @@ default_processor(space::MPIMemorySpace) = MPIOSProc(space.comm, space.rank) default_memory_space(accel::MPIAcceleration) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, 0) default_memory_space(accel::MPIAcceleration, x) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, 0) -default_memory_space(accel::MPIAcceleration, x::Array) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, MPI.Comm_rank(accel.comm)) -default_memory_space(accel::MPIAcceleration, x::AliasedObjectCacheStore) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, MPI.Comm_rank(accel.comm)) 
default_memory_space(accel::MPIAcceleration, x::Chunk) = MPIMemorySpace(CPURAMMemorySpace(myid()), x.handle.comm, x.handle.rank) default_memory_space(accel::MPIAcceleration, x::Function) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, MPI.Comm_rank(accel.comm)) default_memory_space(accel::MPIAcceleration, T::Type) = MPIMemorySpace(CPURAMMemorySpace(myid()), accel.comm, MPI.Comm_rank(accel.comm)) @@ -351,166 +332,29 @@ function affinity(x::MPIRef) end end -const MPIREF_ORPHAN = Threads.Atomic{Int}(1) - function take_ref_id!() tid = 0 uid = 0 id = 0 - _branch = "" if Dagger.in_task() tid = sch_handle().thunk_id.id uid = 0 counter = get!(MPIREF_TID, tid, Threads.Atomic{Int}(1)) id = Threads.atomic_add!(counter, 1) - _branch = "in_task" elseif MPI_TID[] != 0 tid = MPI_TID[] uid = 0 counter = get!(MPIREF_TID, tid, Threads.Atomic{Int}(1)) id = Threads.atomic_add!(counter, 1) - _branch = "MPI_TID" elseif MPI_UID[] != 0 tid = 0 uid = MPI_UID[] counter = get!(MPIREF_UID, uid, Threads.Atomic{Int}(1)) id = Threads.atomic_add!(counter, 1) - _branch = "MPI_UID" - else - tid = 0 - uid = Int(Threads.atomic_add!(MPIREF_ORPHAN, 1)) - counter = get!(MPIREF_UID, uid, Threads.Atomic{Int}(1)) - id = Threads.atomic_add!(counter, 1) - _branch = "orphan" end return MPIRefID(tid, uid, id) end -const MPIREF_REGISTRY = Base.Lockable(Dict{MPIRefID, DRef}()) - -const ALIASING_PENDING = Vector{Tuple{MPIRef, Any, UInt32, Int}}() - -""" -Service any pending aliasing requests where we are the owner. -Called from recv_yield loops to avoid deadlock when a requester is blocking -waiting for aliasing from us while we're blocked waiting for someone else. 
-""" -# #region agent log -const _SAR_ACTIVE = Threads.Atomic{Int}(0) -# #endregion -const _SAR_OUTBOX = Vector{Tuple{Any, MPI.Comm, Int, UInt32}}() - -const _SAR_STATE = Ref{Symbol}(:idle) -const _SAR_LEN_BUF = Int64[0] -const _SAR_REQ = Ref{Union{Nothing, MPI.Request}}(nothing) -const _SAR_DATA_BUF = Ref{Vector{UInt8}}(UInt8[]) -const _SAR_SRC = Ref{Int}(0) - -function service_aliasing_requests(comm::MPI.Comm) - _prev = Threads.atomic_add!(_SAR_ACTIVE, 1) - if _prev > 0 - Threads.atomic_sub!(_SAR_ACTIVE, 1) - return - end - - rank = MPI.Comm_rank(comm) - - if !isempty(ALIASING_PENDING) - still_pending = Tuple{MPIRef, Any, UInt32, Int}[] - for (handle, dep_mod, response_tag, src) in ALIASING_PENDING - inner_ref = lock(MPIREF_REGISTRY) do reg - get(reg, handle.id, nothing) - end - if inner_ref !== nothing - value = poolget(inner_ref) - ainfo = aliasing(value, dep_mod) - push!(_SAR_OUTBOX, (ainfo, comm, src, response_tag)) - else - push!(still_pending, (handle, dep_mod, response_tag, src)) - end - end - empty!(ALIASING_PENDING) - append!(ALIASING_PENDING, still_pending) - end - - while true - if _SAR_STATE[] == :idle - _SAR_LEN_BUF[1] = 0 - _SAR_REQ[] = MPI.Irecv!(MPI.Buffer(_SAR_LEN_BUF), comm; - source=Int(MPI.API.MPI_ANY_SOURCE[]), tag=Int(ALIASING_REQUEST_TAG)) - _SAR_STATE[] = :wait_len - end - - if _SAR_STATE[] == :wait_len - done, status = MPI.Test(_SAR_REQ[], MPI.Status) - if !done - break - end - _SAR_SRC[] = MPI.Get_source(status) - nbytes = _SAR_LEN_BUF[1] - _SAR_DATA_BUF[] = Array{UInt8}(undef, nbytes) - _SAR_REQ[] = MPI.Irecv!(MPI.Buffer(_SAR_DATA_BUF[]), comm; - source=_SAR_SRC[], tag=Int(ALIASING_REQUEST_TAG)) - _SAR_STATE[] = :wait_data - end - - if _SAR_STATE[] == :wait_data - done, status = MPI.Test(_SAR_REQ[], MPI.Status) - if !done - break - end - payload = MPI.deserialize(_SAR_DATA_BUF[]) - # (SAR recv log removed to reduce noise) - (handle::MPIRef, dep_mod, response_tag::UInt32) = payload - if handle.rank == rank - inner_ref = handle.innerRef - 
if inner_ref === nothing - inner_ref = lock(MPIREF_REGISTRY) do reg - get(reg, handle.id, nothing) - end - end - if inner_ref !== nothing - value = poolget(inner_ref) - ainfo = aliasing(value, dep_mod) - push!(_SAR_OUTBOX, (ainfo, comm, _SAR_SRC[], response_tag)) - else - push!(ALIASING_PENDING, (handle, dep_mod, response_tag, _SAR_SRC[])) - end - end - _SAR_STATE[] = :idle - continue - end - end - - while !isempty(_SAR_OUTBOX) - (ainfo, _comm, dest, rtag) = popfirst!(_SAR_OUTBOX) - _send_outbox_response(ainfo, _comm, dest, Int(rtag)) - end - - Threads.atomic_sub!(_SAR_ACTIVE, 1) -end - -function _send_outbox_response(value, comm, dest, tag) - buf = MPI.serialize(value) - len_buf = Int64[length(buf)] - lock(SEND_SERIALIZE_LOCK) do - GC.@preserve buf len_buf begin - req_len = MPI.Isend(len_buf, comm; dest, tag) - while true - finish, _ = MPI.Test(req_len, MPI.Status) - finish && break - yield() - end - req_data = MPI.Isend(buf, comm; dest, tag) - while true - finish, _ = MPI.Test(req_data, MPI.Status) - finish && break - yield() - end - end - end -end - #TODO: partitioned scheduling with comm bifurcation function tochunk_pset(x, space::MPIMemorySpace; device=nothing, kwargs...) @assert space.comm == MPI.COMM_WORLD "$(space.comm) != $(MPI.COMM_WORLD)" @@ -519,29 +363,15 @@ function tochunk_pset(x, space::MPIMemorySpace; device=nothing, kwargs...) if local_rank != space.rank return MPIRef(space.comm, space.rank, 0, nothing, Mid) else - innerRef = poolset(x; device, kwargs...) 
- lock(MPIREF_REGISTRY) do reg - reg[Mid] = innerRef - end - return MPIRef(space.comm, space.rank, sizeof(x), innerRef, Mid) + return MPIRef(space.comm, space.rank, sizeof(x), poolset(x; device, kwargs...), Mid) end end const DEADLOCK_DETECT = TaskLocalValue{Bool}(()->true) const DEADLOCK_WARN_PERIOD = TaskLocalValue{Float64}(()->10.0) const DEADLOCK_TIMEOUT_PERIOD = TaskLocalValue{Float64}(()->60.0) -# When true, __wait_for_request spins without yield so remotecall sender completes before other tasks run. -const REMOTECALL_SENDER_NO_YIELD = TaskLocalValue{Bool}(()->false) const RECV_WAITING = Base.Lockable(Dict{Tuple{MPI.Comm, Int, Int}, Base.Event}()) -# MPI_ANY_TAG + queue: pool of Irecv(ANY_SOURCE, ANY_TAG) and completion queue keyed by (comm, source, tag). -const RECV_POOL_SIZE = 64 -const _RECV_POOL = Dict{MPI.Comm, Any}() # comm -> RecvPoolState -const _RECV_POOL_LOCK = ReentrantLock() - -# Completion queue: (comm, source, tag) -> list of received values (one waiter per key at a time). -const _RECV_COMPLETION_QUEUE = Base.Lockable(Dict{Tuple{MPI.Comm, Int, Int}, Vector{Any}}()) - struct InplaceInfo type::DataType shape::Tuple @@ -555,52 +385,6 @@ struct InplaceSparseInfo nzval::Int end -# MPI.Buffer uses Int32 for count; reject corrupt or oversized length to avoid InexactError. -const MAX_SERIALIZED_RECV_LENGTH = Int64(typemax(Int32)) - -# Per-slot state for the recv pool. Phase: :waiting_length | :waiting_data | :waiting_inplace_* | :idle (slot free). -mutable struct RecvPoolSlot - phase::Symbol - comm::MPI.Comm - source::Int - tag::Int - len_buf::Vector{Int64} - req::Union{MPI.Request, Nothing} - data_buf::Vector{UInt8} - # Inplace: for InplaceInfo we store the array buffer; for InplaceSparseInfo we store (colptr, rowval, nzval) as we go. 
- inplace_info::Union{InplaceInfo, InplaceSparseInfo, Nothing} - inplace_bufs::Vector{Any} # accumulated inplace arrays -end - -mutable struct RecvPoolState - slots::Vector{RecvPoolSlot} - initialized::Bool -end - -function _recv_pool_for_comm(comm::MPI.Comm) - lock(_RECV_POOL_LOCK) do - if !haskey(_RECV_POOL, comm) - _RECV_POOL[comm] = RecvPoolState(RecvPoolSlot[], false) - end - return _RECV_POOL[comm] - end -end - -const _MPI_ANY_SOURCE = Int(MPI.API.MPI_ANY_SOURCE[]) -const _MPI_ANY_TAG = Int(MPI.API.MPI_ANY_TAG[]) - -function _recv_pool_init!(pool::RecvPoolState, comm::MPI.Comm) - pool.initialized && return - rank = MPI.Comm_rank(comm) - for i in 1:RECV_POOL_SIZE - len_buf = Int64[0] - req = MPI.Irecv!(MPI.Buffer(len_buf), comm; source=_MPI_ANY_SOURCE, tag=_MPI_ANY_TAG) - slot = RecvPoolSlot(:waiting_length, comm, -1, -1, len_buf, req, UInt8[], nothing, Any[]) - push!(pool.slots, slot) - end - pool.initialized = true -end - function supports_inplace_mpi(value) if value isa DenseArray && isbitstype(eltype(value)) return true @@ -610,74 +394,74 @@ function supports_inplace_mpi(value) end function recv_yield!(buffer, comm, src, tag) rank = MPI.Comm_rank(comm) + #Core.println("buffer recv: $buffer, type of buffer: $(typeof(buffer)), is in place? $(supports_inplace_mpi(buffer))") if !supports_inplace_mpi(buffer) return recv_yield(comm, src, tag), false end - # Inplace: sender uses InplaceInfo+array (length-prefix), pool assembles and queues the array; copy to user buffer. - value = recv_yield(comm, src, tag) - copy!(buffer, value) + #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Starting recv! 
from [$src]") + + # Ensure no other receiver is waiting + our_event = Base.Event() + @label retry + other_event = lock(RECV_WAITING) do waiting + if haskey(waiting, (comm, src, tag)) + waiting[(comm, src, tag)] + else + waiting[(comm, src, tag)] = our_event + nothing + end + end + if other_event !== nothing + #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Waiting for other receiver...") + wait(other_event) + @goto retry + end + + buffer = recv_yield_inplace!(buffer, comm, rank, src, tag) + + lock(RECV_WAITING) do waiting + delete!(waiting, (comm, src, tag)) + notify(our_event) + end + return buffer, true + end function recv_yield(comm, src, tag) rank = MPI.Comm_rank(comm) - key = (comm, src, tag) - - # Check completion queue first (message may already have been received by the pool). - value = lock(_RECV_COMPLETION_QUEUE) do q - if haskey(q, key) && !isempty(q[key]) - ref = popfirst!(q[key]) - isempty(q[key]) && delete!(q, key) - return poolget(ref) - end - return nothing - end - if value !== nothing - return value - end + #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Starting recv from [$src]") - # Ensure no other receiver is waiting for this (comm, src, tag). + # Ensure no other receiver is waiting our_event = Base.Event() @label retry other_event = lock(RECV_WAITING) do waiting - if haskey(waiting, key) - waiting[key] + if haskey(waiting, (comm, src, tag)) + waiting[(comm, src, tag)] else - waiting[key] = our_event + waiting[(comm, src, tag)] = our_event nothing end end if other_event !== nothing + #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Waiting for other receiver...") wait(other_event) @goto retry end + #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Receiving...") - # Loop: drain pool, check queue, service aliasing, deadlock detect, yield. 
- time_start = time_ns() - detect = DEADLOCK_DETECT[] - warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) - timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) - @label wait_loop - service_recv_pool(comm) - value = lock(_RECV_COMPLETION_QUEUE) do q - if haskey(q, key) && !isempty(q[key]) - ref = popfirst!(q[key]) - isempty(q[key]) && delete!(q, key) - return poolget(ref) - end - return nothing + type = nothing + @label receive + value = recv_yield_serialized(comm, rank, src, tag) + if value isa InplaceInfo || value isa InplaceSparseInfo + value = recv_yield_inplace(value, comm, rank, src, tag) end - if value !== nothing - lock(RECV_WAITING) do waiting - delete!(waiting, key) - notify(our_event) - end - return value + + lock(RECV_WAITING) do waiting + delete!(waiting, (comm, src, tag)) + notify(our_event) end - service_aliasing_requests(comm) - warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, rank, tag, "recv", src) - yield() - @goto wait_loop + return value end function recv_yield_inplace!(array, comm, my_rank, their_rank, tag) @@ -686,17 +470,19 @@ function recv_yield_inplace!(array, comm, my_rank, their_rank, tag) warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) - req = MPI.Irecv!(MPI.Buffer(array), comm; source=their_rank, tag=tag) while true - finish, status = MPI.Test(req, MPI.Status) - if finish - if MPI.Get_error(status) != MPI.SUCCESS - error("recv_yield failed with error $(MPI.Get_error(status))") + (got, msg, stat) = MPI.Improbe(their_rank, tag, comm, MPI.Status) + if got + if MPI.Get_error(stat) != MPI.SUCCESS + error("recv_yield failed with error $(MPI.Get_error(stat))") end + count = MPI.Get_count(stat, UInt8) + @assert count == sizeof(array) "recv_yield_inplace: expected $(sizeof(array)) bytes, got $count" + buf = MPI.Buffer(array) + req = MPI.Imrecv!(buf, msg) + __wait_for_request(req, comm, my_rank, their_rank, tag, 
"recv_yield", "recv") return array end - service_recv_pool(comm) - service_aliasing_requests(comm) warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, my_rank, tag, "recv", their_rank) yield() end @@ -720,228 +506,30 @@ function recv_yield_inplace(_value::InplaceSparseInfo, comm, my_rank, their_rank return SparseMatrixCSC{eltype(T), Int64}(_value.m, _value.n, colptr, rowval, nzval) end -""" -Drain the recv pool: Test each slot's request; when complete, advance the state machine -(length -> data -> optional inplace -> ready). Push completed messages to the completion -queue and notify waiters. Replenish slots with new Irecv(ANY_SOURCE, ANY_TAG) for length. -Called from recv_yield's wait loop and from __wait_for_request (and recv_yield_inplace!). -""" -function service_recv_pool(comm::MPI.Comm) - pool = _recv_pool_for_comm(comm) - _recv_pool_init!(pool, comm) - rank = MPI.Comm_rank(comm) - - for slot in pool.slots - slot.req === nothing && continue - done, status = MPI.Test(slot.req, MPI.Status) - if !done - continue - end - if MPI.Get_error(status) != MPI.SUCCESS - error("recv pool slot failed with error $(MPI.Get_error(status))") - end - - if slot.phase == :waiting_length - slot.source = MPI.Get_source(status) - slot.tag = MPI.Get_tag(status) - count = slot.len_buf[1] - if count < 0 || count > MAX_SERIALIZED_RECV_LENGTH - error("recv pool: invalid or corrupt length $count (max $(MAX_SERIALIZED_RECV_LENGTH)); source=$(slot.source), tag=$(slot.tag)") - end - slot.data_buf = Array{UInt8}(undef, count) - slot.req = MPI.Irecv!(MPI.Buffer(slot.data_buf), comm; source=slot.source, tag=slot.tag) - slot.phase = :waiting_data - continue - end - - if slot.phase == :waiting_data - value = MPI.deserialize(slot.data_buf) - if slot.tag == Int(ALIASING_REQUEST_TAG) - # Hand off to aliasing path (same as service_aliasing_requests). 
- (handle::MPIRef, dep_mod, response_tag::UInt32) = value - if handle.rank == rank - inner_ref = handle.innerRef - if inner_ref === nothing - inner_ref = lock(MPIREF_REGISTRY) do reg - get(reg, handle.id, nothing) - end - end - if inner_ref !== nothing - v = poolget(inner_ref) - ainfo = aliasing(v, dep_mod) - push!(_SAR_OUTBOX, (ainfo, comm, slot.source, response_tag)) - else - push!(ALIASING_PENDING, (handle, dep_mod, response_tag, slot.source)) - end - end - _recv_pool_slot_reset!(slot, comm) - continue - end - - if value isa InplaceInfo - T = value.type - @assert T <: Array && isbitstype(eltype(T)) - arr = Array{eltype(T)}(undef, value.shape) - slot.inplace_info = value - slot.inplace_bufs = [arr] - slot.req = MPI.Irecv!(MPI.Buffer(arr), comm; source=slot.source, tag=slot.tag) - slot.phase = :waiting_inplace_dense - continue - end - - if value isa InplaceSparseInfo - slot.inplace_info = value - colptr_buf = Vector{Int64}(undef, value.colptr) - slot.inplace_bufs = [colptr_buf] - slot.req = MPI.Irecv!(MPI.Buffer(colptr_buf), comm; source=slot.source, tag=slot.tag) - slot.phase = :waiting_inplace_colptr - continue - end - - # Serialized value complete. 
- _recv_pool_push_and_reset!(slot, comm, value) - continue - end - - if slot.phase == :waiting_inplace_dense - arr = slot.inplace_bufs[1] - _recv_pool_push_and_reset!(slot, comm, arr) - continue - end - - if slot.phase == :waiting_inplace_colptr - sp = slot.inplace_info::InplaceSparseInfo - rowval_buf = Vector{Int64}(undef, sp.rowval) - push!(slot.inplace_bufs, rowval_buf) - slot.req = MPI.Irecv!(MPI.Buffer(rowval_buf), comm; source=slot.source, tag=slot.tag) - slot.phase = :waiting_inplace_rowval - continue - end - - if slot.phase == :waiting_inplace_rowval - sp = slot.inplace_info::InplaceSparseInfo - nzval_buf = Vector{eltype(sp.type)}(undef, sp.nzval) - push!(slot.inplace_bufs, nzval_buf) - slot.req = MPI.Irecv!(MPI.Buffer(nzval_buf), comm; source=slot.source, tag=slot.tag) - slot.phase = :waiting_inplace_nzval - continue - end - - if slot.phase == :waiting_inplace_nzval - sp = slot.inplace_info::InplaceSparseInfo - colptr = slot.inplace_bufs[1]::Vector{Int64} - rowval = slot.inplace_bufs[2]::Vector{Int64} - nzval = slot.inplace_bufs[3] - mat = SparseMatrixCSC{eltype(sp.type), Int64}(sp.m, sp.n, colptr, rowval, nzval) - _recv_pool_push_and_reset!(slot, comm, mat) - continue - end - end -end - -function _recv_pool_slot_reset!(slot::RecvPoolSlot, comm::MPI.Comm) - slot.phase = :waiting_length - slot.source = -1 - slot.tag = -1 - slot.len_buf[1] = 0 - slot.req = MPI.Irecv!(MPI.Buffer(slot.len_buf), comm; source=_MPI_ANY_SOURCE, tag=_MPI_ANY_TAG) - slot.data_buf = UInt8[] - slot.inplace_info = nothing - slot.inplace_bufs = Any[] -end - -function _recv_pool_push_and_reset!(slot::RecvPoolSlot, comm::MPI.Comm, value::Any) - key = (comm, slot.source, slot.tag) - ref = poolset(value) - lock(_RECV_COMPLETION_QUEUE) do q - if !haskey(q, key) - q[key] = Any[] - end - push!(q[key], ref) - end - lock(RECV_WAITING) do waiting - if haskey(waiting, key) - notify(waiting[key]) - end - end - _recv_pool_slot_reset!(slot, comm) -end - function recv_yield_serialized(comm, my_rank, 
their_rank, tag) time_start = time_ns() detect = DEADLOCK_DETECT[] warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) - len_buf = Int64[0] - local req_len - try - req_len = MPI.Irecv!(MPI.Buffer(len_buf), comm; source=their_rank, tag=tag) - catch e - # #region agent log - try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H15_mpi_err\",\"location\":\"mpi.jl:recv_ser:Irecv_len\",\"message\":\"Irecv! len threw\",\"data\":{\"rank\":$my_rank,\"tag\":$tag,\"src\":$their_rank,\"error\":\"$(sprint(showerror, e))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end - # #endregion - rethrow() - end - while true - local finish, status - try - finish, status = MPI.Test(req_len, MPI.Status) - catch e - # #region agent log - try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H15_mpi_err\",\"location\":\"mpi.jl:recv_ser:Test_len\",\"message\":\"Test len threw\",\"data\":{\"rank\":$my_rank,\"tag\":$tag,\"src\":$their_rank,\"error\":\"$(sprint(showerror, e))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end - # #endregion - rethrow() - end - if finish - if MPI.Get_error(status) != MPI.SUCCESS - error("recv_yield_serialized len failed with error $(MPI.Get_error(status))") - end - break - end - service_aliasing_requests(comm) - warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, my_rank, tag, "recv", their_rank) - yield() - end - count = len_buf[1] - if count < 0 || count > MAX_SERIALIZED_RECV_LENGTH - error("recv_yield_serialized: invalid or corrupt length $count (max $(MAX_SERIALIZED_RECV_LENGTH)); src=$their_rank, tag=$tag") - end - buf = Array{UInt8}(undef, count) - local req_data - try - req_data = MPI.Irecv!(MPI.Buffer(buf), comm; source=their_rank, tag=tag) - catch e - # #region agent log - 
try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H15_mpi_err\",\"location\":\"mpi.jl:recv_ser:Irecv_data\",\"message\":\"Irecv! data threw\",\"data\":{\"rank\":$my_rank,\"tag\":$tag,\"src\":$their_rank,\"count\":$count,\"error\":\"$(sprint(showerror, e))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end - # #endregion - rethrow() - end while true - local finish, status - try - finish, status = MPI.Test(req_data, MPI.Status) - catch e - # #region agent log - try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H15_mpi_err\",\"location\":\"mpi.jl:recv_ser:Test_data\",\"message\":\"Test data threw\",\"data\":{\"rank\":$my_rank,\"tag\":$tag,\"src\":$their_rank,\"error\":\"$(sprint(showerror, e))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end - # #endregion - rethrow() - end - if finish - if MPI.Get_error(status) != MPI.SUCCESS - error("recv_yield_serialized data failed with error $(MPI.Get_error(status))") + (got, msg, stat) = MPI.Improbe(their_rank, tag, comm, MPI.Status) + if got + if MPI.Get_error(stat) != MPI.SUCCESS + error("recv_yield failed with error $(MPI.Get_error(stat))") end + count = MPI.Get_count(stat, UInt8) + buf = Array{UInt8}(undef, count) + req = MPI.Imrecv!(MPI.Buffer(buf), msg) + __wait_for_request(req, comm, my_rank, their_rank, tag, "recv_yield", "recv") return MPI.deserialize(buf) end - service_aliasing_requests(comm) warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, my_rank, tag, "recv", their_rank) yield() end end const SEEN_TAGS = Dict{Int32, Type}() -# Serialize nonblocking sends so only one Isend+wait is in flight at a time; avoids MPICH internal_Isend segfault with many concurrent requests. 
-const SEND_SERIALIZE_LOCK = ReentrantLock() send_yield!(value, comm, dest, tag; check_seen::Bool=true) = _send_yield(value, comm, dest, tag; check_seen, inplace=true) send_yield(value, comm, dest, tag; check_seen::Bool=true) = @@ -955,9 +543,8 @@ function _send_yield(value, comm, dest, tag; check_seen::Bool=true, inplace::Boo if check_seen SEEN_TAGS[tag] = typeof(value) end - # Inplace sends use InplaceInfo+array so the recv pool (ANY_TAG) can receive them; never send raw array only. + #Core.println("[rank $(MPI.Comm_rank(comm))][tag $tag] Starting send to [$dest]: $(typeof(value)), is support inplace? $(supports_inplace_mpi(value))") if inplace && supports_inplace_mpi(value) - send_yield_serialized(InplaceInfo(typeof(value), size(value)), comm, rank, dest, tag) send_yield_inplace(value, comm, rank, dest, tag) else send_yield_serialized(value, comm, rank, dest, tag) @@ -966,20 +553,8 @@ end function send_yield_inplace(value, comm, my_rank, their_rank, tag) @opcounter :send_yield_inplace - lock(SEND_SERIALIZE_LOCK) do - GC.@preserve value begin - local req - try - req = MPI.Isend(value, comm; dest=their_rank, tag) - catch e - # #region agent log - try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H15_mpi_err\",\"location\":\"mpi.jl:send_inplace:Isend\",\"message\":\"Isend inplace threw\",\"data\":{\"rank\":$my_rank,\"tag\":$tag,\"dest\":$their_rank,\"error\":\"$(sprint(showerror, e))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end - # #endregion - rethrow() - end - __wait_for_request(req, comm, my_rank, their_rank, tag, "send_yield", "send") - end - end + req = MPI.Isend(value, comm; dest=their_rank, tag) + __wait_for_request(req, comm, my_rank, their_rank, tag, "send_yield", "send") end function send_yield_serialized(value, comm, my_rank, their_rank, tag) @@ -993,59 +568,8 @@ function send_yield_serialized(value, comm, my_rank, their_rank, tag) 
send_yield_inplace(value.rowval, comm, my_rank, their_rank, tag) send_yield_inplace(value.nzval, comm, my_rank, their_rank, tag) else - buf = MPI.serialize(value) - n = length(buf) - lock(SEND_SERIALIZE_LOCK) do - # Non-GC buffers so MPICH gets a stable pointer. Still Isend + yielding wait (no blocking). - ptr_len = Base.Libc.malloc(8) - ptr_len === C_NULL && throw(OutOfMemoryError()) - try - arr_len = Base.unsafe_wrap(Array, Ptr{Int64}(ptr_len), (1,); own=false) - arr_len[1] = n - req_len = MPI.Isend(arr_len, comm; dest=their_rank, tag) - __wait_for_request(req_len, comm, my_rank, their_rank, tag, "send_yield", "send") - finally - Base.Libc.free(ptr_len) - end - ptr = Base.Libc.malloc(n) - ptr === C_NULL && throw(OutOfMemoryError()) - try - arr = Base.unsafe_wrap(Array, Ptr{UInt8}(ptr), (n,); own=false) - copyto!(arr, buf) - req_data = MPI.Isend(arr, comm; dest=their_rank, tag) - __wait_for_request(req_data, comm, my_rank, their_rank, tag, "send_yield", "send") - finally - Base.Libc.free(ptr) - end - end - end -end - -function _send_yield_raw(value, comm, dest, tag) - rank = MPI.Comm_rank(comm) - buf = MPI.serialize(value) - n = length(buf) - lock(SEND_SERIALIZE_LOCK) do - ptr_len = Base.Libc.malloc(8) - ptr_len === C_NULL && throw(OutOfMemoryError()) - try - arr_len = Base.unsafe_wrap(Array, Ptr{Int64}(ptr_len), (1,); own=false) - arr_len[1] = n - req_len = MPI.Isend(arr_len, comm; dest, tag) - __wait_for_request(req_len, comm, rank, dest, tag, "send_yield_raw_len", "send") - finally - Base.Libc.free(ptr_len) - end - ptr = Base.Libc.malloc(n) - ptr === C_NULL && throw(OutOfMemoryError()) - try - arr = Base.unsafe_wrap(Array, Ptr{UInt8}(ptr), (n,); own=false) - copyto!(arr, buf) - req_data = MPI.Isend(arr, comm; dest, tag) - __wait_for_request(req_data, comm, rank, dest, tag, "send_yield_raw_data", "send") - finally - Base.Libc.free(ptr) - end + req = MPI.isend(value, comm; dest=their_rank, tag) + __wait_for_request(req, comm, my_rank, their_rank, tag, 
"send_yield", "send") end end @@ -1055,30 +579,13 @@ function __wait_for_request(req, comm, my_rank, their_rank, tag, fn::String, kin warn_period = round(UInt64, DEADLOCK_WARN_PERIOD[] * 1e9) timeout_period = round(UInt64, DEADLOCK_TIMEOUT_PERIOD[] * 1e9) while true - local finish, status - try - finish, status = MPI.Test(req, MPI.Status) - catch e - # #region agent log - try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H15_mpi_err\",\"location\":\"mpi.jl:__wait_for_request:Test\",\"message\":\"MPI.Test threw\",\"data\":{\"rank\":$my_rank,\"tag\":$tag,\"fn\":\"$fn\",\"kind\":\"$kind\",\"dest\":$their_rank,\"error\":\"$(sprint(showerror, e))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end - # #endregion - rethrow() - end + finish, status = MPI.Test(req, MPI.Status) if finish if MPI.Get_error(status) != MPI.SUCCESS error("$fn failed with error $(MPI.Get_error(status))") end return end - if REMOTECALL_SENDER_NO_YIELD[] - # Sender in remotecall_endpoint: spin until send completes so we don't yield to other tasks. 
- if detect && (time_ns() - time_start) > timeout_period - error("[rank $my_rank][tag $tag] Hit hang on $kind (dest: $their_rank) [remotecall sender spin]") - end - continue - end - service_recv_pool(comm) - service_aliasing_requests(comm) warn_period = mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, my_rank, tag, kind, their_rank) yield() end @@ -1116,13 +623,10 @@ end function mpi_deadlock_detect(detect, time_start, warn_period, timeout_period, rank, tag, kind, srcdest) time_elapsed = (time_ns() - time_start) if detect && time_elapsed > warn_period - @warn "[rank $rank][tag $tag] Hit probable hang on $kind (dest: $srcdest) [$(round(time_elapsed/1e9, digits=1))s]" + @warn "[rank $rank][tag $tag] Hit probable hang on $kind (dest: $srcdest)" return typemax(UInt64) end if detect && time_elapsed > timeout_period - # #region agent log - try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H11_timeout\",\"location\":\"mpi.jl:deadlock_detect:TIMEOUT\",\"message\":\"deadlock TIMEOUT - will throw\",\"data\":{\"rank\":$rank,\"tag\":$tag,\"kind\":\"$kind\",\"srcdest\":$srcdest,\"elapsed_s\":$(time_elapsed/1e9)},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end - # #endregion error("[rank $rank][tag $tag] Hit hang on $kind (dest: $srcdest)") end return warn_period @@ -1132,11 +636,6 @@ end WeakChunk(c::Chunk{T,H}) where {T,H<:MPIRef} = WeakChunk(c.handle.rank, c.handle.id.id, WeakRef(c)) function MemPool.poolget(ref::MPIRef; uniform::Bool=false) - if !uniform && ref.rank != MPI.Comm_rank(ref.comm) - # #region agent log - _r = MPI.Comm_rank(ref.comm); try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H12_poolget\",\"location\":\"mpi.jl:poolget\",\"message\":\"MPIRef rank mismatch about to 
assert\",\"data\":{\"local_rank\":$_r,\"ref_rank\":$(ref.rank),\"ref_id\":\"$(ref.id)\",\"uniform\":$uniform,\"backtrace\":\"$(replace(sprint(Base.show_backtrace, backtrace()), '\"'=>'\'', '\n'=>' '))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end - # #endregion - end @assert uniform || ref.rank == MPI.Comm_rank(ref.comm) "MPIRef rank mismatch: $(ref.rank) != $(MPI.Comm_rank(ref.comm))" if uniform tag = to_tag() @@ -1259,10 +758,7 @@ function remotecall_endpoint(f, accel::Dagger.MPIAcceleration, from_proc, to_pro return with(MPI_UID=>task.uid, MPI_UNIFORM=>true) do @assert data isa Chunk "Expected Chunk, got $(typeof(data))" space = memory_space(data) - tag = remotecall_tag(accel.comm, task.uid, from_proc.rank, to_proc.rank, data.handle.id) - # #region agent log - if loc_rank <= 1 && tag >= 598 && tag <= 612; try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H16_tag_op\",\"location\":\"mpi.jl:remotecall_endpoint\",\"message\":\"remotecall tag assigned\",\"data\":{\"rank\":$loc_rank,\"tag\":$tag,\"from_rank\":$(from_proc.rank),\"to_rank\":$(to_proc.rank),\"space_rank\":$(space.rank),\"task_uid\":$(task.uid),\"task_id\":$(task.id)},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end; end - # #endregion + tag = to_tag() if space.rank != from_proc.rank # If the data is already where it needs to be @assert space.rank == to_proc.rank @@ -1288,12 +784,7 @@ function remotecall_endpoint(f, accel::Dagger.MPIAcceleration, from_proc, to_pro if loc_rank == from_proc.rank value = poolget(data.handle) data_moved = move(from_proc.innerProc, to_proc.innerProc, value) - try - REMOTECALL_SENDER_NO_YIELD[] = true - Dagger.send_yield(data_moved, accel.comm, to_proc.rank, tag) - finally - REMOTECALL_SENDER_NO_YIELD[] = false - end + Dagger.send_yield(data_moved, accel.comm, to_proc.rank, tag) # FIXME: This is wrong to take typeof(data_moved), because the type may change return 
tochunk(nothing, to_proc, to_space; type=typeof(data_moved)) elseif loc_rank == to_proc.rank @@ -1340,39 +831,33 @@ function move(src::MPIProcessor, dst::MPIProcessor, x::Chunk) end end -_precise_typeof(x) = typeof(x) -_precise_typeof(::Type{T}) where {T} = Type{T} - -function execute!(proc::MPIProcessor, f, args...; kwargs...) +#FIXME:try to think of a better move! scheme +function execute!(proc::MPIProcessor, world::UInt64, f, args...; kwargs...) local_rank = MPI.Comm_rank(proc.comm) islocal = local_rank == proc.rank inplace_move = f === move! result = nothing - + tag_space = to_tag() if islocal || inplace_move - result = execute!(proc.innerProc, f, args...; kwargs...) + result = execute!(proc.innerProc, world, f, args...; kwargs...) end - if inplace_move space = memory_space(nothing, proc)::MPIMemorySpace return tochunk(nothing, proc, space) - end - - tag = to_tag() - # #region agent log - if local_rank <= 1 && tag >= 598 && tag <= 612; try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H16_tag_op\",\"location\":\"mpi.jl:execute!\",\"message\":\"execute! 
tag assigned\",\"data\":{\"rank\":$local_rank,\"tag\":$tag,\"proc_rank\":$(proc.rank),\"islocal\":$islocal,\"f\":\"$(nameof(f))\"},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end; end - # #endregion - if islocal - T = typeof(result) - space = memory_space(result, proc)::MPIMemorySpace - T_space = (T, space.innerSpace) - @opcounter :execute_bcast_send_yield - bcast_send_yield(T_space, proc.comm, proc.rank, tag) - return tochunk(result, proc, space) else - T, innerSpace = recv_yield(proc.comm, proc.rank, tag) - space = MPIMemorySpace(innerSpace, proc.comm, proc.rank) - return tochunk(nothing, proc, space; type=T) + # Handle commun1ication ourselves + if islocal + T = typeof(result) + space = memory_space(result, proc)::MPIMemorySpace + T_space = (T, space.innerSpace) + @opcounter :execute_bcast_send_yield + bcast_send_yield(T_space, proc.comm, proc.rank, tag) + return tochunk(result, proc, space) + else + T, innerSpace = recv_yield(proc.comm, proc.rank, tag) + space = MPIMemorySpace(innerSpace, proc.comm, proc.rank) + return tochunk(nothing, proc, space; type=T) + end end end @@ -1382,9 +867,6 @@ function initialize_acceleration!(a::MPIAcceleration) if !MPI.Initialized() MPI.Init(;threadlevel=:multiple) end - # #region agent log - _r = MPI.Comm_rank(a.comm); _tl = MPI.Query_thread(); if _r <= 1; try; open("/flare/dagger/fdadagger/.cursor/debug-852f70.log", "a") do io; println(io, "{\"sessionId\":\"852f70\",\"hypothesisId\":\"H2_init\",\"location\":\"mpi.jl:initialize_acceleration!\",\"message\":\"MPI init\",\"data\":{\"rank\":$_r,\"nthreads\":$(Threads.nthreads()),\"mpi_thread_level\":$_tl,\"tag_ub\":$(MPI.tag_ub())},\"timestamp\":$(round(Int,time()*1000))}"); end; catch; end; end - # #endregion ctx = Dagger.Sch.eager_context() sz = MPI.Comm_size(a.comm) for i in 0:(sz-1) diff --git a/src/mpi_mempool.jl b/src/mpi_mempool.jl new file mode 100644 index 000000000..149c7900a --- /dev/null +++ b/src/mpi_mempool.jl @@ -0,0 +1,36 @@ +# Mempool for received MPI 
message data only (no envelopes). +# Key: (comm, source, tag). Used when a message is received but not the one the caller was waiting for. +# Included from mpi.jl; runs in Dagger module scope. + +const MPI_RECV_MEMPOOL = Base.Lockable(Dict{Tuple{MPI.Comm, Int, Int}, Vector{Any}}()) + +function mpi_mempool_put!(comm::MPI.Comm, source::Integer, tag::Integer, data::Any) + key = (comm, Int(source), Int(tag)) + ref = poolset(data) + lock(MPI_RECV_MEMPOOL) do pool + if !haskey(pool, key) + pool[key] = Any[] + end + push!(pool[key], ref) + end + return nothing +end + +function mpi_mempool_take!(comm::MPI.Comm, source::Integer, tag::Integer) + key = (comm, Int(source), Int(tag)) + ref = lock(MPI_RECV_MEMPOOL) do pool + if !haskey(pool, key) || isempty(pool[key]) + return nothing + end + popfirst!(pool[key]) + end + ref === nothing && return nothing + return poolget(ref) +end + +function mpi_mempool_has(comm::MPI.Comm, source::Integer, tag::Integer) + key = (comm, Int(source), Int(tag)) + return lock(MPI_RECV_MEMPOOL) do pool + haskey(pool, key) && !isempty(pool[key]) + end +end diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index a6d252575..954867098 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -15,7 +15,7 @@ import Base: @invokelatest import ..Dagger import ..Dagger: Context, Processor, SchedulerOptions, Options, Thunk, WeakThunk, ThunkFuture, ThunkID, DTaskFailedException, Chunk, WeakChunk, OSProc, AnyScope, DefaultScope, InvalidScope, LockedObject, Argument, Signature -import ..Dagger: order, dependents, noffspring, istask, inputs, unwrap_weak_checked, wrap_weak, affinity, tochunk, timespan_start, timespan_finish, procs, move, chunktype, default_enabled, processor, get_processors, get_parent, root_worker_id, execute!, rmprocs!, task_processor, constrain, cputhreadtime, maybe_take_or_alloc! 
+import ..Dagger: order, dependents, noffspring, istask, inputs, unwrap_weak_checked, wrap_weak, affinity, tochunk, timespan_start, timespan_finish, procs, move, chunktype, default_enabled, processor, get_processors, get_parent, root_worker_id, execute!, rmprocs!, task_processor, constrain, cputhreadtime, maybe_take_or_alloc!, is_local_processor, fire_order_key import ..Dagger: @dagdebug, @safe_lock_spin1, @maybelog, @take_or_alloc! import DataStructures: PriorityQueue, enqueue!, dequeue_pair!, peek @@ -685,10 +685,12 @@ end costs_cleanup() @goto pop_task - # Fire all newly-scheduled tasks + # Fire all newly-scheduled tasks (owner/local first, then by fire_order_key to avoid MPI execute! deadlock) @label fire_tasks - for (task_loc, task_spec) in to_fire - fire_tasks!(ctx, task_loc, task_spec, state) + task_locs = collect(keys(to_fire)) + sort!(task_locs; by=loc -> (is_local_processor(loc.proc) ? 0 : 1, fire_order_key(loc.proc))) + for task_loc in task_locs + fire_tasks!(ctx, task_loc, to_fire[task_loc], state) end to_fire_cleanup() diff --git a/src/utils/interval_tree.jl b/src/utils/interval_tree.jl index 8046fbb3b..1c2b3a7f6 100644 --- a/src/utils/interval_tree.jl +++ b/src/utils/interval_tree.jl @@ -195,11 +195,44 @@ function Base.delete!(tree::IntervalTree{M,E}, span::M) where {M,E} parent_of_succ.right = replacement end - # Update max_end bottom-up for the successor's original path - update_max_end!(parent_of_succ) - for i in length(succ_path)-1:-1:1 - update_max_end!(succ_path[i]) + target.span = successor.span + replacement = target + end + + # Phase 3: Handle overlap case - add remaining portions + if target_type == :overlap + original_start = span_start(original_span) + original_end = span_end(original_span) + del_start = span_start(span) + del_end = span_end(span) + verify_span(span) + + # Left portion: exists if original starts before deleted span + if original_start < del_start + left_end = min(original_end, del_start - _span_one(del_start)) + if 
left_end >= original_start + left_span = M(original_start, left_end - original_start + _span_one(left_end)) + if !isempty(left_span) + replacement = insert_node!(replacement, left_span) + end + end end + + # Right portion: exists if original extends beyond deleted span + if original_end > del_end + right_start = max(original_start, del_end + _span_one(del_end)) + if original_end >= right_start + right_span = M(right_start, original_end - right_start + _span_one(original_end)) + if !isempty(right_span) + replacement = insert_node!(replacement, right_span) + end + end + end + end + + # Phase 4: Update parent's child pointer + if isempty(path) + root = replacement else # Zero or one child replacement = target.left !== nothing ? target.left : target.right @@ -261,12 +294,12 @@ function find_overlapping!(node::IntervalNode{M,E}, query::M, result::Vector{M}; # Enqueue left subtree if it might contain overlapping intervals if current.left !== nothing && current.left.max_end >= span_start(query) - push!(stack, current.left) + push!(queue, current.left) end # Enqueue right subtree if query extends beyond current node's start if current.right !== nothing && span_end(query) >= span_start(current.span) - push!(stack, current.right) + push!(queue, current.right) end end end From f8f5756502ef0c3a0b398ecc86eff78bf103ed48 Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Thu, 5 Mar 2026 04:57:55 +0000 Subject: [PATCH 20/24] WIP: MPI works --- Project.toml | 4 +- src/Dagger.jl | 17 ++-- src/array/alloc.jl | 17 +++- src/array/copy.jl | 9 +- src/array/darray.jl | 131 ++++++++++++------------- src/array/linalg.jl | 100 +++++++++++-------- src/array/trsm.jl | 34 ++++--- src/datadeps/aliasing.jl | 43 ++++++++- src/datadeps/queue.jl | 44 +++++++-- src/dtask.jl | 32 ++----- src/lib/domain-blocks.jl | 2 + src/memory-spaces.jl | 74 +++----------- src/mpi.jl | 9 +- src/options.jl | 10 +- src/processor.jl | 19 ++++ src/queue.jl | 2 +- src/sch/Sch.jl | 10 +- src/sch/eager.jl | 3 +- 
src/sch/util.jl | 11 +-- src/scopes.jl | 11 ++- src/thunk.jl | 15 +-- src/tochunk.jl | 28 +++--- src/types/processor.jl | 11 +-- src/utils/chunks.jl | 191 +++++++++++++++++++++++++++++++++++++ src/utils/dagdebug.jl | 31 +++--- src/utils/interval_tree.jl | 53 +++------- src/utils/scopes.jl | 25 ++--- 27 files changed, 551 insertions(+), 385 deletions(-) create mode 100644 src/utils/chunks.jl diff --git a/Project.toml b/Project.toml index b6d03531d..69163e027 100644 --- a/Project.toml +++ b/Project.toml @@ -12,8 +12,8 @@ GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" NextLA = "d37ed344-79c4-486d-9307-6d11355a15a3" OnlineStats = "a15396b6-48d5-5d58-9928-6d29437db91e" @@ -77,8 +77,8 @@ GraphViz = "0.2" Graphs = "1" JSON3 = "1" KernelAbstractions = "0.9" -MPI = "0.20.22" MacroTools = "0.5" +MPI = "0.20.22" MemPool = "0.4.12" Metal = "1.1" NextLA = "0.2.2" diff --git a/src/Dagger.jl b/src/Dagger.jl index 5e07f9ca8..a74cec3e6 100644 --- a/src/Dagger.jl +++ b/src/Dagger.jl @@ -10,7 +10,7 @@ import MemPool: DRef, FileRef, poolget, poolset import Base: collect, reduce, view import NextLA import LinearAlgebra -import LinearAlgebra: Adjoint, BLAS, Diagonal, Bidiagonal, Tridiagonal, LAPACK, LU, LowerTriangular, PosDefException, Transpose, UpperTriangular, UnitLowerTriangular, UnitUpperTriangular, diagind, ishermitian, issymmetric, I, norm, dot +import LinearAlgebra: Adjoint, BLAS, Diagonal, Bidiagonal, Tridiagonal, LAPACK, LU, LowerTriangular, PosDefException, Transpose, UpperTriangular, UnitLowerTriangular, UnitUpperTriangular, Cholesky, diagind, ishermitian, issymmetric, I import Random import Random: AbstractRNG @@ 
-53,7 +53,7 @@ import Adapt include("lib/util.jl") include("utils/dagdebug.jl") -# Type definitions +# Type definitions (for MPI/acceleration) include("types/processor.jl") include("types/scope.jl") include("types/memory-space.jl") @@ -71,6 +71,7 @@ include("context.jl") include("utils/processors.jl") include("scopes.jl") include("utils/scopes.jl") +include("chunks.jl") include("utils/signature.jl") include("thunkid.jl") include("utils/lfucache.jl") @@ -82,11 +83,7 @@ include("argument.jl") include("queue.jl") include("thunk.jl") include("utils/fetch.jl") -include("chunks.jl") -include("affinity.jl") -include("tochunk.jl") -include("mutable.jl") -include("shard.jl") +include("utils/chunks.jl") include("weakchunk.jl") include("utils/logging.jl") include("submission.jl") @@ -101,6 +98,7 @@ include("utils/clock.jl") include("utils/system_uuid.jl") include("utils/caching.jl") include("sch/Sch.jl"); using .Sch +include("tochunk.jl") # Data dependency task queue include("datadeps/aliasing.jl") @@ -138,7 +136,7 @@ include("array/mul.jl") include("array/cholesky.jl") include("array/trsm.jl") include("array/lu.jl") -include("array/gmres.jl") +include("array/qr.jl") # GPU include("gpu.jl") @@ -167,8 +165,9 @@ function set_distributed_package!(value) @info "Dagger.jl preference has been set, restart your Julia session for this change to take effect!" 
end -# MPI +# MPI (mpi.jl loads MPI; mpi_mempool uses it) include("mpi.jl") +include("mpi_mempool.jl") # Precompilation import PrecompileTools: @compile_workload diff --git a/src/array/alloc.jl b/src/array/alloc.jl index e67ca593c..fe92ae1e1 100644 --- a/src/array/alloc.jl +++ b/src/array/alloc.jl @@ -184,13 +184,24 @@ function Base.zero(x::DArray{T,N}) where {T,N} return _to_darray(a) end -@warn "Consider a better way to provide a unique ID for each chunk" maxlog=1 -function Base.view(A::AbstractArray{T,N}, p::Blocks{N}; space=default_memory_space(current_acceleration(), A)) where {T,N} +# Weird LinearAlgebra dispatch in `\` needs this +function LinearAlgebra._zeros(::Type{T}, B::DVector, n::Integer) where T + m = max(size(B, 1), n) + sz = (m,) + return zeros(auto_blocks(sz), T, sz) +end +function LinearAlgebra._zeros(::Type{T}, B::DMatrix, n::Integer) where T + m = max(size(B, 1), n) + sz = (m, size(B, 2)) + return zeros(auto_blocks(sz), T, sz) +end + +function Base.view(A::AbstractArray{T,N}, p::Blocks{N}) where {T,N} d = ArrayDomain(Base.index_shape(A)) dc = partition(p, d) # N.B. We use `tochunk` because we only want to take the view locally, and # taking views should be very fast - chunks = [@with(MPI_UID => eager_next_id(), tochunk(view(A, x.indexes...), space)) for x in dc] + chunks = [tochunk(view(A, x.indexes...)) for x in dc] return DArray(T, d, dc, chunks, p) end Base.view(A::AbstractArray, ::AutoBlocks) = diff --git a/src/array/copy.jl b/src/array/copy.jl index d032525f9..7ed815daf 100644 --- a/src/array/copy.jl +++ b/src/array/copy.jl @@ -119,14 +119,7 @@ function darray_copyto!(B::DArray{TB,NB}, A::DArray{TA,NA}, Binds=parentindices( Arange_local = Arange_global_clamped .- CartesianIndex(Arange_start) .+ CartesianIndex{Nmax}(1) # Perform local view copy - space = (Bpart isa DTask ? 
fetch(Bpart; move_value=false, unwrap=false) : Bpart).space - procs = processors(space) - scope = UnionScope([ExactScope(proc) for proc in procs]) - check_uniform(space) - for proc in procs - check_uniform(proc) - end - Dagger.@spawn scope = scope copyto_view!(Out(Bpart), Brange_local, In(Apart), Arange_local) + Dagger.@spawn copyto_view!(Out(Bpart), Brange_local, In(Apart), Arange_local) end end end diff --git a/src/array/darray.jl b/src/array/darray.jl index e04bcf065..5e95adf94 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -1,4 +1,4 @@ -import Base: ==, fetch +import Base: ==, fetch, length, isempty, size export DArray, DVector, DMatrix, DVecOrMat, Blocks, AutoBlocks export distribute @@ -83,7 +83,8 @@ isempty(a::ArrayDomain) = length(a) == 0 The domain of an array is an ArrayDomain. """ domain(x::AbstractArray) = ArrayDomain([1:l for l in size(x)]) - +# Scalar / non-array values (e.g. for Chunk of immediate data) +domain(x::Any) = ArrayDomain(()) abstract type ArrayOp{T, N} <: AbstractArray{T, N} end Base.IndexStyle(::Type{<:ArrayOp}) = IndexCartesian() @@ -176,46 +177,28 @@ domainchunks(d::DArray) = d.subdomains size(x::DArray) = size(domain(x)) stage(ctx, c::DArray) = c -@warn "Dispatch uniform on acceleration" maxlog=1 -@warn "Take D.concat into account" maxlog=1 -function Base.collect(D::DArray{T,N}; tree=false, copyto=false, uniform::Bool=true) where {T,N} - if isempty(D.chunks) - return Array{eltype(D)}(undef, size(D)...) +function Base.collect(d::DArray{T,N}; tree=false, copyto=false) where {T,N} + a = fetch(d) + if isempty(d.chunks) + return Array{eltype(d)}(undef, size(d)...) end - # Return a scalar, as required by Julia's array interface - if ndims(D) == 0 - return fetch(D.chunks[1]; unwrap=true) + if ndims(d) == 0 + return fetch(a.chunks[1]) end - if uniform - @assert D.concat === cat "FIXME: Handle non-cat" - A = Array{eltype(D)}(undef, size(D)...) 
- DA = view(A, D.partitioning; space=CPURAMMemorySpace()) - - # Perform the equivalent of `copyto!(DA, D)`, but force local updates - # FIXME: Be more parallel? - for idx in eachindex(DA.chunks) - dest = fetch(DA.chunks[idx]; move_value=false, unwrap=true, uniform=true)::AbstractArray - src = fetch(D.chunks[idx]; move_value=true, unwrap=true, uniform=true)::AbstractArray - copyto!(dest, src) - end + if copyto + C = Array{T,N}(undef, size(a)) + DC = view(C, Blocks(size(a)...)) + copyto!(DC, a) + return C + end - return A + dimcatfuncs = [(x...) -> d.concat(x..., dims=i) for i in 1:ndims(d)] + if tree + collect(fetch(treereduce_nd(map(x -> ((args...,) -> Dagger.@spawn x(args...)) , dimcatfuncs), a.chunks))) else - if copyto - C = Array{T,N}(undef, size(D)) - DC = view(C, Blocks(size(D)...)) - copyto!(DC, D) - return C - end - - dimcatfuncs = [(x...) -> D.concat(x..., dims=i) for i in 1:ndims(D)] - if tree - collect(fetch(treereduce_nd(map(x -> ((args...,) -> Dagger.@spawn x(args...)) , dimcatfuncs), D.chunks))) - else - treereduce_nd(dimcatfuncs, asyncmap(fetch, D.chunks)) - end + collect(treereduce_nd(dimcatfuncs, asyncmap(fetch, a.chunks))) end end Array{T,N}(A::DArray{S,N}) where {T,N,S} = convert(Array{T,N}, collect(A)) @@ -339,8 +322,8 @@ function Base.isequal(x::ArrayOp, y::ArrayOp) x === y end -Base.similar(::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S,N} = - DArray{S,N}(undef, dims) +Base.similar(D::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S,N} = + DArray{S,N}(undef, D.partitioning, dims) Base.copy(x::DArray{T,N,B,F}) where {T,N,B,F} = map(identity, x)::DArray{T,N,B,F} @@ -406,18 +389,23 @@ function lookup_parts(A::DArray, ps::AbstractArray, subdmns::DomainBlocks{N}, d: end """ - Base.fetch(A::DArray; unwrap::Bool=false, kwargs...) -> DArray + Base.fetch(c::DArray) -Returns a new `DArray` with the same data as `A`, but where all values are -fully computed. +If a `DArray` tree has a `Thunk` in it, make the whole thing a big thunk. 
""" -function Base.fetch(A::DArray{T}; unwrap::Bool=false, kwargs...) where T - if any(unwrappable, chunks(A)) - tasks = map(t->unwrappable(t) ? fetch(t; unwrap, kwargs...) : t, chunks(A)) - B = DArray(T, A.domain, A.subdomains, tasks, A.partitioning, A.concat) - return B +function Base.fetch(c::DArray{T}) where T + if any(istask, chunks(c)) + thunks = chunks(c) + sz = size(thunks) + dmn = domain(c) + dmnchunks = domainchunks(c) + return fetch(Dagger.spawn(Options(meta=true), thunks...) do results... + t = eltype(fetch(results[1])) + DArray(t, dmn, dmnchunks, reshape(Any[results...], sz), + c.partitioning, c.concat) + end) else - return A + return c end end @@ -518,6 +506,7 @@ auto_blocks(A::AbstractArray{T,N}) where {T,N} = auto_blocks(size(A)) const AssignmentType{N} = Union{Symbol, AbstractArray{<:Int, N}, AbstractArray{<:Processor, N}} +distribute(A::AbstractArray, assignment::AssignmentType = :arbitrary) = distribute(A, AutoBlocks(), assignment) function distribute(A::AbstractArray{T,N}, dist::Blocks{N}, assignment::AssignmentType{N} = :arbitrary) where {T,N} procgrid = nothing availprocs = collect(Dagger.compatible_processors()) @@ -558,10 +547,8 @@ function distribute(A::AbstractArray{T,N}, dist::Blocks{N}, assignment::Assignme procgrid = assignment end - return _distribute(current_acceleration(), A, dist, procgrid) + return _to_darray(Distribute(dist, A, procgrid)) end -_distribute(::DistributedAcceleration, A::AbstractArray{T,N}, dist::Blocks{N}, procgrid) where {T,N} = - _to_darray(Distribute(dist, A, procgrid)) distribute(A::AbstractArray, ::AutoBlocks, assignment::AssignmentType = :arbitrary) = distribute(A, auto_blocks(A), assignment) function distribute(x::AbstractArray{T,N}, n::NTuple{N}, assignment::AssignmentType{N} = :arbitrary) where {T,N} @@ -570,6 +557,7 @@ function distribute(x::AbstractArray{T,N}, n::NTuple{N}, assignment::AssignmentT end distribute(x::AbstractVector, n::Int, assignment::AssignmentType{1} = :arbitrary) = distribute(x, (n,), 
assignment) + DVector(A::AbstractVector{T}, part::Blocks{1}, assignment::AssignmentType{1} = :arbitrary) where T = distribute(A, part, assignment) DMatrix(A::AbstractMatrix{T}, part::Blocks{2}, assignment::AssignmentType{2} = :arbitrary) where T = distribute(A, part, assignment) DArray(A::AbstractArray{T,N}, part::Blocks{N}, assignment::AssignmentType{N} = :arbitrary) where {T,N} = distribute(A, part, assignment) @@ -582,26 +570,29 @@ DVector(A::AbstractVector{T}, ::AutoBlocks, assignment::AssignmentType{1} = :arb DMatrix(A::AbstractMatrix{T}, ::AutoBlocks, assignment::AssignmentType{2} = :arbitrary) where T = DMatrix(A, auto_blocks(A), assignment) DArray(A::AbstractArray, ::AutoBlocks, assignment::AssignmentType = :arbitrary) = DArray(A, auto_blocks(A), assignment) -@warn "Add assignment to undef initializer" maxlog=1 -function DArray{T,N}(::UndefInitializer, dims::NTuple{N,Int}) where {T,N} - dist = auto_blocks(dims) - return DArray{T,N}(undef, dist, dims...) -end -function DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}) where {T,N} - domain = ArrayDomain(ntuple(i->1:dims[i], N)) +struct AllocateUndef{S} end +(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = Array{S,N}(undef, dims) +function DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} + domain = ArrayDomain(map(x->1:x, dims)) subdomains = partition(dist, domain) - tasks = Array{DTask,N}(undef, size(subdomains)...) 
- Dagger.spawn_datadeps() do - for (i, x) in enumerate(subdomains) - tasks[i] = Dagger.@spawn allocate_array_undef(T, size(x)) - end - end - return DArray(T, domain, subdomains, tasks, dist) -end -DArray{T,N}(::UndefInitializer, dims::Vararg{Int,N}) where {T,N} = - DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,)) -DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}) where {T,N} = - DArray{T,N}(undef, dist, (dims...,)) + a = AllocateArray(T, AllocateUndef{T}(), false, domain, subdomains, dist, assignment) + return _to_darray(a) +end +DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = + DArray{T,N}(undef, dist, (dims...,); assignment) +DArray{T,N}(::UndefInitializer, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = + DArray{T,N}(undef, auto_blocks(dims), dims; assignment) +DArray{T,N}(::UndefInitializer, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = + DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,); assignment) + +DArray{T}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = + DArray{T,N}(undef, dist, dims; assignment) +DArray{T}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = + DArray{T,N}(undef, dist, (dims...,); assignment) +DArray{T}(::UndefInitializer, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = + DArray{T,N}(undef, auto_blocks(dims), dims; assignment) +DArray{T}(::UndefInitializer, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} = + DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,); assignment) function Base.:(==)(x::ArrayOp{T,N}, y::AbstractArray{S,N}) where {T,S,N} collect(x) == y @@ -622,7 +613,7 @@ end mapchunk(f, chunk) = tochunk(f(poolget(chunk.handle))) function mapchunks(f, 
d::DArray{T,N,F}) where {T,N,F} chunks = map(d.chunks) do chunk - owner = get_parent(chunk.processor).pid + owner = root_worker_id(chunk.processor) remotecall_fetch(mapchunk, owner, f, chunk) end DArray{T,N,F}(d.domain, d.subdomains, chunks, d.concat) diff --git a/src/array/linalg.jl b/src/array/linalg.jl index 553303032..3bf8e20e0 100644 --- a/src/array/linalg.jl +++ b/src/array/linalg.jl @@ -92,9 +92,23 @@ function LinearAlgebra.ishermitian(A::DArray{T,2}) where T return all(fetch, to_check) end +function LinearAlgebra.LAPACK.chkfinite(A::DArray) + Ac = A.chunks + chunk_finite = [Ref(true) for _ in Ac] + chkfinite!(finite, A) = finite[] = LinearAlgebra.LAPACK.chkfinite(A) + Dagger.spawn_datadeps() do + for idx in eachindex(Ac) + Dagger.@spawn chkfinite!(Out(chunk_finite[idx]), In(Ac[idx])) + end + end + return all(getindex, chunk_finite) +end + DMatrix{T}(::LinearAlgebra.UniformScaling, m::Int, n::Int, IBlocks::Blocks) where T = DMatrix(Matrix{T}(I, m, n), IBlocks) +DMatrix(::LinearAlgebra.UniformScaling{T}, m::Int, n::Int, IBlocks::Blocks) where T = DMatrix(Matrix{T}(I, m, n), IBlocks) DMatrix{T}(::LinearAlgebra.UniformScaling, size::Tuple, IBlocks::Blocks) where T = DMatrix(Matrix{T}(I, size), IBlocks) +DMatrix(::LinearAlgebra.UniformScaling{T}, size::Tuple, IBlocks::Blocks) where T = DMatrix(Matrix{T}(I, size), IBlocks) function LinearAlgebra.inv(F::LU{T,<:DMatrix}) where T n = size(F, 1) @@ -147,40 +161,12 @@ end function LinearAlgebra.ldiv!(A::LU{<:Any,<:DMatrix}, B::AbstractVecOrMat) - # FIXME: Don't apply pivots for NoPivot - LinearAlgebra._apply_ipiv_rows!(A, B) #apply_ipiv_rows!(A.ipiv, B) + allowscalar(true) do + LinearAlgebra._apply_ipiv_rows!(A, B) + end LinearAlgebra.ldiv!(UnitLowerTriangular(A.factors), B) LinearAlgebra.ldiv!(UpperTriangular(A.factors), B) end -#= Adapted from LinearAlgebra.jl -function apply_ipiv_rows!(ipiv::DVector{Int}, B::AbstractVecOrMat) - ipivc = ipiv.chunks - offset = 0 - incr = ipiv.partitioning.blocksize[1] - 
Dagger.spawn_datadeps() do - for ic in ipivc - Dagger.@spawn swap_ipiv_rows!(InOut(B), In(ic), offset) - offset += incr - end - end -end -function swap_ipiv_rows!(B::AbstractVecOrMat, ic::AbstractVector, offset::Int) - for raw_i in 1:length(ic) - i = raw_i + offset - if i != ic[i] - _swap_rows!(B, i, ic[i]) - end - end -end -function swap_ipiv_rows!(B::AbstractVector, i::Integer, j::Integer) - B[i], B[j] = B[j], B[i] -end -function swap_ipiv_rows!(B::AbstractMatrix, i::Integer, j::Integer) - for col in 1:size(B, 2) - B[i,col], B[j,col] = B[j,col], B[i,col] - end -end=# - function LinearAlgebra.ldiv!(A::Union{LowerTriangular{<:Any,<:DMatrix},UnitLowerTriangular{<:Any,<:DMatrix},UpperTriangular{<:Any,<:DMatrix},UnitUpperTriangular{<:Any,<:DMatrix}}, B::AbstractVecOrMat) alpha = one(eltype(A)) @@ -193,26 +179,64 @@ function LinearAlgebra.ldiv!(A::Union{LowerTriangular{<:Any,<:DMatrix},UnitLower uplo = 'L' end - dB = B isa DVecOrMat ? B : view(B, A.data.partitioning) + dB = B isa DVecOrMat ? B : (B isa AbstractMatrix ? view(B, A.data.partitioning) : view(B, AutoBlocks())) + parent_A = parent(A) if isa(B, AbstractVector) - Dagger.trsv!(uplo, trans, diag, alpha, A.data, dB) + min_bsa = min(min(parent_A.partitioning.blocksize...), dB.partitioning.blocksize[1]) + Dagger.maybe_copy_buffered(parent_A => Blocks(min_bsa, min_bsa), dB=>Blocks(min_bsa)) do parent_A, dB + Dagger.trsv!(uplo, trans, diag, alpha, parent_A, dB) + end elseif isa(B, AbstractMatrix) - min_bsa = min(A.data.partitioning.blocksize...) - Dagger.maybe_copy_buffered(A.data => Blocks(min_bsa, min_bsa), dB=>Blocks(min_bsa, min_bsa)) do A, dB - Dagger.trsm!('L', uplo, trans, diag, alpha, A, dB) + min_bsa = min(parent_A.partitioning.blocksize...) 
+ Dagger.maybe_copy_buffered(parent_A => Blocks(min_bsa, min_bsa), dB=>Blocks(min_bsa, min_bsa)) do parent_A, dB + Dagger.trsm!('L', uplo, trans, diag, alpha, parent_A, dB) end end end -function LinearAlgebra.ldiv!(Y::DArray, A::DMatrix, B::DArray) +function LinearAlgebra.ldiv!(Y::DArray, A::DMatrix, B::DArray) LinearAlgebra.ldiv!(A, copyto!(Y, B)) end -function LinearAlgebra.ldiv!(A::DMatrix, B::DArray) +function LinearAlgebra.ldiv!(A::DMatrix, B::DArray) LinearAlgebra.ldiv!(LinearAlgebra.lu(A), B) end function LinearAlgebra.ldiv!(C::DVecOrMat, A::Union{LowerTriangular{<:Any,<:DMatrix},UnitLowerTriangular{<:Any,<:DMatrix},UpperTriangular{<:Any,<:DMatrix},UnitUpperTriangular{<:Any,<:DMatrix}}, B::DVecOrMat) LinearAlgebra.ldiv!(A, copyto!(C, B)) end + +function LinearAlgebra.ldiv!(C::Cholesky{T,<:DMatrix}, B::DVecOrMat) where T + # Solve directly with C.factors and the trans parameter to avoid + # C.L / C.U which use copy(adjoint(factors)) — that creates a DMatrix + # with inconsistent block metadata vs chunk layout, breaking darray_copyto!. + factors = C.factors + alpha = one(T) + iscomplex = T <: Complex + trans = iscomplex ? 'C' : 'T' # conjugate transpose for complex, plain transpose for real + + parent_A = factors + dB = B isa DVecOrMat ? B : (B isa AbstractMatrix ? view(B, factors.partitioning) : view(B, AutoBlocks())) + min_bsa = min(parent_A.partitioning.blocksize...) 
+ + if C.uplo == 'U' + # A = U'U → solve U'y = B, then Ux = y + maybe_copy_buffered(parent_A => Blocks(min_bsa, min_bsa), dB => Blocks(min_bsa, min_bsa)) do pA, pB + Dagger.trsm!('L', 'U', trans, 'N', alpha, pA, pB) + end + maybe_copy_buffered(parent_A => Blocks(min_bsa, min_bsa), dB => Blocks(min_bsa, min_bsa)) do pA, pB + Dagger.trsm!('L', 'U', 'N', 'N', alpha, pA, pB) + end + else + # A = LL' → solve Ly = B, then L'x = y + maybe_copy_buffered(parent_A => Blocks(min_bsa, min_bsa), dB => Blocks(min_bsa, min_bsa)) do pA, pB + Dagger.trsm!('L', 'L', 'N', 'N', alpha, pA, pB) + end + maybe_copy_buffered(parent_A => Blocks(min_bsa, min_bsa), dB => Blocks(min_bsa, min_bsa)) do pA, pB + Dagger.trsm!('L', 'L', trans, 'N', alpha, pA, pB) + end + end + + return B +end \ No newline at end of file diff --git a/src/array/trsm.jl b/src/array/trsm.jl index 6044d9045..4535a3cb6 100644 --- a/src/array/trsm.jl +++ b/src/array/trsm.jl @@ -1,5 +1,4 @@ -function trsv!(uplo::Char, trans::Char, diag::Char, alpha::T, A::DArray{T,2}, B::AbstractArray{T,1}) where T - +function trsv!(uplo::Char, trans::Char, diag::Char, alpha::T, A::DMatrix{T}, B::DVector{T}) where T zone = one(T) mzone = -one(T) @@ -24,12 +23,12 @@ function trsv!(uplo::Char, trans::Char, diag::Char, alpha::T, A::DArray{T,2}, B: Dagger.@spawn BLAS.gemv!('N', mzone, In(Ac[i, k]), In(Bc[k]), lalpha, InOut(Bc[i])) end end - elseif trans == 'T' + elseif trans == 'T' || trans == 'C' for k in 1:Bnt lalpha = (k == 1) ? 
alpha : zone - Dagger.@spawn BLAS.trsv!('U', 'T', diag, In(Ac[k, k]), InOut(Bc[k])) + Dagger.@spawn BLAS.trsv!('U', trans, diag, In(Ac[k, k]), InOut(Bc[k])) for i in k+1:Bnt - Dagger.@spawn BLAS.gemv!('T', mzone, In(Ac[k, i]), In(Bc[i]), lalpha, InOut(Bc[k])) + Dagger.@spawn BLAS.gemv!(trans, mzone, In(Ac[k, i]), In(Bc[i]), lalpha, InOut(Bc[k])) end end end @@ -42,12 +41,12 @@ function trsv!(uplo::Char, trans::Char, diag::Char, alpha::T, A::DArray{T,2}, B: Dagger.@spawn BLAS.gemv!('N', mzone, In(Ac[i, k]), In(Bc[k]), lalpha, InOut(Bc[i])) end end - elseif trans == 'T' + elseif trans == 'T' || trans == 'C' for k in reverse(1:Bnt) lalpha = (k == Bnt) ? alpha : zone - Dagger.@spawn BLAS.trsv!('L', 'T', diag, In(Ac[k, k]), InOut(Bc[k])) + Dagger.@spawn BLAS.trsv!('L', trans, diag, In(Ac[k, k]), InOut(Bc[k])) for i in 1:k-1 - Dagger.@spawn BLAS.gemv!('T', mzone, In(Ac[k, i]), In(Bc[i]), lalpha, InOut(Bc[k])) + Dagger.@spawn BLAS.gemv!(trans, mzone, In(Ac[k, i]), In(Bc[i]), lalpha, InOut(Bc[k])) end end end @@ -57,8 +56,7 @@ function trsv!(uplo::Char, trans::Char, diag::Char, alpha::T, A::DArray{T,2}, B: end -function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DArray{T,2}, B::DArray{T,2}) where T - +function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DMatrix{T}, B::DVecOrMat{T}) where T zone = one(T) mzone = -one(T) @@ -102,7 +100,7 @@ function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DAr end end end - elseif trans == 'T' + elseif trans == 'T' || trans == 'C' for k in range(1, Bmt) lalpha = k == 1 ? 
alpha : zone; for n in range(1, Bnt) @@ -110,7 +108,7 @@ function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DAr end for m in range(k+1, Bmt) for n in range(1, Bnt) - Dagger.@spawn BLAS.gemm!('T', 'N', mzone, In(Ac[k, m]), In(Bc[k, n]), lalpha, InOut(Bc[m, n])) + Dagger.@spawn BLAS.gemm!(trans, 'N', mzone, In(Ac[k, m]), In(Bc[k, n]), lalpha, InOut(Bc[m, n])) end end end @@ -128,7 +126,7 @@ function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DAr end end end - elseif trans == 'T' + elseif trans == 'T' || trans == 'C' for k in range(1, Bmt) lalpha = k == 1 ? alpha : zone; for n in range(1, Bnt) @@ -136,7 +134,7 @@ function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DAr end for m in range(k+1, Bmt) for n in range(1, Bnt) - Dagger.@spawn BLAS.gemm!('T', 'N', mzone, In(Ac[(Bmt-k)+1, (Bmt-m)+1]), In(Bc[(Bmt-k)+1, n]), lalpha, InOut(Bc[(Bmt-m)+1, n])) + Dagger.@spawn BLAS.gemm!(trans, 'N', mzone, In(Ac[(Bmt-k)+1, (Bmt-m)+1]), In(Bc[(Bmt-k)+1, n]), lalpha, InOut(Bc[(Bmt-m)+1, n])) end end end @@ -156,12 +154,12 @@ function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DAr end end end - elseif trans == 'T' + elseif trans == 'T' || trans == 'C' for k in range(1, Bnt) for m in range(1, Bmt) Dagger.@spawn BLAS.trsm!(side, uplo, trans, diag, alpha, In(Ac[(Bnt-k)+1, (Bnt-k)+1]), InOut(Bc[m, (Bnt-k)+1])) for n in range(k+1, Bnt) - Dagger.@spawn BLAS.gemm!('N', 'T', minvalpha, In(B[m, (Bnt-k)+1]), In(Ac[(Bnt-n)+1, (Bnt-k)+1]), zone, InOut(Bc[m, (Bnt-n)+1])) + Dagger.@spawn BLAS.gemm!('N', trans, mzone, In(Bc[m, (Bnt-k)+1]), In(Ac[(Bnt-n)+1, (Bnt-k)+1]), zone, InOut(Bc[m, (Bnt-n)+1])) end end end @@ -177,12 +175,12 @@ function trsm!(side::Char, uplo::Char, trans::Char, diag::Char, alpha::T, A::DAr end end end - elseif trans == 'T' + elseif trans == 'T' || trans == 'C' for k in range(1, Bnt) for m in range(1, Bmt) Dagger.@spawn BLAS.trsm!(side, uplo, trans, diag, alpha, In(Ac[k, k]), 
InOut(Bc[m, k])) for n in range(k+1, Bnt) - Dagger.@spawn BLAS.gemm!('N', 'T', minvalpha, In(Bc[m, k]), In(Ac[n, k]), zone, InOut(Bc[m, n])) + Dagger.@spawn BLAS.gemm!('N', trans, mzone, In(Bc[m, k]), In(Ac[n, k]), zone, InOut(Bc[m, n])) end end end diff --git a/src/datadeps/aliasing.jl b/src/datadeps/aliasing.jl index aec83d039..eab0abff2 100644 --- a/src/datadeps/aliasing.jl +++ b/src/datadeps/aliasing.jl @@ -375,16 +375,23 @@ function is_writedep(arg, deps, task::DTask) end # Aliasing state setup -function populate_task_info!(state::DataDepsState, spec::DTaskSpec, task::DTask) - # Track the task's arguments and access patterns +# Internal: iterate over task args and call callback(arg, pos, may_alias, inplace_move, deps) for each tracked arg. +function _populate_task_info!(state::DataDepsState, spec::DTaskSpec, task::DTask, callback) for (idx, _arg) in enumerate(spec.fargs) + arg_pos = _arg.pos # ArgPosition for this argument (Argument/TypedArgument have .pos) arg = value(_arg) # Unwrap In/InOut/Out wrappers and record dependencies arg, deps = unwrap_inout(arg) - # Unwrap the Chunk underlying any DTask arguments - arg = arg isa DTask ? fetch(arg; move_value=false, unwrap=false) : arg + # Unwrap the Chunk underlying any DTask arguments only when already ready. + # Fetching an unready DTask here would deadlock: distribute_tasks! runs before + # the scheduler, so dependent tasks have not run yet. Skip aliasing for unready + # DTasks so we pass them through; the worker will fetch at execution time (may block on MPI). 
+ if arg isa DTask + isready(arg) || continue + arg = fetch(arg; move_value=false, unwrap=false) + end # Skip non-aliasing arguments type_may_alias(typeof(arg)) || continue @@ -413,6 +420,10 @@ function populate_task_info!(state::DataDepsState, spec::DTaskSpec, task::DTask) state.arg_origin[arg] = origin_space state.remote_arg_to_original[arg] = arg + may_alias = true + inplace_move = true + callback(arg, arg_pos, may_alias, inplace_move, deps) + # Populate argument info for all aliasing dependencies for (dep_mod, _, _) in deps # Generate an ArgumentWrapper for the argument @@ -423,6 +434,11 @@ function populate_task_info!(state::DataDepsState, spec::DTaskSpec, task::DTask) end end end + +function populate_task_info!(state::DataDepsState, spec::DTaskSpec, task::DTask) + # Track the task's arguments and access patterns (callback only for state updates) + _populate_task_info!(state, spec, task, (arg, pos, may_alias, inplace_move, deps) -> nothing) +end function populate_argument_info!(state::DataDepsState, arg_w::ArgumentWrapper, origin_space::MemorySpace) # Initialize ownership and history if !haskey(state.arg_owner, arg_w) @@ -669,6 +685,25 @@ function remotecall_endpoint(f, ::Dagger.DistributedAcceleration, from_proc, to_ end end const ALIASED_OBJECT_CACHE = TaskLocalValue{Union{Dict{AbstractAliasing,Chunk}, Nothing}}(()->nothing) + +# Explicit cache for move_rewrap (used by haloarray, tests) +struct AliasedObjectCacheStore end +struct AliasedObjectCache + dest_space::MemorySpace + backing::Chunk + cache::Dict{AbstractAliasing,Chunk} + AliasedObjectCache(dest_space::MemorySpace, backing::Chunk) = new(dest_space, backing, Dict{AbstractAliasing,Chunk}()) +end +function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::Processor, from_space::MemorySpace, to_space::MemorySpace, data) + old = ALIASED_OBJECT_CACHE[] + ALIASED_OBJECT_CACHE[] = cache.cache + try + return move_rewrap(from_proc, to_proc, from_space, to_space, data) + finally + 
ALIASED_OBJECT_CACHE[] = old + end +end + @warn "Document these public methods" maxlog=1 # TODO: Use state to cache aliasing() results function declare_aliased_object!(x; ainfo=aliasing(current_acceleration(), x, identity)) diff --git a/src/datadeps/queue.jl b/src/datadeps/queue.jl index ebf9f8fa6..8d9674143 100644 --- a/src/datadeps/queue.jl +++ b/src/datadeps/queue.jl @@ -2,9 +2,10 @@ const TAG_WAITING = Base.Lockable(Ref{UInt32}(1)) function to_tag() intask = Dagger.in_task() - opts = Dagger.get_options() if intask - return Dagger.get_tls().task_spec.options.tag::UInt32 + opts = Dagger.get_tls().task_spec.options + tag = opts.tag + return tag end lock(TAG_WAITING) do counter_ref @assert Sch.SCHED_MOVE[] == false "We should not create a tag on the scheduler unwrap move" @@ -259,6 +260,17 @@ struct TypedDataDepsTaskArgument{T,N} end map_or_ntuple(f, xs::Vector) = map(f, 1:length(xs)) map_or_ntuple(f, xs::Tuple) = ntuple(f, length(xs)) + +# 4-arg version: side effects + returns Vector/Tuple of DataDepsTaskArgument for distribute_task! +function populate_task_info!(state::DataDepsState, task_args, spec::DTaskSpec, task::DTask) + result = DataDepsTaskArgument[] + _populate_task_info!(state, spec, task, (arg, pos, may_alias, inplace_move, deps) -> begin + dep_infos = DataDepsTaskDependency[DataDepsTaskDependency(arg, d) for d in deps] + push!(result, DataDepsTaskArgument(arg, pos, may_alias, inplace_move, dep_infos)) + end) + return spec.fargs isa Tuple ? 
(result...,) : result +end + function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_procs, spec::DTaskSpec{typed}, task::DTask, fargs, proc_to_scope_lfu, write_num::Int, proc_idx::Int) where typed @specialize spec fargs @@ -528,13 +540,29 @@ function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_pr end @dagdebug nothing :spawn_datadeps "($(repr(value(f)))) Task has $(length(syncdeps)) syncdeps" - # Launch user's task - new_fargs = map_or_ntuple(task_arg_ws) do idx - if is_typed(spec) - return TypedArgument(task_arg_ws[idx].pos, remote_args[idx]) - else - return Argument(task_arg_ws[idx].pos, remote_args[idx]) + # Launch user's task: preserve full argument list (spec.fargs); use remote values only for tracked args + new_fargs = if spec.fargs isa Tuple + ntuple(length(spec.fargs)) do i + arg = spec.fargs[i] + pos = arg.pos + j = findfirst(w -> w.pos == pos, task_arg_ws) + if j !== nothing + val = remote_args[j] + is_typed(spec) ? TypedArgument(pos, val) : Argument(pos, val) + else + copy(arg) + end end + else + [let arg = spec.fargs[i], pos = arg.pos + j = findfirst(w -> w.pos == pos, task_arg_ws) + if j !== nothing + val = remote_args[j] + is_typed(spec) ? 
TypedArgument(pos, val) : Argument(pos, val) + else + copy(arg) + end + end for i in 1:length(spec.fargs)] end new_spec = DTaskSpec(new_fargs, spec.options) new_spec.options.scope = our_scope diff --git a/src/dtask.jl b/src/dtask.jl index f24cd1027..13e66cafe 100644 --- a/src/dtask.jl +++ b/src/dtask.jl @@ -11,31 +11,18 @@ Base.wait(t::ThunkFuture) = Dagger.Sch.thunk_yield() do wait(t.future) return end -const FETCH_UNIFORM = ScopedValue{Bool}(false) -@warn "Docstrings" maxlog=1 -# uniform: Asserts that this is a uniform call -# move_value: Moves the value to the specified processor -# unwrap: Unwraps the value if it is unwrappable -function Base.fetch(t::ThunkFuture; proc::Processor=OSProc(), - throw_on_error::Bool=true, - uniform::Bool=false, - move_value::Bool=true, - unwrap::Bool=false) +function Base.fetch(t::ThunkFuture; proc=OSProc(), raw=false) error, value = Dagger.Sch.thunk_yield() do fetch(t.future) end - if throw_on_error && error + if error throw(value) end - if move_value - value = @with FETCH_UNIFORM => uniform begin - move(proc, value) - end - end - if unwrap && unwrappable(value) - return fetch(value; proc, throw_on_error, uniform, move_value, unwrap) + if raw + return value + else + return move(proc, value) end - return value end Base.put!(t::ThunkFuture, x; error=false) = put!(t.future, (error, x)) @@ -78,13 +65,14 @@ function Base.wait(t::DTask) wait(t.future) return end -function Base.fetch(t::DTask; kwargs...) +function Base.fetch(t::DTask; raw=false, move_value=nothing, unwrap=nothing) if !istaskstarted(t) throw(ConcurrencyViolationError("Cannot `fetch` an unlaunched `DTask`")) end - return fetch(t.future; kwargs...) + # Datadeps/aliasing API: move_value=false => don't move => raw=true + raw_eff = move_value !== nothing ? 
!move_value : raw + return fetch(t.future; raw=raw_eff) end -unwrappable(x::DTask) = true function waitany(tasks::Vector{DTask}) if isempty(tasks) return diff --git a/src/lib/domain-blocks.jl b/src/lib/domain-blocks.jl index 2a0854e3b..95e5c360f 100644 --- a/src/lib/domain-blocks.jl +++ b/src/lib/domain-blocks.jl @@ -6,6 +6,8 @@ struct DomainBlocks{N} <: AbstractArray{ArrayDomain{N, NTuple{N, UnitRange{Int}} end Base.@deprecate_binding BlockedDomains DomainBlocks +ndims(::DomainBlocks{N}) where N = N + size(x::DomainBlocks) = map(length, x.cumlength) function _getindex(x::DomainBlocks{N}, idx::Tuple) where N starts = map((vec, i) -> i == 0 ? 0 : getindex(vec,i), x.cumlength, map(x->x-1, idx)) diff --git a/src/memory-spaces.jl b/src/memory-spaces.jl index dd9b8dc3f..91cf88da3 100644 --- a/src/memory-spaces.jl +++ b/src/memory-spaces.jl @@ -12,14 +12,21 @@ accelerate!(accel::Symbol) = accelerate!(Val{accel}()) accelerate!(::Val{:distributed}) = accelerate!(DistributedAcceleration()) initialize_acceleration!(a::DistributedAcceleration) = nothing -function accelerate!(accel::Acceleration) +function accelerate!(accel::Acceleration) initialize_acceleration!(accel) ACCELERATION[] = accel end +accelerate!(::Nothing) = nothing accel_matches_proc(accel::DistributedAcceleration, proc::OSProc) = true accel_matches_proc(accel::DistributedAcceleration, proc) = true +function compatible_processors(accel::Union{Acceleration,Nothing}, scope::AbstractScope, procs::Vector{<:Processor}) + comp = compatible_processors(scope, procs) + accel === nothing && return comp + return Set(p for p in comp if accel_matches_proc(accel, p)) +end + struct CPURAMMemorySpace <: MemorySpace owner::Int end @@ -136,37 +143,8 @@ end may_alias(::MemorySpace, ::MemorySpace) = true may_alias(space1::CPURAMMemorySpace, space2::CPURAMMemorySpace) = space1.owner == space2.owner -struct RemotePtr{T,S<:MemorySpace} <: Ref{T} - addr::UInt - space::S -end -RemotePtr{T}(addr::UInt, space::S) where {T,S} = 
RemotePtr{T,S}(addr, space) -RemotePtr{T}(ptr::Ptr{V}, space::S) where {T,V,S} = RemotePtr{T,S}(UInt(ptr), space) -RemotePtr{T}(ptr::Ptr{V}) where {T,V} = RemotePtr{T}(UInt(ptr), CPURAMMemorySpace(myid())) -# FIXME: Don't hardcode CPURAMMemorySpace -RemotePtr(addr::UInt) = RemotePtr{Cvoid}(addr, CPURAMMemorySpace(myid())) -Base.convert(::Type{RemotePtr}, x::Ptr{T}) where T = - RemotePtr(UInt(x), CPURAMMemorySpace(myid())) -Base.convert(::Type{<:RemotePtr{V}}, x::Ptr{T}) where {V,T} = - RemotePtr{V}(UInt(x), CPURAMMemorySpace(myid())) -Base.convert(::Type{UInt}, ptr::RemotePtr) = ptr.addr -Base.:+(ptr::RemotePtr{T}, offset::Integer) where T = RemotePtr{T}(ptr.addr + offset, ptr.space) -Base.:-(ptr::RemotePtr{T}, offset::Integer) where T = RemotePtr{T}(ptr.addr - offset, ptr.space) -function Base.isless(ptr1::RemotePtr, ptr2::RemotePtr) - @assert ptr1.space == ptr2.space - return ptr1.addr < ptr2.addr -end - -struct MemorySpan{S} - ptr::RemotePtr{Cvoid,S} - len::UInt -end -MemorySpan(ptr::RemotePtr{Cvoid,S}, len::Integer) where S = - MemorySpan{S}(ptr, UInt(len)) -MemorySpan{S}(addr::UInt, len::Integer) where S = - MemorySpan{S}(RemotePtr{Cvoid,S}(addr), UInt(len)) -Base.isless(a::MemorySpan, b::MemorySpan) = a.ptr < b.ptr -Base.isempty(x::MemorySpan) = x.len == 0 +# RemotePtr and MemorySpan are defined in utils/memory-span.jl (included earlier). 
+ abstract type AbstractAliasing end memory_spans(::T) where T<:AbstractAliasing = throw(ArgumentError("Must define `memory_spans` for `$T`")) memory_spans(x) = memory_spans(aliasing(x)) @@ -454,34 +432,4 @@ function will_alias(x_span::MemorySpan, y_span::MemorySpan) return x_span.ptr <= y_end && y_span.ptr <= x_end end -### More space-efficient memory spans - -struct LocalMemorySpan - ptr::UInt - len::UInt -end -LocalMemorySpan(span::MemorySpan) = LocalMemorySpan(span.ptr.addr, span.len) -Base.isempty(x::LocalMemorySpan) = x.len == 0 - -# FIXME: Store the length separately, since it's shared by all spans -struct ManyMemorySpan{N} - spans::NTuple{N,LocalMemorySpan} -end -Base.isempty(x::ManyMemorySpan) = all(isempty, x.spans) - -struct ManyPair{N} <: Unsigned - pairs::NTuple{N,UInt} -end -Base.promote_rule(::Type{ManyPair}, ::Type{T}) where {T<:Integer} = ManyPair -Base.convert(::Type{ManyPair{N}}, x::T) where {T<:Integer,N} = ManyPair(ntuple(i -> x, N)) -Base.convert(::Type{ManyPair}, x::ManyPair) = x -Base.:+(x::ManyPair{N}, y::ManyPair{N}) where N = ManyPair(ntuple(i -> x.pairs[i] + y.pairs[i], N)) -Base.:-(x::ManyPair{N}, y::ManyPair{N}) where N = ManyPair(ntuple(i -> x.pairs[i] - y.pairs[i], N)) -Base.:-(x::ManyPair) = error("Can't negate a ManyPair") -Base.:(==)(x::ManyPair, y::ManyPair) = x.pairs == y.pairs -Base.isless(x::ManyPair, y::ManyPair) = x.pairs[1] < y.pairs[1] -Base.:(<)(x::ManyPair, y::ManyPair) = x.pairs[1] < y.pairs[1] -Base.string(x::ManyPair) = "ManyPair($(x.pairs))" - -ManyMemorySpan{N}(start::ManyPair{N}, len::ManyPair{N}) where N = - ManyMemorySpan{N}(ntuple(i -> LocalMemorySpan(start.pairs[i], len.pairs[i]), N)) +# LocalMemorySpan, ManyMemorySpan, ManyPair are defined in utils/memory-span.jl (included earlier). 
diff --git a/src/mpi.jl b/src/mpi.jl index 4b85122b9..adad2e4ec 100644 --- a/src/mpi.jl +++ b/src/mpi.jl @@ -313,8 +313,6 @@ mutable struct MPIRef end Base.hash(ref::MPIRef, h::UInt=UInt(0)) = hash(ref.id, hash(MPIRef, h)) root_worker_id(ref::MPIRef) = myid() -@warn "Move this definition somewhere else" maxlog=1 -root_worker_id(ref::DRef) = ref.owner function check_uniform(ref::MPIRef, original=ref) return check_uniform(ref.rank, original) && @@ -831,15 +829,16 @@ function move(src::MPIProcessor, dst::MPIProcessor, x::Chunk) end end + #FIXME:try to think of a better move! scheme -function execute!(proc::MPIProcessor, world::UInt64, f, args...; kwargs...) +function execute!(proc::MPIProcessor, f, args...; kwargs...) local_rank = MPI.Comm_rank(proc.comm) islocal = local_rank == proc.rank inplace_move = f === move! result = nothing - tag_space = to_tag() + tag = to_tag() if islocal || inplace_move - result = execute!(proc.innerProc, world, f, args...; kwargs...) + result = execute!(proc.innerProc, f, args...; kwargs...) end if inplace_move space = memory_space(nothing, proc)::MPIMemorySpace diff --git a/src/options.jl b/src/options.jl index 580ddef53..1b178df34 100644 --- a/src/options.jl +++ b/src/options.jl @@ -7,7 +7,6 @@ Stores per-task options to be passed to the scheduler. # Arguments - `propagates::Vector{Symbol}`: The set of option names that will be propagated by this task to tasks that it spawns. -- `acceleration::Acceleration`: The acceleration (cluster/network) type to use for this task. - `processor::Processor`: The processor associated with this task's function. Generally ignored by the scheduler. - `compute_scope::AbstractScope`: The execution scope of the task, which determines where the task can be scheduled and executed. `scope` is another name for this option. - `result_scope::AbstractScope`: The data scope of the task's result, which determines where the task's result can be accessed from. 
@@ -27,6 +26,7 @@ Stores per-task options to be passed to the scheduler. - `storage_leaf_tag::Union{MemPool.Tag,Nothing}=nothing`: If not `nothing`, specifies the MemPool storage leaf tag to associate with the task's result. This tag can be used by MemPool's storage devices to manipulate their behavior, such as the file name used to store data on disk." - `storage_retain::Union{Bool,Nothing}=nothing`: The value of `retain` to pass to `MemPool.poolset` when constructing the result `Chunk`. `nothing` defaults to `false`. - `name::Union{String,Nothing}=nothing`: If not `nothing`, annotates the task with a name for logging purposes. +- `tag::Union{UInt32,Nothing}=nothing`: (Data-deps/MPI) MPI message tag for this task; assigned automatically if `nothing`. - `stream_input_buffer_amount::Union{Int,Nothing}=nothing`: (Streaming only) Specifies the amount of slots to allocate for the input buffer of the task. Defaults to 1. - `stream_output_buffer_amount::Union{Int,Nothing}=nothing`: (Streaming only) Specifies the amount of slots to allocate for the output buffer of the task. Defaults to 1. - `stream_buffer_type::Union{Type,Nothing}=nothing`: (Streaming only) Specifies the type of buffer to use for the input and output buffers of the task. Defaults to `Dagger.ProcessRingBuffer`. @@ -35,8 +35,6 @@ Stores per-task options to be passed to the scheduler. 
Base.@kwdef mutable struct Options propagates::Union{Vector{Symbol},Nothing} = nothing - tag::Union{UInt32,Nothing} = nothing - acceleration::Union{Acceleration,Nothing} = nothing processor::Union{Processor,Nothing} = nothing scope::Union{AbstractScope,Nothing} = nothing compute_scope::Union{AbstractScope,Nothing} = scope @@ -64,10 +62,14 @@ Base.@kwdef mutable struct Options name::Union{String,Nothing} = nothing + tag::Union{UInt32,Nothing} = nothing + stream_input_buffer_amount::Union{Int,Nothing} = nothing stream_output_buffer_amount::Union{Int,Nothing} = nothing stream_buffer_type::Union{Type, Nothing} = nothing stream_max_evals::Union{Int,Nothing} = nothing + + acceleration::Union{Acceleration,Nothing} = nothing end Options(::Nothing) = Options() function Options(old_options::NamedTuple) @@ -124,8 +126,6 @@ signature `sig`, if the option was previously unspecified in `opts`. """ function populate_defaults!(opts::Options, sig) maybe_default!(opts, Val{:propagates}(), sig) - maybe_default!(opts, Val{:tag}(), sig) - maybe_default!(opts, Val{:acceleration}(), sig) maybe_default!(opts, Val{:processor}(), sig) maybe_default!(opts, Val{:compute_scope}(), sig) maybe_default!(opts, Val{:result_scope}(), sig) diff --git a/src/processor.jl b/src/processor.jl index 75e19094d..4944dc083 100644 --- a/src/processor.jl +++ b/src/processor.jl @@ -2,6 +2,8 @@ export OSProc, Context, addprocs!, rmprocs! import Base: @invokelatest +abstract type Processor end + const PROCESSOR_CALLBACKS = Dict{Symbol,Any}() const OSPROC_PROCESSOR_CACHE = LockedObject(Dict{Int,Set{Processor}}()) @@ -138,3 +140,20 @@ iscompatible_arg(proc::OSProc, opts, args...) = "Returns a very brief `String` representation of `proc`." short_name(proc::Processor) = string(proc) short_name(p::OSProc) = "W: $(p.pid)" + +"Returns true if the processor is on the local worker (for MPI/ordering)." 
+is_local_processor(proc::Processor) = (root_worker_id(proc) == myid()) + +"Ordering key for task firing (used by MPI to avoid deadlock)." +fire_order_key(proc::Processor) = (root_worker_id(proc), 0) + +@doc """ + Processor + +An abstract type representing a processing device and associated memory, where +data can be stored and operated on. Subtypes should be immutable, and +instances should compare equal if they represent the same logical processing +device/memory. Subtype instances should be serializable between different +nodes. Subtype instances may contain a "parent" `Processor` to make it easy to +transfer data to/from other types of `Processor` at runtime. +""" Processor diff --git a/src/queue.jl b/src/queue.jl index d55b31e6a..37947a0ac 100644 --- a/src/queue.jl +++ b/src/queue.jl @@ -125,7 +125,7 @@ function wait_all(f; check_errors::Bool=false) result = with_options(f; task_queue=queue) for task in queue.tasks if check_errors - fetch(task; move_value=false, unwrap=false, throw_on_error=true) + fetch(task; raw=true) else wait(task) end diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index 954867098..8ab9dcdc4 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -15,7 +15,7 @@ import Base: @invokelatest import ..Dagger import ..Dagger: Context, Processor, SchedulerOptions, Options, Thunk, WeakThunk, ThunkFuture, ThunkID, DTaskFailedException, Chunk, WeakChunk, OSProc, AnyScope, DefaultScope, InvalidScope, LockedObject, Argument, Signature -import ..Dagger: order, dependents, noffspring, istask, inputs, unwrap_weak_checked, wrap_weak, affinity, tochunk, timespan_start, timespan_finish, procs, move, chunktype, default_enabled, processor, get_processors, get_parent, root_worker_id, execute!, rmprocs!, task_processor, constrain, cputhreadtime, maybe_take_or_alloc!, is_local_processor, fire_order_key +import ..Dagger: order, dependents, noffspring, istask, inputs, unwrap_weak_checked, wrap_weak, affinity, tochunk, timespan_start, timespan_finish, procs, move, 
chunktype, default_enabled, processor, get_processors, get_parent, root_worker_id, execute!, rmprocs!, task_processor, constrain, cputhreadtime, maybe_take_or_alloc!, is_local_processor, fire_order_key, short_name import ..Dagger: @dagdebug, @safe_lock_spin1, @maybelog, @take_or_alloc! import DataStructures: PriorityQueue, enqueue!, dequeue_pair!, peek @@ -688,7 +688,13 @@ end # Fire all newly-scheduled tasks (owner/local first, then by fire_order_key to avoid MPI execute! deadlock) @label fire_tasks task_locs = collect(keys(to_fire)) - sort!(task_locs; by=loc -> (is_local_processor(loc.proc) ? 0 : 1, fire_order_key(loc.proc))) + rank = try + M = parentmodule(@__MODULE__) + (isdefined(M, :MPI) && M.MPI.Initialized()) ? Int(M.MPI.Comm_rank(M.MPI.COMM_WORLD)) : nothing + catch + nothing + end + Core.println("fire order rank=", rank, " task_locs=", task_locs) for task_loc in task_locs fire_tasks!(ctx, task_loc, to_fire[task_loc], state) end diff --git a/src/sch/eager.jl b/src/sch/eager.jl index 3478863da..67e895815 100644 --- a/src/sch/eager.jl +++ b/src/sch/eager.jl @@ -31,8 +31,7 @@ function init_eager() sopts = SchedulerOptions(;allow_errors=true) opts = Dagger.Options((;scope=Dagger.ExactScope(Dagger.ThreadProc(1, 1)), occupancy=Dict(Dagger.ThreadProc=>0), - time_util=Dict(Dagger.ThreadProc=>0), - acceleration=Dagger.DistributedAcceleration())) + time_util=Dict(Dagger.ThreadProc=>0))) Dagger.compute(ctx, Dagger._delayed(eager_thunk, opts)(); options=sopts) catch err diff --git a/src/sch/util.jl b/src/sch/util.jl index 514604b11..b28222a13 100644 --- a/src/sch/util.jl +++ b/src/sch/util.jl @@ -373,7 +373,7 @@ function signature(f, args) value = Dagger.value(arg) if value isa Dagger.DTask # Only occurs via manual usage of signature - value = fetch(value; move_value=false, unwrap=false) + value = fetch(value; raw=true) end if istask(value) throw(ConcurrencyViolationError("Must call `collect_task_inputs!(state, task)` before calling `signature`")) @@ -403,7 +403,6 @@ 
end function can_use_proc(state, task, gproc, proc, opts, scope) # Check against proclist - pid = Dagger.root_worker_id(gproc) if opts.proclist !== nothing @warn "The `proclist` option is deprecated, please use scopes instead\nSee https://juliaparallel.org/Dagger.jl/stable/scopes/ for details" maxlog=1 if opts.proclist isa Function @@ -422,10 +421,6 @@ function can_use_proc(state, task, gproc, proc, opts, scope) else throw(SchedulingException("proclist must be a Function, Vector, or nothing")) end - if !Dagger.accel_matches_proc(opts.acceleration, proc) - @dagdebug task :scope "Rejected $proc: Not compatible with acceleration ($opts.acceleration)" - return false, scope - end if scope isa Dagger.InvalidScope @dagdebug task :scope "Rejected $proc: Not contained in task scope ($scope)" return false, scope @@ -435,8 +430,8 @@ function can_use_proc(state, task, gproc, proc, opts, scope) # Check against single if opts.single !== nothing @warn "The `single` option is deprecated, please use scopes instead\nSee https://juliaparallel.org/Dagger.jl/stable/scopes/ for details" maxlog=1 - if pid != opts.single - @dagdebug task :scope "Rejected $proc: pid ($(pid)) != single ($(opts.single))" + if root_worker_id(gproc) != opts.single + @dagdebug task :scope "Rejected $proc: gproc root_worker_id ($(root_worker_id(gproc))) != single ($(opts.single))" return false, scope end scope = constrain(scope, Dagger.ProcessScope(opts.single)) diff --git a/src/scopes.jl b/src/scopes.jl index ff76e121c..28aa8fa00 100644 --- a/src/scopes.jl +++ b/src/scopes.jl @@ -1,5 +1,7 @@ export AnyScope, DefaultScope, UnionScope, NodeScope, ProcessScope, ExactScope, ProcessorTypeScope +abstract type AbstractScope end + "Widest scope that contains all processors." struct AnyScope <: AbstractScope end proc_in_scope(::Processor, ::AnyScope) = true @@ -95,12 +97,11 @@ ProcessorTypeScope(T, inner_scope=AnyScope()) = Set{AbstractScopeTaint}([ProcessorTypeTaint{T}()])) "Scoped to a specific processor." 
-struct ExactScope{P<:AbstractScope} <: AbstractScope - parent::P +struct ExactScope <: AbstractScope + parent::ProcessScope processor::Processor end -ExactScope(proc) = ExactScope(enclosing_scope(get_parent(proc)), proc) -enclosing_scope(proc::OSProc) = ProcessScope(proc.pid) +ExactScope(proc) = ExactScope(ProcessScope(root_worker_id(get_parent(proc))), proc) proc_in_scope(proc::Processor, scope::ExactScope) = proc == scope.processor "Indicates that the applied scopes `x` and `y` are incompatible." @@ -456,4 +457,4 @@ function Base.issubset(scope1::AbstractScope, scope2::AbstractScope) proc in scope2_procs || return false end return true -end +end \ No newline at end of file diff --git a/src/thunk.jl b/src/thunk.jl index c4783762a..5008e4a1b 100644 --- a/src/thunk.jl +++ b/src/thunk.jl @@ -462,7 +462,7 @@ function _par(mod, ex::Expr; lazy=true, recur=true, opts=()) end args = filter(arg->!Meta.isexpr(arg, :parameters), allargs) kwargs = filter(arg->Meta.isexpr(arg, :parameters), allargs) - if !isempty(kwargs) + if !Base.isempty(kwargs) kwargs = only(kwargs).args end if body !== nothing @@ -493,7 +493,7 @@ function _par(mod, ex::Expr; lazy=true, recur=true, opts=()) $spawn($f, $Options(;$(opts...)), $(args...); $(kwargs...)) end if $(Expr(:islocal, sync_var)) - put!($sync_var, schedule(Task(()->fetch($result; move_value=false, unwrap=false)))) + put!($sync_var, schedule(Task(()->fetch($result; raw=true)))) end $result end @@ -530,7 +530,7 @@ function spawn(f, args...; kwargs...) @nospecialize f args kwargs # Merge all passed options - if length(args) >= 1 && first(args) isa Options + if length(args) >= 1 && first(args) isa Options # N.B. Make a defensive copy in case user aliases Options struct task_options = copy(first(args)::Options) args = args[2:end] @@ -545,7 +545,7 @@ function spawn(f, args...; kwargs...) end function typed_spawn(f, args...; kwargs...) 
# Merge all passed options - if length(args) >= 1 && first(args) isa Options + if length(args) >= 1 && first(args) isa Options # N.B. Make a defensive copy in case user aliases Options struct task_options = copy(first(args)::Options) args = args[2:end] @@ -578,8 +578,6 @@ function _spawn(args_kwargs, task_options) # Get task queue, and don't let it propagate task_queue = get(scoped_options, :task_queue, DefaultTaskQueue())::AbstractTaskQueue filter!(prop -> prop != :task_queue, propagates) - - # Update propagates from scoped options propagates if task_options.propagates !== nothing append!(task_options.propagates, propagates) else @@ -587,11 +585,6 @@ function _spawn(args_kwargs, task_options) end unique!(task_options.propagates) - # Read task-local acceleration into options - if task_options.acceleration === nothing - task_options.acceleration = current_acceleration() - end - # Construct task spec and handle spec = DTaskSpec(args_kwargs, task_options) task = eager_spawn(spec) diff --git a/src/tochunk.jl b/src/tochunk.jl index 25ae9a965..386e1b80f 100644 --- a/src/tochunk.jl +++ b/src/tochunk.jl @@ -36,6 +36,16 @@ function tochunk(x::X, proc::P, space::M, scope::S; device=nothing, type=X, rewr ref = tochunk_pset(x, space; device, kwargs...) return Chunk{type,typeof(ref),P,S,typeof(space)}(type, domain(x), ref, proc, scope, space) end +# Disambiguate: Chunk-specific 3-arg so kwcall(tochunk, Chunk, Processor, Scope) is not ambiguous with utils/chunks.jl +function tochunk(x::Chunk, proc::P, scope::S; rewrap=false, kwargs...) where {P<:Processor,S} + if rewrap + return remotecall_fetch(x.handle.owner) do + tochunk(MemPool.poolget(x.handle), proc, scope; kwargs...) + end + else + return x + end +end function tochunk(x::X, proc::P, scope::S; device=nothing, type=X, rewrap=false, kwargs...) 
where {X,P<:Processor,S} if device === nothing device = if Sch.walk_storage_safe(x) @@ -70,8 +80,11 @@ function tochunk(x::X, space::M, scope::S; device=nothing, type=X, rewrap=false, ref = tochunk_pset(x, space; device, kwargs...) return Chunk{type,typeof(ref),typeof(proc),S,M}(type, domain(x), ref, proc, scope, space) end -tochunk(x, procOrSpace; kwargs...) = tochunk(x, procOrSpace, AnyScope(); kwargs...) -tochunk(x; kwargs...) = tochunk(x, default_memory_space(current_acceleration(), x), AnyScope(); kwargs...) +# 2-arg: avoid overwriting utils/chunks.jl's tochunk(Any, Any) and tochunk(Any); only add Processor/MemorySpace variants +# Chunk + Processor: disambiguate vs utils/chunks.jl's tochunk(x::Chunk, proc; ...) +tochunk(x::Chunk, proc::Processor; kwargs...) = tochunk(x, proc, AnyScope(); kwargs...) +tochunk(x, proc::Processor; kwargs...) = tochunk(x, proc, AnyScope(); kwargs...) +tochunk(x, space::MemorySpace; kwargs...) = tochunk(x, space, AnyScope(); kwargs...) check_proc_space(x, proc, space) = nothing function check_proc_space(x::Chunk, proc, space) @@ -94,13 +107,4 @@ end tochunk_pset(x, space::MemorySpace; device=nothing, kwargs...) = poolset(x; device, kwargs...) -function savechunk(data, dir, f) - sz = open(joinpath(dir, f), "w") do io - serialize(io, MemPool.MMWrap(data)) - return position(io) - end - fr = FileRef(f, sz) - proc = OSProc() - scope = AnyScope() # FIXME: Scoped to this node - return Chunk{typeof(data),typeof(fr),typeof(proc),typeof(scope)}(typeof(data), domain(data), fr, proc, scope, true) -end +# savechunk: defined in utils/chunks.jl (fork Chunk has space field; do not duplicate here) diff --git a/src/types/processor.jl b/src/types/processor.jl index e70600b24..1e333413f 100644 --- a/src/types/processor.jl +++ b/src/types/processor.jl @@ -1,11 +1,2 @@ -""" - Processor - -An abstract type representing a processing device and associated memory, where -data can be stored and operated on. 
Subtypes should be immutable, and -instances should compare equal if they represent the same logical processing -device/memory. Subtype instances should be serializable between different -nodes. Subtype instances may contain a "parent" `Processor` to make it easy to -transfer data to/from other types of `Processor` at runtime. -""" +# Docstring for Processor is attached in src/processor.jl after OSProc is defined (avoids "Replacing docs" warning). abstract type Processor end \ No newline at end of file diff --git a/src/utils/chunks.jl b/src/utils/chunks.jl new file mode 100644 index 000000000..1300a5a1d --- /dev/null +++ b/src/utils/chunks.jl @@ -0,0 +1,191 @@ +### Mutation + +function _mutable_inner(@nospecialize(f), proc, scope) + result = f() + return Ref(Dagger.tochunk(result, proc, scope)) +end + +""" + mutable(f::Base.Callable; worker, processor, scope) -> Chunk + +Calls `f()` on the specified worker or processor, returning a `Chunk` +referencing the result with the specified scope `scope`. +""" +function mutable(@nospecialize(f); worker=nothing, processor=nothing, scope=nothing) + if processor === nothing + if worker === nothing + processor = OSProc() + else + processor = OSProc(worker) + end + else + @assert worker === nothing "mutable: Can't mix worker and processor" + end + if scope === nothing + scope = processor isa OSProc ? ProcessScope(processor) : ExactScope(processor) + end + return fetch(Dagger.@spawn scope=scope _mutable_inner(f, processor, scope))[] +end + +""" + @mutable [worker=1] [processor=OSProc()] [scope=ProcessorScope()] f() + +Helper macro for [`mutable()`](@ref). +""" +macro mutable(exs...) + opts = esc.(exs[1:end-1]) + ex = exs[end] + quote + let f = @noinline ()->$(esc(ex)) + $mutable(f; $(opts...)) + end + end +end + +""" +Maps a value to one of multiple distributed "mirror" values automatically when +used as a thunk argument. Construct using `@shard` or `shard`. 
+""" +struct Shard + chunks::Dict{Processor,Chunk} +end + +""" + shard(f; kwargs...) -> Chunk{Shard} + +Executes `f` on all workers in `workers`, wrapping the result in a +process-scoped `Chunk`, and constructs a `Chunk{Shard}` containing all of these +`Chunk`s on the current worker. + +Keyword arguments: +- `procs` -- The list of processors to create pieces on. May be any iterable container of `Processor`s. +- `workers` -- The list of workers to create pieces on. May be any iterable container of `Integer`s. +- `per_thread::Bool=false` -- If `true`, creates a piece per each thread, rather than a piece per each worker. +""" +function shard(@nospecialize(f); procs=nothing, workers=nothing, per_thread=false) + if procs === nothing + if workers !== nothing + procs = [OSProc(w) for w in workers] + else + procs = lock(Sch.eager_context()) do + copy(Sch.eager_context().procs) + end + end + if per_thread + _procs = ThreadProc[] + for p in procs + append!(_procs, filter(p->p isa ThreadProc, get_processors(p))) + end + procs = _procs + end + else + if workers !== nothing + throw(ArgumentError("Cannot combine `procs` and `workers`")) + elseif per_thread + throw(ArgumentError("Cannot combine `procs` and `per_thread=true`")) + end + end + isempty(procs) && throw(ArgumentError("Cannot create empty Shard")) + shard_running_dict = Dict{Processor,DTask}() + for proc in procs + scope = proc isa OSProc ? ProcessScope(proc) : ExactScope(proc) + thunk = Dagger.@spawn scope=scope _mutable_inner(f, proc, scope) + shard_running_dict[proc] = thunk + end + shard_dict = Dict{Processor,Chunk}() + for proc in procs + shard_dict[proc] = fetch(shard_running_dict[proc])[] + end + return Shard(shard_dict) +end + +"Creates a `Shard`. See [`Dagger.shard`](@ref) for details." +macro shard(exs...) 
+ opts = esc.(exs[1:end-1]) + ex = exs[end] + quote + let f = @noinline ()->$(esc(ex)) + $shard(f; $(opts...)) + end + end +end + +function move(from_proc::Processor, to_proc::Processor, shard::Shard) + # Match either this proc or some ancestor + # N.B. This behavior may bypass the piece's scope restriction + proc = to_proc + if haskey(shard.chunks, proc) + return move(from_proc, to_proc, shard.chunks[proc]) + end + parent = Dagger.get_parent(proc) + while parent != proc + proc = parent + parent = Dagger.get_parent(proc) + if haskey(shard.chunks, proc) + return move(from_proc, to_proc, shard.chunks[proc]) + end + end + + throw(KeyError(to_proc)) +end +Base.iterate(s::Shard) = iterate(values(s.chunks)) +Base.iterate(s::Shard, state) = iterate(values(s.chunks), state) +Base.length(s::Shard) = length(s.chunks) + +### Core Stuff + +""" + tochunk(x, proc::Processor, scope::AbstractScope; device=nothing, rewrap=false, kwargs...) -> Chunk + +Create a chunk from data `x` which resides on `proc` and which has scope +`scope`. + +`device` specifies a `MemPool.StorageDevice` (which is itself wrapped in a +`Chunk`) which will be used to manage the reference contained in the `Chunk` +generated by this function. If `device` is `nothing` (the default), the data +will be inspected to determine if it's safe to serialize; if so, the default +MemPool storage device will be used; if not, then a `MemPool.CPURAMDevice` will +be used. + +If `rewrap==true` and `x isa Chunk`, then the `Chunk` will be rewrapped in a +new `Chunk`. + +All other kwargs are passed directly to `MemPool.poolset`. +""" +function tochunk(x::X, proc::P=OSProc(), scope::S=AnyScope(); device=nothing, rewrap=false, kwargs...) where {X,P,S} + if device === nothing + device = if Sch.walk_storage_safe(x) + MemPool.GLOBAL_DEVICE[] + else + MemPool.CPURAMDevice() + end + end + ref = poolset(x; device, kwargs...) 
+ space = memory_space(proc) + Chunk{X,typeof(ref),P,S,typeof(space)}(X, domain(x), ref, proc, scope, space) +end +function tochunk(x::Chunk, proc=nothing, scope=nothing; rewrap=false, kwargs...) + if rewrap + return remotecall_fetch(x.handle.owner) do + tochunk(MemPool.poolget(x.handle), proc, scope; kwargs...) + end + else + return x + end +end +tochunk(x::Thunk, proc=nothing, scope=nothing; kwargs...) = x + +root_worker_id(chunk::Chunk) = root_worker_id(chunk.handle) +root_worker_id(dref::DRef) = dref.owner # FIXME: Migration + +function savechunk(data, dir, f) + sz = open(joinpath(dir, f), "w") do io + serialize(io, MemPool.MMWrap(data)) + return position(io) + end + fr = FileRef(f, sz) + proc = OSProc() + scope = AnyScope() # FIXME: Scoped to this node + space = memory_space(proc) + Chunk{typeof(data),typeof(fr),typeof(proc),typeof(scope),typeof(space)}(typeof(data), domain(data), fr, proc, scope, space) +end diff --git a/src/utils/dagdebug.jl b/src/utils/dagdebug.jl index b305f85ce..873e47e79 100644 --- a/src/utils/dagdebug.jl +++ b/src/utils/dagdebug.jl @@ -36,36 +36,27 @@ macro dagdebug(thunk, category, msg, args...) 
end) end -@warn "Make this threadsafe by putting counter into Module" maxlog=1 -@warn "Calculate fast-growth based on clock time, not iteration" maxlog=1 +# FIXME: Calculate fast-growth based on clock time, not iteration const OPCOUNTER_CATEGORIES = Symbol[] const OPCOUNTER_FAST_GROWTH_THRESHOLD = Ref(10_000_000) -const OPCOUNTERS = Dict{Symbol,Threads.Atomic{Int}}() +struct OpCounter + value::Threads.Atomic{Int} +end +OpCounter() = OpCounter(Threads.Atomic{Int}(0)) macro opcounter(category, count=1) cat_sym = category.value @gensym old + opcounter_sym = Symbol(:OPCOUNTER_, cat_sym) + if !isdefined(__module__, opcounter_sym) + __module__.eval(:(#=const=# $opcounter_sym = OpCounter())) + end esc(quote if $(QuoteNode(cat_sym)) in $OPCOUNTER_CATEGORIES - if !haskey($OPCOUNTERS, $(QuoteNode(cat_sym))) - $OPCOUNTERS[$(QuoteNode(cat_sym))] = Threads.Atomic{Int}(0) - end - $old = Threads.atomic_add!($OPCOUNTERS[$(QuoteNode(cat_sym))], Int($count)) + $old = Threads.atomic_add!($opcounter_sym.value, Int($count)) if $old > 1 && (mod1($old, $OPCOUNTER_FAST_GROWTH_THRESHOLD[]) == 1 || $count > $OPCOUNTER_FAST_GROWTH_THRESHOLD[]) println("Fast-growing counter: $($(QuoteNode(cat_sym))) = $($old)") end end end) end -opcounters() = Dict(cat=>OPCOUNTERS[cat][] for cat in keys(OPCOUNTERS)) - -const LARGEST_VALUE_COUNTER = Ref(0) -function largest_value_update!(value) - prev = LARGEST_VALUE_COUNTER[] - if value > prev - LARGEST_VALUE_COUNTER[] = value - if value - prev > 10_000 || value > 1_000_000 - println("Largest value growing: $value") - end - end -end -largest_value_counter() = LARGEST_VALUE_COUNTER[] \ No newline at end of file +opcounter(mod::Module, category::Symbol) = getfield(mod, Symbol(:OPCOUNTER_, category)).value[] \ No newline at end of file diff --git a/src/utils/interval_tree.jl b/src/utils/interval_tree.jl index 1c2b3a7f6..7dc59532e 100644 --- a/src/utils/interval_tree.jl +++ b/src/utils/interval_tree.jl @@ -195,44 +195,11 @@ function 
Base.delete!(tree::IntervalTree{M,E}, span::M) where {M,E} parent_of_succ.right = replacement end - target.span = successor.span - replacement = target - end - - # Phase 3: Handle overlap case - add remaining portions - if target_type == :overlap - original_start = span_start(original_span) - original_end = span_end(original_span) - del_start = span_start(span) - del_end = span_end(span) - verify_span(span) - - # Left portion: exists if original starts before deleted span - if original_start < del_start - left_end = min(original_end, del_start - _span_one(del_start)) - if left_end >= original_start - left_span = M(original_start, left_end - original_start + _span_one(left_end)) - if !isempty(left_span) - replacement = insert_node!(replacement, left_span) - end - end + # Update max_end bottom-up for the successor's original path + update_max_end!(parent_of_succ) + for i in length(succ_path)-1:-1:1 + update_max_end!(succ_path[i]) end - - # Right portion: exists if original extends beyond deleted span - if original_end > del_end - right_start = max(original_start, del_end + _span_one(del_end)) - if original_end >= right_start - right_span = M(right_start, original_end - right_start + _span_one(original_end)) - if !isempty(right_span) - replacement = insert_node!(replacement, right_span) - end - end - end - end - - # Phase 4: Update parent's child pointer - if isempty(path) - root = replacement else # Zero or one child replacement = target.left !== nothing ? 
target.left : target.right @@ -292,14 +259,16 @@ function find_overlapping!(node::IntervalNode{M,E}, query::M, result::Vector{M}; end end - # Enqueue left subtree if it might contain overlapping intervals + # Search left subtree if its max_end is at least query_start if current.left !== nothing && current.left.max_end >= span_start(query) - push!(queue, current.left) + push!(stack, current.left) end - # Enqueue right subtree if query extends beyond current node's start - if current.right !== nothing && span_end(query) >= span_start(current.span) - push!(queue, current.right) + # Search right subtree if it could contain an overlap + if current.right !== nothing && + span_start(current.span) <= span_end(query) && + current.right.max_end >= span_start(query) + push!(stack, current.right) end end end diff --git a/src/utils/scopes.jl b/src/utils/scopes.jl index 84aecc179..949ae2276 100644 --- a/src/utils/scopes.jl +++ b/src/utils/scopes.jl @@ -29,23 +29,14 @@ compatible_processors(scope::AbstractScope=get_compute_scope(), ctx::Context=Sch function compatible_processors(scope::AbstractScope, procs::Vector{<:Processor}) compat_procs = Set{Processor}() for gproc in procs - for proc in get_processors(gproc) - proc_in_scope(proc, scope) || continue - push!(compat_procs, proc) - end - end - return compat_procs -end -compatible_processors(acceleration::Acceleration, scope::AbstractScope=get_compute_scope(), ctx::Context=Sch.eager_context()) = - compatible_processors(acceleration, scope, procs(ctx)) -function compatible_processors(acceleration::Acceleration, scope::AbstractScope, procs::Vector{<:Processor}) - compat_procs = Set{Processor}() - for gproc in procs - accel_matches_proc(acceleration, gproc) || continue - for proc in get_processors(gproc) - accel_matches_proc(acceleration, proc) || continue - proc_in_scope(proc, scope) || continue - push!(compat_procs, proc) + # Fast-path in case entire process is incompatible + gproc_scope = ProcessScope(gproc) + if 
!isa(constrain(scope, gproc_scope), InvalidScope) + for proc in get_processors(gproc) + if proc_in_scope(proc, scope) + push!(compat_procs, proc) + end + end end end return compat_procs From bc2f7d768ba34911c4c3279b5c38b403f535e728 Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Thu, 5 Mar 2026 14:08:15 -0300 Subject: [PATCH 21/24] MPI: Fixing non-uniformity in dict-key iteration --- src/datadeps/aliasing.jl | 5 +++- src/datadeps/queue.jl | 16 ++++++++--- src/datadeps/scheduling.jl | 6 +++- src/mpi.jl | 2 +- src/sch/Sch.jl | 59 +++++++++++++++++++++++++++++--------- src/sch/util.jl | 6 ++-- 6 files changed, 71 insertions(+), 23 deletions(-) diff --git a/src/datadeps/aliasing.jl b/src/datadeps/aliasing.jl index eab0abff2..518c4bd2c 100644 --- a/src/datadeps/aliasing.jl +++ b/src/datadeps/aliasing.jl @@ -463,7 +463,10 @@ function populate_ainfo!(state::DataDepsState, original_arg_w::ArgumentWrapper, if !haskey(state.ainfos_owner, target_ainfo) overlaps = Set{AliasingWrapper}() push!(overlaps, target_ainfo) - for other_ainfo in keys(state.ainfos_owner) + other_ainfos = (Dagger.current_acceleration() isa Dagger.MPIAcceleration + ? 
sort(collect(keys(state.ainfos_owner)), by=hash) + : keys(state.ainfos_owner)) + for other_ainfo in other_ainfos target_ainfo == other_ainfo && continue if will_alias(target_ainfo, other_ainfo) # Mark us and them as overlapping diff --git a/src/datadeps/queue.jl b/src/datadeps/queue.jl index 8d9674143..b203c3e44 100644 --- a/src/datadeps/queue.jl +++ b/src/datadeps/queue.jl @@ -332,9 +332,13 @@ function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_pr sig = Sch.signature(sch_state, f, map(first, chunks_locality)) task_pressure = get(sch_state.signature_time_cost, sig, 1000^3) - # Shuffle procs around, so equally-costly procs are equally considered - P = randperm(length(all_procs)) - procs = getindex.(Ref(all_procs), P) + # Shuffle procs around, so equally-costly procs are equally considered (skip when MPI for deterministic tie-breaking) + procs = if current_acceleration() isa Dagger.MPIAcceleration + collect(all_procs) + else + P = randperm(length(all_procs)) + getindex.(Ref(all_procs), P) + end # Sort by lowest cost first sort!(procs, by=p->costs[p]) @@ -397,7 +401,11 @@ function distribute_task!(queue::DataDepsTaskQueue, state::DataDepsState, all_pr delete!(spaces_completed, our_space) continue end - our_proc = rand(our_space_procs) + our_proc = if current_acceleration() isa Dagger.MPIAcceleration + first(sort(collect(our_space_procs), by=short_name)) + else + rand(our_space_procs) + end break end diff --git a/src/datadeps/scheduling.jl b/src/datadeps/scheduling.jl index 0bf9818f6..b2bcaca7b 100644 --- a/src/datadeps/scheduling.jl +++ b/src/datadeps/scheduling.jl @@ -111,7 +111,11 @@ function datadeps_schedule_task(sched::UltraScheduler, state::DataDepsState, all delete!(spaces_completed, our_space) continue end - our_proc = rand(our_space_procs) + our_proc = if Dagger.current_acceleration() isa Dagger.MPIAcceleration + first(sort(collect(our_space_procs), by=Dagger.short_name)) + else + rand(our_space_procs) + end break end diff --git 
a/src/mpi.jl b/src/mpi.jl index adad2e4ec..623d5d16a 100644 --- a/src/mpi.jl +++ b/src/mpi.jl @@ -367,7 +367,7 @@ end const DEADLOCK_DETECT = TaskLocalValue{Bool}(()->true) const DEADLOCK_WARN_PERIOD = TaskLocalValue{Float64}(()->10.0) -const DEADLOCK_TIMEOUT_PERIOD = TaskLocalValue{Float64}(()->60.0) +const DEADLOCK_TIMEOUT_PERIOD = TaskLocalValue{Float64}(()->600.0) const RECV_WAITING = Base.Lockable(Dict{Tuple{MPI.Comm, Int, Int}, Base.Event}()) struct InplaceInfo diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index 8ab9dcdc4..442472692 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -532,6 +532,25 @@ struct ScheduleTaskSpec est_alloc_util::UInt64 est_occupancy::UInt32 end + +"Ordering key for task locations when using MPI acceleration (deterministic across ranks)." +function _mpi_fire_order_key(loc::ScheduleTaskLocation) + g = loc.gproc + p = loc.proc + g_rank = g isa Union{Dagger.MPIOSProc, Dagger.MPIProcessor} ? g.rank : root_worker_id(g) + p_rank = p isa Union{Dagger.MPIOSProc, Dagger.MPIProcessor} ? p.rank : root_worker_id(p) + return (g_rank, p_rank) +end + +"Ordering key for a single Processor when using MPI acceleration (deterministic across ranks)." +function _mpi_proc_rank(proc::Processor) + g = get_parent(proc) + p = proc + g_rank = g isa Union{Dagger.MPIOSProc, Dagger.MPIProcessor} ? g.rank : root_worker_id(g) + p_rank = p isa Union{Dagger.MPIOSProc, Dagger.MPIProcessor} ? p.rank : root_worker_id(p) + return (g_rank, p_rank) +end + @reuse_scope function schedule!(ctx, state, sch_options, procs=procs_to_use(ctx, sch_options)) lock(state.lock) do safepoint(state) @@ -688,14 +707,17 @@ end # Fire all newly-scheduled tasks (owner/local first, then by fire_order_key to avoid MPI execute! deadlock) @label fire_tasks task_locs = collect(keys(to_fire)) + if Dagger.current_acceleration() isa Dagger.MPIAcceleration + sort!(task_locs, by=_mpi_fire_order_key) + end rank = try M = parentmodule(@__MODULE__) (isdefined(M, :MPI) && M.MPI.Initialized()) ? 
Int(M.MPI.Comm_rank(M.MPI.COMM_WORLD)) : nothing catch nothing end - Core.println("fire order rank=", rank, " task_locs=", task_locs) - for task_loc in task_locs + for (i, task_loc) in enumerate(task_locs) + #Core.println("fire_order rank=", rank, " [", i, "/", length(task_locs), "] task_loc=", task_loc) fire_tasks!(ctx, task_loc, to_fire[task_loc], state) end to_fire_cleanup() @@ -1141,12 +1163,15 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re # Try to steal a task @maybelog ctx timespan_start(ctx, :proc_steal_local, (;uid, worker=wid, processor=to_proc), nothing) - # Try to steal from local queues randomly + # Try to steal from local queues randomly (deterministic order when MPI to avoid deadlocks) # TODO: Prioritize stealing from busiest processors states = proc_states_values(uid) - # TODO: Try to pre-allocate this - P = randperm(length(states)) - for state in getindex.(Ref(states), P) + order = if Dagger.current_acceleration() isa Dagger.MPIAcceleration + sort(1:length(states), by=i->_mpi_proc_rank(states[i].state.proc)) + else + randperm(length(states)) + end + for state in getindex.(Ref(states), order) other_istate = state.state if other_istate.proc === to_proc continue @@ -1355,11 +1380,15 @@ function do_tasks(to_proc, return_queue, tasks) end notify(istate.reschedule) - # Kick other processors to make them steal + # Kick other processors to make them steal (deterministic order when MPI to avoid deadlocks) # TODO: Alternatively, automatically balance work instead of blindly enqueueing states = proc_states_values(uid) - P = randperm(length(states)) - for other_state in getindex.(Ref(states), P) + order = if Dagger.current_acceleration() isa Dagger.MPIAcceleration + sort(1:length(states), by=i->_mpi_proc_rank(states[i].state.proc)) + else + randperm(length(states)) + end + for other_state in getindex.(Ref(states), order) other_istate = other_state.state if other_istate.proc === to_proc continue @@ -1477,11 +1506,13 @@ 
Executes a single task specified by `task` on `to_proc`. #= FIXME: This isn't valid if x is written to x = if x isa Chunk value = lock(TASK_SYNC) do - if haskey(CHUNK_CACHE, x) - Some{Any}(get!(CHUNK_CACHE[x], to_proc) do - # Convert from cached value - # TODO: Choose "closest" processor of same type first - some_proc = first(keys(CHUNK_CACHE[x])) + if haskey(CHUNK_CACHE, x) + Some{Any}(get!(CHUNK_CACHE[x], to_proc) do + # Convert from cached value + # TODO: Choose "closest" processor of same type first + cache_procs = keys(CHUNK_CACHE[x]) + some_proc = Dagger.current_acceleration() isa Dagger.MPIAcceleration ? + minimum(cache_procs, by=_mpi_proc_rank) : first(cache_procs) some_x = CHUNK_CACHE[x][some_proc] @dagdebug thunk_id :move "Cache hit for argument $id at $some_proc: $some_x" @invokelatest move(some_proc, to_proc, some_x) diff --git a/src/sch/util.jl b/src/sch/util.jl index b28222a13..eee360c3b 100644 --- a/src/sch/util.jl +++ b/src/sch/util.jl @@ -590,12 +590,14 @@ end end chunks_cleanup() - # Shuffle procs around, so equally-costly procs are equally considered + # Shuffle procs around, so equally-costly procs are equally considered (skip shuffle when MPI for deterministic tie-breaking) np = length(procs) @reusable :estimate_task_costs_P Vector{Int} 0 4 np P begin resize!(P, np) copyto!(P, 1:np) - randperm!(P) + if !(Dagger.current_acceleration() isa Dagger.MPIAcceleration) + randperm!(P) + end for idx in 1:np sorted_procs[idx] = procs[P[idx]] end From d0b0c7100f5bda5a3248aa1d435a2028b091a108 Mon Sep 17 00:00:00 2001 From: Super User Date: Thu, 12 Mar 2026 10:27:49 -0300 Subject: [PATCH 22/24] MPI benchmarks and matmul correctness: Float32, 10k, per-block check - benchmarks/run_matmul.jl: Float32, N=10k, relative error + per-block report - benchmarks/run_distribute_fetch.jl, run_qr.jl, check_comm_asymmetry (jl/py) - src: alloc, darray, mul, mpi, options, Sch, submission, thunk, tochunk, dagdebug Made-with: Cursor --- benchmarks/check_comm_asymmetry.jl | 111 
++++++++++++++++++++++ benchmarks/check_comm_asymmetry.py | 97 +++++++++++++++++++ benchmarks/run_distribute_fetch.jl | 42 ++++++++ benchmarks/run_matmul.jl | 107 +++++++++++++++++++++ benchmarks/run_qr.jl | 46 +++++++++ src/array/alloc.jl | 21 +++- src/array/darray.jl | 43 ++++++++- src/array/mul.jl | 56 +++++++---- src/mpi.jl | 148 +++++++++++++++++++++-------- src/options.jl | 2 + src/sch/Sch.jl | 16 +++- src/submission.jl | 28 +++++- src/thunk.jl | 8 ++ src/tochunk.jl | 17 +++- src/utils/dagdebug.jl | 5 +- 15 files changed, 677 insertions(+), 70 deletions(-) create mode 100644 benchmarks/check_comm_asymmetry.jl create mode 100644 benchmarks/check_comm_asymmetry.py create mode 100644 benchmarks/run_distribute_fetch.jl create mode 100644 benchmarks/run_matmul.jl create mode 100644 benchmarks/run_qr.jl diff --git a/benchmarks/check_comm_asymmetry.jl b/benchmarks/check_comm_asymmetry.jl new file mode 100644 index 000000000..684240ec5 --- /dev/null +++ b/benchmarks/check_comm_asymmetry.jl @@ -0,0 +1,111 @@ +#!/usr/bin/env julia +# Parse MPI+Dagger logs and report communication decision asymmetry per tag. +# Asymmetry: for the same tag, one rank decides to send (local+bcast, sender+communicated, etc.) +# and another rank decides to infer (inferred, uninvolved) and never recv → deadlock. +# +# Usage: julia check_comm_asymmetry.jl < logfile +# Or: mpiexec -n 10 julia ... run_matmul.jl 2>&1 | tee matmul.log; julia check_comm_asymmetry.jl < matmul.log + +const SEND_DECISIONS = Set([ + "local+bcast", "sender+communicated", "sender+inferred", "receiver+bcast", + "aliasing", # when followed by local+bcast we already capture local+bcast +]) +const RECV_DECISIONS = Set([ + "communicated", "receiver", "sender+communicated", # received data +]) +const INFER_DECISIONS = Set([ + "inferred", "uninvolved", # did not recv (uses inferred type) +]) + +function parse_line(line) + # Match [rank X][tag Y] then any [...] 
and capture the last bracket pair before space or end + rank = nothing + tag = nothing + decision = nothing + category = nothing # aliasing, execute!, remotecall_endpoint + for m in eachmatch(r"\[rank\s+(\d+)\]", line) + rank = parse(Int, m.captures[1]) + end + for m in eachmatch(r"\[tag\s+(\d+)\]", line) + tag = parse(Int, m.captures[1]) + end + for m in eachmatch(r"\[(execute!|aliasing|remotecall_endpoint)\]", line) + category = m.captures[1] + end + # Decision is usually in last [...] that looks like [word] or [word+word] + for m in eachmatch(r"\]\[([^\]]+)\]", line) + candidate = m.captures[1] + # Normalize: "communicated" "inferred" "local+bcast" "sender+inferred" "receiver" etc. + if occursin("inferred", candidate) && !occursin("communicated", candidate) + decision = "inferred" + break + elseif occursin("communicated", candidate) + decision = "communicated" + break + elseif occursin("local+bcast", candidate) + decision = "local+bcast" + break + elseif occursin("sender+", candidate) + decision = startswith(candidate, "sender+inferred") ? "sender+inferred" : "sender+communicated" + break + elseif candidate == "receiver" + decision = "receiver" + break + elseif candidate == "receiver+bcast" + decision = "receiver+bcast" + break + elseif candidate == "inplace_move" + decision = "inplace_move" + break + end + end + return rank, tag, category, decision +end + +function main() + # tag => Dict(rank => decision) + by_tag = Dict{Int, Dict{Int, String}}() + for line in eachline(stdin) + rank, tag, category, decision = parse_line(line) + isnothing(rank) && continue + isnothing(tag) && continue + isnothing(decision) && continue + if !haskey(by_tag, tag) + by_tag[tag] = Dict{Int, String}() + end + by_tag[tag][rank] = decision + end + + # For each tag, check: is there at least one sender and one inferrer (non-receiver)? 
+ send_keys = Set(["local+bcast", "sender+communicated", "sender+inferred", "receiver+bcast"]) + infer_keys = Set(["inferred", "sender+inferred"]) # sender+inferred means sender didn't need to recv + recv_keys = Set(["communicated", "receiver", "sender+communicated"]) + + asymmetries = [] + for (tag, ranks) in sort(collect(by_tag), by = first) + senders = [r for (r, d) in ranks if d in send_keys] + inferrers = [r for (r, d) in ranks if d in infer_keys || d == "uninvolved"] + receivers = [r for (r, d) in ranks if d in recv_keys] + # Asymmetry: someone sends (bcast) so will send to ALL other ranks; someone chose infer and won't recv. + if !isempty(senders) && !isempty(inferrers) + push!(asymmetries, (tag, senders, inferrers, receivers, ranks)) + end + end + + if isempty(asymmetries) + println("No communication decision asymmetry found (no tag has both sender and inferrer).") + return + end + + println("=== Communication decision asymmetry (can cause deadlock) ===\n") + for (tag, senders, inferrers, receivers, ranks) in asymmetries + println("Tag $tag:") + println(" Senders (will bcast to all others): $senders") + println(" Inferrers (did not recv): $inferrers") + println(" Receivers: $receivers") + println(" All decisions: $ranks") + println() + end +end + +main() diff --git a/benchmarks/check_comm_asymmetry.py b/benchmarks/check_comm_asymmetry.py new file mode 100644 index 000000000..31a117442 --- /dev/null +++ b/benchmarks/check_comm_asymmetry.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +""" +Parse MPI+Dagger logs and report communication decision asymmetry per tag. +Asymmetry: for the same tag, one rank decides to send (local+bcast, etc.) +and another decides to infer (inferred) and never recv → deadlock. 
+ +Usage: + # Capture full log (all ranks' Core.println from mpi.jl go to stdout): + mpiexec -n 10 julia --project=/path/to/Dagger.jl benchmarks/run_matmul.jl 2>&1 | tee matmul.log + # Then look for asymmetry (same tag: one rank sends, another infers → deadlock): + python3 check_comm_asymmetry.py < matmul.log +""" + +import re +import sys +from collections import defaultdict + +SEND_DECISIONS = {"local+bcast", "sender+communicated", "sender+inferred", "receiver+bcast"} +RECV_DECISIONS = {"communicated", "receiver", "sender+communicated"} +INFER_DECISIONS = {"inferred", "uninvolved", "sender+inferred"} + + +def parse_line(line: str): + rank = tag = category = decision = None + m = re.search(r"\[rank\s+(\d+)\]", line) + if m: + rank = int(m.group(1)) + m = re.search(r"\[tag\s+(\d+)\]", line) + if m: + tag = int(m.group(1)) + m = re.search(r"\[(execute!|aliasing|remotecall_endpoint)\]", line) + if m: + category = m.group(1) + # Capture decision from [...] blocks + for m in re.finditer(r"\]\[([^\]]+)\]", line): + candidate = m.group(1) + if "inferred" in candidate and "communicated" not in candidate: + decision = "inferred" + break + if "communicated" in candidate: + decision = "communicated" + break + if "local+bcast" in candidate: + decision = "local+bcast" + break + if candidate.startswith("sender+"): + decision = "sender+inferred" if "inferred" in candidate else "sender+communicated" + break + if candidate == "receiver": + decision = "receiver" + break + if candidate == "receiver+bcast": + decision = "receiver+bcast" + break + if candidate == "inplace_move": + decision = "inplace_move" + break + return rank, tag, category, decision + + +def main(): + by_tag = defaultdict(dict) # tag -> {rank: decision} + for line in sys.stdin: + rank, tag, category, decision = parse_line(line) + if rank is None or tag is None or decision is None: + continue + by_tag[tag][rank] = decision + + send_keys = {"local+bcast", "sender+communicated", "sender+inferred", "receiver+bcast"} + 
infer_keys = {"inferred", "sender+inferred", "uninvolved"} + recv_keys = {"communicated", "receiver", "sender+communicated"} + + asymmetries = [] + for tag in sorted(by_tag.keys()): + ranks = by_tag[tag] + senders = [r for r, d in ranks.items() if d in send_keys] + inferrers = [r for r, d in ranks.items() if d in infer_keys] + receivers = [r for r, d in ranks.items() if d in recv_keys] + if senders and inferrers: + asymmetries.append((tag, senders, inferrers, receivers, ranks)) + + if not asymmetries: + print("No communication decision asymmetry found (no tag has both sender and inferrer).") + return + + print("=== Communication decision asymmetry (can cause deadlock) ===\n") + for tag, senders, inferrers, receivers, ranks in asymmetries: + print(f"Tag {tag}:") + print(f" Senders (will bcast to all others): {senders}") + print(f" Inferrers (did not recv): {inferrers}") + print(f" Receivers: {receivers}") + print(f" All decisions: {dict(ranks)}") + print() + + +if __name__ == "__main__": + main() diff --git a/benchmarks/run_distribute_fetch.jl b/benchmarks/run_distribute_fetch.jl new file mode 100644 index 000000000..822e1ad2c --- /dev/null +++ b/benchmarks/run_distribute_fetch.jl @@ -0,0 +1,42 @@ +#!/usr/bin/env julia +# Create a matrix with a fixed reproducible pattern, distribute it with an +# MPI procgrid, then on each rank fetch and println the chunk(s) it owns. +# Usage (from repo root, use full path to Dagger.jl): +# mpiexec -n 4 julia --project=/path/to/Dagger.jl benchmarks/run_distribute_fetch.jl + +using MPI +using Dagger + +if !isdefined(Dagger, :accelerate!) + error("Dagger.accelerate! not found. 
Run with the local Dagger project: julia --project=/path/to/Dagger.jl ...") +end +Dagger.accelerate!(:mpi) + +const comm = MPI.COMM_WORLD +const rank = MPI.Comm_rank(comm) +const nranks = MPI.Comm_size(comm) + +# Fixed reproducible pattern: 6×6 matrix, M[i,j] = 10*i + j (same on all ranks) +const N = 6 +const BLOCK = 2 +A = [10 * i + j for i in 1:N, j in 1:N] + +# Procgrid: use Dagger's compatible processors so the procgrid passes validation +availprocs = collect(Dagger.compatible_processors()) +nblocks = (cld(N, BLOCK), cld(N, BLOCK)) +procgrid = reshape( + [availprocs[mod(i - 1, length(availprocs)) + 1] for i in 1:prod(nblocks)], + nblocks, +) + +# Distribute so chunk (i,j) is computed on procgrid[i,j] +D = distribute(A, Blocks(BLOCK, BLOCK), procgrid) +D_fetched = fetch(D) + +# On each rank: fetch and print only the chunk(s) this rank owns +for (idx, ch) in enumerate(D_fetched.chunks) + if ch isa Dagger.Chunk && ch.handle isa Dagger.MPIRef && ch.handle.rank == rank + data = fetch(ch) + println("rank $rank chunk $idx: ", data) + end +end diff --git a/benchmarks/run_matmul.jl b/benchmarks/run_matmul.jl new file mode 100644 index 000000000..c455a22d9 --- /dev/null +++ b/benchmarks/run_matmul.jl @@ -0,0 +1,107 @@ +#!/usr/bin/env julia +# N×N matmul benchmark (Float32); block size scales with number of ranks. +# Usage (use the full path to Dagger.jl, not "..."): +# mpiexec -n 10 julia --project=/home/felipetome/dagger-dev/mpi/Dagger.jl benchmarks/run_matmul.jl +# Set CHECK_CORRECTNESS=true to collect and compare against GPU baseline: +# CHECK_CORRECTNESS=true mpiexec -n 10 julia --project=/home/felipetome/dagger-dev/mpi/Dagger.jl benchmarks/run_matmul.jl + +using MPI +using Dagger +using LinearAlgebra + +if !isdefined(Dagger, :accelerate!) + error("Dagger.accelerate! not found. 
Run with the local Dagger project: julia --project=/path/to/Dagger.jl ...") +end +Dagger.accelerate!(:mpi) + +const N = 10_000 +const comm = MPI.COMM_WORLD +const rank = MPI.Comm_rank(comm) +const nranks = MPI.Comm_size(comm) +# Block size proportional to ranks: ~nranks blocks in 2D => side blocks ≈ √nranks +const BLOCK = max(1, ceil(Int, N / ceil(Int, sqrt(nranks)))) + +const CHECK_CORRECTNESS = parse(Bool, get(ENV, "CHECK_CORRECTNESS", "false")) + +if rank == 0 + println("Benchmark: ", nranks, " ranks, N=", N, ", block size ", BLOCK, "×", BLOCK, " (matmul)") +end + +# Allocate and fill matrices in blocks (Float32) +A = rand(Blocks(BLOCK, BLOCK), Float32, N, N) +B = rand(Blocks(BLOCK, BLOCK), Float32, N, N) + +# Matrix multiply C = A * B +t_matmul = @elapsed begin + C = A * B +end + +if rank == 0 + println("Matmul time: ", round(t_matmul; digits=4), " s") +end + +# Optional: collect via datadeps (root=0). All ranks participate in the datadeps region. +if CHECK_CORRECTNESS + t_collect = @elapsed begin + A_full = Dagger.collect_datadeps(A; root=0) + B_full = Dagger.collect_datadeps(B; root=0) + C_dagger = Dagger.collect_datadeps(C; root=0) + end + if rank == 0 + println("Collecting result and computing baseline for correctness check (GPU)...") + using CUDA + CUDA.functional() || error("CUDA not functional; cannot compute GPU baseline. Check CUDA driver and device.") + t_upload = @elapsed begin + A_g = CUDA.cu(A_full) + B_g = CUDA.cu(B_full) + C_dagger_g = CUDA.cu(C_dagger) + end + println("Collect + upload time: ", round(t_collect + t_upload; digits=4), " s") + + t_baseline = @elapsed begin + C_ref_g = A_g * B_g + end + println("Baseline (GPU/CUDA) time: ", round(t_baseline; digits=4), " s") + + rtol = 1f-5 + atol = 1f-6 + err = norm(C_dagger_g - C_ref_g) + ref_norm = norm(C_ref_g) + rel_err = ref_norm > 0 ? 
err / ref_norm : err + ok = err <= atol + rtol * ref_norm + if ok + println("Correctness: OK (rel_err = ", Float32(rel_err), ", abs_err = ", Float32(err), ")") + else + println("Correctness: FAIL (rel_err = ", Float32(rel_err), ", abs_err = ", Float32(err), ", rtol=$rtol, atol=$atol)") + end + + # Per-block analysis: which sections exceed tolerance (same block size as Dagger layout) + C_dagger_cpu = Array(C_dagger_g) + C_ref_cpu = Array(C_ref_g) + n_bi = ceil(Int, N / BLOCK) + n_bj = ceil(Int, N / BLOCK) + bad_blocks = Tuple{Int,Int,Float32,Float32}[] + for bi in 1:n_bi, bj in 1:n_bj + ri = (bi - 1) * BLOCK + 1 : min(bi * BLOCK, N) + rj = (bj - 1) * BLOCK + 1 : min(bj * BLOCK, N) + diff_block = @view(C_dagger_cpu[ri, rj]) .- @view(C_ref_cpu[ri, rj]) + ref_block = @view(C_ref_cpu[ri, rj]) + block_err = norm(diff_block) + block_ref = norm(ref_block) + block_rel = block_ref > 0 ? block_err / block_ref : block_err + if block_err > atol + rtol * block_ref + push!(bad_blocks, (bi, bj, Float32(block_rel), Float32(block_err))) + end + end + if isempty(bad_blocks) + println("Per-block: all ", n_bi * n_bj, " blocks within tolerance.") + else + println("Per-block: ", length(bad_blocks), " block(s) exceed tolerance (block size ", BLOCK, "×", BLOCK, "):") + sort!(bad_blocks; by = x -> -x[3]) + for (bi, bj, brel, babs) in bad_blocks + println(" block [", bi, ",", bj, "] rows ", (bi - 1) * BLOCK + 1, ":", min(bi * BLOCK, N), + ", cols ", (bj - 1) * BLOCK + 1, ":", min(bj * BLOCK, N), " rel_err = ", brel, " abs_err = ", babs) + end + end + end +end diff --git a/benchmarks/run_qr.jl b/benchmarks/run_qr.jl new file mode 100644 index 000000000..c5915db2a --- /dev/null +++ b/benchmarks/run_qr.jl @@ -0,0 +1,46 @@ +#!/usr/bin/env julia +# 10k×10k QR + matmul benchmark; block size scales with number of ranks. +# Usage: mpiexec -n 100 julia --project=/path/to/Dagger.jl benchmarks/bench_100rank_qr_matmul.jl +# Or: bash benchmarks/run_100rank_qr_matmul.sh . 
+ +using MPI +using Dagger +using LinearAlgebra + +Dagger.accelerate!(:mpi) + +const N = 10_000 +const comm = MPI.COMM_WORLD +const rank = MPI.Comm_rank(comm) +const nranks = MPI.Comm_size(comm) +# Block size proportional to ranks: ~nranks blocks in 2D => side blocks ≈ √nranks +const BLOCK = max(1, ceil(Int, N / ceil(Int, sqrt(nranks)))) + +if rank == 0 + println("Benchmark: ", nranks, " ranks, N=", N, ", block size ", BLOCK, "×", BLOCK, " (QR + matmul)") +end + +# Allocate and fill 10k×10k matrix in 1k×1k blocks +A = rand(Blocks(BLOCK, BLOCK), Float64, N, N) +MPI.Barrier(comm) + +# QR factorization (computing Q runs the full factorization) +t_qr = @elapsed begin + qr!(A) +end +MPI.Barrier(comm) + +if rank == 0 + println("QR time: ", round(t_qr; digits=4), " s") +end + +# Matrix multiply A * A +t_matmul = @elapsed begin + C = A * A +end +MPI.Barrier(comm) + +if rank == 0 + println("Matmul time: ", round(t_matmul; digits=4), " s") +end + diff --git a/src/array/alloc.jl b/src/array/alloc.jl index fe92ae1e1..0c45443da 100644 --- a/src/array/alloc.jl +++ b/src/array/alloc.jl @@ -93,14 +93,31 @@ function stage(ctx, A::AllocateArray) scope = ExactScope(A.procgrid[CartesianIndex(mod1.(Tuple(I), size(A.procgrid))...)]) end + N = ndims(A.domainchunks) + ret_type = Array{A.eltype, N} if A.want_index - task = Dagger.@spawn compute_scope=scope allocate_array(A.f, A.eltype, i, size(x)) + task = Dagger.@spawn compute_scope=scope return_type=ret_type allocate_array(A.f, A.eltype, i, size(x)) else - task = Dagger.@spawn compute_scope=scope allocate_array(A.f, A.eltype, size(x)) + task = Dagger.@spawn compute_scope=scope return_type=ret_type allocate_array(A.f, A.eltype, size(x)) end tasks[i] = task end end + # MPI type propagation: ensure all ranks know the concrete chunk types + accel = Dagger.current_acceleration() + if accel isa Dagger.MPIAcceleration + N = ndims(A.domainchunks) + expected_type = Array{A.eltype, N} + Dagger.mpi_propagate_chunk_types!(tasks, accel, 
expected_type) + # Log chunk types per rank after array creation + rank = MPI.Comm_rank(accel.comm) + #=chunk_types = Type[chunktype(t) for t in tasks] + if allequal(chunk_types) + @info "[rank $rank] Array creation (alloc): all $(length(chunk_types)) chunk types are uniform: $(first(chunk_types))" + else + @warn "[rank $rank] Array creation (alloc): chunk types are NOT uniform: $chunk_types" + end=# + end return DArray(A.eltype, A.domain, A.domainchunks, tasks, A.partitioning) end diff --git a/src/array/darray.jl b/src/array/darray.jl index 5e95adf94..7f22d6f88 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -1,7 +1,7 @@ import Base: ==, fetch, length, isempty, size export DArray, DVector, DMatrix, DVecOrMat, Blocks, AutoBlocks -export distribute +export distribute, collect_datadeps ###### Array Domains ###### @@ -175,6 +175,7 @@ domain(d::DArray) = d.domain chunks(d::DArray) = d.chunks domainchunks(d::DArray) = d.subdomains size(x::DArray) = size(domain(x)) +Base.ndims(d::DArray{T,N}) where {T,N} = N stage(ctx, c::DArray) = c function Base.collect(d::DArray{T,N}; tree=false, copyto=false) where {T,N} @@ -201,6 +202,31 @@ function Base.collect(d::DArray{T,N}; tree=false, copyto=false) where {T,N} collect(treereduce_nd(dimcatfuncs, asyncmap(fetch, a.chunks))) end end + +""" + collect_datadeps(d::DArray; root=nothing) + +Collect a DArray to a single array by fetching every chunk on the current rank +and assembling into a full array. No datadeps scheduling or root-only assembly: +each rank that calls this gets the full matrix (useful when correctness matters +more than communication cost). +""" +function collect_datadeps(d::DArray{T,N}; root=nothing) where {T,N} + if isempty(d.chunks) + return Array{eltype(d)}(undef, size(d)...) 
+ end + if N == 0 + return fetch(d.chunks[1]) + end + + chks = d.chunks + doms = domainchunks(d) + out = Array{T,N}(undef, size(d)) + for I in CartesianIndices(chks) + copyto!(view(out, indexes(doms[I])...), fetch(chks[I])) + end + return out +end Array{T,N}(A::DArray{S,N}) where {T,N,S} = convert(Array{T,N}, collect(A)) Base.wait(A::DArray) = foreach(wait, A.chunks) @@ -482,6 +508,21 @@ function stage(ctx::Context, d::Distribute) Dagger.@spawn compute_scope=scope identity(d.data[c]) end end + # MPI type propagation: ensure all ranks know the concrete chunk types + accel = Dagger.current_acceleration() + if accel isa Dagger.MPIAcceleration + N = Base.ndims(d.data) + expected_type = Array{eltype(d.data), N} + Dagger.mpi_propagate_chunk_types!(cs, accel, expected_type) + # Log chunk types per rank after array creation + rank = MPI.Comm_rank(accel.comm) + #=chunk_types = Type[chunktype(t) for t in cs] + if allequal(chunk_types) + @info "[rank $rank] Array creation (distribute): all $(length(chunk_types)) chunk types are uniform: $(first(chunk_types))" + else + @warn "[rank $rank] Array creation (distribute): chunk types are NOT uniform: $chunk_types" + end=# + end return DArray(eltype(d.data), domain(d.data), d.domainchunks, diff --git a/src/array/mul.jl b/src/array/mul.jl index 02b207641..5890473da 100644 --- a/src/array/mul.jl +++ b/src/array/mul.jl @@ -41,7 +41,7 @@ function LinearAlgebra.generic_matmatmul!( return gemm_dagger!(C, transA, transB, A, B, alpha, beta) end end -function _repartition_matmatmul(C, A, B, transA::Char, transB::Char) +function _repartition_matmatmul(C, A, B, transA::Char, transB::Char)::Tuple{Blocks{2}, Blocks{2}, Blocks{2}} partA = A.partitioning.blocksize partB = B.partitioning.blocksize istransA = transA == 'T' || transA == 'C' @@ -93,6 +93,24 @@ function _repartition_matmatmul(C, A, B, transA::Char, transB::Char) return Blocks(partC...), Blocks(partA...), Blocks(partB...) 
end +# Typed BLAS wrappers so that every @spawn kernel has an inferable return type +@inline function _gemm!(transA::Char, transB::Char, alpha::T, A, B, mzone, C)::Matrix{T} where {T} + BLAS.gemm!(transA, transB, alpha, A, B, mzone, C) + return C +end +@inline function _syrk!(uplo::AbstractChar, trans::AbstractChar, alpha::T, A, mzone, C)::Matrix{T} where {T} + BLAS.syrk!(uplo, trans, alpha, A, mzone, C) + return C +end +@inline function _herk!(uplo::AbstractChar, trans::AbstractChar, alpha::Real, A, mzone, C)::Matrix{<:Complex} + BLAS.herk!(uplo, trans, alpha, A, mzone, C) + return C +end +@inline function _gemv!(transA::Char, alpha::T, A, x, mzone, y)::Vector{T} where {T} + BLAS.gemv!(transA, alpha, A, x, mzone, y) + return y +end + """ Performs one of the matrix-matrix operations @@ -136,7 +154,7 @@ function gemm_dagger!( # A: NoTrans / B: NoTrans for k in range(1, Ant) mzone = k == 1 ? beta : T(1.0) - Dagger.@spawn BLAS.gemm!( + Dagger.@spawn _gemm!( transA, transB, alpha, @@ -150,7 +168,7 @@ function gemm_dagger!( # A: NoTrans / B: [Conj]Trans for k in range(1, Ant) mzone = k == 1 ? beta : T(1.0) - Dagger.@spawn BLAS.gemm!( + Dagger.@spawn _gemm!( transA, transB, alpha, @@ -166,7 +184,7 @@ function gemm_dagger!( # A: [Conj]Trans / B: NoTrans for k in range(1, Amt) mzone = k == 1 ? beta : T(1.0) - Dagger.@spawn BLAS.gemm!( + Dagger.@spawn _gemm!( transA, transB, alpha, @@ -180,7 +198,7 @@ function gemm_dagger!( # A: [Conj]Trans / B: [Conj]Trans for k in range(1, Amt) mzone = k == 1 ? beta : T(1.0) - Dagger.@spawn BLAS.gemm!( + Dagger.@spawn _gemm!( transA, transB, alpha, @@ -243,7 +261,7 @@ function syrk_dagger!( for k in range(1, Ant) mzone = k == 1 ? 
real(beta) : one(real(T)) if iscomplex - Dagger.@spawn BLAS.herk!( + Dagger.@spawn _herk!( uplo, trans, real(alpha), @@ -252,7 +270,7 @@ function syrk_dagger!( InOut(Cc[n, n]), ) else - Dagger.@spawn BLAS.syrk!( + Dagger.@spawn _syrk!( uplo, trans, alpha, @@ -267,7 +285,7 @@ function syrk_dagger!( for m in range(n + 1, Cmt) for k in range(1, Ant) mzone = k == 1 ? beta : one(T) - Dagger.@spawn BLAS.gemm!( + Dagger.@spawn _gemm!( trans, transs, alpha, @@ -283,7 +301,7 @@ function syrk_dagger!( for m in range(n + 1, Cmt) for k in range(1, Ant) mzone = k == 1 ? beta : one(T) - Dagger.@spawn BLAS.gemm!( + Dagger.@spawn _gemm!( trans, transs, alpha, @@ -300,7 +318,7 @@ function syrk_dagger!( for k in range(1, Amt) mzone = k == 1 ? real(beta) : one(real(T)) if iscomplex - Dagger.@spawn BLAS.herk!( + Dagger.@spawn _herk!( uplo, transs, real(alpha), @@ -309,7 +327,7 @@ function syrk_dagger!( InOut(Cc[n, n]), ) else - Dagger.@spawn BLAS.syrk!( + Dagger.@spawn _syrk!( uplo, trans, alpha, @@ -324,7 +342,7 @@ function syrk_dagger!( for m in range(n + 1, Cmt) for k in range(1, Amt) mzone = k == 1 ? beta : one(T) - Dagger.@spawn BLAS.gemm!( + Dagger.@spawn _gemm!( transs, 'N', alpha, @@ -340,7 +358,7 @@ function syrk_dagger!( for m in range(n + 1, Cmt) for k in range(1, Amt) mzone = k == 1 ? 
beta : one(T) - Dagger.@spawn BLAS.gemm!( + Dagger.@spawn _gemm!( transs, 'N', alpha, @@ -393,16 +411,17 @@ end return A end -@inline function copytile!(A, B) +@inline function copytile!(A::AbstractMatrix{T}, B::AbstractMatrix{T})::Nothing where {T} m, n = size(A) C = B' for i = 1:m, j = 1:n A[i, j] = C[i, j] end + return nothing end -@inline function copydiagtile!(A, uplo) +@inline function copydiagtile!(A::AbstractMatrix{T}, uplo::AbstractChar)::Nothing where {T} m, n = size(A) Acpy = copy(A) @@ -417,6 +436,7 @@ end for i = 1:m, j = 1:n A[i, j] = C[i, j] end + return nothing end function LinearAlgebra.generic_matvecmul!( C::DVector{T}, @@ -440,7 +460,7 @@ function LinearAlgebra.generic_matvecmul!( return gemv_dagger!(C, transA, A, B, _alpha, _beta) end end -function _repartition_matvecmul(C, A, B, transA::Char) +function _repartition_matvecmul(C, A, B, transA::Char)::Tuple{Blocks{1}, Blocks{2}, Blocks{1}} partA = A.partitioning.blocksize partB = B.partitioning.blocksize istransA = transA == 'T' || transA == 'C' @@ -495,7 +515,7 @@ function gemv_dagger!( # A: NoTrans for k in range(1, Ant) mzone = k == 1 ? beta : T(1.0) - Dagger.@spawn BLAS.gemv!( + Dagger.@spawn _gemv!( transA, alpha, In(Ac[m, k]), @@ -508,7 +528,7 @@ function gemv_dagger!( # A: [Conj]Trans for k in range(1, Amt) mzone = k == 1 ? beta : T(1.0) - Dagger.@spawn BLAS.gemv!( + Dagger.@spawn _gemv!( transA, alpha, In(Ac[k, m]), diff --git a/src/mpi.jl b/src/mpi.jl index 623d5d16a..09e4d5c1e 100644 --- a/src/mpi.jl +++ b/src/mpi.jl @@ -361,13 +361,15 @@ function tochunk_pset(x, space::MPIMemorySpace; device=nothing, kwargs...) if local_rank != space.rank return MPIRef(space.comm, space.rank, 0, nothing, Mid) else - return MPIRef(space.comm, space.rank, sizeof(x), poolset(x; device, kwargs...), Mid) + # type= is for Chunk metadata only; MemPool.poolset does not accept it + pset_kw = (; (k => v for (k, v) in pairs(kwargs) if k !== :type)...) 
+ return MPIRef(space.comm, space.rank, sizeof(x), poolset(x; device, pset_kw...), Mid) end end const DEADLOCK_DETECT = TaskLocalValue{Bool}(()->true) const DEADLOCK_WARN_PERIOD = TaskLocalValue{Float64}(()->10.0) -const DEADLOCK_TIMEOUT_PERIOD = TaskLocalValue{Float64}(()->600.0) +const DEADLOCK_TIMEOUT_PERIOD = TaskLocalValue{Float64}(()->120.0) const RECV_WAITING = Base.Lockable(Dict{Tuple{MPI.Comm, Int, Int}, Base.Event}()) struct InplaceInfo @@ -748,8 +750,9 @@ function move(src::MPIOSProc, dst::MPIProcessor, x::Chunk) end const MPI_UNIFORM = ScopedValue{Bool}(false) +# When true, move(_, _, MPIRef) uses poolget(; uniform=true) so the owner bcasts and the fetcher recv (e.g. rank 0 collecting). +const FETCH_UNIFORM = ScopedValue{Bool}(true) -@warn "bcast T if return type is not concrete" maxlog=1 function remotecall_endpoint(f, accel::Dagger.MPIAcceleration, from_proc, to_proc, from_space, to_space, data) loc_rank = MPI.Comm_rank(accel.comm) task = DATADEPS_CURRENT_TASK[] @@ -757,48 +760,65 @@ function remotecall_endpoint(f, accel::Dagger.MPIAcceleration, from_proc, to_pro @assert data isa Chunk "Expected Chunk, got $(typeof(data))" space = memory_space(data) tag = to_tag() + type_tag = to_tag() + T = move_type(from_proc.innerProc, to_proc.innerProc, chunktype(data)) + T_new = f !== identity ? 
Base._return_type(f, Tuple{T}) : T + need_bcast = !isconcretetype(T_new) || T_new === Union{} || T_new === Nothing || T_new === Any + if space.rank != from_proc.rank - # If the data is already where it needs to be + # Data is already at destination (to_proc.rank) @assert space.rank == to_proc.rank if space.rank == loc_rank value = poolget(data.handle) data_converted = f(move(from_proc.innerProc, to_proc.innerProc, value)) - return tochunk(data_converted, to_proc, to_space) + T_actual = typeof(data_converted) + if need_bcast + bcast_send_yield(T_actual, accel.comm, to_proc.rank, type_tag) + end + return tochunk(data_converted, to_proc, to_space; type=T_actual) else - T = move_type(from_proc.innerProc, to_proc.innerProc, chunktype(data)) - T_new = f !== identity ? Base._return_type(f, Tuple{T}) : T - @assert isconcretetype(T_new) "Return type inference failed, expected concrete type, got $T -> $T_new" - return tochunk(nothing, to_proc, to_space; type=T_new) + T_actual = need_bcast ? recv_yield(accel.comm, to_proc.rank, type_tag) : T_new + return tochunk(nothing, to_proc, to_space; type=T_actual) end end - # The data is on the source rank + # Data is on the source rank @assert space.rank == from_proc.rank if loc_rank == from_proc.rank == to_proc.rank value = poolget(data.handle) data_converted = f(move(from_proc.innerProc, to_proc.innerProc, value)) - return tochunk(data_converted, to_proc, to_space) - else - if loc_rank == from_proc.rank - value = poolget(data.handle) - data_moved = move(from_proc.innerProc, to_proc.innerProc, value) - Dagger.send_yield(data_moved, accel.comm, to_proc.rank, tag) - # FIXME: This is wrong to take typeof(data_moved), because the type may change - return tochunk(nothing, to_proc, to_space; type=typeof(data_moved)) - elseif loc_rank == to_proc.rank - data_moved = Dagger.recv_yield(accel.comm, from_space.rank, tag) - data_converted = f(move(from_proc.innerProc, to_proc.innerProc, data_moved)) - return tochunk(data_converted, to_proc, 
to_space) - else - T = move_type(from_proc.innerProc, to_proc.innerProc, chunktype(data)) - T_new = f !== identity ? Base._return_type(f, Tuple{T}) : T - @assert isconcretetype(T_new) "Return type inference failed, expected concrete type, got $T -> $T_new" - return tochunk(nothing, to_proc, to_space; type=T_new) + return tochunk(data_converted, to_proc, to_space; type=typeof(data_converted)) + end + + if loc_rank == from_proc.rank + value = poolget(data.handle) + data_moved = move(from_proc.innerProc, to_proc.innerProc, value) + Dagger.send_yield(data_moved, accel.comm, to_proc.rank, tag) + T_actual = need_bcast ? recv_yield(accel.comm, to_proc.rank, type_tag) : T_new + return tochunk(nothing, to_proc, to_space; type=T_actual) + elseif loc_rank == to_proc.rank + data_moved = Dagger.recv_yield(accel.comm, from_space.rank, tag) + data_converted = f(move(from_proc.innerProc, to_proc.innerProc, data_moved)) + T_actual = typeof(data_converted) + if need_bcast + bcast_send_yield(T_actual, accel.comm, to_proc.rank, type_tag) end + return tochunk(data_converted, to_proc, to_space; type=T_actual) + else + T_actual = need_bcast ? recv_yield(accel.comm, to_proc.rank, type_tag) : T_new + return tochunk(nothing, to_proc, to_space; type=T_actual) end end end +# Chunk may be MPI-backed (MPIRef) but labeled with OSProc; treat source as the owning rank +function move(src::OSProc, dst::MPIProcessor, x::Chunk) + if x.handle isa MPIRef + return move(MPIOSProc(x.handle.comm, x.handle.rank), dst, x) + end + error("MPI move not supported") +end + move(src::Processor, dst::MPIProcessor, x::Chunk) = error("MPI move not supported") move(to_proc::MPIProcessor, chunk::Chunk) = move(chunk.processor, to_proc, chunk) @@ -842,20 +862,52 @@ function execute!(proc::MPIProcessor, f, args...; kwargs...) 
end if inplace_move space = memory_space(nothing, proc)::MPIMemorySpace - return tochunk(nothing, proc, space) - else - # Handle commun1ication ourselves - if islocal - T = typeof(result) - space = memory_space(result, proc)::MPIMemorySpace - T_space = (T, space.innerSpace) + # move!(..., to, from): result type is the destination chunk's type + dest_type = length(args) >= 4 && args[4] isa Chunk ? chunktype(args[4]) : Any + return tochunk(nothing, proc, space; type=dest_type) + end + + # Infer return type; only bcast when inference is not concrete + fname = nameof(f) + arg_types = map(chunktype, args) + for (i, a) in enumerate(args) + if arg_types[i] === Nothing + if a === nothing + error("Argument at position $i is the value `nothing` (dependency not resolved on this rank). f=$fname arg_types=$arg_types") + else + error("Argument at position $i has chunktype Nothing. f=$fname arg_types=$arg_types") + end + end + end + inferred_type = Base.promote_op(f, arg_types...) + if (inferred_type === Any || !isconcretetype(inferred_type)) && f === Dagger.allocate_array && length(args) >= 2 + T_el = args[2] + sz = args[end] + if T_el isa Type && isconcretetype(T_el) && sz isa Tuple{Vararg{Integer}} + inferred_type = Array{T_el, length(sz)} + end + end + need_bcast = !isconcretetype(inferred_type) || inferred_type === Union{} || inferred_type === Nothing || inferred_type === Any + if inferred_type === Nothing + error("execute!: inferred type is Nothing. 
f=$fname arg_types=$arg_types") + end + + if islocal + T = typeof(result) + space = memory_space(result, proc)::MPIMemorySpace + if need_bcast @opcounter :execute_bcast_send_yield - bcast_send_yield(T_space, proc.comm, proc.rank, tag) - return tochunk(result, proc, space) - else + bcast_send_yield((T, space.innerSpace), proc.comm, proc.rank, tag) + end + return tochunk(result, proc, space; type=T) + else + if need_bcast T, innerSpace = recv_yield(proc.comm, proc.rank, tag) space = MPIMemorySpace(innerSpace, proc.comm, proc.rank) return tochunk(nothing, proc, space; type=T) + else + space = memory_space(nothing, proc)::MPIMemorySpace + return tochunk(nothing, proc, space; type=inferred_type) end end end @@ -874,6 +926,28 @@ function initialize_acceleration!(a::MPIAcceleration) unique!(ctx.procs) end +""" + mpi_propagate_chunk_types!(tasks, accel::MPIAcceleration, expected_type) + +Ensure all ranks use the same concrete type for the given tasks by setting +each task's options.return_type to expected_type when it is concrete. +This allows chunktype(task) to return the concrete type on every rank +without an MPI allgather of actual result types. 
+""" +function mpi_propagate_chunk_types!(tasks, accel::MPIAcceleration, expected_type) + isconcretetype(expected_type) || return + for t in tasks + if t isa Thunk + if t.options !== nothing + t.options.return_type = expected_type + else + t.options = Options(return_type=expected_type) + end + end + end + return +end + accel_matches_proc(accel::MPIAcceleration, proc::MPIOSProc) = true accel_matches_proc(accel::MPIAcceleration, proc::MPIClusterProc) = true accel_matches_proc(accel::MPIAcceleration, proc::MPIProcessor) = true diff --git a/src/options.jl b/src/options.jl index 1b178df34..09067da51 100644 --- a/src/options.jl +++ b/src/options.jl @@ -70,6 +70,8 @@ Base.@kwdef mutable struct Options stream_max_evals::Union{Int,Nothing} = nothing acceleration::Union{Acceleration,Nothing} = nothing + + return_type::Union{Type,Nothing} = nothing end Options(::Nothing) = Options() function Options(old_options::NamedTuple) diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index 442472692..7c724c0ab 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -1551,11 +1551,19 @@ Executes a single task specified by `task` on `to_proc`. @invokelatest move(to_proc, value) end #end - if new_value !== value - @dagdebug thunk_id :move "Moved argument @ $position to $to_proc: $(typeof(value)) -> $(typeof(new_value))" + # Preserve Chunk reference when move returns nothing (placeholder on this rank). This keeps + # type information correct at all ranks: chunktype(Chunk) is concrete even when Chunk holds no data. + # So execute! sees correct arg_types. Materializing the value (for the kernel) must happen in + # execute! and may require lazy recv from the executor if this rank has a placeholder. 
+ if new_value === nothing && (value isa Dagger.Chunk || value isa Dagger.WeakChunk) + arg.value = value + else + if new_value !== value + @dagdebug thunk_id :move "Moved argument @ $position to $to_proc: $(typeof(value)) -> $(typeof(new_value))" + end + arg.value = new_value end - @maybelog ctx timespan_finish(ctx, :move, (;thunk_id, position, processor=to_proc), (;f, data=new_value); tasks=[Base.current_task()]) - arg.value = new_value + @maybelog ctx timespan_finish(ctx, :move, (;thunk_id, position, processor=to_proc), (;f, data=Dagger.value(arg)); tasks=[Base.current_task()]) return end end diff --git a/src/submission.jl b/src/submission.jl index 4ff4f2294..fffcc577d 100644 --- a/src/submission.jl +++ b/src/submission.jl @@ -285,7 +285,13 @@ function eager_process_args_submission_to_local(id_map, spec::DTaskSpec{true}) return ntuple(i->eager_process_elem_submission_to_local(id_map, spec.fargs[i]), length(spec.fargs)) end -DTaskMetadata(spec::DTaskSpec) = DTaskMetadata(eager_metadata(spec.fargs)) +function DTaskMetadata(spec::DTaskSpec) + rt = spec.options.return_type + if rt !== nothing && isconcretetype(rt) && rt !== Any + return DTaskMetadata(rt) + end + return DTaskMetadata(eager_metadata(spec.fargs)) +end function eager_metadata(fargs) f = value(fargs[1]) f = f isa StreamingFunction ? f.f : f @@ -298,6 +304,10 @@ function eager_spawn(spec::DTaskSpec) uid = eager_next_id() future = ThunkFuture() metadata = DTaskMetadata(spec) + # Propagate inferred return type to options so execute! 
can skip MPI bcast + if isconcretetype(metadata.return_type) + spec.options.return_type = metadata.return_type + end return DTask(uid, future, metadata) end @@ -320,10 +330,16 @@ function eager_launch!(pair::DTaskPair) end end + # Propagate DTask return_type into options so the created Thunk has chunktype for downstream inference + options = spec.options + if isconcretetype(task.metadata.return_type) + options = copy(options) + options.return_type = task.metadata.return_type + end # Submit the task #=FIXME:REALLOC=# thunk_id = eager_submit!(PayloadOne(task.uid, task.future, - fargs, spec.options, true)) + fargs, options, true)) task.thunk_ref = thunk_id.ref end # FIXME: Don't convert Tuple to Vector{Argument} @@ -353,7 +369,13 @@ function eager_launch!(pairs::Vector{DTaskPair}) end end end - all_options = Options[pair.spec.options for pair in pairs] + # Propagate DTask return_type into options so created Thunks have chunktype for downstream inference + all_options = Options[ + let opts = pair.spec.options + isconcretetype(pair.task.metadata.return_type) ? (o = copy(opts); o.return_type = pair.task.metadata.return_type; o) : opts + end + for pair in pairs + ] # Submit the tasks #=FIXME:REALLOC=# diff --git a/src/thunk.jl b/src/thunk.jl index 5008e4a1b..c24e0c329 100644 --- a/src/thunk.jl +++ b/src/thunk.jl @@ -247,6 +247,14 @@ isweak(t) = false Base.show(io::IO, t::WeakThunk) = (print(io, "~"); Base.show(io, t.x.value)) Base.convert(::Type{WeakThunk}, t::Thunk) = WeakThunk(t) chunktype(t::WeakThunk) = chunktype(unwrap_weak_checked(t)) +# Use options.return_type when set (e.g. from mpi_propagate_chunk_types! or eager_metadata) +# so that Thunk arguments propagate type to downstream eager_metadata/execute! 
+function chunktype(t::Thunk) + if t.options !== nothing && t.options.return_type !== nothing && isconcretetype(t.options.return_type) + return t.options.return_type + end + return typeof(t) +end Base.convert(::Type{ThunkSyncdep}, t::WeakThunk) = ThunkSyncdep(nothing, t) ThunkSyncdep(t::WeakThunk) = ThunkSyncdep(nothing, t) diff --git a/src/tochunk.jl b/src/tochunk.jl index 386e1b80f..ff15e426e 100644 --- a/src/tochunk.jl +++ b/src/tochunk.jl @@ -22,6 +22,9 @@ All other kwargs are passed directly to `MemPool.poolset`. tochunk(x::X, proc::P, space::M; kwargs...) where {X,P<:Processor,M<:MemorySpace} = tochunk(x, proc, space, AnyScope(); kwargs...) function tochunk(x::X, proc::P, space::M, scope::S; device=nothing, type=X, rewrap=false, kwargs...) where {X,P<:Processor,S,M<:MemorySpace} + if type === Nothing + throw(ArgumentError("Chunk type cannot be Nothing. Placeholder chunks must be created with an explicit type= (e.g. tochunk(nothing, proc, space; type=Matrix{Float64})). x=$(repr(x))")) + end if x isa Chunk check_proc_space(x, proc, space) return maybe_rewrap(x, proc, space, scope; type, rewrap) @@ -33,7 +36,7 @@ function tochunk(x::X, proc::P, space::M, scope::S; device=nothing, type=X, rewr MemPool.CPURAMDevice() end end - ref = tochunk_pset(x, space; device, kwargs...) + ref = tochunk_pset(x, space; device, type, kwargs...) return Chunk{type,typeof(ref),P,S,typeof(space)}(type, domain(x), ref, proc, scope, space) end # Disambiguate: Chunk-specific 3-arg so kwcall(tochunk, Chunk, Processor, Scope) is not ambiguous with utils/chunks.jl @@ -47,6 +50,9 @@ function tochunk(x::Chunk, proc::P, scope::S; rewrap=false, kwargs...) where {P< end end function tochunk(x::X, proc::P, scope::S; device=nothing, type=X, rewrap=false, kwargs...) where {X,P<:Processor,S} + if type === Nothing + throw(ArgumentError("Chunk type cannot be Nothing. Placeholder chunks must be created with an explicit type= (e.g. tochunk(nothing, proc, space; type=Matrix{Float64})). 
x=$(repr(x))")) + end if device === nothing device = if Sch.walk_storage_safe(x) MemPool.GLOBAL_DEVICE[] @@ -60,10 +66,13 @@ function tochunk(x::X, proc::P, scope::S; device=nothing, type=X, rewrap=false, return maybe_rewrap(x, proc, space, scope; type, rewrap) end space = default_memory_space(current_acceleration(), x) - ref = tochunk_pset(x, space; device, kwargs...) + ref = tochunk_pset(x, space; device, type, kwargs...) return Chunk{type,typeof(ref),P,S,typeof(space)}(type, domain(x), ref, proc, scope, space) end function tochunk(x::X, space::M, scope::S; device=nothing, type=X, rewrap=false, kwargs...) where {X,M<:MemorySpace,S} + if type === Nothing + throw(ArgumentError("Chunk type cannot be Nothing. Placeholder chunks must be created with an explicit type= (e.g. tochunk(nothing, proc, space; type=Matrix{Float64})). x=$(repr(x))")) + end if device === nothing device = if Sch.walk_storage_safe(x) MemPool.GLOBAL_DEVICE[] @@ -77,7 +86,7 @@ function tochunk(x::X, space::M, scope::S; device=nothing, type=X, rewrap=false, return maybe_rewrap(x, proc, space, scope; type, rewrap) end proc = default_processor(current_acceleration(), x) - ref = tochunk_pset(x, space; device, kwargs...) + ref = tochunk_pset(x, space; device, type, kwargs...) return Chunk{type,typeof(ref),typeof(proc),S,M}(type, domain(x), ref, proc, scope, space) end # 2-arg: avoid overwriting utils/chunks.jl's tochunk(Any, Any) and tochunk(Any); only add Processor/MemorySpace variants @@ -105,6 +114,6 @@ function maybe_rewrap(x, proc, space, scope; type, rewrap) end end -tochunk_pset(x, space::MemorySpace; device=nothing, kwargs...) = poolset(x; device, kwargs...) +tochunk_pset(x, space::MemorySpace; device=nothing, type=nothing, kwargs...) = poolset(x; device, kwargs...) 
# savechunk: defined in utils/chunks.jl (fork Chunk has space field; do not duplicate here) diff --git a/src/utils/dagdebug.jl b/src/utils/dagdebug.jl index 873e47e79..678445051 100644 --- a/src/utils/dagdebug.jl +++ b/src/utils/dagdebug.jl @@ -59,4 +59,7 @@ macro opcounter(category, count=1) end end) end -opcounter(mod::Module, category::Symbol) = getfield(mod, Symbol(:OPCOUNTER_, category)).value[] \ No newline at end of file +opcounter(mod::Module, category::Symbol) = getfield(mod, Symbol(:OPCOUNTER_, category)).value[] + +# No-op debug helper for tracking largest values (used alongside @opcounter) +largest_value_update!(::Any) = nothing \ No newline at end of file From 989349dc829954a79c2ce4440cd50ba7075def56 Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Thu, 12 Mar 2026 14:45:28 -0300 Subject: [PATCH 23/24] WIP: MPI inference --- benchmarks/run_matmul.jl | 48 ++++--- matmul.log | 287 +++++++++++++++++++++++++++++++++++++++ src/mpi.jl | 25 +--- 3 files changed, 314 insertions(+), 46 deletions(-) create mode 100644 matmul.log diff --git a/benchmarks/run_matmul.jl b/benchmarks/run_matmul.jl index c455a22d9..0eb4ec0d7 100644 --- a/benchmarks/run_matmul.jl +++ b/benchmarks/run_matmul.jl @@ -14,7 +14,7 @@ if !isdefined(Dagger, :accelerate!) end Dagger.accelerate!(:mpi) -const N = 10_000 +const N = 2_000 const comm = MPI.COMM_WORLD const rank = MPI.Comm_rank(comm) const nranks = MPI.Comm_size(comm) @@ -54,7 +54,6 @@ if CHECK_CORRECTNESS t_upload = @elapsed begin A_g = CUDA.cu(A_full) B_g = CUDA.cu(B_full) - C_dagger_g = CUDA.cu(C_dagger) end println("Collect + upload time: ", round(t_collect + t_upload; digits=4), " s") @@ -63,44 +62,43 @@ if CHECK_CORRECTNESS end println("Baseline (GPU/CUDA) time: ", round(t_baseline; digits=4), " s") - rtol = 1f-5 - atol = 1f-6 - err = norm(C_dagger_g - C_ref_g) - ref_norm = norm(C_ref_g) - rel_err = ref_norm > 0 ? 
err / ref_norm : err - ok = err <= atol + rtol * ref_norm + # Require all elements within 100× machine epsilon relative error (componentwise) + C_dagger_cpu = C_dagger + C_ref_cpu = Array(C_ref_g) + eps_f = eps(Float32) + rtol = 50.0f0 * eps_f + diff = C_dagger_cpu .- C_ref_cpu + # rel_ij = |diff|/|C_ref|, denominator at least eps to avoid div by zero + denom = max.(abs.(C_ref_cpu), eps_f) + rel_err = abs.(diff) ./ denom + max_rel_err = Float32(maximum(rel_err)) + ok = max_rel_err <= rtol if ok - println("Correctness: OK (rel_err = ", Float32(rel_err), ", abs_err = ", Float32(err), ")") + println("Correctness: OK (max rel_err = ", max_rel_err, " <= 100×eps = ", rtol, ")") else - println("Correctness: FAIL (rel_err = ", Float32(rel_err), ", abs_err = ", Float32(err), ", rtol=$rtol, atol=$atol)") + println("Correctness: FAIL (max rel_err = ", max_rel_err, " > 100×eps = ", rtol, ")") end - # Per-block analysis: which sections exceed tolerance (same block size as Dagger layout) - C_dagger_cpu = Array(C_dagger_g) - C_ref_cpu = Array(C_ref_g) + # Per-block: which blocks have any element with rel_err > 100×eps n_bi = ceil(Int, N / BLOCK) n_bj = ceil(Int, N / BLOCK) - bad_blocks = Tuple{Int,Int,Float32,Float32}[] + bad_blocks = Tuple{Int,Int,Float32}[] for bi in 1:n_bi, bj in 1:n_bj ri = (bi - 1) * BLOCK + 1 : min(bi * BLOCK, N) rj = (bj - 1) * BLOCK + 1 : min(bj * BLOCK, N) - diff_block = @view(C_dagger_cpu[ri, rj]) .- @view(C_ref_cpu[ri, rj]) - ref_block = @view(C_ref_cpu[ri, rj]) - block_err = norm(diff_block) - block_ref = norm(ref_block) - block_rel = block_ref > 0 ? 
block_err / block_ref : block_err - if block_err > atol + rtol * block_ref - push!(bad_blocks, (bi, bj, Float32(block_rel), Float32(block_err))) + block_rel = Float32(maximum(@view(rel_err[ri, rj]))) + if block_rel > rtol + push!(bad_blocks, (bi, bj, block_rel)) end end if isempty(bad_blocks) - println("Per-block: all ", n_bi * n_bj, " blocks within tolerance.") + println("Per-block: all ", n_bi * n_bj, " blocks within 100×eps rel_err.") else - println("Per-block: ", length(bad_blocks), " block(s) exceed tolerance (block size ", BLOCK, "×", BLOCK, "):") + println("Per-block: ", length(bad_blocks), " block(s) exceed 100×eps rel_err (block size ", BLOCK, "×", BLOCK, "):") sort!(bad_blocks; by = x -> -x[3]) - for (bi, bj, brel, babs) in bad_blocks + for (bi, bj, block_rel) in bad_blocks println(" block [", bi, ",", bj, "] rows ", (bi - 1) * BLOCK + 1, ":", min(bi * BLOCK, N), - ", cols ", (bj - 1) * BLOCK + 1, ":", min(bj * BLOCK, N), " rel_err = ", brel, " abs_err = ", babs) + ", cols ", (bj - 1) * BLOCK + 1, ":", min(bj * BLOCK, N), " max rel_err = ", block_rel) end end end diff --git a/matmul.log b/matmul.log new file mode 100644 index 000000000..9f8999fbe --- /dev/null +++ b/matmul.log @@ -0,0 +1,287 @@ +Precompiling packages... +Precompiling packages... +Precompiling packages... + Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) + Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) +Precompiling packages... + Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) +Precompiling packages... +Precompiling packages... +Precompiling Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) + packages... 
+ Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) + Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) +Precompiling packages... +Precompiling packages... +Precompiling Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) + packages... + Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) + Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) +Info Given Dagger was explicitly requested, output will be shown live  +┌ Warning: Fix semantics of collect +└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/chunks.jl:30 +┌ Warning: Update tochunk docstring +└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:1 +┌ Warning: Dispatch bcast behavior on acceleration +└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/datadeps/aliasing.jl:229 +┌ Warning: Switch ArgumentWrapper to contain just the argument, and add DependencyWrapper +└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/datadeps/aliasing.jl:254 +┌ Warning: Fix this to work with MPI (can't call poolget on the wrong rank) +└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/datadeps/aliasing.jl:547 +┌ Warning: Document these public methods +└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/datadeps/aliasing.jl:710 +┌ Warning: Don't blindly set occupancy=0, only do for MPI +└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/datadeps/queue.jl:124 +┌ Warning: Is this uniform logic valuable to have? 
+└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:825 +┌ Warning: Precompile failed to clean up all tasks +└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/precompile.jl:21 + 28913.0 ms ✓ Dagger + 28511.1 ms ✓ Dagger + DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) + 28861.7 ms ✓ Dagger + DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) + 28781.8 ms ✓ Dagger + DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) + 28653.0 ms ✓ Dagger + DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) + 29004.3 ms ✓ Dagger + DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) + 28834.6 ms ✓ Dagger + DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) + 29246.5 ms ✓ Dagger + DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) + 29136.4 ms ✓ Dagger + DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) + 29157.1 ms ✓ Dagger + DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) + 2114.3 ms ✓ Dagger → DistributionsExt + 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. 
+ 1 dependency had output during precompilation: +┌ Dagger +│ [Output was shown above] +└ + 1969.6 ms ✓ Dagger → DistributionsExt + 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. + 2155.1 ms ✓ Dagger → DistributionsExt + 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. + 2008.6 ms ✓ Dagger → DistributionsExt + 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. + 1784.2 ms ✓ Dagger → DistributionsExt + 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. + 2051.3 ms ✓ Dagger → DistributionsExt + 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. + 1858.0 ms ✓ Dagger → DistributionsExt + 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. + 2131.2 ms ✓ Dagger → DistributionsExt + 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. + 2485.6 ms ✓ Dagger → DistributionsExt + 2 dependencies successfully precompiled in 32 seconds. 113 already precompiled. + 2240.8 ms ✓ Dagger → DistributionsExt + 2 dependencies successfully precompiled in 32 seconds. 113 already precompiled. 
+Benchmark: 10 ranks, N=2000, block size 500×500 (matmul) +┌ Warning: [rank 4][tag 49] Hit probable hang on recv (dest: 0) +└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:626 +Matmul time: 30.3111 s +ERROR: LoadError: ERROR: LoadError: ERROR: ERROR: LoadError: LoadError: ERROR: ERROR: LoadError: LoadError: ERROR: LoadError: ERROR: LoadError: ERROR: LoadError: ERROR: LoadError: AssertionError: Invalid MPIRefID: tid=0, uid=0, id=0 +Stacktrace: + [1] Dagger.MPIRefID(tid::Int64, uid::Int64, id::Int64) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:291 + [2] take_ref_id!() + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:353 + [3] tochunk_pset(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, space::Dagger.MPIMemorySpace{Dagger.CPURAMMemorySpace}; device::MemPool.CPURAMDevice, kwargs::@Kwargs{type::DataType}) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:360 + [4] tochunk(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, proc::OSProc, scope::AnyScope; device::Nothing, type::Type, rewrap::Bool, kwargs::@Kwargs{}) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:69 + [5] tochunk + @ ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:52 [inlined] + [6] tochunk + @ ~/dagger-dev/mpi/Dagger.jl/src/utils/chunks.jl:155 [inlined] + [7] #691 + @ ./none:-1 [inlined] + [8] iterate + @ ./generator.jl:48 [inlined] + [9] collect(itr::Base.Generator{Dagger.DomainBlocks{2}, Dagger.var"#691#692"{Matrix{Float64}}}) + @ Base ./array.jl:790 + [10] view(A::Matrix{Float64}, p::Blocks{2}) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/array/alloc.jl:221 + [11] #collect_datadeps#548 + @ ~/dagger-dev/mpi/Dagger.jl/src/array/darray.jl:213 [inlined] + [12] macro expansion + @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:46 [inlined] + [13] macro expansion + @ ./timing.jl:461 [inlined] + [14] top-level scope + @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:45 + [15] include(mod::Module, _path::String) + @ 
Base ./Base.jl:306 + [16] exec_options(opts::Base.JLOptions) + @ Base ./client.jl:317 + [17] _start() + @ Base ./client.jl:550 +in expression starting at /home/felipetome/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:44 +AssertionError: Invalid MPIRefID: tid=0, uid=0, id=0 +Stacktrace: + [1] AssertionError: Invalid MPIRefID: tid=0, uid=0, id=0AssertionError: Invalid MPIRefID: tid=0, uid=0, id=0 +Stacktrace: + [1] AssertionError: Invalid MPIRefID: tid=0, uid=0, id=0 +Stacktrace: + [1] +Stacktrace: + [1] AssertionError: Invalid MPIRefID: tid=0, uid=0, id=0 +Stacktrace: + [1] Dagger.MPIRefID(tid::Int64, uid::Int64, id::Int64) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:291 + [2] take_ref_id!() + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:353 + [3] tochunk_pset(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, space::Dagger.MPIMemorySpace{Dagger.CPURAMMemorySpace}; device::MemPool.CPURAMDevice, kwargs::@Kwargs{type::DataType}) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:360 + [4] tochunk(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, proc::OSProc, scope::AnyScope; device::Nothing, type::Type, rewrap::Bool, kwargs::@Kwargs{}) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:69 + [5] tochunk + @ ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:52 [inlined] + [6] tochunk + @ ~/dagger-dev/mpi/Dagger.jl/src/utils/chunks.jl:155 [inlined] + [7] #691 + @ ./none:-1 [inlined] + [8] iterate + @ ./generator.jl:48 [inlined] + [9] collect(itr::Base.Generator{Dagger.DomainBlocks{2}, Dagger.var"#691#692"{Matrix{Float64}}}) + @ Base ./array.jl:790 + [10] view(A::Matrix{Float64}, p::Blocks{2}) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/array/alloc.jl:221 + [11] #collect_datadeps#548 + @ ~/dagger-dev/mpi/Dagger.jl/src/array/darray.jl:213 [inlined] + [12] macro expansion + @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:46 [inlined] + [13] macro expansion + @ ./timing.jl:461 
[inlined] + [14] top-level scope + @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:45 + [15] include(mod::Module, _path::String) + @ Base ./Base.jl:306 + [16] exec_options(opts::Base.JLOptions) + @ Base ./client.jl:317 + [17] _start() + @ Base ./client.jl:550 +in expression starting at /home/felipetome/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:44 +Dagger.MPIRefID(tid::Int64, uid::Int64, id::Int64) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:291 + [2] take_ref_id!() + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:353 + [3] Dagger.MPIRefID(tid::Int64, uid::Int64, id::Int64) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:291 +Dagger.MPIRefID(tid::Int64, uid::Int64, id::Int64) [2] + @ Daggertake_ref_id!() + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:353 + [3] ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:291 + [2] take_ref_id!() + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:353 + [3] tochunk_pset(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, space::Dagger.MPIMemorySpace{Dagger.CPURAMMemorySpace}; device::MemPool.CPURAMDevice, kwargs::@Kwargs{type::DataType}) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:360 + [4] tochunk(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, proc::OSProc, scope::AnyScope; device::Nothing, type::Type, rewrap::Bool, kwargs::@Kwargs{}) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:69 + [5] tochunk + @ ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:52 [inlined] + [6] tochunk + @ ~/dagger-dev/mpi/Dagger.jl/src/utils/chunks.jl:155 [inlined] + [7] #691 + @ ./none:-1 [inlined] + [8] iterate + @ ./generator.jl:48 [inlined] + [9] collect(itr::Base.Generator{Dagger.DomainBlocks{2}, Dagger.var"#691#692"{Matrix{Float64}}}) + @ Base ./array.jl:790 + [10] view(A::Matrix{Float64}, p::Blocks{2}) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/array/alloc.jl:221 + [11] #collect_datadeps#548 + @ ~/dagger-dev/mpi/Dagger.jl/src/array/darray.jl:213 
[inlined] + [12] macro expansion + @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:46 [inlined] + [13] macro expansion + @ ./timing.jl:461 [inlined] + [14] top-level scope + @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:45 + [15] include(mod::Module, _path::String) + @ Base ./Base.jl:306 + [16] exec_options(opts::Base.JLOptions) + @ Base ./client.jl:317 + [17] _start() + @ Base ./client.jl:550 +in expression starting at /home/felipetome/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:44 +tochunk_pset(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, space::Dagger.MPIMemorySpace{Dagger.CPURAMMemorySpace}; device::MemPool.CPURAMDevice, kwargs::@Kwargs{type::DataType}) + @tochunk_pset(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, space::Dagger.MPIMemorySpace{Dagger.CPURAMMemorySpace}; device::MemPool.CPURAMDevice, kwargs::@Kwargs{type::DataType}) + @ Dagger Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:360 + ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:360 + [4] [4] tochunk(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, proc::OSProc, scope::AnyScope; device::Nothing, type::Type, rewrap::Bool, kwargs::@Kwargs{}) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:69 + [5] tochunk + @ ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:52 [inlined] + [6] tochunk + @ ~/dagger-dev/mpi/Dagger.jl/src/utils/chunks.jl:155 [inlined] + [7] #691 + @ ./none:-1 [inlined] + [8] iterate + @ ./generator.jl:48 [inlined] + [9] tochunk(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, proc::OSProc, scope::AnyScope; device::Nothing, type::Type, rewrap::Bool, kwargs::@Kwargs{}) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:69 + [5] tochunk + @ ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:52 [inlined] + [6] tochunk + @ ~/dagger-dev/mpi/Dagger.jl/src/utils/chunks.jl:155 [inlined] + [7] #691 + @ ./none:-1 [inlined] + 
[8] iterate + @ ./generator.jl:48 [inlined] + [9] collect(itr::Base.Generator{Dagger.DomainBlocks{2}, Dagger.var"#691#692"{Matrix{Float64}}}) + @ Base ./array.jl:790 + [10] view(A::Matrix{Float64}, p::Blocks{2}) + @ Daggercollect(itr::Base.Generator{Dagger.DomainBlocks{2}, Dagger.var"#691#692"{Matrix{Float64}}}) ~/dagger-dev/mpi/Dagger.jl/src/array/ +alloc.jl:221 + @ Base [11] #collect_datadeps#548 + @ ./array.jl:790 + ~/dagger-dev/mpi/Dagger.jl/src/array/ [10] darray.jl:213 [inlined] + [12] macro expansion + @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:46 [inlined] + [13] macro expansion + @ ./timing.jl:461 [inlined] + [14] top-level scope + @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:45 + [15] include(mod::Module, _path::String) + @ Base ./Base.jl:306 + [16] exec_options(opts::Base.JLOptions) + @ Base ./client.jl:317 + [17] _start() + @ Base ./client.jl:550 +in expression starting at /home/felipetome/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:44 +view(A::Matrix{Float64}, p::Blocks{2}) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/array/alloc.jl:221 + [11] #collect_datadeps#548 + @ ~/dagger-dev/mpi/Dagger.jl/src/array/darray.jl:213 [inlined] + [12] macro expansion + @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:46 [inlined] + [13] macro expansion + @ ./timing.jl:461 [inlined] + [14] top-level scope + @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:45 + [15] include(mod::Module, _path::String) + @ Base ./Base.jl:306 + [16] exec_options(opts::Base.JLOptions) + @ Base ./client.jl:317 + [17] _start() + @ Base ./client.jl:550 +in expression starting at /home/felipetome/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:44 +Dagger.MPIRefID(tid::Int64, uid::Int64, id::Int64) + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:291 + [2] take_ref_id!() + @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:353 + [3] +=================================================================================== += BAD TERMINATION OF ONE OF YOUR APPLICATION 
PROCESSES += PID 246518 RUNNING AT fedora += EXIT CODE: 9 += CLEANING UP REMAINING PROCESSES += YOU CAN IGNORE THE BELOW CLEANUP MESSAGES +=================================================================================== +YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Killed (signal 9) +This typically refers to a problem with your application. +Please see the FAQ page for debugging suggestions diff --git a/src/mpi.jl b/src/mpi.jl index 09e4d5c1e..1b84a7b9d 100644 --- a/src/mpi.jl +++ b/src/mpi.jl @@ -857,40 +857,23 @@ function execute!(proc::MPIProcessor, f, args...; kwargs...) inplace_move = f === move! result = nothing tag = to_tag() + if islocal || inplace_move result = execute!(proc.innerProc, f, args...; kwargs...) end + if inplace_move space = memory_space(nothing, proc)::MPIMemorySpace - # move!(..., to, from): result type is the destination chunk's type - dest_type = length(args) >= 4 && args[4] isa Chunk ? chunktype(args[4]) : Any + dest_type = chunktype(args[4]) return tochunk(nothing, proc, space; type=dest_type) end # Infer return type; only bcast when inference is not concrete fname = nameof(f) arg_types = map(chunktype, args) - for (i, a) in enumerate(args) - if arg_types[i] === Nothing - if a === nothing - error("Argument at position $i is the value `nothing` (dependency not resolved on this rank). f=$fname arg_types=$arg_types") - else - error("Argument at position $i has chunktype Nothing. f=$fname arg_types=$arg_types") - end - end - end inferred_type = Base.promote_op(f, arg_types...) 
- if (inferred_type === Any || !isconcretetype(inferred_type)) && f === Dagger.allocate_array && length(args) >= 2 - T_el = args[2] - sz = args[end] - if T_el isa Type && isconcretetype(T_el) && sz isa Tuple{Vararg{Integer}} - inferred_type = Array{T_el, length(sz)} - end - end + need_bcast = !isconcretetype(inferred_type) || inferred_type === Union{} || inferred_type === Nothing || inferred_type === Any - if inferred_type === Nothing - error("execute!: inferred type is Nothing. f=$fname arg_types=$arg_types") - end if islocal T = typeof(result) From 02070224fbef78a4fc50ed99258a5fbe91fe2607 Mon Sep 17 00:00:00 2001 From: Felipe Tome Date: Thu, 12 Mar 2026 14:46:11 -0300 Subject: [PATCH 24/24] Removing a faulty log file --- matmul.log | 287 ----------------------------------------------------- 1 file changed, 287 deletions(-) delete mode 100644 matmul.log diff --git a/matmul.log b/matmul.log deleted file mode 100644 index 9f8999fbe..000000000 --- a/matmul.log +++ /dev/null @@ -1,287 +0,0 @@ -Precompiling packages... -Precompiling packages... -Precompiling packages... - Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) - Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) -Precompiling packages... - Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) -Precompiling packages... -Precompiling packages... -Precompiling Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) - packages... 
- Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) - Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) -Precompiling packages... -Precompiling packages... -Precompiling Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) - packages... - Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) - Dagger Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/Dagger/0a2f8_osDI2.ji.pidfile) -Info Given Dagger was explicitly requested, output will be shown live  -┌ Warning: Fix semantics of collect -└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/chunks.jl:30 -┌ Warning: Update tochunk docstring -└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:1 -┌ Warning: Dispatch bcast behavior on acceleration -└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/datadeps/aliasing.jl:229 -┌ Warning: Switch ArgumentWrapper to contain just the argument, and add DependencyWrapper -└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/datadeps/aliasing.jl:254 -┌ Warning: Fix this to work with MPI (can't call poolget on the wrong rank) -└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/datadeps/aliasing.jl:547 -┌ Warning: Document these public methods -└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/datadeps/aliasing.jl:710 -┌ Warning: Don't blindly set occupancy=0, only do for MPI -└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/datadeps/queue.jl:124 -┌ Warning: Is this uniform logic valuable to have? 
-└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:825 -┌ Warning: Precompile failed to clean up all tasks -└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/precompile.jl:21 - 28913.0 ms ✓ Dagger - 28511.1 ms ✓ Dagger - DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) - 28861.7 ms ✓ Dagger - DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) - 28781.8 ms ✓ Dagger - DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) - 28653.0 ms ✓ Dagger - DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) - 29004.3 ms ✓ Dagger - DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) - 28834.6 ms ✓ Dagger - DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) - 29246.5 ms ✓ Dagger - DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) - 29136.4 ms ✓ Dagger - DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) - 29157.1 ms ✓ Dagger - DistributionsExt Being precompiled by another process (pid: 246521, pidfile: /home/felipetome/.julia/compiled/v1.12/DistributionsExt/KnLSB_osDI2.ji.pidfile) - 2114.3 ms ✓ Dagger → DistributionsExt - 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. 
- 1 dependency had output during precompilation: -┌ Dagger -│ [Output was shown above] -└ - 1969.6 ms ✓ Dagger → DistributionsExt - 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. - 2155.1 ms ✓ Dagger → DistributionsExt - 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. - 2008.6 ms ✓ Dagger → DistributionsExt - 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. - 1784.2 ms ✓ Dagger → DistributionsExt - 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. - 2051.3 ms ✓ Dagger → DistributionsExt - 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. - 1858.0 ms ✓ Dagger → DistributionsExt - 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. - 2131.2 ms ✓ Dagger → DistributionsExt - 2 dependencies successfully precompiled in 31 seconds. 113 already precompiled. - 2485.6 ms ✓ Dagger → DistributionsExt - 2 dependencies successfully precompiled in 32 seconds. 113 already precompiled. - 2240.8 ms ✓ Dagger → DistributionsExt - 2 dependencies successfully precompiled in 32 seconds. 113 already precompiled. 
-Benchmark: 10 ranks, N=2000, block size 500×500 (matmul) -┌ Warning: [rank 4][tag 49] Hit probable hang on recv (dest: 0) -└ @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:626 -Matmul time: 30.3111 s -ERROR: LoadError: ERROR: LoadError: ERROR: ERROR: LoadError: LoadError: ERROR: ERROR: LoadError: LoadError: ERROR: LoadError: ERROR: LoadError: ERROR: LoadError: ERROR: LoadError: AssertionError: Invalid MPIRefID: tid=0, uid=0, id=0 -Stacktrace: - [1] Dagger.MPIRefID(tid::Int64, uid::Int64, id::Int64) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:291 - [2] take_ref_id!() - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:353 - [3] tochunk_pset(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, space::Dagger.MPIMemorySpace{Dagger.CPURAMMemorySpace}; device::MemPool.CPURAMDevice, kwargs::@Kwargs{type::DataType}) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:360 - [4] tochunk(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, proc::OSProc, scope::AnyScope; device::Nothing, type::Type, rewrap::Bool, kwargs::@Kwargs{}) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:69 - [5] tochunk - @ ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:52 [inlined] - [6] tochunk - @ ~/dagger-dev/mpi/Dagger.jl/src/utils/chunks.jl:155 [inlined] - [7] #691 - @ ./none:-1 [inlined] - [8] iterate - @ ./generator.jl:48 [inlined] - [9] collect(itr::Base.Generator{Dagger.DomainBlocks{2}, Dagger.var"#691#692"{Matrix{Float64}}}) - @ Base ./array.jl:790 - [10] view(A::Matrix{Float64}, p::Blocks{2}) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/array/alloc.jl:221 - [11] #collect_datadeps#548 - @ ~/dagger-dev/mpi/Dagger.jl/src/array/darray.jl:213 [inlined] - [12] macro expansion - @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:46 [inlined] - [13] macro expansion - @ ./timing.jl:461 [inlined] - [14] top-level scope - @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:45 - [15] include(mod::Module, _path::String) - @ 
Base ./Base.jl:306 - [16] exec_options(opts::Base.JLOptions) - @ Base ./client.jl:317 - [17] _start() - @ Base ./client.jl:550 -in expression starting at /home/felipetome/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:44 -AssertionError: Invalid MPIRefID: tid=0, uid=0, id=0 -Stacktrace: - [1] AssertionError: Invalid MPIRefID: tid=0, uid=0, id=0AssertionError: Invalid MPIRefID: tid=0, uid=0, id=0 -Stacktrace: - [1] AssertionError: Invalid MPIRefID: tid=0, uid=0, id=0 -Stacktrace: - [1] -Stacktrace: - [1] AssertionError: Invalid MPIRefID: tid=0, uid=0, id=0 -Stacktrace: - [1] Dagger.MPIRefID(tid::Int64, uid::Int64, id::Int64) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:291 - [2] take_ref_id!() - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:353 - [3] tochunk_pset(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, space::Dagger.MPIMemorySpace{Dagger.CPURAMMemorySpace}; device::MemPool.CPURAMDevice, kwargs::@Kwargs{type::DataType}) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:360 - [4] tochunk(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, proc::OSProc, scope::AnyScope; device::Nothing, type::Type, rewrap::Bool, kwargs::@Kwargs{}) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:69 - [5] tochunk - @ ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:52 [inlined] - [6] tochunk - @ ~/dagger-dev/mpi/Dagger.jl/src/utils/chunks.jl:155 [inlined] - [7] #691 - @ ./none:-1 [inlined] - [8] iterate - @ ./generator.jl:48 [inlined] - [9] collect(itr::Base.Generator{Dagger.DomainBlocks{2}, Dagger.var"#691#692"{Matrix{Float64}}}) - @ Base ./array.jl:790 - [10] view(A::Matrix{Float64}, p::Blocks{2}) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/array/alloc.jl:221 - [11] #collect_datadeps#548 - @ ~/dagger-dev/mpi/Dagger.jl/src/array/darray.jl:213 [inlined] - [12] macro expansion - @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:46 [inlined] - [13] macro expansion - @ ./timing.jl:461 
[inlined] - [14] top-level scope - @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:45 - [15] include(mod::Module, _path::String) - @ Base ./Base.jl:306 - [16] exec_options(opts::Base.JLOptions) - @ Base ./client.jl:317 - [17] _start() - @ Base ./client.jl:550 -in expression starting at /home/felipetome/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:44 -Dagger.MPIRefID(tid::Int64, uid::Int64, id::Int64) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:291 - [2] take_ref_id!() - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:353 - [3] Dagger.MPIRefID(tid::Int64, uid::Int64, id::Int64) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:291 -Dagger.MPIRefID(tid::Int64, uid::Int64, id::Int64) [2] - @ Daggertake_ref_id!() - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:353 - [3] ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:291 - [2] take_ref_id!() - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:353 - [3] tochunk_pset(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, space::Dagger.MPIMemorySpace{Dagger.CPURAMMemorySpace}; device::MemPool.CPURAMDevice, kwargs::@Kwargs{type::DataType}) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:360 - [4] tochunk(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, proc::OSProc, scope::AnyScope; device::Nothing, type::Type, rewrap::Bool, kwargs::@Kwargs{}) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:69 - [5] tochunk - @ ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:52 [inlined] - [6] tochunk - @ ~/dagger-dev/mpi/Dagger.jl/src/utils/chunks.jl:155 [inlined] - [7] #691 - @ ./none:-1 [inlined] - [8] iterate - @ ./generator.jl:48 [inlined] - [9] collect(itr::Base.Generator{Dagger.DomainBlocks{2}, Dagger.var"#691#692"{Matrix{Float64}}}) - @ Base ./array.jl:790 - [10] view(A::Matrix{Float64}, p::Blocks{2}) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/array/alloc.jl:221 - [11] #collect_datadeps#548 - @ ~/dagger-dev/mpi/Dagger.jl/src/array/darray.jl:213 
[inlined] - [12] macro expansion - @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:46 [inlined] - [13] macro expansion - @ ./timing.jl:461 [inlined] - [14] top-level scope - @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:45 - [15] include(mod::Module, _path::String) - @ Base ./Base.jl:306 - [16] exec_options(opts::Base.JLOptions) - @ Base ./client.jl:317 - [17] _start() - @ Base ./client.jl:550 -in expression starting at /home/felipetome/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:44 -tochunk_pset(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, space::Dagger.MPIMemorySpace{Dagger.CPURAMMemorySpace}; device::MemPool.CPURAMDevice, kwargs::@Kwargs{type::DataType}) - @tochunk_pset(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, space::Dagger.MPIMemorySpace{Dagger.CPURAMMemorySpace}; device::MemPool.CPURAMDevice, kwargs::@Kwargs{type::DataType}) - @ Dagger Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:360 - ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:360 - [4] [4] tochunk(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, proc::OSProc, scope::AnyScope; device::Nothing, type::Type, rewrap::Bool, kwargs::@Kwargs{}) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:69 - [5] tochunk - @ ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:52 [inlined] - [6] tochunk - @ ~/dagger-dev/mpi/Dagger.jl/src/utils/chunks.jl:155 [inlined] - [7] #691 - @ ./none:-1 [inlined] - [8] iterate - @ ./generator.jl:48 [inlined] - [9] tochunk(x::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, proc::OSProc, scope::AnyScope; device::Nothing, type::Type, rewrap::Bool, kwargs::@Kwargs{}) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:69 - [5] tochunk - @ ~/dagger-dev/mpi/Dagger.jl/src/tochunk.jl:52 [inlined] - [6] tochunk - @ ~/dagger-dev/mpi/Dagger.jl/src/utils/chunks.jl:155 [inlined] - [7] #691 - @ ./none:-1 [inlined] - 
[8] iterate - @ ./generator.jl:48 [inlined] - [9] collect(itr::Base.Generator{Dagger.DomainBlocks{2}, Dagger.var"#691#692"{Matrix{Float64}}}) - @ Base ./array.jl:790 - [10] view(A::Matrix{Float64}, p::Blocks{2}) - @ Daggercollect(itr::Base.Generator{Dagger.DomainBlocks{2}, Dagger.var"#691#692"{Matrix{Float64}}}) ~/dagger-dev/mpi/Dagger.jl/src/array/ -alloc.jl:221 - @ Base [11] #collect_datadeps#548 - @ ./array.jl:790 - ~/dagger-dev/mpi/Dagger.jl/src/array/ [10] darray.jl:213 [inlined] - [12] macro expansion - @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:46 [inlined] - [13] macro expansion - @ ./timing.jl:461 [inlined] - [14] top-level scope - @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:45 - [15] include(mod::Module, _path::String) - @ Base ./Base.jl:306 - [16] exec_options(opts::Base.JLOptions) - @ Base ./client.jl:317 - [17] _start() - @ Base ./client.jl:550 -in expression starting at /home/felipetome/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:44 -view(A::Matrix{Float64}, p::Blocks{2}) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/array/alloc.jl:221 - [11] #collect_datadeps#548 - @ ~/dagger-dev/mpi/Dagger.jl/src/array/darray.jl:213 [inlined] - [12] macro expansion - @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:46 [inlined] - [13] macro expansion - @ ./timing.jl:461 [inlined] - [14] top-level scope - @ ~/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:45 - [15] include(mod::Module, _path::String) - @ Base ./Base.jl:306 - [16] exec_options(opts::Base.JLOptions) - @ Base ./client.jl:317 - [17] _start() - @ Base ./client.jl:550 -in expression starting at /home/felipetome/dagger-dev/mpi/Dagger.jl/benchmarks/run_matmul.jl:44 -Dagger.MPIRefID(tid::Int64, uid::Int64, id::Int64) - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:291 - [2] take_ref_id!() - @ Dagger ~/dagger-dev/mpi/Dagger.jl/src/mpi.jl:353 - [3] -=================================================================================== -= BAD TERMINATION OF ONE OF YOUR APPLICATION 
PROCESSES -= PID 246518 RUNNING AT fedora -= EXIT CODE: 9 -= CLEANING UP REMAINING PROCESSES -= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES -=================================================================================== -YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Killed (signal 9) -This typically refers to a problem with your application. -Please see the FAQ page for debugging suggestions