From 32b2fd99a98ff036e9cd3abc2353ad9c31bc723a Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Mon, 9 Mar 2026 12:52:40 -0700 Subject: [PATCH 1/6] Update scaffold-tuolumne.job --- scripts/scaffold-tuolumne.job | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/scaffold-tuolumne.job b/scripts/scaffold-tuolumne.job index ce50b46..9b46955 100644 --- a/scripts/scaffold-tuolumne.job +++ b/scripts/scaffold-tuolumne.job @@ -15,6 +15,9 @@ ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi # (2) Removing libmpi may cause segfault on mpi4py import export LD_PRELOAD="/opt/rocm-7.1.0/llvm/lib/libomp.so /opt/cray/pe/mpich/9.1.0/ofi/gnu/11.2/lib/libmpi_gnu.so.12" +# Disable direct convolution benchmarking (should speedup warmup by a significant amount) +export MIOPEN_DEBUG_CONV_DIRECT=0 + torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml # Uncomment if you want torch profiling From 73c6d120580a19178bc318a291924063609a8aec Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Mon, 9 Mar 2026 12:53:03 -0700 Subject: [PATCH 2/6] Update scaffold-tuolumne-torchpypi.job --- scripts/scaffold-tuolumne-torchpypi.job | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/scaffold-tuolumne-torchpypi.job b/scripts/scaffold-tuolumne-torchpypi.job index cc9b10e..31136a3 100644 --- a/scripts/scaffold-tuolumne-torchpypi.job +++ b/scripts/scaffold-tuolumne-torchpypi.job @@ -14,6 +14,9 @@ ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi # Use ccl plugin that we manually built with install-rccl.sh export NCCL_NET_PLUGIN=../aws-ofi-nccl.git/install/lib/librccl-net.so +# Disable direct convolution benchmarking (should speedup warmup by a significant amount) +export MIOPEN_DEBUG_CONV_DIRECT=0 + torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml # Uncomment if you want torch profiling From 3a8f5e041b6c82b20b2b01a2b482cb115023c403 Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Fri, 13 Mar 2026 10:41:30 -0700 Subject: [PATCH 3/6] Update scaffold-tuolumne.job --- scripts/scaffold-tuolumne.job | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/scaffold-tuolumne.job b/scripts/scaffold-tuolumne.job index 9b46955..e01a265 100644 --- a/scripts/scaffold-tuolumne.job +++ b/scripts/scaffold-tuolumne.job @@ -16,7 +16,9 @@ ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi export LD_PRELOAD="/opt/rocm-7.1.0/llvm/lib/libomp.so /opt/cray/pe/mpich/9.1.0/ofi/gnu/11.2/lib/libmpi_gnu.so.12" # Disable direct convolution benchmarking (should speedup warmup by a significant amount) -export MIOPEN_DEBUG_CONV_DIRECT=0 +# export MIOPEN_DEBUG_CONV_DIRECT=0 +# Disable direct naive convolution benchmarking +export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD=0 torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml From 2cc593294f83493fbd72b7b024784899451a6e0f Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Fri, 13 Mar 2026 10:41:49 -0700 Subject: [PATCH 4/6] Update scaffold-tuolumne-torchpypi.job --- scripts/scaffold-tuolumne-torchpypi.job | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/scaffold-tuolumne-torchpypi.job b/scripts/scaffold-tuolumne-torchpypi.job index 31136a3..d6d1371 100644 --- a/scripts/scaffold-tuolumne-torchpypi.job +++ b/scripts/scaffold-tuolumne-torchpypi.job @@ -15,7 +15,9 @@ ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi export NCCL_NET_PLUGIN=../aws-ofi-nccl.git/install/lib/librccl-net.so # Disable direct convolution benchmarking (should speedup warmup by a significant amount) -export MIOPEN_DEBUG_CONV_DIRECT=0 +# export MIOPEN_DEBUG_CONV_DIRECT=0 +# Disable direct naive convolution benchmarking +export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD=0 torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml From 69aa4ff14e7412eca2b7814118f8b293fa2a64e2 Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Fri, 20 Mar 2026 16:56:38 -0700 Subject: [PATCH 5/6] Update scaffold-tuolumne-torchpypi.job --- scripts/scaffold-tuolumne-torchpypi.job | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/scaffold-tuolumne-torchpypi.job b/scripts/scaffold-tuolumne-torchpypi.job index d6d1371..df96ee8 100644 --- a/scripts/scaffold-tuolumne-torchpypi.job +++ b/scripts/scaffold-tuolumne-torchpypi.job @@ -16,8 +16,13 @@ export NCCL_NET_PLUGIN=../aws-ofi-nccl.git/install/lib/librccl-net.so # Disable direct convolution benchmarking (should speedup warmup by a significant amount) # export MIOPEN_DEBUG_CONV_DIRECT=0 -# Disable direct naive convolution benchmarking + +# Disable direct naive convolution benchmarking (naive_conv_ab_nonpacked_fwd_ndhwc_half_double_half.kd) export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD=0 +# Disable naive_conv_ab_nonpacked_bwd_ndhwc_half_double_half.kd +export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_BWD=0 +# Disable naive_conv_ab_nonpacked_wrw_ndhwc_half_double_half.kd +export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_WRW=0 torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml From 263b4a93d41d3542f2b140f5eb64da08a9033cc5 Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Fri, 20 Mar 2026 16:56:51 -0700 Subject: [PATCH 6/6] Update scaffold-tuolumne.job --- scripts/scaffold-tuolumne.job | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/scaffold-tuolumne.job b/scripts/scaffold-tuolumne.job index e01a265..393d45b 100644 --- a/scripts/scaffold-tuolumne.job +++ b/scripts/scaffold-tuolumne.job @@ -17,8 +17,13 @@ export LD_PRELOAD="/opt/rocm-7.1.0/llvm/lib/libomp.so /opt/cray/pe/mpich/9.1.0/o # Disable direct convolution benchmarking (should speedup warmup by a significant amount) # export MIOPEN_DEBUG_CONV_DIRECT=0 -# Disable direct naive convolution benchmarking + +# Disable direct naive convolution benchmarking (naive_conv_ab_nonpacked_fwd_ndhwc_half_double_half.kd) export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD=0 +# Disable naive_conv_ab_nonpacked_bwd_ndhwc_half_double_half.kd +export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_BWD=0 +# Disable naive_conv_ab_nonpacked_wrw_ndhwc_half_double_half.kd +export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_WRW=0 torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml