From e7c1509cad22788e10c05c47afe24d1bd3c48575 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Thu, 19 Feb 2026 17:20:27 +0100 Subject: [PATCH 01/43] Added GHA CI workflow --- .github/.workflows/rocm-nightly.yml | 231 ++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 .github/.workflows/rocm-nightly.yml diff --git a/.github/.workflows/rocm-nightly.yml b/.github/.workflows/rocm-nightly.yml new file mode 100644 index 000000000..345c0a130 --- /dev/null +++ b/.github/.workflows/rocm-nightly.yml @@ -0,0 +1,231 @@ +name: rocm-nightly-ci + +on: + pull_request: + branches: [main] + workflow_dispatch: + schedule: + - cron: "0 2 * * *" + +concurrency: + group: rocm-nightly-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +env: + PYTHON_VERSION: "3.12" + ROCM_PATH: "/opt/rocm" + PYTORCH_ROCM_ARCH: "gfx942" + # Avoid cross-run contamination on self-hosted boxes + TORCH_EXTENSIONS_DIR: ${{ github.workspace }}/.torch_extensions + PIP_DISABLE_PIP_VERSION_CHECK: "1" + PIP_NO_PYTHON_VERSION_WARNING: "1" + +jobs: + build: + name: Build (PyTorch main + apex wheel) + runs-on: build-only-apex + timeout-minutes: 720 + + steps: + - name: Checkout apex (with submodules) + uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + + - name: Print ROCm info (runner sanity) + shell: bash + run: | + set -euxo pipefail + echo "ROCM_PATH=${ROCM_PATH}" + ls -la "${ROCM_PATH}" || true + sudo hipcc --version || true + rocm-smi || true + sudo rocminfo | head -n 50 || true + + - name: Resolve latest PyTorch main SHA + id: pytorch + shell: bash + run: | + set -euxo pipefail + SHA="$(git ls-remote https://github.com/pytorch/pytorch.git refs/heads/main | awk '{print $1}')" + echo "sha=${SHA}" >> "${GITHUB_OUTPUT}" + echo "PyTorch main SHA: ${SHA}" + + - name: Restore cached PyTorch wheel + id: cache-torch + uses: actions/cache@v4 + with: + path: .ci/torch-wheel + key: torch-rocm72-py312-${{ 
steps.pytorch.outputs.sha }} + + - name: Build PyTorch wheel from source if cache miss + if: steps.cache-torch.outputs.cache-hit != 'true' + shell: bash + env: + USE_ROCM: "1" + USE_CUDA: "0" + BUILD_TEST: "0" + MAX_JOBS: "16" + run: | + set -euxo pipefail + + mkdir -p .ci/torch-wheel + rm -rf .ci/pytorch-src + git clone --recursive https://github.com/pytorch/pytorch.git .ci/pytorch-src + cd .ci/pytorch-src + git checkout "${{ steps.pytorch.outputs.sha }}" + git submodule update --init --recursive + + python -m pip install -U pip setuptools wheel + python -m pip install -r requirements.txt + # Common build helpers; harmless if already installed + python -m pip install cmake ninja + + export ROCM_HOME="${ROCM_PATH}" + export PATH="${ROCM_PATH}/bin:${PATH}" + export PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH}" + + # Build a wheel + python setup.py bdist_wheel + + ls -la dist + cp -v dist/*.whl "${GITHUB_WORKSPACE}/.ci/torch-wheel/" + + - name: Install PyTorch wheel + shell: bash + run: | + set -euxo pipefail + ls -la .ci/torch-wheel + python -m pip install .ci/torch-wheel/*.whl + python -c "import torch; print('torch:', torch.__version__); print('hip:', torch.version.hip); print('git:', getattr(torch.version, 'git_version', None))" + + - name: Install apex Python requirements + shell: bash + run: | + set -euxo pipefail + python -m pip install -r requirements.txt + + - name: Clean + build apex wheel + shell: bash + env: + APEX_BUILD_CPP_OPS: "1" + APEX_BUILD_CUDA_OPS: "1" + PYTORCH_ROCM_ARCH: ${{ env.PYTORCH_ROCM_ARCH }} + run: | + set -euxo pipefail + python -m pip uninstall -y apex || true + make clean || true + + python -m build --wheel --no-isolation . 
+ ls -la dist + + - name: Upload wheels (torch + apex) + uses: actions/upload-artifact@v4 + with: + name: wheels-py312-rocm + if-no-files-found: error + path: | + .ci/torch-wheel/*.whl + dist/*.whl + + test: + name: Test + runs-on: linux-apex-mi325-8 + needs: build + timeout-minutes: 360 + + steps: + - name: Checkout apex (with submodules) + uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + + - name: Download wheels + uses: actions/download-artifact@v4 + with: + name: wheels-py312-rocm + path: .ci/wheels + + - name: Install PyTorch + deps + apex wheel + shell: bash + env: + PYTORCH_ROCM_ARCH: ${{ env.PYTORCH_ROCM_ARCH }} + run: | + set -euxo pipefail + + ls -la .ci/wheels + python -m pip install .ci/wheels/torch*.whl + + python -m pip install -r requirements.txt + python -m pip install .ci/wheels/apex-*.whl + + python -c "import torch; print('torch:', torch.__version__, 'hip:', torch.version.hip)" + python -c "import apex; print('apex import OK')" + + - name: Run ROCm test suite (jit_build/run_tests.sh) + shell: bash + env: + PYTORCH_ROCM_ARCH: ${{ env.PYTORCH_ROCM_ARCH }} + TORCH_EXTENSIONS_DIR: ${{ env.TORCH_EXTENSIONS_DIR }} + NPROC_PER_NODE: "8" + run: | + set -euxo pipefail + mkdir -p test-artifacts + + # Avoid rendezvous port clashes on shared runners + export MASTER_ADDR=127.0.0.1 + export MASTER_PORT="$((10000 + RANDOM % 20000))" + + # Run and capture output + set -o pipefail + bash tests/jit_build/run_tests.sh "condition" "11" 2>&1 | tee test-artifacts/ci-console.log + + # The script prints a final line: "FAILED_TESTS FAILED_TESTS2 BUILT_SO_COUNT TORCH_EXTENSIONS_COUNT" + LAST_LINE="$(tail -n 1 test-artifacts/ci-console.log)" + echo "Last line: ${LAST_LINE}" + + FAILED_L0="$(echo "${LAST_LINE}" | awk '{print $1}')" + FAILED_CONTRIB="$(echo "${LAST_LINE}" | awk '{print $2}')" + BUILT_SO_COUNT="$(echo "${LAST_LINE}" | awk '{print $3}')" + TORCH_EXT_COUNT="$(echo "${LAST_LINE}" | awk '{print $4}')" + + { + echo "## ROCm apex test 
summary" + echo "" + echo "- L0 failed: ${FAILED_L0}" + echo "- contrib failed: ${FAILED_CONTRIB}" + echo "- built .so count: ${BUILT_SO_COUNT}" + echo "- torch extensions count: ${TORCH_EXT_COUNT}" + echo "" + echo "Artifacts include raw logs from the test scripts." + } >> "${GITHUB_STEP_SUMMARY}" + + # Fail CI if any failures were detected + test "${FAILED_L0}" -eq 0 + test "${FAILED_CONTRIB}" -eq 0 + + - name: Collect logs produced by scripts + if: always() + shell: bash + run: | + set -euxo pipefail + # These are created by tests/jit_build/run_tests.sh + ls -la . || true + cp -v results_jit_unit_test*.log test-artifacts/ || true + cp -v results_jit_unit_test*.csv test-artifacts/ || true + + - name: Upload test results/logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results-py312-rocm + if-no-files-found: warn + path: | + test-artifacts/** + results_jit_unit_test*.log + results_jit_unit_test*.csv From 70231ba6ed5a848a9d1dba9e93e2a8c7419ad06c Mon Sep 17 00:00:00 2001 From: leo-amd Date: Thu, 19 Feb 2026 17:22:14 +0100 Subject: [PATCH 02/43] Change target branch --- .github/.workflows/{rocm-nightly.yml => rocm-ci.yml} | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename .github/.workflows/{rocm-nightly.yml => rocm-ci.yml} (98%) diff --git a/.github/.workflows/rocm-nightly.yml b/.github/.workflows/rocm-ci.yml similarity index 98% rename from .github/.workflows/rocm-nightly.yml rename to .github/.workflows/rocm-ci.yml index 345c0a130..67d2a66a2 100644 --- a/.github/.workflows/rocm-nightly.yml +++ b/.github/.workflows/rocm-ci.yml @@ -1,14 +1,14 @@ -name: rocm-nightly-ci +name: rocm-ci on: pull_request: - branches: [main] + branches: [master] workflow_dispatch: schedule: - cron: "0 2 * * *" concurrency: - group: rocm-nightly-${{ github.event.pull_request.number || github.ref }} + group: rocm-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true permissions: From 29a8e0b8d36630078875a8e424ebc7819f4e4657 
Mon Sep 17 00:00:00 2001 From: leo-amd Date: Thu, 19 Feb 2026 17:23:57 +0100 Subject: [PATCH 03/43] Update naming --- .github/.workflows/rocm-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/.workflows/rocm-ci.yml b/.github/.workflows/rocm-ci.yml index 67d2a66a2..062294187 100644 --- a/.github/.workflows/rocm-ci.yml +++ b/.github/.workflows/rocm-ci.yml @@ -1,4 +1,4 @@ -name: rocm-ci +name: ROCm CI on: pull_request: From 7bcfc34a052045d81663c719ac8ce8e584e5a723 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Thu, 19 Feb 2026 17:51:46 +0100 Subject: [PATCH 04/43] ci: trigger actions From a19f56c8b7f01c32cdbb2aafb7788af04d12d209 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Thu, 19 Feb 2026 17:53:12 +0100 Subject: [PATCH 05/43] Move the file --- .github/{.workflows => workflows}/rocm-ci.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/{.workflows => workflows}/rocm-ci.yml (100%) diff --git a/.github/.workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml similarity index 100% rename from .github/.workflows/rocm-ci.yml rename to .github/workflows/rocm-ci.yml From 43a1d6d77d36e5eed93e258e269da2878b63dcff Mon Sep 17 00:00:00 2001 From: leo-amd Date: Fri, 20 Feb 2026 11:29:42 +0100 Subject: [PATCH 06/43] Setup python env --- .github/workflows/rocm-ci.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 062294187..cd2276931 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -36,6 +36,19 @@ jobs: submodules: recursive fetch-depth: 0 + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Create venv + base tooling + shell: bash + run: | + set -euxo pipefail + python -m venv .venv + echo "${GITHUB_WORKSPACE}/.venv/bin" >> "${GITHUB_PATH}" + python -m pip install -U pip setuptools wheel + - name: Print ROCm info 
(runner sanity) shell: bash run: | @@ -145,6 +158,19 @@ jobs: submodules: recursive fetch-depth: 0 + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Create venv + base tooling + shell: bash + run: | + set -euxo pipefail + python -m venv .venv + echo "${GITHUB_WORKSPACE}/.venv/bin" >> "${GITHUB_PATH}" + python -m pip install -U pip setuptools wheel + - name: Download wheels uses: actions/download-artifact@v4 with: From 79963bada9ef5ae77416f79099aeaff0fb3f8121 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Fri, 20 Feb 2026 12:03:46 +0100 Subject: [PATCH 07/43] Use containers --- .github/workflows/rocm-ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index cd2276931..3b9610444 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -27,6 +27,8 @@ jobs: build: name: Build (PyTorch main + apex wheel) runs-on: build-only-apex + container: + image: rocm/dev-ubuntu-24.04:7.2-complete timeout-minutes: 720 steps: @@ -101,6 +103,7 @@ jobs: export ROCM_HOME="${ROCM_PATH}" export PATH="${ROCM_PATH}/bin:${PATH}" export PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH}" + export CMAKE_PREFIX_PATH="${GITHUB_WORKSPACE}/.venv:/opt/rocm" # Build a wheel python setup.py bdist_wheel @@ -148,6 +151,9 @@ jobs: test: name: Test runs-on: linux-apex-mi325-8 + container: + image: rocm/dev-ubuntu-24.04:7.2-complete + options: --device=/dev/kfd --device=/dev/dri --group-add video needs: build timeout-minutes: 360 From 48d9e9c80e0cf18a196d2102354b294e648f287d Mon Sep 17 00:00:00 2001 From: leo-amd Date: Fri, 20 Feb 2026 18:02:25 +0100 Subject: [PATCH 08/43] These k8s runners don't support native containers, therefore I am running containers in bash --- .github/workflows/rocm-ci.yml | 299 +++++++++++++++------------------- 1 file changed, 135 insertions(+), 164 deletions(-) diff --git 
a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 3b9610444..058dd75a3 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -2,10 +2,8 @@ name: ROCm CI on: pull_request: - branches: [master] + branches: [main] workflow_dispatch: - schedule: - - cron: "0 2 * * *" concurrency: group: rocm-${{ github.event.pull_request.number || github.ref }} @@ -15,20 +13,12 @@ permissions: contents: read env: - PYTHON_VERSION: "3.12" - ROCM_PATH: "/opt/rocm" PYTORCH_ROCM_ARCH: "gfx942" - # Avoid cross-run contamination on self-hosted boxes - TORCH_EXTENSIONS_DIR: ${{ github.workspace }}/.torch_extensions - PIP_DISABLE_PIP_VERSION_CHECK: "1" - PIP_NO_PYTHON_VERSION_WARNING: "1" jobs: build: name: Build (PyTorch main + apex wheel) runs-on: build-only-apex - container: - image: rocm/dev-ubuntu-24.04:7.2-complete timeout-minutes: 720 steps: @@ -38,34 +28,10 @@ jobs: submodules: recursive fetch-depth: 0 - - name: Setup Python ${{ env.PYTHON_VERSION }} - uses: actions/setup-python@v5 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Create venv + base tooling - shell: bash - run: | - set -euxo pipefail - python -m venv .venv - echo "${GITHUB_WORKSPACE}/.venv/bin" >> "${GITHUB_PATH}" - python -m pip install -U pip setuptools wheel - - - name: Print ROCm info (runner sanity) - shell: bash - run: | - set -euxo pipefail - echo "ROCM_PATH=${ROCM_PATH}" - ls -la "${ROCM_PATH}" || true - sudo hipcc --version || true - rocm-smi || true - sudo rocminfo | head -n 50 || true - - name: Resolve latest PyTorch main SHA id: pytorch shell: bash run: | - set -euxo pipefail SHA="$(git ls-remote https://github.com/pytorch/pytorch.git refs/heads/main | awk '{print $1}')" echo "sha=${SHA}" >> "${GITHUB_OUTPUT}" echo "PyTorch main SHA: ${SHA}" @@ -77,72 +43,106 @@ jobs: path: .ci/torch-wheel key: torch-rocm72-py312-${{ steps.pytorch.outputs.sha }} - - name: Build PyTorch wheel from source if cache miss + - name: Build PyTorch if: 
steps.cache-torch.outputs.cache-hit != 'true' shell: bash - env: - USE_ROCM: "1" - USE_CUDA: "0" - BUILD_TEST: "0" - MAX_JOBS: "16" run: | set -euxo pipefail - - mkdir -p .ci/torch-wheel - rm -rf .ci/pytorch-src - git clone --recursive https://github.com/pytorch/pytorch.git .ci/pytorch-src - cd .ci/pytorch-src + + # Clone PyTorch locally + git clone --recursive https://github.com/pytorch/pytorch.git pytorch-src + cd pytorch-src git checkout "${{ steps.pytorch.outputs.sha }}" git submodule update --init --recursive + cd .. - python -m pip install -U pip setuptools wheel - python -m pip install -r requirements.txt - # Common build helpers; harmless if already installed - python -m pip install cmake ninja - - export ROCM_HOME="${ROCM_PATH}" - export PATH="${ROCM_PATH}/bin:${PATH}" - export PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH}" - export CMAKE_PREFIX_PATH="${GITHUB_WORKSPACE}/.venv:/opt/rocm" - - # Build a wheel - python setup.py bdist_wheel - - ls -la dist - cp -v dist/*.whl "${GITHUB_WORKSPACE}/.ci/torch-wheel/" - - - name: Install PyTorch wheel - shell: bash - run: | + # Create build script for the container + cat << 'EOF' > build_torch.sh + #!/bin/bash set -euxo pipefail - ls -la .ci/torch-wheel - python -m pip install .ci/torch-wheel/*.whl - python -c "import torch; print('torch:', torch.__version__); print('hip:', torch.version.hip); print('git:', getattr(torch.version, 'git_version', None))" + export DEBIAN_FRONTEND=noninteractive + apt-get update && apt-get install -y python3-venv python3-dev python-is-python3 build-essential cmake ninja-build git + + python3 -m venv /venv + export PATH="/venv/bin:$PATH" + pip install -U pip setuptools wheel + + cd /workspace/pytorch-src + pip install -r requirements.txt + + export ROCM_HOME=/opt/rocm + export PYTORCH_ROCM_ARCH=gfx942 + export CMAKE_PREFIX_PATH="/venv:/opt/rocm" + export USE_ROCM=1 + export USE_CUDA=0 + export BUILD_TEST=0 + export MAX_JOBS=16 + + python setup.py bdist_wheel + + mkdir -p /workspace/dist + cp 
dist/*.whl /workspace/dist/ + EOF + chmod +x build_torch.sh + + # Run container in background, inject files, build, extract wheels + docker rm -f rocm-builder || true + docker run -d --name rocm-builder rocm/dev-ubuntu-24.04:7.2-complete sleep infinity + + docker exec rocm-builder mkdir -p /workspace + docker cp pytorch-src rocm-builder:/workspace/pytorch-src + docker cp build_torch.sh rocm-builder:/workspace/build_torch.sh + + docker exec rocm-builder /workspace/build_torch.sh + + mkdir -p .ci/torch-wheel + docker cp rocm-builder:/workspace/dist/. .ci/torch-wheel/ + docker rm -f rocm-builder - - name: Install apex Python requirements + - name: Build Apex shell: bash run: | set -euxo pipefail - python -m pip install -r requirements.txt - - - name: Clean + build apex wheel - shell: bash - env: - APEX_BUILD_CPP_OPS: "1" - APEX_BUILD_CUDA_OPS: "1" - PYTORCH_ROCM_ARCH: ${{ env.PYTORCH_ROCM_ARCH }} - run: | + + cat << 'EOF' > build_apex.sh + #!/bin/bash set -euxo pipefail - python -m pip uninstall -y apex || true + export DEBIAN_FRONTEND=noninteractive + apt-get update && apt-get install -y python3-venv python3-dev python-is-python3 build-essential cmake git + + python3 -m venv /venv + export PATH="/venv/bin:$PATH" + pip install -U pip setuptools wheel build + + cd /workspace + pip install .ci/torch-wheel/*.whl + pip install -r requirements.txt + + export PYTORCH_ROCM_ARCH=gfx942 + export APEX_BUILD_CPP_OPS=1 + export APEX_BUILD_CUDA_OPS=1 + make clean || true - python -m build --wheel --no-isolation . - ls -la dist + EOF + chmod +x build_apex.sh + + docker rm -f rocm-builder || true + docker run -d --name rocm-builder rocm/dev-ubuntu-24.04:7.2-complete sleep infinity + + docker exec rocm-builder mkdir -p /workspace + docker cp . rocm-builder:/workspace/ + + docker exec rocm-builder /workspace/build_apex.sh + + mkdir -p dist + docker cp rocm-builder:/workspace/dist/. 
dist/ + docker rm -f rocm-builder - name: Upload wheels (torch + apex) uses: actions/upload-artifact@v4 with: - name: wheels-py312-rocm + name: wheels-py312-rocm72 if-no-files-found: error path: | .ci/torch-wheel/*.whl @@ -151,113 +151,84 @@ jobs: test: name: Test runs-on: linux-apex-mi325-8 - container: - image: rocm/dev-ubuntu-24.04:7.2-complete - options: --device=/dev/kfd --device=/dev/dri --group-add video needs: build timeout-minutes: 360 steps: - - name: Checkout apex (with submodules) + - name: Checkout apex uses: actions/checkout@v4 with: submodules: recursive fetch-depth: 0 - - name: Setup Python ${{ env.PYTHON_VERSION }} - uses: actions/setup-python@v5 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Create venv + base tooling - shell: bash - run: | - set -euxo pipefail - python -m venv .venv - echo "${GITHUB_WORKSPACE}/.venv/bin" >> "${GITHUB_PATH}" - python -m pip install -U pip setuptools wheel - - name: Download wheels uses: actions/download-artifact@v4 with: - name: wheels-py312-rocm + name: wheels-py312-rocm72 path: .ci/wheels - - name: Install PyTorch + deps + apex wheel + - name: Run Tests shell: bash - env: - PYTORCH_ROCM_ARCH: ${{ env.PYTORCH_ROCM_ARCH }} run: | set -euxo pipefail - - ls -la .ci/wheels - python -m pip install .ci/wheels/torch*.whl - - python -m pip install -r requirements.txt - python -m pip install .ci/wheels/apex-*.whl - + + cat << 'EOF' > run_tests_docker.sh + #!/bin/bash + set -euxo pipefail + export DEBIAN_FRONTEND=noninteractive + apt-get update && apt-get install -y python3-venv python3-dev python-is-python3 build-essential cmake git + + python3 -m venv /venv + export PATH="/venv/bin:$PATH" + pip install -U pip setuptools wheel + + cd /workspace + pip install .ci/wheels/torch*.whl + pip install -r requirements.txt + pip install .ci/wheels/apex*.whl + python -c "import torch; print('torch:', torch.__version__, 'hip:', torch.version.hip)" python -c "import apex; print('apex import OK')" - - - name: Run ROCm 
test suite (jit_build/run_tests.sh) - shell: bash - env: - PYTORCH_ROCM_ARCH: ${{ env.PYTORCH_ROCM_ARCH }} - TORCH_EXTENSIONS_DIR: ${{ env.TORCH_EXTENSIONS_DIR }} - NPROC_PER_NODE: "8" - run: | - set -euxo pipefail - mkdir -p test-artifacts - - # Avoid rendezvous port clashes on shared runners + + export PYTORCH_ROCM_ARCH=gfx942 + export TORCH_EXTENSIONS_DIR=/workspace/.torch_extensions + export NPROC_PER_NODE=8 export MASTER_ADDR=127.0.0.1 export MASTER_PORT="$((10000 + RANDOM % 20000))" - - # Run and capture output - set -o pipefail + + mkdir -p test-artifacts bash tests/jit_build/run_tests.sh "condition" "11" 2>&1 | tee test-artifacts/ci-console.log + + # Manually move script logs so we can copy them out + cp results_jit_unit_test*.log test-artifacts/ || true + cp results_jit_unit_test*.csv test-artifacts/ || true + EOF + chmod +x run_tests_docker.sh + + docker rm -f rocm-tester || true + docker run -d --name rocm-tester --device=/dev/kfd --device=/dev/dri --group-add video rocm/dev-ubuntu-24.04:7.2-complete sleep infinity + + docker exec rocm-tester mkdir -p /workspace + docker cp . rocm-tester:/workspace/ + + # Run tests and capture exit code + set +e + docker exec rocm-tester /workspace/run_tests_docker.sh + EXIT_CODE=$? + set -e + + # Extract logs + mkdir -p test-artifacts + docker cp rocm-tester:/workspace/test-artifacts/. 
test-artifacts/ + docker rm -f rocm-tester + + # Fail CI if tests failed + exit $EXIT_CODE - # The script prints a final line: "FAILED_TESTS FAILED_TESTS2 BUILT_SO_COUNT TORCH_EXTENSIONS_COUNT" - LAST_LINE="$(tail -n 1 test-artifacts/ci-console.log)" - echo "Last line: ${LAST_LINE}" - - FAILED_L0="$(echo "${LAST_LINE}" | awk '{print $1}')" - FAILED_CONTRIB="$(echo "${LAST_LINE}" | awk '{print $2}')" - BUILT_SO_COUNT="$(echo "${LAST_LINE}" | awk '{print $3}')" - TORCH_EXT_COUNT="$(echo "${LAST_LINE}" | awk '{print $4}')" - - { - echo "## ROCm apex test summary" - echo "" - echo "- L0 failed: ${FAILED_L0}" - echo "- contrib failed: ${FAILED_CONTRIB}" - echo "- built .so count: ${BUILT_SO_COUNT}" - echo "- torch extensions count: ${TORCH_EXT_COUNT}" - echo "" - echo "Artifacts include raw logs from the test scripts." - } >> "${GITHUB_STEP_SUMMARY}" - - # Fail CI if any failures were detected - test "${FAILED_L0}" -eq 0 - test "${FAILED_CONTRIB}" -eq 0 - - - name: Collect logs produced by scripts - if: always() - shell: bash - run: | - set -euxo pipefail - # These are created by tests/jit_build/run_tests.sh - ls -la . 
|| true - cp -v results_jit_unit_test*.log test-artifacts/ || true - cp -v results_jit_unit_test*.csv test-artifacts/ || true - - - name: Upload test results/logs + - name: Upload test results if: always() uses: actions/upload-artifact@v4 with: name: test-results-py312-rocm if-no-files-found: warn - path: | - test-artifacts/** - results_jit_unit_test*.log - results_jit_unit_test*.csv + path: test-artifacts/** From 25126ef76b04d5877b173de0a4827e66cf40e496 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Fri, 20 Feb 2026 18:05:55 +0100 Subject: [PATCH 09/43] Typo --- .github/workflows/rocm-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 058dd75a3..53f5a2b36 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -2,7 +2,7 @@ name: ROCm CI on: pull_request: - branches: [main] + branches: [master] workflow_dispatch: concurrency: From c307e2091b174b0708f92461d658ab49c3783194 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 23 Feb 2026 10:54:28 +0100 Subject: [PATCH 10/43] Fix git dubious ownership --- .github/workflows/rocm-ci.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 53f5a2b36..9fb4481b0 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -22,7 +22,7 @@ jobs: timeout-minutes: 720 steps: - - name: Checkout apex (with submodules) + - name: Checkout apex uses: actions/checkout@v4 with: submodules: recursive @@ -63,6 +63,8 @@ jobs: export DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y python3-venv python3-dev python-is-python3 build-essential cmake ninja-build git + git config --global --add safe.directory '*' + python3 -m venv /venv export PATH="/venv/bin:$PATH" pip install -U pip setuptools wheel @@ -110,6 +112,8 @@ jobs: export DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y python3-venv 
python3-dev python-is-python3 build-essential cmake git + git config --global --add safe.directory '*' + python3 -m venv /venv export PATH="/venv/bin:$PATH" pip install -U pip setuptools wheel build From bdc31f040b91048a0e6061008259d054b6bf7e37 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 23 Feb 2026 11:13:42 +0100 Subject: [PATCH 11/43] Git fixes --- .github/workflows/rocm-ci.yml | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 9fb4481b0..6fe4665b1 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -49,7 +49,7 @@ jobs: run: | set -euxo pipefail - # Clone PyTorch locally + # Clone PyTorch locally (ensure deep clone) git clone --recursive https://github.com/pytorch/pytorch.git pytorch-src cd pytorch-src git checkout "${{ steps.pytorch.outputs.sha }}" @@ -63,8 +63,9 @@ jobs: export DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y python3-venv python3-dev python-is-python3 build-essential cmake ninja-build git - git config --global --add safe.directory '*' - + # Tell Git to trust the copied directories + git config --global --add safe.directory /workspace/pytorch-src + python3 -m venv /venv export PATH="/venv/bin:$PATH" pip install -U pip setuptools wheel @@ -80,19 +81,27 @@ jobs: export BUILD_TEST=0 export MAX_JOBS=16 + # Clean any existing generated files before building + python tools/amd_build/build_amd.py clean + + # Force HIPIFY to process all files, ignoring Git state + export BUILD_SPLIT_CUDA=ON + export BUILD_AMD_FORCE_HIPIFY=1 + python setup.py bdist_wheel mkdir -p /workspace/dist cp dist/*.whl /workspace/dist/ EOF chmod +x build_torch.sh - + # Run container in background, inject files, build, extract wheels docker rm -f rocm-builder || true docker run -d --name rocm-builder rocm/dev-ubuntu-24.04:7.2-complete sleep infinity docker exec rocm-builder mkdir -p /workspace - docker cp pytorch-src 
rocm-builder:/workspace/pytorch-src + # Ensure we copy the hidden .git directory as well + docker cp pytorch-src/. rocm-builder:/workspace/pytorch-src/ docker cp build_torch.sh rocm-builder:/workspace/build_torch.sh docker exec rocm-builder /workspace/build_torch.sh @@ -113,7 +122,7 @@ jobs: apt-get update && apt-get install -y python3-venv python3-dev python-is-python3 build-essential cmake git git config --global --add safe.directory '*' - + python3 -m venv /venv export PATH="/venv/bin:$PATH" pip install -U pip setuptools wheel build From b85b9ca38a2c3a8af19ab64753e39253227122fd Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 23 Feb 2026 11:33:21 +0100 Subject: [PATCH 12/43] Typo --- .github/workflows/rocm-ci.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 6fe4665b1..52d8bd74d 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -81,9 +81,6 @@ jobs: export BUILD_TEST=0 export MAX_JOBS=16 - # Clean any existing generated files before building - python tools/amd_build/build_amd.py clean - # Force HIPIFY to process all files, ignoring Git state export BUILD_SPLIT_CUDA=ON export BUILD_AMD_FORCE_HIPIFY=1 From 9464a90631a53a2892cb3e108ef639dddc3d1f68 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 23 Feb 2026 12:05:21 +0100 Subject: [PATCH 13/43] Cmake change --- .github/workflows/rocm-ci.yml | 82 +++++++++++++++++++++++++++++++---- 1 file changed, 74 insertions(+), 8 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 52d8bd74d..0905f7e58 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -61,7 +61,21 @@ jobs: #!/bin/bash set -euxo pipefail export DEBIAN_FRONTEND=noninteractive - apt-get update && apt-get install -y python3-venv python3-dev python-is-python3 build-essential cmake ninja-build git + apt-get update && apt-get install -y \ + ca-certificates \ + curl \ + git \ + pkg-config \ + 
python3-dev \ + python3-venv \ + python-is-python3 \ + build-essential \ + cmake \ + ninja-build \ + libssl-dev \ + libopenblas-dev \ + zlib1g-dev \ + libffi-dev # Tell Git to trust the copied directories git config --global --add safe.directory /workspace/pytorch-src @@ -71,7 +85,39 @@ jobs: pip install -U pip setuptools wheel cd /workspace/pytorch-src - pip install -r requirements.txt + + # PyTorch's requirements.txt may pull in the Python "cmake" package which + # ships CMake 4.x into /venv/bin. CMake 4 breaks third_party projects + # that still set cmake_minimum_required(VERSION < 3.5) (e.g. NNPACK). + # We must use system CMake from apt (Ubuntu 24.04: 3.28.x). + python - << 'PY' + from pathlib import Path + src = Path("requirements.txt").read_text().splitlines() + out = [] + for line in src: + s = line.strip() + if not s or s.startswith("#"): + out.append(line) + continue + name = s.split()[0] + base = name.split("==")[0].split(">=")[0].split("<=")[0] + base = base.split("~=")[0].split("!=")[0] + if base in {"cmake", "ninja"}: + continue + out.append(line) + Path("/tmp/requirements.no-cmake-ninja.txt").write_text( + "\n".join(out) + "\n" + ) + PY + pip install -r /tmp/requirements.no-cmake-ninja.txt + + # Defensive: if anything still pulled them in, remove venv-provided binaries. 
+ pip uninstall -y cmake ninja || true + rm -f /venv/bin/cmake /venv/bin/ninja || true + hash -r + + cmake --version + ninja --version export ROCM_HOME=/opt/rocm export PYTORCH_ROCM_ARCH=gfx942 @@ -80,10 +126,10 @@ jobs: export USE_CUDA=0 export BUILD_TEST=0 export MAX_JOBS=16 - - # Force HIPIFY to process all files, ignoring Git state - export BUILD_SPLIT_CUDA=ON - export BUILD_AMD_FORCE_HIPIFY=1 + + export USE_NNPACK=0 + export USE_XNNPACK=0 + export USE_PYTORCH_QNNPACK=0 python setup.py bdist_wheel @@ -116,7 +162,16 @@ jobs: #!/bin/bash set -euxo pipefail export DEBIAN_FRONTEND=noninteractive - apt-get update && apt-get install -y python3-venv python3-dev python-is-python3 build-essential cmake git + apt-get update && apt-get install -y \ + ca-certificates \ + git \ + python3-venv \ + python3-dev \ + python-is-python3 \ + build-essential \ + cmake \ + ninja-build \ + libssl-dev git config --global --add safe.directory '*' @@ -128,6 +183,7 @@ jobs: pip install .ci/torch-wheel/*.whl pip install -r requirements.txt + export ROCM_HOME=/opt/rocm export PYTORCH_ROCM_ARCH=gfx942 export APEX_BUILD_CPP_OPS=1 export APEX_BUILD_CUDA_OPS=1 @@ -186,7 +242,16 @@ jobs: #!/bin/bash set -euxo pipefail export DEBIAN_FRONTEND=noninteractive - apt-get update && apt-get install -y python3-venv python3-dev python-is-python3 build-essential cmake git + apt-get update && apt-get install -y \ + ca-certificates \ + git \ + python3-venv \ + python3-dev \ + python-is-python3 \ + build-essential \ + cmake \ + ninja-build \ + libssl-dev python3 -m venv /venv export PATH="/venv/bin:$PATH" @@ -200,6 +265,7 @@ jobs: python -c "import torch; print('torch:', torch.__version__, 'hip:', torch.version.hip)" python -c "import apex; print('apex import OK')" + export ROCM_HOME=/opt/rocm export PYTORCH_ROCM_ARCH=gfx942 export TORCH_EXTENSIONS_DIR=/workspace/.torch_extensions export NPROC_PER_NODE=8 From 73fdcc99e8d30640b6d40cd111394997cb3d5a56 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 23 Feb 
2026 12:25:42 +0100 Subject: [PATCH 14/43] requirements.txt fix --- .github/workflows/rocm-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 0905f7e58..48b6e91bc 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -105,11 +105,11 @@ jobs: if base in {"cmake", "ninja"}: continue out.append(line) - Path("/tmp/requirements.no-cmake-ninja.txt").write_text( + Path("requirements.no-cmake-ninja.txt").write_text( "\n".join(out) + "\n" ) PY - pip install -r /tmp/requirements.no-cmake-ninja.txt + pip install -r requirements.no-cmake-ninja.txt # Defensive: if anything still pulled them in, remove venv-provided binaries. pip uninstall -y cmake ninja || true From 2616627f3a6d9df887516269024b5e4e9b32a5ce Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 23 Feb 2026 12:51:54 +0100 Subject: [PATCH 15/43] Clone in container --- .github/workflows/rocm-ci.yml | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 48b6e91bc..3c76c31d0 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -48,18 +48,13 @@ jobs: shell: bash run: | set -euxo pipefail - - # Clone PyTorch locally (ensure deep clone) - git clone --recursive https://github.com/pytorch/pytorch.git pytorch-src - cd pytorch-src - git checkout "${{ steps.pytorch.outputs.sha }}" - git submodule update --init --recursive - cd .. 
# Create build script for the container cat << 'EOF' > build_torch.sh #!/bin/bash set -euxo pipefail + : "${PYTORCH_SHA:?PYTORCH_SHA env var not set}" + export DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y \ ca-certificates \ @@ -77,14 +72,25 @@ jobs: zlib1g-dev \ libffi-dev - # Tell Git to trust the copied directories - git config --global --add safe.directory /workspace/pytorch-src + git config --global --add safe.directory '*' + + rm -rf /workspace/pytorch-src + git clone --recursive https://github.com/pytorch/pytorch.git /workspace/pytorch-src + cd /workspace/pytorch-src + git checkout "${PYTORCH_SHA}" + git submodule sync --recursive + git submodule update --init --recursive + + # Fail early with a clear error if the checkout is missing ROCm/HIP bits + test -f c10/hip/impl/hip_cmake_macros.h.in + test -f aten/src/ATen/hip/HIPConfig.h.in + test -d aten/src/THH + git rev-parse HEAD + git show -s --oneline python3 -m venv /venv export PATH="/venv/bin:$PATH" pip install -U pip setuptools wheel - - cd /workspace/pytorch-src # PyTorch's requirements.txt may pull in the Python "cmake" package which # ships CMake 4.x into /venv/bin. CMake 4 breaks third_party projects @@ -143,11 +149,10 @@ jobs: docker run -d --name rocm-builder rocm/dev-ubuntu-24.04:7.2-complete sleep infinity docker exec rocm-builder mkdir -p /workspace - # Ensure we copy the hidden .git directory as well - docker cp pytorch-src/. rocm-builder:/workspace/pytorch-src/ docker cp build_torch.sh rocm-builder:/workspace/build_torch.sh - docker exec rocm-builder /workspace/build_torch.sh + docker exec -e PYTORCH_SHA='${{ steps.pytorch.outputs.sha }}' \ + rocm-builder /workspace/build_torch.sh mkdir -p .ci/torch-wheel docker cp rocm-builder:/workspace/dist/. 
.ci/torch-wheel/ From d368e9e572ee1c776836a3536174f3e3aaaefa82 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 23 Feb 2026 13:10:19 +0100 Subject: [PATCH 16/43] Resolve latest PyTorch main SHA --- .github/workflows/rocm-ci.yml | 39 ++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 3c76c31d0..70f4f6159 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -32,9 +32,42 @@ jobs: id: pytorch shell: bash run: | - SHA="$(git ls-remote https://github.com/pytorch/pytorch.git refs/heads/main | awk '{print $1}')" - echo "sha=${SHA}" >> "${GITHUB_OUTPUT}" - echo "PyTorch main SHA: ${SHA}" + set -euxo pipefail + + PROBE_DIR="$(mktemp -d)" + git init "${PROBE_DIR}" + cd "${PROBE_DIR}" + git remote add origin https://github.com/pytorch/pytorch.git + + # Fetch a window of recent commits from main (no blobs needed for path checks) + git -c protocol.version=2 fetch --depth=500 --filter=blob:none origin main + TIP_SHA="$(git rev-parse FETCH_HEAD)" + echo "tip_sha=${TIP_SHA}" >> "${GITHUB_OUTPUT}" + echo "PyTorch main tip SHA: ${TIP_SHA}" + + has_path() { + local sha="$1" + local path="$2" + git ls-tree "${sha}" -- "${path}" | grep -q . 
+ } + + GOOD_SHA="" + for sha in $(git rev-list FETCH_HEAD); do + if has_path "${sha}" "c10/hip/impl/hip_cmake_macros.h.in" && \ + has_path "${sha}" "aten/src/ATen/hip/HIPConfig.h.in" && \ + has_path "${sha}" "aten/src/THH"; then + GOOD_SHA="${sha}" + break + fi + done + + if [ -z "${GOOD_SHA}" ]; then + echo "ERROR: No ROCm-buildable SHA found in the last 500 commits of main" >&2 + exit 1 + fi + + echo "sha=${GOOD_SHA}" >> "${GITHUB_OUTPUT}" + echo "Selected PyTorch SHA: ${GOOD_SHA}" - name: Restore cached PyTorch wheel id: cache-torch From 143384a96a8e9265a21d1a0d15ef98e981f080d6 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 23 Feb 2026 13:23:48 +0100 Subject: [PATCH 17/43] Rewrite from scratch --- .github/workflows/rocm-ci.yml | 408 ++++++++++++---------------------- 1 file changed, 140 insertions(+), 268 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 70f4f6159..b8f10e1cb 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -4,6 +4,13 @@ on: pull_request: branches: [master] workflow_dispatch: + inputs: + pytorch_index_url: + description: >- + PyTorch nightly wheel index URL (e.g. 
+ https://download.pytorch.org/whl/nightly/rocm7.2) + required: false + default: "https://download.pytorch.org/whl/nightly/rocm7.2" concurrency: group: rocm-${{ github.event.pull_request.number || github.ref }} @@ -17,332 +24,197 @@ env: jobs: build: - name: Build (PyTorch main + apex wheel) + name: Build apex wheel (ROCm) runs-on: build-only-apex - timeout-minutes: 720 + timeout-minutes: 180 steps: - - name: Checkout apex + - name: Checkout uses: actions/checkout@v4 with: submodules: recursive fetch-depth: 0 - - name: Resolve latest PyTorch main SHA - id: pytorch + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: pip + + - name: Ensure ROCm is on PATH shell: bash run: | set -euxo pipefail + echo "${ROCM_HOME}/bin" >> "${GITHUB_PATH}" + echo "${ROCM_HOME}/llvm/bin" >> "${GITHUB_PATH}" + test -x "${ROCM_HOME}/bin/hipcc" - PROBE_DIR="$(mktemp -d)" - git init "${PROBE_DIR}" - cd "${PROBE_DIR}" - git remote add origin https://github.com/pytorch/pytorch.git - - # Fetch a window of recent commits from main (no blobs needed for path checks) - git -c protocol.version=2 fetch --depth=500 --filter=blob:none origin main - TIP_SHA="$(git rev-parse FETCH_HEAD)" - echo "tip_sha=${TIP_SHA}" >> "${GITHUB_OUTPUT}" - echo "PyTorch main tip SHA: ${TIP_SHA}" - - has_path() { - local sha="$1" - local path="$2" - git ls-tree "${sha}" -- "${path}" | grep -q . 
- } - - GOOD_SHA="" - for sha in $(git rev-list FETCH_HEAD); do - if has_path "${sha}" "c10/hip/impl/hip_cmake_macros.h.in" && \ - has_path "${sha}" "aten/src/ATen/hip/HIPConfig.h.in" && \ - has_path "${sha}" "aten/src/THH"; then - GOOD_SHA="${sha}" - break + - name: Install build deps (best-effort) + shell: bash + run: | + set -euxo pipefail + if command -v apt-get >/dev/null; then + SUDO="" + if command -v sudo >/dev/null && sudo -n true 2>/dev/null; then + SUDO="sudo -n" fi - done - - if [ -z "${GOOD_SHA}" ]; then - echo "ERROR: No ROCm-buildable SHA found in the last 500 commits of main" >&2 - exit 1 + ${SUDO} apt-get update + ${SUDO} apt-get install -y \ + build-essential \ + cmake \ + ninja-build \ + python3-dev \ + pkg-config \ + libopenblas-dev \ + libffi-dev \ + libssl-dev fi - echo "sha=${GOOD_SHA}" >> "${GITHUB_OUTPUT}" - echo "Selected PyTorch SHA: ${GOOD_SHA}" - - - name: Restore cached PyTorch wheel - id: cache-torch - uses: actions/cache@v4 - with: - path: .ci/torch-wheel - key: torch-rocm72-py312-${{ steps.pytorch.outputs.sha }} - - - name: Build PyTorch - if: steps.cache-torch.outputs.cache-hit != 'true' + - name: Create venv + install PyTorch nightly (ROCm) shell: bash run: | set -euxo pipefail + python -m venv .venv + . .venv/bin/activate + + python -m pip install -U pip setuptools wheel build packaging + + # Install torch from the ROCm nightly index, but allow deps from PyPI. 
+ python -m pip install --pre torch \ + --index-url "${PYTORCH_INDEX_URL}" \ + --extra-index-url https://pypi.org/simple - # Create build script for the container - cat << 'EOF' > build_torch.sh - #!/bin/bash - set -euxo pipefail - : "${PYTORCH_SHA:?PYTORCH_SHA env var not set}" - - export DEBIAN_FRONTEND=noninteractive - apt-get update && apt-get install -y \ - ca-certificates \ - curl \ - git \ - pkg-config \ - python3-dev \ - python3-venv \ - python-is-python3 \ - build-essential \ - cmake \ - ninja-build \ - libssl-dev \ - libopenblas-dev \ - zlib1g-dev \ - libffi-dev - - git config --global --add safe.directory '*' - - rm -rf /workspace/pytorch-src - git clone --recursive https://github.com/pytorch/pytorch.git /workspace/pytorch-src - cd /workspace/pytorch-src - git checkout "${PYTORCH_SHA}" - git submodule sync --recursive - git submodule update --init --recursive - - # Fail early with a clear error if the checkout is missing ROCm/HIP bits - test -f c10/hip/impl/hip_cmake_macros.h.in - test -f aten/src/ATen/hip/HIPConfig.h.in - test -d aten/src/THH - git rev-parse HEAD - git show -s --oneline - - python3 -m venv /venv - export PATH="/venv/bin:$PATH" - pip install -U pip setuptools wheel - - # PyTorch's requirements.txt may pull in the Python "cmake" package which - # ships CMake 4.x into /venv/bin. CMake 4 breaks third_party projects - # that still set cmake_minimum_required(VERSION < 3.5) (e.g. NNPACK). - # We must use system CMake from apt (Ubuntu 24.04: 3.28.x). 
python - << 'PY' - from pathlib import Path - src = Path("requirements.txt").read_text().splitlines() - out = [] - for line in src: - s = line.strip() - if not s or s.startswith("#"): - out.append(line) - continue - name = s.split()[0] - base = name.split("==")[0].split(">=")[0].split("<=")[0] - base = base.split("~=")[0].split("!=")[0] - if base in {"cmake", "ninja"}: - continue - out.append(line) - Path("requirements.no-cmake-ninja.txt").write_text( - "\n".join(out) + "\n" - ) + import torch + print("torch:", torch.__version__) + print("hip:", torch.version.hip) + print("git:", getattr(torch.version, "git_version", None)) PY - pip install -r requirements.no-cmake-ninja.txt - - # Defensive: if anything still pulled them in, remove venv-provided binaries. - pip uninstall -y cmake ninja || true - rm -f /venv/bin/cmake /venv/bin/ninja || true - hash -r - - cmake --version - ninja --version - - export ROCM_HOME=/opt/rocm - export PYTORCH_ROCM_ARCH=gfx942 - export CMAKE_PREFIX_PATH="/venv:/opt/rocm" - export USE_ROCM=1 - export USE_CUDA=0 - export BUILD_TEST=0 - export MAX_JOBS=16 - - export USE_NNPACK=0 - export USE_XNNPACK=0 - export USE_PYTORCH_QNNPACK=0 - - python setup.py bdist_wheel - - mkdir -p /workspace/dist - cp dist/*.whl /workspace/dist/ - EOF - chmod +x build_torch.sh - - # Run container in background, inject files, build, extract wheels - docker rm -f rocm-builder || true - docker run -d --name rocm-builder rocm/dev-ubuntu-24.04:7.2-complete sleep infinity - - docker exec rocm-builder mkdir -p /workspace - docker cp build_torch.sh rocm-builder:/workspace/build_torch.sh - - docker exec -e PYTORCH_SHA='${{ steps.pytorch.outputs.sha }}' \ - rocm-builder /workspace/build_torch.sh - - mkdir -p .ci/torch-wheel - docker cp rocm-builder:/workspace/dist/. 
.ci/torch-wheel/ - docker rm -f rocm-builder - - - name: Build Apex + + - name: Build apex wheel shell: bash run: | set -euxo pipefail - - cat << 'EOF' > build_apex.sh - #!/bin/bash - set -euxo pipefail - export DEBIAN_FRONTEND=noninteractive - apt-get update && apt-get install -y \ - ca-certificates \ - git \ - python3-venv \ - python3-dev \ - python-is-python3 \ - build-essential \ - cmake \ - ninja-build \ - libssl-dev - - git config --global --add safe.directory '*' - - python3 -m venv /venv - export PATH="/venv/bin:$PATH" - pip install -U pip setuptools wheel build - - cd /workspace - pip install .ci/torch-wheel/*.whl - pip install -r requirements.txt - - export ROCM_HOME=/opt/rocm - export PYTORCH_ROCM_ARCH=gfx942 + . .venv/bin/activate + + python -m pip install -r requirements.txt + + export ROCM_HOME="${ROCM_HOME}" + export PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH}" + export APEX_BUILD_CPP_OPS=1 export APEX_BUILD_CUDA_OPS=1 - - make clean || true + python -m build --wheel --no-isolation . - EOF - chmod +x build_apex.sh - - docker rm -f rocm-builder || true - docker run -d --name rocm-builder rocm/dev-ubuntu-24.04:7.2-complete sleep infinity - - docker exec rocm-builder mkdir -p /workspace - docker cp . rocm-builder:/workspace/ - - docker exec rocm-builder /workspace/build_apex.sh - - mkdir -p dist - docker cp rocm-builder:/workspace/dist/. dist/ - docker rm -f rocm-builder - - - name: Upload wheels (torch + apex) + ls -la dist + + - name: Write build metadata + shell: bash + run: | + set -euxo pipefail + . 
.venv/bin/activate + + mkdir -p build-metadata + python - << 'PY' > build-metadata/pytorch.txt + import torch + print("torch:", torch.__version__) + print("hip:", torch.version.hip) + print("git:", getattr(torch.version, "git_version", None)) + PY + + - name: Upload wheel + metadata uses: actions/upload-artifact@v4 with: - name: wheels-py312-rocm72 + name: apex-wheel-py312-rocm-nightly if-no-files-found: error path: | - .ci/torch-wheel/*.whl dist/*.whl + build-metadata/pytorch.txt test: name: Test - runs-on: linux-apex-mi325-8 + runs-on: amd-gpu-mi235-8 needs: build timeout-minutes: 360 steps: - - name: Checkout apex + - name: Checkout uses: actions/checkout@v4 with: submodules: recursive fetch-depth: 0 - - name: Download wheels + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: pip + + - name: Ensure ROCm is on PATH + shell: bash + run: | + set -euxo pipefail + echo "${ROCM_HOME}/bin" >> "${GITHUB_PATH}" + echo "${ROCM_HOME}/llvm/bin" >> "${GITHUB_PATH}" + test -x "${ROCM_HOME}/bin/hipcc" + + - name: Download apex wheel artifact uses: actions/download-artifact@v4 with: - name: wheels-py312-rocm72 - path: .ci/wheels + name: apex-wheel-py312-rocm-nightly + path: .ci/artifacts - - name: Run Tests + - name: Create venv + install torch + apex wheel shell: bash run: | set -euxo pipefail - - cat << 'EOF' > run_tests_docker.sh - #!/bin/bash + python -m venv .venv + . 
.venv/bin/activate + + python -m pip install -U pip setuptools wheel + + python -m pip install --pre torch \ + --index-url "${PYTORCH_INDEX_URL}" \ + --extra-index-url https://pypi.org/simple + + python -m pip install -r requirements.txt + python -m pip install .ci/artifacts/dist/*.whl + + python - << 'PY' + import torch, apex + print("torch:", torch.__version__, "hip:", torch.version.hip) + print("apex:", getattr(apex, "__version__", "unknown"), "import OK") + PY + + - name: Run ROCm tests + shell: bash + run: | set -euxo pipefail - export DEBIAN_FRONTEND=noninteractive - apt-get update && apt-get install -y \ - ca-certificates \ - git \ - python3-venv \ - python3-dev \ - python-is-python3 \ - build-essential \ - cmake \ - ninja-build \ - libssl-dev - - python3 -m venv /venv - export PATH="/venv/bin:$PATH" - pip install -U pip setuptools wheel - - cd /workspace - pip install .ci/wheels/torch*.whl - pip install -r requirements.txt - pip install .ci/wheels/apex*.whl - - python -c "import torch; print('torch:', torch.__version__, 'hip:', torch.version.hip)" - python -c "import apex; print('apex import OK')" - - export ROCM_HOME=/opt/rocm - export PYTORCH_ROCM_ARCH=gfx942 - export TORCH_EXTENSIONS_DIR=/workspace/.torch_extensions - export NPROC_PER_NODE=8 + . 
.venv/bin/activate + + export ROCM_HOME="${ROCM_HOME}" + export PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH}" + export TORCH_EXTENSIONS_DIR="${PWD}/.torch_extensions" export MASTER_ADDR=127.0.0.1 export MASTER_PORT="$((10000 + RANDOM % 20000))" - + mkdir -p test-artifacts - bash tests/jit_build/run_tests.sh "condition" "11" 2>&1 | tee test-artifacts/ci-console.log - - # Manually move script logs so we can copy them out - cp results_jit_unit_test*.log test-artifacts/ || true - cp results_jit_unit_test*.csv test-artifacts/ || true - EOF - chmod +x run_tests_docker.sh - - docker rm -f rocm-tester || true - docker run -d --name rocm-tester --device=/dev/kfd --device=/dev/dri --group-add video rocm/dev-ubuntu-24.04:7.2-complete sleep infinity - - docker exec rocm-tester mkdir -p /workspace - docker cp . rocm-tester:/workspace/ - - # Run tests and capture exit code - set +e - docker exec rocm-tester /workspace/run_tests_docker.sh - EXIT_CODE=$? + + # Run the test script directly after installing the wheel. + set +e + bash tests/jit_build/run_tests.sh condition 11 2>&1 | tee test-artifacts/ci-console.log + EXIT_CODE=${PIPESTATUS[0]} set -e - - # Extract logs - mkdir -p test-artifacts - docker cp rocm-tester:/workspace/test-artifacts/. 
test-artifacts/ - docker rm -f rocm-tester - - # Fail CI if tests failed - exit $EXIT_CODE + + cp -f results_jit_unit_test*.log test-artifacts/ || true + cp -f results_jit_unit_test*.csv test-artifacts/ || true + + exit "${EXIT_CODE}" - name: Upload test results if: always() uses: actions/upload-artifact@v4 with: - name: test-results-py312-rocm + name: test-results-py312-rocm-nightly if-no-files-found: warn - path: test-artifacts/** + path: test-artifacts/** \ No newline at end of file From a5e15d3f66cb502f9ffca92249605d9b522fceff Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 23 Feb 2026 13:29:48 +0100 Subject: [PATCH 18/43] Set rocm --- .github/workflows/rocm-ci.yml | 43 +++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index b8f10e1cb..55dbff6b1 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -20,7 +20,11 @@ permissions: contents: read env: + PYTHON_VERSION: "3.12" PYTORCH_ROCM_ARCH: "gfx942" + PYTORCH_INDEX_URL: >- + ${{ github.event.inputs.pytorch_index_url || + 'https://download.pytorch.org/whl/nightly/rocm7.2' }} jobs: build: @@ -45,9 +49,27 @@ jobs: shell: bash run: | set -euxo pipefail + + if [ -z "${ROCM_HOME:-}" ]; then + if [ -n "${ROCM_PATH:-}" ] && [ -x "${ROCM_PATH}/bin/hipcc" ]; then + ROCM_HOME="${ROCM_PATH}" + elif command -v hipcc >/dev/null 2>&1; then + HIPCC="$(command -v hipcc)" + ROCM_HOME="$(cd "$(dirname "$HIPCC")/.." && pwd)" + elif [ -x /opt/rocm/bin/hipcc ]; then + ROCM_HOME="/opt/rocm" + elif [ -x /usr/local/rocm/bin/hipcc ]; then + ROCM_HOME="/usr/local/rocm" + else + echo "ERROR: Could not find ROCm (hipcc). Set ROCM_HOME or install ROCm." 
+ exit 1 + fi + fi + + echo "ROCM_HOME=${ROCM_HOME}" >> "${GITHUB_ENV}" echo "${ROCM_HOME}/bin" >> "${GITHUB_PATH}" echo "${ROCM_HOME}/llvm/bin" >> "${GITHUB_PATH}" - test -x "${ROCM_HOME}/bin/hipcc" + hipcc --version - name: Install build deps (best-effort) shell: bash @@ -154,9 +176,26 @@ jobs: shell: bash run: | set -euxo pipefail + if [ -z "${ROCM_HOME:-}" ]; then + if [ -n "${ROCM_PATH:-}" ] && [ -x "${ROCM_PATH}/bin/hipcc" ]; then + ROCM_HOME="${ROCM_PATH}" + elif command -v hipcc >/dev/null 2>&1; then + HIPCC="$(command -v hipcc)" + ROCM_HOME="$(cd "$(dirname "$HIPCC")/.." && pwd)" + elif [ -x /opt/rocm/bin/hipcc ]; then + ROCM_HOME="/opt/rocm" + elif [ -x /usr/local/rocm/bin/hipcc ]; then + ROCM_HOME="/usr/local/rocm" + else + echo "ERROR: Could not find ROCm (hipcc). Set ROCM_HOME or install ROCm." + exit 1 + fi + fi + + echo "ROCM_HOME=${ROCM_HOME}" >> "${GITHUB_ENV}" echo "${ROCM_HOME}/bin" >> "${GITHUB_PATH}" echo "${ROCM_HOME}/llvm/bin" >> "${GITHUB_PATH}" - test -x "${ROCM_HOME}/bin/hipcc" + hipcc --version - name: Download apex wheel artifact uses: actions/download-artifact@v4 From 7e591435d5326edf13dea0774a33cc542bb42305 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 23 Feb 2026 14:08:36 +0100 Subject: [PATCH 19/43] Add sanity check --- .github/workflows/rocm-ci.yml | 79 +++++++++++++++++++++++++++-------- 1 file changed, 61 insertions(+), 18 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 55dbff6b1..2b7bba0dc 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -44,34 +44,77 @@ jobs: with: python-version: ${{ env.PYTHON_VERSION }} cache: pip - + + - name: Sanity check + run: | + which hipcc || true + hipcc --version || true + echo "ROCM_HOME=$ROCM_HOME ROCM_PATH=$ROCM_PATH" + ls -la /opt | sed -n '1,200 + - name: Ensure ROCm is on PATH shell: bash run: | set -euxo pipefail - if [ -z "${ROCM_HOME:-}" ]; then - if [ -n "${ROCM_PATH:-}" ] && [ -x "${ROCM_PATH}/bin/hipcc" ]; 
then - ROCM_HOME="${ROCM_PATH}" - elif command -v hipcc >/dev/null 2>&1; then - HIPCC="$(command -v hipcc)" - ROCM_HOME="$(cd "$(dirname "$HIPCC")/.." && pwd)" - elif [ -x /opt/rocm/bin/hipcc ]; then - ROCM_HOME="/opt/rocm" - elif [ -x /usr/local/rocm/bin/hipcc ]; then - ROCM_HOME="/usr/local/rocm" - else - echo "ERROR: Could not find ROCm (hipcc). Set ROCM_HOME or install ROCm." - exit 1 + CANDIDATES=() + + # Prefer canonical ROCm locations on Linux. + if [ -x /opt/rocm/bin/hipcc ]; then + CANDIDATES+=("/opt/rocm") + fi + while IFS= read -r d; do + CANDIDATES+=("$d") + done < <(ls -d /opt/rocm-* 2>/dev/null | sort -V || true) + + # Ask hipconfig if available. + if command -v hipconfig >/dev/null 2>&1; then + CANDIDATES+=("$(hipconfig --rocmpath 2>/dev/null || true)") + fi + + # Respect provided env vars + CANDIDATES+=("${ROCM_PATH:-}" "${ROCM_HOME:-}") + + # Infer from hipcc realpath. + if command -v hipcc >/dev/null 2>&1; then + HIPCC_REAL="$(readlink -f "$(command -v hipcc)" || command -v hipcc)" + CANDIDATES+=("$(cd "$(dirname "$HIPCC_REAL")/.." && pwd)") + fi + + ROCM_HOME="" + for d in "${CANDIDATES[@]}"; do + [ -n "${d:-}" ] || continue + if [ -x "${d}/bin/hipcc" ] && [ -d "${d}/include" ]; then + ROCM_HOME="$d" fi + # Prefer a ROCm root that actually contains Thrust headers. + if [ -x "${d}/bin/hipcc" ] && [ -f "${d}/include/thrust/complex.h" ]; then + ROCM_HOME="$d" + break + fi + done + + if [ -z "${ROCM_HOME}" ]; then + echo "ERROR: Could not determine ROCm root (need hipcc)." + exit 1 fi echo "ROCM_HOME=${ROCM_HOME}" >> "${GITHUB_ENV}" + echo "ROCM_PATH=${ROCM_HOME}" >> "${GITHUB_ENV}" + + # Add toolchain to PATH for later steps. 
echo "${ROCM_HOME}/bin" >> "${GITHUB_PATH}" - echo "${ROCM_HOME}/llvm/bin" >> "${GITHUB_PATH}" - hipcc --version + if [ -d "${ROCM_HOME}/llvm/bin" ]; then + echo "${ROCM_HOME}/llvm/bin" >> "${GITHUB_PATH}" + fi + if [ -d "${ROCM_HOME}/lib/llvm/bin" ]; then + echo "${ROCM_HOME}/lib/llvm/bin" >> "${GITHUB_PATH}" + fi + + "${ROCM_HOME}/bin/hipcc" --version + test -f "${ROCM_HOME}/include/thrust/complex.h" - - name: Install build deps (best-effort) + - name: Install build deps shell: bash run: | set -euxo pipefail @@ -92,7 +135,7 @@ jobs: libssl-dev fi - - name: Create venv + install PyTorch nightly (ROCm) + - name: Create venv + install PyTorch shell: bash run: | set -euxo pipefail From e606389ead43be0c38b9439f4e1818f7b6da5daa Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 23 Feb 2026 14:11:02 +0100 Subject: [PATCH 20/43] set -euxo pipefail --- .github/workflows/rocm-ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 2b7bba0dc..0e7f8b582 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -47,11 +47,12 @@ jobs: - name: Sanity check run: | + set -euxo pipefail which hipcc || true hipcc --version || true echo "ROCM_HOME=$ROCM_HOME ROCM_PATH=$ROCM_PATH" ls -la /opt | sed -n '1,200 - + - name: Ensure ROCm is on PATH shell: bash run: | From 7649d7d57ca9ead3b50d3248b3c5284ee960dbc6 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 23 Feb 2026 15:22:12 +0100 Subject: [PATCH 21/43] typo --- .github/workflows/rocm-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 0e7f8b582..8c0f0af77 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -51,7 +51,7 @@ jobs: which hipcc || true hipcc --version || true echo "ROCM_HOME=$ROCM_HOME ROCM_PATH=$ROCM_PATH" - ls -la /opt | sed -n '1,200 + ls -la /opt | sed -n '1,200p' - name: Ensure ROCm is on 
PATH shell: bash From c2de6a2d6dd3250d20b3135aae38d8e17d308c63 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 23 Feb 2026 16:55:11 +0100 Subject: [PATCH 22/43] Rewritten --- .github/workflows/rocm-ci.yml | 333 +++++++--------------------------- 1 file changed, 66 insertions(+), 267 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 8c0f0af77..e74aff820 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -1,303 +1,102 @@ -name: ROCm CI +name: Apex ROCm CI on: pull_request: + types: [opened, synchronize, ready_for_review] branches: [master] + schedule: + - cron: '0 0 * * 0' # weekly workflow_dispatch: - inputs: - pytorch_index_url: - description: >- - PyTorch nightly wheel index URL (e.g. - https://download.pytorch.org/whl/nightly/rocm7.2) - required: false - default: "https://download.pytorch.org/whl/nightly/rocm7.2" - -concurrency: - group: rocm-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -permissions: - contents: read env: - PYTHON_VERSION: "3.12" - PYTORCH_ROCM_ARCH: "gfx942" - PYTORCH_INDEX_URL: >- - ${{ github.event.inputs.pytorch_index_url || - 'https://download.pytorch.org/whl/nightly/rocm7.2' }} + DOCKER_IMAGE: "rocm/pytorch:rocm7.2_ubuntu24.04_py3.12_pytorch_release_2.9.1" jobs: build: - name: Build apex wheel (ROCm) + name: Build Apex Wheel runs-on: build-only-apex - timeout-minutes: 180 - steps: - - name: Checkout + - name: Checkout repository uses: actions/checkout@v4 with: submodules: recursive - fetch-depth: 0 - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: ${{ env.PYTHON_VERSION }} - cache: pip - - - name: Sanity check + - name: Build Apex Wheel in Docker run: | - set -euxo pipefail - which hipcc || true - hipcc --version || true - echo "ROCM_HOME=$ROCM_HOME ROCM_PATH=$ROCM_PATH" - ls -la /opt | sed -n '1,200p' - - - name: Ensure ROCm is on PATH - shell: bash - run: | - set -euxo pipefail - - CANDIDATES=() 
- - # Prefer canonical ROCm locations on Linux. - if [ -x /opt/rocm/bin/hipcc ]; then - CANDIDATES+=("/opt/rocm") - fi - while IFS= read -r d; do - CANDIDATES+=("$d") - done < <(ls -d /opt/rocm-* 2>/dev/null | sort -V || true) - - # Ask hipconfig if available. - if command -v hipconfig >/dev/null 2>&1; then - CANDIDATES+=("$(hipconfig --rocmpath 2>/dev/null || true)") - fi - - # Respect provided env vars - CANDIDATES+=("${ROCM_PATH:-}" "${ROCM_HOME:-}") - - # Infer from hipcc realpath. - if command -v hipcc >/dev/null 2>&1; then - HIPCC_REAL="$(readlink -f "$(command -v hipcc)" || command -v hipcc)" - CANDIDATES+=("$(cd "$(dirname "$HIPCC_REAL")/.." && pwd)") - fi - - ROCM_HOME="" - for d in "${CANDIDATES[@]}"; do - [ -n "${d:-}" ] || continue - if [ -x "${d}/bin/hipcc" ] && [ -d "${d}/include" ]; then - ROCM_HOME="$d" - fi - # Prefer a ROCm root that actually contains Thrust headers. - if [ -x "${d}/bin/hipcc" ] && [ -f "${d}/include/thrust/complex.h" ]; then - ROCM_HOME="$d" - break - fi - done - - if [ -z "${ROCM_HOME}" ]; then - echo "ERROR: Could not determine ROCm root (need hipcc)." - exit 1 - fi - - echo "ROCM_HOME=${ROCM_HOME}" >> "${GITHUB_ENV}" - echo "ROCM_PATH=${ROCM_HOME}" >> "${GITHUB_ENV}" - - # Add toolchain to PATH for later steps. 
- echo "${ROCM_HOME}/bin" >> "${GITHUB_PATH}" - if [ -d "${ROCM_HOME}/llvm/bin" ]; then - echo "${ROCM_HOME}/llvm/bin" >> "${GITHUB_PATH}" - fi - if [ -d "${ROCM_HOME}/lib/llvm/bin" ]; then - echo "${ROCM_HOME}/lib/llvm/bin" >> "${GITHUB_PATH}" - fi - - "${ROCM_HOME}/bin/hipcc" --version - test -f "${ROCM_HOME}/include/thrust/complex.h" - - - name: Install build deps - shell: bash - run: | - set -euxo pipefail - if command -v apt-get >/dev/null; then - SUDO="" - if command -v sudo >/dev/null && sudo -n true 2>/dev/null; then - SUDO="sudo -n" - fi - ${SUDO} apt-get update - ${SUDO} apt-get install -y \ - build-essential \ - cmake \ - ninja-build \ - python3-dev \ - pkg-config \ - libopenblas-dev \ - libffi-dev \ - libssl-dev - fi - - - name: Create venv + install PyTorch - shell: bash - run: | - set -euxo pipefail - python -m venv .venv - . .venv/bin/activate - - python -m pip install -U pip setuptools wheel build packaging - - # Install torch from the ROCm nightly index, but allow deps from PyPI. - python -m pip install --pre torch \ - --index-url "${PYTORCH_INDEX_URL}" \ - --extra-index-url https://pypi.org/simple - - python - << 'PY' - import torch - print("torch:", torch.__version__) - print("hip:", torch.version.hip) - print("git:", getattr(torch.version, "git_version", None)) - PY - - - name: Build apex wheel - shell: bash - run: | - set -euxo pipefail - . 
.venv/bin/activate - - python -m pip install -r requirements.txt + docker run --rm -v ${{ github.workspace }}:/workspace -w /workspace ${{ env.DOCKER_IMAGE }} bash -c " + + pip install --upgrade pip + pip install build ninja wheel packaging - export ROCM_HOME="${ROCM_HOME}" - export PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH}" + # Build apex wheel + python3 -m build --wheel --no-isolation -C--build-option=--cpp_ext -C--build-option=--cuda_ext - export APEX_BUILD_CPP_OPS=1 - export APEX_BUILD_CUDA_OPS=1 + # Fix permissions so GitHub Actions can upload the artifact + chown -R $(id -u):$(id -g) dist/ + " - python -m build --wheel --no-isolation . - ls -la dist - - - name: Write build metadata - shell: bash - run: | - set -euxo pipefail - . .venv/bin/activate - - mkdir -p build-metadata - python - << 'PY' > build-metadata/pytorch.txt - import torch - print("torch:", torch.__version__) - print("hip:", torch.version.hip) - print("git:", getattr(torch.version, "git_version", None)) - PY - - - name: Upload wheel + metadata + - name: Upload Wheel Artifact uses: actions/upload-artifact@v4 with: - name: apex-wheel-py312-rocm-nightly - if-no-files-found: error - path: | - dist/*.whl - build-metadata/pytorch.txt + name: apex-wheel + path: dist/*.whl + retention-days: 7 test: - name: Test - runs-on: amd-gpu-mi235-8 + name: Run Unit Tests and Benchmarks + runs-on: linux-apex-mi325-8 needs: build - timeout-minutes: 360 - steps: - - name: Checkout + - name: Checkout repository uses: actions/checkout@v4 with: submodules: recursive - fetch-depth: 0 - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: ${{ env.PYTHON_VERSION }} - cache: pip - - - name: Ensure ROCm is on PATH - shell: bash - run: | - set -euxo pipefail - if [ -z "${ROCM_HOME:-}" ]; then - if [ -n "${ROCM_PATH:-}" ] && [ -x "${ROCM_PATH}/bin/hipcc" ]; then - ROCM_HOME="${ROCM_PATH}" - elif command -v hipcc >/dev/null 2>&1; then - HIPCC="$(command -v hipcc)" - ROCM_HOME="$(cd "$(dirname 
"$HIPCC")/.." && pwd)" - elif [ -x /opt/rocm/bin/hipcc ]; then - ROCM_HOME="/opt/rocm" - elif [ -x /usr/local/rocm/bin/hipcc ]; then - ROCM_HOME="/usr/local/rocm" - else - echo "ERROR: Could not find ROCm (hipcc). Set ROCM_HOME or install ROCm." - exit 1 - fi - fi - - echo "ROCM_HOME=${ROCM_HOME}" >> "${GITHUB_ENV}" - echo "${ROCM_HOME}/bin" >> "${GITHUB_PATH}" - echo "${ROCM_HOME}/llvm/bin" >> "${GITHUB_PATH}" - hipcc --version - - - name: Download apex wheel artifact + - name: Download Wheel Artifact uses: actions/download-artifact@v4 with: - name: apex-wheel-py312-rocm-nightly - path: .ci/artifacts - - - name: Create venv + install torch + apex wheel - shell: bash - run: | - set -euxo pipefail - python -m venv .venv - . .venv/bin/activate - - python -m pip install -U pip setuptools wheel - - python -m pip install --pre torch \ - --index-url "${PYTORCH_INDEX_URL}" \ - --extra-index-url https://pypi.org/simple + name: apex-wheel + path: dist/ - python -m pip install -r requirements.txt - python -m pip install .ci/artifacts/dist/*.whl - - python - << 'PY' - import torch, apex - print("torch:", torch.__version__, "hip:", torch.version.hip) - print("apex:", getattr(apex, "__version__", "unknown"), "import OK") - PY - - - name: Run ROCm tests - shell: bash + - name: Run Tests in Docker run: | - set -euxo pipefail - . .venv/bin/activate - - export ROCM_HOME="${ROCM_HOME}" - export PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH}" - export TORCH_EXTENSIONS_DIR="${PWD}/.torch_extensions" - export MASTER_ADDR=127.0.0.1 - export MASTER_PORT="$((10000 + RANDOM % 20000))" - - mkdir -p test-artifacts - - # Run the test script directly after installing the wheel. 
- set +e - bash tests/jit_build/run_tests.sh condition 11 2>&1 | tee test-artifacts/ci-console.log - EXIT_CODE=${PIPESTATUS[0]} - set -e - - cp -f results_jit_unit_test*.log test-artifacts/ || true - cp -f results_jit_unit_test*.csv test-artifacts/ || true - - exit "${EXIT_CODE}" - - - name: Upload test results - if: always() + docker run --rm \ + --device=/dev/kfd --device=/dev/dri --group-add=video \ + -v ${{ github.workspace }}:/workspace -w /workspace \ + ${{ env.DOCKER_IMAGE }} bash -c " + + # Install the built wheel + pip install dist/apex-*.whl + + # Run LO unit tests + echo 'Running LO unit tests...' + cd tests/L0 + sh run_rocm.sh 2>&1 | tee ../../log_L0_results.txt + cd ../../ + + # Run extensions import tests + echo 'Running Extension Import tests...' + python3 tests/test_extension_import.py 2>&1 | tee log_import_results.txt + + # Run micro-benchmarking + echo 'Running Micro-benchmarking...' + python3 micro_benchmarking_pytorch.py --network resnet50 2>&1 | tee log_microbenchmark_results.txt + + # Extract failed tests from log file + grep -i 'fail\|error' log_L0_results.txt log_import_results.txt > extracted_failed_tests.txt || true + + # Fix permissions for artifacts + chown -R $(id -u):$(id -g) *.txt + " + + - name: Upload Test Logs and Extracted Failures + if: always() # Ensure logs upload even if tests fail uses: actions/upload-artifact@v4 with: - name: test-results-py312-rocm-nightly - if-no-files-found: warn - path: test-artifacts/** \ No newline at end of file + name: test-logs + path: | + log_L0_results.txt + log_import_results.txt + log_microbenchmark_results.txt + extracted_failed_tests.txt + retention-days: 14 From 215d53185867c7577058359b138c5b08b4a3b40f Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 23 Feb 2026 17:40:18 +0100 Subject: [PATCH 23/43] Fix tests --- .github/workflows/rocm-ci.yml | 55 +++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/.github/workflows/rocm-ci.yml 
b/.github/workflows/rocm-ci.yml index e74aff820..cbf52811e 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -24,14 +24,13 @@ jobs: - name: Build Apex Wheel in Docker run: | docker run --rm -v ${{ github.workspace }}:/workspace -w /workspace ${{ env.DOCKER_IMAGE }} bash -c " - pip install --upgrade pip pip install build ninja wheel packaging # Build apex wheel python3 -m build --wheel --no-isolation -C--build-option=--cpp_ext -C--build-option=--cuda_ext - # Fix permissions so GitHub Actions can upload the artifact + # Fix permissions for GitHub Actions artifact upload chown -R $(id -u):$(id -g) dist/ " @@ -43,7 +42,7 @@ jobs: retention-days: 7 test: - name: Run Unit Tests and Benchmarks + name: Run Unit Tests runs-on: linux-apex-mi325-8 needs: build steps: @@ -61,42 +60,54 @@ jobs: - name: Run Tests in Docker run: | docker run --rm \ - --device=/dev/kfd --device=/dev/dri --group-add=video \ + --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host \ -v ${{ github.workspace }}:/workspace -w /workspace \ ${{ env.DOCKER_IMAGE }} bash -c " - # Install the built wheel + # Enforce strict error handling for pipes so failures aren't masked + set -o pipefail + EXIT_CODE=0 + + # Install dependencies and the built wheel + pip install expecttest # Fixes ModuleNotFoundError pip install dist/apex-*.whl - # Run LO unit tests - echo 'Running LO unit tests...' + # Run L0 tests + echo 'Running L0 tests...' cd tests/L0 - sh run_rocm.sh 2>&1 | tee ../../log_L0_results.txt + sh run_rocm.sh 2>&1 | tee ../../log_L0_results.txt || EXIT_CODE=\$? cd ../../ - # Run extensions import tests - echo 'Running Extension Import tests...' - python3 tests/test_extension_import.py 2>&1 | tee log_import_results.txt + # Run contrib tests + echo 'Running Contrib tests...' + cd apex/contrib/test + python3 run_rocm_extensions.py 2>&1 | tee ../../../log_contrib_results.txt || EXIT_CODE=\$? + cd ../../.. 
- # Run micro-benchmarking - echo 'Running Micro-benchmarking...' - python3 micro_benchmarking_pytorch.py --network resnet50 2>&1 | tee log_microbenchmark_results.txt + echo 'Running Peer Halo Exchange tests...' + torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee log_halo_results.txt || EXIT_CODE=\$? - # Extract failed tests from log file - grep -i 'fail\|error' log_L0_results.txt log_import_results.txt > extracted_failed_tests.txt || true + # Run Distributed tests + echo 'Running Distributed Synced BatchNorm tests...' + cd tests/distributed/synced_batchnorm + sh unit_test.sh 2>&1 | tee ../../../log_syncbn_results.txt || EXIT_CODE=\$? + cd ../../.. - # Fix permissions for artifacts + # Fix permissions so GitHub Actions can upload the logs chown -R $(id -u):$(id -g) *.txt + + # Exit with the captured status code so the CI accurately reports failure + exit \$EXIT_CODE " - - name: Upload Test Logs and Extracted Failures - if: always() # Ensure logs upload even if tests fail + - name: Upload Test Logs + if: always() # Uploads logs even if tests fail uses: actions/upload-artifact@v4 with: name: test-logs path: | log_L0_results.txt - log_import_results.txt - log_microbenchmark_results.txt - extracted_failed_tests.txt + log_contrib_results.txt + log_halo_results.txt + log_syncbn_results.txt retention-days: 14 From 46c2da37735abeeab41c1aff101873dc459858e0 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Tue, 24 Feb 2026 13:40:10 +0100 Subject: [PATCH 24/43] Set large timeout for tests --- .github/workflows/rocm-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index cbf52811e..a56c285b1 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -43,6 +43,7 @@ jobs: test: name: Run Unit Tests + timeout-minutes: 720 runs-on: linux-apex-mi325-8 needs: build steps: From 03f3d2cdec4d622d27f3866bf0206be91c99ba61 Mon Sep 17 00:00:00 2001 From: 
leo-amd Date: Tue, 24 Feb 2026 16:36:07 +0100 Subject: [PATCH 25/43] Split the steps --- .github/workflows/rocm-ci.yml | 71 +++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index a56c285b1..ae4273d54 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -27,10 +27,8 @@ jobs: pip install --upgrade pip pip install build ninja wheel packaging - # Build apex wheel python3 -m build --wheel --no-isolation -C--build-option=--cpp_ext -C--build-option=--cuda_ext - # Fix permissions for GitHub Actions artifact upload chown -R $(id -u):$(id -g) dist/ " @@ -58,51 +56,58 @@ jobs: name: apex-wheel path: dist/ - - name: Run Tests in Docker + - name: Start Background Docker Container run: | - docker run --rm \ + docker run -d --name apex-test-container \ --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host \ -v ${{ github.workspace }}:/workspace -w /workspace \ - ${{ env.DOCKER_IMAGE }} bash -c " - - # Enforce strict error handling for pipes so failures aren't masked - set -o pipefail - EXIT_CODE=0 - - # Install dependencies and the built wheel - pip install expecttest # Fixes ModuleNotFoundError + ${{ env.DOCKER_IMAGE }} sleep infinity + + - name: Install Dependencies and Built Wheel + run: | + docker exec apex-test-container bash -c " + pip install expecttest pip install dist/apex-*.whl + " - # Run L0 tests - echo 'Running L0 tests...' + - name: Run L0 tests + run: | + docker exec apex-test-container bash -c " cd tests/L0 - sh run_rocm.sh 2>&1 | tee ../../log_L0_results.txt || EXIT_CODE=\$? - cd ../../ + sh run_rocm.sh 2>&1 | tee ../../log_L0_results.txt + " - # Run contrib tests - echo 'Running Contrib tests...' + - name: Run Contrib tests + run: | + docker exec apex-test-container bash -c " cd apex/contrib/test - python3 run_rocm_extensions.py 2>&1 | tee ../../../log_contrib_results.txt || EXIT_CODE=\$? - cd ../../.. 
+ python3 run_rocm_extensions.py 2>&1 | tee ../../../log_contrib_results.txt + " - echo 'Running Peer Halo Exchange tests...' - torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee log_halo_results.txt || EXIT_CODE=\$? + - name: Run Peer Halo Exchange tests + run: | + docker exec apex-test-container bash -c " + torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee log_halo_results.txt + " - # Run Distributed tests - echo 'Running Distributed Synced BatchNorm tests...' + - name: Run Distributed Synced BatchNorm tests + run: | + docker exec apex-test-container bash -c " cd tests/distributed/synced_batchnorm - sh unit_test.sh 2>&1 | tee ../../../log_syncbn_results.txt || EXIT_CODE=\$? - cd ../../.. - - # Fix permissions so GitHub Actions can upload the logs - chown -R $(id -u):$(id -g) *.txt - - # Exit with the captured status code so the CI accurately reports failure - exit \$EXIT_CODE + sh unit_test.sh 2>&1 | tee ../../../log_syncbn_results.txt " + - name: Fix Artifact Permissions + if: always() # Run even if a test step fails + run: | + docker exec apex-test-container bash -c "chown -R $(id -u):$(id -g) *.txt" + + - name: Cleanup Background Container + if: always() + run: docker rm -f apex-test-container + - name: Upload Test Logs - if: always() # Uploads logs even if tests fail + if: always() uses: actions/upload-artifact@v4 with: name: test-logs From 43fedde1edb772c3e4d9f90fbe8004d2e694aa35 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Wed, 25 Feb 2026 11:06:13 +0100 Subject: [PATCH 26/43] Implement discussed features --- .github/workflows/rocm-ci.yml | 66 +++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index ae4273d54..14983065d 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -7,9 +7,40 @@ on: schedule: - cron: '0 0 * * 0' # 
weekly workflow_dispatch: + inputs: + apex_branch: + description: 'Apex branch to build' + required: false + default: 'master' + type: string + docker_image: + description: 'Docker image to use' + required: false + default: 'rocm/pytorch:rocm7.2_ubuntu24.04_py3.12_pytorch_release_2.9.1' + type: string + run_l0: + description: 'Run L0 tests' + required: false + default: true + type: boolean + run_contrib: + description: 'Run Contrib tests' + required: false + default: true + type: boolean + run_halo: + description: 'Run Peer Halo Exchange tests' + required: false + default: true + type: boolean + run_syncbn: + description: 'Run Distributed Synced BatchNorm tests' + required: false + default: true + type: boolean env: - DOCKER_IMAGE: "rocm/pytorch:rocm7.2_ubuntu24.04_py3.12_pytorch_release_2.9.1" + DOCKER_IMAGE: ${{ inputs.docker_image || 'rocm/pytorch:rocm7.2_ubuntu24.04_py3.12_pytorch_release_2.9.1' }} jobs: build: @@ -19,6 +50,8 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 with: + # Uses the specified branch on manual runs; defaults to the PR/Push context otherwise + ref: ${{ github.event_name == 'workflow_dispatch' && inputs.apex_branch || '' }} submodules: recursive - name: Build Apex Wheel in Docker @@ -48,6 +81,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 with: + ref: ${{ github.event_name == 'workflow_dispatch' && inputs.apex_branch || '' }} submodules: recursive - name: Download Wheel Artifact @@ -60,47 +94,57 @@ jobs: run: | docker run -d --name apex-test-container \ --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host \ + -e OMP_NUM_THREADS=48 \ -v ${{ github.workspace }}:/workspace -w /workspace \ ${{ env.DOCKER_IMAGE }} sleep infinity - name: Install Dependencies and Built Wheel run: | docker exec apex-test-container bash -c " + set -e pip install expecttest pip install dist/apex-*.whl " - name: Run L0 tests + if: ${{ github.event_name != 'workflow_dispatch' || inputs.run_l0 }} run: | docker exec 
apex-test-container bash -c " + set -eo pipefail cd tests/L0 - sh run_rocm.sh 2>&1 | tee ../../log_L0_results.txt + sh run_rocm.sh 2>&1 | tee ../../L0_results.log " - name: Run Contrib tests + if: ${{ (success() || failure()) && (github.event_name != 'workflow_dispatch' || inputs.run_contrib) }} run: | docker exec apex-test-container bash -c " + set -eo pipefail cd apex/contrib/test - python3 run_rocm_extensions.py 2>&1 | tee ../../../log_contrib_results.txt + python3 run_rocm_extensions.py 2>&1 | tee ../../../contrib_results.log " - name: Run Peer Halo Exchange tests + if: ${{ (success() || failure()) && (github.event_name != 'workflow_dispatch' || inputs.run_halo) }} run: | docker exec apex-test-container bash -c " - torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee log_halo_results.txt + set -eo pipefail + torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee halo_results.log " - name: Run Distributed Synced BatchNorm tests + if: ${{ (success() || failure()) && (github.event_name != 'workflow_dispatch' || inputs.run_syncbn) }} run: | docker exec apex-test-container bash -c " + set -eo pipefail cd tests/distributed/synced_batchnorm - sh unit_test.sh 2>&1 | tee ../../../log_syncbn_results.txt + sh unit_test.sh 2>&1 | tee ../../../syncbn_results.log " - name: Fix Artifact Permissions - if: always() # Run even if a test step fails + if: always() run: | - docker exec apex-test-container bash -c "chown -R $(id -u):$(id -g) *.txt" + docker exec apex-test-container bash -c "chown -R $(id -u):$(id -g) *.log" - name: Cleanup Background Container if: always() @@ -112,8 +156,8 @@ jobs: with: name: test-logs path: | - log_L0_results.txt - log_contrib_results.txt - log_halo_results.txt - log_syncbn_results.txt + L0_results.log + contrib_results.log + halo_results.log + syncbn_results.log retention-days: 14 From 2181dd57360dbb5dd1599d1c7e81140434404920 Mon Sep 17 00:00:00 2001 From: 
leo-amd Date: Wed, 25 Feb 2026 12:36:46 +0100 Subject: [PATCH 27/43] Fix tests --- .github/workflows/rocm-ci.yml | 2 +- .../peer_memory/peer_halo_exchange_module_tests.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 14983065d..45963c90b 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -102,7 +102,7 @@ jobs: run: | docker exec apex-test-container bash -c " set -e - pip install expecttest + pip install expecttest onnxscript pip install dist/apex-*.whl " diff --git a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py index bd85354af..69dc22f4b 100644 --- a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py +++ b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py @@ -1,3 +1,4 @@ +import os import torch from apex.contrib.peer_memory import PeerMemoryPool, PeerHaloExchanger1d import peer_memory_cuda as pm @@ -143,11 +144,14 @@ def W_split_tests(N, C, H, W, half_halo, rank, world_size, halo_ex, num_steps): def main(): # for this trivial example peer_rank == rank and peer_group_size == world_size - + + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + torch.cuda.set_device(local_rank) torch.distributed.init_process_group("nccl") + rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() - torch.cuda.set_device(rank) + peer_ranks = [i for i in range(world_size)] pool = PeerMemoryPool(64*1024, 2*1024*1024, peer_ranks) From 8c056d5d79efadc92e05f65dde41637b6ab4be34 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Wed, 25 Feb 2026 19:37:42 +0100 Subject: [PATCH 28/43] Fix tests more --- .github/workflows/rocm-ci.yml | 8 +++++-- .../peer_halo_exchange_module_tests.py | 22 ++++++++++++++----- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 
45963c90b..3f563957a 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -94,7 +94,9 @@ jobs: run: | docker run -d --name apex-test-container \ --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host \ - -e OMP_NUM_THREADS=48 \ + -e OMP_NUM_THREADS=8 \ + -e TORCH_NCCL_ASYNC_ERROR_HANDLING=1 \ + -e NCCL_DEBUG=WARN \ -v ${{ github.workspace }}:/workspace -w /workspace \ ${{ env.DOCKER_IMAGE }} sleep infinity @@ -129,7 +131,9 @@ jobs: run: | docker exec apex-test-container bash -c " set -eo pipefail - torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee halo_results.log + torchrun --nproc_per_node 8 \ + --bind_all \ + apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee halo_results.log " - name: Run Distributed Synced BatchNorm tests diff --git a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py index 69dc22f4b..c2d1f7c1b 100644 --- a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py +++ b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py @@ -99,9 +99,11 @@ def single_test(peer_rank, peer_group_size, halo_ex, C, H, W, half_halo, dtype, nccl_halo_ex(peer_rank, peer_group_size, y2, half_halo, explicit_nhwc, H_split) list_y2.append(y2.clone()) y2.copy_(y3) - is_equal = [torch.all(torch.eq(yy,yy2)) for yy,yy2 in zip(list_y,list_y2)] - is_equal = torch.tensor(is_equal, dtype=torch.bool) - is_equal = torch.all(is_equal) + + # Stack the 100 CUDA tensors directly on the GPU. 
No PCIe syncs + is_equal_stack = torch.stack([torch.eq(yy, yy2).all() for yy, yy2 in zip(list_y, list_y2)]) + is_equal = is_equal_stack.all().item() + if peer_rank == 0: if memory_format == 1: memory_format_str = "explicit_nhwc" @@ -144,14 +146,22 @@ def W_split_tests(N, C, H, W, half_halo, rank, world_size, halo_ex, num_steps): def main(): # for this trivial example peer_rank == rank and peer_group_size == world_size - + + # Force the AMD driver to only see the GPU assigned to this specific process local_rank = int(os.environ.get("LOCAL_RANK", 0)) - torch.cuda.set_device(local_rank) + os.environ["HIP_VISIBLE_DEVICES"] = str(local_rank) + os.environ["ROCR_VISIBLE_DEVICES"] = str(local_rank) + + # Because the process only sees 1 GPU now, it is always index 0 + torch.cuda.set_device(0) + + # Iinitialize the process group torch.distributed.init_process_group("nccl") - + rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() + torch.cuda.set_device(rank) peer_ranks = [i for i in range(world_size)] pool = PeerMemoryPool(64*1024, 2*1024*1024, peer_ranks) From fbba6e07f8f8b00013e7a797c6524b37a3948687 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Wed, 25 Feb 2026 20:43:52 +0100 Subject: [PATCH 29/43] Try tests --- .github/workflows/rocm-ci.yml | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 3f563957a..d9f8846f7 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -108,23 +108,23 @@ jobs: pip install dist/apex-*.whl " - - name: Run L0 tests - if: ${{ github.event_name != 'workflow_dispatch' || inputs.run_l0 }} - run: | - docker exec apex-test-container bash -c " - set -eo pipefail - cd tests/L0 - sh run_rocm.sh 2>&1 | tee ../../L0_results.log - " - - - name: Run Contrib tests - if: ${{ (success() || failure()) && (github.event_name != 'workflow_dispatch' || inputs.run_contrib) }} - run: | - 
docker exec apex-test-container bash -c " - set -eo pipefail - cd apex/contrib/test - python3 run_rocm_extensions.py 2>&1 | tee ../../../contrib_results.log - " + # - name: Run L0 tests + # if: ${{ github.event_name != 'workflow_dispatch' || inputs.run_l0 }} + # run: | + # docker exec apex-test-container bash -c " + # set -eo pipefail + # cd tests/L0 + # sh run_rocm.sh 2>&1 | tee ../../L0_results.log + # " + + # - name: Run Contrib tests + # if: ${{ (success() || failure()) && (github.event_name != 'workflow_dispatch' || inputs.run_contrib) }} + # run: | + # docker exec apex-test-container bash -c " + # set -eo pipefail + # cd apex/contrib/test + # python3 run_rocm_extensions.py 2>&1 | tee ../../../contrib_results.log + # " - name: Run Peer Halo Exchange tests if: ${{ (success() || failure()) && (github.event_name != 'workflow_dispatch' || inputs.run_halo) }} @@ -132,7 +132,6 @@ jobs: docker exec apex-test-container bash -c " set -eo pipefail torchrun --nproc_per_node 8 \ - --bind_all \ apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee halo_results.log " From c2ad547487ed819170a6d98cd65e29fa29496756 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Wed, 25 Feb 2026 22:35:00 +0100 Subject: [PATCH 30/43] Removed the HIP_VISIBLE_DEVICES code --- .../peer_memory/peer_halo_exchange_module_tests.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py index c2d1f7c1b..809210df6 100644 --- a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py +++ b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py @@ -146,22 +146,14 @@ def W_split_tests(N, C, H, W, half_halo, rank, world_size, halo_ex, num_steps): def main(): # for this trivial example peer_rank == rank and peer_group_size == world_size - - # Force the AMD driver to only see the GPU assigned to this specific process - local_rank = 
int(os.environ.get("LOCAL_RANK", 0)) - os.environ["HIP_VISIBLE_DEVICES"] = str(local_rank) - os.environ["ROCR_VISIBLE_DEVICES"] = str(local_rank) - - # Because the process only sees 1 GPU now, it is always index 0 - torch.cuda.set_device(0) - # Iinitialize the process group + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + torch.cuda.set_device(local_rank) torch.distributed.init_process_group("nccl") rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() - torch.cuda.set_device(rank) peer_ranks = [i for i in range(world_size)] pool = PeerMemoryPool(64*1024, 2*1024*1024, peer_ranks) @@ -173,6 +165,5 @@ def main(): H_split_tests(1,64,336,200, half_halo,rank,world_size,halo_ex,num_steps) W_split_tests(1,64,200,336, half_halo,rank,world_size,halo_ex,num_steps) - if __name__ == "__main__": main() From b8cb6bd0c0f6dbeb82aab2144c73a01a182405a5 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Wed, 25 Feb 2026 23:21:05 +0100 Subject: [PATCH 31/43] Lock the RCCL context --- .github/workflows/rocm-ci.yml | 39 ++++++++++--------- .../peer_halo_exchange_module_tests.py | 12 ++++-- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index d9f8846f7..d20bbac6f 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -108,31 +108,32 @@ jobs: pip install dist/apex-*.whl " - # - name: Run L0 tests - # if: ${{ github.event_name != 'workflow_dispatch' || inputs.run_l0 }} - # run: | - # docker exec apex-test-container bash -c " - # set -eo pipefail - # cd tests/L0 - # sh run_rocm.sh 2>&1 | tee ../../L0_results.log - # " - - # - name: Run Contrib tests - # if: ${{ (success() || failure()) && (github.event_name != 'workflow_dispatch' || inputs.run_contrib) }} - # run: | - # docker exec apex-test-container bash -c " - # set -eo pipefail - # cd apex/contrib/test - # python3 run_rocm_extensions.py 2>&1 | tee ../../../contrib_results.log - # " + - name: Run L0 tests 
+ if: ${{ github.event_name != 'workflow_dispatch' || inputs.run_l0 }} + run: | + docker exec apex-test-container bash -c " + set -eo pipefail + cd tests/L0 + sh run_rocm.sh 2>&1 | tee ../../L0_results.log + " + + - name: Run Contrib tests + if: ${{ (success() || failure()) && (github.event_name != 'workflow_dispatch' || inputs.run_contrib) }} + run: | + docker exec apex-test-container bash -c " + set -eo pipefail + cd apex/contrib/test + python3 run_rocm_extensions.py 2>&1 | tee ../../../contrib_results.log + " - name: Run Peer Halo Exchange tests if: ${{ (success() || failure()) && (github.event_name != 'workflow_dispatch' || inputs.run_halo) }} run: | docker exec apex-test-container bash -c " set -eo pipefail - torchrun --nproc_per_node 8 \ - apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee halo_results.log + export HSA_FORCE_FINE_GRAIN_PCIE=1 + export HSA_ENABLE_SDMA=0 + torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee halo_results.log " - name: Run Distributed Synced BatchNorm tests diff --git a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py index 809210df6..0468d6923 100644 --- a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py +++ b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py @@ -149,21 +149,25 @@ def main(): local_rank = int(os.environ.get("LOCAL_RANK", 0)) torch.cuda.set_device(local_rank) - torch.distributed.init_process_group("nccl") - + + # Bind the RCCL context + torch.distributed.init_process_group( + "nccl", + device_id=torch.device(f"cuda:{local_rank}") + ) + rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() peer_ranks = [i for i in range(world_size)] pool = PeerMemoryPool(64*1024, 2*1024*1024, peer_ranks) - num_steps = 100 - half_halo = 1 halo_ex = PeerHaloExchanger1d(peer_ranks, rank, pool, half_halo) H_split_tests(1,64,336,200, 
half_halo,rank,world_size,halo_ex,num_steps) W_split_tests(1,64,200,336, half_halo,rank,world_size,halo_ex,num_steps) + if __name__ == "__main__": main() From 1dcf247a2e9d72c840cc5c840c28baeb82e0059a Mon Sep 17 00:00:00 2001 From: leo-amd Date: Thu, 26 Feb 2026 14:14:33 +0100 Subject: [PATCH 32/43] Force the CPU to wait for the GPUs, and force all GPUs to wait for each other before any rank resets the memory pool --- apex/contrib/peer_memory/peer_halo_exchange_module_tests.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py index 0468d6923..73a8d8df9 100644 --- a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py +++ b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py @@ -91,8 +91,11 @@ def single_test(peer_rank, peer_group_size, halo_ex, C, H, W, half_halo, dtype, halo_ex(y, H_split, explicit_nhwc, numSM) list_y.append(y.clone()) y.copy_(y3) + torch.cuda.synchronize() + torch.distributed.barrier() halo_ex.peer_pool.reset() torch.distributed.barrier() + y2 = y3.clone() list_y2 = [] for step in range(num_steps): From 7726b0cbe8dd73555ae92b06ad7f29d4326af714 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Thu, 26 Feb 2026 15:24:47 +0100 Subject: [PATCH 33/43] Revert --- apex/contrib/peer_memory/peer_halo_exchange_module_tests.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py index 73a8d8df9..0468d6923 100644 --- a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py +++ b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py @@ -91,11 +91,8 @@ def single_test(peer_rank, peer_group_size, halo_ex, C, H, W, half_halo, dtype, halo_ex(y, H_split, explicit_nhwc, numSM) list_y.append(y.clone()) y.copy_(y3) - torch.cuda.synchronize() - torch.distributed.barrier()
halo_ex.peer_pool.reset() torch.distributed.barrier() - y2 = y3.clone() list_y2 = [] for step in range(num_steps): From 0eec57fe16520a0fa06eacd52f4d419f6c52ef05 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Fri, 27 Feb 2026 16:08:44 +0100 Subject: [PATCH 34/43] Resolve comments --- .github/workflows/rocm-ci.yml | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index d20bbac6f..8f8dadce3 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -8,16 +8,21 @@ on: - cron: '0 0 * * 0' # weekly workflow_dispatch: inputs: - apex_branch: - description: 'Apex branch to build' + apex_gitref: + description: 'Apex branch or commit SHA to build' required: false default: 'master' type: string docker_image: description: 'Docker image to use' required: false - default: 'rocm/pytorch:rocm7.2_ubuntu24.04_py3.12_pytorch_release_2.9.1' + default: 'rocm/pytorch:latest' type: string + run_extension: + description: 'Run Extension Import tests' + required: false + default: true + type: boolean run_l0: description: 'Run L0 tests' required: false @@ -40,7 +45,7 @@ on: type: boolean env: - DOCKER_IMAGE: ${{ inputs.docker_image || 'rocm/pytorch:rocm7.2_ubuntu24.04_py3.12_pytorch_release_2.9.1' }} + DOCKER_IMAGE: ${{ inputs.docker_image || 'rocm/pytorch:latest' }} jobs: build: @@ -51,7 +56,7 @@ jobs: uses: actions/checkout@v4 with: # Uses the specified branch on manual runs; defaults to the PR/Push context otherwise - ref: ${{ github.event_name == 'workflow_dispatch' && inputs.apex_branch || '' }} + ref: ${{ github.event_name == 'workflow_dispatch' && inputs.apex_gitref || '' }} submodules: recursive - name: Build Apex Wheel in Docker @@ -81,7 +86,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'workflow_dispatch' && inputs.apex_branch || '' }} + ref: ${{ github.event_name == 'workflow_dispatch' && 
inputs.apex_gitref || '' }} submodules: recursive - name: Download Wheel Artifact @@ -108,8 +113,17 @@ jobs: pip install dist/apex-*.whl " + - name: Run Extension Import tests + if: ${{ (always()) && (github.event_name != 'workflow_dispatch' || inputs.run_extension) }} + run: | + docker exec apex-test-container bash -c " + set -eo pipefail + cd tests + python3 test_extension_import.py 2>&1 | tee ../../extension_import_results.log + " + - name: Run L0 tests - if: ${{ github.event_name != 'workflow_dispatch' || inputs.run_l0 }} + if: ${{ (always()) && (github.event_name != 'workflow_dispatch' || inputs.run_l0) }} run: | docker exec apex-test-container bash -c " set -eo pipefail @@ -118,7 +132,7 @@ " - name: Run Contrib tests - if: ${{ (always()) && (github.event_name != 'workflow_dispatch' || inputs.run_contrib) }} + if: ${{ (always()) && (github.event_name != 'workflow_dispatch' || inputs.run_contrib) }} run: | docker exec apex-test-container bash -c " set -eo pipefail @@ -160,8 +174,5 @@ with: name: test-logs path: | - L0_results.log - contrib_results.log - halo_results.log - syncbn_results.log + *.log retention-days: 14 From d64c451d730249668fb3a0337e4dc1241a7a6477 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Fri, 27 Feb 2026 17:06:03 +0100 Subject: [PATCH 35/43] Housekeeping --- .github/workflows/rocm-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 8f8dadce3..a119e35a6 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -114,12 +114,12 @@ jobs: " - name: Run Extension Import tests - if: ${{ (always()) && (github.event_name != 'workflow_dispatch' || inputs.run_extension) }} + if: ${{ github.event_name != 'workflow_dispatch' || inputs.run_extension }} run: | docker exec apex-test-container bash -c " set -eo pipefail cd tests - 
python3 test_extension_import.py 2>&1 | tee ../extension_import_results.log " - name: Run L0 tests @@ -132,7 +132,7 @@ jobs: " - name: Run Contrib tests - if: ${{ (always()) && (github.event_name != 'workflow_dispatch' || inputs.run_contrib) }} + if: ${{ (success() || failure()) && (github.event_name != 'workflow_dispatch' || inputs.run_contrib) }} run: | docker exec apex-test-container bash -c " set -eo pipefail From 2fbaa90644ad3e46ed7512d9cc009b0a34c52a84 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 9 Mar 2026 17:19:19 +0100 Subject: [PATCH 36/43] Run CI From 2958b4f627e653c71f4c3504ba41c33843504367 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 9 Mar 2026 17:23:42 +0100 Subject: [PATCH 37/43] Propagate import errors --- tests/test_extension_import.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test_extension_import.py b/tests/test_extension_import.py index 72d88688e..0f8b530ed 100644 --- a/tests/test_extension_import.py +++ b/tests/test_extension_import.py @@ -229,7 +229,14 @@ def test_extensions_import(self): error_display = error_message[:17] + "..." 
if len(error_message) > 20 else error_message print(f"{extension:<30} {success:<10} {error_display:<20}") print("-" * 60) - + + # Fail the test if any extensions failed to import + failed_extensions = [ext for ext, success, _ in results if not success] + self.assertEqual( + len(failed_extensions), 0, + f"{len(failed_extensions)} extension(s) failed to import: {', '.join(failed_extensions)}" + ) + if __name__ == '__main__': unittest.main() \ No newline at end of file From 82f689a370259619a71205cb52bb00304594493f Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 9 Mar 2026 19:09:03 +0100 Subject: [PATCH 38/43] Extension tests fix --- tests/test_extension_import.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/tests/test_extension_import.py b/tests/test_extension_import.py index 0f8b530ed..e5fc8ebfd 100644 --- a/tests/test_extension_import.py +++ b/tests/test_extension_import.py @@ -101,30 +101,36 @@ def get_environment(self): """ # Get current environment and ensure CUDA/PyTorch libraries are available env = os.environ.copy() - - # Add common CUDA library paths + ld_library_path = env.get('LD_LIBRARY_PATH', '') - cuda_paths = [ - '/usr/local/cuda/lib64', - '/usr/local/cuda/lib', - '/opt/conda/lib', - '/usr/lib/x86_64-linux-gnu' - ] - + extra_paths = [] + # Add PyTorch library path try: import torch torch_lib_path = os.path.join(os.path.dirname(torch.__file__), 'lib') if os.path.exists(torch_lib_path): - cuda_paths.append(torch_lib_path) + extra_paths.append(torch_lib_path) except ImportError: pass - + + # Add ROCm library path if present + rocm_path = os.environ.get('ROCM_PATH', '/opt/rocm') + rocm_lib = os.path.join(rocm_path, 'lib') + if os.path.exists(rocm_lib): + extra_paths.append(rocm_lib) + + # Add common CUDA library paths (only those that exist) + for path in ['/usr/local/cuda/lib64', '/usr/local/cuda/lib', + '/opt/conda/lib', '/usr/lib/x86_64-linux-gnu']: + if os.path.isdir(path): + 
extra_paths.append(path) + # Update LD_LIBRARY_PATH if ld_library_path: - env['LD_LIBRARY_PATH'] = ':'.join(cuda_paths) + ':' + ld_library_path + env['LD_LIBRARY_PATH'] = ':'.join(extra_paths) + ':' + ld_library_path else: - env['LD_LIBRARY_PATH'] = ':'.join(cuda_paths) + env['LD_LIBRARY_PATH'] = ':'.join(extra_paths) return env From ee7b996fcddd5aa30840e27ced0db156b4f5eea8 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Mon, 9 Mar 2026 20:38:51 +0100 Subject: [PATCH 39/43] Apply launch bounds unconditionally --- .../csrc/peer_memory/peer_memory_cuda.cu | 100 ++++++++---------- 1 file changed, 44 insertions(+), 56 deletions(-) diff --git a/apex/contrib/csrc/peer_memory/peer_memory_cuda.cu b/apex/contrib/csrc/peer_memory/peer_memory_cuda.cu index 188900128..934de0500 100644 --- a/apex/contrib/csrc/peer_memory/peer_memory_cuda.cu +++ b/apex/contrib/csrc/peer_memory/peer_memory_cuda.cu @@ -27,6 +27,23 @@ namespace cg = cooperative_groups; } \ } while(0) +#define CUDACHECK_FATAL(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + TORCH_CHECK(false, \ + "CUDA error at ", __FILE__, ":", __LINE__, \ + " '", cudaGetErrorString(err), "'"); \ + } \ +} while(0) + +#ifdef USE_ROCM +#define LAUNCH_COOPERATIVE_KERNEL(kernel, grid, block, args, stream) \ + CUDACHECK_FATAL(hipLaunchCooperativeKernel((void*)(kernel), (grid), (block), (args), 0, (stream))) +#else +#define LAUNCH_COOPERATIVE_KERNEL(kernel, grid, block, args, stream) \ + CUDACHECK_FATAL(cudaLaunchCooperativeKernel((void*)(kernel), (grid), (block), (args), 0, (stream))) +#endif + // C++17 removes 'register' storage keyword #if __cplusplus < 201703L #define REGISTER register @@ -353,9 +370,7 @@ __device__ void clear_flag( } template -#if __CUDA_ARCH__ == 700 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 900 __launch_bounds__(128, 16) -#endif __global__ void push_pull_halos_1d_kernel( // top halo, const T* toh, int toh_stride_C, int toh_stride_H, int toh_stride_W, // top output halo @@ -647,31 +662,22 @@ 
void push_pull_halos_1d( }; if (top_zero) { int numBlocksPerSm; - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); + CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); + TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); dim3 grid(numSM*numBlocksPerSm,1,1); -#ifdef USE_ROCM - hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#else - cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#endif + LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); } else if (btm_zero) { int numBlocksPerSm; - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); + CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); + TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); dim3 grid(numSM*numBlocksPerSm,1,1); -#ifdef USE_ROCM - hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#else - cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#endif + LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); } else { int numBlocksPerSm; - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); + CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); + TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); dim3 grid(numSM*numBlocksPerSm,1,1); -#ifdef 
USE_ROCM - hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#else - cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#endif + LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); } } else { // cannot do int4 transfers @@ -691,55 +697,37 @@ void push_pull_halos_1d( int numBlocksPerSm; if (is_nhwc) { if (top_zero) { - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); + CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); + TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); dim3 grid(numSM*numBlocksPerSm,1,1); -#ifdef USE_ROCM - hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#else - cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#endif + LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); } else if (btm_zero) { - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); + CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); + TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); dim3 grid(numSM*numBlocksPerSm,1,1); -#ifdef USE_ROCM - hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#else - cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#endif + LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); } else { - 
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); + CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); + TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); dim3 grid(numSM*numBlocksPerSm,1,1); -#ifdef USE_ROCM - hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#else - cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#endif + LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); } } else { if (top_zero) { - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); + CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); + TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); dim3 grid(numSM*numBlocksPerSm,1,1); -#ifdef USE_ROCM - hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#else - cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#endif + LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); } else if (btm_zero) { - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); + CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); + TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); dim3 grid(numSM*numBlocksPerSm,1,1); -#ifdef USE_ROCM - hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, 
kernelArgs, 0, current_stream); -#else - cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#endif + LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); } else { - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); + CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); + TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); dim3 grid(numSM*numBlocksPerSm,1,1); -#ifdef USE_ROCM - hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#else - cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); -#endif + LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); } } } From 0ec6f259837c50fdca30d3677ba62029a0382c4e Mon Sep 17 00:00:00 2001 From: leo-amd Date: Tue, 10 Mar 2026 12:59:49 +0100 Subject: [PATCH 40/43] Define USE_ROCM during JIT compilation --- .github/workflows/rocm-ci.yml | 8 +++++--- op_builder/builder.py | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index a119e35a6..47985f2cc 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -3,9 +3,11 @@ name: Apex ROCm CI on: pull_request: types: [opened, synchronize, ready_for_review] - branches: [master] - schedule: - - cron: '0 0 * * 0' # weekly + branches: + - master + - release/1.8.0 + - release/1.9.0 + - release/1.10.0 workflow_dispatch: inputs: apex_gitref: diff --git a/op_builder/builder.py b/op_builder/builder.py index 60e490b2b..86f28532e 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -586,6 +586,7 @@ def jit_load(self, verbose=True): if 
self.is_rocm_pytorch(): cxx_args.append("-D__HIP_PLATFORM_AMD__=1") + cxx_args.append("-DUSE_ROCM") os.environ["PYTORCH_ROCM_ARCH"] = self.get_rocm_gpu_arch() cxx_args.append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size()) @@ -781,6 +782,7 @@ def nvcc_args(self): args += [ '-std=c++17', '-U__HIP_NO_HALF_OPERATORS__', '-U__HIP_NO_HALF_CONVERSIONS__', '-U__HIP_NO_HALF2_OPERATORS__', + '-DUSE_ROCM', '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR ] From b93977f80ca8683dbc46f964f813b5c43fee882c Mon Sep 17 00:00:00 2001 From: leo-amd Date: Tue, 10 Mar 2026 14:31:33 +0100 Subject: [PATCH 41/43] Revert some changes --- .../csrc/peer_memory/peer_memory_cuda.cu | 100 ++++++++++-------- .../peer_halo_exchange_module_tests.py | 25 ++--- op_builder/builder.py | 2 - 3 files changed, 64 insertions(+), 63 deletions(-) diff --git a/apex/contrib/csrc/peer_memory/peer_memory_cuda.cu b/apex/contrib/csrc/peer_memory/peer_memory_cuda.cu index 934de0500..188900128 100644 --- a/apex/contrib/csrc/peer_memory/peer_memory_cuda.cu +++ b/apex/contrib/csrc/peer_memory/peer_memory_cuda.cu @@ -27,23 +27,6 @@ namespace cg = cooperative_groups; } \ } while(0) -#define CUDACHECK_FATAL(cmd) do { \ - cudaError_t err = cmd; \ - if( err != cudaSuccess ) { \ - TORCH_CHECK(false, \ - "CUDA error at ", __FILE__, ":", __LINE__, \ - " '", cudaGetErrorString(err), "'"); \ - } \ -} while(0) - -#ifdef USE_ROCM -#define LAUNCH_COOPERATIVE_KERNEL(kernel, grid, block, args, stream) \ - CUDACHECK_FATAL(hipLaunchCooperativeKernel((void*)(kernel), (grid), (block), (args), 0, (stream))) -#else -#define LAUNCH_COOPERATIVE_KERNEL(kernel, grid, block, args, stream) \ - CUDACHECK_FATAL(cudaLaunchCooperativeKernel((void*)(kernel), (grid), (block), (args), 0, (stream))) -#endif - // C++17 removes 'register' storage keyword #if __cplusplus < 201703L #define REGISTER register @@ -370,7 +353,9 @@ __device__ void clear_flag( } template +#if __CUDA_ARCH__ == 700 || __CUDA_ARCH__ == 
800 || __CUDA_ARCH__ == 900 __launch_bounds__(128, 16) +#endif __global__ void push_pull_halos_1d_kernel( // top halo, const T* toh, int toh_stride_C, int toh_stride_H, int toh_stride_W, // top output halo @@ -662,22 +647,31 @@ void push_pull_halos_1d( }; if (top_zero) { int numBlocksPerSm; - CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); - TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); dim3 grid(numSM*numBlocksPerSm,1,1); - LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); +#ifdef USE_ROCM + hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#else + cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#endif } else if (btm_zero) { int numBlocksPerSm; - CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); - TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); dim3 grid(numSM*numBlocksPerSm,1,1); - LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); +#ifdef USE_ROCM + hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#else + cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#endif } else { int numBlocksPerSm; - CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); - TORCH_CHECK(numBlocksPerSm > 0, 
"push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); dim3 grid(numSM*numBlocksPerSm,1,1); - LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); +#ifdef USE_ROCM + hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#else + cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#endif } } else { // cannot do int4 transfers @@ -697,37 +691,55 @@ void push_pull_halos_1d( int numBlocksPerSm; if (is_nhwc) { if (top_zero) { - CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); - TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); dim3 grid(numSM*numBlocksPerSm,1,1); - LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); +#ifdef USE_ROCM + hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#else + cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#endif } else if (btm_zero) { - CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); - TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); dim3 grid(numSM*numBlocksPerSm,1,1); - LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); +#ifdef USE_ROCM + 
hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#else + cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#endif } else { - CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); - TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); dim3 grid(numSM*numBlocksPerSm,1,1); - LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); +#ifdef USE_ROCM + hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#else + cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#endif } } else { if (top_zero) { - CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); - TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); dim3 grid(numSM*numBlocksPerSm,1,1); - LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); +#ifdef USE_ROCM + hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#else + cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#endif } else if (btm_zero) { - CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); - TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); + 
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); dim3 grid(numSM*numBlocksPerSm,1,1); - LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); +#ifdef USE_ROCM + hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#else + cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#endif } else { - CUDACHECK_FATAL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0)); - TORCH_CHECK(numBlocksPerSm > 0, "push_pull_halos_1d: cooperative kernel requires at least 1 block per SM, got 0"); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, push_pull_halos_1d_kernel, numThreads, 0); dim3 grid(numSM*numBlocksPerSm,1,1); - LAUNCH_COOPERATIVE_KERNEL(push_pull_halos_1d_kernel, grid, block, kernelArgs, current_stream); +#ifdef USE_ROCM + hipLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#else + cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel, grid, block, kernelArgs, 0, current_stream); +#endif } } } diff --git a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py index 0468d6923..bd85354af 100644 --- a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py +++ b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py @@ -1,4 +1,3 @@ -import os import torch from apex.contrib.peer_memory import PeerMemoryPool, PeerHaloExchanger1d import peer_memory_cuda as pm @@ -99,11 +98,9 @@ def single_test(peer_rank, peer_group_size, halo_ex, C, H, W, half_halo, dtype, nccl_halo_ex(peer_rank, peer_group_size, y2, half_halo, explicit_nhwc, H_split) list_y2.append(y2.clone()) y2.copy_(y3) - - # Stack the 100 CUDA tensors directly on the GPU. 
No PCIe syncs - is_equal_stack = torch.stack([torch.eq(yy, yy2).all() for yy, yy2 in zip(list_y, list_y2)]) - is_equal = is_equal_stack.all().item() - + is_equal = [torch.all(torch.eq(yy,yy2)) for yy,yy2 in zip(list_y,list_y2)] + is_equal = torch.tensor(is_equal, dtype=torch.bool) + is_equal = torch.all(is_equal) if peer_rank == 0: if memory_format == 1: memory_format_str = "explicit_nhwc" @@ -146,22 +143,16 @@ def W_split_tests(N, C, H, W, half_halo, rank, world_size, halo_ex, num_steps): def main(): # for this trivial example peer_rank == rank and peer_group_size == world_size - - local_rank = int(os.environ.get("LOCAL_RANK", 0)) - torch.cuda.set_device(local_rank) - - # Bind the RCCL context - torch.distributed.init_process_group( - "nccl", - device_id=torch.device(f"cuda:{local_rank}") - ) - + + torch.distributed.init_process_group("nccl") rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() - + torch.cuda.set_device(rank) peer_ranks = [i for i in range(world_size)] pool = PeerMemoryPool(64*1024, 2*1024*1024, peer_ranks) + num_steps = 100 + half_halo = 1 halo_ex = PeerHaloExchanger1d(peer_ranks, rank, pool, half_halo) diff --git a/op_builder/builder.py b/op_builder/builder.py index 86f28532e..60e490b2b 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -586,7 +586,6 @@ def jit_load(self, verbose=True): if self.is_rocm_pytorch(): cxx_args.append("-D__HIP_PLATFORM_AMD__=1") - cxx_args.append("-DUSE_ROCM") os.environ["PYTORCH_ROCM_ARCH"] = self.get_rocm_gpu_arch() cxx_args.append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size()) @@ -782,7 +781,6 @@ def nvcc_args(self): args += [ '-std=c++17', '-U__HIP_NO_HALF_OPERATORS__', '-U__HIP_NO_HALF_CONVERSIONS__', '-U__HIP_NO_HALF2_OPERATORS__', - '-DUSE_ROCM', '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR ] From 35e23c8b8b9f8c7c4d1e5e8e0a649bfc8a63e038 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Tue, 10 Mar 2026 18:43:31 +0100 
Subject: [PATCH 42/43] Resolve comments --- .github/workflows/rocm-ci.yml | 44 ++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 47985f2cc..dc06f02df 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -60,10 +60,20 @@ jobs: # Uses the specified branch on manual runs; defaults to the PR/Push context otherwise ref: ${{ github.event_name == 'workflow_dispatch' && inputs.apex_gitref || '' }} submodules: recursive + + - name: Pull Docker Image + run: | + docker pull ${{ env.DOCKER_IMAGE }} + + - name: Start Background Docker Container + run: | + docker run -d --name apex-build-container \ + -v ${{ github.workspace }}:/workspace -w /workspace \ + ${{ env.DOCKER_IMAGE }} sleep infinity - - name: Build Apex Wheel in Docker + - name: Build Apex Wheel run: | - docker run --rm -v ${{ github.workspace }}:/workspace -w /workspace ${{ env.DOCKER_IMAGE }} bash -c " + docker exec apex-build-container bash -c " pip install --upgrade pip pip install build ninja wheel packaging @@ -72,6 +82,23 @@ jobs: chown -R $(id -u):$(id -g) dist/ " + - name: Run Extension Import tests + if: ${{ github.event_name != 'workflow_dispatch' || inputs.run_extension }} + run: | + docker exec apex-build-container bash -c " + set -eo pipefail + + pip install expecttest onnxscript + pip install dist/apex-*.whl + + cd tests + python3 test_extension_import.py 2>&1 | tee ../extension_import_results.log + " + + - name: Cleanup Build Container + if: always() + run: docker rm -f apex-build-container + - name: Upload Wheel Artifact uses: actions/upload-artifact@v4 with: @@ -96,6 +123,10 @@ jobs: with: name: apex-wheel path: dist/ + + - name: Pull Docker Image + run: | + docker pull ${{ env.DOCKER_IMAGE }}] - name: Start Background Docker Container run: | @@ -115,15 +146,6 @@ jobs: pip install dist/apex-*.whl " - - name: Run Extension Import tests - if: ${{ 
github.event_name != 'workflow_dispatch' || inputs.run_extension }} - run: | - docker exec apex-test-container bash -c " - set -eo pipefail - cd tests - python3 test_extension_import.py 2>&1 | tee ../extension_import_results.log - " - - name: Run L0 tests if: ${{ (always()) && (github.event_name != 'workflow_dispatch' || inputs.run_l0) }} run: | From c2259c04c34d2808d5d88bb77a215a20b4a10731 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Tue, 10 Mar 2026 20:40:58 +0100 Subject: [PATCH 43/43] Fix typo --- .github/workflows/rocm-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index dc06f02df..b5aa06faf 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -126,7 +126,7 @@ jobs: - name: Pull Docker Image run: | - docker pull ${{ env.DOCKER_IMAGE }}] + docker pull ${{ env.DOCKER_IMAGE }} - name: Start Background Docker Container run: |