diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml new file mode 100644 index 000000000..b5aa06faf --- /dev/null +++ b/.github/workflows/rocm-ci.yml @@ -0,0 +1,202 @@ +name: Apex ROCm CI + +on: + pull_request: + types: [opened, synchronize, ready_for_review] + branches: + - master + - release/1.8.0 + - release/1.9.0 + - release/1.10.0 + workflow_dispatch: + inputs: + apex_gitref: + description: 'Apex branch or commit SHA to build' + required: false + default: 'master' + type: string + docker_image: + description: 'Docker image to use' + required: false + default: 'rocm/pytorch:latest' + type: string + run_extension: + description: 'Run Extension Import tests' + required: false + default: true + type: boolean + run_l0: + description: 'Run L0 tests' + required: false + default: true + type: boolean + run_contrib: + description: 'Run Contrib tests' + required: false + default: true + type: boolean + run_halo: + description: 'Run Peer Halo Exchange tests' + required: false + default: true + type: boolean + run_syncbn: + description: 'Run Distributed Synced BatchNorm tests' + required: false + default: true + type: boolean + +env: + DOCKER_IMAGE: ${{ inputs.docker_image || 'rocm/pytorch:latest' }} + +jobs: + build: + name: Build Apex Wheel + runs-on: build-only-apex + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + # Uses the specified branch on manual runs; defaults to the PR/Push context otherwise + ref: ${{ github.event_name == 'workflow_dispatch' && inputs.apex_gitref || '' }} + submodules: recursive + + - name: Pull Docker Image + run: | + docker pull ${{ env.DOCKER_IMAGE }} + + - name: Start Background Docker Container + run: | + docker run -d --name apex-build-container \ + -v ${{ github.workspace }}:/workspace -w /workspace \ + ${{ env.DOCKER_IMAGE }} sleep infinity + + - name: Build Apex Wheel + run: | + docker exec apex-build-container bash -c " + pip install --upgrade pip + pip install build ninja wheel 
packaging + + python3 -m build --wheel --no-isolation -C--build-option=--cpp_ext -C--build-option=--cuda_ext + + chown -R $(id -u):$(id -g) dist/ + " + + - name: Run Extension Import tests + if: ${{ github.event_name != 'workflow_dispatch' || inputs.run_extension }} + run: | + docker exec apex-build-container bash -c " + set -eo pipefail + + pip install expecttest onnxscript + pip install dist/apex-*.whl + + cd tests + python3 test_extension_import.py 2>&1 | tee ../extension_import_results.log + " + + - name: Cleanup Build Container + if: always() + run: docker rm -f apex-build-container + + - name: Upload Wheel Artifact + uses: actions/upload-artifact@v4 + with: + name: apex-wheel + path: dist/*.whl + retention-days: 7 + + test: + name: Run Unit Tests + timeout-minutes: 720 + runs-on: linux-apex-mi325-8 + needs: build + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'workflow_dispatch' && inputs.apex_gitref || '' }} + submodules: recursive + + - name: Download Wheel Artifact + uses: actions/download-artifact@v4 + with: + name: apex-wheel + path: dist/ + + - name: Pull Docker Image + run: | + docker pull ${{ env.DOCKER_IMAGE }} + + - name: Start Background Docker Container + run: | + docker run -d --name apex-test-container \ + --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host \ + -e OMP_NUM_THREADS=8 \ + -e TORCH_NCCL_ASYNC_ERROR_HANDLING=1 \ + -e NCCL_DEBUG=WARN \ + -v ${{ github.workspace }}:/workspace -w /workspace \ + ${{ env.DOCKER_IMAGE }} sleep infinity + + - name: Install Dependencies and Built Wheel + run: | + docker exec apex-test-container bash -c " + set -e + pip install expecttest onnxscript + pip install dist/apex-*.whl + " + + - name: Run L0 tests + if: ${{ (success() || failure()) && (github.event_name != 'workflow_dispatch' || inputs.run_l0) }} + run: | + docker exec apex-test-container bash -c " + set -eo pipefail + cd tests/L0 + sh run_rocm.sh 2>&1 | tee ../../L0_results.log + " + + - 
name: Run Contrib tests + if: ${{ (success() || failure()) && (github.event_name != 'workflow_dispatch' || inputs.run_contrib) }} + run: | + docker exec apex-test-container bash -c " + set -eo pipefail + cd apex/contrib/test + python3 run_rocm_extensions.py 2>&1 | tee ../../../contrib_results.log + " + + - name: Run Peer Halo Exchange tests + if: ${{ (success() || failure()) && (github.event_name != 'workflow_dispatch' || inputs.run_halo) }} + run: | + docker exec apex-test-container bash -c " + set -eo pipefail + export HSA_FORCE_FINE_GRAIN_PCIE=1 + export HSA_ENABLE_SDMA=0 + torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee halo_results.log + " + + - name: Run Distributed Synced BatchNorm tests + if: ${{ (success() || failure()) && (github.event_name != 'workflow_dispatch' || inputs.run_syncbn) }} + run: | + docker exec apex-test-container bash -c " + set -eo pipefail + cd tests/distributed/synced_batchnorm + sh unit_test.sh 2>&1 | tee ../../../syncbn_results.log + " + + - name: Fix Artifact Permissions + if: always() + run: | + docker exec apex-test-container bash -c "chown -R $(id -u):$(id -g) *.log" + + - name: Cleanup Background Container + if: always() + run: docker rm -f apex-test-container + + - name: Upload Test Logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-logs + path: | + *.log + retention-days: 14 diff --git a/tests/test_extension_import.py b/tests/test_extension_import.py index 72d88688e..e5fc8ebfd 100644 --- a/tests/test_extension_import.py +++ b/tests/test_extension_import.py @@ -101,30 +101,36 @@ def get_environment(self): """ # Get current environment and ensure CUDA/PyTorch libraries are available env = os.environ.copy() - - # Add common CUDA library paths + ld_library_path = env.get('LD_LIBRARY_PATH', '') - cuda_paths = [ - '/usr/local/cuda/lib64', - '/usr/local/cuda/lib', - '/opt/conda/lib', - '/usr/lib/x86_64-linux-gnu' - ] - + extra_paths = [] + # Add PyTorch 
library path try: import torch torch_lib_path = os.path.join(os.path.dirname(torch.__file__), 'lib') if os.path.exists(torch_lib_path): - cuda_paths.append(torch_lib_path) + extra_paths.append(torch_lib_path) except ImportError: pass - + + # Add ROCm library path if present + rocm_path = os.environ.get('ROCM_PATH', '/opt/rocm') + rocm_lib = os.path.join(rocm_path, 'lib') + if os.path.exists(rocm_lib): + extra_paths.append(rocm_lib) + + # Add common CUDA library paths (only those that exist) + for path in ['/usr/local/cuda/lib64', '/usr/local/cuda/lib', + '/opt/conda/lib', '/usr/lib/x86_64-linux-gnu']: + if os.path.isdir(path): + extra_paths.append(path) + # Update LD_LIBRARY_PATH if ld_library_path: - env['LD_LIBRARY_PATH'] = ':'.join(cuda_paths) + ':' + ld_library_path + env['LD_LIBRARY_PATH'] = ':'.join(extra_paths) + ':' + ld_library_path else: - env['LD_LIBRARY_PATH'] = ':'.join(cuda_paths) + env['LD_LIBRARY_PATH'] = ':'.join(extra_paths) return env @@ -229,7 +235,14 @@ def test_extensions_import(self): error_display = error_message[:17] + "..." if len(error_message) > 20 else error_message print(f"{extension:<30} {success:<10} {error_display:<20}") print("-" * 60) - + + # Fail the test if any extensions failed to import + failed_extensions = [ext for ext, success, _ in results if not success] + self.assertEqual( + len(failed_extensions), 0, + f"{len(failed_extensions)} extension(s) failed to import: {', '.join(failed_extensions)}" + ) + if __name__ == '__main__': unittest.main() \ No newline at end of file