From c7e730580da070acc17cdf5520b35a7f2efed3be Mon Sep 17 00:00:00 2001 From: Corey Adams <6619961+coreyjadams@users.noreply.github.com> Date: Mon, 1 Jun 2026 11:27:57 -0500 Subject: [PATCH 1/3] First pass adding ASV benchmarks --- .../workflows/github-nightly-benchmarks.yml | 200 ++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 .github/workflows/github-nightly-benchmarks.yml diff --git a/.github/workflows/github-nightly-benchmarks.yml b/.github/workflows/github-nightly-benchmarks.yml new file mode 100644 index 0000000000..2aa73af6ed --- /dev/null +++ b/.github/workflows/github-nightly-benchmarks.yml @@ -0,0 +1,200 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This CI runs nightly to execute the ASV benchmark suite (benchmarks/, +# configured by asv.conf.json) on a GPU runner. It benchmarks the current +# HEAD against the uv-managed `.venv` (the same pinned cu12 stack the rest of +# CI uses) and uploads the raw ASV results, the published HTML dashboard, and +# the generated benchmark plots as artifacts. Results are NOT persisted +# across runs: each night is a fresh, artifact-only run. 
+# +# ---------------------------------------------------------------------------- +# Why `asv run --python=same` instead of asv's own virtualenv: +# +# asv.conf.json declares `environment_type: virtualenv` with a matrix +# requiring `cuml-cu13` (CUDA 13). Letting asv build that environment +# inside this CUDA 12.8 container would (a) bypass the carefully pinned +# cu12 CUDA wheels the rest of CI uses and (b) mismatch the container's +# CUDA runtime. `--python=same` runs the benchmarks in the existing uv +# `.venv` and ignores the asv.conf.json matrix/build/install commands +# entirely, so no change to asv.conf.json is required. +# +# Cache usage: +# This workflow only RESTORES the uv download cache (and optionally the +# JIT compilation cache) published by the main nightly +# (github-nightly-uv.yml). It never saves/replaces those caches, so it +# cannot race the main nightly's delete-before-save on the shared +# `-latest` slots. Reusing the same UV_CACHE_KEY_PREFIX + EXTRAS_TAG is +# what lets the warm cache hit here. +# ---------------------------------------------------------------------------- + +name: Nightly Github ASV Benchmarks +on: + schedule: + # Run nightly at 4 AM UTC -- offset from the 2 AM main nightly so the two + # workflows do not contend for the same GPU runner pool. + - cron: '0 4 * * *' + workflow_dispatch: + # Allow manual triggering + +permissions: + contents: read + +# Serialise overlapping benchmark runs (manual + schedule, or two manuals). +# We do NOT cancel in-progress: a benchmark run is cheap to let finish and we +# would rather have last night's artifacts than none. +concurrency: + group: nightly-github-benchmarks + cancel-in-progress: false + +# The CUDA container's default shell is sh, which does not support +# `set -o pipefail`. Force bash everywhere. 
+defaults: + run: + shell: bash + +env: + # ---- Container baseline identity --------------------------------------- + # Keep these in lockstep with github-nightly-uv.yml so this workflow hits + # the warm uv download cache that nightly publishes. Keep CONTAINER_ID in + # sync with the `image:` tag below. + PYTHON_VERSION: "3.12" + UV_VERSION: "0.11.7" + CONTAINER_ID: "cuda12.8.1-cudnn-devel-ubuntu24.04" + # All feature extras + cu12 backend. Must match github-nightly-uv.yml so + # the restored uv cache is valid AND so the benchmark dependencies resolve: + # * cu12 -> torch, cuml-cu12 (knn/functional "cuml" impls) + # * nn-extras -> scipy (knn/functional "scipy" impls) + # * mesh-extras -> matplotlib (plot_functional_benchmarks.py) + EXTRAS_TAG: "cu12,natten-cu12,utils-extras,mesh-extras,nn-extras,model-extras,datapipes-extras,uq-extras,gnns,sym,transformer-engine-cu12" + + # ---- Cache key prefixes ------------------------------------------------ + # Must match github-nightly-uv.yml to reuse the warm caches it publishes. + UV_CACHE_KEY_PREFIX: "uv-cache-nightly-cuda12.8.1-cudnn-devel-ubuntu24.04-py3.12-uv0.11.7-fullextras" + JIT_CACHE_KEY_PREFIX: "jit-cache-cuda12.8.1-cudnn-devel-ubuntu24.04-py3.12" + JIT_CACHE_DIR: "/root/.cache/jit" + + # ---- uv read-only defaults -------------------------------------------- + # UV_FROZEN=1 -> all uv invocations refuse to mutate the lockfile. + # UV_NO_SYNC=1 -> `uv run` will not implicitly sync. The explicit + # `uv sync` inside setup-uv-env is unaffected by this flag. + UV_FROZEN: "1" + UV_NO_SYNC: "1" + + PYVISTA_OFF_SCREEN: "true" + +jobs: + benchmarks: + name: ASV Benchmarks + runs-on: linux-amd64-gpu-h100-latest-1 + container: + image: nvidia/cuda:12.8.1-cudnn-devel-ubuntu24.04 + # /dev/shm defaults to 64 MiB in docker; 2 GiB matches the rest of CI + # and the PyTorch container default. 
+ options: --shm-size=2g + + steps: + # First checkout brings the repo files (including the composite actions + # below) into the workspace. The bare CUDA image has no git yet, so + # actions/checkout falls back to a tarball download with no .git/. + - uses: actions/checkout@v5 + + - name: Bootstrap cuDNN CI container + uses: ./.github/actions/bootstrap-cudnn-ci + with: + python-version: ${{ env.PYTHON_VERSION }} + uv-version: ${{ env.UV_VERSION }} + + # Re-run checkout now that bootstrap-cudnn-ci has installed git. This + # populates a real .git/ directory, which asv requires (asv.conf.json + # sets "dvcs": "git") to resolve the HEAD commit hash for + # `asv run --python=same`. fetch-depth: 1 is enough because we only + # benchmark the current HEAD. + - name: Re-checkout with git history for asv + uses: actions/checkout@v5 + + # Restore the warm uv download cache (published by the main nightly) and + # rebuild .venv from the frozen lockfile. setup-uv-env is restore-only; + # we deliberately do NOT add a replace-cache step so this workflow cannot + # race the main nightly's delete-before-save on the `-latest` slot. + - name: Setup uv environment from cache + uses: ./.github/actions/setup-uv-env + with: + uv-cache-key-prefix: ${{ env.UV_CACHE_KEY_PREFIX }} + uv-cache-key-suffix: "latest" + extras: ${{ env.EXTRAS_TAG }} + + # Restore compiled JIT artifacts (warp, triton, inductor) from the main + # nightly so kernel compilation in the functional/warp benchmarks is + # skipped when source hasn't changed. Restore-only and fail-open: a miss + # only costs compilation time. + - name: Restore JIT compilation cache + id: jit-cache-restore + uses: actions/cache/restore@v5 + with: + path: ${{ env.JIT_CACHE_DIR }} + key: ${{ env.JIT_CACHE_KEY_PREFIX }}-latest + + # asv prompts interactively for machine metadata on first use; --yes + # accepts the autodetected defaults and writes ~/.asv-machine.json. 
+ - name: Configure asv machine + run: | + uv run --no-sync asv machine --yes + + # Benchmark the installed physicsnemo in .venv against the current HEAD. + # --launch-method spawn matches benchmarks/run_benchmarks.sh (required for + # CUDA compatibility). --python=same runs in the existing uv env and + # ignores the asv.conf.json matrix. `uv run --no-sync` puts .venv/bin on + # PATH and keeps the env read-only (UV_FROZEN/UV_NO_SYNC). + - name: Run ASV benchmarks + env: + WARP_CACHE_PATH: ${{ env.JIT_CACHE_DIR }}/warp + TRITON_CACHE_DIR: ${{ env.JIT_CACHE_DIR }}/triton + TORCHINDUCTOR_CACHE_DIR: ${{ env.JIT_CACHE_DIR }}/inductor + run: | + uv run --no-sync asv run --launch-method spawn --python=same + + # Generate the browsable HTML dashboard from the fresh results. + - name: Publish ASV HTML report + if: ${{ !cancelled() }} + run: | + uv run --no-sync asv publish + + # Generate the functional benchmark bar plots under docs/img/. Runs even + # if publish failed, as long as at least one result JSON was produced. 
+ - name: Generate functional benchmark plots + if: ${{ !cancelled() }} + run: | + uv run --no-sync python benchmarks/physicsnemo/nn/functional/plot_functional_benchmarks.py + + - name: Upload ASV results and HTML report + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: asv-benchmarks-nightly + path: | + .asv/results + .asv/html + retention-days: 30 + + - name: Upload functional benchmark plots + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: asv-benchmark-plots-nightly + path: docs/img/nn/functional/** + retention-days: 30 + if-no-files-found: ignore From a3305b84638fe5fd8d6f5ae499a247c72c6ecff4 Mon Sep 17 00:00:00 2001 From: Corey Adams <6619961+coreyjadams@users.noreply.github.com> Date: Mon, 1 Jun 2026 11:41:02 -0500 Subject: [PATCH 2/3] Add push trigger TEMP to enable testing of asv ci --- .github/workflows/github-nightly-benchmarks.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/github-nightly-benchmarks.yml b/.github/workflows/github-nightly-benchmarks.yml index 2aa73af6ed..a1209a78c8 100644 --- a/.github/workflows/github-nightly-benchmarks.yml +++ b/.github/workflows/github-nightly-benchmarks.yml @@ -43,6 +43,13 @@ name: Nightly Github ASV Benchmarks on: + # TEMPORARY: run on every push to the development branch so the workflow can + # be exercised before it lands on the default branch (workflow_dispatch and + # schedule are only registered once the file is on the default branch). + # REMOVE this push trigger before merging. + push: + branches: + - asv-benchmarks-in-ci schedule: # Run nightly at 4 AM UTC -- offset from the 2 AM main nightly so the two # workflows do not contend for the same GPU runner pool. 
From f0adba0b4bb0e63148c001d13f80e0627e9d0aca Mon Sep 17 00:00:00 2001 From: Corey Adams <6619961+coreyjadams@users.noreply.github.com> Date: Mon, 1 Jun 2026 13:59:29 -0500 Subject: [PATCH 3/3] fix some issues in the asv github workflow --- .../workflows/github-nightly-benchmarks.yml | 60 ++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/.github/workflows/github-nightly-benchmarks.yml b/.github/workflows/github-nightly-benchmarks.yml index a1209a78c8..5f54029abf 100644 --- a/.github/workflows/github-nightly-benchmarks.yml +++ b/.github/workflows/github-nightly-benchmarks.yml @@ -155,6 +155,23 @@ jobs: path: ${{ env.JIT_CACHE_DIR }} key: ${{ env.JIT_CACHE_KEY_PREFIX }}-latest + # asv shells out to git for every command (the container runs as root but + # the checked-out tree is owned by a different uid, which otherwise trips + # git's "detected dubious ownership" guard and fails `asv publish`). + # NOTE(review): safe.directory '*' disables that guard for every path, so + # the $GITHUB_WORKSPACE entry is redundant -- confirm both are needed. + # A local `main` ref is also ensured: asv resolves conf.branches (["main"] + # in asv.conf.json), which fails on a feature branch that never fetched + # `main`; results still use the real HEAD sha (--set-commit-hash below). + - name: Prepare git for asv + run: | + set -euo pipefail + git config --global --add safe.directory "$GITHUB_WORKSPACE" + git config --global --add safe.directory '*' + if ! git rev-parse --verify --quiet main >/dev/null; then + git branch main HEAD + fi + # asv prompts interactively for machine metadata on first use; --yes # accepts the autodetected defaults and writes ~/.asv-machine.json. - name: Configure asv machine @@ -166,13 +183,54 @@ jobs: # CUDA compatibility). --python=same runs in the existing uv env and # ignores the asv.conf.json matrix. `uv run --no-sync` puts .venv/bin on # PATH and keeps the env read-only (UV_FROZEN/UV_NO_SYNC).
+ # + # --set-commit-hash is REQUIRED: with an existing environment + # (--python=same) asv skips saving results entirely unless a commit hash is + # pinned (see asv/commands/run.py: skip_save is True for ExistingEnvironment + # when set_commit_hash is None). Without it, .asv/results stays empty and the + # publish/plot steps have nothing to consume. --no-pull avoids an + # unnecessary `git fetch` against the shallow checkout. + # + # Verbosity: + # -v / --verbose : per-benchmark progress and asv internals. + # --show-stderr : surface each benchmark process's stdout/stderr so + # a slow or hanging benchmark is visible live in the + # job log instead of only the terminal summary line. + # PYTHONUNBUFFERED=1: flush asv/benchmark output immediately so the live + # log is not held back by stdio buffering. + # + # Exit-code handling: asv returns 2 when one or more benchmarks fail (e.g. + # the functional KNN "scipy" cases, which cannot run on CUDA inputs). + # Those are individual benchmark failures, not an infrastructure problem, + # so we downgrade exit code 2 to a warning and still publish the partial + # results; any other nonzero exit stays fatal. NOTE(review): confirm asv + # exits 2 without --strict; argparse usage errors also exit 2 (masked). - name: Run ASV benchmarks env: WARP_CACHE_PATH: ${{ env.JIT_CACHE_DIR }}/warp TRITON_CACHE_DIR: ${{ env.JIT_CACHE_DIR }}/triton TORCHINDUCTOR_CACHE_DIR: ${{ env.JIT_CACHE_DIR }}/inductor + PYTHONUNBUFFERED: "1" run: | - uv run --no-sync asv run --launch-method spawn --python=same + commit_hash="$(git rev-parse HEAD)" + # GitHub's default bash shell runs with `set -e`; disable it around + # the asv call so we can inspect the exit code instead of aborting. + set +e + uv run --no-sync asv run \ + --launch-method spawn \ + --python=same \ + --set-commit-hash "$commit_hash" \ + --no-pull \ + --verbose \ + --show-stderr + rc=$? + set -e + if [ "$rc" -eq 2 ]; then + echo "::warning::asv reported one or more failed benchmarks (exit code 2); publishing partial results."
+ elif [ "$rc" -ne 0 ]; then + echo "::error::asv run failed with exit code $rc." + exit "$rc" + fi # Generate the browsable HTML dashboard from the fresh results. - name: Publish ASV HTML report