pytorch · luhenry · May 23, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/.ci/scripts/setup-linux.sh b/.ci/scripts/setup-linux.sh
@@ -5,7 +5,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-set -exu
+set -eu
 
 # shellcheck source=/dev/null
 source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

diff --git a/.ci/scripts/test_riscv_qemu.sh b/.ci/scripts/test_riscv_qemu.sh
@@ -4,50 +4,65 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# CI wrapper: install RISC-V cross-compile + qemu-user tooling, then run the
-# RISC-V smoke test (export, cross-compile, qemu-user execution) via
-# examples/riscv/run.sh. The bundled-IO comparison and Test_result: PASS
-# check are done by run.sh.
+# CI wrapper: install riscv32/64 cross-compile + qemu tooling, then drive
+# examples/riscv/run.sh which does the export, cross-compile, qemu run, and
+# bundled-IO PASS check.
 
 set -eu
 
 script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
 et_root_dir=$(realpath "${script_dir}/../..")
 
 model="add"
-xnnpack=false
+backend="portable"
 quantize=false
+os="linux"
+arch="rv64"
+qemu_cpu_ext=""
 verbose_xnnpack=false
 debug_xnnpack=false
+build_dir=
 
 usage() {
     cat <<EOF
 Usage: $(basename "$0") [options]
 Options:
-  --model=<NAME>     Which model to export and run (default: add)
-  --xnnpack          Enable the XNNPACK backend (AOT partitioner + runtime)
-  --quantize         Produce an 8-bit quantized model
-  --verbose-xnnpack  Build XNNPACK with XNN_LOG_LEVEL=4 to log microkernel dispatch
-  --debug-xnnpack    Enable XNNPACK partitioner DEBUG logging and dump the lowered graph
-  -h, --help         Show this help
+  --model=<NAME>          Which model to export and run (default: ${model})
+  --quantize              Produce an 8-bit quantized model
+  --backend=<NAME>        AOT backend (portable|xnnpack) (default: ${backend})
+  --os=<NAME>             Target OS (linux|baremetal) (default: ${os})
+  --arch=<NAME>           Target arch (rv32|rv64) (default: ${arch})
+  --qemu-cpu-ext=<EXT>    QEMU -cpu extensions (no rv32/rv64 prefix, default: none)
+  --build-dir=<DIR>       Build/output directory for this configuration (required)
+  --verbose-xnnpack       Build XNNPACK with XNN_LOG_LEVEL=4 to log microkernel dispatch
+  --debug-xnnpack         Enable XNNPACK partitioner DEBUG logging and dump the lowered graph
+  -h, --help              Show this help
 EOF
 }
 
 for arg in "$@"; do
     case $arg in
         --model=*) model="${arg#*=}" ;;
-        --xnnpack) xnnpack=true ;;
         --quantize) quantize=true ;;
+        --backend=*) backend="${arg#*=}" ;;
+        --os=*) os="${arg#*=}" ;;
+        --arch=*) arch="${arg#*=}" ;;
+        --qemu-cpu-ext=*) qemu_cpu_ext="${arg#*=}" ;;
+        --build-dir=*) build_dir="${arg#*=}" ;;
         --debug-xnnpack) debug_xnnpack=true ;;
         --verbose-xnnpack) verbose_xnnpack=true ;;
         -h|--help) usage; exit 0 ;;
         *) echo "Unknown option: $arg" >&2; usage; exit 1 ;;
     esac
 done
 
+if [[ -z "${build_dir}" ]]; then
+    echo "[test_riscv_qemu.sh] --build-dir is required" >&2; usage; exit 1
+fi
+
 run_extra_args=()
-if ${xnnpack}; then
-    run_extra_args+=(--xnnpack)
+if [ -n "${qemu_cpu_ext}" ]; then
+    run_extra_args+=(--qemu-cpu-ext="${qemu_cpu_ext}")
 fi
 if ${quantize}; then
     run_extra_args+=(--quantize)
@@ -59,5 +74,8 @@ if ${verbose_xnnpack}; then
     run_extra_args+=(--verbose-xnnpack)
 fi
 
-bash "${et_root_dir}/examples/riscv/setup.sh"
-bash "${et_root_dir}/examples/riscv/run.sh" --model="${model}" "${run_extra_args[@]}"
+bash "${et_root_dir}/examples/riscv/setup-${os}.sh"
+bash "${et_root_dir}/examples/riscv/run.sh" \
+    --model="${model}" --backend="${backend}" --os="${os}" --arch="${arch}" \
+    --build-dir="${build_dir}" \
+    "${run_extra_args[@]}"
diff --git a/.github/workflows/_test_riscv.yml b/.github/workflows/_test_riscv.yml
@@ -13,35 +13,44 @@ on:
         type: number
         default: 30
       model:
-        description: 'Which model to run. Possible values are: add, mv2 (mobilenetv2)'
+        description: 'Which model to run (add, mv2, mobilebert, llama2, resnet18, yolo26)'
         required: false
         type: string
         default: 'add'
-      xnnpack:
-        description: 'Whether to enable XNNPACK'
-        required: false
-        type: boolean
-        default: false
       quantize:
         description: 'Produce an 8-bit quantized model'
         required: false
         type: boolean
         default: false
-      qemu-cpu:
-        description: 'Configuration(s) for the CPU to emulate with QEMU, expecting a JSON array'
-        required: true
+      backend:
+        description: 'AOT backend to lower to (portable|xnnpack)'
+        required: false
         type: string
-      docker-image:
-        description: 'The docker image to use for this job'
+        default: 'portable'
+      os:
+        description: 'Target OS for the runner (linux|baremetal)'
         required: false
         type: string
+        default: 'linux'
+      arch:
+        description: 'Target architecture (rv32|rv64)'
+        required: false
+        type: string
+        default: 'rv64'
+      qemu-cpu-ext:
+        description: >-
+          JSON array of QEMU -cpu *extension* strings (no rv32/rv64 prefix).
+          The script splices each entry with `arch` to form the final -cpu
+          value. Use [""] for plain base-ISA runs.
+        required: true
+        type: string
 
 jobs:
   run:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
-      docker-image: ci-image:executorch-ubuntu-24.04-gcc14
+      docker-image: ${{ inputs.os == 'linux' && 'ci-image:executorch-ubuntu-24.04-gcc14' || 'ci-image:executorch-ubuntu-26.04-gcc15' }}
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: ${{ inputs.timeout }}
@@ -55,20 +64,26 @@ jobs:
         # Allows failure in `echo | jq | while read` pipeline to bubble up and fail the workflow
         set -o pipefail
 
-        echo '${{ inputs.qemu-cpu }}' | jq -r '.[]' | while IFS= read -r qemu_cpu; do
-          export QEMU_CPU="${qemu_cpu}"
-          export GCC_VERSION=14
+        echo '${{ inputs.qemu-cpu-ext }}' | jq -r '.[]' | while IFS= read -r qemu_cpu_ext; do
+          variant_slug="${qemu_cpu_ext//,/_}"; variant_slug="${variant_slug//=/_}"; variant_slug="${variant_slug:-base}"
+          build_dir="riscv_test/${{ inputs.model }}${{ inputs.quantize && '_q' || '' }}/${{ inputs.backend }}/${{ inputs.os }}-${{ inputs.arch }}-${variant_slug}"
+
           bash .ci/scripts/test_riscv_qemu.sh \
             --model="${{ inputs.model }}" \
-            ${{ inputs.xnnpack && '--xnnpack --verbose-xnnpack' || '' }} \
+            --backend="${{ inputs.backend }}" \
+            --os="${{ inputs.os }}" \
+            --arch="${{ inputs.arch }}" \
+            --qemu-cpu-ext="${qemu_cpu_ext}" \
+            --build-dir="${build_dir}" \
+            ${{ inputs.backend == 'xnnpack' && '--verbose-xnnpack' || '' }} \
             ${{ inputs.quantize && '--quantize' || '' }}
 
-          # We only generate riscv_test/${{ inputs.model }}_riscv.etdump.json from `--verbose-xnnpack`.
-          if ${{ inputs.xnnpack }}; then
-            # Generate markdown table from riscv_test/${{ inputs.model }}_riscv.etdump.json, sorted by sum_ms
+          # We only generate run.etdump.json from `--verbose-xnnpack`.
+          if [[ "${{ inputs.backend }}" == "xnnpack" ]]; then
+            # Generate markdown table from ${build_dir}/run.etdump.json, sorted by sum_ms
             (
-              etdump_json="riscv_test/${{ inputs.model }}_riscv.etdump.json"
-              echo "### Model=${{ inputs.model }} XNNPACK=${{ inputs.xnnpack }} Quantize=${{ inputs.quantize }} QEMU_CPU='${QEMU_CPU}'"
+              etdump_json="${build_dir}/run.etdump.json"
+              echo "### Model=${{ inputs.model }} Quantize=${{ inputs.quantize }} Backend=${{ inputs.backend }} OS=${{ inputs.os }} Arch=${{ inputs.arch }}${qemu_cpu_ext:+,${qemu_cpu_ext}}"
               jq -r '
                 def r3: (. * 1000 | round) / 1000;
                 ["Section","Op","Count","Sum (ms)","Avg (ms)","Max (ms)","Microkernels"],

diff --git a/.github/workflows/riscv64.yml b/.github/workflows/riscv64.yml
@@ -10,8 +10,9 @@ on:
   pull_request:
     paths:
       - .github/workflows/riscv64.yml
+      - .github/workflows/_test_riscv.yml
       - .ci/scripts/test_riscv_qemu.sh
-      - tools/cmake/preset/riscv64_linux.cmake
+      - tools/cmake/preset/riscv_*.cmake
       - examples/riscv/**
   workflow_dispatch:
   schedule:
@@ -35,33 +36,42 @@ jobs:
           - llama2
           - resnet18
           - yolo26
-        xnnpack: [true, false]
         quantize: [true, false]
+        backend: [portable, xnnpack]
+        os: [linux, baremetal]
+        arch: [rv64, rv32]
         exclude:
-          # We only enable quantization with XNNPACK
-          - xnnpack: false
-            quantize: true
-          # We don't test quantization for Yolo26
-          - model: yolo26
-            quantize: true
+          # Disable quantization testing with Portable Kernels
+          - { backend: portable, quantize: true }
+          # XNNPACK needs pthreads + dynamic loading (no baremetal)
+          - { backend: xnnpack, os: baremetal }
+          # No quantization recipe for Yolo26.
+          - { model: yolo26, quantize: true }
+          # No riscv32-linux-gnu cross is packaged on Ubuntu.
+          - { os: linux, arch: rv32 }
     permissions:
       id-token: write
       contents: read
     with:
       model: ${{ matrix.model }}
-      xnnpack: ${{ matrix.xnnpack }}
       quantize: ${{ matrix.quantize }}
-      # If XNNPACK, test with multiple RVV length, disabled otherwise
-      qemu-cpu: >-
+      backend: ${{ matrix.backend }}
+      os: ${{ matrix.os }}
+      arch: ${{ matrix.arch }}
+      # JSON array of QEMU -cpu *extension* strings (no rv32/rv64 prefix - that
+      # comes from `arch`). The script splices them as `<arch>,<ext>`. xnnpack
+      # benefits from RVV so it sweeps multiple vlen; everything else runs
+      # with the vector extension explicitly disabled (`v=false`).
+      qemu-cpu-ext: >-
         ${{
           case(
-            matrix.xnnpack, '[
-              "rv64,zba=true,zbb=true,zbs=true,v=true,vlen=128,elen=64,vext_spec=v1.0",
-              "rv64,zba=true,zbb=true,zbs=true,v=true,vlen=256,elen=64,vext_spec=v1.0",
-              "rv64,zba=true,zbb=true,zbs=true,v=true,vlen=512,elen=64,vext_spec=v1.0"
+            matrix.backend == 'xnnpack', '[
+              "v=true,vext_spec=v1.0,vlen=128",
+              "v=true,vext_spec=v1.0,vlen=256",
+              "v=true,vext_spec=v1.0,vlen=512"
             ]',
             '[
-              "rv64,zba=true,zbb=true,zbs=true,v=false"
+              "v=false"
             ]'
           )
         }}
@@ -318,7 +318,7 @@
       "displayName": "Build ExecuTorch for riscv64 Linux (cross-compile)",
       "inherits": ["common"],
       "cacheVariables": {
-        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/riscv64_linux.cmake",
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/riscv_linux.cmake",
         "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/riscv/riscv64-linux-gnu-toolchain.cmake"
       },
       "condition": {
@@ -327,6 +327,24 @@
         "rhs": "Linux"
       }
     },
+    {
+      "name": "riscv64-baremetal",
+      "displayName": "Build ExecuTorch for riscv64 baremetal (cross-compile)",
+      "inherits": ["common"],
+      "cacheVariables": {
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/riscv_baremetal.cmake",
+        "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/riscv/riscv64-unknown-elf-toolchain.cmake"
+      }
+    },
+    {
+      "name": "riscv32-baremetal",
+      "displayName": "Build ExecuTorch for riscv32 baremetal (cross-compile)",
+      "inherits": ["common"],
+      "cacheVariables": {
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/riscv_baremetal.cmake",
+        "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/riscv/riscv32-unknown-elf-toolchain.cmake"
+      }
+    },
     {
       "name": "mlx",
       "displayName": "Build MLX delegate",

diff --git a/examples/riscv/README.md b/examples/riscv/README.md
@@ -1,41 +1,36 @@
 # RISC-V
 
-Cross-compile `executor_runner` for `riscv64-linux-gnu` and run it under
-`qemu-user-static` against a small bundled program. The end-to-end check
-mirrors the Arm Cortex-M e2e flow: a `Test_result: PASS` line in stdout from
-the bundled-IO comparison path is the pass criterion.
+End-to-end smoke tests that cross-compile ExecuTorch for RISC-V and run a bundled program under QEMU. A `Test_result: PASS` line emitted by the bundled-IO comparison path is the pass criterion.
 
-This is the Phase 1 deliverable for the RISC-V Support RFC at
-[pytorch/executorch#18991][rfc]. The cross-compile and runner artifacts
-(toolchain file, preset, AOT script) are designed to carry over unchanged
-to a hardware-runner job once one becomes available; only the invocation
-step (qemu-user vs. native) would change.
-
-[rfc]: https://github.com/pytorch/executorch/issues/18991
+Part of the RISC-V Support RFC, [pytorch/executorch#18991](https://github.com/pytorch/executorch/issues/18991).
 
 ## Quick start (Ubuntu / Debian)
 
 ```bash
-examples/riscv/setup.sh        # apt: gcc-riscv64-linux-gnu, qemu-user-static
-examples/riscv/run.sh          # export, cross-compile, run under qemu-user
+examples/riscv/setup-linux.sh       # apt: gcc cross riscv64-linux-gnu + qemu-user
+examples/riscv/setup-baremetal.sh   # apt: gcc cross riscv64-unknown-elf + qemu-system + picolibc
+examples/riscv/run.sh               # export, cross-compile, run under qemu
 ```
 
-The driver does three steps:
+`run.sh` accepts:
+
+| Flag | Values | Default | Notes |
+|---|---|---|---|
+| `--model=<N>` | `add`, `mv2`, `mobilebert`, `llama2`, `resnet18`, `yolo26` | `add` | which model to export |
+| `--quantize` | flag | off | XNNPACK quantizer (requires `--backend=xnnpack`) |
+| `--backend=<N>` | `portable`, `xnnpack` | `portable` | xnnpack is linux-only |
+| `--os=<N>` | `linux`, `baremetal` | `linux` | qemu-user vs qemu-system + semihosting |
+| `--arch=<N>` | `rv32`, `rv64` | `rv64` | valid `<os>-<arch>` pairs are `linux-rv64`, `baremetal-rv32`, `baremetal-rv64` |
+| `--qemu-cpu-ext=<S>` | e.g. `v=true,vlen=128` | empty | extensions appended after the arch base |
+
+## Pipelines
+
+**linux**: `aot_riscv.py` → `cmake --preset riscv64-linux` → `executor_runner` under `qemu-riscv64`. Portable kernels + (optional) XNNPACK delegate.
+
+**baremetal**: `aot_riscv.py` → `cmake -S examples/riscv/baremetal` (standalone project; pulls executorch in via `add_subdirectory`) → `executor_runner_baremetal.elf` under `qemu-system-riscv64 -machine virt -bios none -semihosting-config target=native`.
 
-1. `python examples/riscv/aot_riscv.py` exports a `torch.add` module to
-   `riscv_test/add_riscv.bpte` (a BundledProgram with reference outputs
-   embedded for two test cases).
-2. `cmake --preset riscv64-linux` configures the cross-build using
-   `examples/riscv/riscv64-linux-gnu-toolchain.cmake` and
-   `tools/cmake/preset/riscv64_linux.cmake`. `executor_runner` is built
-   against portable kernels with `ET_BUNDLE_IO_ENABLED` defined.
-3. `qemu-riscv64-static` invokes the runner with `--model_path` pointing at
-   the `.bpte`. The runner detects the bundle, runs every embedded test case,
-   and emits `Test_result: PASS` (or `FAIL`) per case.
+The baremetal runner embeds the `.bpte` directly in `.rodata` using the same `examples/arm/executor_runner/pte_to_header.py` script that Cortex-M uses; the semihosting calls SYS_WRITE0 and SYS_EXIT carry log output and exit status to the host.
 
 ## CI
 
-`.github/workflows/_test_riscv_qemu.yml` is a reusable `workflow_call`
-job (mirroring `_test_cortex_m_e2e.yml`) invoked from `pull.yml` to run on
-every PR. It runs on the standard `linux.2xlarge` x86_64 runner using the
-`executorch-ubuntu-22.04-gcc11` docker image.
+`.github/workflows/riscv64.yml` is the entry point; it fans out into `_test_riscv.yml` over a `(model, backend, os, arch, quantize)` matrix and sweeps `qemu-cpu-ext` per backend. Linux jobs run on the `executorch-ubuntu-24.04-gcc14` docker image; baremetal jobs run on the `executorch-ubuntu-26.04-gcc15` docker image (needed for the `riscv64-unknown-elf` picolibc + libstdc++ packages - see [setup-baremetal.sh](setup-baremetal.sh); the Linux tooling is installed by [setup-linux.sh](setup-linux.sh)).