Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ jobs:

env:
RELEASE_REPO: hw-native-sys/PTOAS
RELEASE_VER: 0.31
RELEASE_TAG: v0.31
RELEASE_VER: 0.37
RELEASE_TAG: v0.37
CLI_DIR: /installers/ptoas-cli
PTOISA_COMMIT: 0af942568a4f2868673da0a35b0f5b64f27a20d5
PTOISA_COMMIT: 933ad5d84c98377ca19f1de2e6616ba79136056a

steps:
- name: Install system packages
Expand Down
8 changes: 4 additions & 4 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ RUN pip install --no-cache-dir \
pytest pybind11 nanobind setuptools wheel \
ipython jupyterlab matplotlib pandas

# This specific commit might not be found as it has been forced push over
ARG PTOISA_COMMIT=4e27a104f948e883e0bef44670252381bff794c5
# This commit is dated 2024-05-12
ARG PTOISA_COMMIT=933ad5d84c98377ca19f1de2e6616ba79136056a
WORKDIR /sources
RUN git clone --single-branch --branch master https://gitcode.com/cann/pto-isa.git \
&& cd pto-isa && git checkout $PTOISA_COMMIT
Expand All @@ -29,8 +29,8 @@ ARG CACHE_BURST=1
# ARG ARCH=x86_64
ARG ARCH=aarch64
ARG RELEASE_REPO=hw-native-sys/PTOAS
ARG RELEASE_VER=0.36
ARG RELEASE_TAG=v${RELEASE_VER}
ARG RELEASE_VER=0.37
ARG RELEASE_TAG=v0.37
ARG WHEEL_NAME=ptoas-${RELEASE_VER}-cp311-none-manylinux_2_34_${ARCH}.whl
ARG CLI_TAR_NAME=ptoas-bin-${ARCH}.tar.gz

Expand Down
82 changes: 82 additions & 0 deletions examples/aot/flash_attention/140tflops/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Flash Attention 140 TFLOP/s DSL Builders

This directory has two PTODSL Flash Attention builders:

- `fa_dsl_builder.py`: default `TILE_S1=256` builder.
- `fa_dsl_builder_tile512.py`: experimental `TILE_S1=512` builder.

Both compile scripts write the same runtime artifact:

```text
build_artifacts/fa_dsl.so
```

`run.py` always loads that file. Compile first, then run.

## Build

Default 256-tile kernel:

```bash
bash compile.sh
```

Experimental 512-tile kernel:

```bash
bash compile_tile512.sh
```

The 512-tile builder uses `TILE_S1=512` and `QK_PRELOAD=4`, so it requires
`S1 >= 2048`. The default `run.py` sweep skips `S1=1024` for this builder.

## Run

Run one or more specific sequence lengths:

```bash
python run.py --s1-values 8192
python run.py --s1-values 8192,131072
```

Benchmark PTODSL perf only for one sequence length:

```bash
python run.py --perf-mode 131072
```

## Vector Barrier Removal Experiment

Both compile scripts accept selected generated-C++ vector barrier removals:

```bash
bash compile.sh --remove-vec-barriers line1,line2,...
bash compile_tile512.sh --remove-vec-barriers line1,line2,...
```

This only removes lines containing:

```cpp
pipe_barrier(PIPE_V);
```

The patched C++ is emitted as:

```text
build_artifacts/fa_dsl_patched.cpp
```

The compiled shared object is still:

```text
build_artifacts/fa_dsl.so
```

### Known Useful 256-Tile Variant

Use:

```bash
bash compile.sh --remove-vec-barriers 1264,1267,1272,1275,1279,1282,1311,1313,1316,1320,1322,1325,1328,1330,1333,1362,1364,1367,1371,1373,1376,1379,1381,1384,1390
python run.py --perf-mode 131072
```
2 changes: 1 addition & 1 deletion examples/aot/flash_attention/140tflops/caller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ extern "C" void call_kernel(
(void)fftsLen;

call_both<<<blockDim, nullptr, stream>>>(
(__gm__ int64_t *)fftsAddr,
(__gm__ uint64_t *)fftsAddr,
(__gm__ float *)gmSlotBuffer,
(__gm__ half *)gmSlotBuffer,
(__gm__ half *)q,
Expand Down
43 changes: 20 additions & 23 deletions examples/aot/flash_attention/140tflops/compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,44 +2,39 @@
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/compile_common.sh"

ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts"
PTO_LIB_PATH="${PTO_LIB_PATH:-/sources/pto-isa}"
NPU_ARCH="${NPU_ARCH:-dav-2201}"

PTO_LEVEL="${PTO_LEVEL:-}"
MLIR_PATH="${ARTIFACT_DIR}/fa_dsl.mlir"
GENERATED_CPP="${ARTIFACT_DIR}/fa_dsl.cpp"
PATCHED_CPP="${ARTIFACT_DIR}/fa_dsl_patched.cpp"
LIB_PATH="${ARTIFACT_DIR}/fa_dsl.so"
FA_DSL_BUILDER="${FA_DSL_BUILDER:-fa_dsl_builder.py}"
BUILDER_PATH="${SCRIPT_DIR}/${FA_DSL_BUILDER}"
PTOAS_SYNC_ARGS=(--enable-insert-sync)
RUNTIME_BUILDER_PATH="${ARTIFACT_DIR}/fa_dsl_runtime_builder.py"
BUILDER_PATH="${SCRIPT_DIR}/fa_dsl_builder.py"

if [[ $# -gt 1 ]]; then
echo "Usage: $0 [--manual-sync]" >&2
exit 2
fi

if [[ $# -eq 1 ]]; then
case "$1" in
--manual-sync)
PTOAS_SYNC_ARGS=()
;;
*)
echo "Usage: $0 [--manual-sync]" >&2
exit 2
;;
esac
fi
parse_common_compile_args "$@"

mkdir -p "${ARTIFACT_DIR}"
rm -f "${MLIR_PATH}" "${GENERATED_CPP}" "${LIB_PATH}"
rm -f "${MLIR_PATH}" "${GENERATED_CPP}" "${PATCHED_CPP}" "${LIB_PATH}" "${RUNTIME_BUILDER_PATH}"

if [[ ! -f "${BUILDER_PATH}" ]]; then
echo "Builder not found: ${BUILDER_PATH}" >&2
exit 2
fi

python "${BUILDER_PATH}" > "${MLIR_PATH}"
ptoas --pto-arch=a3 "${PTOAS_SYNC_ARGS[@]}" "${MLIR_PATH}" > "${GENERATED_CPP}"

PTOAS_ARGS=(--pto-arch=a3)
if [[ -n "${PTO_LEVEL}" ]]; then
PTOAS_ARGS+=("--pto-level=${PTO_LEVEL}")
fi
PTOAS_ARGS+=("${PTOAS_SYNC_ARGS[@]}")

ptoas "${PTOAS_ARGS[@]}" "${MLIR_PATH}" > "${GENERATED_CPP}"
maybe_patch_vec_barriers "${GENERATED_CPP}" "${PATCHED_CPP}" "${REMOVE_VEC_BARRIER_LINES}"

bisheng \
-I"${PTO_LIB_PATH}/include" \
Expand All @@ -54,9 +49,11 @@ bisheng \
-cce-enable-mix \
--npu-arch="${NPU_ARCH}" -DMEMORY_BASE \
-std=gnu++17 \
-DKERNEL_CPP="\"${GENERATED_CPP}\"" \
-DKERNEL_CPP="\"${PATCHED_CPP}\"" \
"${SCRIPT_DIR}/caller.cpp" \
-o "${LIB_PATH}"

echo "Generated ${GENERATED_CPP}."
echo "Built ${LIB_PATH}."
cp "${BUILDER_PATH}" "${RUNTIME_BUILDER_PATH}"
echo "Runtime builder ${RUNTIME_BUILDER_PATH}."
56 changes: 56 additions & 0 deletions examples/aot/flash_attention/140tflops/compile_common.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env bash

parse_common_compile_args() {
PTOAS_SYNC_ARGS=(--enable-insert-sync)
REMOVE_VEC_BARRIER_LINES=""

while [[ $# -gt 0 ]]; do
case "$1" in
--remove-vec-barriers)
if [[ $# -lt 2 || -z "$2" ]]; then
echo "--remove-vec-barriers requires a comma-separated line list" >&2
exit 2
fi
REMOVE_VEC_BARRIER_LINES="$2"
shift 2
;;
*)
echo "Usage: $0 [--remove-vec-barriers line1,line2,...]" >&2
exit 2
;;
esac
done
}

maybe_patch_vec_barriers() {
local src_cpp="$1"
local dst_cpp="$2"
local raw_lines="$3"

if [[ -z "${raw_lines}" ]]; then
PATCHED_CPP="${src_cpp}"
return
fi

python - "${src_cpp}" "${dst_cpp}" "${raw_lines}" <<'PY'
from pathlib import Path
import sys

src = Path(sys.argv[1])
dst = Path(sys.argv[2])
remove_lines = {int(part.strip()) for part in sys.argv[3].split(",") if part.strip()}

lines = src.read_text().splitlines()
patched = []
for i, line in enumerate(lines, start=1):
if i in remove_lines and "pipe_barrier(PIPE_V);" in line:
patched.append(" /* removed PIPE_V barrier via --remove-vec-barriers */")
else:
patched.append(line)

dst.write_text("\n".join(patched) + "\n")
print(f"Patched generated C++ -> {dst}")
PY

PATCHED_CPP="${dst_cpp}"
}
56 changes: 56 additions & 0 deletions examples/aot/flash_attention/140tflops/compile_tile512.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/compile_common.sh"

ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts"
PTO_LIB_PATH="${PTO_LIB_PATH:-/sources/pto-isa}"
NPU_ARCH="${NPU_ARCH:-dav-2201}"
PTO_LEVEL="${PTO_LEVEL:-}"

MLIR_PATH="${ARTIFACT_DIR}/fa_dsl.mlir"
GENERATED_CPP="${ARTIFACT_DIR}/fa_dsl.cpp"
PATCHED_CPP="${ARTIFACT_DIR}/fa_dsl_patched.cpp"
LIB_PATH="${ARTIFACT_DIR}/fa_dsl.so"
RUNTIME_BUILDER_PATH="${ARTIFACT_DIR}/fa_dsl_runtime_builder.py"
BUILDER_PATH="${SCRIPT_DIR}/fa_dsl_builder_tile512.py"

parse_common_compile_args "$@"

mkdir -p "${ARTIFACT_DIR}"
rm -f "${MLIR_PATH}" "${GENERATED_CPP}" "${PATCHED_CPP}" "${LIB_PATH}" "${RUNTIME_BUILDER_PATH}"

python "${BUILDER_PATH}" > "${MLIR_PATH}"

PTOAS_ARGS=(--pto-arch=a3)
if [[ -n "${PTO_LEVEL}" ]]; then
PTOAS_ARGS+=("--pto-level=${PTO_LEVEL}")
fi
PTOAS_ARGS+=("${PTOAS_SYNC_ARGS[@]}")

ptoas "${PTOAS_ARGS[@]}" "${MLIR_PATH}" > "${GENERATED_CPP}"
maybe_patch_vec_barriers "${GENERATED_CPP}" "${PATCHED_CPP}" "${REMOVE_VEC_BARRIER_LINES}"

bisheng \
-I"${PTO_LIB_PATH}/include" \
-fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
-Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
-xcce -Xhost-start -Xhost-end \
-mllvm -cce-aicore-stack-size=0x8000 \
-mllvm -cce-aicore-function-stack-size=0x8000 \
-mllvm -cce-aicore-record-overflow=true \
-mllvm -cce-aicore-addr-transform \
-mllvm -cce-aicore-dcci-insert-for-scalar=false \
-cce-enable-mix \
--npu-arch="${NPU_ARCH}" -DMEMORY_BASE \
-std=gnu++17 \
-DKERNEL_CPP="\"${PATCHED_CPP}\"" \
"${SCRIPT_DIR}/caller.cpp" \
-o "${LIB_PATH}"

cp "${BUILDER_PATH}" "${RUNTIME_BUILDER_PATH}"

echo "Generated ${GENERATED_CPP}."
echo "Built ${LIB_PATH}."
echo "Runtime builder ${RUNTIME_BUILDER_PATH}."
Loading
Loading