Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
f1b28f4
Fix dangling config reference causing SIGFPE on all models
bong-water-water-bong Jun 24, 2026
325b9e8
Add BitNet 1.58-bit ternary model support
bong-water-water-bong Jun 24, 2026
b42d8fd
Clean up: mark unused out_features param in dequantize_bitnet_weight
bong-water-water-bong Jun 24, 2026
12987b5
Support Bonsai 1-bit Qwen3 loading
bong-water-water-bong Jun 25, 2026
1be3dca
Add BitNet dequantization to Llama loader
bong-water-water-bong Jun 25, 2026
f3ea92a
Support all 1.58-bit and 1-bit model variants (Falcon-E, Bonsai)
bong-water-water-bong Jun 25, 2026
b04281d
Fix code review: ensure hidden_act defaults to relu2 for BitNet models
bong-water-water-bong Jun 25, 2026
25afb47
Auto-configure ROCm Tensile library paths
bong-water-water-bong Jun 25, 2026
ba75d26
Fix Lille-130m weight loading
bong-water-water-bong Jun 25, 2026
16d9eb8
Auto-configure ROCm Tensile library paths + fix lille-130m weight prefix
bong-water-water-bong Jun 25, 2026
4ebbd85
Fix OpenELM: use explicit num_query_heads/ffn_multipliers from config
bong-water-water-bong Jun 25, 2026
44c902d
Fix quantized lm_head/embed_as_linear: use linear_forward in all models
bong-water-water-bong Jun 25, 2026
26aad7e
Fix MXFP4 quantization support (issue #10)
bong-water-water-bong Jun 25, 2026
59e8b78
Fix BitNet chat template capitalize filter and short-name model aliasing
bong-water-water-bong Jun 25, 2026
d14e188
BitNet: runtime quantized matmul (repack ternary → 2-bit affine) + gr…
bong-water-water-bong Jun 25, 2026
dba1381
BitNet: runtime quantized matmul — final improvements
bong-water-water-bong Jun 25, 2026
d0d33ad
BitNet: fall back to dequantize-at-load for correctness
bong-water-water-bong Jun 25, 2026
ef551f8
BitNet: dequantize-at-load with thorough analysis of quantized path
bong-water-water-bong Jun 25, 2026
9bd0848
BitNet: fix 2-bit runtime repack layout
bong-water-water-bong Jun 25, 2026
7b0c42a
Falcon-E: support inverse-scale BitLinear checkpoints
bong-water-water-bong Jun 25, 2026
fa6fc89
docs: universal HF loading path design spec
bong-water-water-bong Jun 26, 2026
90f61a6
Universal HuggingFace loading path phase 1-3
bong-water-water-bong Jun 26, 2026
72acd40
Universal HF loading: fix review findings
bong-water-water-bong Jun 26, 2026
a1445d1
Universal HF loading: auto-quantize, quantization_config, GGUF skeleton
bong-water-water-bong Jun 26, 2026
9ab50ae
GGUF integration + auto-quantize verified
bong-water-water-bong Jun 26, 2026
b08a19c
Server + ModelManager: --auto-quantize and GGUF flags
bong-water-water-bong Jun 26, 2026
20370ee
Server --auto-quantize + generic HF weight remapping
bong-water-water-bong Jun 26, 2026
560c622
GGUF: full quant format support (Q4_0..Q6_K, K-quants)
bong-water-water-bong Jun 26, 2026
049d031
PyTorch .bin → safetensors converter
bong-water-water-bong Jun 26, 2026
ec6896b
1-bit model support: sub-norm detection + key remapping
bong-water-water-bong Jun 26, 2026
3bca870
Generic Llama fallback for unknown model types
bong-water-water-bong Jun 26, 2026
d03f974
1-bit activation quantization + weight pre-quantization
bong-water-water-bong Jun 26, 2026
a24022b
Architecture registration system + PyTorch trust_remote_code
bong-water-water-bong Jun 26, 2026
a9cd8f9
Edge case hardening: clear error messages for bad paths
bong-water-water-bong Jun 26, 2026
7b0208b
Add NPU backend: IRON JIT GEMM on AMD XDNA NPU
bong-water-water-bong Jun 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 91 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,55 @@ FetchContent_Declare(
mlx
# Repo + branch — always build against the latest ROCm backend work.
GIT_REPOSITORY https://github.com/NripeshN/mlx.git
GIT_TAG rocm-support
GIT_TAG 6abf0b7e # rocm-support (pinned working ExecUpdate commit)
GIT_SHALLOW FALSE
)
FetchContent_MakeAvailable(mlx)
# Fetch MLX, apply local patches, then add it. Patching must happen before
# add_subdirectory() so that CMakeLists.txt changes (for example removing
# unsupported ROCm clang flags) affect the generated build files; this is why
# the source is populated manually instead of via FetchContent_MakeAvailable().
#
# NOTE: the single-argument FetchContent_Populate() form is deprecated as of
# CMake 3.30 (policy CMP0169). Moving the patch step into a PATCH_COMMAND on
# the FetchContent_Declare(mlx ...) call would remove the need for it.
FetchContent_GetProperties(mlx)
if(NOT mlx_POPULATED)
  FetchContent_Populate(mlx)
endif()
set(MLX_SOURCE_DIR "${mlx_SOURCE_DIR}")

# The patch is only applied for ROCm builds, and only when the patch file is
# actually shipped with this source tree.
if(MLX_BUILD_ROCM AND MLX_SOURCE_DIR AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/patches/mlx-rocm-build.patch")
  # Use the git executable CMake resolves rather than whatever bare `git`
  # happens to be first on PATH when execute_process() runs.
  find_package(Git REQUIRED)

  # Dry run: does the patch apply cleanly to the fetched tree? stderr is
  # captured (not printed) so it can be reported if the patch turns out to be
  # neither applicable nor already applied.
  execute_process(
    COMMAND "${GIT_EXECUTABLE}" apply --check "${CMAKE_CURRENT_SOURCE_DIR}/patches/mlx-rocm-build.patch"
    WORKING_DIRECTORY "${MLX_SOURCE_DIR}"
    RESULT_VARIABLE PATCH_CHECK_RESULT
    ERROR_VARIABLE PATCH_CHECK_ERROR
    ERROR_STRIP_TRAILING_WHITESPACE
    OUTPUT_QUIET
  )
  if(PATCH_CHECK_RESULT EQUAL 0)
    message(STATUS "Applying mlx-rocm-build.patch...")
    execute_process(
      COMMAND "${GIT_EXECUTABLE}" apply "${CMAKE_CURRENT_SOURCE_DIR}/patches/mlx-rocm-build.patch"
      WORKING_DIRECTORY "${MLX_SOURCE_DIR}"
      RESULT_VARIABLE PATCH_RESULT
    )
    if(PATCH_RESULT EQUAL 0)
      message(STATUS "Patch applied successfully")
    else()
      message(FATAL_ERROR "Failed to apply mlx-rocm-build.patch")
    endif()
  else()
    # The forward check failed. If the reverse patch applies cleanly, the tree
    # was already patched by an earlier configure run and nothing is needed.
    execute_process(
      COMMAND "${GIT_EXECUTABLE}" apply --reverse --check "${CMAKE_CURRENT_SOURCE_DIR}/patches/mlx-rocm-build.patch"
      WORKING_DIRECTORY "${MLX_SOURCE_DIR}"
      RESULT_VARIABLE PATCH_REVERSE_CHECK_RESULT
      ERROR_QUIET
      OUTPUT_QUIET
    )
    if(PATCH_REVERSE_CHECK_RESULT EQUAL 0)
      message(STATUS "mlx-rocm-build.patch already applied, skipping")
    else()
      # Include git's own diagnostics so the failing hunk/file is visible.
      message(FATAL_ERROR "mlx-rocm-build.patch does not apply to fetched MLX source:\n${PATCH_CHECK_ERROR}")
    endif()
  endif()
endif()

add_subdirectory("${mlx_SOURCE_DIR}" "${mlx_BINARY_DIR}")

# nlohmann/json (MLX may already provide this)
if(NOT TARGET nlohmann_json::nlohmann_json)
Expand Down Expand Up @@ -113,6 +158,8 @@ add_library(mlx-lm-common
src/common/base_config.cpp
src/common/hub_api.cpp
src/common/safetensors.cpp
src/common/gguf_loader.cpp
src/common/registry.cpp
src/common/switch_layers.cpp
src/common/ssm_utils.cpp
src/common/rope_utils.cpp
Expand All @@ -135,6 +182,11 @@ target_link_libraries(mlx-lm-common PUBLIC
tokenizers_cpp
)
# Upstream minja headers, exported to every target that links mlx-lm-common.
target_include_directories(mlx-lm-common PUBLIC ${minja_SOURCE_DIR}/include)
# Patched minja headers (capitalize filter, etc.) take precedence over the
# upstream minja version fetched by FetchContent.
# NOTE(review): this directory is added PRIVATE, so the precedence only holds
# for mlx-lm-common's own sources; targets that link mlx-lm-common receive
# just the upstream include path above. Confirm that no consumer includes
# minja headers directly.
target_include_directories(mlx-lm-common BEFORE PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src/common/patched
)

# Propagate ROCm flag as compile definition so C++ code can use #if defined(MLX_BUILD_ROCM)
if(MLX_BUILD_ROCM)
Expand Down Expand Up @@ -188,6 +240,7 @@ add_library(mlx-lm-llm
src/llm/models/lfm2.cpp
src/llm/models/nemotron_h.cpp
src/llm/models/granite_moe_hybrid.cpp
src/llm/models/bitnet.cpp
)
target_link_libraries(mlx-lm-llm PUBLIC mlx-lm-common)

Expand Down Expand Up @@ -223,6 +276,33 @@ target_link_libraries(mlx-lm-vlm PUBLIC mlx-lm-common)
# stb include path (header-only)
target_include_directories(mlx-lm-common PUBLIC ${stb_SOURCE_DIR})

# NPU backend (optional, requires the IRON Python stack + XRT).
#
# The backend drives the IRON JIT through a Python subprocess; IRON itself is
# installed separately (pip install mlir-aie).
# NOTE(review): the NPU_INSTALL_DIR compile definition below is fixed to the
# top-level build directory, which is where the JIT helper script is copied
# (under bin/). Earlier commentary said to set it to the mlir-aie installation
# prefix; confirm which of the two the C++ side expects.
if(MLX_LM_BUILD_NPU)
  # Static library holding the NPU backend implementation.
  add_library(mlx-lm-npu STATIC
    src/npu/npu_backend.cpp
  )
  target_include_directories(mlx-lm-npu PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
  )
  # PUBLIC so that targets linking mlx-lm-npu also see MLX_BUILD_NPU.
  target_compile_definitions(mlx-lm-npu PUBLIC
    MLX_BUILD_NPU
    NPU_INSTALL_DIR="${CMAKE_BINARY_DIR}"
  )

  # Place a verbatim copy of the JIT helper script in <build>/bin.
  configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/src/npu/npu_jit.py"
    "${CMAKE_BINARY_DIR}/bin/npu_jit.py"
    COPYONLY
  )

  message(STATUS "NPU backend enabled (JIT path)")
endif()

if(MLX_LM_BUILD_EXAMPLES)
add_executable(chat examples/chat.cpp)
target_link_libraries(chat PRIVATE mlx-lm-llm mlx-lm-common mlx-lm-core)
Expand All @@ -231,6 +311,10 @@ if(MLX_LM_BUILD_EXAMPLES)
target_compile_definitions(chat PRIVATE MLX_BUILD_ROCM)
target_link_libraries(chat PRIVATE hip::host)
endif()
if(MLX_LM_BUILD_NPU AND TARGET mlx-lm-npu)
target_link_libraries(chat PRIVATE mlx-lm-npu)
target_compile_definitions(chat PRIVATE MLX_BUILD_NPU)
endif()

add_executable(diagnose examples/diagnose.cpp)
target_link_libraries(diagnose PRIVATE mlx-lm-llm mlx-lm-common mlx-lm-core)
Expand Down Expand Up @@ -258,6 +342,11 @@ if(MLX_LM_BUILD_EXAMPLES)
add_executable(test_sdpa_ref examples/test_sdpa_ref.cpp)
target_link_libraries(test_sdpa_ref PRIVATE mlx)

if(MLX_LM_BUILD_NPU AND TARGET mlx-lm-npu)
add_executable(test_npu examples/test_npu.cpp)
target_link_libraries(test_npu PRIVATE mlx-lm-npu)
endif()

add_executable(server
examples/server.cpp
src/common/server.cpp
Expand Down
72 changes: 72 additions & 0 deletions benchmark_all.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/bin/bash
# Comprehensive benchmark across all fixed models on Strix Halo (gfx1151).
#
# Every machine-specific location below can be overridden from the
# environment (ROCm_DIR, ROCM_VENV, CHAT, MAX_TOKENS); the defaults are the
# values this script was originally written with.
set -e

# ROCm SDK root; its lib/ directory is prepended to the loader search path.
export ROCm_DIR="${ROCm_DIR:-/tmp/rocm_sdk_core}"
# Python virtualenv that is activated before the benchmarks run.
ROCM_VENV="${ROCM_VENV:-/tmp/rocm_venv}"
source "$ROCM_VENV/bin/activate"
# Append the ':' separator only when LD_LIBRARY_PATH is already non-empty.
# A trailing ':' is an empty path entry, which the dynamic loader interprets
# as the current working directory.
export LD_LIBRARY_PATH="$ROCm_DIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# Chat binary under test, generation length, and the prompt fed on stdin.
CHAT="${CHAT:-/home/bcloud/lemon-mlx-engine/build/chat}"
MAX_TOKENS="${MAX_TOKENS:-100}"
PROMPT="What is the capital of France? Explain in one sentence."

# Print the run banner followed by the prompt and generation settings.
# The commit hash in the banner is a literal string, not derived from git.
echo "╔══════════════════════════════════════════════════════════════════════════╗"
echo "║ BENCHMARK: lemon-mlx-engine on Strix Halo (gfx1151) ║"
echo "║ Commit 26aad7e — All fixes applied ║"
echo "╚══════════════════════════════════════════════════════════════════════════╝"
echo ""
echo "Prompt: \"$PROMPT\""
echo "Max tokens: $MAX_TOKENS, Temperature: 0.0 (greedy)"
echo ""

# Run the chat binary once against a model and print the interesting lines of
# its combined stdout/stderr.
#
# Arguments:
#   $1    human-readable label printed in the section header
#   $2    model path passed to the chat binary
#   $3..  extra flags forwarded to the chat binary (e.g. --raw)
#
# Reads the globals CHAT, PROMPT and MAX_TOKENS.
benchmark() {
    local name="$1"
    local model_path="$2"
    shift 2
    # Keep the extra flags as an array so each one reaches the binary as
    # exactly one argument, with no re-splitting or glob expansion.
    local -a extra_args=("$@")

    echo "──────────────────────────────────────────────────────────────────────────"
    echo "▶ $name"
    echo " Path: $model_path"
    if (( ${#extra_args[@]} > 0 )); then
        echo " Args: ${extra_args[*]}"
    fi
    echo ""

    local output
    # `|| true`: a crash, a non-zero exit, or hitting the 120 s timeout must
    # not abort the remaining benchmarks under `set -e`.
    output=$(echo "$PROMPT" | timeout 120 "$CHAT" "$model_path" --max-tokens "$MAX_TOKENS" --temperature 0.0 "${extra_args[@]}" 2>&1) || true

    # Show at most the first 10 status/timing/error lines.
    echo "$output" | grep -E "(Loading model|bound HIP|Model loaded|Prompt:|Generation:|Assistant:|Error|error|Fatal|Segmentation|Unsupported)" | head -10
    echo ""
}

# Directory that contains one sub-directory per model; override with
# MODELS_DIR to run the same suite on another machine.
MODELS_DIR="${MODELS_DIR:-/home/bcloud/models}"

# 1. BASELINE: Llama-3.2-1B-Instruct-4bit
benchmark "Llama-3.2-1B-Instruct-4bit (baseline)" "$MODELS_DIR/llama-1b"

# 2. BitNet b1.58-2B-4T (1.58-bit ternary)
benchmark "BitNet b1.58-2B-4T (1.58-bit ternary)" "$MODELS_DIR/bitnet-2b"

# 3. Bonsai 1.7B (1-bit affine)
benchmark "Bonsai 1.7B (1-bit)" "$MODELS_DIR/bonsai-1.7b"

# 4. Bonsai 4B (1-bit affine)
benchmark "Bonsai 4B (1-bit)" "$MODELS_DIR/bonsai-4b"

# 5. Bonsai 8B (1-bit affine) — needs more VRAM
benchmark "Bonsai 8B (1-bit)" "$MODELS_DIR/bonsai-8b"

# 6. Qwen3-1.7B MXFP4 (issue #10 fix)
benchmark "Qwen3-1.7B-MLX-MXFP4 (MXFP4 quant)" "$MODELS_DIR/qwen3-1.7b-mxfp4"

# 7. OpenELM-3B (issue #7 segfault fix)
benchmark "OpenELM-3B (issue #7 segfault fix)" "$MODELS_DIR/openelm-3b" --raw

# 8. Granite-4.0-H-Tiny (issue #6 crash fix)
benchmark "Granite-4.0-H-Tiny (issue #6 crash fix)" "$MODELS_DIR/granite-4.0-h-tiny" --raw

# 9. Lille-130M (issue #9 dequant fix)
benchmark "Lille-130M (issue #9 dequant fix)" "$MODELS_DIR/lille-130m" --raw

# 10. Falcon-E-3B (1.58-bit, inverse-scale BitLinear)
benchmark "Falcon-E-3B (1.58-bit, inverse-scale BitLinear)" "$MODELS_DIR/falcon-e-3b"

echo "════════════════════════════════════════════════════════════════════════════"
echo "Benchmark complete."
Loading
Loading