Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 154 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# ─── BitNet CPU kernel CI ──────────────────────────────────────────────────────
#
# Builds the bitnet.cpp project with all L2-L6 kernels enabled and runs
# the kernel unit test suite. No model download (full smoke/perplexity happens
# locally or in a separate nightly workflow).
#
# Why this exists:
# - Clang ≥ 18 is required for SIMD kernels (per CLAUDE.md).
# - 3rdparty/llama.cpp is a fork (branch `merge-dev`); submodule init is
# critical for the build.
# - GCC 14 may not be installed in the runner image; we explicitly install
# libstdc++-14-dev so Clang 18 can find its system C++ headers.
#
# Trigger: every push to main, every PR targeting main, and manual workflow_dispatch.

name: kernel-ci

on:
push:
branches: [main]
pull_request:
branches: [main]
workflow_dispatch:

jobs:
build-and-test:
name: build + test (Ubuntu, clang-18)
runs-on: ubuntu-24.04
timeout-minutes: 30

steps:
- name: Checkout (with submodules)
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 1

- name: Apply dispatch patch (combined 05)
run: |
echo "Applying combined patch 05 (L3 ACDC + L5 HRR + L4 K_i8 cache + FaseIII rect + LLaMA gate)..."
chmod +x ./scripts/apply-dispatch-patches.sh
./scripts/apply-dispatch-patches.sh
echo "Verifying idempotence..."
./scripts/apply-dispatch-patches.sh --check
shell: bash

- name: Install build dependencies
run: |
sudo apt-get update
sudo apt-get install -y \
clang-18 \
cmake \
ninja-build \
libstdc++-14-dev \
python3 \
python3-pip \
python3-venv

- name: Create Python venv and install test dependencies
# Use an isolated venv to avoid PEP-668 conflicts between apt numpy/scipy
# and PyPI packages (safetensors has no numpy dep; still isolate for safety).
run: |
python3 -m venv .venv
.venv/bin/pip install --no-cache-dir numpy scipy safetensors

- name: Configure (Release, all kernels + ACDC_RECT)
# BITNET_ENABLE_ACDC_RECT defaults ON → 16 tests in CI.
# Python3_EXECUTABLE points to the venv so test_extract_acdc_diagonal
# finds the installed numpy/safetensors.
run: |
cmake -B build -G Ninja \
-DCMAKE_C_COMPILER=clang-18 \
-DCMAKE_CXX_COMPILER=clang++-18 \
-DCMAKE_BUILD_TYPE=Release \
-DBITNET_L2_WHT=ON \
-DBITNET_L3_ACDC=ON \
-DBITNET_L4_TROPICAL=ON \
-DBITNET_L5_HRR=ON \
-DBITNET_L6_RAG=ON \
-DBITNET_BUILD_TESTS=ON \
-DPython3_EXECUTABLE=$(pwd)/.venv/bin/python3

- name: Build (compiles L1 + L2-L6 + all test targets)
# Single build step — cmake discovers all targets from CMakeLists.txt.
# No hardcoded --target list: avoids breakage when targets are added/renamed.
run: cmake --build build --config Release -j$(nproc)

- name: ctest — 16/16 kernel unit tests
# BITNET_ENABLE_ACDC_RECT=ON (default) adds test_acdc_rect → 16 tests.
# -j$(nproc): parallel execution; --output-on-failure: full log on fail.
# No Python env var is set in this step: -DPython3_EXECUTABLE was passed at
# configure time, so the add_test() COMMAND for test_extract_acdc_diagonal
# is already cmake-resolved to the venv Python.
run: |
ctest --test-dir build \
--output-on-failure \
-j$(nproc) \
--timeout 120

- name: NO-06 — telemetry audit (zero hits required)
# Persona D4: the binary never sends data to external endpoints.
# Any match = CI failure.
run: |
HITS=$(grep -rn \
"telemetry\|upload_data\|send_metrics\|POST.*http" \
src/ utils/ run_inference*.py setup_env.py 2>/dev/null | \
grep -v "^Binary\|\.pyc" || true)
if [ -n "$HITS" ]; then
echo "::error::NO-06 FAIL — telemetry code found:"
echo "$HITS"
exit 1
fi
echo "NO-06 PASS — 0 telemetry hits"

- name: NO-07 — cloud URL audit (zero hits in production code)
# Ensures no hard-coded HTTP endpoints in C/C++ production sources.
# URLs in comments (// http) and docs are excluded.
run: |
HITS=$(grep -rn "http://\|https://" \
src/ include/ \
--include="*.cpp" --include="*.h" | \
grep -v "//.*http\|/\*.*http\| \* http" || true)
if [ -n "$HITS" ]; then
echo "::error::NO-07 FAIL — cloud URLs in production code:"
echo "$HITS"
exit 1
fi
echo "NO-07 PASS — 0 cloud URL hits"

- name: Cross-validation C ↔ Python (L3/L4/L5)
# Verifies that the Python reference implementations match the C kernels
# to rtol=1e-5, atol=1e-7. No model required.
# --build-dir points to the cmake output dir (build/tests/), not the
# local development build (build_tests/).
run: |
.venv/bin/python3 tests/cross_validation.py \
--all \
--build-dir build/tests
echo "Cross-validation: PASS"

- name: Air-gapped boot test (AC-11)
# Verifies that the built llama-cli binary runs without making any
# network syscalls. This enforces persona D4 (no telemetry, no cloud)
# at the CI level. The script is in tests/test_air_gapped_boot.sh;
# it auto-skips if no model file is provided (which is the case in CI).
# Result: SKIPPED is acceptable in CI; PASS requires a real model.
run: |
chmod +x tests/test_air_gapped_boot.sh
bash tests/test_air_gapped_boot.sh 2>&1 | tee /tmp/air_gapped.log
rc=${PIPESTATUS[0]}
if [ $rc -ne 0 ]; then
echo "::error::AC-11 air-gapped boot FAILED (rc=$rc)"
cat /tmp/air_gapped.log
exit $rc
fi
1 change: 1 addition & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
path = 3rdparty/llama.cpp
url = https://github.com/Eddie-Wang1120/llama.cpp.git
branch = merge-dev
ignore = dirty
43 changes: 39 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,22 @@ endif()

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

# option list
option(BITNET_ARM_TL1 "bitnet.cpp: use tl1 on arm platform" OFF)
option(BITNET_X86_TL2 "bitnet.cpp: use tl2 on x86 platform" OFF)

# ─── Level 1: kernel format ──────────────────────────────────────────────────
option(BITNET_ARM_TL1 "bitnet.cpp: use TL1 lookup-table kernel (ARM64)" OFF)
option(BITNET_X86_TL2 "bitnet.cpp: use TL2 lookup-table kernel (x86_64)" OFF)

# ─── Level 2-6: math research kernels ────────────────────────────────────────
option(BITNET_L2_WHT "bitnet.cpp: WHT zero-mul GEMV (Level 2)" ON)
option(BITNET_L3_ACDC "bitnet.cpp: FWHT+ACDC O(n log n) layers (Level 3)" ON)
option(BITNET_L4_TROPICAL "bitnet.cpp: Tropical attention (max,+) (Level 4)" ON)
option(BITNET_L5_HRR "bitnet.cpp: Holographic memory HRR (Level 5)" ON)
option(BITNET_L6_RAG "bitnet.cpp: CPU-RAG flat-index ANN engine (Level 6)" ON)
option(BITNET_RAG_SHARED "bitnet.cpp: build bitnet_rag as a shared lib (ctypes)" OFF)
option(BITNET_BUILD_TESTS "bitnet.cpp: build kernel unit tests" ON)
# FWHT parallel (OpenMP): opt-in. Default OFF so the ggml inference path (which
# runs inside a ggml thread-pool callback) is never affected. Enable only for
# standalone benchmarks / extraction tools that run outside ggml.
option(BITNET_FWHT_OMP "bitnet.cpp: OpenMP-parallel fwht_f32_parallel() (benchmark use)" OFF)

set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11)
Expand All @@ -38,10 +50,33 @@ endif()

find_package(Threads REQUIRED)

# ─── src/ ─────────────────────────────────────────────────────────────────────
# Compiles L2-L5 into the bitnet_math OBJECT library.
# Sets BITNET_MATH_TARGET in this scope (empty string if no levels enabled).
add_subdirectory(src)

# ─── 3rdparty/llama.cpp ───────────────────────────────────────────────────────
# Defines the ggml target (which already contains L1 kernels via hardcoded paths).
set(LLAMA_BUILD_SERVER ON CACHE BOOL "Build llama.cpp server" FORCE)
add_subdirectory(3rdparty/llama.cpp)

# ─── Wire L2-L5 into ggml ────────────────────────────────────────────────────
# After both subdirectories are processed, both `bitnet_math` and `ggml` exist.
# We add the OBJECT library to ggml so L2-L5 symbols are available in all
# llama.cpp binaries (llama-cli, llama-server, llama-bench, etc.)
# without any extra linker flags on the caller side.
# Link the kernel OBJECT library (if any) into ggml.
#
# BITNET_MATH_TARGET is expected to hold the name of a target created by
# add_subdirectory(src), or to be empty when no kernel level is enabled.
# target_link_libraries() treats a name that is NOT a known target as a plain
# library name and passes it to the linker, so a stale or misspelled value
# would only surface as an obscure link error. Validate every entry here so the
# mistake is reported at configure time instead.
if(BITNET_MATH_TARGET)
  foreach(bitnet_math_tgt IN LISTS BITNET_MATH_TARGET)
    if(NOT TARGET "${bitnet_math_tgt}")
      message(FATAL_ERROR
        "BitNet: BITNET_MATH_TARGET names '${bitnet_math_tgt}', which is not a CMake target. "
        "It must be created by add_subdirectory(src) before it can be linked into ggml.")
    endif()
  endforeach()

  # PUBLIC: the object files are linked into ggml itself, and the library's
  # usage requirements are propagated to targets that link ggml.
  target_link_libraries(ggml PUBLIC ${BITNET_MATH_TARGET})
  message(STATUS "BitNet: L2-L5 kernels linked into ggml target")
endif()

# ─── Tests ────────────────────────────────────────────────────────────────────
# Standalone unit tests for L2-L5 kernels. Add -DBITNET_BUILD_TESTS=OFF to skip.
# Register the kernel unit tests only when BITNET_BUILD_TESTS is ON (its
# default). enable_testing() is called here, in the top-level list file and
# before tests/ is added, so that tests registered by that subdirectory are
# visible to ctest when it is run against the top of the build tree.
if (BITNET_BUILD_TESTS)
enable_testing()
add_subdirectory(tests)
endif()

# install

include(GNUInstallDirs)
Expand Down
25 changes: 25 additions & 0 deletions include/bitnet-lut-kernels.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* bitnet-lut-kernels.h — Lookup-table GEMM kernel stubs
*
* This file is normally generated by:
* python utils/codegen_tl1.py (ARM64 TL1 kernels)
* python utils/codegen_tl2.py (x86_64 TL2 kernels)
*
* Or automatically via:
* python setup_env.py -md <model_dir> -q tl1
* python setup_env.py -md <model_dir> -q tl2
*
* This stub allows cmake to configure and build with I2_S kernels (default)
* without running codegen first. TL1/TL2 functionality is disabled when
* neither GGML_BITNET_ARM_TL1 nor GGML_BITNET_X86_TL2 is defined.
*/

#pragma once

/* This stub carries no TL1 (ARM64) kernels: stop the build with an
 * actionable message if they were requested. */
#ifdef GGML_BITNET_ARM_TL1
#  error "TL1 kernels not generated yet. Run: python utils/codegen_tl1.py"
#endif

/* Same guard for the TL2 (x86_64) kernels. */
#ifdef GGML_BITNET_X86_TL2
#  error "TL2 kernels not generated yet. Run: python utils/codegen_tl2.py"
#endif
94 changes: 94 additions & 0 deletions include/ggml-bitnet-common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/*
* ggml-bitnet-common.h — Shared utilities across L2-L5 math kernels
*
* ─────────────────────────────────────────────────────────────────────────
* WHY THIS HEADER IS SMALL
* ─────────────────────────────────────────────────────────────────────────
*
* The natural impulse when seeing three "butterfly" implementations
* (L2 WHT, L3 FWHT, L5 FFT) is to extract a shared `butterfly_step()`
* abstraction. After actually reading all three, that abstraction is
* *not* a clean win — see the taxonomy below.
*
* The only piece that genuinely duplicates across kernels is the
* "smallest power of 2 ≥ n" rounding utility (needed by L3 FWHT and
* L5 FFT to pad their input vectors to a power of 2). Extracting
* that, plus a few other small bits, is the right scope for a
* "shared common" header. The butterfly operations themselves stay
* per-kernel for clarity and to allow per-algorithm SIMD tricks
* (e.g. L3 processes 8 float32 pairs at once in pure AVX2 add/sub;
* L5 needs twiddle multiplications and complex number handling).
*
* ─────────────────────────────────────────────────────────────────────────
* ALGORITHM TAXONOMY (L2 / L3 / L5)
* ─────────────────────────────────────────────────────────────────────────
*
* L2 WHT (src/ggml-bitnet-wht.cpp)
* Algorithm: selection-mask dot product on I2_S packed bytes.
* NOT a Cooley-Tukey butterfly. The "Hadamard domain"
* trick is: H·x with H ∈ {±1} is computed via
* `(w==+1 ? x : 0) − (w==−1 ? x : 0)` per element, with
* 32-wide AVX2 compare/select on packed bytes.
* Zero muls, no bit-reversal, in-place.
*
* L3 FWHT (src/ggml-bitnet-fwht.cpp)
* Algorithm: in-order Cooley-Tukey radix-2 butterfly, real-valued.
* Twiddles are always ±1 (Hadamard matrix), so the inner operation
* is pure (a+b, a-b) — no multiplications.
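* In-order: no bit-reversal permutation is needed. The natural
* (Sylvester/Hadamard) matrix H_N = H_2 ⊗ … ⊗ H_2 is unchanged when
* both index sets are bit-reversed, and its butterfly stages commute,
* so naturally ordered input yields the final-form H matrix directly.
* (L3 uses a DIT-like stage order, but for the WHT any stage order
* gives the same result.)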
* Variants: f32 and i32, scalar + AVX2 + NEON.
*
* L5 FFT (src/ggml-bitnet-hrr.cpp)
* Algorithm: Cooley-Tukey radix-2, complex-valued, with
* twiddle factors exp(−2πi·k/N). Bit-reversal permutation on
* input, so that the output comes out in natural order.
* NOTE(review): this comment originally called the scheme DIF, but
* bit-reversed input → natural-order output is the conventional
* decimation-in-time (DIT) flow; DIF takes natural-order input and
* yields bit-reversed output. Confirm against the implementation.
* Twiddles require complex multiplications (4 mults + 2 adds
* per butterfly, or 3 mults + 3 adds with the standard trick).
* Only two of the log₂(N) stages have all twiddles in {±1, ±i} and
* could avoid multiplications, but we don't bother (FMAs are cheap).
*
* Conclusion: there is no common butterfly() to share. L2 is
* fundamentally different (selection mask, not butterfly), and L3/L5
* differ on twiddle handling, value type (real vs complex), and
* permutation (in-order vs bit-reversed). Forcing a shared API
* would obscure the math more than it would simplify the code.
*
* ─────────────────────────────────────────────────────────────────────────
* WHAT IS SHARED
* ─────────────────────────────────────────────────────────────────────────
*
* - bitnet_next_pow2: smallest power of 2 ≥ n (used by L3, L5 to pad)
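* - BITNET_L* build-flag summary: listed as shared, but nothing in this
* header currently defines or re-exports it (NOTE(review): add the
* summary or drop this bullet)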
* - The taxonomy comment above (so future agents don't make the
* same "let's extract a butterfly" mistake)
*/

#pragma once

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/* ── bitnet_next_pow2 ────────────────────────────────────────────────────
 *
 * Returns the smallest power of 2 that is ≥ n. For n ≤ 1, returns 1.
 *
 * This header only declares the function; the definition is not part of
 * this file. The declaration sits inside the extern "C" block, so it has
 * C linkage when the header is included from C++.
 *
 * Parameters:
 *   n  requested length (element count).
 *
 * NOTE(review): the result for n > 2^30 is not representable in a 32-bit
 * int and is not specified here — confirm how the implementation handles it.
 *
 * Used by:
 *   - L3 FWHT (src/ggml-bitnet-fwht.cpp): pads activation vectors
 *     to power-of-2 length before applying the butterfly.
 *   - L5 FFT (src/ggml-bitnet-hrr.cpp): pads HRR vectors to power-of-2
 *     length for the radix-2 Cooley-Tukey FFT.
 *
 * L2 WHT does NOT use this (operates on fixed QK block size).
 * L4 tropical does NOT use this (operates per-token, not on fixed FFT blocks).
 */
int bitnet_next_pow2(int n);

#ifdef __cplusplus
}
#endif
Loading