microsoft · peder1981 · Jun 21, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,158 @@
+# ─── BitNet CPU kernel CI ──────────────────────────────────────────────────────
+#
+# Builds the bitnet.cpp project with all L2-L6 kernels enabled and runs
+# the kernel unit test suite. No model download (full smoke/perplexity happens
+# locally or in a separate nightly workflow).
+#
+# Why this exists:
+#   - Clang ≥ 18 is required for SIMD kernels (per CLAUDE.md).
+#   - 3rdparty/llama.cpp is a fork (branch `merge-dev`); submodule init is
+#     critical for the build.
+#   - GCC 14 may not be installed in the runner image; we explicitly install
+#     libstdc++-14-dev so Clang 18 can find its system C++ headers.
+#
+# Trigger: pushes to main, PRs targeting main, and manual workflow_dispatch.
+
+name: kernel-ci
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+
+# Least privilege: every step below only needs to read the repository.
+permissions:
+  contents: read
+
+jobs:
+  build-and-test:
+    name: build + test (Ubuntu, clang-18)
+    runs-on: ubuntu-24.04  # fixed image version rather than ubuntu-latest
+    timeout-minutes: 30  # upper bound for the whole job
+
+    steps:
+      - name: Checkout (with submodules)
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive  # fetches 3rdparty/llama.cpp, which the CMake build adds
+          fetch-depth: 1  # only the tip commit of the superproject
+      - name: Apply dispatch patch (combined 05)
+        run: |
+          echo "Applying combined patch 05 (L3 ACDC + L5 HRR + L4 K_i8 cache + FaseIII rect + LLaMA gate)..."
+          chmod +x ./scripts/apply-dispatch-patches.sh
+          ./scripts/apply-dispatch-patches.sh
+          echo "Verifying idempotence..."
+          ./scripts/apply-dispatch-patches.sh --check
+        shell: bash  # explicit bash: Actions runs the script with `-eo pipefail`
+
+      - name: Install build dependencies
+        # Toolchain: clang-18, cmake, ninja. libstdc++-14-dev provides the C++
+        # standard-library headers that clang-18 compiles against. The python3
+        # packages back the virtualenv created in the next step.
+        run: |
+          sudo apt-get update
+          sudo apt-get install --yes \
+            clang-18 cmake ninja-build \
+            libstdc++-14-dev \
+            python3 python3-pip python3-venv
+
+      - name: Create Python venv and install test dependencies
+        # A dedicated venv sidesteps PEP 668 (externally-managed environment)
+        # clashes between apt-provided and PyPI-provided numpy/scipy.
+        run: |
+          python3 -m venv .venv
+          .venv/bin/python3 -m pip install --no-cache-dir numpy scipy safetensors
+
+      - name: Configure (Release, all kernels + ACDC_RECT)
+        # BITNET_ENABLE_ACDC_RECT is not passed here; it relies on its default
+        # (ON → 16 tests in CI). Python3_EXECUTABLE pins the venv interpreter so
+        # test_extract_acdc_diagonal finds the installed numpy/safetensors.
+        run: |
+          cmake -B build -G Ninja \
+            -DCMAKE_C_COMPILER=clang-18 \
+            -DCMAKE_CXX_COMPILER=clang++-18 \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DBITNET_L2_WHT=ON \
+            -DBITNET_L3_ACDC=ON \
+            -DBITNET_L4_TROPICAL=ON \
+            -DBITNET_L5_HRR=ON \
+            -DBITNET_L6_RAG=ON \
+            -DBITNET_BUILD_TESTS=ON \
+            -DPython3_EXECUTABLE="$(pwd)/.venv/bin/python3"
+
+      - name: Build (compiles L1 + L2-L6 + all test targets)
+        # One invocation builds the default target set that cmake derives from
+        # CMakeLists.txt; there is no --target list to keep in sync with it.
+        run: cmake --build build --config Release --parallel "$(nproc)"
+
+      - name: ctest — 16/16 kernel unit tests
+        # BITNET_ENABLE_ACDC_RECT=ON (default) adds test_acdc_rect → 16 tests.
+        # -j$(nproc): parallel execution; --output-on-failure: full log on fail.
+        # The Python used by test_extract_acdc_diagonal was fixed at configure
+        # time (-DPython3_EXECUTABLE); --no-tests=error fails an empty test set.
+        run: |
+          ctest --test-dir build \
+            --output-on-failure --no-tests=error \
+            -j$(nproc) \
+            --timeout 120
+
+      - name: NO-06 — telemetry audit (zero hits required)
+        # Persona D4: the binary never sends data to external endpoints.
+        # A single matching line fails the job.
+        run: |
+          found=$(grep -rn "telemetry\|upload_data\|send_metrics\|POST.*http" \
+            src/ utils/ run_inference*.py setup_env.py 2>/dev/null | \
+            grep -v "^Binary\|\.pyc" || true)
+          if [ -z "$found" ]; then
+            echo "NO-06 PASS — 0 telemetry hits"
+          else
+            echo "::error::NO-06 FAIL — telemetry code found:"
+            echo "$found"
+            exit 1
+          fi
+
+      - name: NO-07 — cloud URL audit (zero hits in production code)
+        # Scans *.cpp / *.h under src/ and include/ for hard-coded http(s) URLs.
+        # Lines where `//`, `/*` or ` * ` precedes "http" (comments) are dropped.
+        run: |
+          urls=$(grep -rn --include="*.cpp" --include="*.h" \
+            "http://\|https://" src/ include/ | \
+            grep -v "//.*http\|/\*.*http\| \* http" || true)
+          if [ -z "$urls" ]; then
+            echo "NO-07 PASS — 0 cloud URL hits"
+          else
+            echo "::error::NO-07 FAIL — cloud URLs in production code:"
+            echo "$urls"
+            exit 1
+          fi
+
+      - name: Cross-validation C ↔ Python (L3/L4/L5)
+        # Verifies that the Python reference implementations match the C kernels
+        # to rtol=1e-5, atol=1e-7. No model required.
+        # --build-dir points to the cmake output dir (build/tests/), not the
+        # local development build (build_tests/).
+        run: |
+          .venv/bin/python3 tests/cross_validation.py \
+            --all \
+            --build-dir build/tests
+          echo "Cross-validation: PASS"  # reached only if the script above exited 0
+
+      - name: Air-gapped boot test (AC-11)
+        # Runs tests/test_air_gapped_boot.sh, which checks that the built
+        # llama-cli binary makes no network syscalls (persona D4: no telemetry,
+        # no cloud). CI provides no model file, so the script auto-skips here:
+        # SKIPPED is an acceptable CI result; PASS requires a real model.
+        # The script's own exit code (not tee's) decides the step outcome.
+        run: |
+          chmod +x tests/test_air_gapped_boot.sh
+          bash tests/test_air_gapped_boot.sh 2>&1 | tee /tmp/air_gapped.log
+          rc=${PIPESTATUS[0]}
+          if (( rc != 0 )); then
+            echo "::error::AC-11 air-gapped boot FAILED (rc=$rc)"
+            cat /tmp/air_gapped.log
+            exit "$rc"
+          fi
diff --git a/.gitmodules b/.gitmodules
@@ -2,3 +2,4 @@
 	path = 3rdparty/llama.cpp
 	url = https://github.com/Eddie-Wang1120/llama.cpp.git
 	branch = merge-dev
+	ignore = dirty  # work-tree edits inside the submodule are not reported; a moved HEAD still is
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -11,10 +11,22 @@ endif()
 
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 
-# option list
-option(BITNET_ARM_TL1    "bitnet.cpp: use tl1 on arm platform"    OFF)
-option(BITNET_X86_TL2    "bitnet.cpp: use tl2 on x86 platform"    OFF)
-
+# ─── Level 1: kernel format ──────────────────────────────────────────────────
+option(BITNET_ARM_TL1    "bitnet.cpp: use TL1 lookup-table kernel (ARM64)"   OFF)
+option(BITNET_X86_TL2    "bitnet.cpp: use TL2 lookup-table kernel (x86_64)"  OFF)
+
+# ─── Level 2-6: research kernels, plus library / test / benchmark switches ───
+option(BITNET_L2_WHT      "bitnet.cpp: WHT zero-mul GEMV (Level 2)"           ON)
+option(BITNET_L3_ACDC     "bitnet.cpp: FWHT+ACDC O(n log n) layers (Level 3)" ON)
+option(BITNET_L4_TROPICAL "bitnet.cpp: Tropical attention (max,+) (Level 4)"  ON)
+option(BITNET_L5_HRR      "bitnet.cpp: Holographic memory HRR (Level 5)"      ON)
+option(BITNET_L6_RAG      "bitnet.cpp: CPU-RAG flat-index ANN engine (Level 6)" ON)
+option(BITNET_RAG_SHARED  "bitnet.cpp: build bitnet_rag as a shared lib (ctypes)" OFF)
+option(BITNET_BUILD_TESTS "bitnet.cpp: build kernel unit tests"               ON)
+# FWHT parallel (OpenMP): opt-in. Default OFF so the ggml inference path (which
+# runs inside a ggml thread-pool callback) is never affected.  Enable only for
+# standalone benchmarks / extraction tools that run outside ggml.
+option(BITNET_FWHT_OMP    "bitnet.cpp: OpenMP-parallel fwht_f32_parallel() (benchmark use)" OFF)
 
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
@@ -38,10 +50,33 @@ endif()
 
 find_package(Threads REQUIRED)
 
+# ─── src/ ─────────────────────────────────────────────────────────────────────
+# Compiles L2-L5 into the bitnet_math OBJECT library and sets BITNET_MATH_TARGET
+# (empty if no level is enabled). NOTE(review): needs PARENT_SCOPE in src/ — confirm.
 add_subdirectory(src)
+
+# ─── 3rdparty/llama.cpp ───────────────────────────────────────────────────────
+# Defines the ggml target (which already contains L1 kernels via hardcoded paths).
 set(LLAMA_BUILD_SERVER ON CACHE BOOL "Build llama.cpp server" FORCE)
 add_subdirectory(3rdparty/llama.cpp)
 
+# ─── Wire L2-L5 into ggml ────────────────────────────────────────────────────
+# Both `bitnet_math` (from src/) and `ggml` (from 3rdparty/llama.cpp) exist by
+# now. bitnet_math is an OBJECT library, so linking it here places its object
+# files inside ggml itself; every llama.cpp binary that links ggml (llama-cli,
+# llama-server, llama-bench, etc.) then gets the symbols with no extra flags.
+if (BITNET_MATH_TARGET)
+    target_link_libraries(ggml PUBLIC ${BITNET_MATH_TARGET})
+    message(STATUS "BitNet: L2-L5 kernels linked into ggml target")
+endif()
+
+# ─── Tests ────────────────────────────────────────────────────────────────────
+# Standalone unit tests for L2-L5 kernels. Add -DBITNET_BUILD_TESTS=OFF to skip.
+if (BITNET_BUILD_TESTS)
+    enable_testing()  # called at the top level so ctest run from the build root sees the tests
+    add_subdirectory(tests)
+endif()
+
 # install
 
 include(GNUInstallDirs)

diff --git a/include/bitnet-lut-kernels.h b/include/bitnet-lut-kernels.h
@@ -0,0 +1,25 @@
+/*
+ * bitnet-lut-kernels.h — placeholder for the lookup-table GEMM kernels
+ *
+ * The real contents of this file are produced by code generation:
+ *   python utils/codegen_tl1.py  (ARM64 TL1 kernels)
+ *   python utils/codegen_tl2.py  (x86_64 TL2 kernels)
+ *
+ * or, indirectly, by the environment setup script:
+ *   python setup_env.py -md <model_dir> -q tl1
+ *   python setup_env.py -md <model_dir> -q tl2
+ *
+ * Until codegen has run, this placeholder lets cmake configure and build
+ * the default I2_S kernels. Defining GGML_BITNET_ARM_TL1 or
+ * GGML_BITNET_X86_TL2 against the placeholder is a hard compile error.
+ */
+
+#pragma once
+
+#ifdef GGML_BITNET_ARM_TL1
+#error "TL1 kernels not generated yet. Run: python utils/codegen_tl1.py"
+#endif
+
+#ifdef GGML_BITNET_X86_TL2
+#error "TL2 kernels not generated yet. Run: python utils/codegen_tl2.py"
+#endif
diff --git a/include/ggml-bitnet-common.h b/include/ggml-bitnet-common.h
@@ -0,0 +1,94 @@
+/*
+ * ggml-bitnet-common.h — Shared utilities across L2-L5 math kernels
+ *
+ * ─────────────────────────────────────────────────────────────────────────
+ * WHY THIS HEADER IS SMALL
+ * ─────────────────────────────────────────────────────────────────────────
+ *
+ * The natural impulse when seeing three "butterfly" implementations
+ * (L2 WHT, L3 FWHT, L5 FFT) is to extract a shared `butterfly_step()`
+ * abstraction. After actually reading all three, that abstraction is
+ * *not* a clean win — see the taxonomy below.
+ *
+ * The only piece that genuinely duplicates across kernels is the
+ * "smallest power of 2 ≥ n" rounding utility (needed by L3 FWHT and
+ * L5 FFT to pad their input vectors to a power of 2). Extracting
+ * that, plus a few other small bits, is the right scope for a
+ * "shared common" header. The butterfly operations themselves stay
+ * per-kernel for clarity and to allow per-algorithm SIMD tricks
+ * (e.g. L3 processes 8 float32 pairs at once in pure AVX2 add/sub;
+ * L5 needs twiddle multiplications and complex number handling).
+ *
+ * ─────────────────────────────────────────────────────────────────────────
+ * ALGORITHM TAXONOMY (L2 / L3 / L5)
+ * ─────────────────────────────────────────────────────────────────────────
+ *
+ *   L2 WHT (src/ggml-bitnet-wht.cpp)
+ *       Algorithm: selection-mask dot product on I2_S packed bytes.
+ *                 NOT a Cooley-Tukey butterfly. The "Hadamard domain"
+ *                 trick is: H·x with H ∈ {±1} is computed via
+ *                 `(w==+1 ? x : 0) − (w==−1 ? x : 0)` per element, with
+ *                 32-wide AVX2 compare/select on packed bytes.
+ *       Zero muls, no bit-reversal, in-place.
+ *
+ *   L3 FWHT (src/ggml-bitnet-fwht.cpp)
+ *       Algorithm: in-order Cooley-Tukey radix-2 butterfly, real-valued.
+ *       Twiddles are always ±1 (Hadamard matrix), so the inner operation
+ *       is pure (a+b, a-b) — no multiplications.
+ *       In-order, no bit-reversal: the natural-order Hadamard matrix is
+ *       a Kronecker power of [[1,1],[1,−1]], so the (a+b, a−b) stages
+ *       map natural-order input straight to natural-order output.
+ *       Variants: f32 and i32, scalar + AVX2 + NEON.
+ *
+ *   L5 FFT (src/ggml-bitnet-hrr.cpp)
+ *       Algorithm: Cooley-Tukey radix-2, complex-valued, with twiddle
+ *       factors exp(−2πi·k/N). Bit-reversal permutation on input, which
+ *       is the decimation-in-time (DIT) layout; DIF instead reorders the
+ *       output. NOTE(review): this was labelled DIF — check hrr.cpp.
+ *       Twiddles require complex multiplications (4 mults + 2 adds
+ *       per butterfly, or 3 mults + 3 adds with the standard trick).
+ *       Only the first two of the log₂(N) stages have twiddles in {±1, ±i}
+ *       and could avoid multiplications, but we don't bother (FMAs are cheap).
+ *
+ *   Conclusion: there is no common butterfly() to share. L2 is
+ *   fundamentally different (selection mask, not butterfly), and L3/L5
+ *   differ on twiddle handling, value type (real vs complex), and
+ *   permutation (in-order vs bit-reversed). Forcing a shared API
+ *   would obscure the math more than it would simplify the code.
+ *
+ * ─────────────────────────────────────────────────────────────────────────
+ * WHAT IS SHARED
+ * ─────────────────────────────────────────────────────────────────────────
+ *
+ *   - bitnet_next_pow2: smallest power of 2 ≥ n (used by L3, L5 to pad)
+ *   - NOTE(review): no BITNET_L* flag summary exists here; see CMakeLists.txt
+ *   - The taxonomy comment above (so future agents don't make the
+ *     same "let's extract a butterfly" mistake)
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ── bitnet_next_pow2 ────────────────────────────────────────────────────
+ *
+ * Returns the smallest power of 2 that is ≥ n; for n ≤ 1 the result is 1.
+ * NOTE(review): once n exceeds the largest power of 2 that fits in int, no
+ * valid result exists; what happens then is up to the implementation — confirm.
+ * Callers:
+ *   - L3 FWHT (src/ggml-bitnet-fwht.cpp): pads activation vectors
+ *     to power-of-2 length before applying the butterfly.
+ *   - L5 FFT  (src/ggml-bitnet-hrr.cpp): pads HRR vectors to power-of-2
+ *     length for the radix-2 Cooley-Tukey FFT.
+ *
+ * L2 WHT does NOT use this (operates on fixed QK block size).
+ * L4 tropical does NOT use this (operates per-token, not on fixed FFT blocks).
+ */
+int bitnet_next_pow2(int n);
+
+#ifdef __cplusplus
+}
+#endif