ans: order-0 rANS coder + WGSL asset compression

Adds src/util/ans.{h,cc}, a per-chunk-adaptive order-0 rANS entropy coder. Decoder is always built; encoder is gated on ANS_ENABLE_ENCODER (tools only). Both sides take an optional 256-entry initial_counts table to seed the adaptive model. The per-chunk initial state is (1 << kBits). Higher initial states (e.g. with a signature packed into the upper bits) force a renorm-emit at iter 0 that the decoder never consumes, corrupting multi-chunk streams once stats become skewed. Asset pipeline: - AssetRecord gains 'compression' and 'uncompressed_size' fields. - asset_packer scans every WGSL file to build a corpus-wide byte histogram, then ANS-encodes each shader using that histogram as the seed. Histogram and accessor are emitted alongside the asset table. Round-trip verification runs at pack time for every compressed asset; failures fall back to uncompressed storage. - asset_manager decompresses on first GetAsset(), caches the heap-allocated buffer, and DropAsset / ReloadAssetsFromFile free it along with the procedural cache. - Disk-load (dev) builds are unchanged: WGSL paths stay as filenames. Tests: - src/tests/util/test_ans.cc: roundtrip variants (empty, single byte, single-symbol run, all-zeros, random uniform/skewed, repeated ASCII), seeded-vs-uniform compression, rejection of mismatched counts / corruption / truncation, PeekUncompressedSize. - 37/37 dev, 36/36 STRIP_ALL. Compression observed: WGSL shaders shrink to ~0.62-0.71x in the main workspace (81 of 105 assets qualify). Docs: - doc/ANS.md (new): algorithm, bitstream, API, asset pipeline integration, compression numbers, limitations, tests. - doc/ASSET_SYSTEM.md: new Compression section + updated technical guarantees for compressed assets. - doc/COMPLETED.md: May 2026 entry. - PROJECT_CONTEXT.md: Build status line mentions WGSL ANS compression. - CLAUDE.md, GEMINI.md: tier-3 build doc list includes ANS.md.
author: skal <pascal.massimino@gmail.com> 2026-05-14 19:09:39 +0200
committer: skal <pascal.massimino@gmail.com> 2026-05-14 19:11:28 +0200
commit: 6ef8f578817ee0134fd5867ca3b80590e3eb2368 (patch)
tree: 5550607e5c4a16ca237bfa4430ac1ef1f5d80c5d /src/util/ans.cc
parent: 4bcbe13dab5ffb64d93cc61956f07ee5168a84c9 (diff)
1 files changed, 215 insertions, 0 deletions
diff --git a/src/util/ans.cc b/src/util/ans.cc
new file mode 100644
index 0000000..aa79e4b
--- /dev/null
+++ b/src/util/ans.cc
@@ -0,0 +1,215 @@
+// This file is part of the 64k demo project.
+// Order-0 rANS coder. See ans.h for the bitstream format.
+
+#include "util/ans.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+
+namespace ans {
+
+namespace {
+
+// Per-chunk adaptive byte model. Probabilities sum to 1<<kBits after
+// normalize(); raw counts in 'stats' keep adapting across chunks.
+struct Stats {
+  uint32_t stats[kNumSymbols];
+  uint32_t cumul[kNumSymbols + 1];
+
+  // Seeds 'stats' from caller counts (or all-ones if null) and builds the
+  // normalized cumulative table. stats_to_cumul() floors zero counts, so
+  // 'init' doesn't have to.
+  void init(const uint32_t* initial_counts) {
+    if (initial_counts) {
+      std::memcpy(stats, initial_counts, sizeof(stats));
+    } else {
+      std::fill_n(stats, kNumSymbols, 1u);
+    }
+    normalize();
+  }
+
+  // Rebuilds 'cumul' from 'stats'. Every slot is floored at 1 so each
+  // symbol remains reachable even if it has been seen zero times.
+  void stats_to_cumul() {
+    uint32_t acc = 0;
+    for (int i = 0; i < kNumSymbols; ++i) {
+      cumul[i] = acc;
+      acc += stats[i] ? stats[i] : 1u;
+    }
+    cumul[kNumSymbols] = acc;
+  }
+
+  // Rescales 'cumul' so cumul[NUM_SYMBOLS] == (1 << kBits). Ceil-multiply
+  // preserves monotonicity and prevents any active slot from collapsing.
+  void normalize() {
+    stats_to_cumul();
+    const uint32_t target = 1u << kBits;
+    while (cumul[kNumSymbols] != target) {
+      const double ratio = (double)target / (double)cumul[kNumSymbols];
+      for (int i = 0; i <= kNumSymbols; ++i) {
+        cumul[i] = (uint32_t)std::ceil((double)cumul[i] * ratio);
+      }
+    }
+  }
+
+  // Decoder lookup: locate the slot containing 's' in [0, 1<<kBits).
+  // Returns the symbol, its probability, and the residual offset within
+  // the slot. Increments the symbol's count on the fly.
+  void decode_lookup(uint32_t s, uint8_t* sym, uint32_t* p, uint32_t* r) {
+    // upper_bound(s) - 1; cumul is strictly non-decreasing.
+    const uint32_t* it =
+        std::upper_bound(cumul, cumul + kNumSymbols + 1, s);
+    const int c = (int)(it - cumul) - 1;
+    stats[c] += 1;
+    *sym = (uint8_t)c;
+    *p = cumul[c + 1] - cumul[c];
+    *r = s - cumul[c];
+  }
+
+  // Encoder lookup: probability and base offset for symbol 'c'.
+  void encode_lookup(int c, uint32_t* p, uint32_t* base) {
+    stats[c] += 1;
+    *p = cumul[c + 1] - cumul[c];
+    *base = cumul[c];
+  }
+};
+
+// Big-endian primitive readers/writers.
+inline uint16_t ReadU16BE(const uint8_t* p) {
+  return (uint16_t)((p[0] << 8) | p[1]);
+}
+inline uint32_t ReadU32BE(const uint8_t* p) {
+  return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
+         ((uint32_t)p[2] << 8) | (uint32_t)p[3];
+}
+
+#if defined(ANS_ENABLE_ENCODER)
+inline void AppendU16BE(std::vector<uint8_t>* dst, uint16_t v) {
+  dst->push_back((uint8_t)(v >> 8));
+  dst->push_back((uint8_t)(v & 0xff));
+}
+inline void AppendU32BE(std::vector<uint8_t>* dst, uint32_t v) {
+  dst->push_back((uint8_t)(v >> 24));
+  dst->push_back((uint8_t)(v >> 16));
+  dst->push_back((uint8_t)(v >> 8));
+  dst->push_back((uint8_t)(v & 0xff));
+}
+#endif
+
+}  // namespace
+
+uint32_t PeekUncompressedSize(const uint8_t* src, size_t src_size) {
+  if (!src || src_size < 4) return 0;
+  return ReadU32BE(src);
+}
+
+bool Decode(const uint8_t* src, size_t src_size,
+            uint8_t* dst, size_t dst_capacity,
+            size_t* out_size,
+            const uint32_t* initial_counts) {
+  if (out_size) *out_size = 0;
+  if (!src || src_size < 4) return false;
+
+  const uint8_t* p = src;
+  const uint8_t* end = src + src_size;
+
+  const uint32_t output_size = ReadU32BE(p);
+  p += 4;
+  if (output_size > dst_capacity) return false;
+  if (output_size > 0 && !dst) return false;
+
+  Stats stats;
+  stats.init(initial_counts);
+
+  uint32_t t = 0;
+  while (t < output_size) {
+    const uint32_t chunk = std::min<uint32_t>(kChunkSize, output_size - t);
+
+    if (end - p < 4) return false;
+    uint32_t s = ReadU32BE(p);
+    p += 4;
+
+    for (uint32_t i = 0; i < chunk; ++i) {
+      if (s <= kMask) {
+        // Pull in one renorm word.
+        if (end - p < 2) return false;
+        s = (s << kBits) | (uint32_t)ReadU16BE(p);
+        p += 2;
+      }
+      uint8_t sym;
+      uint32_t proba, residual;
+      stats.decode_lookup(s & kMask, &sym, &proba, &residual);
+      dst[t + i] = sym;
+      s = proba * (s >> kBits) + residual;
+    }
+    // Final-state sanity check: catches stream corruption and model mismatch.
+    if (s != kInitState) return false;
+    stats.normalize();
+    t += chunk;
+  }
+
+  if (out_size) *out_size = output_size;
+  return true;
+}
+
+#if defined(ANS_ENABLE_ENCODER)
+
+void Histogram(const uint8_t* src, size_t size, uint32_t* out_counts) {
+  if (!src || !out_counts) return;
+  for (size_t i = 0; i < size; ++i) out_counts[src[i]] += 1;
+}
+
+bool Encode(const uint8_t* src, size_t size,
+            std::vector<uint8_t>* dst,
+            const uint32_t* initial_counts) {
+  if (!dst) return false;
+  dst->clear();
+  if (size > 0xffffffffu) return false;  // header is u32
+
+  AppendU32BE(dst, (uint32_t)size);
+  if (size == 0) return true;
+
+  Stats stats;
+  stats.init(initial_counts);
+
+  uint16_t tmp[kChunkSize];
+
+  size_t t = 0;
+  while (t < size) {
+    // Initial state is the bare signature (not shifted): the decoder ends
+    // each chunk with s == kInitState, so symmetry requires the encoder to
+    // start in the same low-state. Starting with (kInitState << kBits) would
+    // force a spurious renorm at iter 0 that the decoder never consumes,
+    // corrupting any stream with more than one chunk.
+    uint32_t s = kInitState;
+    const size_t chunk = std::min<size_t>(kChunkSize, size - t);
+    size_t pos = chunk;
+    // Encode the chunk in reverse so the decoder consumes symbols in order.
+    for (size_t k = chunk; k-- > 0;) {
+      const uint8_t sym = src[t + k];
+      uint32_t proba, base;
+      stats.encode_lookup(sym, &proba, &base);
+      if (s >= (proba << kBits)) {
+        // Flush low word and shift down to keep s/proba bounded in u32.
+        --pos;
+        tmp[pos] = (uint16_t)(s & kMask);
+        s >>= kBits;
+      }
+      s = ((s / proba) << kBits) + (s % proba) + base;
+    }
+    // Invariant: final state must be > kMask so the decoder's first read is
+    // a valid renormalized state.
+    if (s <= kMask) return false;
+    AppendU32BE(dst, s);
+    for (size_t k = pos; k < chunk; ++k) AppendU16BE(dst, tmp[k]);
+
+    stats.normalize();
+    t += chunk;
+  }
+  return true;
+}
+
+#endif  // ANS_ENABLE_ENCODER
+
+}  // namespace ans
author	skal <pascal.massimino@gmail.com>	2026-05-14 19:09:39 +0200
committer	skal <pascal.massimino@gmail.com>	2026-05-14 19:11:28 +0200
commit	6ef8f578817ee0134fd5867ca3b80590e3eb2368 (patch)
tree	5550607e5c4a16ca237bfa4430ac1ef1f5d80c5d /src/util/ans.cc
parent	4bcbe13dab5ffb64d93cc61956f07ee5168a84c9 (diff)