summaryrefslogtreecommitdiff
path: root/src/util/ans.cc
diff options
context:
space:
mode:
authorskal <pascal.massimino@gmail.com>2026-05-14 19:09:39 +0200
committerskal <pascal.massimino@gmail.com>2026-05-14 19:11:28 +0200
commit6ef8f578817ee0134fd5867ca3b80590e3eb2368 (patch)
tree5550607e5c4a16ca237bfa4430ac1ef1f5d80c5d /src/util/ans.cc
parent4bcbe13dab5ffb64d93cc61956f07ee5168a84c9 (diff)
ans: order-0 rANS coder + WGSL asset compression
Adds src/util/ans.{h,cc}, a per-chunk-adaptive order-0 rANS entropy coder. Decoder is always built; encoder is gated on ANS_ENABLE_ENCODER (tools only). Both sides take an optional 256-entry initial_counts table to seed the adaptive model. The per-chunk initial state is (1 << kBits). Higher initial states (e.g. with a signature packed into the upper bits) force a renorm-emit at iter 0 that the decoder never consumes, corrupting multi-chunk streams once stats become skewed. Asset pipeline: - AssetRecord gains 'compression' and 'uncompressed_size' fields. - asset_packer scans every WGSL file to build a corpus-wide byte histogram, then ANS-encodes each shader using that histogram as the seed. Histogram and accessor are emitted alongside the asset table. Round-trip verification runs at pack time for every compressed asset; failures fall back to uncompressed storage. - asset_manager decompresses on first GetAsset(), caches the heap-allocated buffer, and DropAsset / ReloadAssetsFromFile free it along with the procedural cache. - Disk-load (dev) builds are unchanged: WGSL paths stay as filenames. Tests: - src/tests/util/test_ans.cc: roundtrip variants (empty, single byte, single-symbol run, all-zeros, random uniform/skewed, repeated ASCII), seeded-vs-uniform compression, rejection of mismatched counts / corruption / truncation, PeekUncompressedSize. - 37/37 dev, 36/36 STRIP_ALL. Compression observed: WGSL shaders shrink to ~0.62-0.71x in the main workspace (81 of 105 assets qualify). Docs: - doc/ANS.md (new): algorithm, bitstream, API, asset pipeline integration, compression numbers, limitations, tests. - doc/ASSET_SYSTEM.md: new Compression section + updated technical guarantees for compressed assets. - doc/COMPLETED.md: May 2026 entry. - PROJECT_CONTEXT.md: Build status line mentions WGSL ANS compression. - CLAUDE.md, GEMINI.md: tier-3 build doc list includes ANS.md.
Diffstat (limited to 'src/util/ans.cc')
-rw-r--r--src/util/ans.cc215
1 files changed, 215 insertions, 0 deletions
diff --git a/src/util/ans.cc b/src/util/ans.cc
new file mode 100644
index 0000000..aa79e4b
--- /dev/null
+++ b/src/util/ans.cc
@@ -0,0 +1,215 @@
+// This file is part of the 64k demo project.
+// Order-0 rANS coder. See ans.h for the bitstream format.
+
+#include "util/ans.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+
+namespace ans {
+
+namespace {
+
+// Per-chunk adaptive byte model. Probabilities sum to 1<<kBits after
+// normalize(); raw counts in 'stats' keep adapting across chunks.
+struct Stats {
+ uint32_t stats[kNumSymbols];
+ uint32_t cumul[kNumSymbols + 1];
+
+ // Seeds 'stats' from caller counts (or all-ones if null) and builds the
+ // normalized cumulative table. stats_to_cumul() floors zero counts, so
+ // 'init' doesn't have to.
+ void init(const uint32_t* initial_counts) {
+ if (initial_counts) {
+ std::memcpy(stats, initial_counts, sizeof(stats));
+ } else {
+ std::fill_n(stats, kNumSymbols, 1u);
+ }
+ normalize();
+ }
+
+ // Rebuilds 'cumul' from 'stats'. Every slot is floored at 1 so each
+ // symbol remains reachable even if it has been seen zero times.
+ void stats_to_cumul() {
+ uint32_t acc = 0;
+ for (int i = 0; i < kNumSymbols; ++i) {
+ cumul[i] = acc;
+ acc += stats[i] ? stats[i] : 1u;
+ }
+ cumul[kNumSymbols] = acc;
+ }
+
+ // Rescales 'cumul' so cumul[NUM_SYMBOLS] == (1 << kBits). Ceil-multiply
+ // preserves monotonicity and prevents any active slot from collapsing.
+ void normalize() {
+ stats_to_cumul();
+ const uint32_t target = 1u << kBits;
+ while (cumul[kNumSymbols] != target) {
+ const double ratio = (double)target / (double)cumul[kNumSymbols];
+ for (int i = 0; i <= kNumSymbols; ++i) {
+ cumul[i] = (uint32_t)std::ceil((double)cumul[i] * ratio);
+ }
+ }
+ }
+
+ // Decoder lookup: locate the slot containing 's' in [0, 1<<kBits).
+ // Returns the symbol, its probability, and the residual offset within
+ // the slot. Increments the symbol's count on the fly.
+ void decode_lookup(uint32_t s, uint8_t* sym, uint32_t* p, uint32_t* r) {
+ // upper_bound(s) - 1; cumul is strictly non-decreasing.
+ const uint32_t* it =
+ std::upper_bound(cumul, cumul + kNumSymbols + 1, s);
+ const int c = (int)(it - cumul) - 1;
+ stats[c] += 1;
+ *sym = (uint8_t)c;
+ *p = cumul[c + 1] - cumul[c];
+ *r = s - cumul[c];
+ }
+
+ // Encoder lookup: probability and base offset for symbol 'c'.
+ void encode_lookup(int c, uint32_t* p, uint32_t* base) {
+ stats[c] += 1;
+ *p = cumul[c + 1] - cumul[c];
+ *base = cumul[c];
+ }
+};
+
+// Big-endian primitive readers/writers.
+inline uint16_t ReadU16BE(const uint8_t* p) {
+ return (uint16_t)((p[0] << 8) | p[1]);
+}
+inline uint32_t ReadU32BE(const uint8_t* p) {
+ return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
+ ((uint32_t)p[2] << 8) | (uint32_t)p[3];
+}
+
+#if defined(ANS_ENABLE_ENCODER)
+inline void AppendU16BE(std::vector<uint8_t>* dst, uint16_t v) {
+ dst->push_back((uint8_t)(v >> 8));
+ dst->push_back((uint8_t)(v & 0xff));
+}
+inline void AppendU32BE(std::vector<uint8_t>* dst, uint32_t v) {
+ dst->push_back((uint8_t)(v >> 24));
+ dst->push_back((uint8_t)(v >> 16));
+ dst->push_back((uint8_t)(v >> 8));
+ dst->push_back((uint8_t)(v & 0xff));
+}
+#endif
+
+} // namespace
+
+uint32_t PeekUncompressedSize(const uint8_t* src, size_t src_size) {
+ if (!src || src_size < 4) return 0;
+ return ReadU32BE(src);
+}
+
+bool Decode(const uint8_t* src, size_t src_size,
+ uint8_t* dst, size_t dst_capacity,
+ size_t* out_size,
+ const uint32_t* initial_counts) {
+ if (out_size) *out_size = 0;
+ if (!src || src_size < 4) return false;
+
+ const uint8_t* p = src;
+ const uint8_t* end = src + src_size;
+
+ const uint32_t output_size = ReadU32BE(p);
+ p += 4;
+ if (output_size > dst_capacity) return false;
+ if (output_size > 0 && !dst) return false;
+
+ Stats stats;
+ stats.init(initial_counts);
+
+ uint32_t t = 0;
+ while (t < output_size) {
+ const uint32_t chunk = std::min<uint32_t>(kChunkSize, output_size - t);
+
+ if (end - p < 4) return false;
+ uint32_t s = ReadU32BE(p);
+ p += 4;
+
+ for (uint32_t i = 0; i < chunk; ++i) {
+ if (s <= kMask) {
+ // Pull in one renorm word.
+ if (end - p < 2) return false;
+ s = (s << kBits) | (uint32_t)ReadU16BE(p);
+ p += 2;
+ }
+ uint8_t sym;
+ uint32_t proba, residual;
+ stats.decode_lookup(s & kMask, &sym, &proba, &residual);
+ dst[t + i] = sym;
+ s = proba * (s >> kBits) + residual;
+ }
+ // Final-state sanity check: catches stream corruption and model mismatch.
+ if (s != kInitState) return false;
+ stats.normalize();
+ t += chunk;
+ }
+
+ if (out_size) *out_size = output_size;
+ return true;
+}
+
+#if defined(ANS_ENABLE_ENCODER)
+
+void Histogram(const uint8_t* src, size_t size, uint32_t* out_counts) {
+ if (!src || !out_counts) return;
+ for (size_t i = 0; i < size; ++i) out_counts[src[i]] += 1;
+}
+
+bool Encode(const uint8_t* src, size_t size,
+ std::vector<uint8_t>* dst,
+ const uint32_t* initial_counts) {
+ if (!dst) return false;
+ dst->clear();
+ if (size > 0xffffffffu) return false; // header is u32
+
+ AppendU32BE(dst, (uint32_t)size);
+ if (size == 0) return true;
+
+ Stats stats;
+ stats.init(initial_counts);
+
+ uint16_t tmp[kChunkSize];
+
+ size_t t = 0;
+ while (t < size) {
+ // Initial state is the bare signature (not shifted): the decoder ends
+ // each chunk with s == kInitState, so symmetry requires the encoder to
+ // start in the same low-state. Starting with (kInitState << kBits) would
+ // force a spurious renorm at iter 0 that the decoder never consumes,
+ // corrupting any stream with more than one chunk.
+ uint32_t s = kInitState;
+ const size_t chunk = std::min<size_t>(kChunkSize, size - t);
+ size_t pos = chunk;
+ // Encode the chunk in reverse so the decoder consumes symbols in order.
+ for (size_t k = chunk; k-- > 0;) {
+ const uint8_t sym = src[t + k];
+ uint32_t proba, base;
+ stats.encode_lookup(sym, &proba, &base);
+ if (s >= (proba << kBits)) {
+ // Flush low word and shift down to keep s/proba bounded in u32.
+ --pos;
+ tmp[pos] = (uint16_t)(s & kMask);
+ s >>= kBits;
+ }
+ s = ((s / proba) << kBits) + (s % proba) + base;
+ }
+ // Invariant: final state must be > kMask so the decoder's first read is
+ // a valid renormalized state.
+ if (s <= kMask) return false;
+ AppendU32BE(dst, s);
+ for (size_t k = pos; k < chunk; ++k) AppendU16BE(dst, tmp[k]);
+
+ stats.normalize();
+ t += chunk;
+ }
+ return true;
+}
+
+#endif // ANS_ENABLE_ENCODER
+
+} // namespace ans