From 6ef8f578817ee0134fd5867ca3b80590e3eb2368 Mon Sep 17 00:00:00 2001 From: skal Date: Thu, 14 May 2026 19:09:39 +0200 Subject: ans: order-0 rANS coder + WGSL asset compression Adds src/util/ans.{h,cc}, a per-chunk-adaptive order-0 rANS entropy coder. Decoder is always built; encoder is gated on ANS_ENABLE_ENCODER (tools only). Both sides take an optional 256-entry initial_counts table to seed the adaptive model. The per-chunk initial state is (1 << kBits). Higher initial states (e.g. with a signature packed into the upper bits) force a renorm-emit at iter 0 that the decoder never consumes, corrupting multi-chunk streams once stats become skewed. Asset pipeline: - AssetRecord gains 'compression' and 'uncompressed_size' fields. - asset_packer scans every WGSL file to build a corpus-wide byte histogram, then ANS-encodes each shader using that histogram as the seed. Histogram and accessor are emitted alongside the asset table. Round-trip verification runs at pack time for every compressed asset; failures fall back to uncompressed storage. - asset_manager decompresses on first GetAsset(), caches the heap-allocated buffer, and DropAsset / ReloadAssetsFromFile free it along with the procedural cache. - Disk-load (dev) builds are unchanged: WGSL paths stay as filenames. Tests: - src/tests/util/test_ans.cc: roundtrip variants (empty, single byte, single-symbol run, all-zeros, random uniform/skewed, repeated ASCII), seeded-vs-uniform compression, rejection of mismatched counts / corruption / truncation, PeekUncompressedSize. - 37/37 dev, 36/36 STRIP_ALL. Compression observed: WGSL shaders shrink to ~0.62-0.71x in the main workspace (81 of 105 assets qualify). Docs: - doc/ANS.md (new): algorithm, bitstream, API, asset pipeline integration, compression numbers, limitations, tests. - doc/ASSET_SYSTEM.md: new Compression section + updated technical guarantees for compressed assets. - doc/COMPLETED.md: May 2026 entry. - PROJECT_CONTEXT.md: Build status line mentions WGSL ANS compression. - CLAUDE.md, GEMINI.md: tier-3 build doc list includes ANS.md. --- src/tests/util/test_ans.cc | 189 +++++++++++++++++++++++++++++++++++++++ src/util/ans.cc | 215 +++++++++++++++++++++++++++++++++++++++++++++ src/util/ans.h | 58 ++++++++++++ src/util/asset_manager.cc | 44 +++++++++- src/util/asset_manager.h | 18 +++- 5 files changed, 519 insertions(+), 5 deletions(-) create mode 100644 src/tests/util/test_ans.cc create mode 100644 src/util/ans.cc create mode 100644 src/util/ans.h (limited to 'src') diff --git a/src/tests/util/test_ans.cc b/src/tests/util/test_ans.cc new file mode 100644 index 0000000..108c46d --- /dev/null +++ b/src/tests/util/test_ans.cc @@ -0,0 +1,189 @@ +// This file is part of the 64k demo project. +// Tests for the ANS (rANS) entropy coder in util/ans.h. +// Encoder is enabled in tests via ANS_ENABLE_ENCODER. + +#include "util/ans.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +bool RoundtripCheck(const std::vector& input, + const uint32_t* initial_counts, + const char* label) { + std::vector compressed; + if (!ans::Encode(input.data(), input.size(), &compressed, initial_counts)) { + fprintf(stderr, "[%s] Encode failed\n", label); + return false; + } + std::vector decoded(input.size()); + size_t decoded_size = 0; + if (!ans::Decode(compressed.data(), compressed.size(), decoded.data(), + decoded.size(), &decoded_size, initial_counts)) { + fprintf(stderr, "[%s] Decode failed\n", label); + return false; + } + if (decoded_size != input.size() || + std::memcmp(decoded.data(), input.data(), input.size()) != 0) { + fprintf(stderr, "[%s] payload mismatch\n", label); + return false; + } + const double ratio = + input.empty() ? 0.0 : (double)compressed.size() / (double)input.size(); + fprintf(stderr, "[%s] OK: %zu -> %zu bytes (%.3f x)\n", label, input.size(), + compressed.size(), ratio); + return true; +} + +// Covers: empty / single byte / single-symbol run / all-zeros / random uniform +// / random skewed / repeated ASCII. Each is a roundtrip with default (uniform) +// initial counts. +void TestRoundtripVariants() { + std::mt19937 rng_uniform(12345); + std::vector random_uniform(64 * 1024); + for (auto& b : random_uniform) b = (uint8_t)(rng_uniform() & 0xff); + + std::mt19937 rng_skewed(67890); + std::vector random_skewed(32 * 1024); + for (auto& b : random_skewed) { + // 90% byte 'A', 10% other ASCII letters. + const uint32_t r = rng_skewed(); + b = (r % 10 == 0) ? (uint8_t)('B' + (r >> 8) % 25) : (uint8_t)'A'; + } + + const char* ascii = + "@group(0) @binding(0) var smplr: sampler;\n" + "@group(0) @binding(1) var tex: texture_2d;\n" + "@fragment fn fs_main(@builtin(position) p: vec4f) -> @location(0) " + "vec4f {\n" + " let uv = p.xy / vec2f(1280.0, 720.0);\n" + " return textureSample(tex, smplr, uv);\n" + "}\n"; + std::vector ascii_block; + for (int i = 0; i < 50; ++i) { // cross chunk boundary + ascii_block.insert(ascii_block.end(), ascii, ascii + std::strlen(ascii)); + } + + struct Case { + const char* label; + std::vector data; + }; + const Case cases[] = { + {"empty", {}}, + {"single-byte", {0x42}}, + {"single-symbol-run", std::vector(4096, 'A')}, + {"all-zeros", std::vector(8192, 0)}, + {"random-uniform", random_uniform}, + {"random-skewed", random_skewed}, + {"ascii-shader-text", ascii_block}, + }; + for (const Case& c : cases) { + assert(RoundtripCheck(c.data, nullptr, c.label)); + } +} + +// Seeded initial counts should compress a matching corpus at least as well +// as uniform init, and roundtrip identically. +void TestSeededInitialCounts() { + std::vector corpus; + const char* sample = "fn main() { let x = vec3f(1.0, 2.0, 3.0); }\n"; + for (int i = 0; i < 200; ++i) { + corpus.insert(corpus.end(), sample, sample + std::strlen(sample)); + } + uint32_t hist[256] = {}; + ans::Histogram(corpus.data(), corpus.size(), hist); + + std::vector seeded_out, uniform_out; + assert(ans::Encode(corpus.data(), corpus.size(), &seeded_out, hist)); + assert(ans::Encode(corpus.data(), corpus.size(), &uniform_out, nullptr)); + + std::vector decoded(corpus.size()); + size_t decoded_size = 0; + assert(ans::Decode(seeded_out.data(), seeded_out.size(), decoded.data(), + decoded.size(), &decoded_size, hist)); + assert(decoded_size == corpus.size()); + assert(std::memcmp(decoded.data(), corpus.data(), corpus.size()) == 0); + + fprintf(stderr, "[seeded-vs-uniform] seeded=%zu uniform=%zu raw=%zu\n", + seeded_out.size(), uniform_out.size(), corpus.size()); + assert(seeded_out.size() <= uniform_out.size()); +} + +// The chunk-end state check must reject: a mismatched model (decoder uses a +// different initial distribution), a single bit-flip in the payload, and a +// truncated stream. +void TestRejection() { + std::mt19937 rng(1); + + // 1) Mismatched models. + { + std::vector v(4096); + for (auto& b : v) b = (uint8_t)('a' + (rng() % 26)); + uint32_t hist[256] = {}; + ans::Histogram(v.data(), v.size(), hist); + + std::vector encoded; + assert(ans::Encode(v.data(), v.size(), &encoded, hist)); + + std::vector decoded(v.size()); + size_t decoded_size = 0; + assert(!ans::Decode(encoded.data(), encoded.size(), decoded.data(), + decoded.size(), &decoded_size, nullptr)); + fprintf(stderr, "[rejection] mismatched-counts OK\n"); + } + + // 2) Corruption. + { + std::vector v(2048); + for (auto& b : v) b = (uint8_t)(rng() & 0xff); + std::vector encoded; + assert(ans::Encode(v.data(), v.size(), &encoded, nullptr)); + encoded[encoded.size() / 2] ^= 0x55; // flip a payload byte + + std::vector decoded(v.size()); + size_t decoded_size = 0; + assert(!ans::Decode(encoded.data(), encoded.size(), decoded.data(), + decoded.size(), &decoded_size, nullptr)); + fprintf(stderr, "[rejection] corruption OK\n"); + } + + // 3) Truncation. + { + std::vector v(4096); + for (size_t i = 0; i < v.size(); ++i) v[i] = (uint8_t)i; + std::vector encoded; + assert(ans::Encode(v.data(), v.size(), &encoded, nullptr)); + encoded.resize(encoded.size() - 8); + + std::vector decoded(v.size()); + size_t decoded_size = 0; + assert(!ans::Decode(encoded.data(), encoded.size(), decoded.data(), + decoded.size(), &decoded_size, nullptr)); + fprintf(stderr, "[rejection] truncation OK\n"); + } +} + +void TestPeekSize() { + std::vector v(1234, 'Q'); + std::vector encoded; + assert(ans::Encode(v.data(), v.size(), &encoded, nullptr)); + assert(ans::PeekUncompressedSize(encoded.data(), encoded.size()) == + v.size()); +} + +} // namespace + +int main() { + TestRoundtripVariants(); + TestSeededInitialCounts(); + TestRejection(); + TestPeekSize(); + fprintf(stderr, "All ANS tests passed.\n"); + return 0; +} diff --git a/src/util/ans.cc b/src/util/ans.cc new file mode 100644 index 0000000..aa79e4b --- /dev/null +++ b/src/util/ans.cc @@ -0,0 +1,215 @@ +// This file is part of the 64k demo project. +// Order-0 rANS coder. See ans.h for the bitstream format. + +#include "util/ans.h" + +#include +#include +#include + +namespace ans { + +namespace { + +// Per-chunk adaptive byte model. Probabilities sum to 1<* dst, uint16_t v) { + dst->push_back((uint8_t)(v >> 8)); + dst->push_back((uint8_t)(v & 0xff)); +} +inline void AppendU32BE(std::vector* dst, uint32_t v) { + dst->push_back((uint8_t)(v >> 24)); + dst->push_back((uint8_t)(v >> 16)); + dst->push_back((uint8_t)(v >> 8)); + dst->push_back((uint8_t)(v & 0xff)); +} +#endif + +} // namespace + +uint32_t PeekUncompressedSize(const uint8_t* src, size_t src_size) { + if (!src || src_size < 4) return 0; + return ReadU32BE(src); +} + +bool Decode(const uint8_t* src, size_t src_size, + uint8_t* dst, size_t dst_capacity, + size_t* out_size, + const uint32_t* initial_counts) { + if (out_size) *out_size = 0; + if (!src || src_size < 4) return false; + + const uint8_t* p = src; + const uint8_t* end = src + src_size; + + const uint32_t output_size = ReadU32BE(p); + p += 4; + if (output_size > dst_capacity) return false; + if (output_size > 0 && !dst) return false; + + Stats stats; + stats.init(initial_counts); + + uint32_t t = 0; + while (t < output_size) { + const uint32_t chunk = std::min(kChunkSize, output_size - t); + + if (end - p < 4) return false; + uint32_t s = ReadU32BE(p); + p += 4; + + for (uint32_t i = 0; i < chunk; ++i) { + if (s <= kMask) { + // Pull in one renorm word. + if (end - p < 2) return false; + s = (s << kBits) | (uint32_t)ReadU16BE(p); + p += 2; + } + uint8_t sym; + uint32_t proba, residual; + stats.decode_lookup(s & kMask, &sym, &proba, &residual); + dst[t + i] = sym; + s = proba * (s >> kBits) + residual; + } + // Final-state sanity check: catches stream corruption and model mismatch. + if (s != kInitState) return false; + stats.normalize(); + t += chunk; + } + + if (out_size) *out_size = output_size; + return true; +} + +#if defined(ANS_ENABLE_ENCODER) + +void Histogram(const uint8_t* src, size_t size, uint32_t* out_counts) { + if (!src || !out_counts) return; + for (size_t i = 0; i < size; ++i) out_counts[src[i]] += 1; +} + +bool Encode(const uint8_t* src, size_t size, + std::vector* dst, + const uint32_t* initial_counts) { + if (!dst) return false; + dst->clear(); + if (size > 0xffffffffu) return false; // header is u32 + + AppendU32BE(dst, (uint32_t)size); + if (size == 0) return true; + + Stats stats; + stats.init(initial_counts); + + uint16_t tmp[kChunkSize]; + + size_t t = 0; + while (t < size) { + // Initial state is the bare signature (not shifted): the decoder ends + // each chunk with s == kInitState, so symmetry requires the encoder to + // start in the same low-state. Starting with (kInitState << kBits) would + // force a spurious renorm at iter 0 that the decoder never consumes, + // corrupting any stream with more than one chunk. + uint32_t s = kInitState; + const size_t chunk = std::min(kChunkSize, size - t); + size_t pos = chunk; + // Encode the chunk in reverse so the decoder consumes symbols in order. + for (size_t k = chunk; k-- > 0;) { + const uint8_t sym = src[t + k]; + uint32_t proba, base; + stats.encode_lookup(sym, &proba, &base); + if (s >= (proba << kBits)) { + // Flush low word and shift down to keep s/proba bounded in u32. + --pos; + tmp[pos] = (uint16_t)(s & kMask); + s >>= kBits; + } + s = ((s / proba) << kBits) + (s % proba) + base; + } + // Invariant: final state must be > kMask so the decoder's first read is + // a valid renormalized state. + if (s <= kMask) return false; + AppendU32BE(dst, s); + for (size_t k = pos; k < chunk; ++k) AppendU16BE(dst, tmp[k]); + + stats.normalize(); + t += chunk; + } + return true; +} + +#endif // ANS_ENABLE_ENCODER + +} // namespace ans diff --git a/src/util/ans.h b/src/util/ans.h new file mode 100644 index 0000000..53c34b1 --- /dev/null +++ b/src/util/ans.h @@ -0,0 +1,58 @@ +// This file is part of the 64k demo project. +// Asymmetric Numeral System (rANS) order-0 entropy coder. +// Decoder is always built; encoder is gated by ANS_ENABLE_ENCODER. +// Bitstream is big-endian and chunk-adaptive; see the per-chunk format in +// the implementation file. + +#ifndef ANS_H_ +#define ANS_H_ + +#include +#include + +#if defined(ANS_ENABLE_ENCODER) +#include +#endif + +namespace ans { + +// Fixed parameters; must match the encoder/decoder on both ends. +constexpr int kBits = 16; +constexpr uint32_t kMask = (1u << kBits) - 1u; +constexpr int kNumSymbols = 256; +constexpr int kChunkSize = 1024; +// Initial state per chunk. Doubles as a chunk-end integrity check: any +// decoder/encoder model divergence drives the state away from this constant, +// which is verified after every chunk. +constexpr uint32_t kInitState = 1u << kBits; + +// Reads the original (uncompressed) size from a bitstream header. +// Returns 0 on malformed input. +uint32_t PeekUncompressedSize(const uint8_t* src, size_t src_size); + +// Decodes 'src_size' bytes from 'src' into 'dst' (capacity 'dst_capacity'). +// 'initial_counts' (256 entries) seeds the per-chunk adaptive model; pass +// nullptr for the uniform default (all-ones). +// Returns true on success and writes the decoded size to '*out_size'. +bool Decode(const uint8_t* src, size_t src_size, + uint8_t* dst, size_t dst_capacity, + size_t* out_size, + const uint32_t* initial_counts = nullptr); + +#if defined(ANS_ENABLE_ENCODER) +// Encodes 'src[0..size]' into '*dst' (cleared and re-filled). +// 'initial_counts' has the same semantics as in Decode(). +// Returns true on success. +bool Encode(const uint8_t* src, size_t size, + std::vector* dst, + const uint32_t* initial_counts = nullptr); + +// Computes a byte histogram over 'src[0..size]', accumulating into +// 'out_counts[256]' (caller must zero-initialize before the first call). +// Useful for deriving a corpus-wide initial distribution from many files. +void Histogram(const uint8_t* src, size_t size, uint32_t* out_counts); +#endif // ANS_ENABLE_ENCODER + +} // namespace ans + +#endif // ANS_H_ diff --git a/src/util/asset_manager.cc b/src/util/asset_manager.cc index 26a82bf..264ddda 100644 --- a/src/util/asset_manager.cc +++ b/src/util/asset_manager.cc @@ -2,6 +2,7 @@ // It implements the generic asset retrieval logic with runtime caching. #include "util/asset_manager.h" +#include "util/ans.h" #include "util/asset_manager_utils.h" #include "util/check_return.h" #if defined(USE_TEST_ASSETS) @@ -114,6 +115,34 @@ const uint8_t* GetAsset(AssetId asset_id, size_t* out_size) { AssetRecord cached_record = source_record; + if (source_record.compression == AssetCompression::ANS_ASCII) { + // Decompress on first access. Buffer is null-terminated for C-string use. + const size_t uncomp = source_record.uncompressed_size; + uint8_t* buffer = new (std::nothrow) uint8_t[uncomp + 1]; + CHECK_RETURN_BEGIN(!buffer, nullptr) + if (out_size) *out_size = 0; + ERROR_MSG("Failed to allocate buffer for ANS-compressed asset id=%u", + index); + return nullptr; + CHECK_RETURN_END + size_t got = 0; + if (!ans::Decode(source_record.data, source_record.size, buffer, uncomp, + &got, GetAnsAsciiHistogram()) || + got != uncomp) { + delete[] buffer; + if (out_size) *out_size = 0; + ERROR_MSG("ANS decode failed for asset id=%u", index); + return nullptr; + } + buffer[uncomp] = 0; + cached_record.data = buffer; + cached_record.size = uncomp; + cached_record.uncompressed_size = uncomp; + g_asset_cache[index] = cached_record; + if (out_size) *out_size = uncomp; + return buffer; + } + if (source_record.type == AssetType::PROC || source_record.type == AssetType::PROC_GPU) { // Dynamically generate the asset @@ -224,6 +253,12 @@ void DropAsset(AssetId asset_id, const uint8_t* asset) { delete[] g_asset_cache[index].data; g_asset_cache[index] = {}; // Zero out the struct to force re-generation } + // Heap-allocated decompressed buffer (compression != NONE): cache owns it. + if (g_asset_cache[index].data == asset && + g_asset_cache[index].compression != AssetCompression::NONE) { + delete[] g_asset_cache[index].data; + g_asset_cache[index] = {}; + } #if !defined(STRIP_ALL) if (g_asset_cache[index].data == asset && (g_asset_cache[index].type == AssetType::SPEC || @@ -245,10 +280,11 @@ bool ReloadAssetsFromFile(const char* config_path) { // Clear cache to force reload for (size_t i = 0; i < (size_t)AssetId::ASSET_LAST_ID; ++i) { - if ((g_asset_cache[i].type == AssetType::PROC || - g_asset_cache[i].type == AssetType::PROC_GPU) && - g_asset_cache[i].data) { - delete[] g_asset_cache[i].data; + const AssetRecord& e = g_asset_cache[i]; + if (e.data && + (e.type == AssetType::PROC || e.type == AssetType::PROC_GPU || + e.compression != AssetCompression::NONE)) { + delete[] e.data; } g_asset_cache[i] = {}; } diff --git a/src/util/asset_manager.h b/src/util/asset_manager.h index 786a8db..f6ad244 100644 --- a/src/util/asset_manager.h +++ b/src/util/asset_manager.h @@ -16,10 +16,21 @@ enum class AssetType : uint8_t { PROC_GPU, }; +// Compression scheme applied to the packed bytes (only relevant for static +// embedded assets; disk-load and procedural assets are always NONE). +enum class AssetCompression : uint8_t { + NONE = 0, + // Order-0 rANS, model seeded from a corpus-wide histogram embedded in the + // generated assets blob (see GetAnsAsciiHistogram). Used for WGSL. + ANS_ASCII = 1, +}; + struct AssetRecord { const uint8_t* data; // Pointer to asset data (static or dynamic) - size_t size; // Size of the asset data + size_t size; // Size of 'data' in bytes (compressed size if any) AssetType type; + AssetCompression compression; // How 'data' is encoded; NONE = raw bytes + size_t uncompressed_size; // Size after decompression (== size if NONE) const char* proc_func_name_str; // Name of procedural generation function // (string literal) const float* proc_params; // Parameters for procedural generation (static, @@ -27,6 +38,11 @@ struct AssetRecord { int num_proc_params; // Number of procedural parameters }; +// Initial-state histogram (256 entries) used to seed ANS_ASCII compression. +// Defined in the generated assets blob; computed at pack time over the WGSL +// corpus. +const uint32_t* GetAnsAsciiHistogram(); + // Generic interface // Retrieves a pointer to the asset data. // - Static assets are guaranteed to be 16-byte aligned. -- cgit v1.2.3