diff options
| -rw-r--r-- | CLAUDE.md | 2 | ||||
| -rw-r--r-- | GEMINI.md | 2 | ||||
| -rw-r--r-- | PROJECT_CONTEXT.md | 2 | ||||
| -rw-r--r-- | cmake/DemoSourceLists.cmake | 2 | ||||
| -rw-r--r-- | cmake/DemoTests.cmake | 4 | ||||
| -rw-r--r-- | cmake/DemoTools.cmake | 5 | ||||
| -rw-r--r-- | doc/ANS.md | 166 | ||||
| -rw-r--r-- | doc/ASSET_SYSTEM.md | 14 | ||||
| -rw-r--r-- | doc/COMPLETED.md | 4 | ||||
| -rw-r--r-- | src/tests/util/test_ans.cc | 189 | ||||
| -rw-r--r-- | src/util/ans.cc | 215 | ||||
| -rw-r--r-- | src/util/ans.h | 58 | ||||
| -rw-r--r-- | src/util/asset_manager.cc | 44 | ||||
| -rw-r--r-- | src/util/asset_manager.h | 18 | ||||
| -rw-r--r-- | tools/asset_packer.cc | 121 |
15 files changed, 817 insertions, 29 deletions
@@ -21,7 +21,7 @@ # CNN: @cnn_v1/docs/CNN_V1_EFFECT.md, @cnn_v2/docs/CNN_V2.md, @cnn_v2/docs/CNN_V2_BINARY_FORMAT.md # 3D/Graphics: @doc/3D.md, @doc/MASKING_SYSTEM.md, @doc/SDF_EFFECT_GUIDE.md # Scene: @doc/SCENE_FORMAT.md, @doc/SEQUENCE.md, @doc/WORKSPACE_SYSTEM.md -# Build: @doc/ASSET_SYSTEM.md, @doc/BUILD.md, @doc/CMAKE_MODULES.md, @doc/SIZE_MEASUREMENT.md +# Build: @doc/ASSET_SYSTEM.md, @doc/ANS.md, @doc/BUILD.md, @doc/CMAKE_MODULES.md, @doc/SIZE_MEASUREMENT.md # Rendering: @doc/UNIFORM_BUFFER_GUIDELINES.md, @doc/WGPU_HELPERS.md, @doc/AUXILIARY_TEXTURE_INIT.md # Tools: @doc/test_demo_README.md, @doc/HOT_RELOAD.md, @doc/HEADLESS_MODE.md, @doc/RECIPE.md, @doc/TOOLS_REFERENCE.md # Arch: @doc/ARCHITECTURE.md, @doc/CODING_STYLE.md, @doc/BACKLOG.md, @doc/CONTEXT_MAINTENANCE.md @@ -21,7 +21,7 @@ # CNN: @cnn_v1/docs/CNN_V1_EFFECT.md, @cnn_v2/docs/CNN_V2.md, @cnn_v2/docs/CNN_V2_BINARY_FORMAT.md # 3D/Graphics: @doc/3D.md, @doc/GPU_PROCEDURAL_PHASE4.md, @doc/MASKING_SYSTEM.md, @doc/SDF_EFFECT_GUIDE.md # Scene: @doc/SCENE_FORMAT.md, @doc/SEQUENCE.md, @doc/WORKSPACE_SYSTEM.md -# Build: @doc/ASSET_SYSTEM.md, @doc/BUILD.md, @doc/CMAKE_MODULES.md, @doc/SIZE_MEASUREMENT.md +# Build: @doc/ASSET_SYSTEM.md, @doc/ANS.md, @doc/BUILD.md, @doc/CMAKE_MODULES.md, @doc/SIZE_MEASUREMENT.md # Rendering: @doc/GEOM_BUFFER.md, @doc/SHADER_REUSE_INVESTIGATION.md, @doc/UNIFORM_BUFFER_GUIDELINES.md, @doc/WGPU_HELPERS.md, @doc/AUXILIARY_TEXTURE_INIT.md # Tools: @doc/test_demo_README.md, @doc/HOT_RELOAD.md, @doc/HEADLESS_MODE.md, @doc/RECIPE.md, @doc/TOOLS_REFERENCE.md # Arch: @doc/ARCHITECTURE.md, @doc/CODING_STYLE.md, @doc/BACKLOG.md, @doc/CONTEXT_MAINTENANCE.md diff --git a/PROJECT_CONTEXT.md b/PROJECT_CONTEXT.md index d7fc771..43ce2f1 100644 --- a/PROJECT_CONTEXT.md +++ b/PROJECT_CONTEXT.md @@ -38,7 +38,7 @@ - **3D:** Hybrid SDF/rasterization with BVH. Binary scene loader. Blender pipeline. - **Effects:** CNN post-processing: CNNEffect (v1) and CNNv2Effect operational. CNN v2: sigmoid activation, storage buffer weights (~3.2 KB), 7D static features, dynamic layers. Training stable, convergence validated. **CNN v3 Phases 1–9 complete** + runtime pipeline operational: `GBufferEffect` (MRT raster + sphere impostors + SDF shadow pass) → `GBufDeferredEffect` (albedo×diffuse) in `cnn_v3_test`; debug sequence adds `CNNv3Effect` → `GBufViewEffect`. Training bugs fixed: dec0 ReLU removed, FiLM MLP loaded from `.bin`. Parity validated: max_err=4.88e-4. See `cnn_v3/docs/HOWTO.md`. - **Tools:** CNN test tool operational. Texture readback utility functional. Timeline editor (web-based, beat-aligned, audio playback). -- **Build:** Asset dependency tracking. Size measurement. Hot-reload (debug-only). WSL (Windows 10) supported: native Linux build and cross-compile to `.exe` via `mingw-w64`. +- **Build:** Asset dependency tracking. Size measurement. Hot-reload (debug-only). WSL (Windows 10) supported: native Linux build and cross-compile to `.exe` via `mingw-w64`. WGSL shaders ANS-compressed in embedded (STRIP_ALL) builds with a corpus-derived ASCII histogram, ~0.62–0.71× on the main workspace (see `doc/ANS.md`). - **Sequence:** DAG-based effect routing with explicit node system. Python compiler with topological sort and ping-pong optimization. 18 effects operational (Passthrough, Placeholder, GaussianBlur, Heptagon, Particles, RotatingCube, Hybrid3D, Flash, PeakMeter, Scene1, Scene2, Scratch, Ntsc, NtscYiq, GBufferEffect, CNNv3Effect, GBufDeferredEffect, GBufViewEffect). Effect times are absolute (seq_compiler adds sequence start offset). See `doc/SEQUENCE.md`. - **Testing:** **38/38 passing**. diff --git a/cmake/DemoSourceLists.cmake b/cmake/DemoSourceLists.cmake index 52278ee..ef297f8 100644 --- a/cmake/DemoSourceLists.cmake +++ b/cmake/DemoSourceLists.cmake @@ -26,7 +26,7 @@ set(AUDIO_SOURCES set(PROCEDURAL_SOURCES src/procedural/generator.cc) # Utility sources (unconditional) -set(UTIL_SOURCES src/util/asset_manager.cc src/util/file_watcher.cc) +set(UTIL_SOURCES src/util/asset_manager.cc src/util/file_watcher.cc src/util/ans.cc) # Common effect sources (shared between headless and normal modes) set(COMMON_GPU_EFFECTS diff --git a/cmake/DemoTests.cmake b/cmake/DemoTests.cmake index 59859c5..c6d8b7d 100644 --- a/cmake/DemoTests.cmake +++ b/cmake/DemoTests.cmake @@ -10,6 +10,10 @@ add_demo_test(test_maths MathUtilsTest util src/tests/util/test_maths.cc) add_demo_test(test_file_watcher FileWatcherTest util src/tests/util/test_file_watcher.cc) target_link_libraries(test_file_watcher PRIVATE util ${DEMO_LIBS}) +add_demo_test(test_ans AnsCoderTest util src/tests/util/test_ans.cc src/util/ans.cc) +target_compile_definitions(test_ans PRIVATE ANS_ENABLE_ENCODER) +target_link_libraries(test_ans PRIVATE ${DEMO_LIBS}) + add_demo_test(test_synth SynthEngineTest audio src/tests/audio/test_synth.cc ${GEN_DEMO_CC}) target_link_libraries(test_synth PRIVATE audio util procedural ${DEMO_LIBS}) demo_add_asset_deps(test_synth audio) diff --git a/cmake/DemoTools.cmake b/cmake/DemoTools.cmake index e83a384..d61efc1 100644 --- a/cmake/DemoTools.cmake +++ b/cmake/DemoTools.cmake @@ -6,9 +6,10 @@ if(DEFINED ASSET_PACKER_PATH) set(ASSET_PACKER_CMD ${ASSET_PACKER_PATH}) set(ASSET_PACKER_DEPENDS ${ASSET_PACKER_PATH}) else() - add_executable(asset_packer tools/asset_packer.cc) + add_executable(asset_packer tools/asset_packer.cc src/util/ans.cc) + target_compile_definitions(asset_packer PRIVATE ANS_ENABLE_ENCODER) target_link_libraries(asset_packer PRIVATE procedural) - target_include_directories(asset_packer PRIVATE third_party) + target_include_directories(asset_packer PRIVATE third_party src) set(ASSET_PACKER_CMD $<TARGET_FILE:asset_packer>) set(ASSET_PACKER_DEPENDS asset_packer) endif() diff --git a/doc/ANS.md b/doc/ANS.md new file mode 100644 index 0000000..c93bf82 --- /dev/null +++ b/doc/ANS.md @@ -0,0 +1,166 @@ +# ANS Compression + +Order-0 rANS entropy coder used to compress shader assets at build time and +decompress them on first access at runtime. + +**Source:** `src/util/ans.{h,cc}`. + +--- + +## Algorithm + +Per-chunk adaptive order-0 byte coder. + +| Parameter | Value | +|------------------|----------------------------------------| +| Precision | 16 bits (`kBits = 16`) | +| State range | `[1 << 16, 1 << 32)` (`uint32_t`) | +| Renorm I/O width | 16 bits (big-endian) | +| Chunk size | 1024 bytes | +| Symbols | 256 (bytes) | +| Initial state | `1 << 16` (`kInitState`) | + +The encoder iterates each chunk in reverse, the decoder forward. Symbol +counts are mutated on the fly during encode/decode and re-normalized at +each chunk boundary so the cumulative table sums to `1 << 16`. + +The chunk-end state always equals `kInitState`; the decoder rejects the +stream if it doesn't. That single check catches both bit-level corruption +and decoder/encoder model divergence (e.g. wrong initial histogram). + +The per-chunk initial state must be exactly `1 << kBits`. A higher value +(e.g. with a "signature" packed into the upper bits) forces a renorm-emit +at iter 0 that the decoder never consumes — harmless on a single chunk, +but it corrupts any stream with two or more chunks once the per-chunk +stats become skewed. + +--- + +## Bitstream Format + +Big-endian throughout. + +``` +[u32 uncompressed_size] // 4 bytes, header +per chunk (uncompressed_size > 0): + [u32 final_state] // 4 bytes + [u16 emitted_words]* // variable, in stream order +``` + +Number of emitted words per chunk is implicit — the decoder pulls a word +whenever its state drops at or below `kMask = (1 << kBits) - 1`. + +--- + +## API + +```cpp +#include "util/ans.h" + +// Always built. +bool ans::Decode(const uint8_t* src, size_t src_size, + uint8_t* dst, size_t dst_capacity, + size_t* out_size, + const uint32_t* initial_counts = nullptr); + +uint32_t ans::PeekUncompressedSize(const uint8_t* src, size_t src_size); + +// Gated on ANS_ENABLE_ENCODER (tools only). +bool ans::Encode(const uint8_t* src, size_t size, + std::vector<uint8_t>* dst, + const uint32_t* initial_counts = nullptr); + +void ans::Histogram(const uint8_t* src, size_t size, uint32_t* out_counts); +``` + +`initial_counts` is a 256-entry table that seeds the adaptive model. Both +encoder and decoder must use the same seed — a mismatch trips the chunk-end +state check immediately. Pass `nullptr` for a uniform default (all-ones). + +--- + +## Asset Pipeline Integration + +`AssetRecord` carries two extra fields: + +```cpp +enum class AssetCompression : uint8_t { + NONE = 0, + ANS_ASCII = 1, // seeded from GetAnsAsciiHistogram() +}; + +struct AssetRecord { + ... + AssetCompression compression; + size_t uncompressed_size; // == size if compression == NONE +}; +``` + +### Build time (`tools/asset_packer.cc`) + +Embedded (non-disk-load) builds only: + +1. Scan every `WGSL` asset to build a corpus-wide 256-entry byte histogram. +2. Emit it as `static const uint32_t kAnsAsciiHistogram[256]` plus a + `GetAnsAsciiHistogram()` accessor in `assets_data.cc`. +3. For each `WGSL` asset, call `TryAnsCompress()`: + `ans::Encode(...)` → reject if it's not smaller than the raw input → + round-trip verify with `ans::Decode(...)` → only then mark the asset + `ANS_ASCII`. +4. Other asset types (SPEC, TEXTURE, MESH, BINARY, MP3, PROC*) pass + through uncompressed. + +Disk-load (dev) builds skip the encoder entirely: WGSL data is the file +path, never the file contents. + +### Runtime (`src/util/asset_manager.cc`) + +`GetAsset()` checks `compression` on a cache miss: + +- `NONE` → return the static pointer (or hit the existing PROC / disk-load + branch). +- `ANS_ASCII` → allocate `uncompressed_size + 1` bytes, + `ans::Decode(..., GetAnsAsciiHistogram())`, NUL-terminate, cache. + +`DropAsset()` and `ReloadAssetsFromFile()` free the heap-allocated buffer +when `compression != NONE`, alongside the existing procedural cleanup. + +--- + +## Observed Compression + +`workspaces/main`, STRIP_ALL build: WGSL shaders compress to **0.62×–0.71×** +their raw size (81 of 105 assets qualify). Round-trip verification runs +at pack time for every compressed asset; failures abort the build. + +--- + +## Limitations + +The encoder returns `false` if it cannot produce a final state above +`kMask` for some chunk. With the corpus-derived ASCII histogram this never +trips on the demo's WGSL corpus, but inputs with a near-monolithic byte +distribution can fail. Such assets fall back to uncompressed storage. + +--- + +## Tests + +`src/tests/util/test_ans.cc` (run via `make run_util_tests` or +`./build/test_ans`): + +- Roundtrip variants: empty, single byte, single-symbol run, all-zeros, + random uniform, random skewed, repeated ASCII. +- Seeded-vs-uniform: a corpus-matched histogram compresses at least as + well as a uniform seed. +- Rejection: mismatched seed model, payload bit-flip, truncated stream. +- `PeekUncompressedSize` returns the header value. + +--- + +## See Also + +- `doc/ASSET_SYSTEM.md` — overall asset pipeline. +- `src/util/ans.h` — public API. +- `tools/asset_packer.cc` — corpus scan and per-asset compression. +- `src/util/asset_manager.cc` — runtime decompression. diff --git a/doc/ASSET_SYSTEM.md b/doc/ASSET_SYSTEM.md index a97886c..415342d 100644 --- a/doc/ASSET_SYSTEM.md +++ b/doc/ASSET_SYSTEM.md @@ -60,6 +60,16 @@ enum class AssetType : uint8_t { }; ``` +## Compression + +Each `AssetRecord` carries an `AssetCompression` flag and an +`uncompressed_size`. In **Release Mode** the asset packer runs an order-0 +rANS coder over every `WGSL` asset, seeded with a histogram derived from +the full shader corpus. `AssetManager::GetAsset()` decompresses lazily on +first access and caches the result. Other asset types and **Development +Mode** are unaffected. See `doc/ANS.md` for the algorithm, bitstream, and +runtime API. + Query at runtime: ```cpp if (GetAssetType(AssetId::NEVER_MP3) == AssetType::MP3) { ... } @@ -93,8 +103,8 @@ Tool: `tools/asset_packer.cc` ## Technical Guarantees - **Alignment**: All embedded data arrays are declared `alignas(16)` for safe `reinterpret_cast`. -- **String Safety**: Embedded assets are null-terminated (safe as C-strings). In disk-load mode, the path itself is a null-terminated C-string. -- **Size**: For embedded assets, `size` reflects the original file size (the buffer is `size + 1`). For disk-loaded assets, it reflects the file path's string length. +- **String Safety**: Embedded assets are null-terminated (safe as C-strings). In disk-load mode, the path itself is a null-terminated C-string. ANS-compressed assets are NUL-terminated by the decompressor on first access. +- **Size**: For uncompressed embedded assets, `size` is the original file size (the buffer is `size + 1`). For disk-loaded assets, it is the file path's string length. For ANS-compressed assets, `size` is the *compressed* byte count; query `uncompressed_size` for the decoded length. ## Developer Workflow diff --git a/doc/COMPLETED.md b/doc/COMPLETED.md index 233373e..bf4c3ba 100644 --- a/doc/COMPLETED.md +++ b/doc/COMPLETED.md @@ -34,6 +34,10 @@ Completed task archive. See `doc/archive/` for detailed historical documents. --- +## May 2026 + +- [x] **ANS shader compression (2026-05-14)** — Order-0 rANS coder in `src/util/ans.{h,cc}` (decoder always built; encoder gated on `ANS_ENABLE_ENCODER`). `asset_packer` derives a corpus-wide byte histogram from every WGSL file, ANS-encodes each shader with that seed, and round-trip-verifies at pack time. `AssetRecord` gains `compression` + `uncompressed_size`; `asset_manager` decompresses lazily on first `GetAsset()` and frees in `DropAsset`/`ReloadAssetsFromFile`. WGSL assets shrink to ~0.62–0.71× in `workspaces/main` (81/105). See `doc/ANS.md`. Tests: 37/37 dev, 36/36 STRIP_ALL. + ## March 2026 (continued) - [x] **FFT twiddle factor fix** — `fft_radix2` computes `wr/wi` directly per k via `cosf/sinf(angle*k)`. Tests A–E added to `test_fft.cc`. Tolerance reverted to 5e-3. diff --git a/src/tests/util/test_ans.cc b/src/tests/util/test_ans.cc new file mode 100644 index 0000000..108c46d --- /dev/null +++ b/src/tests/util/test_ans.cc @@ -0,0 +1,189 @@ +// This file is part of the 64k demo project. +// Tests for the ANS (rANS) entropy coder in util/ans.h. +// Encoder is enabled in tests via ANS_ENABLE_ENCODER. + +#include "util/ans.h" + +#include <cassert> +#include <cstdint> +#include <cstdio> +#include <cstring> +#include <random> +#include <string> +#include <vector> + +namespace { + +bool RoundtripCheck(const std::vector<uint8_t>& input, + const uint32_t* initial_counts, + const char* label) { + std::vector<uint8_t> compressed; + if (!ans::Encode(input.data(), input.size(), &compressed, initial_counts)) { + fprintf(stderr, "[%s] Encode failed\n", label); + return false; + } + std::vector<uint8_t> decoded(input.size()); + size_t decoded_size = 0; + if (!ans::Decode(compressed.data(), compressed.size(), decoded.data(), + decoded.size(), &decoded_size, initial_counts)) { + fprintf(stderr, "[%s] Decode failed\n", label); + return false; + } + if (decoded_size != input.size() || + std::memcmp(decoded.data(), input.data(), input.size()) != 0) { + fprintf(stderr, "[%s] payload mismatch\n", label); + return false; + } + const double ratio = + input.empty() ? 0.0 : (double)compressed.size() / (double)input.size(); + fprintf(stderr, "[%s] OK: %zu -> %zu bytes (%.3f x)\n", label, input.size(), + compressed.size(), ratio); + return true; +} + +// Covers: empty / single byte / single-symbol run / all-zeros / random uniform +// / random skewed / repeated ASCII. Each is a roundtrip with default (uniform) +// initial counts. +void TestRoundtripVariants() { + std::mt19937 rng_uniform(12345); + std::vector<uint8_t> random_uniform(64 * 1024); + for (auto& b : random_uniform) b = (uint8_t)(rng_uniform() & 0xff); + + std::mt19937 rng_skewed(67890); + std::vector<uint8_t> random_skewed(32 * 1024); + for (auto& b : random_skewed) { + // 90% byte 'A', 10% other ASCII letters. + const uint32_t r = rng_skewed(); + b = (r % 10 == 0) ? (uint8_t)('B' + (r >> 8) % 25) : (uint8_t)'A'; + } + + const char* ascii = + "@group(0) @binding(0) var smplr: sampler;\n" + "@group(0) @binding(1) var tex: texture_2d<f32>;\n" + "@fragment fn fs_main(@builtin(position) p: vec4f) -> @location(0) " + "vec4f {\n" + " let uv = p.xy / vec2f(1280.0, 720.0);\n" + " return textureSample(tex, smplr, uv);\n" + "}\n"; + std::vector<uint8_t> ascii_block; + for (int i = 0; i < 50; ++i) { // cross chunk boundary + ascii_block.insert(ascii_block.end(), ascii, ascii + std::strlen(ascii)); + } + + struct Case { + const char* label; + std::vector<uint8_t> data; + }; + const Case cases[] = { + {"empty", {}}, + {"single-byte", {0x42}}, + {"single-symbol-run", std::vector<uint8_t>(4096, 'A')}, + {"all-zeros", std::vector<uint8_t>(8192, 0)}, + {"random-uniform", random_uniform}, + {"random-skewed", random_skewed}, + {"ascii-shader-text", ascii_block}, + }; + for (const Case& c : cases) { + assert(RoundtripCheck(c.data, nullptr, c.label)); + } +} + +// Seeded initial counts should compress a matching corpus at least as well +// as uniform init, and roundtrip identically. +void TestSeededInitialCounts() { + std::vector<uint8_t> corpus; + const char* sample = "fn main() { let x = vec3f(1.0, 2.0, 3.0); }\n"; + for (int i = 0; i < 200; ++i) { + corpus.insert(corpus.end(), sample, sample + std::strlen(sample)); + } + uint32_t hist[256] = {}; + ans::Histogram(corpus.data(), corpus.size(), hist); + + std::vector<uint8_t> seeded_out, uniform_out; + assert(ans::Encode(corpus.data(), corpus.size(), &seeded_out, hist)); + assert(ans::Encode(corpus.data(), corpus.size(), &uniform_out, nullptr)); + + std::vector<uint8_t> decoded(corpus.size()); + size_t decoded_size = 0; + assert(ans::Decode(seeded_out.data(), seeded_out.size(), decoded.data(), + decoded.size(), &decoded_size, hist)); + assert(decoded_size == corpus.size()); + assert(std::memcmp(decoded.data(), corpus.data(), corpus.size()) == 0); + + fprintf(stderr, "[seeded-vs-uniform] seeded=%zu uniform=%zu raw=%zu\n", + seeded_out.size(), uniform_out.size(), corpus.size()); + assert(seeded_out.size() <= uniform_out.size()); +} + +// The chunk-end state check must reject: a mismatched model (decoder uses a +// different initial distribution), a single bit-flip in the payload, and a +// truncated stream. +void TestRejection() { + std::mt19937 rng(1); + + // 1) Mismatched models. + { + std::vector<uint8_t> v(4096); + for (auto& b : v) b = (uint8_t)('a' + (rng() % 26)); + uint32_t hist[256] = {}; + ans::Histogram(v.data(), v.size(), hist); + + std::vector<uint8_t> encoded; + assert(ans::Encode(v.data(), v.size(), &encoded, hist)); + + std::vector<uint8_t> decoded(v.size()); + size_t decoded_size = 0; + assert(!ans::Decode(encoded.data(), encoded.size(), decoded.data(), + decoded.size(), &decoded_size, nullptr)); + fprintf(stderr, "[rejection] mismatched-counts OK\n"); + } + + // 2) Corruption. + { + std::vector<uint8_t> v(2048); + for (auto& b : v) b = (uint8_t)(rng() & 0xff); + std::vector<uint8_t> encoded; + assert(ans::Encode(v.data(), v.size(), &encoded, nullptr)); + encoded[encoded.size() / 2] ^= 0x55; // flip a payload byte + + std::vector<uint8_t> decoded(v.size()); + size_t decoded_size = 0; + assert(!ans::Decode(encoded.data(), encoded.size(), decoded.data(), + decoded.size(), &decoded_size, nullptr)); + fprintf(stderr, "[rejection] corruption OK\n"); + } + + // 3) Truncation. + { + std::vector<uint8_t> v(4096); + for (size_t i = 0; i < v.size(); ++i) v[i] = (uint8_t)i; + std::vector<uint8_t> encoded; + assert(ans::Encode(v.data(), v.size(), &encoded, nullptr)); + encoded.resize(encoded.size() - 8); + + std::vector<uint8_t> decoded(v.size()); + size_t decoded_size = 0; + assert(!ans::Decode(encoded.data(), encoded.size(), decoded.data(), + decoded.size(), &decoded_size, nullptr)); + fprintf(stderr, "[rejection] truncation OK\n"); + } +} + +void TestPeekSize() { + std::vector<uint8_t> v(1234, 'Q'); + std::vector<uint8_t> encoded; + assert(ans::Encode(v.data(), v.size(), &encoded, nullptr)); + assert(ans::PeekUncompressedSize(encoded.data(), encoded.size()) == + v.size()); +} + +} // namespace + +int main() { + TestRoundtripVariants(); + TestSeededInitialCounts(); + TestRejection(); + TestPeekSize(); + fprintf(stderr, "All ANS tests passed.\n"); + return 0; +} diff --git a/src/util/ans.cc b/src/util/ans.cc new file mode 100644 index 0000000..aa79e4b --- /dev/null +++ b/src/util/ans.cc @@ -0,0 +1,215 @@ +// This file is part of the 64k demo project. +// Order-0 rANS coder. See ans.h for the bitstream format. + +#include "util/ans.h" + +#include <algorithm> +#include <cmath> +#include <cstring> + +namespace ans { + +namespace { + +// Per-chunk adaptive byte model. Probabilities sum to 1<<kBits after +// normalize(); raw counts in 'stats' keep adapting across chunks. +struct Stats { + uint32_t stats[kNumSymbols]; + uint32_t cumul[kNumSymbols + 1]; + + // Seeds 'stats' from caller counts (or all-ones if null) and builds the + // normalized cumulative table. stats_to_cumul() floors zero counts, so + // 'init' doesn't have to. + void init(const uint32_t* initial_counts) { + if (initial_counts) { + std::memcpy(stats, initial_counts, sizeof(stats)); + } else { + std::fill_n(stats, kNumSymbols, 1u); + } + normalize(); + } + + // Rebuilds 'cumul' from 'stats'. Every slot is floored at 1 so each + // symbol remains reachable even if it has been seen zero times. + void stats_to_cumul() { + uint32_t acc = 0; + for (int i = 0; i < kNumSymbols; ++i) { + cumul[i] = acc; + acc += stats[i] ? stats[i] : 1u; + } + cumul[kNumSymbols] = acc; + } + + // Rescales 'cumul' so cumul[NUM_SYMBOLS] == (1 << kBits). Ceil-multiply + // preserves monotonicity and prevents any active slot from collapsing. + void normalize() { + stats_to_cumul(); + const uint32_t target = 1u << kBits; + while (cumul[kNumSymbols] != target) { + const double ratio = (double)target / (double)cumul[kNumSymbols]; + for (int i = 0; i <= kNumSymbols; ++i) { + cumul[i] = (uint32_t)std::ceil((double)cumul[i] * ratio); + } + } + } + + // Decoder lookup: locate the slot containing 's' in [0, 1<<kBits). + // Returns the symbol, its probability, and the residual offset within + // the slot. Increments the symbol's count on the fly. + void decode_lookup(uint32_t s, uint8_t* sym, uint32_t* p, uint32_t* r) { + // upper_bound(s) - 1; cumul is strictly non-decreasing. + const uint32_t* it = + std::upper_bound(cumul, cumul + kNumSymbols + 1, s); + const int c = (int)(it - cumul) - 1; + stats[c] += 1; + *sym = (uint8_t)c; + *p = cumul[c + 1] - cumul[c]; + *r = s - cumul[c]; + } + + // Encoder lookup: probability and base offset for symbol 'c'. + void encode_lookup(int c, uint32_t* p, uint32_t* base) { + stats[c] += 1; + *p = cumul[c + 1] - cumul[c]; + *base = cumul[c]; + } +}; + +// Big-endian primitive readers/writers. +inline uint16_t ReadU16BE(const uint8_t* p) { + return (uint16_t)((p[0] << 8) | p[1]); +} +inline uint32_t ReadU32BE(const uint8_t* p) { + return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) | + ((uint32_t)p[2] << 8) | (uint32_t)p[3]; +} + +#if defined(ANS_ENABLE_ENCODER) +inline void AppendU16BE(std::vector<uint8_t>* dst, uint16_t v) { + dst->push_back((uint8_t)(v >> 8)); + dst->push_back((uint8_t)(v & 0xff)); +} +inline void AppendU32BE(std::vector<uint8_t>* dst, uint32_t v) { + dst->push_back((uint8_t)(v >> 24)); + dst->push_back((uint8_t)(v >> 16)); + dst->push_back((uint8_t)(v >> 8)); + dst->push_back((uint8_t)(v & 0xff)); +} +#endif + +} // namespace + +uint32_t PeekUncompressedSize(const uint8_t* src, size_t src_size) { + if (!src || src_size < 4) return 0; + return ReadU32BE(src); +} + +bool Decode(const uint8_t* src, size_t src_size, + uint8_t* dst, size_t dst_capacity, + size_t* out_size, + const uint32_t* initial_counts) { + if (out_size) *out_size = 0; + if (!src || src_size < 4) return false; + + const uint8_t* p = src; + const uint8_t* end = src + src_size; + + const uint32_t output_size = ReadU32BE(p); + p += 4; + if (output_size > dst_capacity) return false; + if (output_size > 0 && !dst) return false; + + Stats stats; + stats.init(initial_counts); + + uint32_t t = 0; + while (t < output_size) { + const uint32_t chunk = std::min<uint32_t>(kChunkSize, output_size - t); + + if (end - p < 4) return false; + uint32_t s = ReadU32BE(p); + p += 4; + + for (uint32_t i = 0; i < chunk; ++i) { + if (s <= kMask) { + // Pull in one renorm word. + if (end - p < 2) return false; + s = (s << kBits) | (uint32_t)ReadU16BE(p); + p += 2; + } + uint8_t sym; + uint32_t proba, residual; + stats.decode_lookup(s & kMask, &sym, &proba, &residual); + dst[t + i] = sym; + s = proba * (s >> kBits) + residual; + } + // Final-state sanity check: catches stream corruption and model mismatch. + if (s != kInitState) return false; + stats.normalize(); + t += chunk; + } + + if (out_size) *out_size = output_size; + return true; +} + +#if defined(ANS_ENABLE_ENCODER) + +void Histogram(const uint8_t* src, size_t size, uint32_t* out_counts) { + if (!src || !out_counts) return; + for (size_t i = 0; i < size; ++i) out_counts[src[i]] += 1; +} + +bool Encode(const uint8_t* src, size_t size, + std::vector<uint8_t>* dst, + const uint32_t* initial_counts) { + if (!dst) return false; + dst->clear(); + if (size > 0xffffffffu) return false; // header is u32 + + AppendU32BE(dst, (uint32_t)size); + if (size == 0) return true; + + Stats stats; + stats.init(initial_counts); + + uint16_t tmp[kChunkSize]; + + size_t t = 0; + while (t < size) { + // Initial state is the bare signature (not shifted): the decoder ends + // each chunk with s == kInitState, so symmetry requires the encoder to + // start in the same low-state. Starting with (kInitState << kBits) would + // force a spurious renorm at iter 0 that the decoder never consumes, + // corrupting any stream with more than one chunk. + uint32_t s = kInitState; + const size_t chunk = std::min<size_t>(kChunkSize, size - t); + size_t pos = chunk; + // Encode the chunk in reverse so the decoder consumes symbols in order. + for (size_t k = chunk; k-- > 0;) { + const uint8_t sym = src[t + k]; + uint32_t proba, base; + stats.encode_lookup(sym, &proba, &base); + if (s >= (proba << kBits)) { + // Flush low word and shift down to keep s/proba bounded in u32. + --pos; + tmp[pos] = (uint16_t)(s & kMask); + s >>= kBits; + } + s = ((s / proba) << kBits) + (s % proba) + base; + } + // Invariant: final state must be > kMask so the decoder's first read is + // a valid renormalized state. + if (s <= kMask) return false; + AppendU32BE(dst, s); + for (size_t k = pos; k < chunk; ++k) AppendU16BE(dst, tmp[k]); + + stats.normalize(); + t += chunk; + } + return true; +} + +#endif // ANS_ENABLE_ENCODER + +} // namespace ans diff --git a/src/util/ans.h b/src/util/ans.h new file mode 100644 index 0000000..53c34b1 --- /dev/null +++ b/src/util/ans.h @@ -0,0 +1,58 @@ +// This file is part of the 64k demo project. +// Asymmetric Numeral System (rANS) order-0 entropy coder. +// Decoder is always built; encoder is gated by ANS_ENABLE_ENCODER. +// Bitstream is big-endian and chunk-adaptive; see the per-chunk format in +// the implementation file. + +#ifndef ANS_H_ +#define ANS_H_ + +#include <cstddef> +#include <cstdint> + +#if defined(ANS_ENABLE_ENCODER) +#include <vector> +#endif + +namespace ans { + +// Fixed parameters; must match the encoder/decoder on both ends. +constexpr int kBits = 16; +constexpr uint32_t kMask = (1u << kBits) - 1u; +constexpr int kNumSymbols = 256; +constexpr int kChunkSize = 1024; +// Initial state per chunk. Doubles as a chunk-end integrity check: any +// decoder/encoder model divergence drives the state away from this constant, +// which is verified after every chunk. +constexpr uint32_t kInitState = 1u << kBits; + +// Reads the original (uncompressed) size from a bitstream header. +// Returns 0 on malformed input. +uint32_t PeekUncompressedSize(const uint8_t* src, size_t src_size); + +// Decodes 'src_size' bytes from 'src' into 'dst' (capacity 'dst_capacity'). +// 'initial_counts' (256 entries) seeds the per-chunk adaptive model; pass +// nullptr for the uniform default (all-ones). +// Returns true on success and writes the decoded size to '*out_size'. +bool Decode(const uint8_t* src, size_t src_size, + uint8_t* dst, size_t dst_capacity, + size_t* out_size, + const uint32_t* initial_counts = nullptr); + +#if defined(ANS_ENABLE_ENCODER) +// Encodes 'src[0..size]' into '*dst' (cleared and re-filled). +// 'initial_counts' has the same semantics as in Decode(). +// Returns true on success. +bool Encode(const uint8_t* src, size_t size, + std::vector<uint8_t>* dst, + const uint32_t* initial_counts = nullptr); + +// Computes a byte histogram over 'src[0..size]', accumulating into +// 'out_counts[256]' (caller must zero-initialize before the first call). +// Useful for deriving a corpus-wide initial distribution from many files. +void Histogram(const uint8_t* src, size_t size, uint32_t* out_counts); +#endif // ANS_ENABLE_ENCODER + +} // namespace ans + +#endif // ANS_H_ diff --git a/src/util/asset_manager.cc b/src/util/asset_manager.cc index 26a82bf..264ddda 100644 --- a/src/util/asset_manager.cc +++ b/src/util/asset_manager.cc @@ -2,6 +2,7 @@ // It implements the generic asset retrieval logic with runtime caching. #include "util/asset_manager.h" +#include "util/ans.h" #include "util/asset_manager_utils.h" #include "util/check_return.h" #if defined(USE_TEST_ASSETS) @@ -114,6 +115,34 @@ const uint8_t* GetAsset(AssetId asset_id, size_t* out_size) { AssetRecord cached_record = source_record; + if (source_record.compression == AssetCompression::ANS_ASCII) { + // Decompress on first access. Buffer is null-terminated for C-string use. + const size_t uncomp = source_record.uncompressed_size; + uint8_t* buffer = new (std::nothrow) uint8_t[uncomp + 1]; + CHECK_RETURN_BEGIN(!buffer, nullptr) + if (out_size) *out_size = 0; + ERROR_MSG("Failed to allocate buffer for ANS-compressed asset id=%u", + index); + return nullptr; + CHECK_RETURN_END + size_t got = 0; + if (!ans::Decode(source_record.data, source_record.size, buffer, uncomp, + &got, GetAnsAsciiHistogram()) || + got != uncomp) { + delete[] buffer; + if (out_size) *out_size = 0; + ERROR_MSG("ANS decode failed for asset id=%u", index); + return nullptr; + } + buffer[uncomp] = 0; + cached_record.data = buffer; + cached_record.size = uncomp; + cached_record.uncompressed_size = uncomp; + g_asset_cache[index] = cached_record; + if (out_size) *out_size = uncomp; + return buffer; + } + if (source_record.type == AssetType::PROC || source_record.type == AssetType::PROC_GPU) { // Dynamically generate the asset @@ -224,6 +253,12 @@ void DropAsset(AssetId asset_id, const uint8_t* asset) { delete[] g_asset_cache[index].data; g_asset_cache[index] = {}; // Zero out the struct to force re-generation } + // Heap-allocated decompressed buffer (compression != NONE): cache owns it. + if (g_asset_cache[index].data == asset && + g_asset_cache[index].compression != AssetCompression::NONE) { + delete[] g_asset_cache[index].data; + g_asset_cache[index] = {}; + } #if !defined(STRIP_ALL) if (g_asset_cache[index].data == asset && (g_asset_cache[index].type == AssetType::SPEC || @@ -245,10 +280,11 @@ bool ReloadAssetsFromFile(const char* config_path) { // Clear cache to force reload for (size_t i = 0; i < (size_t)AssetId::ASSET_LAST_ID; ++i) { - if ((g_asset_cache[i].type == AssetType::PROC || - g_asset_cache[i].type == AssetType::PROC_GPU) && - g_asset_cache[i].data) { - delete[] g_asset_cache[i].data; + const AssetRecord& e = g_asset_cache[i]; + if (e.data && + (e.type == AssetType::PROC || e.type == AssetType::PROC_GPU || + e.compression != AssetCompression::NONE)) { + delete[] e.data; } g_asset_cache[i] = {}; } diff --git a/src/util/asset_manager.h b/src/util/asset_manager.h index 786a8db..f6ad244 100644 --- a/src/util/asset_manager.h +++ b/src/util/asset_manager.h @@ -16,10 +16,21 @@ enum class AssetType : uint8_t { PROC_GPU, }; +// Compression scheme applied to the packed bytes (only relevant for static +// embedded assets; disk-load and procedural assets are always NONE). +enum class AssetCompression : uint8_t { + NONE = 0, + // Order-0 rANS, model seeded from a corpus-wide histogram embedded in the + // generated assets blob (see GetAnsAsciiHistogram). Used for WGSL. + ANS_ASCII = 1, +}; + struct AssetRecord { const uint8_t* data; // Pointer to asset data (static or dynamic) - size_t size; // Size of the asset data + size_t size; // Size of 'data' in bytes (compressed size if any) AssetType type; + AssetCompression compression; // How 'data' is encoded; NONE = raw bytes + size_t uncompressed_size; // Size after decompression (== size if NONE) const char* proc_func_name_str; // Name of procedural generation function // (string literal) const float* proc_params; // Parameters for procedural generation (static, @@ -27,6 +38,11 @@ struct AssetRecord { int num_proc_params; // Number of procedural parameters }; +// Initial-state histogram (256 entries) used to seed ANS_ASCII compression. +// Defined in the generated assets blob; computed at pack time over the WGSL +// corpus. +const uint32_t* GetAnsAsciiHistogram(); + // Generic interface // Retrieves a pointer to the asset data. // - Static assets are guaranteed to be 16-byte aligned. diff --git a/tools/asset_packer.cc b/tools/asset_packer.cc index df876be..6162f19 100644 --- a/tools/asset_packer.cc +++ b/tools/asset_packer.cc @@ -23,6 +23,7 @@ #include "stb_image.h" #include "procedural/generator.h" // For ProcGenFunc and procedural functions +#include "util/ans.h" // ANS compression for WGSL assets #include "util/asset_manager.h" // For AssetRecord and AssetId // Map of procedural function names to their pointers (used only internally by @@ -81,6 +82,11 @@ struct AssetBuildInfo { std::string data_array_name; // ASSET_DATA_xxx for static std::string params_array_name; // ASSET_PROC_PARAMS_xxx for procedural std::string func_name_str_name; // ASSET_PROC_FUNC_STR_xxx for procedural + + // Set during the per-asset emit step (only for embedded data, not + // disk-load and not procedural). + std::string compression = "NONE"; // "NONE" | "ANS_ASCII" + size_t uncompressed_size = 0; // 0 when 'compression' == "NONE" }; static bool ParseProceduralFunction(const std::string& compression_type_str, @@ -339,6 +345,41 @@ static bool ProcessImageFile(const std::string& full_path, return true; } +// ANS-compress 'raw' with the seeded histogram and round-trip verify the +// payload. Returns true on success and writes the compressed bytes to '*out'. +// Returns false (without populating *out) if encoding fails, the compressed +// payload is not smaller, or the round-trip mismatches. +static bool TryAnsCompress(const std::vector<uint8_t>& raw, + const uint32_t* hist, + std::vector<uint8_t>* out) { + if (raw.empty()) return false; + std::vector<uint8_t> enc; + if (!ans::Encode(raw.data(), raw.size(), &enc, hist)) return false; + if (enc.size() >= raw.size()) return false; + std::vector<uint8_t> verify(raw.size()); + size_t got = 0; + if (!ans::Decode(enc.data(), enc.size(), verify.data(), verify.size(), &got, + hist) || + got != raw.size() || + std::memcmp(verify.data(), raw.data(), raw.size()) != 0) { + return false; + } + *out = std::move(enc); + return true; +} + +// Emits a comma-separated list of values as a C array initializer, wrapping +// at 12 entries per line. +template <typename T, typename FormatFn> +static void EmitArrayInit(FILE* f, const T* data, size_t n, FormatFn fmt) { + for (size_t i = 0; i < n; ++i) { + if (i % 12 == 0) fprintf(f, "\n "); + fmt(f, data[i]); + if (i + 1 != n) fprintf(f, ", "); + } + fprintf(f, "\n"); +} + int main(int argc, char* argv[]) { if (argc < 4) { fprintf(stderr, @@ -482,7 +523,37 @@ int main(int argc, char* argv[]) { std::fclose(assets_h_file); - for (const auto& info : asset_build_infos) { + // --------------------------------------------------------------------- + // Pre-pass: build a corpus-wide byte histogram from all WGSL assets to + // seed the ANS coder. Skipped in disk-load mode (WGSL data is not + // embedded then, so we never run the encoder). + // --------------------------------------------------------------------- + uint32_t ans_ascii_hist[256] = {}; + if (!disk_load_mode) { + for (const auto& info : asset_build_infos) { + if (info.asset_type != "WGSL") continue; + std::string base_dir = + assets_txt_path.substr(0, assets_txt_path.find_last_of("/\\") + 1); + std::filesystem::path p = std::filesystem::absolute(base_dir) / info.filename; + std::ifstream f(p.lexically_normal().string(), std::ios::binary); + if (!f.is_open()) continue; + std::vector<uint8_t> buf((std::istreambuf_iterator<char>(f)), + std::istreambuf_iterator<char>()); + ans::Histogram(buf.data(), buf.size(), ans_ascii_hist); + } + } + + fprintf(assets_data_cc_file, + "// Per-corpus byte histogram, seed for ANS_ASCII decompression.\n"); + fprintf(assets_data_cc_file, + "static const uint32_t kAnsAsciiHistogram[256] = {"); + EmitArrayInit(assets_data_cc_file, ans_ascii_hist, 256, + [](FILE* f, uint32_t v) { fprintf(f, "%u", v); }); + fprintf(assets_data_cc_file, "};\n"); + fprintf(assets_data_cc_file, + "const uint32_t* GetAnsAsciiHistogram() { return kAnsAsciiHistogram; }\n\n"); + + for (auto& info : asset_build_infos) { if (info.asset_type != "PROC" && info.asset_type != "PROC_GPU") { std::string base_dir = assets_txt_path.substr(0, assets_txt_path.find_last_of("/\\") + 1); @@ -526,21 +597,35 @@ int main(int argc, char* argv[]) { std::istreambuf_iterator<char>()); } - size_t original_size = buffer.size(); - buffer.push_back(0); // Null terminator for safety + const size_t original_size = buffer.size(); + + // ANS-compress WGSL (ASCII text) using the corpus histogram. + // Compressed payload replaces the raw buffer; we don't null-terminate + // compressed blobs since the runtime decoder writes NUL itself. + std::vector<uint8_t> compressed; + const bool use_ans = + (info.asset_type == "WGSL") && + TryAnsCompress(buffer, ans_ascii_hist, &compressed); + if (use_ans) { + info.compression = "ANS_ASCII"; + info.uncompressed_size = original_size; + printf(" ANS %-32s %7zu -> %7zu (%.2f x)\n", info.name.c_str(), + original_size, compressed.size(), + (double)compressed.size() / (double)original_size); + } else { + buffer.push_back(0); // null-terminate raw assets + } + const std::vector<uint8_t>& payload = use_ans ? compressed : buffer; fprintf(assets_data_cc_file, "const size_t ASSET_SIZE_%s = %zu;\n", - info.name.c_str(), original_size); + info.name.c_str(), + use_ans ? payload.size() : original_size); fprintf(assets_data_cc_file, - "alignas(16) static const uint8_t %s[] = {\n ", + "alignas(16) static const uint8_t %s[] = {", info.data_array_name.c_str()); - for (size_t i = 0; i < buffer.size(); ++i) { - if (i > 0 && i % 12 == 0) - fprintf(assets_data_cc_file, "\n "); - fprintf(assets_data_cc_file, "0x%02x%s", buffer[i], - (i == buffer.size() - 1 ? "" : ", ")); - } - fprintf(assets_data_cc_file, "\n};\n"); + EmitArrayInit(assets_data_cc_file, payload.data(), payload.size(), + [](FILE* f, uint8_t v) { fprintf(f, "0x%02x", v); }); + fprintf(assets_data_cc_file, "};\n"); } } else { fprintf(assets_data_cc_file, "static const float %s[] = {", @@ -561,15 +646,19 @@ int main(int argc, char* argv[]) { for (const auto& info : asset_build_infos) { fprintf(assets_data_cc_file, " { "); if (info.asset_type == "PROC" || info.asset_type == "PROC_GPU") { - fprintf(assets_data_cc_file, "nullptr, 0, AssetType::%s, %s, %s, %zu", + // data, size, type, compression, uncompressed_size, proc_func, params, n + fprintf(assets_data_cc_file, + "nullptr, 0, AssetType::%s, AssetCompression::NONE, 0, " + "%s, %s, %zu", info.asset_type.c_str(), info.func_name_str_name.c_str(), info.params_array_name.c_str(), info.proc_params.size()); } else { fprintf(assets_data_cc_file, - "(const uint8_t*)%s, ASSET_SIZE_%s, AssetType::%s, nullptr, " - "nullptr, 0", + "(const uint8_t*)%s, ASSET_SIZE_%s, AssetType::%s, " + "AssetCompression::%s, %zu, nullptr, nullptr, 0", info.data_array_name.c_str(), info.name.c_str(), - info.asset_type.c_str()); + info.asset_type.c_str(), info.compression.c_str(), + info.uncompressed_size); } fprintf(assets_data_cc_file, " },\n"); } |
