diff options
Diffstat (limited to 'src/util')
| -rw-r--r-- | src/util/ans.cc | 215 | ||||
| -rw-r--r-- | src/util/ans.h | 58 | ||||
| -rw-r--r-- | src/util/asset_manager.cc | 44 | ||||
| -rw-r--r-- | src/util/asset_manager.h | 18 |
4 files changed, 330 insertions, 5 deletions
diff --git a/src/util/ans.cc b/src/util/ans.cc new file mode 100644 index 0000000..aa79e4b --- /dev/null +++ b/src/util/ans.cc @@ -0,0 +1,215 @@ +// This file is part of the 64k demo project. +// Order-0 rANS coder. See ans.h for the bitstream format. + +#include "util/ans.h" + +#include <algorithm> +#include <cmath> +#include <cstring> + +namespace ans { + +namespace { + +// Per-chunk adaptive byte model. Probabilities sum to 1<<kBits after +// normalize(); raw counts in 'stats' keep adapting across chunks. +struct Stats { + uint32_t stats[kNumSymbols]; + uint32_t cumul[kNumSymbols + 1]; + + // Seeds 'stats' from caller counts (or all-ones if null) and builds the + // normalized cumulative table. stats_to_cumul() floors zero counts, so + // 'init' doesn't have to. + void init(const uint32_t* initial_counts) { + if (initial_counts) { + std::memcpy(stats, initial_counts, sizeof(stats)); + } else { + std::fill_n(stats, kNumSymbols, 1u); + } + normalize(); + } + + // Rebuilds 'cumul' from 'stats'. Every slot is floored at 1 so each + // symbol remains reachable even if it has been seen zero times. + void stats_to_cumul() { + uint32_t acc = 0; + for (int i = 0; i < kNumSymbols; ++i) { + cumul[i] = acc; + acc += stats[i] ? stats[i] : 1u; + } + cumul[kNumSymbols] = acc; + } + + // Rescales 'cumul' so cumul[NUM_SYMBOLS] == (1 << kBits). Ceil-multiply + // preserves monotonicity and prevents any active slot from collapsing. + void normalize() { + stats_to_cumul(); + const uint32_t target = 1u << kBits; + while (cumul[kNumSymbols] != target) { + const double ratio = (double)target / (double)cumul[kNumSymbols]; + for (int i = 0; i <= kNumSymbols; ++i) { + cumul[i] = (uint32_t)std::ceil((double)cumul[i] * ratio); + } + } + } + + // Decoder lookup: locate the slot containing 's' in [0, 1<<kBits). + // Returns the symbol, its probability, and the residual offset within + // the slot. Increments the symbol's count on the fly. + void decode_lookup(uint32_t s, uint8_t* sym, uint32_t* p, uint32_t* r) { + // upper_bound(s) - 1; cumul is strictly non-decreasing. + const uint32_t* it = + std::upper_bound(cumul, cumul + kNumSymbols + 1, s); + const int c = (int)(it - cumul) - 1; + stats[c] += 1; + *sym = (uint8_t)c; + *p = cumul[c + 1] - cumul[c]; + *r = s - cumul[c]; + } + + // Encoder lookup: probability and base offset for symbol 'c'. + void encode_lookup(int c, uint32_t* p, uint32_t* base) { + stats[c] += 1; + *p = cumul[c + 1] - cumul[c]; + *base = cumul[c]; + } +}; + +// Big-endian primitive readers/writers. +inline uint16_t ReadU16BE(const uint8_t* p) { + return (uint16_t)((p[0] << 8) | p[1]); +} +inline uint32_t ReadU32BE(const uint8_t* p) { + return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) | + ((uint32_t)p[2] << 8) | (uint32_t)p[3]; +} + +#if defined(ANS_ENABLE_ENCODER) +inline void AppendU16BE(std::vector<uint8_t>* dst, uint16_t v) { + dst->push_back((uint8_t)(v >> 8)); + dst->push_back((uint8_t)(v & 0xff)); +} +inline void AppendU32BE(std::vector<uint8_t>* dst, uint32_t v) { + dst->push_back((uint8_t)(v >> 24)); + dst->push_back((uint8_t)(v >> 16)); + dst->push_back((uint8_t)(v >> 8)); + dst->push_back((uint8_t)(v & 0xff)); +} +#endif + +} // namespace + +uint32_t PeekUncompressedSize(const uint8_t* src, size_t src_size) { + if (!src || src_size < 4) return 0; + return ReadU32BE(src); +} + +bool Decode(const uint8_t* src, size_t src_size, + uint8_t* dst, size_t dst_capacity, + size_t* out_size, + const uint32_t* initial_counts) { + if (out_size) *out_size = 0; + if (!src || src_size < 4) return false; + + const uint8_t* p = src; + const uint8_t* end = src + src_size; + + const uint32_t output_size = ReadU32BE(p); + p += 4; + if (output_size > dst_capacity) return false; + if (output_size > 0 && !dst) return false; + + Stats stats; + stats.init(initial_counts); + + uint32_t t = 0; + while (t < output_size) { + const uint32_t chunk = std::min<uint32_t>(kChunkSize, output_size - t); + + if (end - p < 4) return false; + uint32_t s = ReadU32BE(p); + p += 4; + + for (uint32_t i = 0; i < chunk; ++i) { + if (s <= kMask) { + // Pull in one renorm word. + if (end - p < 2) return false; + s = (s << kBits) | (uint32_t)ReadU16BE(p); + p += 2; + } + uint8_t sym; + uint32_t proba, residual; + stats.decode_lookup(s & kMask, &sym, &proba, &residual); + dst[t + i] = sym; + s = proba * (s >> kBits) + residual; + } + // Final-state sanity check: catches stream corruption and model mismatch. + if (s != kInitState) return false; + stats.normalize(); + t += chunk; + } + + if (out_size) *out_size = output_size; + return true; +} + +#if defined(ANS_ENABLE_ENCODER) + +void Histogram(const uint8_t* src, size_t size, uint32_t* out_counts) { + if (!src || !out_counts) return; + for (size_t i = 0; i < size; ++i) out_counts[src[i]] += 1; +} + +bool Encode(const uint8_t* src, size_t size, + std::vector<uint8_t>* dst, + const uint32_t* initial_counts) { + if (!dst) return false; + dst->clear(); + if (size > 0xffffffffu) return false; // header is u32 + + AppendU32BE(dst, (uint32_t)size); + if (size == 0) return true; + + Stats stats; + stats.init(initial_counts); + + uint16_t tmp[kChunkSize]; + + size_t t = 0; + while (t < size) { + // Initial state is the bare signature (not shifted): the decoder ends + // each chunk with s == kInitState, so symmetry requires the encoder to + // start in the same low-state. Starting with (kInitState << kBits) would + // force a spurious renorm at iter 0 that the decoder never consumes, + // corrupting any stream with more than one chunk. + uint32_t s = kInitState; + const size_t chunk = std::min<size_t>(kChunkSize, size - t); + size_t pos = chunk; + // Encode the chunk in reverse so the decoder consumes symbols in order. + for (size_t k = chunk; k-- > 0;) { + const uint8_t sym = src[t + k]; + uint32_t proba, base; + stats.encode_lookup(sym, &proba, &base); + if (s >= (proba << kBits)) { + // Flush low word and shift down to keep s/proba bounded in u32. + --pos; + tmp[pos] = (uint16_t)(s & kMask); + s >>= kBits; + } + s = ((s / proba) << kBits) + (s % proba) + base; + } + // Invariant: final state must be > kMask so the decoder's first read is + // a valid renormalized state. + if (s <= kMask) return false; + AppendU32BE(dst, s); + for (size_t k = pos; k < chunk; ++k) AppendU16BE(dst, tmp[k]); + + stats.normalize(); + t += chunk; + } + return true; +} + +#endif // ANS_ENABLE_ENCODER + +} // namespace ans diff --git a/src/util/ans.h b/src/util/ans.h new file mode 100644 index 0000000..53c34b1 --- /dev/null +++ b/src/util/ans.h @@ -0,0 +1,58 @@ +// This file is part of the 64k demo project. +// Asymmetric Numeral System (rANS) order-0 entropy coder. +// Decoder is always built; encoder is gated by ANS_ENABLE_ENCODER. +// Bitstream is big-endian and chunk-adaptive; see the per-chunk format in +// the implementation file. + +#ifndef ANS_H_ +#define ANS_H_ + +#include <cstddef> +#include <cstdint> + +#if defined(ANS_ENABLE_ENCODER) +#include <vector> +#endif + +namespace ans { + +// Fixed parameters; must match the encoder/decoder on both ends. +constexpr int kBits = 16; +constexpr uint32_t kMask = (1u << kBits) - 1u; +constexpr int kNumSymbols = 256; +constexpr int kChunkSize = 1024; +// Initial state per chunk. Doubles as a chunk-end integrity check: any +// decoder/encoder model divergence drives the state away from this constant, +// which is verified after every chunk. +constexpr uint32_t kInitState = 1u << kBits; + +// Reads the original (uncompressed) size from a bitstream header. +// Returns 0 on malformed input. +uint32_t PeekUncompressedSize(const uint8_t* src, size_t src_size); + +// Decodes 'src_size' bytes from 'src' into 'dst' (capacity 'dst_capacity'). +// 'initial_counts' (256 entries) seeds the per-chunk adaptive model; pass +// nullptr for the uniform default (all-ones). +// Returns true on success and writes the decoded size to '*out_size'. +bool Decode(const uint8_t* src, size_t src_size, + uint8_t* dst, size_t dst_capacity, + size_t* out_size, + const uint32_t* initial_counts = nullptr); + +#if defined(ANS_ENABLE_ENCODER) +// Encodes 'src[0..size]' into '*dst' (cleared and re-filled). +// 'initial_counts' has the same semantics as in Decode(). +// Returns true on success. +bool Encode(const uint8_t* src, size_t size, + std::vector<uint8_t>* dst, + const uint32_t* initial_counts = nullptr); + +// Computes a byte histogram over 'src[0..size]', accumulating into +// 'out_counts[256]' (caller must zero-initialize before the first call). +// Useful for deriving a corpus-wide initial distribution from many files. +void Histogram(const uint8_t* src, size_t size, uint32_t* out_counts); +#endif // ANS_ENABLE_ENCODER + +} // namespace ans + +#endif // ANS_H_ diff --git a/src/util/asset_manager.cc b/src/util/asset_manager.cc index 26a82bf..264ddda 100644 --- a/src/util/asset_manager.cc +++ b/src/util/asset_manager.cc @@ -2,6 +2,7 @@ // It implements the generic asset retrieval logic with runtime caching. #include "util/asset_manager.h" +#include "util/ans.h" #include "util/asset_manager_utils.h" #include "util/check_return.h" #if defined(USE_TEST_ASSETS) @@ -114,6 +115,34 @@ const uint8_t* GetAsset(AssetId asset_id, size_t* out_size) { AssetRecord cached_record = source_record; + if (source_record.compression == AssetCompression::ANS_ASCII) { + // Decompress on first access. Buffer is null-terminated for C-string use. + const size_t uncomp = source_record.uncompressed_size; + uint8_t* buffer = new (std::nothrow) uint8_t[uncomp + 1]; + CHECK_RETURN_BEGIN(!buffer, nullptr) + if (out_size) *out_size = 0; + ERROR_MSG("Failed to allocate buffer for ANS-compressed asset id=%u", + index); + return nullptr; + CHECK_RETURN_END + size_t got = 0; + if (!ans::Decode(source_record.data, source_record.size, buffer, uncomp, + &got, GetAnsAsciiHistogram()) || + got != uncomp) { + delete[] buffer; + if (out_size) *out_size = 0; + ERROR_MSG("ANS decode failed for asset id=%u", index); + return nullptr; + } + buffer[uncomp] = 0; + cached_record.data = buffer; + cached_record.size = uncomp; + cached_record.uncompressed_size = uncomp; + g_asset_cache[index] = cached_record; + if (out_size) *out_size = uncomp; + return buffer; + } + if (source_record.type == AssetType::PROC || source_record.type == AssetType::PROC_GPU) { // Dynamically generate the asset @@ -224,6 +253,12 @@ void DropAsset(AssetId asset_id, const uint8_t* asset) { delete[] g_asset_cache[index].data; g_asset_cache[index] = {}; // Zero out the struct to force re-generation } + // Heap-allocated decompressed buffer (compression != NONE): cache owns it. + if (g_asset_cache[index].data == asset && + g_asset_cache[index].compression != AssetCompression::NONE) { + delete[] g_asset_cache[index].data; + g_asset_cache[index] = {}; + } #if !defined(STRIP_ALL) if (g_asset_cache[index].data == asset && (g_asset_cache[index].type == AssetType::SPEC || @@ -245,10 +280,11 @@ bool ReloadAssetsFromFile(const char* config_path) { // Clear cache to force reload for (size_t i = 0; i < (size_t)AssetId::ASSET_LAST_ID; ++i) { - if ((g_asset_cache[i].type == AssetType::PROC || - g_asset_cache[i].type == AssetType::PROC_GPU) && - g_asset_cache[i].data) { - delete[] g_asset_cache[i].data; + const AssetRecord& e = g_asset_cache[i]; + if (e.data && + (e.type == AssetType::PROC || e.type == AssetType::PROC_GPU || + e.compression != AssetCompression::NONE)) { + delete[] e.data; } g_asset_cache[i] = {}; } diff --git a/src/util/asset_manager.h b/src/util/asset_manager.h index 786a8db..f6ad244 100644 --- a/src/util/asset_manager.h +++ b/src/util/asset_manager.h @@ -16,10 +16,21 @@ enum class AssetType : uint8_t { PROC_GPU, }; +// Compression scheme applied to the packed bytes (only relevant for static +// embedded assets; disk-load and procedural assets are always NONE). +enum class AssetCompression : uint8_t { + NONE = 0, + // Order-0 rANS, model seeded from a corpus-wide histogram embedded in the + // generated assets blob (see GetAnsAsciiHistogram). Used for WGSL. + ANS_ASCII = 1, +}; + struct AssetRecord { const uint8_t* data; // Pointer to asset data (static or dynamic) - size_t size; // Size of the asset data + size_t size; // Size of 'data' in bytes (compressed size if any) AssetType type; + AssetCompression compression; // How 'data' is encoded; NONE = raw bytes + size_t uncompressed_size; // Size after decompression (== size if NONE) const char* proc_func_name_str; // Name of procedural generation function // (string literal) const float* proc_params; // Parameters for procedural generation (static, @@ -27,6 +38,11 @@ struct AssetRecord { int num_proc_params; // Number of procedural parameters }; +// Initial-state histogram (256 entries) used to seed ANS_ASCII compression. +// Defined in the generated assets blob; computed at pack time over the WGSL +// corpus. +const uint32_t* GetAnsAsciiHistogram(); + // Generic interface // Retrieves a pointer to the asset data. // - Static assets are guaranteed to be 16-byte aligned. |
