From eaf0bd855306e70ca03f2d6579b4d6551aff6482 Mon Sep 17 00:00:00 2001
From: skal
Date: Thu, 12 Feb 2026 12:11:53 +0100
Subject: TODO: 8-bit weight quantization for 2× size reduction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add QAT (quantization-aware training) notes
- Requires training with fake quantization
- Target: ~1.6 KB weights (vs 3.2 KB f16)
- Shader unpacking needs adaptation (4× u8 per u32)
---
 training/export_cnn_v2_weights.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/training/export_cnn_v2_weights.py b/training/export_cnn_v2_weights.py
index e3d1724..723f572 100755
--- a/training/export_cnn_v2_weights.py
+++ b/training/export_cnn_v2_weights.py
@@ -94,6 +94,8 @@ def export_weights_binary(checkpoint_path, output_path):
     weight_offset += len(layer2_flat)
 
     # Convert to f16
+    # TODO: Use 8-bit quantization for 2× size reduction
+    # Requires quantization-aware training (QAT) to maintain accuracy
     all_weights_f16 = np.array(all_weights, dtype=np.float16)
 
     # Pack f16 pairs into u32 for storage buffer
--
cgit v1.2.3
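
The TODO above maps to roughly this export-side change: per-tensor symmetric
8-bit quantization, then 4× u8 per u32 packing for the storage buffer. This
is a minimal sketch, not code from this repo; the helper names and the choice
of a single per-tensor scale with zero point 128 are assumptions:

import numpy as np

def quantize_weights_u8(weights):
    # Hypothetical helper: per-tensor symmetric quantization.
    # Maps [-max_abs, +max_abs] onto [1, 255] with the zero point at 128.
    w = np.asarray(weights, dtype=np.float32)
    scale = max(np.abs(w).max() / 127.0, 1e-12)
    q = np.clip(np.round(w / scale), -127, 127).astype(np.int16) + 128
    return q.astype(np.uint8), scale

def pack_u8_to_u32(q):
    # 4× u8 per u32, little-endian (matching WebGPU buffer layout);
    # the tail is padded with the zero point so it dequantizes to 0.
    pad = (-len(q)) % 4
    q = np.concatenate([q, np.full(pad, 128, dtype=np.uint8)])
    return q.view('<u4')

def dequantize_u8(packed, scale, count):
    # Host-side mirror of the shader unpacking; useful as a round-trip test.
    q = packed.view(np.uint8)[:count].astype(np.float32)
    return (q - 128.0) * scale

w = np.random.randn(1600).astype(np.float32)   # ~1.6 K weights
q, scale = quantize_weights_u8(w)
packed = pack_u8_to_u32(q)                     # ~1.6 KB vs ~3.2 KB as f16
err = np.abs(w - dequantize_u8(packed, scale, len(w))).max()
assert err <= scale / 2 + 1e-6                 # rounding error bound

The dequantize_u8 round trip doubles as a reference for adapting the shader:
instead of two f16 per u32, it extracts four u8 per u32, subtracts the zero
point, and multiplies by the per-tensor scale.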
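
On the training side, "fake quantization" usually means running the forward
pass through an 8-bit round trip while gradients bypass the rounding (a
straight-through estimator), so the network learns weights that survive
quantization. A sketch assuming the checkpoint comes from PyTorch; nothing
here is from the repo:

import torch

class FakeQuant8(torch.nn.Module):
    # Straight-through estimator: the forward pass sees the
    # quantized/dequantized weights, the backward pass treats the
    # round trip as the identity.
    def forward(self, w):
        scale = w.detach().abs().max().clamp(min=1e-12) / 127.0
        q = torch.clamp(torch.round(w / scale), -127, 127) * scale
        return w + (q - w).detach()

fq = FakeQuant8()
conv = torch.nn.Conv2d(3, 8, 3)
x = torch.randn(1, 3, 16, 16)
# During QAT, apply the fake-quant to the weights on every forward pass:
y = torch.nn.functional.conv2d(x, fq(conv.weight), conv.bias, padding=1)
y.sum().backward()          # gradients reach conv.weight unchanged

PyTorch also ships a full QAT flow under torch.ao.quantization; for a network
this small, a hand-rolled estimator like the one above may be all that is
needed before exporting with the u8 packing sketched earlier.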