From c49d828f101b435d73a76fcfc8444cf76aeda22f Mon Sep 17 00:00:00 2001 From: skal Date: Wed, 11 Feb 2026 00:26:25 +0100 Subject: opt: Move invariant in1 calculation outside CNN convolution loops The in1 vector (uv_norm, gray, 1.0) is loop-invariant and doesn't depend on dx/dy offset. Moving it outside the convolution loop eliminates redundant computation and enables better SIMD optimization. Updated both shader files and train_cnn.py code generation. Co-Authored-By: Claude Sonnet 4.5 --- training/train_cnn.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'training') diff --git a/training/train_cnn.py b/training/train_cnn.py index 497a07b..d8522ed 100755 --- a/training/train_cnn.py +++ b/training/train_cnn.py @@ -420,7 +420,8 @@ def generate_conv_src_function(kernel_size, output_path): # Normalize center pixel for gray channel f.write(f" let original = (textureSample(tex, samp, uv) - 0.5) * 2.0;\n") f.write(f" let gray = dot(original.rgb, vec3(0.2126, 0.7152, 0.0722));\n") - f.write(f" let uv_norm = (uv - 0.5) * 2.0;\n\n") + f.write(f" let uv_norm = (uv - 0.5) * 2.0;\n") + f.write(f" let in1 = vec4(uv_norm, gray, 1.0);\n\n") f.write(f" var sum = vec4(0.0);\n") f.write(f" var pos = 0;\n\n") @@ -429,8 +430,7 @@ def generate_conv_src_function(kernel_size, output_path): f.write(f" for (var dy = -{radius}; dy <= {radius}; dy++) {{\n") f.write(f" for (var dx = -{radius}; dx <= {radius}; dx++) {{\n") f.write(f" let offset = vec2(f32(dx), f32(dy)) * step;\n") - f.write(f" let rgbd = (textureSample(tex, samp, uv + offset) - 0.5) * 2.0;\n") - f.write(f" let in1 = vec4(uv_norm, gray, 1.0);\n\n") + f.write(f" let rgbd = (textureSample(tex, samp, uv + offset) - 0.5) * 2.0;\n\n") # Accumulate with dot products (unrolled) f.write(f" sum.r += dot(weights[pos+0], rgbd) + dot(weights[pos+1], in1);\n") @@ -465,7 +465,8 @@ def generate_conv_final_function(kernel_size, output_path): f.write(f" weights: array<vec4<f32>, {num_positions * 2}>\n") f.write(f") -> f32 {{\n") 
f.write(f" let step = 1.0 / resolution;\n") - f.write(f" let uv_norm = (uv - 0.5) * 2.0;\n\n") + f.write(f" let uv_norm = (uv - 0.5) * 2.0;\n") + f.write(f" let in1 = vec4(uv_norm, gray, 1.0);\n\n") f.write(f" var sum = 0.0;\n") f.write(f" var pos = 0;\n\n") @@ -473,8 +474,7 @@ def generate_conv_final_function(kernel_size, output_path): f.write(f" for (var dy = -{radius}; dy <= {radius}; dy++) {{\n") f.write(f" for (var dx = -{radius}; dx <= {radius}; dx++) {{\n") f.write(f" let offset = vec2(f32(dx), f32(dy)) * step;\n") - f.write(f" let rgbd = textureSample(tex, samp, uv + offset);\n") - f.write(f" let in1 = vec4(uv_norm, gray, 1.0);\n\n") + f.write(f" let rgbd = textureSample(tex, samp, uv + offset);\n\n") # Accumulate with dot products f.write(f" sum += dot(weights[pos], rgbd) + dot(weights[pos+1], in1);\n") -- cgit v1.2.3