From 60bcff9bb265b63732b24e68062df3db943a82b4 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Wed, 8 Sep 2021 22:46:49 -0500 Subject: [PATCH] simd draw bugfix --- src/pixie/images.nim | 35 ++++++++++++++++++++++----------- tests/benchmark_images_draw.nim | 2 +- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/pixie/images.nim b/src/pixie/images.nim index 8ceac10..d36faff 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -757,24 +757,35 @@ proc drawUber( when type(a) is Image: if blendMode.hasSimdBlender(): let blenderSimd = blendMode.blenderSimd() - for _ in countup(x, xMax - 4, 4): + for _ in countup(x, xMax - 16, 16): + # Always take steps of 16 indices since masks will be reading + # 16 bytes even if we only use 4 from the last read. let srcPos = p + dx * x.float32 + dy * y.float32 sx = srcPos.x.int sy = srcPos.y.int - backdrop = mm_loadu_si128(a.data[a.dataIndex(x, y)].addr) when type(b) is Image: - let source = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) + for q in [0, 4, 8, 12]: + let + backdrop = mm_loadu_si128(a.data[a.dataIndex(x + q, y)].addr) + source = mm_loadu_si128(b.data[b.dataIndex(sx + q, sy)].addr) + mm_storeu_si128( + a.data[a.dataIndex(x + q, y)].addr, + blenderSimd(backdrop, source) + ) else: # b is a Mask - # Need to move 4 mask values into the alpha slots - var source = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) - source = unpackAlphaValues(source) - - mm_storeu_si128( - a.data[a.dataIndex(x, y)].addr, - blenderSimd(backdrop, source) - ) - x += 4 + var values = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) + for q in [0, 4, 8, 12]: + let + backdrop = mm_loadu_si128(a.data[a.dataIndex(x + q, y)].addr) + source = unpackAlphaValues(values) + mm_storeu_si128( + a.data[a.dataIndex(x + q, y)].addr, + blenderSimd(backdrop, source) + ) + # Shuffle 32 bits off for the next iteration + values = mm_srli_si128(values, 4) + x += 16 else: # is a Mask if blendMode.hasSimdMasker(): let maskerSimd = blendMode.maskerSimd() diff --git a/tests/benchmark_images_draw.nim b/tests/benchmark_images_draw.nim index 3ae2443..34aabd1 100644 --- a/tests/benchmark_images_draw.nim +++ b/tests/benchmark_images_draw.nim @@ -95,7 +95,7 @@ block: timeIt "shadow": b.fill(rgba(0, 0, 0, 255)) - a.draw(b, vec2(25, 25)) + a.draw(b, translate(vec2(25, 25))) let shadow = a.shadow( offset = vec2(0, 0),