From 264b30b118486c63c225d31c5b7cae1fdca0328f Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg
Date: Tue, 9 Feb 2021 16:08:08 -0600
Subject: [PATCH] image.newMask 35% faster with simd

---
 src/pixie/images.nim       | 41 ++++++++++++++++++++++++++++++++++++--
 tests/benchmark_images.nim |  6 ++++++
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index a8648fc..6d900d5 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -373,8 +373,45 @@ proc newMask*(image: Image): Mask =
   ## Returns a new mask using the alpha values of the parameter image.
   result = newMask(image.width, image.height)
 
-  for i, rgba in image.data:
-    result.data[i] = rgba.a
+  var i: int # index of the first pixel whose alpha is not copied yet
+  when defined(amd64) and not defined(pixieNoSimd):
+    let mask32 = cast[M128i]([uint32.high, 0, 0, 0])
+
+    for _ in countup(0, image.data.len - 16, 16): # 16 pixels per pass
+      var
+        a = mm_loadu_si128(image.data[i + 0].addr)
+        b = mm_loadu_si128(image.data[i + 4].addr)
+        c = mm_loadu_si128(image.data[i + 8].addr)
+        d = mm_loadu_si128(image.data[i + 12].addr)
+
+      template pack(v: var M128i) =
+        # Shuffle the alpha values for these 4 colors to the first 4 bytes
+        v = mm_srli_epi32(v, 24)
+        let
+          i = mm_srli_si128(v, 3)
+          j = mm_srli_si128(v, 6)
+          k = mm_srli_si128(v, 9)
+        v = mm_or_si128(mm_or_si128(v, i), mm_or_si128(j, k))
+        v = mm_and_si128(v, mask32) # keep only the low 32 bits
+
+      pack(a)
+      pack(b)
+      pack(c)
+      pack(d)
+
+      b = mm_slli_si128(b, 4) # move b's 4 alphas to bytes 4..7
+      c = mm_slli_si128(c, 8) # move c's 4 alphas to bytes 8..11
+      d = mm_slli_si128(d, 12) # move d's 4 alphas to bytes 12..15
+
+      mm_storeu_si128(
+        result.data[i].addr,
+        mm_or_si128(mm_or_si128(a, b), mm_or_si128(c, d))
+      )
+
+      i += 16
+
+  for j in i ..< image.data.len: # scalar copy of the remaining pixels
+    result.data[j] = image.data[j].a
 
 proc getRgbaSmooth*(image: Image, x, y: float32): ColorRGBA =
   let
diff --git a/tests/benchmark_images.nim b/tests/benchmark_images.nim
index df46382..33a4adf 100644
--- a/tests/benchmark_images.nim
+++ b/tests/benchmark_images.nim
@@ -72,6 +72,12 @@ block:
 
 reset()
 
+timeIt "newMask":
+  let mask = image.newMask()
+  doAssert mask[0, 0] == image[0, 0].a
+
+reset()
+
 timeIt "lerp integers":
   for i in 0 ..< 100000:
     let c = image[0, 0]