From 82881ae75b65f162290432e65e53e2e03ab8687e Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Mon, 20 Jun 2022 22:19:11 -0500
Subject: [PATCH] simpler, faster

---
 src/pixie/images.nim | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index 072a8cb..e77f93c 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -262,7 +262,7 @@ proc minifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} =
       when defined(amd64) and allowSimd:
         let
           oddMask = mm_set1_epi16(cast[int16](0xff00))
-          first32 = cast[M128i]([uint32.high, 0, 0, 0])
+          mergedMask = mm_set_epi32(0, uint32.high, 0, uint32.high)
         for _ in countup(0, resultEvenWidth - 4, 2):
           let
             top = mm_loadu_si128(src.data[src.dataIndex(x * 2, y * 2 + 0)].addr)
@@ -271,36 +271,36 @@ proc minifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} =
             btmShifted = mm_srli_si128(btm, 4)
 
             topEven = mm_andnot_si128(oddMask, top)
-            topOdd = mm_srli_epi16(mm_and_si128(top, oddMask), 8)
+            topOdd = mm_srli_epi16(top, 8)
             btmEven = mm_andnot_si128(oddMask, btm)
-            btmOdd = mm_srli_epi16(mm_and_si128(btm, oddMask), 8)
+            btmOdd = mm_srli_epi16(btm, 8)
 
             topShiftedEven = mm_andnot_si128(oddMask, topShifted)
-            topShiftedOdd = mm_srli_epi16(mm_and_si128(topShifted, oddMask), 8)
+            topShiftedOdd = mm_srli_epi16(topShifted, 8)
             btmShiftedEven = mm_andnot_si128(oddMask, btmShifted)
-            btmShiftedOdd = mm_srli_epi16(mm_and_si128(btmShifted, oddMask), 8)
+            btmShiftedOdd = mm_srli_epi16(btmShifted, 8)
 
             topAddedEven = mm_add_epi16(topEven, topShiftedEven)
             btmAddedEven = mm_add_epi16(btmEven, btmShiftedEven)
             topAddedOdd = mm_add_epi16(topOdd, topShiftedOdd)
-            bottomAddedOdd = mm_add_epi16(btmOdd, btmShiftedOdd)
+            btmAddedOdd = mm_add_epi16(btmOdd, btmShiftedOdd)
 
             addedEven = mm_add_epi16(topAddedEven, btmAddedEven)
-            addedOdd = mm_add_epi16(topAddedOdd, bottomAddedOdd)
+            addedOdd = mm_add_epi16(topAddedOdd, btmAddedOdd)
 
             addedEvenDiv4 = mm_srli_epi16(addedEven, 2)
             addedOddDiv4 = mm_srli_epi16(addedOdd, 2)
 
             merged = mm_or_si128(addedEvenDiv4, mm_slli_epi16(addedOddDiv4, 8))
+            # Merged has the correct values for the next two pixels at
+            # index 0 and 2 so mask the others out and shift 0 and 2 into
+            # position and store
+            masked = mm_and_si128(merged, mergedMask)
 
-            # merged [0, 1, 2, 3] has the correct values for the next two pixels
-            # at index 0 and 2 so shift those into position and store
-
-            zero = mm_and_si128(merged, first32)
-            two = mm_and_si128(mm_srli_si128(merged, 8), first32)
-            zeroTwo = mm_or_si128(zero, mm_slli_si128(two, 4))
-
-          mm_storeu_si128(result.data[result.dataIndex(x, y)].addr, zeroTwo)
+          mm_storeu_si128(
+            result.data[result.dataIndex(x, y)].addr,
+            mm_shuffle_epi32(masked, MM_SHUFFLE(0, 0, 2, 0))
+          )
           x += 2
 
       for x in x ..< resultEvenWidth: