From c17a27041b6446df72d2b50f3a3584dd0b0c8da7 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Fri, 29 Jul 2022 13:10:03 -0500
Subject: [PATCH] faster sse2 minifyBy2

---
 src/pixie/simd/sse2.nim | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
index 33db05d..08b4dc1 100644
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -350,19 +350,32 @@ proc minifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
       if srcWidthIsOdd: resultEvenWidth + 1 else: resultEvenWidth,
       if srcHeightIsOdd: resultEvenHeight + 1 else: resultEvenHeight
     )
-    let oddMask = mm_set1_epi16(0xff00)
+    let
+      oddMask = mm_set1_epi16(0xff00)
+      loMask = mm_set_epi32(0, 0, uint32.high, uint32.high)
+      hiMask = mm_set_epi32(uint32.high, uint32.high, 0, 0)
     for y in 0 ..< resultEvenHeight:
       let
         topRowStart = src.dataIndex(0, y * 2)
         bottomRowStart = src.dataIndex(0, y * 2 + 1)
 
+      template loadEven(src: Image, idx: int): M128i = # gathers src.data[idx], [idx+2], [idx+4], [idx+6] into one vector
+        var
+          a = mm_loadu_si128(src.data[idx].addr) # unaligned 128-bit load starting at element idx
+          b = mm_loadu_si128(src.data[idx + 4].addr) # second load 4 elements later; contiguous with `a` if elements are 4 bytes (TODO confirm)
+        a = mm_shuffle_epi32(a, MM_SHUFFLE(3, 3, 2, 0)) # 32-bit lanes 0,1 <- a[0], a[2]; lanes 2,3 are filler
+        b = mm_shuffle_epi32(b, MM_SHUFFLE(2, 0, 3, 3)) # 32-bit lanes 2,3 <- b[0], b[2]; lanes 0,1 are filler
+        a = mm_and_si128(a, loMask) # zero the filler in lanes 2,3
+        b = mm_and_si128(b, hiMask) # zero the filler in lanes 0,1
+        mm_or_si128(a, b) # result lanes: [a0, a2, b0, b2]
+
       var x: int
-      while x <= resultEvenWidth - 4:
+      while x <= resultEvenWidth - 9:
         let
-          top = mm_loadu_si128(src.data[topRowStart + x * 2].addr)
-          bottom = mm_loadu_si128(src.data[bottomRowStart + x * 2].addr)
-          topShifted = mm_srli_si128(top, 4)
-          bottomShifted = mm_srli_si128(bottom, 4)
+          top = loadEven(src, topRowStart + x * 2)
+          bottom = loadEven(src, bottomRowStart + x * 2)
+          topShifted = loadEven(src, topRowStart + x * 2 + 1)
+          bottomShifted = loadEven(src, bottomRowStart + x * 2 + 1)
           topEven = mm_andnot_si128(oddMask, top)
           topOdd = mm_srli_epi16(top, 8)
           bottomEven = mm_andnot_si128(oddMask, bottom)
@@ -380,12 +393,8 @@ proc minifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
           addedEvenDiv4 = mm_srli_epi16(addedEven, 2)
           addedOddDiv4 = mm_srli_epi16(addedOdd, 2)
           merged = mm_or_si128(addedEvenDiv4, mm_slli_epi16(addedOddDiv4, 8))
-          # Merged has the correct values for the next two pixels at
-          # index 0 and 2 so shift 0 and 2 into position and store
-          shuffled = mm_shuffle_epi32(merged, MM_SHUFFLE(3, 3, 2, 0))
-          lower = mm_cvtsi128_si64(shuffled)
-        copyMem(result.data[result.dataIndex(x, y)].addr, lower.unsafeAddr, 8)
-        x += 2
+        mm_storeu_si128(result.data[result.dataIndex(x, y)].addr, merged)
+        x += 4
 
       for x in x ..< resultEvenWidth:
         let