From c17a27041b6446df72d2b50f3a3584dd0b0c8da7 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Fri, 29 Jul 2022 13:10:03 -0500 Subject: [PATCH] faster sse2 minifyBy2 --- src/pixie/simd/sse2.nim | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index 33db05d..08b4dc1 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -350,19 +350,32 @@ proc minifyBy2Sse2*(image: Image, power = 1): Image {.simd.} = if srcWidthIsOdd: resultEvenWidth + 1 else: resultEvenWidth, if srcHeightIsOdd: resultEvenHeight + 1 else: resultEvenHeight ) - let oddMask = mm_set1_epi16(0xff00) + let + oddMask = mm_set1_epi16(0xff00) + loMask = mm_set_epi32(0, 0, uint32.high, uint32.high) + hiMask = mm_set_epi32(uint32.high, uint32.high, 0, 0) for y in 0 ..< resultEvenHeight: let topRowStart = src.dataIndex(0, y * 2) bottomRowStart = src.dataIndex(0, y * 2 + 1) + template loadEven(src: Image, idx: int): M128i = + var + a = mm_loadu_si128(src.data[idx].addr) + b = mm_loadu_si128(src.data[idx + 4].addr) + a = mm_shuffle_epi32(a, MM_SHUFFLE(3, 3, 2, 0)) + b = mm_shuffle_epi32(b, MM_SHUFFLE(2, 0, 3, 3)) + a = mm_and_si128(a, loMask) + b = mm_and_si128(b, hiMask) + mm_or_si128(a, b) + var x: int - while x <= resultEvenWidth - 4: + while x <= resultEvenWidth - 9: let - top = mm_loadu_si128(src.data[topRowStart + x * 2].addr) - bottom = mm_loadu_si128(src.data[bottomRowStart + x * 2].addr) - topShifted = mm_srli_si128(top, 4) - bottomShifted = mm_srli_si128(bottom, 4) + top = loadEven(src, topRowStart + x * 2) + bottom = loadEven(src, bottomRowStart + x * 2) + topShifted = loadEven(src, topRowStart + x * 2 + 1) + bottomShifted = loadEven(src, bottomRowStart + x * 2 + 1) topEven = mm_andnot_si128(oddMask, top) topOdd = mm_srli_epi16(top, 8) bottomEven = mm_andnot_si128(oddMask, bottom) @@ -380,12 +393,8 @@ proc minifyBy2Sse2*(image: Image, power = 1): Image {.simd.} = addedEvenDiv4 = mm_srli_epi16(addedEven, 2) addedOddDiv4 = mm_srli_epi16(addedOdd, 2) merged = mm_or_si128(addedEvenDiv4, mm_slli_epi16(addedOddDiv4, 8)) - # Merged has the correct values for the next two pixels at - # index 0 and 2 so shift 0 and 2 into position and store - shuffled = mm_shuffle_epi32(merged, MM_SHUFFLE(3, 3, 2, 0)) - lower = mm_cvtsi128_si64(shuffled) - copyMem(result.data[result.dataIndex(x, y)].addr, lower.unsafeAddr, 8) - x += 2 + mm_storeu_si128(result.data[result.dataIndex(x, y)].addr, merged) + x += 4 for x in x ..< resultEvenWidth: let