faster sse2 minifyBy2

This commit is contained in:
Ryan Oldenburg 2022-07-29 13:10:03 -05:00
parent d0b4befd2f
commit c17a27041b

View file

@ -350,19 +350,32 @@ proc minifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
if srcWidthIsOdd: resultEvenWidth + 1 else: resultEvenWidth, if srcWidthIsOdd: resultEvenWidth + 1 else: resultEvenWidth,
if srcHeightIsOdd: resultEvenHeight + 1 else: resultEvenHeight if srcHeightIsOdd: resultEvenHeight + 1 else: resultEvenHeight
) )
let oddMask = mm_set1_epi16(0xff00) let
oddMask = mm_set1_epi16(0xff00)
loMask = mm_set_epi32(0, 0, uint32.high, uint32.high)
hiMask = mm_set_epi32(uint32.high, uint32.high, 0, 0)
for y in 0 ..< resultEvenHeight: for y in 0 ..< resultEvenHeight:
let let
topRowStart = src.dataIndex(0, y * 2) topRowStart = src.dataIndex(0, y * 2)
bottomRowStart = src.dataIndex(0, y * 2 + 1) bottomRowStart = src.dataIndex(0, y * 2 + 1)
template loadEven(src: Image, idx: int): M128i =
  ## Packs every other 32-bit lane of two adjacent unaligned 128-bit loads
  ## (from `src.data[idx]` and `src.data[idx + 4]`) into a single vector.
  ##
  ## Result lanes, low to high: lane 0 and lane 2 of the first load, then
  ## lane 0 and lane 2 of the second load.
  ##
  ## Relies on `loMask` / `hiMask` from the enclosing scope, which keep the
  ## low and the high two 32-bit lanes respectively.
  ## NOTE(review): presumably each `src.data` element is 4 bytes, so the two
  ## loads cover 8 consecutive elements - confirm against the Image type.
  # First load: lanes 0 and 2 are moved into the low 64 bits, rest cleared.
  # Second load: lanes 0 and 2 are moved into the high 64 bits, rest cleared.
  let
    lowerHalf = mm_and_si128(
      mm_shuffle_epi32(
        mm_loadu_si128(src.data[idx].addr),
        MM_SHUFFLE(3, 3, 2, 0)
      ),
      loMask
    )
    upperHalf = mm_and_si128(
      mm_shuffle_epi32(
        mm_loadu_si128(src.data[idx + 4].addr),
        MM_SHUFFLE(2, 0, 3, 3)
      ),
      hiMask
    )
  # The two halves occupy disjoint lanes, so OR simply concatenates them.
  mm_or_si128(lowerHalf, upperHalf)
var x: int var x: int
while x <= resultEvenWidth - 4: while x <= resultEvenWidth - 9:
let let
top = mm_loadu_si128(src.data[topRowStart + x * 2].addr) top = loadEven(src, topRowStart + x * 2)
bottom = mm_loadu_si128(src.data[bottomRowStart + x * 2].addr) bottom = loadEven(src, bottomRowStart + x * 2)
topShifted = mm_srli_si128(top, 4) topShifted = loadEven(src, topRowStart + x * 2 + 1)
bottomShifted = mm_srli_si128(bottom, 4) bottomShifted = loadEven(src, bottomRowStart + x * 2 + 1)
topEven = mm_andnot_si128(oddMask, top) topEven = mm_andnot_si128(oddMask, top)
topOdd = mm_srli_epi16(top, 8) topOdd = mm_srli_epi16(top, 8)
bottomEven = mm_andnot_si128(oddMask, bottom) bottomEven = mm_andnot_si128(oddMask, bottom)
@ -380,12 +393,8 @@ proc minifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
addedEvenDiv4 = mm_srli_epi16(addedEven, 2) addedEvenDiv4 = mm_srli_epi16(addedEven, 2)
addedOddDiv4 = mm_srli_epi16(addedOdd, 2) addedOddDiv4 = mm_srli_epi16(addedOdd, 2)
merged = mm_or_si128(addedEvenDiv4, mm_slli_epi16(addedOddDiv4, 8)) merged = mm_or_si128(addedEvenDiv4, mm_slli_epi16(addedOddDiv4, 8))
# Merged has the correct values for the next two pixels at mm_storeu_si128(result.data[result.dataIndex(x, y)].addr, merged)
# index 0 and 2 so shift 0 and 2 into position and store x += 4
shuffled = mm_shuffle_epi32(merged, MM_SHUFFLE(3, 3, 2, 0))
lower = mm_cvtsi128_si64(shuffled)
copyMem(result.data[result.dataIndex(x, y)].addr, lower.unsafeAddr, 8)
x += 2
for x in x ..< resultEvenWidth: for x in x ..< resultEvenWidth:
let let