From 51d38c7e37dee6f80492da457cade17d03c213ce Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Fri, 18 Jun 2021 16:03:25 -0500 Subject: [PATCH] 2x faster mask minifyBy2 --- src/pixie/masks.nim | 73 ++++++++++++++++++++++++++-- tests/images/masks/minifiedBlur.png | Bin 0 -> 1371 bytes tests/test_masks.nim | 8 +++ 3 files changed, 76 insertions(+), 5 deletions(-) create mode 100644 tests/images/masks/minifiedBlur.png diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim index d21b0be..763718c 100644 --- a/src/pixie/masks.nim +++ b/src/pixie/masks.nim @@ -73,17 +73,80 @@ proc minifyBy2*(mask: Mask, power = 1): Mask = if power == 0: return mask.copy() + var src = mask for i in 1 .. power: result = newMask(mask.width div 2, mask.height div 2) for y in 0 ..< result.height: - for x in 0 ..< result.width: + var x: int + when defined(amd64) and not defined(pixieNoSimd): + let + oddMask = mm_set1_epi16(cast[int16](0xff00)) + first8 = cast[M128i]([uint8.high, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + for _ in countup(0, result.width - 16, 8): + let + top = mm_loadu_si128(src.data[src.dataIndex(x * 2, y * 2 + 0)].addr) + btm = mm_loadu_si128(src.data[src.dataIndex(x * 2, y * 2 + 1)].addr) + topShifted = mm_srli_si128(top, 1) + btmShifted = mm_srli_si128(btm, 1) + + topEven = mm_andnot_si128(oddMask, top) + topOdd = mm_srli_epi16(mm_and_si128(top, oddMask), 8) + btmEven = mm_andnot_si128(oddMask, btm) + btmOdd = mm_srli_epi16(mm_and_si128(btm, oddMask), 8) + + topShiftedEven = mm_andnot_si128(oddMask, topShifted) + topShiftedOdd = mm_srli_epi16(mm_and_si128(topShifted, oddMask), 8) + btmShiftedEven = mm_andnot_si128(oddMask, btmShifted) + btmShiftedOdd = mm_srli_epi16(mm_and_si128(btmShifted, oddMask), 8) + + topAddedEven = mm_add_epi16(topEven, topShiftedEven) + btmAddedEven = mm_add_epi16(btmEven, btmShiftedEven) + topAddedOdd = mm_add_epi16(topOdd, topShiftedOdd) + bottomAddedOdd = mm_add_epi16(btmOdd, btmShiftedOdd) + + addedEven = mm_add_epi16(topAddedEven, btmAddedEven) + addedOdd = mm_add_epi16(topAddedOdd, bottomAddedOdd) + + addedEvenDiv4 = mm_srli_epi16(addedEven, 2) + addedOddDiv4 = mm_srli_epi16(addedOdd, 2) + + merged = mm_or_si128(addedEvenDiv4, mm_slli_epi16(addedOddDiv4, 8)) + + # merged has the correct values in the even indices + + a = mm_and_si128(merged, first8) + b = mm_and_si128(mm_srli_si128(merged, 2), first8) + c = mm_and_si128(mm_srli_si128(merged, 4), first8) + d = mm_and_si128(mm_srli_si128(merged, 6), first8) + e = mm_and_si128(mm_srli_si128(merged, 8), first8) + f = mm_and_si128(mm_srli_si128(merged, 10), first8) + g = mm_and_si128(mm_srli_si128(merged, 12), first8) + h = mm_and_si128(mm_srli_si128(merged, 14), first8) + + ab = mm_or_si128(a, mm_slli_si128(b, 1)) + cd = mm_or_si128(c, mm_slli_si128(d, 1)) + ef = mm_or_si128(e, mm_slli_si128(f, 1)) + gh = mm_or_si128(g, mm_slli_si128(h, 1)) + + abcd = mm_or_si128(ab, mm_slli_si128(cd, 2)) + efgh = mm_or_si128(ef, mm_slli_si128(gh, 2)) + + abcdefgh = mm_or_si128(abcd, mm_slli_si128(efgh, 4)) + + mm_storeu_si128(result.data[result.dataIndex(x, y)].addr, abcdefgh) + x += 8 + + for x in x ..< result.width: let value = - mask.getValueUnsafe(x * 2 + 0, y * 2 + 0).uint32 + - mask.getValueUnsafe(x * 2 + 1, y * 2 + 0) + - mask.getValueUnsafe(x * 2 + 1, y * 2 + 1) + - mask.getValueUnsafe(x * 2 + 0, y * 2 + 1) + src.getValueUnsafe(x * 2 + 0, y * 2 + 0).uint32 + + src.getValueUnsafe(x * 2 + 1, y * 2 + 0) + + src.getValueUnsafe(x * 2 + 1, y * 2 + 1) + + src.getValueUnsafe(x * 2 + 0, y * 2 + 1) result.setValueUnsafe(x, y, (value div 4).uint8) + # Set src as this result for if we do another power + src = result + proc fillUnsafe*(data: var seq[uint8], value: uint8, start, len: int) = ## Fills the mask data with the parameter value starting at index start and ## continuing for len indices. diff --git a/tests/images/masks/minifiedBlur.png b/tests/images/masks/minifiedBlur.png new file mode 100644 index 0000000000000000000000000000000000000000..047ddca9620741c8a768e2851e54df0b7e17f5b2 GIT binary patch literal 1371 zcmV-h1*H0kP)Ghr;@BwlzK`W-tzpnk=@1wUKMYRf9e1_B z$iZD!;2?omT<{g|ydSyCMl0vN92=ku)2#rphDZ${Mo}0?d1&QFKSFZO5-wv+6O7{} z4ClQc!x%yclRVh!)DO&i?}qL~WQ{;1SOh9eVVb6Cp4sXp$Auj?IGK0T1k)jd8@%@% z|2!}AvdDJUYZ8thox_O%(%?8_$ZY4wF--HaEX&H~Db0i+gpqiGFs;F8KuBJUO!G3Y z>$djCQ>=NOLl{SlnAT2#G$1gX8$*~uZ0m;BXGEN*DM++TGiN)&80N%U>s$}kx~(tU z%eHM|=82F^^Mny+ZENRwo{gJ0v7~PZ^E;zco{85y5v>nKJ!|*kBwT(B)3mG`K04!> z1enB#?>_2r)QTB#!h+@fH6t%C!XQ%5oGhf48?0%y9^8(~thDZA9H)tC2l28k>oOBw zt->^pesDu;_&CaO9H&r#I0=_PdS($^{i=eE?&GXSDJ7R4F>&GsoXqpGuG{v)jFX7u zyHc`hS#zFeDLLm{N~s6=_2ER$k{7afAxFZ+jT^tn1Rx zTJQD0Spy3^jGR!;MKY{I`ngE7guz)mvRc*rB<~bQWM6AL@VV&d!yHd>6-<*e{%@lJ3I6_)CxpNIqvbX zLjiKoLZOUqsiQ-ocS+G128X3oq1BM%zQ+fQt1jHciKQ}3xt>k~Emg|YQhQY^F(Gk( zpnZ>5a$QcC6h!}HFy~p1TI9e~N|}nVFkT;z4<{W$WtdJQjfTOt*2s1W?pj5Ji`NcG z9~EvR@k0LUtHH>*w{f|b^tK=SBt#-0X7_Tr=!O7YC zjCQzg^dRXPl$1zzy&n61F&;aujpLxv*qy)_>|p3~lI=`jq^PeKGxB&q6C*Hyn0^Ha z*B6jiv_I&7JjGym9ulmak6LSQW->Y#Vz81(`vW5~iCTU5jxos!EYa$%^6rHeJ