From c4cd380676a73b0b5ac3663c51d502e541cad88e Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg
Date: Sat, 30 Jul 2022 23:14:02 -0500
Subject: [PATCH] align blit simd blends

Blend leading pixels one at a time until the destination pointer is
32-byte (AVX2) or 16-byte (SSE2) aligned, so the vector loop can use
aligned loads and stores on the backdrop. The scalar prologue is bounded
by len so a line shorter than the distance to the next aligned address
never reads or writes past its end.
---
Review note: the prologue loops were amended to check `i < len`; the
post-image blob ids on the `index` lines are those of the original commit.

 src/pixie/simd/avx2.nim | 22 ++++++++++++++--------
 src/pixie/simd/sse2.nim | 22 ++++++++++++++--------
 2 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim
index 7bd9c9c..9375075 100644
--- a/src/pixie/simd/avx2.nim
+++ b/src/pixie/simd/avx2.nim
@@ -383,6 +383,11 @@ proc minifyBy2Avx2*(image: Image, power = 1): Image {.simd.} =
 proc blitLineNormalAvx2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
+  var i: int
+  while i < len and (cast[uint](a[i].addr) and 31) != 0:
+    a[i] = blendNormal(a[i], b[i])
+    inc i
+
   let
     alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
     oddMask = mm256_set1_epi16(cast[int16](0xff00))
@@ -393,8 +398,6 @@ proc blitLineNormalAvx2*(
       15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1,
       15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1
     )
-
-  var i: int
   while i < len - 8:
     let
       source = mm256_loadu_si256(b[i].addr)
@@ -402,7 +405,7 @@ proc blitLineNormalAvx2*(
     if (mm256_movemask_epi8(eq255) and 0x88888888) == 0x88888888: # Opaque source
       mm256_storeu_si256(a[i].addr, source)
     else:
-      let backdrop = mm256_loadu_si256(a[i].addr)
+      let backdrop = mm256_load_si256(a[i].addr)
 
       var sourceAlpha = mm256_and_si256(source, alphaMask)
 
@@ -423,7 +426,7 @@ proc blitLineNormalAvx2*(
         mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
       )
 
-      mm256_storeu_si256(a[i].addr, added)
+      mm256_store_si256(a[i].addr, added)
 
     i += 8
 
@@ -433,6 +436,11 @@ proc blitLineNormalAvx2*(
 proc blitLineMaskAvx2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
+  var i: int
+  while i < len and (cast[uint](a[i].addr) and 31) != 0:
+    a[i] = blendMask(a[i], b[i])
+    inc i
+
   let
     alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
     oddMask = mm256_set1_epi16(cast[int16](0xff00))
@@ -442,8 +450,6 @@ proc blitLineMaskAvx2*(
       15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1,
       15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1
     )
-
-  var i: int
   while i < len - 8:
     let
       source = mm256_loadu_si256(b[i].addr)
@@ -451,7 +457,7 @@ proc blitLineMaskAvx2*(
     if (mm256_movemask_epi8(eq255) and 0x88888888) == 0x88888888: # Opaque source
       discard
     else:
-      let backdrop = mm256_loadu_si256(a[i].addr)
+      let backdrop = mm256_load_si256(a[i].addr)
 
       var sourceAlpha = mm256_and_si256(source, alphaMask)
 
@@ -465,7 +471,7 @@ proc blitLineMaskAvx2*(
       backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
       backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
 
-      mm256_storeu_si256(
+      mm256_store_si256(
         a[i].addr,
         mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
       )
diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
index c8e0dc8..cc77910 100644
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -530,14 +530,17 @@ proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
 proc blitLineNormalSse2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
+  var i: int
+  while i < len and (cast[uint](a[i].addr) and 15) != 0:
+    a[i] = blendNormal(a[i], b[i])
+    inc i
+
   let
     alphaMask = mm_set1_epi32(cast[int32](0xff000000))
     oddMask = mm_set1_epi16(cast[int16](0xff00))
     div255 = mm_set1_epi16(cast[int16](0x8081))
     vec255 = mm_set1_epi8(255)
     vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))
-
-  var i: int
   while i < len - 4:
     let
       source = mm_loadu_si128(b[i].addr)
@@ -545,7 +548,7 @@ proc blitLineNormalSse2*(
     if (mm_movemask_epi8(eq255) and 0x00008888) == 0x00008888: # Opaque source
       mm_storeu_si128(a[i].addr, source)
     else:
-      let backdrop = mm_loadu_si128(a[i].addr)
+      let backdrop = mm_load_si128(a[i].addr)
 
       var sourceAlpha = mm_and_si128(source, alphaMask)
 
@@ -566,7 +569,7 @@ proc blitLineNormalSse2*(
         mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
       )
 
-      mm_storeu_si128(a[i].addr, added)
+      mm_store_si128(a[i].addr, added)
 
     i += 4
 
@@ -576,13 +579,16 @@ proc blitLineNormalSse2*(
 proc blitLineMaskSse2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
+  var i: int
+  while i < len and (cast[uint](a[i].addr) and 15) != 0:
+    a[i] = blendMask(a[i], b[i])
+    inc i
+
   let
     alphaMask = mm_set1_epi32(cast[int32](0xff000000))
     oddMask = mm_set1_epi16(cast[int16](0xff00))
     div255 = mm_set1_epi16(cast[int16](0x8081))
     vec255 = mm_set1_epi8(255)
-
-  var i: int
   while i < len - 4:
     let
       source = mm_loadu_si128(b[i].addr)
@@ -590,7 +596,7 @@ proc blitLineMaskSse2*(
     if (mm_movemask_epi8(eq255) and 0x00008888) == 0x00008888: # Opaque source
       discard
     else:
-      let backdrop = mm_loadu_si128(a[i].addr)
+      let backdrop = mm_load_si128(a[i].addr)
 
       var sourceAlpha = mm_and_si128(source, alphaMask)
 
@@ -604,7 +610,7 @@ proc blitLineMaskSse2*(
       backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
       backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
 
-      mm_storeu_si128(
+      mm_store_si128(
         a[i].addr,
         mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
       )