From 6ba8ec472fc88b1356eea75a0130f4bca84170a8 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Wed, 15 Dec 2021 12:21:26 -0600 Subject: [PATCH 1/2] simd draw fast paths --- src/pixie/blends.nim | 15 ++- src/pixie/images.nim | 218 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 190 insertions(+), 43 deletions(-) diff --git a/src/pixie/blends.nim b/src/pixie/blends.nim index b9f148b..e0683fd 100644 --- a/src/pixie/blends.nim +++ b/src/pixie/blends.nim @@ -518,7 +518,7 @@ when defined(amd64) and not defined(pixieNoSimd): proc blendNormalSimd(backdrop, source: M128i): M128i = blendNormalInlineSimd(backdrop, source) - proc blendMaskSimd(backdrop, source: M128i): M128i = + proc blendMaskInlineSimd*(backdrop, source: M128i): M128i {.inline.} = let alphaMask = mm_set1_epi32(cast[int32](0xff000000)) oddMask = mm_set1_epi16(cast[int16](0xff00)) @@ -539,6 +539,9 @@ when defined(amd64) and not defined(pixieNoSimd): mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) + proc blendMaskSimd(backdrop, source: M128i): M128i = + blendMaskInlineSimd(backdrop, source) + proc blendOverwriteSimd(backdrop, source: M128i): M128i = source @@ -555,7 +558,7 @@ when defined(amd64) and not defined(pixieNoSimd): ## Is there a blend function for a given blend mode with SIMD support? blendMode in {bmNormal, bmMask, bmOverwrite} - proc maskNormalSimd(backdrop, source: M128i): M128i = + proc maskNormalInlineSimd*(backdrop, source: M128i): M128i {.inline.} = ## Blending masks let oddMask = mm_set1_epi16(cast[int16](0xff00)) @@ -592,7 +595,10 @@ when defined(amd64) and not defined(pixieNoSimd): mm_or_si128(blendedEven, mm_slli_epi16(blendedOdd, 8)) - proc maskMaskSimd(backdrop, source: M128i): M128i = + proc maskNormalSimd(backdrop, source: M128i): M128i = + maskNormalInlineSimd(backdrop, source) + + proc maskMaskInlineSimd*(backdrop, source: M128i): M128i = let oddMask = mm_set1_epi16(cast[int16](0xff00)) div255 = mm_set1_epi16(cast[int16](0x8081)) @@ -613,6 +619,9 @@ when defined(amd64) and not defined(pixieNoSimd): mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) + proc maskMaskSimd(backdrop, source: M128i): M128i = + maskMaskInlineSimd(backdrop, source) + proc maskerSimd*(blendMode: BlendMode): MaskerSimd {.raises: [PixieError].} = ## Returns a blend masking function with SIMD support. case blendMode: diff --git a/src/pixie/images.nim b/src/pixie/images.nim index 0fa788c..723a23d 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -798,62 +798,200 @@ proc drawUber( continue when defined(amd64) and not defined(pixieNoSimd): - # Check we are not rotated - when type(a) is Image: - if blendMode.hasSimdBlender(): - let blenderSimd = blendMode.blenderSimd() - for _ in 0 ..< (xStop - xStart) div 16: - let - srcPos = p + dx * x.float32 + dy * y.float32 - sx = srcPos.x.int - sy = srcPos.y.int + case blendMode: + of bmOverwrite: + for _ in 0 ..< (xStop - xStart) div 16: + let + srcPos = p + dx * x.float32 + dy * y.float32 + sx = srcPos.x.int + sy = srcPos.y.int + when type(a) is Image: when type(b) is Image: for q in [0, 4, 8, 12]: - let - backdrop = mm_loadu_si128(a.data[a.dataIndex(x + q, y)].addr) - source = mm_loadu_si128(b.data[b.dataIndex(sx + q, sy)].addr) - mm_storeu_si128( - a.data[a.dataIndex(x + q, y)].addr, - blenderSimd(backdrop, source) - ) + let sourceVec = mm_loadu_si128(b.data[b.dataIndex(sx + q, sy)].addr) + mm_storeu_si128(a.data[a.dataIndex(x + q, y)].addr, sourceVec) else: # b is a Mask var values = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) for q in [0, 4, 8, 12]: - let - backdrop = mm_loadu_si128(a.data[a.dataIndex(x + q, y)].addr) - source = unpackAlphaValues(values) - mm_storeu_si128( - a.data[a.dataIndex(x + q, y)].addr, - blenderSimd(backdrop, source) - ) + let sourceVec = unpackAlphaValues(values) + mm_storeu_si128(a.data[a.dataIndex(x + q, y)].addr, sourceVec) # Shuffle 32 bits off for the next iteration values = mm_srli_si128(values, 4) - x += 16 - else: # is a Mask - if blendMode.hasSimdMasker(): - let maskerSimd = blendMode.maskerSimd() - for _ in 0 ..< (xStop - xStart) div 16: - let - srcPos = p + dx * x.float32 + dy * y.float32 - sx = srcPos.x.int - sy = srcPos.y.int - backdrop = mm_loadu_si128(a.data[a.dataIndex(x, y)].addr) + else: # a is a Mask when type(b) is Image: - # Need to read 16 colors and pack their alpha values - let + var i = mm_loadu_si128(b.data[b.dataIndex(sx + 0, sy)].addr) j = mm_loadu_si128(b.data[b.dataIndex(sx + 4, sy)].addr) k = mm_loadu_si128(b.data[b.dataIndex(sx + 8, sy)].addr) l = mm_loadu_si128(b.data[b.dataIndex(sx + 12, sy)].addr) - source = pack4xAlphaValues(i, j, k, l) + let sourceVec = pack4xAlphaValues(i, j, k, l) else: # b is a Mask - let source = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) - + let sourceVec = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) + mm_storeu_si128(a.data[a.dataIndex(x, y)].addr, sourceVec) + x += 16 + of bmNormal: + let vec255 = mm_set1_epi32(cast[int32](uint32.high)) + for _ in 0 ..< (xStop - xStart) div 16: + let + srcPos = p + dx * x.float32 + dy * y.float32 + sx = srcPos.x.int + sy = srcPos.y.int + when type(a) is Image: + when type(b) is Image: + for q in [0, 4, 8, 12]: + let sourceVec = mm_loadu_si128(b.data[b.dataIndex(sx + q, sy)].addr) + if mm_movemask_epi8(mm_cmpeq_epi8(sourceVec, mm_setzero_si128())) != 0xffff: + if (mm_movemask_epi8(mm_cmpeq_epi8(sourceVec, vec255)) and 0x8888) == 0x8888: + mm_storeu_si128(a.data[a.dataIndex(x + q, y)].addr, sourceVec) + else: + let backdropVec = mm_loadu_si128(a.data[a.dataIndex(x + q, y)].addr) + mm_storeu_si128( + a.data[a.dataIndex(x + q, y)].addr, + blendNormalInlineSimd(backdropVec, sourceVec) + ) + else: # b is a Mask + var values = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) + for q in [0, 4, 8, 12]: + let sourceVec = unpackAlphaValues(values) + if mm_movemask_epi8(mm_cmpeq_epi8(sourceVec, mm_setzero_si128())) != 0xffff: + if (mm_movemask_epi8(mm_cmpeq_epi8(sourceVec, vec255)) and 0x8888) == 0x8888: + discard + else: + let backdropVec = mm_loadu_si128(a.data[a.dataIndex(x + q, y)].addr) + mm_storeu_si128( + a.data[a.dataIndex(x + q, y)].addr, + blendNormalInlineSimd(backdropVec, sourceVec) + ) + # Shuffle 32 bits off for the next iteration + values = mm_srli_si128(values, 4) + else: # a is a Mask + let backdropVec = mm_loadu_si128(a.data[a.dataIndex(x, y)].addr) + when type(b) is Image: + var + i = mm_loadu_si128(b.data[b.dataIndex(sx + 0, sy)].addr) + j = mm_loadu_si128(b.data[b.dataIndex(sx + 4, sy)].addr) + k = mm_loadu_si128(b.data[b.dataIndex(sx + 8, sy)].addr) + l = mm_loadu_si128(b.data[b.dataIndex(sx + 12, sy)].addr) + let sourceVec = pack4xAlphaValues(i, j, k, l) + else: # b is a Mask + let sourceVec = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) mm_storeu_si128( a.data[a.dataIndex(x, y)].addr, - maskerSimd(backdrop, source) + maskNormalInlineSimd(backdropVec, sourceVec) ) - x += 16 + x += 16 + of bmMask: + let vec255 = mm_set1_epi32(cast[int32](uint32.high)) + for _ in 0 ..< (xStop - xStart) div 16: + let + srcPos = p + dx * x.float32 + dy * y.float32 + sx = srcPos.x.int + sy = srcPos.y.int + when type(a) is Image: + when type(b) is Image: + for q in [0, 4, 8, 12]: + let sourceVec = mm_loadu_si128(b.data[b.dataIndex(sx + q, sy)].addr) + if mm_movemask_epi8(mm_cmpeq_epi8(sourceVec, mm_setzero_si128())) == 0xffff: + mm_storeu_si128(a.data[a.dataIndex(x + q, y)].addr, mm_setzero_si128()) + elif mm_movemask_epi8(mm_cmpeq_epi8(sourceVec, vec255)) != 0xffff: + let backdropVec = mm_loadu_si128(a.data[a.dataIndex(x + q, y)].addr) + mm_storeu_si128( + a.data[a.dataIndex(x + q, y)].addr, + blendMaskInlineSimd(backdropVec, sourceVec) + ) + else: # b is a Mask + var values = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) + for q in [0, 4, 8, 12]: + let sourceVec = unpackAlphaValues(values) + if mm_movemask_epi8(mm_cmpeq_epi8(sourceVec, mm_setzero_si128())) == 0xffff: + mm_storeu_si128(a.data[a.dataIndex(x + q, y)].addr, mm_setzero_si128()) + elif (mm_movemask_epi8(mm_cmpeq_epi8(sourceVec, vec255)) and 0x8888) != 0x8888: + let backdropVec = mm_loadu_si128(a.data[a.dataIndex(x + q, y)].addr) + mm_storeu_si128( + a.data[a.dataIndex(x + q, y)].addr, + blendMaskInlineSimd(backdropVec, sourceVec) + ) + # Shuffle 32 bits off for the next iteration + values = mm_srli_si128(values, 4) + else: # a is a Mask + let backdropVec = mm_loadu_si128(a.data[a.dataIndex(x, y)].addr) + when type(b) is Image: + var + i = mm_loadu_si128(b.data[b.dataIndex(sx + 0, sy)].addr) + j = mm_loadu_si128(b.data[b.dataIndex(sx + 4, sy)].addr) + k = mm_loadu_si128(b.data[b.dataIndex(sx + 8, sy)].addr) + l = mm_loadu_si128(b.data[b.dataIndex(sx + 12, sy)].addr) + i = packAlphaValues(i) + j = packAlphaValues(j) + k = packAlphaValues(k) + l = packAlphaValues(l) + j = mm_slli_si128(j, 4) + k = mm_slli_si128(k, 8) + l = mm_slli_si128(l, 12) + let sourceVec = mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l)) + else: # b is a Mask + let sourceVec = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) + mm_storeu_si128( + a.data[a.dataIndex(x, y)].addr, + maskMaskInlineSimd(backdropVec, sourceVec) + ) + x += 16 + else: + when type(a) is Image: + if blendMode.hasSimdBlender(): + let blenderSimd = blendMode.blenderSimd() + for _ in 0 ..< (xStop - xStart) div 16: + let + srcPos = p + dx * x.float32 + dy * y.float32 + sx = srcPos.x.int + sy = srcPos.y.int + when type(b) is Image: + for q in [0, 4, 8, 12]: + let + backdrop = mm_loadu_si128(a.data[a.dataIndex(x + q, y)].addr) + source = mm_loadu_si128(b.data[b.dataIndex(sx + q, sy)].addr) + mm_storeu_si128( + a.data[a.dataIndex(x + q, y)].addr, + blenderSimd(backdrop, source) + ) + else: # b is a Mask + var values = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) + for q in [0, 4, 8, 12]: + let + backdrop = mm_loadu_si128(a.data[a.dataIndex(x + q, y)].addr) + source = unpackAlphaValues(values) + mm_storeu_si128( + a.data[a.dataIndex(x + q, y)].addr, + blenderSimd(backdrop, source) + ) + # Shuffle 32 bits off for the next iteration + values = mm_srli_si128(values, 4) + x += 16 + else: # is a Mask + if blendMode.hasSimdMasker(): + let maskerSimd = blendMode.maskerSimd() + for _ in 0 ..< (xStop - xStart) div 16: + let + srcPos = p + dx * x.float32 + dy * y.float32 + sx = srcPos.x.int + sy = srcPos.y.int + backdrop = mm_loadu_si128(a.data[a.dataIndex(x, y)].addr) + when type(b) is Image: + # Need to read 16 colors and pack their alpha values + let + i = mm_loadu_si128(b.data[b.dataIndex(sx + 0, sy)].addr) + j = mm_loadu_si128(b.data[b.dataIndex(sx + 4, sy)].addr) + k = mm_loadu_si128(b.data[b.dataIndex(sx + 8, sy)].addr) + l = mm_loadu_si128(b.data[b.dataIndex(sx + 12, sy)].addr) + source = pack4xAlphaValues(i, j, k, l) + else: # b is a Mask + let source = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) + + mm_storeu_si128( + a.data[a.dataIndex(x, y)].addr, + maskerSimd(backdrop, source) + ) + x += 16 var srcPos = p + dx * x.float32 + dy * y.float32 srcPos = vec2( From bfa1f8b3a0e62653f235d463eea678067b11d01b Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Wed, 15 Dec 2021 12:27:49 -0600 Subject: [PATCH 2/2] use --- src/pixie/images.nim | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/pixie/images.nim b/src/pixie/images.nim index 723a23d..de63539 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -921,14 +921,7 @@ proc drawUber( j = mm_loadu_si128(b.data[b.dataIndex(sx + 4, sy)].addr) k = mm_loadu_si128(b.data[b.dataIndex(sx + 8, sy)].addr) l = mm_loadu_si128(b.data[b.dataIndex(sx + 12, sy)].addr) - i = packAlphaValues(i) - j = packAlphaValues(j) - k = packAlphaValues(k) - l = packAlphaValues(l) - j = mm_slli_si128(j, 4) - k = mm_slli_si128(k, 8) - l = mm_slli_si128(l, 12) - let sourceVec = mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l)) + let sourceVec = pack4xAlphaValues(i, j, k, l) else: # b is a Mask let sourceVec = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) mm_storeu_si128(