From 00f2741aa889ccd1f0da005413ab803556c40652 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 12:12:10 -0500 Subject: [PATCH 01/15] combine fuzzing --- tests/fuzz_image_draw.nim | 28 ++++++++++++++++++++++++++++ tests/fuzz_image_draw_smooth.nim | 31 ------------------------------- 2 files changed, 28 insertions(+), 31 deletions(-) delete mode 100644 tests/fuzz_image_draw_smooth.nim diff --git a/tests/fuzz_image_draw.nim b/tests/fuzz_image_draw.nim index d8d43b8..1271156 100644 --- a/tests/fuzz_image_draw.nim +++ b/tests/fuzz_image_draw.nim @@ -25,3 +25,31 @@ for i in 0 ..< 250: a.draw(b, translate(vec2(translation.x.trunc, translation.y.trunc))) a.draw(b, translate(translation)) + +for i in 0 ..< 25: + let a = newImage(rand(1 .. 20), rand(1 .. 20)) + for j in 0 ..< 25: + let b = newImage(rand(1 .. 20), rand(1 .. 20)) + + let + translation = vec2(rand(25.0), rand(25.0)) - vec2(5, 5) + rotation = rand(2 * PI).float32 + + echo a, " ", b, " ", translation, " ", rotation + + a.draw(b, translate(vec2(translation.x, translation.y))) + a.draw(b, translate(translation) * rotate(rotation)) + +for i in 0 ..< 25: + let a = newImage(rand(1 .. 2000), rand(1 .. 2000)) + for j in 0 ..< 25: + let b = newImage(rand(1 .. 1000), rand(1 .. 1000)) + + let + translation = vec2(rand(2500.0), rand(2500.0)) - vec2(500, 500) + rotation = rand(2 * PI).float32 + + echo a, " ", b, " ", translation, " ", rotation + + a.draw(b, translate(vec2(translation.x, translation.y))) + a.draw(b, translate(translation) * rotate(rotation)) diff --git a/tests/fuzz_image_draw_smooth.nim b/tests/fuzz_image_draw_smooth.nim deleted file mode 100644 index 0a80cd8..0000000 --- a/tests/fuzz_image_draw_smooth.nim +++ /dev/null @@ -1,31 +0,0 @@ -import pixie, random - -randomize() - -for i in 0 ..< 25: - let a = newImage(rand(1 .. 20), rand(1 .. 20)) - for j in 0 ..< 25: - let b = newImage(rand(1 .. 20), rand(1 .. 20)) - - let - translation = vec2(rand(25.0), rand(25.0)) - vec2(5, 5) - rotation = rand(2 * PI).float32 - - echo a, " ", b, " ", translation, " ", rotation - - a.draw(b, translate(vec2(translation.x.trunc, translation.y.trunc))) - a.draw(b, translate(translation) * rotate(rotation)) - -for i in 0 ..< 25: - let a = newImage(rand(1 .. 2000), rand(1 .. 2000)) - for j in 0 ..< 25: - let b = newImage(rand(1 .. 1000), rand(1 .. 1000)) - - let - translation = vec2(rand(2500.0), rand(2500.0)) - vec2(500, 500) - rotation = rand(2 * PI).float32 - - echo a, " ", b, " ", translation, " ", rotation - - a.draw(b, translate(vec2(translation.x.trunc, translation.y.trunc))) - a.draw(b, translate(translation) * rotate(rotation)) From 77b5df9d00862707bad60bd2aaaf466450bfef02 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 12:23:43 -0500 Subject: [PATCH 02/15] morepretty --- src/pixie/images.nim | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/pixie/images.nim b/src/pixie/images.nim index ddc1153..68d5f67 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -1,6 +1,6 @@ import blends, bumpy, chroma, common, internal, simd, vmath -export Image, newImage, copy, dataIndex +export Image, copy, dataIndex, newImage const h = 0.5.float32 @@ -441,11 +441,15 @@ template getUncheckedArray( ): ptr UncheckedArray[ColorRGBX] = cast[ptr UncheckedArray[ColorRGBX]](image.data[image.dataIndex(x, y)].addr) -proc blitLine(a, b: ptr UncheckedArray[ColorRGBX], len: int, blender: Blender) {.inline.} = +proc blitLine( + a, b: ptr UncheckedArray[ColorRGBX], len: int, blender: Blender +) {.inline.} = for i in 0 ..< len: a[i] = blender(a[i], b[i]) -proc blitLineOverwrite(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.inline.} = +proc blitLineOverwrite( + a, b: ptr UncheckedArray[ColorRGBX], len: int +) {.inline.} = copyMem(a[0].addr, b[0].addr, len * 4) proc blitLineNormal(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} = From 04fc992dc48eb61c4fa87c9ee694fe035d8f97db Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 12:51:37 -0500 Subject: [PATCH 03/15] simpler for now --- src/pixie/paths.nim | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index d296bbb..7ffabd2 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1588,24 +1588,6 @@ proc fillCoverage( image.clearUnsafe(0, y, startX, y) image.clearUnsafe(startX + coverages.len, y, image.width, y) - of SubtractMaskBlend: - for x in x ..< startX + coverages.len: - let coverage = coverages[x - startX] - if coverage == 255 and rgbx.a == 255: - image.data[dataIndex] = rgbx(0, 0, 0, 0) - elif coverage != 0: - let backdrop = image.data[dataIndex] - image.data[dataIndex] = blendSubtractMask(backdrop, source(rgbx, coverage)) - inc dataIndex - - of ExcludeMaskBlend: - for x in x ..< startX + coverages.len: - let - coverage = coverages[x - startX] - backdrop = image.data[dataIndex] - image.data[dataIndex] = blendExcludeMask(backdrop, source(rgbx, coverage)) - inc dataIndex - else: let blender = blendMode.blender() for x in x ..< startX + coverages.len: @@ -1658,7 +1640,6 @@ proc fillHits( of MaskBlend: {.linearScanEnd.} - var filledTo = startX for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width): if maskClears: # Clear any gap between this fill and the previous fill @@ -1684,25 +1665,6 @@ proc fillHits( image.clearUnsafe(0, y, startX, y) image.clearUnsafe(filledTo, y, image.width, y) - of SubtractMaskBlend: - for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width): - var dataIndex = image.dataIndex(start, y) - for _ in 0 ..< len: - if rgbx.a == 255: - image.data[dataIndex] = rgbx(0, 0, 0, 0) - else: - let backdrop = image.data[dataIndex] - image.data[dataIndex] = blendSubtractMask(backdrop, rgbx) - inc dataIndex - - of ExcludeMaskBlend: - for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width): - var dataIndex = image.dataIndex(start, y) - for _ in 0 ..< len: - let backdrop = image.data[dataIndex] - image.data[dataIndex] = blendExcludeMask(backdrop, rgbx) - inc dataIndex - else: let blender = blendMode.blender() for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width): From dd7bf9f210a53d1982d6c388fbcff55686ad826f Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 13:31:13 -0500 Subject: [PATCH 04/15] blendLine rgbx --- src/pixie/images.nim | 5 --- src/pixie/internal.nim | 5 +++ src/pixie/paths.nim | 40 ++++++++------------- src/pixie/simd/sse2.nim | 77 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 96 insertions(+), 31 deletions(-) diff --git a/src/pixie/images.nim b/src/pixie/images.nim index 68d5f67..475e8f0 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -436,11 +436,6 @@ proc drawCorrect( blended = blender(backdrop, sample) a.unsafe[x, y] = blended -template getUncheckedArray( - image: Image, x, y: int -): ptr UncheckedArray[ColorRGBX] = - cast[ptr UncheckedArray[ColorRGBX]](image.data[image.dataIndex(x, y)].addr) - proc blitLine( a, b: ptr UncheckedArray[ColorRGBX], len: int, blender: Blender ) {.inline.} = diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index 0120333..a4e9938 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -47,6 +47,11 @@ proc intersectsInside*(a, b: Segment, at: var Vec2): bool {.inline.} = at = a.at + (t * s1) return true +template getUncheckedArray*( + image: Image, x, y: int +): ptr UncheckedArray[ColorRGBX] = + cast[ptr UncheckedArray[ColorRGBX]](image.data[image.dataIndex(x, y)].addr) + proc fillUnsafe*( data: var seq[ColorRGBX], color: SomeColor, start, len: int ) {.hasSimd, raises: [].} = diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index 7ffabd2..2cf7b09 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1597,6 +1597,18 @@ proc fillCoverage( image.data[dataIndex] = blender(backdrop, source(rgbx, coverage)) inc dataIndex +proc blendLineNormal( + line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int +) {.hasSimd.} = + for i in 0 ..< len: + line[i] = blendNormal(line[i], rgbx) + +proc blendLineMask( + line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int +) {.hasSimd.} = + for i in 0 ..< len: + line[i] = blendMask(line[i], rgbx) + proc fillHits( image: Image, rgbx: ColorRGBX, @@ -1607,19 +1619,6 @@ proc fillHits( blendMode: BlendMode, maskClears = true ) = - template simdBlob(image: Image, x: var int, len: int, blendProc: untyped) = - when allowSimd: - when defined(amd64): - var p = cast[uint](image.data[image.dataIndex(x, y)].addr) - let - iterations = len div 4 - colorVec = mm_set1_epi32(cast[int32](rgbx)) - for _ in 0 ..< iterations: - let backdrop = mm_loadu_si128(cast[pointer](p)) - mm_storeu_si128(cast[pointer](p), blendProc(backdrop, colorVec)) - p += 16 - x += iterations * 4 - case blendMode: of OverwriteBlend: for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width): @@ -1630,13 +1629,7 @@ proc fillHits( if rgbx.a == 255: fillUnsafe(image.data, rgbx, image.dataIndex(start, y), len) else: - var x = start - simdBlob(image, x, len, blendNormalSimd) - var dataIndex = image.dataIndex(x, y) - for _ in x ..< start + len: - let backdrop = image.data[dataIndex] - image.data[dataIndex] = blendNormal(backdrop, rgbx) - inc dataIndex + blendLineNormal(image.getUncheckedArray(start, y), rgbx, len) of MaskBlend: {.linearScanEnd.} @@ -1653,12 +1646,7 @@ proc fillHits( ) block: # Handle this fill if rgbx.a != 255: - var x = start - simdBlob(image, x, len, blendMaskSimd) - var dataIndex = image.dataIndex(x, y) - for _ in x ..< start + len: - let backdrop = image.data[dataIndex] - image.data[dataIndex] = blendMask(backdrop, rgbx) + blendLineMask(image.getUncheckedArray(start, y), rgbx, len) filledTo = start + len if maskClears: diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index cc77910..1eead85 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -527,6 +527,47 @@ proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} = result.width * 4 ) +proc blendLineNormalSse2*( + line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int +) {.simd.} = + var i: int + while (cast[uint](line[i].addr) and 15) != 0: + line[i] = blendNormal(line[i], rgbx) + inc i + + let + source = mm_set1_epi32(cast[uint32](rgbx)) + alphaMask = mm_set1_epi32(cast[int32](0xff000000)) + oddMask = mm_set1_epi16(cast[int16](0xff00)) + div255 = mm_set1_epi16(cast[int16](0x8081)) + vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])) + while i < len - 4: + let backdrop = mm_load_si128(line[i].addr) + var + sourceAlpha = mm_and_si128(source, alphaMask) + backdropEven = mm_slli_epi16(backdrop, 8) + backdropOdd = mm_and_si128(backdrop, oddMask) + + sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16)) + + let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha) + + backdropEven = mm_mulhi_epu16(backdropEven, multiplier) + backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier) + backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) + backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) + + let added = mm_add_epi8( + source, + mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) + ) + + mm_store_si128(line[i].addr, added) + i += 4 + + for i in i ..< len: + line[i] = blendNormal(line[i], rgbx) + proc blitLineNormalSse2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = @@ -576,6 +617,42 @@ proc blitLineNormalSse2*( for i in i ..< len: a[i] = blendNormal(a[i], b[i]) +proc blendLineMaskSse2*( + line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int +) {.simd.} = + var i: int + while (cast[uint](line[i].addr) and 15) != 0: + line[i] = blendMask(line[i], rgbx) + inc i + + let + source = mm_set1_epi32(cast[uint32](rgbx)) + alphaMask = mm_set1_epi32(cast[int32](0xff000000)) + oddMask = mm_set1_epi16(cast[int16](0xff00)) + div255 = mm_set1_epi16(cast[int16](0x8081)) + while i < len - 4: + let backdrop = mm_load_si128(line[i].addr) + var + sourceAlpha = mm_and_si128(source, alphaMask) + backdropEven = mm_slli_epi16(backdrop, 8) + backdropOdd = mm_and_si128(backdrop, oddMask) + + sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16)) + + backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha) + backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha) + backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) + backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) + + mm_store_si128( + line[i].addr, + mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) + ) + i += 4 + + for i in i ..< len: + line[i] = blendMask(line[i], rgbx) + proc blitLineMaskSse2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = From 6582f7c4ca15de3ab0b7e8ece08b3c7a0ceb8e58 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 13:32:12 -0500 Subject: [PATCH 05/15] rename --- src/pixie/images.nim | 20 ++++++++++---------- src/pixie/simd/sse2.nim | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/pixie/images.nim b/src/pixie/images.nim index 475e8f0..f444328 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -436,26 +436,26 @@ proc drawCorrect( blended = blender(backdrop, sample) a.unsafe[x, y] = blended -proc blitLine( +proc blendLine( a, b: ptr UncheckedArray[ColorRGBX], len: int, blender: Blender ) {.inline.} = for i in 0 ..< len: a[i] = blender(a[i], b[i]) -proc blitLineOverwrite( +proc blendLineOverwrite( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.inline.} = copyMem(a[0].addr, b[0].addr, len * 4) -proc blitLineNormal(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} = +proc blendLineNormal(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} = for i in 0 ..< len: a[i] = blendNormal(a[i], b[i]) -proc blitLineMask(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} = +proc blendLineMask(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} = for i in 0 ..< len: a[i] = blendMask(a[i], b[i]) -proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) = +proc blendRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) = let px = pos.x.int py = pos.y.int @@ -474,14 +474,14 @@ proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) = case blendMode: of NormalBlend: for y in yStart ..< yEnd: - blitLineNormal( + blendLineNormal( a.getUncheckedArray(xStart + px, y + py), b.getUncheckedArray(xStart, y), xEnd - xStart ) of OverwriteBlend: for y in yStart ..< yEnd: - blitLineOverwrite( + blendLineOverwrite( a.getUncheckedArray(xStart + px, y + py), b.getUncheckedArray(xStart, y), xEnd - xStart @@ -493,7 +493,7 @@ proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) = for y in yStart ..< yEnd: if xStart + px > 0: zeroMem(a.data[a.dataIndex(0, y + py)].addr, (xStart + px) * 4) - blitLineMask( + blendLineMask( a.getUncheckedArray(xStart + px, y + py), b.getUncheckedArray(xStart, y), xEnd - xStart @@ -511,7 +511,7 @@ proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) = else: let blender = blendMode.blender() for y in yStart ..< yEnd: - blitLine( + blendLine( a.getUncheckedArray(xStart + px, y + py), b.getUncheckedArray(xStart, y), xEnd - xStart, @@ -559,7 +559,7 @@ proc draw*( if hasRotationOrScaling or smooth: a.drawCorrect(b, inverseTransform.inverse(), blendMode, false) else: - a.blitRect(b, ivec2(transform[2, 0].int32, transform[2, 1].int32), blendMode) + a.blendRect(b, ivec2(transform[2, 0].int32, transform[2, 1].int32), blendMode) proc drawTiled*( dst, src: Image, mat: Mat3, blendMode = NormalBlend diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index 1eead85..d611962 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -568,7 +568,7 @@ proc blendLineNormalSse2*( for i in i ..< len: line[i] = blendNormal(line[i], rgbx) -proc blitLineNormalSse2*( +proc blendLineNormalSse2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int @@ -653,7 +653,7 @@ proc blendLineMaskSse2*( for i in i ..< len: line[i] = blendMask(line[i], rgbx) -proc blitLineMaskSse2*( +proc blendLineMaskSse2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int From a56fba39a7ae09eab370ab1da097911407c85769 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 13:47:43 -0500 Subject: [PATCH 06/15] avx2 versions --- src/pixie/simd/avx2.nim | 87 ++++++++++++++++++++++++++++++++++++++++- src/pixie/simd/sse2.nim | 2 - 2 files changed, 85 insertions(+), 4 deletions(-) diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index 9375075..f4f4ecc 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -380,6 +380,51 @@ proc minifyBy2Avx2*(image: Image, power = 1): Image {.simd.} = # Set src as this result for if we do another power src = result +proc blendLineNormalAvx2*( + line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int +) {.simd.} = + var i: int + while (cast[uint](line[i].addr) and 31) != 0: + line[i] = blendNormal(line[i], rgbx) + inc i + + let + source = mm256_set1_epi32(cast[uint32](rgbx)) + alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) + oddMask = mm256_set1_epi16(cast[int16](0xff00)) + div255 = mm256_set1_epi16(cast[int16](0x8081)) + vecAlpha255 = mm256_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])) + shuffleControl = mm256_set_epi8( + 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1, + 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1 + ) + while i < len - 8: + let backdrop = mm256_load_si256(line[i].addr) + var + sourceAlpha = mm256_and_si256(source, alphaMask) + backdropEven = mm256_slli_epi16(backdrop, 8) + backdropOdd = mm256_and_si256(backdrop, oddMask) + + sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl) + + let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha) + + backdropEven = mm256_mulhi_epu16(backdropEven, multiplier) + backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier) + backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7) + backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7) + + let added = mm256_add_epi8( + source, + mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8)) + ) + + mm256_store_si256(line[i].addr, added) + i += 8 + + for i in i ..< len: + line[i] = blendNormal(line[i], rgbx) + proc blitLineNormalAvx2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = @@ -406,7 +451,6 @@ proc blitLineNormalAvx2*( mm256_storeu_si256(a[i].addr, source) else: let backdrop = mm256_load_si256(a[i].addr) - var sourceAlpha = mm256_and_si256(source, alphaMask) backdropEven = mm256_slli_epi16(backdrop, 8) @@ -433,6 +477,46 @@ proc blitLineNormalAvx2*( for i in i ..< len: a[i] = blendNormal(a[i], b[i]) +proc blendLineMaskAvx2*( + line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int +) {.simd.} = + var i: int + while (cast[uint](line[i].addr) and 31) != 0: + line[i] = blendMask(line[i], rgbx) + inc i + + let + source = mm256_set1_epi32(cast[uint32](rgbx)) + alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) + oddMask = mm256_set1_epi16(cast[int16](0xff00)) + div255 = mm256_set1_epi16(cast[int16](0x8081)) + shuffleControl = mm256_set_epi8( + 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1, + 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1 + ) + while i < len - 8: + let backdrop = mm256_load_si256(line[i].addr) + var + sourceAlpha = mm256_and_si256(source, alphaMask) + backdropEven = mm256_slli_epi16(backdrop, 8) + backdropOdd = mm256_and_si256(backdrop, oddMask) + + sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl) + + backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha) + backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha) + backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7) + backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7) + + mm256_store_si256( + line[i].addr, + mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8)) + ) + i += 8 + + for i in i ..< len: + line[i] = blendMask(line[i], rgbx) + proc blitLineMaskAvx2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = @@ -458,7 +542,6 @@ proc blitLineMaskAvx2*( discard else: let backdrop = mm256_load_si256(a[i].addr) - var sourceAlpha = mm256_and_si256(source, alphaMask) backdropEven = mm256_slli_epi16(backdrop, 8) diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index d611962..f2913ff 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -590,7 +590,6 @@ proc blendLineNormalSse2*( mm_storeu_si128(a[i].addr, source) else: let backdrop = mm_load_si128(a[i].addr) - var sourceAlpha = mm_and_si128(source, alphaMask) backdropEven = mm_slli_epi16(backdrop, 8) @@ -674,7 +673,6 @@ proc blendLineMaskSse2*( discard else: let backdrop = mm_load_si128(a[i].addr) - var sourceAlpha = mm_and_si128(source, alphaMask) backdropEven = mm_slli_epi16(backdrop, 8) From 2a2dd4b23119127c3c91d29651e0ea92d370ad02 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 13:51:30 -0500 Subject: [PATCH 07/15] rename --- src/pixie/simd/avx2.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index f4f4ecc..a60af01 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -425,7 +425,7 @@ proc blendLineNormalAvx2*( for i in i ..< len: line[i] = blendNormal(line[i], rgbx) -proc blitLineNormalAvx2*( +proc blendLineNormalAvx2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int @@ -517,7 +517,7 @@ proc blendLineMaskAvx2*( for i in i ..< len: line[i] = blendMask(line[i], rgbx) -proc blitLineMaskAvx2*( +proc blendLineMaskAvx2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int From e0cb5c2b1191be66b5d965506cef4acea548ef45 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 13:54:09 -0500 Subject: [PATCH 08/15] f --- src/pixie/paths.nim | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index 2cf7b09..814a6ad 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1550,7 +1550,6 @@ proc fillCoverage( of MaskBlend: {.linearScanEnd.} - when allowSimd: when defined(amd64): for (coverageVec, allZeroes, all255) in simd(coverages, x, startX): From 24b36b077e49e6b4bf4f6ec24314c77c66fd1793 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 15:08:27 -0500 Subject: [PATCH 09/15] tmp --- src/pixie/common.nim | 13 ++ src/pixie/paths.nim | 197 +++++++++-------------------- src/pixie/simd/avx2.nim | 113 ++++++----------- src/pixie/simd/sse2.nim | 274 +++++++++++++++++++++++++--------------- 4 files changed, 287 insertions(+), 310 deletions(-) diff --git a/src/pixie/common.nim b/src/pixie/common.nim index 902d55f..b8da007 100644 --- a/src/pixie/common.nim +++ b/src/pixie/common.nim @@ -76,6 +76,19 @@ proc `*`*(color: ColorRGBX, opacity: float32): ColorRGBX {.raises: [].} = a = ((color.a * x + 127) div 255).uint8 rgbx(r, g, b, a) +proc `*`*(rgbx: ColorRGBX, coverage: uint8): ColorRGBX {.inline.} = + if coverage == 0: + discard + elif coverage == 255: + result = rgbx + else: + result = rgbx( + ((rgbx.r.uint32 * coverage + 127) div 255).uint8, + ((rgbx.g.uint32 * coverage + 127) div 255).uint8, + ((rgbx.b.uint32 * coverage + 127) div 255).uint8, + ((rgbx.a.uint32 * coverage + 127) div 255).uint8 + ) + proc snapToPixels*(rect: Rect): Rect {.raises: [].} = let xMin = rect.x diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index 814a6ad..7c9bf0c 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1429,6 +1429,47 @@ proc clearUnsafe(image: Image, startX, startY, toX, toY: int) = len = image.dataIndex(toX, toY) - start fillUnsafe(image.data, rgbx(0, 0, 0, 0), start, len) +proc blendLineCoverageOverwrite( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int + ) {.hasSimd.} = + for i in 0 ..< len: + let coverage = coverages[i] + if coverage != 0: + line[i] = rgbx * coverage + +proc blendLineCoverageNormal( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int +) {.hasSimd.} = + for i in 0 ..< len: + let coverage = coverages[i] + if coverage == 255 and rgbx.a == 255: + line[i] = rgbx + elif coverage == 0: + discard + else: + line[i] = blendNormal(line[i], rgbx * coverage) + +proc blendLineCoverageMask( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int +) {.hasSimd.} = + for i in 0 ..< len: + let coverage = coverages[i] + if coverage == 0: + line[i] = rgbx(0, 0, 0, 0) + elif coverage == 255: + discard + else: + line[i] = blendMask(line[i], rgbx * coverage) + proc fillCoverage( image: Image, rgbx: ColorRGBX, @@ -1440,149 +1481,31 @@ proc fillCoverage( x = startX dataIndex = image.dataIndex(x, y) - when allowSimd: - when defined(amd64): - iterator simd( - coverages: seq[uint8], x: var int, startX: int - ): (M128i, bool, bool) = - for _ in 0 ..< coverages.len div 16: - let - coverageVec = mm_loadu_si128(coverages[x - startX].unsafeAddr) - eqZero = mm_cmpeq_epi8(coverageVec, mm_setzero_si128()) - eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(255)) - allZeroes = mm_movemask_epi8(eqZero) == 0xffff - all255 = mm_movemask_epi8(eq255) == 0xffff - yield (coverageVec, allZeroes, all255) - x += 16 - - proc source(colorVec, coverageVec: M128i): M128i {.inline.} = - let - oddMask = mm_set1_epi16(0xff00) - div255 = mm_set1_epi16(0x8081) - - var unpacked = unpackAlphaValues(coverageVec) - unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16)) - - var - sourceEven = mm_slli_epi16(colorVec, 8) - sourceOdd = mm_and_si128(colorVec, oddMask) - sourceEven = mm_mulhi_epu16(sourceEven, unpacked) - sourceOdd = mm_mulhi_epu16(sourceOdd, unpacked) - sourceEven = mm_srli_epi16(mm_mulhi_epu16(sourceEven, div255), 7) - sourceOdd = mm_srli_epi16(mm_mulhi_epu16(sourceOdd, div255), 7) - result = mm_or_si128(sourceEven, mm_slli_epi16(sourceOdd, 8)) - - let colorVec = mm_set1_epi32(cast[int32](rgbx)) - - proc source(rgbx: ColorRGBX, coverage: uint8): ColorRGBX {.inline.} = - if coverage == 0: - discard - elif coverage == 255: - result = rgbx - else: - result = rgbx( - ((rgbx.r.uint32 * coverage) div 255).uint8, - ((rgbx.g.uint32 * coverage) div 255).uint8, - ((rgbx.b.uint32 * coverage) div 255).uint8, - ((rgbx.a.uint32 * coverage) div 255).uint8 - ) - case blendMode: of OverwriteBlend: - when allowSimd: - when defined(amd64): - for (coverageVec, allZeroes, all255) in simd(coverages, x, startX): - if allZeroes: - dataIndex += 16 - else: - if all255: - for i in 0 ..< 4: - mm_storeu_si128(image.data[dataIndex].addr, colorVec) - dataIndex += 4 - else: - var coverageVec = coverageVec - for i in 0 ..< 4: - let source = source(colorVec, coverageVec) - mm_storeu_si128(image.data[dataIndex].addr, source) - coverageVec = mm_srli_si128(coverageVec, 4) - dataIndex += 4 - - for x in x ..< startX + coverages.len: - let coverage = coverages[x - startX] - if coverage != 0: - image.data[dataIndex] = source(rgbx, coverage) - inc dataIndex + blendLineCoverageOverwrite( + image.getUncheckedArray(startX, y), + cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr), + rgbx, + coverages.len + ) of NormalBlend: - when allowSimd: - when defined(amd64): - for (coverageVec, allZeroes, all255) in simd(coverages, x, startX): - if allZeroes: - dataIndex += 16 - else: - if all255 and rgbx.a == 255: - for i in 0 ..< 4: - mm_storeu_si128(image.data[dataIndex].addr, colorVec) - dataIndex += 4 - else: - var coverageVec = coverageVec - for i in 0 ..< 4: - let - backdrop = mm_loadu_si128(image.data[dataIndex].addr) - source = source(colorVec, coverageVec) - mm_storeu_si128( - image.data[dataIndex].addr, - blendNormalSimd(backdrop, source) - ) - coverageVec = mm_srli_si128(coverageVec, 4) - dataIndex += 4 - - for x in x ..< startX + coverages.len: - let coverage = coverages[x - startX] - if coverage == 255 and rgbx.a == 255: - image.data[dataIndex] = rgbx - elif coverage == 0: - discard - else: - let backdrop = image.data[dataIndex] - image.data[dataIndex] = blendNormal(backdrop, source(rgbx, coverage)) - inc dataIndex + blendLineCoverageNormal( + image.getUncheckedArray(startX, y), + cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr), + rgbx, + coverages.len + ) of MaskBlend: {.linearScanEnd.} - when allowSimd: - when defined(amd64): - for (coverageVec, allZeroes, all255) in simd(coverages, x, startX): - if not allZeroes: - if all255: - dataIndex += 16 - else: - var coverageVec = coverageVec - for i in 0 ..< 4: - let - backdrop = mm_loadu_si128(image.data[dataIndex].addr) - source = source(colorVec, coverageVec) - mm_storeu_si128( - image.data[dataIndex].addr, - blendMaskSimd(backdrop, source) - ) - coverageVec = mm_srli_si128(coverageVec, 4) - dataIndex += 4 - else: - for i in 0 ..< 4: - mm_storeu_si128(image.data[dataIndex].addr, mm_setzero_si128()) - dataIndex += 4 - - for x in x ..< startX + coverages.len: - let coverage = coverages[x - startX] - if coverage == 0: - image.data[dataIndex] = rgbx(0, 0, 0, 0) - elif coverage == 255: - discard - else: - let backdrop = image.data[dataIndex] - image.data[dataIndex] = blendMask(backdrop, source(rgbx, coverage)) - inc dataIndex + blendLineCoverageMask( + image.getUncheckedArray(startX, y), + cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr), + rgbx, + coverages.len + ) image.clearUnsafe(0, y, startX, y) image.clearUnsafe(startX + coverages.len, y, image.width, y) @@ -1593,7 +1516,7 @@ proc fillCoverage( let coverage = coverages[x - startX] if coverage != 0: let backdrop = image.data[dataIndex] - image.data[dataIndex] = blender(backdrop, source(rgbx, coverage)) + image.data[dataIndex] = blender(backdrop, rgbx * coverage) inc dataIndex proc blendLineNormal( diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index a60af01..4ddc87d 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -6,6 +6,41 @@ when defined(gcc) or defined(clang): when defined(release): {.push checks: off.} +template blendNormalSimd(backdrop, source: M256i): M256i = + var + sourceAlpha = mm256_and_si256(source, alphaMask) + backdropEven = mm256_slli_epi16(backdrop, 8) + backdropOdd = mm256_and_si256(backdrop, oddMask) + + sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl) + + let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha) + + backdropEven = mm256_mulhi_epu16(backdropEven, multiplier) + backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier) + backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7) + backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7) + + mm256_add_epi8( + source, + mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8)) + ) + +template blendMaskSimd(backdrop, source: M256i): M256i = + var + sourceAlpha = mm256_and_si256(source, alphaMask) + backdropEven = mm256_slli_epi16(backdrop, 8) + backdropOdd = mm256_and_si256(backdrop, oddMask) + + sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl) + + backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha) + backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha) + backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7) + backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7) + + mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8)) + proc isOneColorAvx2*(image: Image): bool {.simd.} = result = true @@ -400,26 +435,7 @@ proc blendLineNormalAvx2*( ) while i < len - 8: let backdrop = mm256_load_si256(line[i].addr) - var - sourceAlpha = mm256_and_si256(source, alphaMask) - backdropEven = mm256_slli_epi16(backdrop, 8) - backdropOdd = mm256_and_si256(backdrop, oddMask) - - sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl) - - let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha) - - backdropEven = mm256_mulhi_epu16(backdropEven, multiplier) - backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier) - backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7) - backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7) - - let added = mm256_add_epi8( - source, - mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8)) - ) - - mm256_store_si256(line[i].addr, added) + mm256_store_si256(line[i].addr, blendNormalSimd(backdrop, source)) i += 8 for i in i ..< len: @@ -451,27 +467,7 @@ proc blendLineNormalAvx2*( mm256_storeu_si256(a[i].addr, source) else: let backdrop = mm256_load_si256(a[i].addr) - var - sourceAlpha = mm256_and_si256(source, alphaMask) - backdropEven = mm256_slli_epi16(backdrop, 8) - backdropOdd = mm256_and_si256(backdrop, oddMask) - - sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl) - - let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha) - - backdropEven = mm256_mulhi_epu16(backdropEven, multiplier) - backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier) - backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7) - backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7) - - let added = mm256_add_epi8( - source, - mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8)) - ) - - mm256_store_si256(a[i].addr, added) - + mm256_store_si256(a[i].addr, blendNormalSimd(backdrop, source)) i += 8 for i in i ..< len: @@ -496,22 +492,7 @@ proc blendLineMaskAvx2*( ) while i < len - 8: let backdrop = mm256_load_si256(line[i].addr) - var - sourceAlpha = mm256_and_si256(source, alphaMask) - backdropEven = mm256_slli_epi16(backdrop, 8) - backdropOdd = mm256_and_si256(backdrop, oddMask) - - sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl) - - backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha) - backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha) - backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7) - backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7) - - mm256_store_si256( - line[i].addr, - mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8)) - ) + mm256_store_si256(line[i].addr, blendMaskSimd(backdrop, source)) i += 8 for i in i ..< len: @@ -542,23 +523,7 @@ proc blendLineMaskAvx2*( discard else: let backdrop = mm256_load_si256(a[i].addr) - var - sourceAlpha = mm256_and_si256(source, alphaMask) - backdropEven = mm256_slli_epi16(backdrop, 8) - backdropOdd = mm256_and_si256(backdrop, oddMask) - - sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl) - - backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha) - backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha) - backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7) - backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7) - - mm256_store_si256( - a[i].addr, - mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8)) - ) - + mm256_store_si256(a[i].addr, blendMaskSimd(backdrop, source)) i += 8 for i in i ..< len: diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index f2913ff..87b4bce 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -10,17 +10,7 @@ proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} = finalColor = mm_packus_epi16(finalColor, mm_setzero_si128()) cast[ColorRGBX](mm_cvtsi128_si32(finalColor)) -proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} = - ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value). - result = mm_unpacklo_epi8(mm_setzero_si128(), v) - result = mm_unpacklo_epi8(mm_setzero_si128(), result) - -proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} = - let - alphaMask = mm_set1_epi32(cast[int32](0xff000000)) - oddMask = mm_set1_epi16(cast[int16](0xff00)) - div255 = mm_set1_epi16(cast[int16](0x8081)) - +template blendNormalSimd*(backdrop, source: M128i): M128i = var sourceAlpha = mm_and_si128(source, alphaMask) backdropEven = mm_slli_epi16(backdrop, 8) @@ -28,14 +18,10 @@ proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} = sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16)) - let k = mm_sub_epi32( - mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])), - sourceAlpha - ) - - backdropEven = mm_mulhi_epu16(backdropEven, k) - backdropOdd = mm_mulhi_epu16(backdropOdd, k) + let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha) + backdropEven = mm_mulhi_epu16(backdropEven, multiplier) + backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier) backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) @@ -44,12 +30,7 @@ proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} = mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) ) -proc blendMaskSimd*(backdrop, source: M128i): M128i {.inline.} = - let - alphaMask = mm_set1_epi32(cast[int32](0xff000000)) - oddMask = mm_set1_epi16(cast[int16](0xff00)) - div255 = mm_set1_epi16(cast[int16](0x8081)) - +template blendMaskSimd*(backdrop, source: M128i): M128i = var sourceAlpha = mm_and_si128(source, alphaMask) backdropEven = mm_slli_epi16(backdrop, 8) @@ -59,7 +40,6 @@ proc blendMaskSimd*(backdrop, source: M128i): M128i {.inline.} = backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha) backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha) - backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) @@ -527,6 +507,67 @@ proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} = result.width * 4 ) +proc applyCoverage*(rgbxVec, coverage: M128i): M128i {.inline.} = + + proc unpackAlphaValues(v: M128i): M128i {.inline.} = + ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value). + result = mm_unpacklo_epi8(mm_setzero_si128(), v) + result = mm_unpacklo_epi8(mm_setzero_si128(), result) + + let + oddMask = mm_set1_epi16(0xff00) + div255 = mm_set1_epi16(0x8081) + + var unpacked = unpackAlphaValues(coverage) + unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16)) + + var + rgbxEven = mm_slli_epi16(rgbxVec, 8) + rgbxOdd = mm_and_si128(rgbxVec, oddMask) + rgbxEven = mm_mulhi_epu16(rgbxEven, unpacked) + rgbxOdd = mm_mulhi_epu16(rgbxOdd, unpacked) + rgbxEven = mm_srli_epi16(mm_mulhi_epu16(rgbxEven, div255), 7) + rgbxOdd = mm_srli_epi16(mm_mulhi_epu16(rgbxOdd, div255), 7) + + mm_or_si128(rgbxEven, mm_slli_epi16(rgbxOdd, 8)) + +proc blendLineCoverageOverwriteSse2*( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int + ) {.simd.} = + var i: int + while (cast[uint](line[i].addr) and 15) != 0: + let coverage = coverages[i] + if coverage != 0: + line[i] = rgbx * coverage + inc i + + let rgbxVec = mm_set1_epi32(cast[uint32](rgbx)) + while i < len - 16: + let + coverage = mm_loadu_si128(coverages[i].addr) + eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128()) + eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255)) + if mm_movemask_epi8(eqZero) == 0xffff: + i += 16 + elif mm_movemask_epi8(eq255) == 0xffff: + for _ in 0 ..< 4: + mm_store_si128(line[i].addr, rgbxVec) + i += 4 + else: + var coverage = coverage + for _ in 0 ..< 4: + mm_storeu_si128(line[i].addr, rgbxVec.applyCoverage(coverage)) + coverage = mm_srli_si128(coverage, 4) + i += 4 + + for i in i ..< len: + let coverage = coverages[i] + if coverage != 0: + line[i] = rgbx * coverage + proc blendLineNormalSse2*( line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int ) {.simd.} = @@ -543,26 +584,7 @@ proc blendLineNormalSse2*( vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])) while i < len - 4: let backdrop = mm_load_si128(line[i].addr) - var - sourceAlpha = mm_and_si128(source, alphaMask) - backdropEven = mm_slli_epi16(backdrop, 8) - backdropOdd = mm_and_si128(backdrop, oddMask) - - sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16)) - - let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha) - - backdropEven = mm_mulhi_epu16(backdropEven, multiplier) - backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier) - backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) - backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) - - let added = mm_add_epi8( - source, - mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) - ) - - mm_store_si128(line[i].addr, added) + mm_store_si128(line[i].addr, blendNormalSimd(backdrop, source)) i += 4 for i in i ..< len: @@ -590,32 +612,65 @@ proc blendLineNormalSse2*( mm_storeu_si128(a[i].addr, source) else: let backdrop = mm_load_si128(a[i].addr) - var - sourceAlpha = mm_and_si128(source, alphaMask) - backdropEven = mm_slli_epi16(backdrop, 8) - backdropOdd = mm_and_si128(backdrop, oddMask) - - sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16)) - - let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha) - - backdropEven = mm_mulhi_epu16(backdropEven, multiplier) - backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier) - backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) - backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) - - let added = mm_add_epi8( - source, - mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) - ) - - mm_store_si128(a[i].addr, added) - + mm_store_si128(a[i].addr, blendNormalSimd(backdrop, source)) i += 4 for i in i ..< len: a[i] = blendNormal(a[i], b[i]) +proc blendLineCoverageNormalSse2*( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int +) {.simd.} = + var i: int + while (cast[uint](line[i].addr) and 15) != 0: + let coverage = coverages[i] + if coverage == 255 and rgbx.a == 255: + line[i] = rgbx + elif coverage == 0: + discard + else: + line[i] = blendNormal(line[i], rgbx * coverage) + inc i + + let + rgbxVec = mm_set1_epi32(cast[uint32](rgbx)) + alphaMask = mm_set1_epi32(cast[int32](0xff000000)) + oddMask = mm_set1_epi16(cast[int16](0xff00)) + div255 = mm_set1_epi16(cast[int16](0x8081)) + vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])) + while i < len - 16: + let + coverage = mm_loadu_si128(coverages[i].addr) + eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128()) + eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255)) + if mm_movemask_epi8(eqZero) == 0xffff: + i += 16 + elif mm_movemask_epi8(eq255) == 0xffff and rgbx.a == 255: + for _ in 0 ..< 4: + mm_store_si128(line[i].addr, rgbxVec) + i += 4 + else: + var coverage = coverage + for _ in 0 ..< 4: + let + backdrop = mm_loadu_si128(line[i].addr) + source = rgbxVec.applyCoverage(coverage) + mm_storeu_si128(line[i].addr, blendNormalSimd(backdrop, source)) + coverage = mm_srli_si128(coverage, 4) + i += 4 + + for i in i ..< len: + let coverage = coverages[i] + if coverage == 255 and rgbx.a == 255: + line[i] = rgbx + elif coverage == 0: + discard + else: + line[i] = blendNormal(line[i], rgbx * coverage) + proc blendLineMaskSse2*( line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int ) {.simd.} = @@ -631,22 +686,7 @@ proc blendLineMaskSse2*( div255 = mm_set1_epi16(cast[int16](0x8081)) while i < len - 4: let backdrop = mm_load_si128(line[i].addr) - var - sourceAlpha = mm_and_si128(source, alphaMask) - backdropEven = mm_slli_epi16(backdrop, 8) - backdropOdd = mm_and_si128(backdrop, oddMask) - - sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16)) - - backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha) - backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha) - backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) - backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) - - mm_store_si128( - line[i].addr, - mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) - ) + mm_store_si128(line[i].addr, blendMaskSimd(backdrop, source)) i += 4 for i in i ..< len: @@ -673,27 +713,63 @@ proc blendLineMaskSse2*( discard else: let backdrop = mm_load_si128(a[i].addr) - var - sourceAlpha = mm_and_si128(source, alphaMask) - backdropEven = mm_slli_epi16(backdrop, 8) - backdropOdd = mm_and_si128(backdrop, oddMask) - - sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16)) - - backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha) - backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha) - backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) - backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) - - mm_store_si128( - a[i].addr, - mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) - ) - + mm_store_si128(a[i].addr, blendMaskSimd(backdrop, source)) i += 4 for i in i ..< len: a[i] = blendMask(a[i], b[i]) +proc blendLineCoverageMaskSse2*( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int +) {.simd.} = + var i: int + while (cast[uint](line[i].addr) and 15) != 0: + let coverage = coverages[i] + if coverage == 0: + line[i] = rgbx(0, 0, 0, 0) + elif coverage == 255: + discard + else: + line[i] = blendMask(line[i], rgbx * coverage) + inc i + + let + rgbxVec = mm_set1_epi32(cast[uint32](rgbx)) + alphaMask = mm_set1_epi32(cast[int32](0xff000000)) + oddMask = mm_set1_epi16(cast[int16](0xff00)) + div255 = mm_set1_epi16(cast[int16](0x8081)) + while i < len - 16: + let + coverage = mm_loadu_si128(coverages[i].addr) + eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128()) + eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255)) + if mm_movemask_epi8(eqZero) == 0xffff: + for _ in 0 ..< 4: + mm_store_si128(line[i].addr, mm_setzero_si128()) + i += 4 + elif mm_movemask_epi8(eq255) == 0xffff and rgbx.a == 255: + i += 16 + else: + var coverage = coverage + for _ in 0 ..< 4: + let + backdrop = mm_loadu_si128(line[i].addr) + source = rgbxVec.applyCoverage(coverage) + mm_storeu_si128(line[i].addr, blendMaskSimd(backdrop, source)) + coverage = mm_srli_si128(coverage, 4) + i += 4 + + for i in i ..< len: + let coverage = coverages[i] + if coverage == 0: + line[i] = rgbx(0, 0, 0, 0) + elif coverage == 255: + discard + else: + line[i] = blendMask(line[i], rgbx * coverage) + when defined(release): {.pop.} From a92e289e366e369707b01fa5c430ca4692dbf6da Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 15:15:48 -0500 Subject: [PATCH 10/15] f --- src/pixie/simd/sse2.nim | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index 87b4bce..966efbf 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -544,12 +544,15 @@ proc blendLineCoverageOverwriteSse2*( line[i] = rgbx * coverage inc i - let rgbxVec = mm_set1_epi32(cast[uint32](rgbx)) + let + rgbxVec = mm_set1_epi32(cast[uint32](rgbx)) + vecZero = mm_setzero_si128() + vec255 = mm_set1_epi8(255) while i < len - 16: let coverage = mm_loadu_si128(coverages[i].addr) - eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128()) - eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255)) + eqZero = mm_cmpeq_epi8(coverage, vecZero) + eq255 = mm_cmpeq_epi8(coverage, vec255) if mm_movemask_epi8(eqZero) == 0xffff: i += 16 elif mm_movemask_epi8(eq255) == 0xffff: @@ -637,6 +640,8 @@ proc blendLineCoverageNormalSse2*( let rgbxVec = mm_set1_epi32(cast[uint32](rgbx)) + vecZero = mm_setzero_si128() + vec255 = mm_set1_epi8(255) alphaMask = mm_set1_epi32(cast[int32](0xff000000)) oddMask = mm_set1_epi16(cast[int16](0xff00)) div255 = mm_set1_epi16(cast[int16](0x8081)) @@ -644,8 +649,8 @@ proc blendLineCoverageNormalSse2*( while i < len - 16: let coverage = mm_loadu_si128(coverages[i].addr) - eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128()) - eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255)) + eqZero = mm_cmpeq_epi8(coverage, vecZero) + eq255 = mm_cmpeq_epi8(coverage, vec255) if mm_movemask_epi8(eqZero) == 0xffff: i += 16 elif mm_movemask_epi8(eq255) == 0xffff and rgbx.a == 255: @@ -738,17 +743,19 @@ proc blendLineCoverageMaskSse2*( let rgbxVec = mm_set1_epi32(cast[uint32](rgbx)) + vecZero = mm_setzero_si128() + vec255 = mm_set1_epi8(255) alphaMask = mm_set1_epi32(cast[int32](0xff000000)) oddMask = mm_set1_epi16(cast[int16](0xff00)) div255 = mm_set1_epi16(cast[int16](0x8081)) while i < len - 16: let coverage = mm_loadu_si128(coverages[i].addr) - eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128()) - eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255)) + eqZero = mm_cmpeq_epi8(coverage, vecZero) + eq255 = mm_cmpeq_epi8(coverage, vec255) if mm_movemask_epi8(eqZero) == 0xffff: for _ in 0 ..< 4: - mm_store_si128(line[i].addr, mm_setzero_si128()) + mm_store_si128(line[i].addr, vecZero) i += 4 elif mm_movemask_epi8(eq255) == 0xffff and rgbx.a == 255: i += 16 From 58887e8eb6e021096bead26a7b14db869ebbd857 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 15:20:18 -0500 Subject: [PATCH 11/15] use aligned store (bugfix) --- src/pixie/simd/avx2.nim | 2 +- src/pixie/simd/sse2.nim | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index 4ddc87d..5ef6591 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -464,7 +464,7 @@ proc blendLineNormalAvx2*( source = mm256_loadu_si256(b[i].addr) eq255 = mm256_cmpeq_epi8(source, vec255) if (mm256_movemask_epi8(eq255) and 0x88888888) == 0x88888888: # Opaque source - mm256_storeu_si256(a[i].addr, source) + mm256_store_si256(a[i].addr, source) else: let backdrop = mm256_load_si256(a[i].addr) mm256_store_si256(a[i].addr, blendNormalSimd(backdrop, source)) diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index 966efbf..e8fd2f7 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -305,7 +305,7 @@ proc applyOpacitySse2*(image: Image, opacity: float32) {.simd.} = valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec) valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7) valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7) - mm_storeu_si128( + mm_store_si128( cast[pointer](p), mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8)) ) @@ -347,8 +347,8 @@ proc ceilSse2*(image: Image) {.simd.} = values1 = mm_cmpeq_epi8(values1, vecZero) values0 = mm_andnot_si128(values0, vec255) values1 = mm_andnot_si128(values1, vec255) - mm_storeu_si128(cast[pointer](p), values0) - mm_storeu_si128(cast[pointer](p + 16), values1) + mm_store_si128(cast[pointer](p), values0) + mm_store_si128(cast[pointer](p + 16), values1) p += 32 i += 8 * iterations @@ -562,7 +562,7 @@ proc blendLineCoverageOverwriteSse2*( else: var coverage = coverage for _ in 0 ..< 4: - mm_storeu_si128(line[i].addr, rgbxVec.applyCoverage(coverage)) + mm_store_si128(line[i].addr, rgbxVec.applyCoverage(coverage)) coverage = mm_srli_si128(coverage, 4) i += 4 @@ -612,7 +612,7 @@ proc blendLineNormalSse2*( source = mm_loadu_si128(b[i].addr) eq255 = mm_cmpeq_epi8(source, vec255) if (mm_movemask_epi8(eq255) and 0x00008888) == 0x00008888: # Opaque source - mm_storeu_si128(a[i].addr, source) + mm_store_si128(a[i].addr, source) else: let backdrop = mm_load_si128(a[i].addr) mm_store_si128(a[i].addr, blendNormalSimd(backdrop, source)) @@ -663,7 +663,7 @@ proc blendLineCoverageNormalSse2*( let backdrop = mm_loadu_si128(line[i].addr) source = rgbxVec.applyCoverage(coverage) - mm_storeu_si128(line[i].addr, blendNormalSimd(backdrop, source)) + mm_store_si128(line[i].addr, blendNormalSimd(backdrop, source)) coverage = mm_srli_si128(coverage, 4) i += 4 @@ -765,7 +765,7 @@ proc blendLineCoverageMaskSse2*( let backdrop = mm_loadu_si128(line[i].addr) source = rgbxVec.applyCoverage(coverage) - mm_storeu_si128(line[i].addr, blendMaskSimd(backdrop, source)) + mm_store_si128(line[i].addr, blendMaskSimd(backdrop, source)) coverage = mm_srli_si128(coverage, 4) i += 4 From 17cfb62ab370c1e8fbbc43bfc126d72a1bd71345 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 17:00:10 -0500 Subject: [PATCH 12/15] rename --- src/pixie/common.nim | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/pixie/common.nim b/src/pixie/common.nim index b8da007..3e4bc40 100644 --- a/src/pixie/common.nim +++ b/src/pixie/common.nim @@ -76,17 +76,17 @@ proc `*`*(color: ColorRGBX, opacity: float32): ColorRGBX {.raises: [].} = a = ((color.a * x + 127) div 255).uint8 rgbx(r, g, b, a) -proc `*`*(rgbx: ColorRGBX, coverage: uint8): ColorRGBX {.inline.} = - if coverage == 0: +proc `*`*(rgbx: ColorRGBX, opacity: uint8): ColorRGBX {.inline.} = + if opacity == 0: discard - elif coverage == 255: + elif opacity == 255: result = rgbx else: result = rgbx( - ((rgbx.r.uint32 * coverage + 127) div 255).uint8, - ((rgbx.g.uint32 * coverage + 127) div 255).uint8, - ((rgbx.b.uint32 * coverage + 127) div 255).uint8, - ((rgbx.a.uint32 * coverage + 127) div 255).uint8 + ((rgbx.r.uint32 * opacity + 127) div 255).uint8, + ((rgbx.g.uint32 * opacity + 127) div 255).uint8, + ((rgbx.b.uint32 * opacity + 127) div 255).uint8, + ((rgbx.a.uint32 * opacity + 127) div 255).uint8 ) proc snapToPixels*(rect: Rect): Rect {.raises: [].} = From e107f85fb008c9f391ed675d68d1cc09fe9fdd1a Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 17:16:35 -0500 Subject: [PATCH 13/15] check bounds when aligning --- src/pixie/paths.nim | 8 ++------ src/pixie/simd/avx2.nim | 8 ++++---- src/pixie/simd/sse2.nim | 22 +++++++++------------- 3 files changed, 15 insertions(+), 23 deletions(-) diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index 7c9bf0c..e738e54 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1448,9 +1448,7 @@ proc blendLineCoverageNormal( ) {.hasSimd.} = for i in 0 ..< len: let coverage = coverages[i] - if coverage == 255 and rgbx.a == 255: - line[i] = rgbx - elif coverage == 0: + if coverage == 0: discard else: line[i] = blendNormal(line[i], rgbx * coverage) @@ -1463,9 +1461,7 @@ proc blendLineCoverageMask( ) {.hasSimd.} = for i in 0 ..< len: let coverage = coverages[i] - if coverage == 0: - line[i] = rgbx(0, 0, 0, 0) - elif coverage == 255: + if coverage == 255: discard else: line[i] = blendMask(line[i], rgbx * coverage) diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index 5ef6591..97807c3 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -419,7 +419,7 @@ proc blendLineNormalAvx2*( line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int ) {.simd.} = var i: int - while (cast[uint](line[i].addr) and 31) != 0: + while i < len and (cast[uint](line[i].addr) and 31) != 0: line[i] = blendNormal(line[i], rgbx) inc i @@ -445,7 +445,7 @@ proc blendLineNormalAvx2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int - while (cast[uint](a[i].addr) and 31) != 0: + while i < len and (cast[uint](a[i].addr) and 31) != 0: a[i] = blendNormal(a[i], b[i]) inc i @@ -477,7 +477,7 @@ proc blendLineMaskAvx2*( line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int ) {.simd.} = var i: int - while (cast[uint](line[i].addr) and 31) != 0: + while i < len and (cast[uint](line[i].addr) and 31) != 0: line[i] = blendMask(line[i], rgbx) inc i @@ -502,7 +502,7 @@ proc blendLineMaskAvx2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int - while (cast[uint](a[i].addr) and 31) != 0: + while i < len and (cast[uint](a[i].addr) and 31) != 0: a[i] = blendMask(a[i], b[i]) inc i diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index e8fd2f7..df849dd 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -538,7 +538,7 @@ proc blendLineCoverageOverwriteSse2*( len: int ) {.simd.} = var i: int - while (cast[uint](line[i].addr) and 15) != 0: + while i < len and (cast[uint](line[i].addr) and 15) != 0: let coverage = coverages[i] if coverage != 0: line[i] = rgbx * coverage @@ -575,7 +575,7 @@ proc blendLineNormalSse2*( line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int ) {.simd.} = var i: int - while (cast[uint](line[i].addr) and 15) != 0: + while i < len and (cast[uint](line[i].addr) and 15) != 0: line[i] = blendNormal(line[i], rgbx) inc i @@ -597,7 +597,7 @@ proc blendLineNormalSse2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int - while (cast[uint](a[i].addr) and 15) != 0: + while i < len and (cast[uint](a[i].addr) and 15) != 0: a[i] = blendNormal(a[i], b[i]) inc i @@ -628,11 +628,9 @@ proc blendLineCoverageNormalSse2*( len: int ) {.simd.} = var i: int - while (cast[uint](line[i].addr) and 15) != 0: + while i < len and (cast[uint](line[i].addr) and 15) != 0: let coverage = coverages[i] - if coverage == 255 and rgbx.a == 255: - line[i] = rgbx - elif coverage == 0: + if coverage == 0: discard else: line[i] = blendNormal(line[i], rgbx * coverage) @@ -669,9 +667,7 @@ proc blendLineCoverageNormalSse2*( for i in i ..< len: let coverage = coverages[i] - if coverage == 255 and rgbx.a == 255: - line[i] = rgbx - elif coverage == 0: + if coverage == 0: discard else: line[i] = blendNormal(line[i], rgbx * coverage) @@ -680,7 +676,7 @@ proc blendLineMaskSse2*( line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int ) {.simd.} = var i: int - while (cast[uint](line[i].addr) and 15) != 0: + while i < len and (cast[uint](line[i].addr) and 15) != 0: line[i] = blendMask(line[i], rgbx) inc i @@ -701,7 +697,7 @@ proc blendLineMaskSse2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int - while (cast[uint](a[i].addr) and 15) != 0: + while i < len and (cast[uint](a[i].addr) and 15) != 0: a[i] = blendMask(a[i], b[i]) inc i @@ -731,7 +727,7 @@ proc blendLineCoverageMaskSse2*( len: int ) {.simd.} = var i: int - while (cast[uint](line[i].addr) and 15) != 0: + while i < len and (cast[uint](line[i].addr) and 15) != 0: let coverage = coverages[i] if coverage == 0: line[i] = rgbx(0, 0, 0, 0) From 31bd588b172c6bff9422d2220b62318b726742fe Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 19:28:05 -0500 Subject: [PATCH 14/15] simpler --- src/pixie/simd/sse2.nim | 18 ++++++------------ tests/bench_fonts.nim | 3 +-- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index df849dd..a5880ed 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -507,18 +507,10 @@ proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} = result.width * 4 ) -proc applyCoverage*(rgbxVec, coverage: M128i): M128i {.inline.} = - - proc unpackAlphaValues(v: M128i): M128i {.inline.} = - ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value). - result = mm_unpacklo_epi8(mm_setzero_si128(), v) - result = mm_unpacklo_epi8(mm_setzero_si128(), result) - - let - oddMask = mm_set1_epi16(0xff00) - div255 = mm_set1_epi16(0x8081) - - var unpacked = unpackAlphaValues(coverage) +template applyCoverage*(rgbxVec, coverage: M128i): M128i = + ## Unpack the first 4 coverage bytes. + var unpacked = mm_unpacklo_epi8(mm_setzero_si128(), coverage) + unpacked = mm_unpacklo_epi8(mm_setzero_si128(), unpacked) unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16)) var @@ -548,6 +540,8 @@ proc blendLineCoverageOverwriteSse2*( rgbxVec = mm_set1_epi32(cast[uint32](rgbx)) vecZero = mm_setzero_si128() vec255 = mm_set1_epi8(255) + oddMask = mm_set1_epi16(0xff00) + div255 = mm_set1_epi16(0x8081) while i < len - 16: let coverage = mm_loadu_si128(coverages[i].addr) diff --git a/tests/bench_fonts.nim b/tests/bench_fonts.nim index 32bbc01..544ca12 100644 --- a/tests/bench_fonts.nim +++ b/tests/bench_fonts.nim @@ -5,8 +5,7 @@ const text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis in q var font = readFont("tests/fonts/Roboto-Regular_1.ttf") font.size = 16 -let - image = newImage(500, 300) +let image = newImage(500, 300) timeIt "typeset": discard font.typeset(text, bounds = vec2(image.width.float32, 0)) From 36675576188d95f0a64dbd6e53bf5df1fe81ee30 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg <ryan@guzba.com> Date: Sun, 31 Jul 2022 21:14:21 -0500 Subject: [PATCH 15/15] rename --- src/pixie/simd/neon.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim index bb43213..8beca4f 100644 --- a/src/pixie/simd/neon.nim +++ b/src/pixie/simd/neon.nim @@ -414,7 +414,7 @@ proc magnifyBy2Neon*(image: Image, power = 1): Image {.simd.} = result.width * 4 ) -proc blitLineNormalNeon*( +proc blendLineNormalNeon*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int @@ -463,7 +463,7 @@ proc blitLineNormalNeon*( for i in i ..< len: a[i] = blendNormal(a[i], b[i]) -proc blitLineMaskNeon*( +proc blendLineMaskNeon*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int