diff --git a/src/pixie/common.nim b/src/pixie/common.nim
index 902d55f..b8da007 100644
--- a/src/pixie/common.nim
+++ b/src/pixie/common.nim
@@ -76,6 +76,20 @@ proc `*`*(color: ColorRGBX, opacity: float32): ColorRGBX {.raises: [].} =
     a = ((color.a * x + 127) div 255).uint8
   rgbx(r, g, b, a)
 
+proc `*`*(rgbx: ColorRGBX, coverage: uint8): ColorRGBX {.inline.} =
+  ## Scales all four channels by coverage / 255, rounded to nearest.
+  if coverage == 0:
+    discard
+  elif coverage == 255:
+    result = rgbx
+  else:
+    result = rgbx(
+      ((rgbx.r.uint32 * coverage + 127) div 255).uint8,
+      ((rgbx.g.uint32 * coverage + 127) div 255).uint8,
+      ((rgbx.b.uint32 * coverage + 127) div 255).uint8,
+      ((rgbx.a.uint32 * coverage + 127) div 255).uint8
+    )
+
 proc snapToPixels*(rect: Rect): Rect {.raises: [].} =
   let
     xMin = rect.x
diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim
index 814a6ad..7c9bf0c 100644
--- a/src/pixie/paths.nim
+++ b/src/pixie/paths.nim
@@ -1429,6 +1429,50 @@ proc clearUnsafe(image: Image, startX, startY, toX, toY: int) =
     len = image.dataIndex(toX, toY) - start
   fillUnsafe(image.data, rgbx(0, 0, 0, 0), start, len)
 
+proc blendLineCoverageOverwrite(
+  line: ptr UncheckedArray[ColorRGBX],
+  coverages: ptr UncheckedArray[uint8],
+  rgbx: ColorRGBX,
+  len: int
+  ) {.hasSimd.} =
+  ## Overwrites each pixel whose coverage is non-zero with rgbx * coverage.
+  for i in 0 ..< len:
+    let coverage = coverages[i]
+    if coverage != 0:
+      line[i] = rgbx * coverage
+
+proc blendLineCoverageNormal(
+  line: ptr UncheckedArray[ColorRGBX],
+  coverages: ptr UncheckedArray[uint8],
+  rgbx: ColorRGBX,
+  len: int
+) {.hasSimd.} =
+  ## Normal-blends rgbx * coverage onto each pixel with non-zero coverage.
+  for i in 0 ..< len:
+    let coverage = coverages[i]
+    if coverage == 255 and rgbx.a == 255:
+      line[i] = rgbx
+    elif coverage == 0:
+      discard
+    else:
+      line[i] = blendNormal(line[i], rgbx * coverage)
+
+proc blendLineCoverageMask(
+  line: ptr UncheckedArray[ColorRGBX],
+  coverages: ptr UncheckedArray[uint8],
+  rgbx: ColorRGBX,
+  len: int
+) {.hasSimd.} =
+  ## Clears pixels with zero coverage and mask-blends rgbx * coverage elsewhere.
+  for i in 0 ..< len:
+    let coverage = coverages[i]
+    if coverage == 0:
+      line[i] = rgbx(0, 0, 0, 0)
+    elif coverage == 255 and rgbx.a == 255: # identity only when opaque
+      discard
+    else:
+      line[i] = blendMask(line[i], rgbx * coverage)
+
 proc fillCoverage(
   image: Image,
   rgbx: ColorRGBX,
@@ -1440,149 +1484,31 @@ proc fillCoverage(
     x = startX
     dataIndex = image.dataIndex(x, y)
 
-  when allowSimd:
-    when defined(amd64):
-      iterator simd(
-        coverages: seq[uint8], x: var int, startX: int
-      ): (M128i, bool, bool) =
-        for _ in 0 ..< coverages.len div 16:
-          let
-            coverageVec = mm_loadu_si128(coverages[x - startX].unsafeAddr)
-            eqZero = mm_cmpeq_epi8(coverageVec, mm_setzero_si128())
-            eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(255))
-            allZeroes = mm_movemask_epi8(eqZero) == 0xffff
-            all255 = mm_movemask_epi8(eq255) == 0xffff
-          yield (coverageVec, allZeroes, all255)
-          x += 16
-
-      proc source(colorVec, coverageVec: M128i): M128i {.inline.} =
-        let
-          oddMask = mm_set1_epi16(0xff00)
-          div255 = mm_set1_epi16(0x8081)
-
-        var unpacked = unpackAlphaValues(coverageVec)
-        unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
-
-        var
-          sourceEven = mm_slli_epi16(colorVec, 8)
-          sourceOdd = mm_and_si128(colorVec, oddMask)
-        sourceEven = mm_mulhi_epu16(sourceEven, unpacked)
-        sourceOdd = mm_mulhi_epu16(sourceOdd, unpacked)
-        sourceEven = mm_srli_epi16(mm_mulhi_epu16(sourceEven, div255), 7)
-        sourceOdd = mm_srli_epi16(mm_mulhi_epu16(sourceOdd, div255), 7)
-        result = mm_or_si128(sourceEven, mm_slli_epi16(sourceOdd, 8))
-
-      let colorVec = mm_set1_epi32(cast[int32](rgbx))
-
-  proc source(rgbx: ColorRGBX, coverage: uint8): ColorRGBX {.inline.} =
-    if coverage == 0:
-      discard
-    elif coverage == 255:
-      result = rgbx
-    else:
-      result = rgbx(
-        ((rgbx.r.uint32 * coverage) div 255).uint8,
-        ((rgbx.g.uint32 * coverage) div 255).uint8,
-        ((rgbx.b.uint32 * coverage) div 255).uint8,
-        ((rgbx.a.uint32 * coverage) div 255).uint8
-      )
-
   case blendMode:
   of OverwriteBlend:
-    when allowSimd:
-      when defined(amd64):
-        for (coverageVec, allZeroes, all255) in simd(coverages, x, startX):
-          if allZeroes:
-            dataIndex += 16
-          else:
-            if all255:
-              for i in 0 ..< 4:
-                mm_storeu_si128(image.data[dataIndex].addr, colorVec)
-                dataIndex += 4
-            else:
-              var coverageVec = coverageVec
-              for i in 0 ..< 4:
-                let source = source(colorVec, coverageVec)
-                mm_storeu_si128(image.data[dataIndex].addr, source)
-                coverageVec = mm_srli_si128(coverageVec, 4)
-                dataIndex += 4
-
-    for x in x ..< startX + coverages.len:
-      let coverage = coverages[x - startX]
-      if coverage != 0:
-        image.data[dataIndex] = source(rgbx, coverage)
-      inc dataIndex
+    blendLineCoverageOverwrite(
+      image.getUncheckedArray(startX, y),
+      cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr),
+      rgbx,
+      coverages.len
+    )
 
   of NormalBlend:
-    when allowSimd:
-      when defined(amd64):
-        for (coverageVec, allZeroes, all255) in simd(coverages, x, startX):
-          if allZeroes:
-            dataIndex += 16
-          else:
-            if all255 and rgbx.a == 255:
-              for i in 0 ..< 4:
-                mm_storeu_si128(image.data[dataIndex].addr, colorVec)
-                dataIndex += 4
-            else:
-              var coverageVec = coverageVec
-              for i in 0 ..< 4:
-                let
-                  backdrop = mm_loadu_si128(image.data[dataIndex].addr)
-                  source = source(colorVec, coverageVec)
-                mm_storeu_si128(
-                  image.data[dataIndex].addr,
-                  blendNormalSimd(backdrop, source)
-                )
-                coverageVec = mm_srli_si128(coverageVec, 4)
-                dataIndex += 4
-
-    for x in x ..< startX + coverages.len:
-      let coverage = coverages[x - startX]
-      if coverage == 255 and rgbx.a == 255:
-        image.data[dataIndex] = rgbx
-      elif coverage == 0:
-        discard
-      else:
-        let backdrop = image.data[dataIndex]
-        image.data[dataIndex] = blendNormal(backdrop, source(rgbx, coverage))
-      inc dataIndex
+    blendLineCoverageNormal(
+      image.getUncheckedArray(startX, y),
+      cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr),
+      rgbx,
+      coverages.len
+    )
 
   of MaskBlend:
     {.linearScanEnd.}
-    when allowSimd:
-      when defined(amd64):
-        for (coverageVec, allZeroes, all255) in simd(coverages, x, startX):
-          if not allZeroes:
-            if all255:
-              dataIndex += 16
-            else:
-              var coverageVec = coverageVec
-              for i in 0 ..< 4:
-                let
-                  backdrop = mm_loadu_si128(image.data[dataIndex].addr)
-                  source = source(colorVec, coverageVec)
-                mm_storeu_si128(
-                  image.data[dataIndex].addr,
-                  blendMaskSimd(backdrop, source)
-                )
-                coverageVec = mm_srli_si128(coverageVec, 4)
-                dataIndex += 4
-          else:
-            for i in 0 ..< 4:
-              mm_storeu_si128(image.data[dataIndex].addr, mm_setzero_si128())
-              dataIndex += 4
-
-    for x in x ..< startX + coverages.len:
-      let coverage = coverages[x - startX]
-      if coverage == 0:
-        image.data[dataIndex] = rgbx(0, 0, 0, 0)
-      elif coverage == 255:
-        discard
-      else:
-        let backdrop = image.data[dataIndex]
-        image.data[dataIndex] = blendMask(backdrop, source(rgbx, coverage))
-      inc dataIndex
+    blendLineCoverageMask(
+      image.getUncheckedArray(startX, y),
+      cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr),
+      rgbx,
+      coverages.len
+    )
 
     image.clearUnsafe(0, y, startX, y)
     image.clearUnsafe(startX + coverages.len, y, image.width, y)
@@ -1593,7 +1519,7 @@ proc fillCoverage(
       let coverage = coverages[x - startX]
       if coverage != 0:
         let backdrop = image.data[dataIndex]
-        image.data[dataIndex] = blender(backdrop, source(rgbx, coverage))
+        image.data[dataIndex] = blender(backdrop, rgbx * coverage)
       inc dataIndex
 
 proc blendLineNormal(
diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim
index a60af01..4ddc87d 100644
--- a/src/pixie/simd/avx2.nim
+++ b/src/pixie/simd/avx2.nim
@@ -6,6 +6,41 @@ when defined(gcc) or defined(clang):
 when defined(release):
   {.push checks: off.}
 
+template blendNormalSimd(backdrop, source: M256i): M256i =
+  var
+    sourceAlpha = mm256_and_si256(source, alphaMask)
+    backdropEven = mm256_slli_epi16(backdrop, 8)
+    backdropOdd = mm256_and_si256(backdrop, oddMask)
+
+  sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
+
+  let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha)
+
+  backdropEven = mm256_mulhi_epu16(backdropEven, multiplier)
+  backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier)
+  backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
+  backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
+
+  mm256_add_epi8(
+    source,
+    mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
+  )
+
+template blendMaskSimd(backdrop, source: M256i): M256i =
+  var
+    sourceAlpha = mm256_and_si256(source, alphaMask)
+    backdropEven = mm256_slli_epi16(backdrop, 8)
+    backdropOdd = mm256_and_si256(backdrop, oddMask)
+
+  sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
+
+  backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha)
+  backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha)
+  backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
+  backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
+
+  mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
+
 proc isOneColorAvx2*(image: Image): bool {.simd.} =
   result = true
 
@@ -400,26 +435,7 @@ proc blendLineNormalAvx2*(
     )
   while i < len - 8:
     let backdrop = mm256_load_si256(line[i].addr)
-    var
-      sourceAlpha = mm256_and_si256(source, alphaMask)
-      backdropEven = mm256_slli_epi16(backdrop, 8)
-      backdropOdd = mm256_and_si256(backdrop, oddMask)
-
-    sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
-
-    let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha)
-
-    backdropEven = mm256_mulhi_epu16(backdropEven, multiplier)
-    backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier)
-    backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
-    backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
-
-    let added = mm256_add_epi8(
-      source,
-      mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
-    )
-
-    mm256_store_si256(line[i].addr, added)
+    mm256_store_si256(line[i].addr, blendNormalSimd(backdrop, source))
     i += 8
 
   for i in i ..< len:
@@ -451,27 +467,7 @@ proc blendLineNormalAvx2*(
       mm256_storeu_si256(a[i].addr, source)
     else:
       let backdrop = mm256_load_si256(a[i].addr)
-      var
-        sourceAlpha = mm256_and_si256(source, alphaMask)
-        backdropEven = mm256_slli_epi16(backdrop, 8)
-        backdropOdd = mm256_and_si256(backdrop, oddMask)
-
-      sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
-
-      let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha)
-
-      backdropEven = mm256_mulhi_epu16(backdropEven, multiplier)
-      backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier)
-      backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
-      backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
-
-      let added = mm256_add_epi8(
-        source,
-        mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
-      )
-
-      mm256_store_si256(a[i].addr, added)
-
+      mm256_store_si256(a[i].addr, blendNormalSimd(backdrop, source))
     i += 8
 
   for i in i ..< len:
@@ -496,22 +492,7 @@ proc blendLineMaskAvx2*(
     )
   while i < len - 8:
     let backdrop = mm256_load_si256(line[i].addr)
-    var
-      sourceAlpha = mm256_and_si256(source, alphaMask)
-      backdropEven = mm256_slli_epi16(backdrop, 8)
-      backdropOdd = mm256_and_si256(backdrop, oddMask)
-
-    sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
-
-    backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha)
-    backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha)
-    backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
-    backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
-
-    mm256_store_si256(
-      line[i].addr,
-      mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
-    )
+    mm256_store_si256(line[i].addr, blendMaskSimd(backdrop, source))
     i += 8
 
   for i in i ..< len:
@@ -542,23 +523,7 @@ proc blendLineMaskAvx2*(
       discard
     else:
       let backdrop = mm256_load_si256(a[i].addr)
-      var
-        sourceAlpha = mm256_and_si256(source, alphaMask)
-        backdropEven = mm256_slli_epi16(backdrop, 8)
-        backdropOdd = mm256_and_si256(backdrop, oddMask)
-
-      sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
-
-      backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha)
-      backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha)
-      backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
-      backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
-
-      mm256_store_si256(
-        a[i].addr,
-        mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
-      )
-
+      mm256_store_si256(a[i].addr, blendMaskSimd(backdrop, source))
     i += 8
 
   for i in i ..< len:
diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
index f2913ff..87b4bce 100644
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -10,17 +10,7 @@ proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
   finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
   cast[ColorRGBX](mm_cvtsi128_si32(finalColor))
 
-proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
-  ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value).
-  result = mm_unpacklo_epi8(mm_setzero_si128(), v)
-  result = mm_unpacklo_epi8(mm_setzero_si128(), result)
-
-proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} =
-  let
-    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
-    oddMask = mm_set1_epi16(cast[int16](0xff00))
-    div255 = mm_set1_epi16(cast[int16](0x8081))
-
+template blendNormalSimd*(backdrop, source: M128i): M128i =
   var
     sourceAlpha = mm_and_si128(source, alphaMask)
     backdropEven = mm_slli_epi16(backdrop, 8)
@@ -28,14 +18,10 @@ proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} =
 
   sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
 
-  let k = mm_sub_epi32(
-    mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])),
-    sourceAlpha
-  )
-
-  backdropEven = mm_mulhi_epu16(backdropEven, k)
-  backdropOdd = mm_mulhi_epu16(backdropOdd, k)
+  let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha)
 
+  backdropEven = mm_mulhi_epu16(backdropEven, multiplier)
+  backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier)
   backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
   backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
 
@@ -44,12 +30,7 @@ proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} =
     mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
   )
 
-proc blendMaskSimd*(backdrop, source: M128i): M128i {.inline.} =
-  let
-    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
-    oddMask = mm_set1_epi16(cast[int16](0xff00))
-    div255 = mm_set1_epi16(cast[int16](0x8081))
-
+template blendMaskSimd*(backdrop, source: M128i): M128i =
   var
     sourceAlpha = mm_and_si128(source, alphaMask)
     backdropEven = mm_slli_epi16(backdrop, 8)
@@ -59,7 +40,6 @@ proc blendMaskSimd*(backdrop, source: M128i): M128i {.inline.} =
 
   backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha)
   backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha)
-
   backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
   backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
 
@@ -527,6 +507,68 @@ proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
         result.width * 4
       )
 
+proc applyCoverage*(rgbxVec, coverage: M128i): M128i {.inline.} =
+  ## Multiplies 4 pixels in rgbxVec by the first 4 coverage bytes, div 255.
+  proc unpackAlphaValues(v: M128i): M128i {.inline.} =
+    ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value).
+    result = mm_unpacklo_epi8(mm_setzero_si128(), v)
+    result = mm_unpacklo_epi8(mm_setzero_si128(), result)
+
+  let
+    oddMask = mm_set1_epi16(0xff00)
+    div255 = mm_set1_epi16(0x8081)
+
+  var unpacked = unpackAlphaValues(coverage)
+  unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
+
+  var
+    rgbxEven = mm_slli_epi16(rgbxVec, 8)
+    rgbxOdd = mm_and_si128(rgbxVec, oddMask)
+  rgbxEven = mm_mulhi_epu16(rgbxEven, unpacked)
+  rgbxOdd = mm_mulhi_epu16(rgbxOdd, unpacked)
+  rgbxEven = mm_srli_epi16(mm_mulhi_epu16(rgbxEven, div255), 7)
+  rgbxOdd = mm_srli_epi16(mm_mulhi_epu16(rgbxOdd, div255), 7)
+
+  mm_or_si128(rgbxEven, mm_slli_epi16(rgbxOdd, 8))
+
+proc blendLineCoverageOverwriteSse2*(
+  line: ptr UncheckedArray[ColorRGBX],
+  coverages: ptr UncheckedArray[uint8],
+  rgbx: ColorRGBX,
+  len: int
+  ) {.simd.} =
+  ## Overwrites pixels with non-zero coverage, 16 coverages per SSE2 step.
+  var i: int
+  while i < len and (cast[uint](line[i].addr) and 15) != 0:
+    let coverage = coverages[i]
+    if coverage != 0:
+      line[i] = rgbx * coverage
+    inc i
+
+  let rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
+  while i < len - 16:
+    let
+      coverage = mm_loadu_si128(coverages[i].addr)
+      eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128())
+      eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255))
+    if mm_movemask_epi8(eqZero) == 0xffff:
+      i += 16
+    elif mm_movemask_epi8(eq255) == 0xffff:
+      for _ in 0 ..< 4:
+        mm_store_si128(line[i].addr, rgbxVec)
+        i += 4
+    else:
+      var coverage = coverage
+      for _ in 0 ..< 4:
+        mm_storeu_si128(line[i].addr, rgbxVec.applyCoverage(coverage))
+        coverage = mm_srli_si128(coverage, 4)
+        i += 4
+
+  for i in i ..< len:
+    let coverage = coverages[i]
+    if coverage != 0:
+      line[i] = rgbx * coverage
+
 proc blendLineNormalSse2*(
   line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
 ) {.simd.} =
@@ -543,26 +585,7 @@ proc blendLineNormalSse2*(
     vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))
   while i < len - 4:
     let backdrop = mm_load_si128(line[i].addr)
-    var
-      sourceAlpha = mm_and_si128(source, alphaMask)
-      backdropEven = mm_slli_epi16(backdrop, 8)
-      backdropOdd = mm_and_si128(backdrop, oddMask)
-
-    sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
-
-    let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha)
-
-    backdropEven = mm_mulhi_epu16(backdropEven, multiplier)
-    backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier)
-    backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
-    backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
-
-    let added = mm_add_epi8(
-      source,
-      mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
-    )
-
-    mm_store_si128(line[i].addr, added)
+    mm_store_si128(line[i].addr, blendNormalSimd(backdrop, source))
     i += 4
 
   for i in i ..< len:
@@ -590,32 +613,66 @@ proc blendLineNormalSse2*(
       mm_storeu_si128(a[i].addr, source)
     else:
       let backdrop = mm_load_si128(a[i].addr)
-      var
-        sourceAlpha = mm_and_si128(source, alphaMask)
-        backdropEven = mm_slli_epi16(backdrop, 8)
-        backdropOdd = mm_and_si128(backdrop, oddMask)
-
-      sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
-
-      let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha)
-
-      backdropEven = mm_mulhi_epu16(backdropEven, multiplier)
-      backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier)
-      backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
-      backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
-
-      let added = mm_add_epi8(
-        source,
-        mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
-      )
-
-      mm_store_si128(a[i].addr, added)
-
+      mm_store_si128(a[i].addr, blendNormalSimd(backdrop, source))
     i += 4
 
   for i in i ..< len:
     a[i] = blendNormal(a[i], b[i])
 
+proc blendLineCoverageNormalSse2*(
+  line: ptr UncheckedArray[ColorRGBX],
+  coverages: ptr UncheckedArray[uint8],
+  rgbx: ColorRGBX,
+  len: int
+) {.simd.} =
+  ## Normal-blends rgbx * coverage onto line, 16 coverages per SSE2 step.
+  var i: int
+  while i < len and (cast[uint](line[i].addr) and 15) != 0:
+    let coverage = coverages[i]
+    if coverage == 255 and rgbx.a == 255:
+      line[i] = rgbx
+    elif coverage == 0:
+      discard
+    else:
+      line[i] = blendNormal(line[i], rgbx * coverage)
+    inc i
+
+  let
+    rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
+    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
+    oddMask = mm_set1_epi16(cast[int16](0xff00))
+    div255 = mm_set1_epi16(cast[int16](0x8081))
+    vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))
+  while i < len - 16:
+    let
+      coverage = mm_loadu_si128(coverages[i].addr)
+      eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128())
+      eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255))
+    if mm_movemask_epi8(eqZero) == 0xffff:
+      i += 16
+    elif mm_movemask_epi8(eq255) == 0xffff and rgbx.a == 255:
+      for _ in 0 ..< 4:
+        mm_store_si128(line[i].addr, rgbxVec)
+        i += 4
+    else:
+      var coverage = coverage
+      for _ in 0 ..< 4:
+        let
+          backdrop = mm_loadu_si128(line[i].addr)
+          source = rgbxVec.applyCoverage(coverage)
+        mm_storeu_si128(line[i].addr, blendNormalSimd(backdrop, source))
+        coverage = mm_srli_si128(coverage, 4)
+        i += 4
+
+  for i in i ..< len:
+    let coverage = coverages[i]
+    if coverage == 255 and rgbx.a == 255:
+      line[i] = rgbx
+    elif coverage == 0:
+      discard
+    else:
+      line[i] = blendNormal(line[i], rgbx * coverage)
+
 proc blendLineMaskSse2*(
   line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
 ) {.simd.} =
@@ -631,22 +688,7 @@ proc blendLineMaskSse2*(
     div255 = mm_set1_epi16(cast[int16](0x8081))
   while i < len - 4:
     let backdrop = mm_load_si128(line[i].addr)
-    var
-      sourceAlpha = mm_and_si128(source, alphaMask)
-      backdropEven = mm_slli_epi16(backdrop, 8)
-      backdropOdd = mm_and_si128(backdrop, oddMask)
-
-    sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
-
-    backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha)
-    backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha)
-    backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
-    backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
-
-    mm_store_si128(
-      line[i].addr,
-      mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
-    )
+    mm_store_si128(line[i].addr, blendMaskSimd(backdrop, source))
     i += 4
 
   for i in i ..< len:
@@ -673,27 +715,64 @@ proc blendLineMaskSse2*(
       discard
     else:
       let backdrop = mm_load_si128(a[i].addr)
-      var
-        sourceAlpha = mm_and_si128(source, alphaMask)
-        backdropEven = mm_slli_epi16(backdrop, 8)
-        backdropOdd = mm_and_si128(backdrop, oddMask)
-
-      sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
-
-      backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha)
-      backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha)
-      backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
-      backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
-
-      mm_store_si128(
-        a[i].addr,
-        mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
-      )
-
+      mm_store_si128(a[i].addr, blendMaskSimd(backdrop, source))
     i += 4
 
   for i in i ..< len:
     a[i] = blendMask(a[i], b[i])
 
+proc blendLineCoverageMaskSse2*(
+  line: ptr UncheckedArray[ColorRGBX],
+  coverages: ptr UncheckedArray[uint8],
+  rgbx: ColorRGBX,
+  len: int
+) {.simd.} =
+  ## Mask-blends rgbx * coverage onto line, 16 coverages per SSE2 step.
+  var i: int
+  while i < len and (cast[uint](line[i].addr) and 15) != 0:
+    let coverage = coverages[i]
+    if coverage == 0:
+      line[i] = rgbx(0, 0, 0, 0)
+    elif coverage == 255 and rgbx.a == 255: # identity only when opaque
+      discard
+    else:
+      line[i] = blendMask(line[i], rgbx * coverage)
+    inc i
+
+  let
+    rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
+    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
+    oddMask = mm_set1_epi16(cast[int16](0xff00))
+    div255 = mm_set1_epi16(cast[int16](0x8081))
+  while i < len - 16:
+    let
+      coverage = mm_loadu_si128(coverages[i].addr)
+      eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128())
+      eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255))
+    if mm_movemask_epi8(eqZero) == 0xffff:
+      for _ in 0 ..< 4:
+        mm_store_si128(line[i].addr, mm_setzero_si128())
+        i += 4
+    elif mm_movemask_epi8(eq255) == 0xffff and rgbx.a == 255:
+      i += 16
+    else:
+      var coverage = coverage
+      for _ in 0 ..< 4:
+        let
+          backdrop = mm_loadu_si128(line[i].addr)
+          source = rgbxVec.applyCoverage(coverage)
+        mm_storeu_si128(line[i].addr, blendMaskSimd(backdrop, source))
+        coverage = mm_srli_si128(coverage, 4)
+        i += 4
+
+  for i in i ..< len:
+    let coverage = coverages[i]
+    if coverage == 0:
+      line[i] = rgbx(0, 0, 0, 0)
+    elif coverage == 255 and rgbx.a == 255: # identity only when opaque
+      discard
+    else:
+      line[i] = blendMask(line[i], rgbx * coverage)
+
 when defined(release):
   {.pop.}