This commit is contained in:
Ryan Oldenburg 2022-07-31 15:08:27 -05:00
parent e0cb5c2b11
commit 24b36b077e
4 changed files with 287 additions and 310 deletions

View file

@ -76,6 +76,19 @@ proc `*`*(color: ColorRGBX, opacity: float32): ColorRGBX {.raises: [].} =
a = ((color.a * x + 127) div 255).uint8
rgbx(r, g, b, a)
proc `*`*(rgbx: ColorRGBX, coverage: uint8): ColorRGBX {.inline.} =
  ## Scales every channel of `rgbx` by `coverage / 255`.
  ##
  ## * coverage 0 returns the zero-initialized color (all channels 0),
  ## * coverage 255 returns `rgbx` unchanged,
  ## * anything else multiplies each channel by `coverage`, adds 127 and
  ##   divides by 255, so the result is rounded instead of truncated.
  case coverage
  of 0:
    discard # leave `result` at its zero-initialized value
  of 255:
    result = rgbx
  else:
    # Widen to uint32 so channel * coverage + 127 cannot wrap.
    template scaled(channel: uint8): uint8 =
      ((channel.uint32 * coverage + 127) div 255).uint8

    result = rgbx(
      scaled(rgbx.r),
      scaled(rgbx.g),
      scaled(rgbx.b),
      scaled(rgbx.a)
    )
proc snapToPixels*(rect: Rect): Rect {.raises: [].} =
let
xMin = rect.x

View file

@ -1429,6 +1429,47 @@ proc clearUnsafe(image: Image, startX, startY, toX, toY: int) =
len = image.dataIndex(toX, toY) - start
fillUnsafe(image.data, rgbx(0, 0, 0, 0), start, len)
proc blendLineCoverageOverwrite(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.hasSimd.} =
  ## Scalar coverage fill that overwrites pixels.
  ##
  ## For each of the first `len` entries, a pixel whose coverage is non-zero
  ## is replaced by `rgbx * coverage`; pixels with zero coverage are left
  ## untouched. `line` and `coverages` are indexed in lock-step.
  for idx in 0 ..< len:
    let amount = coverages[idx]
    if amount == 0:
      continue # nothing covers this pixel, keep what is there
    line[idx] = rgbx * amount
proc blendLineCoverageNormal(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.hasSimd.} =
  ## Scalar coverage fill using normal blending.
  ##
  ## For each of the first `len` entries:
  ## * coverage 0 leaves the pixel untouched,
  ## * coverage 255 with a fully opaque `rgbx` (alpha 255) stores `rgbx`
  ##   directly,
  ## * every other case blends `rgbx * coverage` over the existing pixel
  ##   with `blendNormal`.
  let opaque = rgbx.a == 255
  for idx in 0 ..< len:
    let amount = coverages[idx]
    if amount == 0:
      continue
    if opaque and amount == 255:
      # Fully covered by an opaque color: the backdrop cannot show through.
      line[idx] = rgbx
    else:
      line[idx] = blendNormal(line[idx], rgbx * amount)
proc blendLineCoverageMask(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.hasSimd.} =
  ## Scalar coverage fill using mask blending.
  ##
  ## For each of the first `len` entries:
  ## * coverage 0 clears the pixel to `rgbx(0, 0, 0, 0)`,
  ## * coverage 255 with a fully opaque `rgbx` (alpha 255) leaves the pixel
  ##   untouched,
  ## * every other case stores `blendMask(pixel, rgbx * coverage)`.
  ##
  ## A fully covered pixel is only skipped when `rgbx` is opaque. With a
  ## translucent `rgbx` the pixel still has to go through `blendMask`, which
  ## is what the SSE2 vector path does (it only skips a chunk when
  ## `rgbx.a == 255`) and what the partial-coverage branch converges to as
  ## coverage approaches 255.
  let opaque = rgbx.a == 255
  for i in 0 ..< len:
    let coverage = coverages[i]
    if coverage == 0:
      line[i] = rgbx(0, 0, 0, 0)
    elif coverage == 255 and opaque:
      discard
    else:
      # `rgbx * 255` returns `rgbx` itself, so this also covers the
      # fully-covered, translucent case.
      line[i] = blendMask(line[i], rgbx * coverage)
proc fillCoverage(
image: Image,
rgbx: ColorRGBX,
@ -1440,149 +1481,31 @@ proc fillCoverage(
x = startX
dataIndex = image.dataIndex(x, y)
when allowSimd:
when defined(amd64):
iterator simd(
coverages: seq[uint8], x: var int, startX: int
): (M128i, bool, bool) =
for _ in 0 ..< coverages.len div 16:
let
coverageVec = mm_loadu_si128(coverages[x - startX].unsafeAddr)
eqZero = mm_cmpeq_epi8(coverageVec, mm_setzero_si128())
eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(255))
allZeroes = mm_movemask_epi8(eqZero) == 0xffff
all255 = mm_movemask_epi8(eq255) == 0xffff
yield (coverageVec, allZeroes, all255)
x += 16
proc source(colorVec, coverageVec: M128i): M128i {.inline.} =
let
oddMask = mm_set1_epi16(0xff00)
div255 = mm_set1_epi16(0x8081)
var unpacked = unpackAlphaValues(coverageVec)
unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
var
sourceEven = mm_slli_epi16(colorVec, 8)
sourceOdd = mm_and_si128(colorVec, oddMask)
sourceEven = mm_mulhi_epu16(sourceEven, unpacked)
sourceOdd = mm_mulhi_epu16(sourceOdd, unpacked)
sourceEven = mm_srli_epi16(mm_mulhi_epu16(sourceEven, div255), 7)
sourceOdd = mm_srli_epi16(mm_mulhi_epu16(sourceOdd, div255), 7)
result = mm_or_si128(sourceEven, mm_slli_epi16(sourceOdd, 8))
let colorVec = mm_set1_epi32(cast[int32](rgbx))
proc source(rgbx: ColorRGBX, coverage: uint8): ColorRGBX {.inline.} =
if coverage == 0:
discard
elif coverage == 255:
result = rgbx
else:
result = rgbx(
((rgbx.r.uint32 * coverage) div 255).uint8,
((rgbx.g.uint32 * coverage) div 255).uint8,
((rgbx.b.uint32 * coverage) div 255).uint8,
((rgbx.a.uint32 * coverage) div 255).uint8
)
case blendMode:
of OverwriteBlend:
when allowSimd:
when defined(amd64):
for (coverageVec, allZeroes, all255) in simd(coverages, x, startX):
if allZeroes:
dataIndex += 16
else:
if all255:
for i in 0 ..< 4:
mm_storeu_si128(image.data[dataIndex].addr, colorVec)
dataIndex += 4
else:
var coverageVec = coverageVec
for i in 0 ..< 4:
let source = source(colorVec, coverageVec)
mm_storeu_si128(image.data[dataIndex].addr, source)
coverageVec = mm_srli_si128(coverageVec, 4)
dataIndex += 4
for x in x ..< startX + coverages.len:
let coverage = coverages[x - startX]
if coverage != 0:
image.data[dataIndex] = source(rgbx, coverage)
inc dataIndex
blendLineCoverageOverwrite(
image.getUncheckedArray(startX, y),
cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr),
rgbx,
coverages.len
)
of NormalBlend:
when allowSimd:
when defined(amd64):
for (coverageVec, allZeroes, all255) in simd(coverages, x, startX):
if allZeroes:
dataIndex += 16
else:
if all255 and rgbx.a == 255:
for i in 0 ..< 4:
mm_storeu_si128(image.data[dataIndex].addr, colorVec)
dataIndex += 4
else:
var coverageVec = coverageVec
for i in 0 ..< 4:
let
backdrop = mm_loadu_si128(image.data[dataIndex].addr)
source = source(colorVec, coverageVec)
mm_storeu_si128(
image.data[dataIndex].addr,
blendNormalSimd(backdrop, source)
)
coverageVec = mm_srli_si128(coverageVec, 4)
dataIndex += 4
for x in x ..< startX + coverages.len:
let coverage = coverages[x - startX]
if coverage == 255 and rgbx.a == 255:
image.data[dataIndex] = rgbx
elif coverage == 0:
discard
else:
let backdrop = image.data[dataIndex]
image.data[dataIndex] = blendNormal(backdrop, source(rgbx, coverage))
inc dataIndex
blendLineCoverageNormal(
image.getUncheckedArray(startX, y),
cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr),
rgbx,
coverages.len
)
of MaskBlend:
{.linearScanEnd.}
when allowSimd:
when defined(amd64):
for (coverageVec, allZeroes, all255) in simd(coverages, x, startX):
if not allZeroes:
if all255:
dataIndex += 16
else:
var coverageVec = coverageVec
for i in 0 ..< 4:
let
backdrop = mm_loadu_si128(image.data[dataIndex].addr)
source = source(colorVec, coverageVec)
mm_storeu_si128(
image.data[dataIndex].addr,
blendMaskSimd(backdrop, source)
)
coverageVec = mm_srli_si128(coverageVec, 4)
dataIndex += 4
else:
for i in 0 ..< 4:
mm_storeu_si128(image.data[dataIndex].addr, mm_setzero_si128())
dataIndex += 4
for x in x ..< startX + coverages.len:
let coverage = coverages[x - startX]
if coverage == 0:
image.data[dataIndex] = rgbx(0, 0, 0, 0)
elif coverage == 255:
discard
else:
let backdrop = image.data[dataIndex]
image.data[dataIndex] = blendMask(backdrop, source(rgbx, coverage))
inc dataIndex
blendLineCoverageMask(
image.getUncheckedArray(startX, y),
cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr),
rgbx,
coverages.len
)
image.clearUnsafe(0, y, startX, y)
image.clearUnsafe(startX + coverages.len, y, image.width, y)
@ -1593,7 +1516,7 @@ proc fillCoverage(
let coverage = coverages[x - startX]
if coverage != 0:
let backdrop = image.data[dataIndex]
image.data[dataIndex] = blender(backdrop, source(rgbx, coverage))
image.data[dataIndex] = blender(backdrop, rgbx * coverage)
inc dataIndex
proc blendLineNormal(

View file

@ -6,6 +6,41 @@ when defined(gcc) or defined(clang):
when defined(release):
{.push checks: off.}
template blendNormalSimd(backdrop, source: M256i): M256i =
  ## Normal-blends `source` over `backdrop` and yields the blended vector.
  ##
  ## The template does not declare `alphaMask`, `oddMask`, `shuffleControl`,
  ## `vecAlpha255` or `div255`; they are resolved at the expansion site, so
  ## callers must have vectors with exactly these names in scope.
  ## NOTE(review): their values are not visible here - presumably the same
  ## constants the blendLine*Avx2 procs already define; confirm.
  let
    # Isolate the source alpha and spread it with the caller's shuffle.
    spreadAlpha = mm256_shuffle_epi8(
      mm256_and_si256(source, alphaMask),
      shuffleControl
    )
    keepFactor = mm256_sub_epi32(vecAlpha255, spreadAlpha)
    # Even and odd bytes are processed separately as 16-bit lanes.
    evenScaled = mm256_mulhi_epu16(mm256_slli_epi16(backdrop, 8), keepFactor)
    oddScaled = mm256_mulhi_epu16(
      mm256_and_si256(backdrop, oddMask),
      keepFactor
    )
    evenOut = mm256_srli_epi16(mm256_mulhi_epu16(evenScaled, div255), 7)
    oddOut = mm256_srli_epi16(mm256_mulhi_epu16(oddScaled, div255), 7)
  mm256_add_epi8(
    source,
    mm256_or_si256(evenOut, mm256_slli_epi16(oddOut, 8))
  )
template blendMaskSimd(backdrop, source: M256i): M256i =
  ## Mask-blends `backdrop` by the alpha of `source` and yields the result.
  ##
  ## The template does not declare `alphaMask`, `oddMask`, `shuffleControl`
  ## or `div255`; they are resolved at the expansion site, so callers must
  ## have vectors with exactly these names in scope.
  ## NOTE(review): their values are not visible here - presumably the same
  ## constants the blendLine*Avx2 procs already define; confirm.
  let
    # Isolate the source alpha and spread it with the caller's shuffle.
    spreadAlpha = mm256_shuffle_epi8(
      mm256_and_si256(source, alphaMask),
      shuffleControl
    )
    # Even and odd bytes are processed separately as 16-bit lanes.
    evenScaled = mm256_mulhi_epu16(mm256_slli_epi16(backdrop, 8), spreadAlpha)
    oddScaled = mm256_mulhi_epu16(
      mm256_and_si256(backdrop, oddMask),
      spreadAlpha
    )
    evenOut = mm256_srli_epi16(mm256_mulhi_epu16(evenScaled, div255), 7)
    oddOut = mm256_srli_epi16(mm256_mulhi_epu16(oddScaled, div255), 7)
  mm256_or_si256(evenOut, mm256_slli_epi16(oddOut, 8))
proc isOneColorAvx2*(image: Image): bool {.simd.} =
result = true
@ -400,26 +435,7 @@ proc blendLineNormalAvx2*(
)
while i < len - 8:
let backdrop = mm256_load_si256(line[i].addr)
var
sourceAlpha = mm256_and_si256(source, alphaMask)
backdropEven = mm256_slli_epi16(backdrop, 8)
backdropOdd = mm256_and_si256(backdrop, oddMask)
sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha)
backdropEven = mm256_mulhi_epu16(backdropEven, multiplier)
backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier)
backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
let added = mm256_add_epi8(
source,
mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
)
mm256_store_si256(line[i].addr, added)
mm256_store_si256(line[i].addr, blendNormalSimd(backdrop, source))
i += 8
for i in i ..< len:
@ -451,27 +467,7 @@ proc blendLineNormalAvx2*(
mm256_storeu_si256(a[i].addr, source)
else:
let backdrop = mm256_load_si256(a[i].addr)
var
sourceAlpha = mm256_and_si256(source, alphaMask)
backdropEven = mm256_slli_epi16(backdrop, 8)
backdropOdd = mm256_and_si256(backdrop, oddMask)
sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha)
backdropEven = mm256_mulhi_epu16(backdropEven, multiplier)
backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier)
backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
let added = mm256_add_epi8(
source,
mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
)
mm256_store_si256(a[i].addr, added)
mm256_store_si256(a[i].addr, blendNormalSimd(backdrop, source))
i += 8
for i in i ..< len:
@ -496,22 +492,7 @@ proc blendLineMaskAvx2*(
)
while i < len - 8:
let backdrop = mm256_load_si256(line[i].addr)
var
sourceAlpha = mm256_and_si256(source, alphaMask)
backdropEven = mm256_slli_epi16(backdrop, 8)
backdropOdd = mm256_and_si256(backdrop, oddMask)
sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha)
backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha)
backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
mm256_store_si256(
line[i].addr,
mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
)
mm256_store_si256(line[i].addr, blendMaskSimd(backdrop, source))
i += 8
for i in i ..< len:
@ -542,23 +523,7 @@ proc blendLineMaskAvx2*(
discard
else:
let backdrop = mm256_load_si256(a[i].addr)
var
sourceAlpha = mm256_and_si256(source, alphaMask)
backdropEven = mm256_slli_epi16(backdrop, 8)
backdropOdd = mm256_and_si256(backdrop, oddMask)
sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha)
backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha)
backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
mm256_store_si256(
a[i].addr,
mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
)
mm256_store_si256(a[i].addr, blendMaskSimd(backdrop, source))
i += 8
for i in i ..< len:

View file

@ -10,17 +10,7 @@ proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
cast[ColorRGBX](mm_cvtsi128_si32(finalColor))
proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
  ## Spreads the lowest four bytes of `v` over the four 32-bit lanes of the
  ## result, one byte per lane, each placed in the lane's highest byte with
  ## the other three bytes zero - i.e. four rgba(0, 0, 0, value) pixels.
  let zero = mm_setzero_si128()
  # Each interleave with zero doubles the spacing between the input bytes.
  mm_unpacklo_epi8(zero, mm_unpacklo_epi8(zero, v))
proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} =
let
alphaMask = mm_set1_epi32(cast[int32](0xff000000))
oddMask = mm_set1_epi16(cast[int16](0xff00))
div255 = mm_set1_epi16(cast[int16](0x8081))
template blendNormalSimd*(backdrop, source: M128i): M128i =
var
sourceAlpha = mm_and_si128(source, alphaMask)
backdropEven = mm_slli_epi16(backdrop, 8)
@ -28,14 +18,10 @@ proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} =
sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
let k = mm_sub_epi32(
mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])),
sourceAlpha
)
backdropEven = mm_mulhi_epu16(backdropEven, k)
backdropOdd = mm_mulhi_epu16(backdropOdd, k)
let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha)
backdropEven = mm_mulhi_epu16(backdropEven, multiplier)
backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier)
backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
@ -44,12 +30,7 @@ proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} =
mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
)
proc blendMaskSimd*(backdrop, source: M128i): M128i {.inline.} =
let
alphaMask = mm_set1_epi32(cast[int32](0xff000000))
oddMask = mm_set1_epi16(cast[int16](0xff00))
div255 = mm_set1_epi16(cast[int16](0x8081))
template blendMaskSimd*(backdrop, source: M128i): M128i =
var
sourceAlpha = mm_and_si128(source, alphaMask)
backdropEven = mm_slli_epi16(backdrop, 8)
@ -59,7 +40,6 @@ proc blendMaskSimd*(backdrop, source: M128i): M128i {.inline.} =
backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha)
backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha)
backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
@ -527,6 +507,67 @@ proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
result.width * 4
)
proc applyCoverage*(rgbxVec, coverage: M128i): M128i {.inline.} =
  ## Scales four packed pixels in `rgbxVec` by the first four coverage bytes
  ## of `coverage` (byte 0 applies to pixel 0, and so on). Each channel is
  ## multiplied by its pixel's coverage and divided by 255.
  ##
  ## Unlike the scalar `rgbx * coverage`, no 127 is added before the
  ## division, so results are truncated rather than rounded.
  let
    zero = mm_setzero_si128()
    # Two interleaves with zero put each of the first four coverage bytes in
    # the top byte of its own 32-bit lane...
    perPixel = mm_unpacklo_epi8(zero, mm_unpacklo_epi8(zero, coverage))
    # ...and this copies it into the lower 16-bit half as well, so both
    # 16-bit lanes of a pixel carry `coverage shl 8`.
    factor = mm_or_si128(perPixel, mm_srli_epi32(perPixel, 16))
    oddMask = mm_set1_epi16(0xff00)
    div255 = mm_set1_epi16(0x8081)
    # Even and odd channel bytes are handled separately as 16-bit lanes;
    # the high half of (channel shl 8) * (coverage shl 8) is
    # channel * coverage.
    evenProduct = mm_mulhi_epu16(mm_slli_epi16(rgbxVec, 8), factor)
    oddProduct = mm_mulhi_epu16(mm_and_si128(rgbxVec, oddMask), factor)
    # mulhi by 0x8081 followed by a 7-bit shift divides by 255.
    evenOut = mm_srli_epi16(mm_mulhi_epu16(evenProduct, div255), 7)
    oddOut = mm_srli_epi16(mm_mulhi_epu16(oddProduct, div255), 7)
  mm_or_si128(evenOut, mm_slli_epi16(oddOut, 8))
proc blendLineCoverageOverwriteSse2*(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.simd.} =
  ## SSE2 coverage fill that overwrites pixels.
  ##
  ## `line` and `coverages` are indexed in lock-step for `len` entries.
  ## Pixels with non-zero coverage are replaced by `rgbx` scaled by that
  ## coverage. Work is split into a scalar prologue (until `line[i]` is
  ## 16-byte aligned), a vector loop handling 16 pixels per iteration, and a
  ## scalar tail.
  var i: int
  # Scalar prologue. The `i < len` bound is required: without it a short run
  # that starts on an unaligned address would keep going past `len` until an
  # aligned pixel is reached, reading and writing out of bounds.
  while i < len and (cast[uint](line[i].addr) and 15) != 0:
    let coverage = coverages[i]
    if coverage != 0:
      line[i] = rgbx * coverage
    inc i

  let rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
  while i < len - 16:
    let
      coverage = mm_loadu_si128(coverages[i].addr)
      eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128())
      eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255))
    if mm_movemask_epi8(eqZero) == 0xffff:
      # All 16 coverages are zero: nothing to write.
      i += 16
    elif mm_movemask_epi8(eq255) == 0xffff:
      # All 16 pixels fully covered: store the color as-is, 4 pixels at a
      # time. `line[i]` is 16-byte aligned here, so aligned stores are fine.
      for _ in 0 ..< 4:
        mm_store_si128(line[i].addr, rgbxVec)
        i += 4
    else:
      # Mixed chunk: applyCoverage consumes the lowest 4 coverage bytes, so
      # shift the next 4 down after every store.
      # NOTE(review): pixels whose coverage is 0 inside a mixed chunk are
      # still stored (with the applyCoverage result), whereas the scalar
      # loops leave them untouched - confirm callers do not rely on either.
      var coverage = coverage
      for _ in 0 ..< 4:
        mm_storeu_si128(line[i].addr, rgbxVec.applyCoverage(coverage))
        coverage = mm_srli_si128(coverage, 4)
        i += 4

  # Scalar tail for whatever the vector loop did not consume.
  for i in i ..< len:
    let coverage = coverages[i]
    if coverage != 0:
      line[i] = rgbx * coverage
proc blendLineNormalSse2*(
line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
) {.simd.} =
@ -543,26 +584,7 @@ proc blendLineNormalSse2*(
vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))
while i < len - 4:
let backdrop = mm_load_si128(line[i].addr)
var
sourceAlpha = mm_and_si128(source, alphaMask)
backdropEven = mm_slli_epi16(backdrop, 8)
backdropOdd = mm_and_si128(backdrop, oddMask)
sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha)
backdropEven = mm_mulhi_epu16(backdropEven, multiplier)
backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier)
backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
let added = mm_add_epi8(
source,
mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
)
mm_store_si128(line[i].addr, added)
mm_store_si128(line[i].addr, blendNormalSimd(backdrop, source))
i += 4
for i in i ..< len:
@ -590,32 +612,65 @@ proc blendLineNormalSse2*(
mm_storeu_si128(a[i].addr, source)
else:
let backdrop = mm_load_si128(a[i].addr)
var
sourceAlpha = mm_and_si128(source, alphaMask)
backdropEven = mm_slli_epi16(backdrop, 8)
backdropOdd = mm_and_si128(backdrop, oddMask)
sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha)
backdropEven = mm_mulhi_epu16(backdropEven, multiplier)
backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier)
backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
let added = mm_add_epi8(
source,
mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
)
mm_store_si128(a[i].addr, added)
mm_store_si128(a[i].addr, blendNormalSimd(backdrop, source))
i += 4
for i in i ..< len:
a[i] = blendNormal(a[i], b[i])
proc blendLineCoverageNormalSse2*(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.simd.} =
  ## SSE2 coverage fill using normal blending.
  ##
  ## `line` and `coverages` are indexed in lock-step for `len` entries.
  ## Coverage 0 leaves a pixel untouched, coverage 255 with an opaque `rgbx`
  ## stores `rgbx` directly, and everything else blends `rgbx` scaled by the
  ## coverage over the existing pixel. Work is split into a scalar prologue
  ## (until `line[i]` is 16-byte aligned), a vector loop handling 16 pixels
  ## per iteration, and a scalar tail.
  var i: int
  # Scalar prologue. The `i < len` bound is required: without it a short run
  # that starts on an unaligned address would keep going past `len` until an
  # aligned pixel is reached, reading and writing out of bounds.
  while i < len and (cast[uint](line[i].addr) and 15) != 0:
    let coverage = coverages[i]
    if coverage == 255 and rgbx.a == 255:
      line[i] = rgbx
    elif coverage == 0:
      discard
    else:
      line[i] = blendNormal(line[i], rgbx * coverage)
    inc i

  # alphaMask, oddMask, div255 and vecAlpha255 look unused in this proc but
  # appear to be picked up by name inside the blendNormalSimd template, so
  # they must stay in scope under exactly these names.
  let
    rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
    oddMask = mm_set1_epi16(cast[int16](0xff00))
    div255 = mm_set1_epi16(cast[int16](0x8081))
    vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))
  while i < len - 16:
    let
      coverage = mm_loadu_si128(coverages[i].addr)
      eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128())
      eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255))
    if mm_movemask_epi8(eqZero) == 0xffff:
      # All 16 coverages are zero: nothing to blend.
      i += 16
    elif mm_movemask_epi8(eq255) == 0xffff and rgbx.a == 255:
      # Fully covered by an opaque color: store it directly. `line[i]` is
      # 16-byte aligned here, so aligned stores are fine.
      for _ in 0 ..< 4:
        mm_store_si128(line[i].addr, rgbxVec)
        i += 4
    else:
      # Mixed chunk: applyCoverage consumes the lowest 4 coverage bytes, so
      # shift the next 4 down after every blend.
      var coverage = coverage
      for _ in 0 ..< 4:
        let
          backdrop = mm_loadu_si128(line[i].addr)
          source = rgbxVec.applyCoverage(coverage)
        mm_storeu_si128(line[i].addr, blendNormalSimd(backdrop, source))
        coverage = mm_srli_si128(coverage, 4)
        i += 4

  # Scalar tail for whatever the vector loop did not consume.
  for i in i ..< len:
    let coverage = coverages[i]
    if coverage == 255 and rgbx.a == 255:
      line[i] = rgbx
    elif coverage == 0:
      discard
    else:
      line[i] = blendNormal(line[i], rgbx * coverage)
proc blendLineMaskSse2*(
line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
) {.simd.} =
@ -631,22 +686,7 @@ proc blendLineMaskSse2*(
div255 = mm_set1_epi16(cast[int16](0x8081))
while i < len - 4:
let backdrop = mm_load_si128(line[i].addr)
var
sourceAlpha = mm_and_si128(source, alphaMask)
backdropEven = mm_slli_epi16(backdrop, 8)
backdropOdd = mm_and_si128(backdrop, oddMask)
sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha)
backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha)
backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
mm_store_si128(
line[i].addr,
mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
)
mm_store_si128(line[i].addr, blendMaskSimd(backdrop, source))
i += 4
for i in i ..< len:
@ -673,27 +713,63 @@ proc blendLineMaskSse2*(
discard
else:
let backdrop = mm_load_si128(a[i].addr)
var
sourceAlpha = mm_and_si128(source, alphaMask)
backdropEven = mm_slli_epi16(backdrop, 8)
backdropOdd = mm_and_si128(backdrop, oddMask)
sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha)
backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha)
backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
mm_store_si128(
a[i].addr,
mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
)
mm_store_si128(a[i].addr, blendMaskSimd(backdrop, source))
i += 4
for i in i ..< len:
a[i] = blendMask(a[i], b[i])
proc blendLineCoverageMaskSse2*(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.simd.} =
  ## SSE2 coverage fill using mask blending.
  ##
  ## `line` and `coverages` are indexed in lock-step for `len` entries.
  ## Coverage 0 clears a pixel, coverage 255 with an opaque `rgbx` leaves it
  ## untouched, and everything else mask-blends the pixel with `rgbx` scaled
  ## by the coverage. Work is split into a scalar prologue (until `line[i]`
  ## is 16-byte aligned), a vector loop handling 16 pixels per iteration,
  ## and a scalar tail.
  ##
  ## The scalar loops use the same "skip only when `rgbx` is opaque" rule as
  ## the vector loop, so a pixel is treated the same way wherever it falls
  ## in the line.
  let opaque = rgbx.a == 255

  var i: int
  # Scalar prologue. The `i < len` bound is required: without it a short run
  # that starts on an unaligned address would keep going past `len` until an
  # aligned pixel is reached, reading and writing out of bounds.
  while i < len and (cast[uint](line[i].addr) and 15) != 0:
    let coverage = coverages[i]
    if coverage == 0:
      line[i] = rgbx(0, 0, 0, 0)
    elif coverage == 255 and opaque:
      discard
    else:
      line[i] = blendMask(line[i], rgbx * coverage)
    inc i

  # alphaMask, oddMask and div255 look unused in this proc but appear to be
  # picked up by name inside the blendMaskSimd template, so they must stay
  # in scope under exactly these names.
  let
    rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
    oddMask = mm_set1_epi16(cast[int16](0xff00))
    div255 = mm_set1_epi16(cast[int16](0x8081))
  while i < len - 16:
    let
      coverage = mm_loadu_si128(coverages[i].addr)
      eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128())
      eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255))
    if mm_movemask_epi8(eqZero) == 0xffff:
      # Nothing is covered: clear all 16 pixels. `line[i]` is 16-byte
      # aligned here, so aligned stores are fine.
      for _ in 0 ..< 4:
        mm_store_si128(line[i].addr, mm_setzero_si128())
        i += 4
    elif mm_movemask_epi8(eq255) == 0xffff and opaque:
      # Fully covered by an opaque color: the mask leaves pixels unchanged.
      i += 16
    else:
      # Mixed chunk: applyCoverage consumes the lowest 4 coverage bytes, so
      # shift the next 4 down after every blend.
      var coverage = coverage
      for _ in 0 ..< 4:
        let
          backdrop = mm_loadu_si128(line[i].addr)
          source = rgbxVec.applyCoverage(coverage)
        mm_storeu_si128(line[i].addr, blendMaskSimd(backdrop, source))
        coverage = mm_srli_si128(coverage, 4)
        i += 4

  # Scalar tail for whatever the vector loop did not consume.
  for i in i ..< len:
    let coverage = coverages[i]
    if coverage == 0:
      line[i] = rgbx(0, 0, 0, 0)
    elif coverage == 255 and opaque:
      discard
    else:
      line[i] = blendMask(line[i], rgbx * coverage)
when defined(release):
{.pop.}