Faster SIMD path fill: process 16 pixels per loop iteration instead of 4

This commit is contained in:
Ryan Oldenburg 2021-11-18 18:44:07 -06:00
parent 48ac033d71
commit 8813d61dec

View file

@ -1291,56 +1291,64 @@ proc fillCoverage(
# When supported, SIMD blend as much as possible
let
blenderSimd = blendMode.blenderSimd()
first32 = cast[M128i]([uint32.high, 0, 0, 0]) # First 32 bits
oddMask = mm_set1_epi16(cast[int16](0xff00))
div255 = mm_set1_epi16(cast[int16](0x8081))
vec255 = mm_set1_epi32(cast[int32](uint32.high))
zeroVec = mm_setzero_si128()
colorVec = mm_set1_epi32(cast[int32](rgbx))
for _ in countup(x, startX + coverages.len - 16, 4):
var coverage = mm_loadu_si128(coverages[x - startX].unsafeAddr)
coverage = mm_and_si128(coverage, first32)
for _ in countup(x, startX + coverages.len - 16, 16):
let
index = image.dataIndex(x, y)
eqZero = mm_cmpeq_epi16(coverage, mm_setzero_si128())
if mm_movemask_epi8(eqZero) != 0xffff: # or blendMode == bmExcludeMask:
coverage = mm_loadu_si128(coverages[x - startX].unsafeAddr)
if mm_movemask_epi8(mm_cmpeq_epi16(coverage, zeroVec)) != 0xffff:
# If the coverages are not all zero
if mm_movemask_epi8(mm_cmpeq_epi32(coverage, first32)) == 0xffff:
# Coverages are all 255
if mm_movemask_epi8(mm_cmpeq_epi32(coverage, vec255)) == 0xffff:
# If the coverages are all 255
if blendMode == bmNormal and rgbx.a == 255:
mm_storeu_si128(image.data[index].addr, colorVec)
for i in 0 ..< 4:
mm_storeu_si128(image.data[index + i * 4].addr, colorVec)
else:
let backdrop = mm_loadu_si128(image.data[index].addr)
for i in 0 ..< 4:
let backdrop = mm_loadu_si128(image.data[index + i * 4].addr)
mm_storeu_si128(
image.data[index].addr,
image.data[index + i * 4].addr,
blenderSimd(backdrop, colorVec)
)
else:
# Coverages are not all 255
coverage = unpackAlphaValues(coverage)
var coverage = coverage
for i in 0 ..< 4:
var unpacked = unpackAlphaValues(coverage)
# Shift the coverages from `a` to `g` and `a` for multiplying
coverage = mm_or_si128(coverage, mm_srli_epi32(coverage, 16))
unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
var
source = colorVec
sourceEven = mm_slli_epi16(source, 8)
sourceOdd = mm_and_si128(source, oddMask)
sourceEven = mm_mulhi_epu16(sourceEven, coverage)
sourceOdd = mm_mulhi_epu16(sourceOdd, coverage)
sourceEven = mm_mulhi_epu16(sourceEven, unpacked)
sourceOdd = mm_mulhi_epu16(sourceOdd, unpacked)
sourceEven = mm_srli_epi16(mm_mulhi_epu16(sourceEven, div255), 7)
sourceOdd = mm_srli_epi16(mm_mulhi_epu16(sourceOdd, div255), 7)
source = mm_or_si128(sourceEven, mm_slli_epi16(sourceOdd, 8))
let backdrop = mm_loadu_si128(image.data[index].addr)
let backdrop = mm_loadu_si128(image.data[index + i * 4].addr)
mm_storeu_si128(
image.data[index].addr,
image.data[index + i * 4].addr,
blenderSimd(backdrop, source)
)
coverage = mm_srli_si128(coverage, 4)
elif blendMode == bmMask:
mm_storeu_si128(image.data[index].addr, mm_setzero_si128())
x += 4
for i in 0 ..< 4:
mm_storeu_si128(image.data[index + i * 4].addr, zeroVec)
x += 16
let blender = blendMode.blender()
while x < startX + coverages.len:
@ -1375,13 +1383,14 @@ proc fillCoverage(
var x = startX
when defined(amd64) and not defined(pixieNoSimd):
if blendMode.hasSimdMasker():
let maskerSimd = blendMode.maskerSimd()
let
maskerSimd = blendMode.maskerSimd()
zeroVec = mm_setzero_si128()
for _ in countup(x, startX + coverages.len - 16, 16):
let
index = mask.dataIndex(x, y)
coverage = mm_loadu_si128(coverages[x - startX].unsafeAddr)
eqZero = mm_cmpeq_epi16(coverage, mm_setzero_si128())
if mm_movemask_epi8(eqZero) != 0xffff: # or blendMode == bmExcludeMask:
if mm_movemask_epi8(mm_cmpeq_epi16(coverage, zeroVec)) != 0xffff:
# If the coverages are not all zero
let backdrop = mm_loadu_si128(mask.data[index].addr)
mm_storeu_si128(
@ -1389,7 +1398,7 @@ proc fillCoverage(
maskerSimd(backdrop, coverage)
)
elif blendMode == bmMask:
mm_storeu_si128(mask.data[index].addr, mm_setzero_si128())
mm_storeu_si128(mask.data[index].addr, zeroVec)
x += 16
let masker = blendMode.masker()
@ -1439,15 +1448,15 @@ proc fillHits(
let
blenderSimd = blendMode.blenderSimd()
colorVec = mm_set1_epi32(cast[int32](rgbx))
for _ in countup(fillStart, fillLen - 16, 4):
let
index = image.dataIndex(x, y)
backdrop = mm_loadu_si128(image.data[index].addr)
for _ in countup(fillStart, fillLen - 16, 16):
let index = image.dataIndex(x, y)
for i in 0 ..< 4:
let backdrop = mm_loadu_si128(image.data[index + i * 4].addr)
mm_storeu_si128(
image.data[index].addr,
image.data[index + i * 4].addr,
blenderSimd(backdrop, colorVec)
)
x += 4
x += 16
for x in x ..< fillStart + fillLen:
let backdrop = image.getRgbaUnsafe(x, y)