inline simd faster

This commit is contained in:
Ryan Oldenburg 2021-11-29 03:38:49 -06:00
parent 8ddd22d761
commit ee5074c628
3 changed files with 78 additions and 36 deletions

View file

@ -184,3 +184,15 @@ block:
# a.writeFile("pixie4.png") # a.writeFile("pixie4.png")
# doDiff(readImage("cairo4.png"), a, "4") # doDiff(readImage("cairo4.png"), a, "4")
let mask = newMask(1000, 1000)
timeIt "pixie4 mask":
mask.fill(63)
let p = newPath()
p.moveTo(shapes[0][0])
for shape in shapes:
for v in shape:
p.lineTo(v)
mask.fillPath(p)

View file

@ -510,7 +510,7 @@ when defined(amd64) and not defined(pixieNoSimd):
MaskerSimd* = proc(blackdrop, source: M128i): M128i {.gcsafe, raises: [].} MaskerSimd* = proc(blackdrop, source: M128i): M128i {.gcsafe, raises: [].}
## Function signature returned by maskerSimd. ## Function signature returned by maskerSimd.
proc blendNormalSimd(backdrop, source: M128i): M128i = proc blendNormalInlineSimd*(backdrop, source: M128i): M128i {.inline.} =
let let
alphaMask = mm_set1_epi32(cast[int32](0xff000000)) alphaMask = mm_set1_epi32(cast[int32](0xff000000))
oddMask = mm_set1_epi16(cast[int16](0xff00)) oddMask = mm_set1_epi16(cast[int16](0xff00))
@ -539,6 +539,9 @@ when defined(amd64) and not defined(pixieNoSimd):
mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
) )
proc blendNormalSimd(backdrop, source: M128i): M128i =
## Non-inline entry point for the normal blend: forwards both vectors
## unchanged to `blendNormalInlineSimd` and returns its result.
## NOTE(review): presumably kept as a separate, non-`{.inline.}` proc so it
## can still be handed out as a proc value by the blender lookup, while hot
## loops call the inline variant directly - confirm against the lookup code.
blendNormalInlineSimd(backdrop, source)
proc blendMaskSimd(backdrop, source: M128i): M128i = proc blendMaskSimd(backdrop, source: M128i): M128i =
let let
alphaMask = mm_set1_epi32(cast[int32](0xff000000)) alphaMask = mm_set1_epi32(cast[int32](0xff000000))

View file

@ -1338,9 +1338,17 @@ proc fillCoverage(
# If the coverages are not all zero # If the coverages are not all zero
if mm_movemask_epi8(mm_cmpeq_epi32(coverageVec, vec255)) == 0xffff: if mm_movemask_epi8(mm_cmpeq_epi32(coverageVec, vec255)) == 0xffff:
# If the coverages are all 255 # If the coverages are all 255
if blendMode == bmNormal and rgbx.a == 255: if blendMode == bmNormal:
for i in 0 ..< 4: if rgbx.a == 255:
mm_storeu_si128(image.data[index + i * 4].addr, colorVec) for i in 0 ..< 4:
mm_storeu_si128(image.data[index + i * 4].addr, colorVec)
else:
for i in 0 ..< 4:
let backdrop = mm_loadu_si128(image.data[index + i * 4].addr)
mm_storeu_si128(
image.data[index + i * 4].addr,
blendNormalInlineSimd(backdrop, colorVec)
)
else: else:
for i in 0 ..< 4: for i in 0 ..< 4:
let backdrop = mm_loadu_si128(image.data[index + i * 4].addr) let backdrop = mm_loadu_si128(image.data[index + i * 4].addr)
@ -1350,32 +1358,38 @@ proc fillCoverage(
) )
else: else:
# Coverages are not all 255 # Coverages are not all 255
var coverageVec = coverageVec template useCoverage(blendProc: untyped) =
for i in 0 ..< 4: var coverageVec = coverageVec
var unpacked = unpackAlphaValues(coverageVec) for i in 0 ..< 4:
# Shift the coverages from `a` to `g` and `a` for multiplying var unpacked = unpackAlphaValues(coverageVec)
unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16)) # Shift the coverages from `a` to `g` and `a` for multiplying
unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
var var
source = colorVec source = colorVec
sourceEven = mm_slli_epi16(source, 8) sourceEven = mm_slli_epi16(source, 8)
sourceOdd = mm_and_si128(source, oddMask) sourceOdd = mm_and_si128(source, oddMask)
sourceEven = mm_mulhi_epu16(sourceEven, unpacked) sourceEven = mm_mulhi_epu16(sourceEven, unpacked)
sourceOdd = mm_mulhi_epu16(sourceOdd, unpacked) sourceOdd = mm_mulhi_epu16(sourceOdd, unpacked)
sourceEven = mm_srli_epi16(mm_mulhi_epu16(sourceEven, div255), 7) sourceEven = mm_srli_epi16(mm_mulhi_epu16(sourceEven, div255), 7)
sourceOdd = mm_srli_epi16(mm_mulhi_epu16(sourceOdd, div255), 7) sourceOdd = mm_srli_epi16(mm_mulhi_epu16(sourceOdd, div255), 7)
source = mm_or_si128(sourceEven, mm_slli_epi16(sourceOdd, 8)) source = mm_or_si128(sourceEven, mm_slli_epi16(sourceOdd, 8))
let backdrop = mm_loadu_si128(image.data[index + i * 4].addr) let backdrop = mm_loadu_si128(image.data[index + i * 4].addr)
mm_storeu_si128( mm_storeu_si128(
image.data[index + i * 4].addr, image.data[index + i * 4].addr,
blenderSimd(backdrop, source) blendProc(backdrop, source)
) )
coverageVec = mm_srli_si128(coverageVec, 4) coverageVec = mm_srli_si128(coverageVec, 4)
if blendMode == bmNormal:
useCoverage(blendNormalInlineSimd)
else:
useCoverage(blenderSimd)
elif blendMode == bmMask: elif blendMode == bmMask:
for i in 0 ..< 4: for i in 0 ..< 4:
@ -1460,6 +1474,7 @@ proc fillHits(
let let
blender = blendMode.blender() blender = blendMode.blender()
width = image.width.float32 width = image.width.float32
var filledTo: int var filledTo: int
for (prevAt, at, count) in hits.walk(numHits, windingRule, y, width): for (prevAt, at, count) in hits.walk(numHits, windingRule, y, width):
let let
@ -1478,18 +1493,30 @@ proc fillHits(
when defined(amd64) and not defined(pixieNoSimd): when defined(amd64) and not defined(pixieNoSimd):
if blendMode.hasSimdBlender(): if blendMode.hasSimdBlender():
# When supported, SIMD blend as much as possible # When supported, SIMD blend as much as possible
let let colorVec = mm_set1_epi32(cast[int32](rgbx))
blenderSimd = blendMode.blenderSimd() if blendMode == bmNormal:
colorVec = mm_set1_epi32(cast[int32](rgbx)) # For path filling, bmNormal is almost always used.
for _ in 0 ..< fillLen div 4: # Inline SIMD is faster here.
let for _ in 0 ..< fillLen div 4:
index = image.dataIndex(x, y) let
backdrop = mm_loadu_si128(image.data[index].addr) index = image.dataIndex(x, y)
mm_storeu_si128( backdrop = mm_loadu_si128(image.data[index].addr)
image.data[index].addr, mm_storeu_si128(
blenderSimd(backdrop, colorVec) image.data[index].addr,
) blendNormalInlineSimd(backdrop, colorVec)
x += 4 )
x += 4
else:
let blenderSimd = blendMode.blenderSimd()
for _ in 0 ..< fillLen div 4:
let
index = image.dataIndex(x, y)
backdrop = mm_loadu_si128(image.data[index].addr)
mm_storeu_si128(
image.data[index].addr,
blenderSimd(backdrop, colorVec)
)
x += 4
for x in x ..< fillStart + fillLen: for x in x ..< fillStart + fillLen:
let backdrop = image.getRgbaUnsafe(x, y) let backdrop = image.getRgbaUnsafe(x, y)