Merge pull request #319 from guzba/master

3.0.3, faster simd path fill
This commit is contained in:
treeform 2021-11-18 20:17:44 -08:00 committed by GitHub
commit 2a54de7604
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 71 additions and 62 deletions

View file

@ -1,4 +1,4 @@
version = "3.0.2" version = "3.0.3"
author = "Andre von Houck and Ryan Oldenburg" author = "Andre von Houck and Ryan Oldenburg"
description = "Full-featured 2d graphics library for Nim." description = "Full-featured 2d graphics library for Nim."
license = "MIT" license = "MIT"

View file

@ -56,13 +56,11 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].}
oddMask = mm_set1_epi16(cast[int16](0xff00)) oddMask = mm_set1_epi16(cast[int16](0xff00))
div255 = mm_set1_epi16(cast[int16](0x8081)) div255 = mm_set1_epi16(cast[int16](0x8081))
for j in countup(i, data.len - 4, 4): for _ in countup(i, data.len - 4, 4):
var var
color = mm_loadu_si128(data[j].addr) color = mm_loadu_si128(data[i].addr)
alpha = mm_and_si128(color, alphaMask) alpha = mm_and_si128(color, alphaMask)
if mm_movemask_epi8(mm_cmpeq_epi16(alpha, alphaMask)) != 0xffff:
let eqOpaque = mm_cmpeq_epi16(alpha, alphaMask)
if mm_movemask_epi8(eqOpaque) != 0xffff:
# If not all of the alpha values are 255, premultiply # If not all of the alpha values are 255, premultiply
var var
colorEven = mm_slli_epi16(color, 8) colorEven = mm_slli_epi16(color, 8)
@ -81,8 +79,10 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].}
mm_and_si128(alpha, alphaMask), mm_and_si128(color, notAlphaMask) mm_and_si128(alpha, alphaMask), mm_and_si128(color, notAlphaMask)
) )
mm_storeu_si128(data[j].addr, color) mm_storeu_si128(data[i].addr, color)
i += 4 i += 4
# Convert whatever is left # Convert whatever is left
for j in i ..< data.len: for j in i ..< data.len:
var c = data[j] var c = data[j]

View file

@ -1164,10 +1164,10 @@ iterator walk(
windingRule: WindingRule, windingRule: WindingRule,
y: int, y: int,
width: float32 width: float32
): (float32, float32, int32) = ): (float32, float32, int) =
var var
prevAt: float32 prevAt: float32
count: int32 count: int
for i in 0 ..< numHits: for i in 0 ..< numHits:
let (at, winding) = hits[i] let (at, winding) = hits[i]
if windingRule == wrNonZero and if windingRule == wrNonZero and
@ -1257,10 +1257,10 @@ proc computeCoverages(
if fillLen > 0: if fillLen > 0:
var i = fillStart var i = fillStart
when defined(amd64) and not defined(pixieNoSimd): when defined(amd64) and not defined(pixieNoSimd):
let vSampleCoverage = mm_set1_epi8(cast[int8](sampleCoverage)) let sampleCoverageVec = mm_set1_epi8(cast[int8](sampleCoverage))
for j in countup(i, fillStart + fillLen - 16, 16): for j in countup(i, fillStart + fillLen - 16, 16):
var coverage = mm_loadu_si128(coverages[j - startX].addr) var coverage = mm_loadu_si128(coverages[j - startX].addr)
coverage = mm_add_epi8(coverage, vSampleCoverage) coverage = mm_add_epi8(coverage, sampleCoverageVec)
mm_storeu_si128(coverages[j - startX].addr, coverage) mm_storeu_si128(coverages[j - startX].addr, coverage)
i += 16 i += 16
for j in i ..< fillStart + fillLen: for j in i ..< fillStart + fillLen:
@ -1291,56 +1291,64 @@ proc fillCoverage(
# When supported, SIMD blend as much as possible # When supported, SIMD blend as much as possible
let let
blenderSimd = blendMode.blenderSimd() blenderSimd = blendMode.blenderSimd()
first32 = cast[M128i]([uint32.high, 0, 0, 0]) # First 32 bits
oddMask = mm_set1_epi16(cast[int16](0xff00)) oddMask = mm_set1_epi16(cast[int16](0xff00))
div255 = mm_set1_epi16(cast[int16](0x8081)) div255 = mm_set1_epi16(cast[int16](0x8081))
vColor = mm_set1_epi32(cast[int32](rgbx)) vec255 = mm_set1_epi32(cast[int32](uint32.high))
for _ in countup(x, startX + coverages.len - 16, 4): zeroVec = mm_setzero_si128()
var coverage = mm_loadu_si128(coverages[x - startX].unsafeAddr) colorVec = mm_set1_epi32(cast[int32](rgbx))
coverage = mm_and_si128(coverage, first32) for _ in countup(x, startX + coverages.len - 16, 16):
let let
index = image.dataIndex(x, y) index = image.dataIndex(x, y)
eqZero = mm_cmpeq_epi16(coverage, mm_setzero_si128()) coverage = mm_loadu_si128(coverages[x - startX].unsafeAddr)
if mm_movemask_epi8(eqZero) != 0xffff: # or blendMode == bmExcludeMask:
if mm_movemask_epi8(mm_cmpeq_epi16(coverage, zeroVec)) != 0xffff:
# If the coverages are not all zero # If the coverages are not all zero
if mm_movemask_epi8(mm_cmpeq_epi32(coverage, first32)) == 0xffff: if mm_movemask_epi8(mm_cmpeq_epi32(coverage, vec255)) == 0xffff:
# Coverages are all 255 # If the coverages are all 255
if blendMode == bmNormal and rgbx.a == 255: if blendMode == bmNormal and rgbx.a == 255:
mm_storeu_si128(image.data[index].addr, vColor) for i in 0 ..< 4:
mm_storeu_si128(image.data[index + i * 4].addr, colorVec)
else: else:
let backdrop = mm_loadu_si128(image.data[index].addr) for i in 0 ..< 4:
mm_storeu_si128( let backdrop = mm_loadu_si128(image.data[index + i * 4].addr)
image.data[index].addr, mm_storeu_si128(
blenderSimd(backdrop, vColor) image.data[index + i * 4].addr,
) blenderSimd(backdrop, colorVec)
)
else: else:
# Coverages are not all 255 # Coverages are not all 255
coverage = unpackAlphaValues(coverage) var coverage = coverage
# Shift the coverages from `a` to `g` and `a` for multiplying for i in 0 ..< 4:
coverage = mm_or_si128(coverage, mm_srli_epi32(coverage, 16)) var unpacked = unpackAlphaValues(coverage)
# Shift the coverages from `a` to `g` and `a` for multiplying
unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
var var
source = vColor source = colorVec
sourceEven = mm_slli_epi16(source, 8) sourceEven = mm_slli_epi16(source, 8)
sourceOdd = mm_and_si128(source, oddMask) sourceOdd = mm_and_si128(source, oddMask)
sourceEven = mm_mulhi_epu16(sourceEven, coverage) sourceEven = mm_mulhi_epu16(sourceEven, unpacked)
sourceOdd = mm_mulhi_epu16(sourceOdd, coverage) sourceOdd = mm_mulhi_epu16(sourceOdd, unpacked)
sourceEven = mm_srli_epi16(mm_mulhi_epu16(sourceEven, div255), 7) sourceEven = mm_srli_epi16(mm_mulhi_epu16(sourceEven, div255), 7)
sourceOdd = mm_srli_epi16(mm_mulhi_epu16(sourceOdd, div255), 7) sourceOdd = mm_srli_epi16(mm_mulhi_epu16(sourceOdd, div255), 7)
source = mm_or_si128(sourceEven, mm_slli_epi16(sourceOdd, 8)) source = mm_or_si128(sourceEven, mm_slli_epi16(sourceOdd, 8))
let backdrop = mm_loadu_si128(image.data[index + i * 4].addr)
mm_storeu_si128(
image.data[index + i * 4].addr,
blenderSimd(backdrop, source)
)
coverage = mm_srli_si128(coverage, 4)
let backdrop = mm_loadu_si128(image.data[index].addr)
mm_storeu_si128(
image.data[index].addr,
blenderSimd(backdrop, source)
)
elif blendMode == bmMask: elif blendMode == bmMask:
mm_storeu_si128(image.data[index].addr, mm_setzero_si128()) for i in 0 ..< 4:
x += 4 mm_storeu_si128(image.data[index + i * 4].addr, zeroVec)
x += 16
let blender = blendMode.blender() let blender = blendMode.blender()
while x < startX + coverages.len: while x < startX + coverages.len:
@ -1375,13 +1383,14 @@ proc fillCoverage(
var x = startX var x = startX
when defined(amd64) and not defined(pixieNoSimd): when defined(amd64) and not defined(pixieNoSimd):
if blendMode.hasSimdMasker(): if blendMode.hasSimdMasker():
let maskerSimd = blendMode.maskerSimd() let
maskerSimd = blendMode.maskerSimd()
zeroVec = mm_setzero_si128()
for _ in countup(x, startX + coverages.len - 16, 16): for _ in countup(x, startX + coverages.len - 16, 16):
let let
index = mask.dataIndex(x, y) index = mask.dataIndex(x, y)
coverage = mm_loadu_si128(coverages[x - startX].unsafeAddr) coverage = mm_loadu_si128(coverages[x - startX].unsafeAddr)
eqZero = mm_cmpeq_epi16(coverage, mm_setzero_si128()) if mm_movemask_epi8(mm_cmpeq_epi16(coverage, zeroVec)) != 0xffff:
if mm_movemask_epi8(eqZero) != 0xffff: # or blendMode == bmExcludeMask:
# If the coverages are not all zero # If the coverages are not all zero
let backdrop = mm_loadu_si128(mask.data[index].addr) let backdrop = mm_loadu_si128(mask.data[index].addr)
mm_storeu_si128( mm_storeu_si128(
@ -1389,7 +1398,7 @@ proc fillCoverage(
maskerSimd(backdrop, coverage) maskerSimd(backdrop, coverage)
) )
elif blendMode == bmMask: elif blendMode == bmMask:
mm_storeu_si128(mask.data[index].addr, mm_setzero_si128()) mm_storeu_si128(mask.data[index].addr, zeroVec)
x += 16 x += 16
let masker = blendMode.masker() let masker = blendMode.masker()
@ -1438,16 +1447,16 @@ proc fillHits(
# When supported, SIMD blend as much as possible # When supported, SIMD blend as much as possible
let let
blenderSimd = blendMode.blenderSimd() blenderSimd = blendMode.blenderSimd()
vColor = mm_set1_epi32(cast[int32](rgbx)) colorVec = mm_set1_epi32(cast[int32](rgbx))
for _ in countup(fillStart, fillLen - 16, 4): for _ in countup(fillStart, fillLen - 16, 16):
let let index = image.dataIndex(x, y)
index = image.dataIndex(x, y) for i in 0 ..< 4:
backdrop = mm_loadu_si128(image.data[index].addr) let backdrop = mm_loadu_si128(image.data[index + i * 4].addr)
mm_storeu_si128( mm_storeu_si128(
image.data[index].addr, image.data[index + i * 4].addr,
blenderSimd(backdrop, vColor) blenderSimd(backdrop, colorVec)
) )
x += 4 x += 16
for x in x ..< fillStart + fillLen: for x in x ..< fillStart + fillLen:
let backdrop = image.getRgbaUnsafe(x, y) let backdrop = image.getRgbaUnsafe(x, y)
@ -1487,12 +1496,12 @@ proc fillHits(
if blendMode.hasSimdMasker(): if blendMode.hasSimdMasker():
let let
maskerSimd = blendMode.maskerSimd() maskerSimd = blendMode.maskerSimd()
vValue = mm_set1_epi8(cast[int8](255)) valueVec = mm_set1_epi8(cast[int8](255))
for _ in countup(fillStart, fillLen - 16, 16): for _ in countup(fillStart, fillLen - 16, 16):
let backdrop = mm_loadu_si128(mask.data[mask.dataIndex(x, y)].addr) let backdrop = mm_loadu_si128(mask.data[mask.dataIndex(x, y)].addr)
mm_storeu_si128( mm_storeu_si128(
mask.data[mask.dataIndex(x, y)].addr, mask.data[mask.dataIndex(x, y)].addr,
maskerSimd(backdrop, vValue) maskerSimd(backdrop, valueVec)
) )
x += 16 x += 16

Binary file not shown.

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 36 KiB