cleaner + faster simd
This commit is contained in:
parent
426c9766f1
commit
b800c77ab5
3 changed files with 38 additions and 62 deletions
|
@ -160,7 +160,7 @@ proc SetSat(C: Color, s: float32): Color {.inline.} =
|
||||||
if satC > 0:
|
if satC > 0:
|
||||||
result = (C - min([C.r, C.g, C.b])) * s / satC
|
result = (C - min([C.r, C.g, C.b])) * s / satC
|
||||||
|
|
||||||
proc blendNormal(backdrop, source: ColorRGBA): ColorRGBA =
|
proc blendNormal*(backdrop, source: ColorRGBA): ColorRGBA =
|
||||||
if backdrop.a == 0:
|
if backdrop.a == 0:
|
||||||
return source
|
return source
|
||||||
if source.a == 255:
|
if source.a == 255:
|
||||||
|
@ -516,6 +516,31 @@ when defined(amd64) and not defined(pixieNoSimd):
|
||||||
result = mm_or_si128(mm_or_si128(result, i), mm_or_si128(j, k))
|
result = mm_or_si128(mm_or_si128(result, i), mm_or_si128(j, k))
|
||||||
result = mm_and_si128(result, first32)
|
result = mm_and_si128(result, first32)
|
||||||
|
|
||||||
|
proc unpackAlphaValues*(v: M128i): M128i {.inline.} =
|
||||||
|
## Unpack the first 32 bits into 4 rgba(0, 0, 0, value)
## Only the lowest 32 bits of `v` (four byte values) are used; the rest of
## `v` is ignored. In the result, 32-bit lane n holds input byte n in its
## top byte (the `a` slot selected by `alphaMask`) and zero in its other
## three bytes.
|
||||||
|
let
|
||||||
|
first32 = cast[M128i]([uint32.high, 0, 0, 0]) # First 32 bits
|
||||||
|
alphaMask = mm_set1_epi32(cast[int32](0xff000000)) # Only `a`
|
||||||
|
|
||||||
|
# Copy the lowest 32-bit lane of `v` into all four lanes
result = mm_shuffle_epi32(v, MM_SHUFFLE(0, 0, 0, 0))
|
||||||
|
|
||||||
|
# Isolate each 32-bit lane into its own register (i = lane 0 ... l = lane 3)
var
|
||||||
|
i = mm_and_si128(result, first32)
|
||||||
|
j = mm_and_si128(result, mm_slli_si128(first32, 4))
|
||||||
|
k = mm_and_si128(result, mm_slli_si128(first32, 8))
|
||||||
|
l = mm_and_si128(result, mm_slli_si128(first32, 12))
|
||||||
|
|
||||||
|
# Shift the values to `a`
# These are byte shifts across the whole register. Afterwards the top byte
# of lane 0 (in i), lane 1 (in j), lane 2 (in k) and lane 3 (in l) holds
# input byte 0, 1, 2 and 3 respectively.
|
||||||
|
i = mm_slli_si128(i, 3)
|
||||||
|
j = mm_slli_si128(j, 2)
|
||||||
|
k = mm_slli_si128(k, 1)
|
||||||
|
# l = mm_slli_si128(l, 0)
# (no shift needed for l: input byte 3 already sits in the top byte of lane 3)
|
||||||
|
|
||||||
|
# Merge the four registers, then keep only the top byte of every lane; the
# mask also clears the bytes the shifts pushed into neighbouring lanes.
result = mm_and_si128(
|
||||||
|
mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l)),
|
||||||
|
alphaMask
|
||||||
|
)
|
||||||
|
|
||||||
proc blendNormalSimd*(backdrop, source: M128i): M128i =
|
proc blendNormalSimd*(backdrop, source: M128i): M128i =
|
||||||
let
|
let
|
||||||
alphaMask = mm_set1_epi32(cast[int32](0xff000000))
|
alphaMask = mm_set1_epi32(cast[int32](0xff000000))
|
||||||
|
@ -615,9 +640,7 @@ when defined(amd64) and not defined(pixieNoSimd):
|
||||||
blendedEven = mm_add_epi16(sourceEven, backdropEven)
|
blendedEven = mm_add_epi16(sourceEven, backdropEven)
|
||||||
blendedOdd = mm_add_epi16(sourceOdd, backdropOdd)
|
blendedOdd = mm_add_epi16(sourceOdd, backdropOdd)
|
||||||
|
|
||||||
blendedOdd = mm_slli_epi16(blendedOdd, 8)
|
mm_or_si128(blendedEven, mm_slli_epi16(blendedOdd, 8))
|
||||||
|
|
||||||
mm_or_si128(blendedEven, blendedOdd)
|
|
||||||
|
|
||||||
proc maskMaskSimd*(backdrop, source: M128i): M128i =
|
proc maskMaskSimd*(backdrop, source: M128i): M128i =
|
||||||
let
|
let
|
||||||
|
@ -638,9 +661,7 @@ when defined(amd64) and not defined(pixieNoSimd):
|
||||||
backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
|
backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
|
||||||
backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
|
backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
|
||||||
|
|
||||||
backdropOdd = mm_slli_epi16(backdropOdd, 8)
|
mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
|
||||||
|
|
||||||
mm_or_si128(backdropEven, backdropOdd)
|
|
||||||
|
|
||||||
proc maskerSimd*(blendMode: BlendMode): MaskerSimd =
|
proc maskerSimd*(blendMode: BlendMode): MaskerSimd =
|
||||||
case blendMode:
|
case blendMode:
|
||||||
|
|
|
@ -686,10 +686,7 @@ proc drawUber(a, b: Image | Mask, mat = mat3(), blendMode = bmNormal) =
|
||||||
# Check we are not rotated before using SIMD blends
|
# Check we are not rotated before using SIMD blends
|
||||||
when type(a) is Image:
|
when type(a) is Image:
|
||||||
if blendMode.hasSimdBlender():
|
if blendMode.hasSimdBlender():
|
||||||
let
|
let blenderSimd = blendMode.blenderSimd()
|
||||||
blenderSimd = blendMode.blenderSimd()
|
|
||||||
first32 = cast[M128i]([uint32.high, 0, 0, 0]) # First 32 bits
|
|
||||||
alphaMask = mm_set1_epi32(cast[int32](0xff000000)) # Only `a`
|
|
||||||
for _ in countup(x, xMax - 4, 4):
|
for _ in countup(x, xMax - 4, 4):
|
||||||
let
|
let
|
||||||
srcPos = p + dx * x.float32 + dy * y.float32
|
srcPos = p + dx * x.float32 + dy * y.float32
|
||||||
|
@ -701,24 +698,7 @@ proc drawUber(a, b: Image | Mask, mat = mat3(), blendMode = bmNormal) =
|
||||||
else: # b is a Mask
|
else: # b is a Mask
|
||||||
# Need to move 4 mask values into the alpha slots
|
# Need to move 4 mask values into the alpha slots
|
||||||
var source = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr)
|
var source = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr)
|
||||||
source = mm_slli_si128(source, 2)
|
source = unpackAlphaValues(source)
|
||||||
source = mm_shuffle_epi32(source, MM_SHUFFLE(1, 1, 0, 0))
|
|
||||||
|
|
||||||
var
|
|
||||||
i = mm_and_si128(source, first32)
|
|
||||||
j = mm_and_si128(source, mm_slli_si128(first32, 4))
|
|
||||||
k = mm_and_si128(source, mm_slli_si128(first32, 8))
|
|
||||||
l = mm_and_si128(source, mm_slli_si128(first32, 12))
|
|
||||||
|
|
||||||
# Shift the values to `a`
|
|
||||||
i = mm_slli_si128(i, 1)
|
|
||||||
k = mm_slli_si128(k, 3)
|
|
||||||
l = mm_slli_si128(l, 2)
|
|
||||||
|
|
||||||
source = mm_and_si128(
|
|
||||||
mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l)),
|
|
||||||
alphaMask
|
|
||||||
)
|
|
||||||
|
|
||||||
mm_storeu_si128(
|
mm_storeu_si128(
|
||||||
a.data[a.dataIndex(x, y)].addr,
|
a.data[a.dataIndex(x, y)].addr,
|
||||||
|
|
|
@ -966,13 +966,11 @@ proc fillShapes(
|
||||||
# When supported, SIMD blend as much as possible
|
# When supported, SIMD blend as much as possible
|
||||||
let
|
let
|
||||||
first32 = cast[M128i]([uint32.high, 0, 0, 0]) # First 32 bits
|
first32 = cast[M128i]([uint32.high, 0, 0, 0]) # First 32 bits
|
||||||
redMask = mm_set1_epi32(cast[int32](0x000000ff)) # Only `r`
|
|
||||||
oddMask = mm_set1_epi16(cast[int16](0xff00))
|
oddMask = mm_set1_epi16(cast[int16](0xff00))
|
||||||
div255 = mm_set1_epi16(cast[int16](0x8081))
|
div255 = mm_set1_epi16(cast[int16](0x8081))
|
||||||
v255 = mm_set1_epi32(255)
|
|
||||||
vColor = mm_set1_epi32(cast[int32](color))
|
vColor = mm_set1_epi32(cast[int32](color))
|
||||||
|
|
||||||
for _ in countup(x, coverages.len - 16, 16):
|
for _ in countup(x, image.width - 16, 4):
|
||||||
var coverage = mm_loadu_si128(coverages[x].addr)
|
var coverage = mm_loadu_si128(coverages[x].addr)
|
||||||
coverage = mm_and_si128(coverage, first32)
|
coverage = mm_and_si128(coverage, first32)
|
||||||
|
|
||||||
|
@ -981,32 +979,11 @@ proc fillShapes(
|
||||||
# If the coverages are not all zero
|
# If the coverages are not all zero
|
||||||
var source = vColor
|
var source = vColor
|
||||||
|
|
||||||
coverage = mm_slli_si128(coverage, 2)
|
if mm_movemask_epi8(mm_cmpeq_epi32(coverage, first32)) != 0xffff:
|
||||||
coverage = mm_shuffle_epi32(coverage, MM_SHUFFLE(1, 1, 0, 0))
|
|
||||||
|
|
||||||
var
|
|
||||||
a = mm_and_si128(coverage, first32)
|
|
||||||
b = mm_and_si128(coverage, mm_slli_si128(first32, 4))
|
|
||||||
c = mm_and_si128(coverage, mm_slli_si128(first32, 8))
|
|
||||||
d = mm_and_si128(coverage, mm_slli_si128(first32, 12))
|
|
||||||
|
|
||||||
# Shift the coverages to `r`
|
|
||||||
a = mm_srli_si128(a, 2)
|
|
||||||
b = mm_srli_si128(b, 3)
|
|
||||||
d = mm_srli_si128(d, 1)
|
|
||||||
|
|
||||||
coverage = mm_and_si128(
|
|
||||||
mm_or_si128(mm_or_si128(a, b), mm_or_si128(c, d)),
|
|
||||||
redMask
|
|
||||||
)
|
|
||||||
|
|
||||||
if mm_movemask_epi8(mm_cmpeq_epi32(coverage, v255)) != 0xffff:
|
|
||||||
# If the coverages are not all 255
|
# If the coverages are not all 255
|
||||||
|
coverage = unpackAlphaValues(coverage)
|
||||||
# Shift the coverages from `r` to `g` and `a` for multiplying later
|
# Shift the coverages from `a` to `g` and `a` for multiplying
|
||||||
coverage = mm_or_si128(
|
coverage = mm_or_si128(coverage, mm_srli_epi32(coverage, 16))
|
||||||
mm_slli_epi32(coverage, 8), mm_slli_epi32(coverage, 24)
|
|
||||||
)
|
|
||||||
|
|
||||||
var
|
var
|
||||||
colorEven = mm_slli_epi16(source, 8)
|
colorEven = mm_slli_epi16(source, 8)
|
||||||
|
@ -1085,18 +1062,16 @@ proc fillShapes(
|
||||||
when defined(amd64) and not defined(pixieNoSimd):
|
when defined(amd64) and not defined(pixieNoSimd):
|
||||||
# When supported, SIMD blend as much as possible
|
# When supported, SIMD blend as much as possible
|
||||||
for _ in countup(x, coverages.len - 16, 16):
|
for _ in countup(x, coverages.len - 16, 16):
|
||||||
var coverage = mm_loadu_si128(coverages[x].addr)
|
let
|
||||||
|
coverage = mm_loadu_si128(coverages[x].addr)
|
||||||
let eqZero = mm_cmpeq_epi16(coverage, mm_setzero_si128())
|
eqZero = mm_cmpeq_epi16(coverage, mm_setzero_si128())
|
||||||
if mm_movemask_epi8(eqZero) != 0xffff:
|
if mm_movemask_epi8(eqZero) != 0xffff:
|
||||||
# If the coverages are not all zero
|
# If the coverages are not all zero
|
||||||
let backdrop = mm_loadu_si128(mask.data[mask.dataIndex(x, y)].addr)
|
let backdrop = mm_loadu_si128(mask.data[mask.dataIndex(x, y)].addr)
|
||||||
|
|
||||||
mm_storeu_si128(
|
mm_storeu_si128(
|
||||||
mask.data[mask.dataIndex(x, y)].addr,
|
mask.data[mask.dataIndex(x, y)].addr,
|
||||||
maskNormalSimd(backdrop, coverage)
|
maskNormalSimd(backdrop, coverage)
|
||||||
)
|
)
|
||||||
|
|
||||||
x += 16
|
x += 16
|
||||||
|
|
||||||
while x < mask.width:
|
while x < mask.width:
|
||||||
|
|
Loading…
Reference in a new issue