Merge pull request #471 from treeform/simdmacro
simd macro works on signature not just name, split applyOpacity + invert
This commit is contained in:
commit
31c045dcdb
10 changed files with 397 additions and 98 deletions
|
@ -10,7 +10,7 @@ requires "vmath >= 1.1.4"
|
||||||
requires "chroma >= 0.2.6"
|
requires "chroma >= 0.2.6"
|
||||||
requires "zippy >= 0.10.3"
|
requires "zippy >= 0.10.3"
|
||||||
requires "flatty >= 0.3.4"
|
requires "flatty >= 0.3.4"
|
||||||
requires "nimsimd >= 1.1.7"
|
requires "nimsimd >= 1.1.8"
|
||||||
requires "bumpy >= 1.1.1"
|
requires "bumpy >= 1.1.1"
|
||||||
|
|
||||||
task bindings, "Generate bindings":
|
task bindings, "Generate bindings":
|
||||||
|
|
|
@ -320,38 +320,38 @@ proc magnifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} =
|
||||||
result.width * 4
|
result.width * 4
|
||||||
)
|
)
|
||||||
|
|
||||||
proc applyOpacity*(target: Image, opacity: float32) {.hasSimd, raises: [].} =
|
proc applyOpacity*(image: Image, opacity: float32) {.hasSimd, raises: [].} =
|
||||||
## Multiplies alpha of the image by opacity.
|
## Multiplies alpha of the image by opacity.
|
||||||
let opacity = round(255 * opacity).uint16
|
let opacity = round(255 * opacity).uint16
|
||||||
if opacity == 255:
|
if opacity == 255:
|
||||||
return
|
return
|
||||||
|
|
||||||
if opacity == 0:
|
if opacity == 0:
|
||||||
target.fill(rgbx(0, 0, 0, 0))
|
image.fill(rgbx(0, 0, 0, 0))
|
||||||
return
|
return
|
||||||
|
|
||||||
for i in 0 ..< target.data.len:
|
for i in 0 ..< image.data.len:
|
||||||
var rgbx = target.data[i]
|
var rgbx = image.data[i]
|
||||||
rgbx.r = ((rgbx.r * opacity) div 255).uint8
|
rgbx.r = ((rgbx.r * opacity) div 255).uint8
|
||||||
rgbx.g = ((rgbx.g * opacity) div 255).uint8
|
rgbx.g = ((rgbx.g * opacity) div 255).uint8
|
||||||
rgbx.b = ((rgbx.b * opacity) div 255).uint8
|
rgbx.b = ((rgbx.b * opacity) div 255).uint8
|
||||||
rgbx.a = ((rgbx.a * opacity) div 255).uint8
|
rgbx.a = ((rgbx.a * opacity) div 255).uint8
|
||||||
target.data[i] = rgbx
|
image.data[i] = rgbx
|
||||||
|
|
||||||
proc invert*(target: Image) {.hasSimd, raises: [].} =
|
proc invert*(image: Image) {.hasSimd, raises: [].} =
|
||||||
## Inverts all of the colors and alpha.
|
## Inverts all of the colors and alpha.
|
||||||
for i in 0 ..< target.data.len:
|
for i in 0 ..< image.data.len:
|
||||||
var rgbx = target.data[i]
|
var rgbx = image.data[i]
|
||||||
rgbx.r = 255 - rgbx.r
|
rgbx.r = 255 - rgbx.r
|
||||||
rgbx.g = 255 - rgbx.g
|
rgbx.g = 255 - rgbx.g
|
||||||
rgbx.b = 255 - rgbx.b
|
rgbx.b = 255 - rgbx.b
|
||||||
rgbx.a = 255 - rgbx.a
|
rgbx.a = 255 - rgbx.a
|
||||||
target.data[i] = rgbx
|
image.data[i] = rgbx
|
||||||
|
|
||||||
# Inverting rgbx(50, 100, 150, 200) becomes rgbx(205, 155, 105, 55). This
|
# Inverting rgbx(50, 100, 150, 200) becomes rgbx(205, 155, 105, 55). This
|
||||||
# is not a valid premultiplied alpha color.
|
# is not a valid premultiplied alpha color.
|
||||||
# We need to convert back to premultiplied alpha after inverting.
|
# We need to convert back to premultiplied alpha after inverting.
|
||||||
target.data.toPremultipliedAlpha()
|
image.data.toPremultipliedAlpha()
|
||||||
|
|
||||||
proc blur*(
|
proc blur*(
|
||||||
image: Image, radius: float32, outOfBounds: SomeColor = color(0, 0, 0, 0)
|
image: Image, radius: float32, outOfBounds: SomeColor = color(0, 0, 0, 0)
|
||||||
|
|
|
@ -165,18 +165,18 @@ proc magnifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} =
|
||||||
result.width
|
result.width
|
||||||
)
|
)
|
||||||
|
|
||||||
proc applyOpacity*(target: Mask, opacity: float32) {.hasSimd, raises: [].} =
|
proc applyOpacity*(mask: Mask, opacity: float32) {.hasSimd, raises: [].} =
|
||||||
## Multiplies alpha of the image by opacity.
|
## Multiplies alpha of the image by opacity.
|
||||||
let opacity = round(255 * opacity).uint16
|
let opacity = round(255 * opacity).uint16
|
||||||
if opacity == 255:
|
if opacity == 255:
|
||||||
return
|
return
|
||||||
|
|
||||||
if opacity == 0:
|
if opacity == 0:
|
||||||
target.fill(0)
|
mask.fill(0)
|
||||||
return
|
return
|
||||||
|
|
||||||
for i in 0 ..< target.data.len:
|
for i in 0 ..< mask.data.len:
|
||||||
target.data[i] = ((target.data[i] * opacity) div 255).uint8
|
mask.data[i] = ((mask.data[i] * opacity) div 255).uint8
|
||||||
|
|
||||||
proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
|
proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
|
||||||
## Gets a interpolated value with float point coordinates.
|
## Gets a interpolated value with float point coordinates.
|
||||||
|
@ -206,10 +206,10 @@ proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
|
||||||
else:
|
else:
|
||||||
topMix
|
topMix
|
||||||
|
|
||||||
proc invert*(target: Mask) {.hasSimd, raises: [].} =
|
proc invert*(mask: Mask) {.hasSimd, raises: [].} =
|
||||||
## Inverts all of the values - creates a negative of the mask.
|
## Inverts all of the values - creates a negative of the mask.
|
||||||
for i in 0 ..< target.data.len:
|
for i in 0 ..< mask.data.len:
|
||||||
target.data[i] = 255 - target.data[i]
|
mask.data[i] = 255 - mask.data[i]
|
||||||
|
|
||||||
proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
|
proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
|
||||||
## Grows the mask by spread.
|
## Grows the mask by spread.
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import simd/internal
|
import simd/internal, system/memory
|
||||||
|
|
||||||
export internal
|
export internal, memory
|
||||||
|
|
||||||
const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
|
const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
|
||||||
|
|
||||||
|
@ -20,6 +20,7 @@ when allowSimd:
|
||||||
|
|
||||||
elif defined(arm64):
|
elif defined(arm64):
|
||||||
import simd/neon
|
import simd/neon
|
||||||
|
export neon
|
||||||
|
|
||||||
import nimsimd/neon as nimsimdneon
|
import nimsimd/neon as nimsimdneon
|
||||||
export nimsimdneon
|
export nimsimdneon
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
import chroma, internal, nimsimd/avx2, pixie/common
|
import avx, chroma, internal, nimsimd/avx2, pixie/common, vmath
|
||||||
|
|
||||||
when defined(gcc) or defined(clang):
|
when defined(gcc) or defined(clang):
|
||||||
{.localPassc: "-mavx2".}
|
{.localPassc: "-mavx2".}
|
||||||
|
@ -133,5 +133,88 @@ proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
|
||||||
c.b = ((c.b.uint32 * c.a + 127) div 255).uint8
|
c.b = ((c.b.uint32 * c.a + 127) div 255).uint8
|
||||||
data[i] = c
|
data[i] = c
|
||||||
|
|
||||||
|
proc invertAvx2*(image: Image) {.simd.} =
  ## Inverts every channel (including alpha) of each pixel in place and then
  ## re-premultiplies the data with `toPremultipliedAlphaAvx2`, since the
  ## inverted values are not valid premultiplied alpha on their own.
  var
    i: int
    p = cast[uint](image.data[0].addr)
  # Align to 32 bytes: invert one pixel (4 bytes) at a time until `p` is
  # suitable for the aligned loads and stores used below.
  while i < image.data.len and (p and 31) != 0:
    var rgbx = image.data[i]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
    image.data[i] = rgbx
    inc i
    p += 4

  let
    vec255 = mm256_set1_epi8(255)
    # Count only the pixels that remain after the alignment loop. Using the
    # full length here would make the vector loop read and write past the
    # end of the buffer whenever the alignment loop consumed pixels.
    iterations = (image.data.len - i) div 16
  for _ in 0 ..< iterations:
    # 16 pixels (64 bytes) per iteration, handled as two 256-bit vectors.
    let
      a = mm256_load_si256(cast[pointer](p))
      b = mm256_load_si256(cast[pointer](p + 32))
    mm256_store_si256(cast[pointer](p), mm256_sub_epi8(vec255, a))
    mm256_store_si256(cast[pointer](p + 32), mm256_sub_epi8(vec255, b))
    p += 64
  i += 16 * iterations

  # Scalar tail for the pixels that did not fill a whole vector iteration.
  for i in i ..< image.data.len:
    var rgbx = image.data[i]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
    image.data[i] = rgbx

  toPremultipliedAlphaAvx2(image.data)
|
||||||
|
|
||||||
|
proc applyOpacityAvx2*(image: Image, opacity: float32) {.simd.} =
  ## Scales all four channels of every pixel by `opacity`, in place.
  ##
  ## `opacity` is first rounded to an integer factor out of 255. A factor of
  ## 255 returns immediately and a factor of 0 fills the image with
  ## `rgbx(0, 0, 0, 0)`.
  let opacity = round(255 * opacity).uint16
  if opacity == 255:
    return
  if opacity == 0:
    fillUnsafeAvx(image.data, rgbx(0, 0, 0, 0), 0, image.data.len)
    return

  let
    oddMask = mm256_set1_epi16(0xff00)
    div255 = mm256_set1_epi16(0x8081)
    zeroVec = mm256_setzero_si256()
    opacityVec = mm256_slli_epi16(mm256_set1_epi16(opacity), 8)
    # 8 pixels (32 bytes) are processed per vector iteration.
    iterations = image.data.len div 8

  var p = cast[uint](image.data[0].addr)
  for _ in 0 ..< iterations:
    let
      values = mm256_loadu_si256(cast[pointer](p))
      zeroLanes = mm256_movemask_epi8(mm256_cmpeq_epi16(values, zeroVec))
    # Vectors that are entirely zero stay zero, so the work is skipped.
    if zeroLanes != cast[int32](0xffffffff):
      let
        # Even and odd bytes are each moved to the high byte of their
        # 16-bit lane and multiplied by the opacity (also in the high byte).
        evenScaled = mm256_mulhi_epu16(
          mm256_slli_epi16(values, 8), opacityVec
        )
        oddScaled = mm256_mulhi_epu16(
          mm256_and_si256(values, oddMask), opacityVec
        )
        # Fixed-point division by 255: multiply by 0x8081, keep the high
        # 16 bits and shift right by a further 7.
        evenOut = mm256_srli_epi16(mm256_mulhi_epu16(evenScaled, div255), 7)
        oddOut = mm256_srli_epi16(mm256_mulhi_epu16(oddScaled, div255), 7)
      mm256_storeu_si256(
        cast[pointer](p),
        mm256_or_si256(evenOut, mm256_slli_epi16(oddOut, 8))
      )
    p += 32

  # Scalar tail for the pixels that did not fill a whole vector.
  var i = 8 * iterations
  while i < image.data.len:
    var rgbx = image.data[i]
    rgbx.r = ((rgbx.r * opacity) div 255).uint8
    rgbx.g = ((rgbx.g * opacity) div 255).uint8
    rgbx.b = ((rgbx.b * opacity) div 255).uint8
    rgbx.a = ((rgbx.a * opacity) div 255).uint8
    image.data[i] = rgbx
    inc i
|
||||||
|
|
||||||
when defined(release):
|
when defined(release):
|
||||||
{.pop.}
|
{.pop.}
|
||||||
|
|
|
@ -3,7 +3,7 @@ import std/macros, std/tables
|
||||||
var simdProcs* {.compiletime.}: Table[string, NimNode]
|
var simdProcs* {.compiletime.}: Table[string, NimNode]
|
||||||
|
|
||||||
proc procName(procedure: NimNode): string =
|
proc procName(procedure: NimNode): string =
|
||||||
## Given a procedure signature returns only name string.
|
## Given a procedure this returns the name as a string.
|
||||||
let nameNode = procedure[0]
|
let nameNode = procedure[0]
|
||||||
if nameNode.kind == nnkPostfix:
|
if nameNode.kind == nnkPostfix:
|
||||||
nameNode[1].strVal
|
nameNode[1].strVal
|
||||||
|
@ -11,16 +11,34 @@ proc procName(procedure: NimNode): string =
|
||||||
nameNode.strVal
|
nameNode.strVal
|
||||||
|
|
||||||
proc procArguments(procedure: NimNode): seq[NimNode] =
|
proc procArguments(procedure: NimNode): seq[NimNode] =
|
||||||
## Given a procedure signature gets the arguments as a list.
|
## Given a procedure this gets the arguments as a list.
|
||||||
for i, arg in procedure[3]:
|
for i, arg in procedure[3]:
|
||||||
if i > 0:
|
if i > 0:
|
||||||
for j in 0 ..< arg.len - 2:
|
for j in 0 ..< arg.len - 2:
|
||||||
result.add(arg[j])
|
result.add(arg[j])
|
||||||
|
|
||||||
proc procReturnType(procedure: NimNode): NimNode =
|
proc procReturnType(procedure: NimNode): NimNode =
|
||||||
## Given a procedure signature gets the return type.
|
## Given a procedure this gets the return type.
|
||||||
procedure[3][0]
|
procedure[3][0]
|
||||||
|
|
||||||
|
proc procSignature(procedure: NimNode): string =
  ## Builds a string describing the parameter types and return type of the
  ## given procedure node, e.g. `(Image, float32)` or `(Image): Mask`.
  ## Parameter names are not part of the string, only their types.
  let params = procedure[3]
  var needsSeparator = false

  result.add "("
  # Index 0 of the formal parameters is the return type, so start at 1.
  for i in 1 ..< params.len:
    let arg = params[i]
    # One entry per declared name, so `a, b: int` contributes `int, int`.
    for _ in 0 ..< arg.len - 2:
      if needsSeparator:
        result.add ", "
      result.add arg[^2].repr
      needsSeparator = true
  result.add ")"

  let returnType = procedure.procReturnType()
  if returnType.kind != nnkEmpty:
    result.add ": "
    result.add returnType.repr
|
||||||
|
|
||||||
proc callAndReturn(name: NimNode, procedure: NimNode): NimNode =
|
proc callAndReturn(name: NimNode, procedure: NimNode): NimNode =
|
||||||
## Produces a procedure call with arguments.
|
## Produces a procedure call with arguments.
|
||||||
let
|
let
|
||||||
|
@ -38,8 +56,8 @@ proc callAndReturn(name: NimNode, procedure: NimNode): NimNode =
|
||||||
return `call`
|
return `call`
|
||||||
|
|
||||||
macro simd*(procedure: untyped) =
|
macro simd*(procedure: untyped) =
|
||||||
let name = procedure.procName()
|
let signature = procedure.procName() & procSignature(procedure)
|
||||||
simdProcs[name] = procedure.copy()
|
simdProcs[signature] = procedure.copy()
|
||||||
return procedure
|
return procedure
|
||||||
|
|
||||||
macro hasSimd*(procedure: untyped) =
|
macro hasSimd*(procedure: untyped) =
|
||||||
|
@ -53,25 +71,31 @@ macro hasSimd*(procedure: untyped) =
|
||||||
callAvx = callAndReturn(ident(nameAvx), procedure)
|
callAvx = callAndReturn(ident(nameAvx), procedure)
|
||||||
callAvx2 = callAndReturn(ident(nameAvx2), procedure)
|
callAvx2 = callAndReturn(ident(nameAvx2), procedure)
|
||||||
|
|
||||||
var body = newStmtList()
|
var
|
||||||
|
foundSimd: bool
|
||||||
|
body = newStmtList()
|
||||||
|
|
||||||
when defined(amd64) and not defined(pixieNoAvx):
|
when defined(amd64) and not defined(pixieNoAvx):
|
||||||
if nameAvx2 in simdProcs:
|
if nameAvx2 & procSignature(procedure) in simdProcs:
|
||||||
|
foundSimd = true
|
||||||
body.add quote do:
|
body.add quote do:
|
||||||
if cpuHasAvx2:
|
if cpuHasAvx2:
|
||||||
`callAvx2`
|
`callAvx2`
|
||||||
|
|
||||||
if nameAvx in simdProcs:
|
if nameAvx & procSignature(procedure) in simdProcs:
|
||||||
|
foundSimd = true
|
||||||
body.add quote do:
|
body.add quote do:
|
||||||
if cpuHasAvx2:
|
if cpuHasAvx2:
|
||||||
`callAvx`
|
`callAvx`
|
||||||
|
|
||||||
if nameSse2 in simdProcs:
|
if nameSse2 & procSignature(procedure) in simdProcs:
|
||||||
let bodySse2 = simdProcs[nameSse2][6]
|
foundSimd = true
|
||||||
|
let bodySse2 = simdProcs[nameSse2 & procSignature(procedure)][6]
|
||||||
body.add quote do:
|
body.add quote do:
|
||||||
`bodySse2`
|
`bodySse2`
|
||||||
elif nameNeon in simdProcs:
|
elif nameNeon & procSignature(procedure) in simdProcs:
|
||||||
let bodyNeon = simdProcs[nameNeon][6]
|
foundSimd = true
|
||||||
|
let bodyNeon = simdProcs[nameNeon & procSignature(procedure)][6]
|
||||||
body.add quote do:
|
body.add quote do:
|
||||||
`bodyNeon`
|
`bodyNeon`
|
||||||
else:
|
else:
|
||||||
|
@ -80,4 +104,8 @@ macro hasSimd*(procedure: untyped) =
|
||||||
|
|
||||||
procedure[6] = body
|
procedure[6] = body
|
||||||
|
|
||||||
|
when not defined(pixieNoSimd):
|
||||||
|
if not foundSimd:
|
||||||
|
echo "No SIMD found for " & name & procSignature(procedure)
|
||||||
|
|
||||||
return procedure
|
return procedure
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
import chroma, internal, nimsimd/neon, pixie/common
|
import chroma, internal, nimsimd/neon, pixie/common, system/memory, vmath
|
||||||
|
|
||||||
when defined(release):
|
when defined(release):
|
||||||
{.push checks: off.}
|
{.push checks: off.}
|
||||||
|
@ -150,7 +150,7 @@ proc toPremultipliedAlphaNeon*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
|
||||||
channels.val[2] = premultiply(channels.val[2], channels.val[3])
|
channels.val[2] = premultiply(channels.val[2], channels.val[3])
|
||||||
vst4_u8(cast[pointer](p), channels)
|
vst4_u8(cast[pointer](p), channels)
|
||||||
p += 32
|
p += 32
|
||||||
i += 8
|
i += 8 * iterations
|
||||||
|
|
||||||
for i in i ..< data.len:
|
for i in i ..< data.len:
|
||||||
var c = data[i]
|
var c = data[i]
|
||||||
|
@ -194,5 +194,151 @@ proc newMaskNeon*(image: Image): Mask {.simd.} =
|
||||||
for i in i ..< image.data.len:
|
for i in i ..< image.data.len:
|
||||||
result.data[i] = image.data[i].a
|
result.data[i] = image.data[i].a
|
||||||
|
|
||||||
|
proc invertNeon*(image: Image) {.simd.} =
  ## Inverts every channel (including alpha) of each pixel in place and then
  ## re-premultiplies the data with `toPremultipliedAlphaNeon`, since the
  ## inverted values are not valid premultiplied alpha on their own.
  var
    i: int
    p = cast[uint](image.data[0].addr)
  # Align to 16 bytes, inverting one pixel (4 bytes) at a time.
  while i < image.data.len and (p and 15) != 0:
    var rgbx = image.data[i]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
    image.data[i] = rgbx
    inc i
    p += 4

  let
    vec255 = vmovq_n_u8(255)
    # Count only the pixels that remain after the alignment loop. Using the
    # full length here would make the vector loop read and write past the
    # end of the buffer whenever the alignment loop consumed pixels.
    iterations = (image.data.len - i) div 16
  for _ in 0 ..< iterations:
    # 16 pixels (64 bytes) per iteration, loaded as four channel vectors.
    var channels = vld4q_u8(cast[pointer](p))
    channels.val[0] = vsubq_u8(vec255, channels.val[0])
    channels.val[1] = vsubq_u8(vec255, channels.val[1])
    channels.val[2] = vsubq_u8(vec255, channels.val[2])
    channels.val[3] = vsubq_u8(vec255, channels.val[3])
    vst4q_u8(cast[pointer](p), channels)
    p += 64
  i += 16 * iterations

  # Scalar tail for the pixels that did not fill a whole vector iteration.
  for i in i ..< image.data.len:
    var rgbx = image.data[i]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
    image.data[i] = rgbx

  toPremultipliedAlphaNeon(image.data)
|
||||||
|
|
||||||
|
proc invertNeon*(mask: Mask) {.simd.} =
  ## Replaces every mask value `v` with `255 - v`, in place.
  var
    i: int
    p = cast[uint](mask.data[0].addr)
  # Align to 16 bytes, inverting one value (1 byte) at a time.
  while i < mask.data.len and (p and 15) != 0:
    mask.data[i] = 255 - mask.data[i]
    inc i
    inc p

  let
    vec255 = vmovq_n_u8(255)
    # Count only the values that remain after the alignment loop. Using the
    # full length here would make the vector loop read and write past the
    # end of the buffer whenever the alignment loop consumed values.
    iterations = (mask.data.len - i) div 16
  for _ in 0 ..< iterations:
    # 16 mask values per iteration.
    let values = vld1q_u8(cast[pointer](p))
    vst1q_u8(cast[pointer](p), vsubq_u8(vec255, values))
    p += 16
  i += 16 * iterations

  # Scalar tail for the values that did not fill a whole vector.
  for i in i ..< mask.data.len:
    mask.data[i] = 255 - mask.data[i]
|
||||||
|
|
||||||
|
proc ceilNeon*(mask: Mask) {.simd.} =
  ## Sets every non-zero mask value to 255, in place. Zero values are left
  ## as zero.
  let
    zeroVec = vmovq_n_u8(0)
    vec255 = vmovq_n_u8(255)
    # 16 mask values are processed per vector iteration.
    iterations = mask.data.len div 16

  var p = cast[uint](mask.data[0].addr)
  for _ in 0 ..< iterations:
    # Mark the lanes that are zero, then clear those lanes out of an
    # all-255 vector so the vector path matches the scalar tail below.
    let isZero = vceqq_u8(vld1q_u8(cast[pointer](p)), zeroVec)
    vst1q_u8(cast[pointer](p), vbicq_u8(vec255, isZero))
    p += 16

  # Scalar tail for the values that did not fill a whole vector.
  var i = 16 * iterations
  while i < mask.data.len:
    if mask.data[i] != 0:
      mask.data[i] = 255
    inc i
|
||||||
|
|
||||||
|
proc applyOpacityNeon*(image: Image, opacity: float32) {.simd.} =
  ## Scales all four channels of every pixel by `opacity`, in place.
  ##
  ## `opacity` is first rounded to an 8-bit factor out of 255. A factor of
  ## 255 returns immediately and a factor of 0 fills the image with
  ## `rgbx(0, 0, 0, 0)`.
  let opacity = round(255 * opacity).uint8
  if opacity == 255:
    return

  if opacity == 0:
    fillUnsafeNeon(image.data, rgbx(0, 0, 0, 0), 0, image.data.len)
    return

  var
    i: int
    p = cast[uint](image.data[0].addr)

  proc apply(c, o: uint8x8): uint8x8 {.inline.} =
    ## Widening multiply of eight channel bytes by the opacity, narrowed
    ## back to bytes.
    ## NOTE(review): this narrowing appears to round to nearest while the
    ## scalar tail below truncates - confirm the difference is intended.
    let co = vmull_u8(c, o)
    vraddhn_u16(co, vrshrq_n_u16(co, 8))

  let
    opacityVec = vmov_n_u8(opacity)
    # 8 pixels (32 bytes) are processed per vector iteration.
    iterations = image.data.len div 8
  for _ in 0 ..< iterations:
    var channels = vld4_u8(cast[pointer](p))
    channels.val[0] = apply(channels.val[0], opacityVec)
    channels.val[1] = apply(channels.val[1], opacityVec)
    channels.val[2] = apply(channels.val[2], opacityVec)
    channels.val[3] = apply(channels.val[3], opacityVec)
    vst4_u8(cast[pointer](p), channels)
    p += 32
  i += 8 * iterations

  # Scalar tail. The product is computed in 16 bits: multiplying two uint8
  # values directly wraps modulo 256 before the division and gives wrong
  # results.
  let wideOpacity = opacity.uint16
  for i in i ..< image.data.len:
    var rgbx = image.data[i]
    rgbx.r = ((rgbx.r.uint16 * wideOpacity) div 255).uint8
    rgbx.g = ((rgbx.g.uint16 * wideOpacity) div 255).uint8
    rgbx.b = ((rgbx.b.uint16 * wideOpacity) div 255).uint8
    rgbx.a = ((rgbx.a.uint16 * wideOpacity) div 255).uint8
    image.data[i] = rgbx
|
||||||
|
|
||||||
|
proc applyOpacityNeon*(mask: Mask, opacity: float32) {.simd.} =
  ## Scales every mask value by `opacity`, in place.
  ##
  ## `opacity` is first rounded to an 8-bit factor out of 255. A factor of
  ## 255 returns immediately and a factor of 0 zeroes the whole mask.
  let opacity = round(255 * opacity).uint8
  if opacity == 255:
    return

  if opacity == 0:
    nimSetMem(mask.data[0].addr, 0.cint, mask.data.len)
    # Everything is already zero, so skip the multiply pass entirely.
    return

  var
    i: int
    p = cast[uint](mask.data[0].addr)

  let
    opacityVec = vmov_n_u8(opacity)
    # 8 mask values are processed per vector iteration.
    iterations = mask.data.len div 8
  for _ in 0 ..< iterations:
    let
      values = vld1_u8(cast[pointer](p))
      # Widening multiply to 16 bits, then narrow back to bytes.
      multiplied = vmull_u8(values, opacityVec)
      rounded = vraddhn_u16(multiplied, vrshrq_n_u16(multiplied, 8))
    vst1_u8(cast[pointer](p), rounded)
    p += 8
  i += 8 * iterations

  # Scalar tail. The product is computed in 16 bits: multiplying two uint8
  # values directly wraps modulo 256 before the division and gives wrong
  # results.
  let wideOpacity = opacity.uint16
  for i in i ..< mask.data.len:
    mask.data[i] = ((mask.data[i].uint16 * wideOpacity) div 255).uint8
|
||||||
|
|
||||||
when defined(release):
|
when defined(release):
|
||||||
{.pop.}
|
{.pop.}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
import chroma, internal, nimsimd/sse2, pixie/common, vmath
|
import chroma, internal, nimsimd/sse2, pixie/common, system/memory, vmath
|
||||||
|
|
||||||
when defined(release):
|
when defined(release):
|
||||||
{.push checks: off.}
|
{.push checks: off.}
|
||||||
|
@ -244,33 +244,24 @@ proc newMaskSse2*(image: Image): Mask {.simd.} =
|
||||||
for i in i ..< image.data.len:
|
for i in i ..< image.data.len:
|
||||||
result.data[i] = image.data[i].a
|
result.data[i] = image.data[i].a
|
||||||
|
|
||||||
proc invertSse2*(target: Image | Mask) {.simd.} =
|
proc invertSse2*(image: Image) {.simd.} =
|
||||||
var
|
var
|
||||||
i: int
|
i: int
|
||||||
p = cast[uint](target.data[0].addr)
|
p = cast[uint](image.data[0].addr)
|
||||||
# Align to 16 bytes
|
# Align to 16 bytes
|
||||||
while i < target.data.len and (p and 15) != 0:
|
while i < image.data.len and (p and 15) != 0:
|
||||||
when target is Image:
|
var rgbx = image.data[i]
|
||||||
var rgbx = target.data[i]
|
rgbx.r = 255 - rgbx.r
|
||||||
rgbx.r = 255 - rgbx.r
|
rgbx.g = 255 - rgbx.g
|
||||||
rgbx.g = 255 - rgbx.g
|
rgbx.b = 255 - rgbx.b
|
||||||
rgbx.b = 255 - rgbx.b
|
rgbx.a = 255 - rgbx.a
|
||||||
rgbx.a = 255 - rgbx.a
|
image.data[i] = rgbx
|
||||||
target.data[i] = rgbx
|
inc i
|
||||||
inc i
|
p += 4
|
||||||
p += 4
|
|
||||||
else:
|
|
||||||
target.data[i] = 255 - target.data[i]
|
|
||||||
inc i
|
|
||||||
inc p
|
|
||||||
|
|
||||||
let vec255 = mm_set1_epi8(255)
|
|
||||||
|
|
||||||
when target is Image:
|
|
||||||
let iterations = target.data.len div 16
|
|
||||||
else:
|
|
||||||
let iterations = target.data.len div 64
|
|
||||||
|
|
||||||
|
let
|
||||||
|
vec255 = mm_set1_epi8(255)
|
||||||
|
iterations = image.data.len div 16
|
||||||
for _ in 0 ..< iterations:
|
for _ in 0 ..< iterations:
|
||||||
let
|
let
|
||||||
a = mm_load_si128(cast[pointer](p))
|
a = mm_load_si128(cast[pointer](p))
|
||||||
|
@ -282,24 +273,46 @@ proc invertSse2*(target: Image | Mask) {.simd.} =
|
||||||
mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
|
mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
|
||||||
mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
|
mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
|
||||||
p += 64
|
p += 64
|
||||||
|
i += 16 * iterations
|
||||||
|
|
||||||
when target is Image:
|
for i in i ..< image.data.len:
|
||||||
i += 16 * iterations
|
var rgbx = image.data[i]
|
||||||
|
rgbx.r = 255 - rgbx.r
|
||||||
|
rgbx.g = 255 - rgbx.g
|
||||||
|
rgbx.b = 255 - rgbx.b
|
||||||
|
rgbx.a = 255 - rgbx.a
|
||||||
|
image.data[i] = rgbx
|
||||||
|
|
||||||
for i in i ..< target.data.len:
|
toPremultipliedAlphaSse2(image.data)
|
||||||
var rgbx = target.data[i]
|
|
||||||
rgbx.r = 255 - rgbx.r
|
|
||||||
rgbx.g = 255 - rgbx.g
|
|
||||||
rgbx.b = 255 - rgbx.b
|
|
||||||
rgbx.a = 255 - rgbx.a
|
|
||||||
target.data[i] = rgbx
|
|
||||||
|
|
||||||
toPremultipliedAlphaSse2(target.data)
|
proc invertSse2*(mask: Mask) {.simd.} =
|
||||||
else:
|
var
|
||||||
i += 64 * iterations
|
i: int
|
||||||
|
p = cast[uint](mask.data[0].addr)
|
||||||
|
# Align to 16 bytes
|
||||||
|
while i < mask.data.len and (p and 15) != 0:
|
||||||
|
mask.data[i] = 255 - mask.data[i]
|
||||||
|
inc i
|
||||||
|
inc p
|
||||||
|
|
||||||
for i in i ..< target.data.len:
|
let
|
||||||
target.data[i] = 255 - target.data[i]
|
vec255 = mm_set1_epi8(255)
|
||||||
|
iterations = mask.data.len div 64
|
||||||
|
for _ in 0 ..< iterations:
|
||||||
|
let
|
||||||
|
a = mm_load_si128(cast[pointer](p))
|
||||||
|
b = mm_load_si128(cast[pointer](p + 16))
|
||||||
|
c = mm_load_si128(cast[pointer](p + 32))
|
||||||
|
d = mm_load_si128(cast[pointer](p + 48))
|
||||||
|
mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
|
||||||
|
mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
|
||||||
|
mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
|
||||||
|
mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
|
||||||
|
p += 64
|
||||||
|
i += 64 * iterations
|
||||||
|
|
||||||
|
for i in i ..< mask.data.len:
|
||||||
|
mask.data[i] = 255 - mask.data[i]
|
||||||
|
|
||||||
proc ceilSse2*(mask: Mask) {.simd.} =
|
proc ceilSse2*(mask: Mask) {.simd.} =
|
||||||
var
|
var
|
||||||
|
@ -322,34 +335,69 @@ proc ceilSse2*(mask: Mask) {.simd.} =
|
||||||
if mask.data[i] != 0:
|
if mask.data[i] != 0:
|
||||||
mask.data[i] = 255
|
mask.data[i] = 255
|
||||||
|
|
||||||
proc applyOpacitySse2*(target: Image | Mask, opacity: float32) {.simd.} =
|
proc applyOpacitySse2*(image: Image, opacity: float32) {.simd.} =
|
||||||
let opacity = round(255 * opacity).uint16
|
let opacity = round(255 * opacity).uint16
|
||||||
if opacity == 255:
|
if opacity == 255:
|
||||||
return
|
return
|
||||||
|
|
||||||
if opacity == 0:
|
if opacity == 0:
|
||||||
when target is Image:
|
fillUnsafeSse2(image.data, rgbx(0, 0, 0, 0), 0, image.data.len)
|
||||||
target.fill(rgbx(0, 0, 0, 0))
|
|
||||||
else:
|
|
||||||
target.fill(0)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
var
|
var
|
||||||
i: int
|
i: int
|
||||||
p = cast[uint](target.data[0].addr)
|
p = cast[uint](image.data[0].addr)
|
||||||
len =
|
|
||||||
when target is Image:
|
|
||||||
target.data.len * 4
|
|
||||||
else:
|
|
||||||
target.data.len
|
|
||||||
|
|
||||||
let
|
let
|
||||||
oddMask = mm_set1_epi16(0xff00)
|
oddMask = mm_set1_epi16(0xff00)
|
||||||
div255 = mm_set1_epi16(0x8081)
|
div255 = mm_set1_epi16(0x8081)
|
||||||
zeroVec = mm_setzero_si128()
|
zeroVec = mm_setzero_si128()
|
||||||
opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
|
opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
|
||||||
iterations = len div 16
|
iterations = image.data.len div 4
|
||||||
for _ in 0 ..< len div 16:
|
for _ in 0 ..< iterations:
|
||||||
|
let values = mm_loadu_si128(cast[pointer](p))
|
||||||
|
if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
|
||||||
|
var
|
||||||
|
valuesEven = mm_slli_epi16(values, 8)
|
||||||
|
valuesOdd = mm_and_si128(values, oddMask)
|
||||||
|
valuesEven = mm_mulhi_epu16(valuesEven, opacityVec)
|
||||||
|
valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec)
|
||||||
|
valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7)
|
||||||
|
valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7)
|
||||||
|
mm_storeu_si128(
|
||||||
|
cast[pointer](p),
|
||||||
|
mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8))
|
||||||
|
)
|
||||||
|
p += 16
|
||||||
|
i += 4 * iterations
|
||||||
|
|
||||||
|
for i in i ..< image.data.len:
|
||||||
|
var rgbx = image.data[i]
|
||||||
|
rgbx.r = ((rgbx.r * opacity) div 255).uint8
|
||||||
|
rgbx.g = ((rgbx.g * opacity) div 255).uint8
|
||||||
|
rgbx.b = ((rgbx.b * opacity) div 255).uint8
|
||||||
|
rgbx.a = ((rgbx.a * opacity) div 255).uint8
|
||||||
|
image.data[i] = rgbx
|
||||||
|
|
||||||
|
proc applyOpacitySse2*(mask: Mask, opacity: float32) {.simd.} =
|
||||||
|
let opacity = round(255 * opacity).uint16
|
||||||
|
if opacity == 255:
|
||||||
|
return
|
||||||
|
|
||||||
|
if opacity == 0:
|
||||||
|
nimSetMem(mask.data[0].addr, 0.cint, mask.data.len)
|
||||||
|
|
||||||
|
var
|
||||||
|
i: int
|
||||||
|
p = cast[uint](mask.data[0].addr)
|
||||||
|
|
||||||
|
let
|
||||||
|
oddMask = mm_set1_epi16(0xff00)
|
||||||
|
div255 = mm_set1_epi16(0x8081)
|
||||||
|
zeroVec = mm_setzero_si128()
|
||||||
|
opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
|
||||||
|
iterations = mask.data.len div 16
|
||||||
|
for _ in 0 ..< iterations:
|
||||||
let values = mm_loadu_si128(cast[pointer](p))
|
let values = mm_loadu_si128(cast[pointer](p))
|
||||||
if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
|
if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
|
||||||
var
|
var
|
||||||
|
@ -366,17 +414,8 @@ proc applyOpacitySse2*(target: Image | Mask, opacity: float32) {.simd.} =
|
||||||
p += 16
|
p += 16
|
||||||
i += 16 * iterations
|
i += 16 * iterations
|
||||||
|
|
||||||
when target is Image:
|
for i in i ..< mask.data.len:
|
||||||
for i in i div 4 ..< target.data.len:
|
mask.data[i] = ((mask.data[i] * opacity) div 255).uint8
|
||||||
var rgbx = target.data[i]
|
|
||||||
rgbx.r = ((rgbx.r * opacity) div 255).uint8
|
|
||||||
rgbx.g = ((rgbx.g * opacity) div 255).uint8
|
|
||||||
rgbx.b = ((rgbx.b * opacity) div 255).uint8
|
|
||||||
rgbx.a = ((rgbx.a * opacity) div 255).uint8
|
|
||||||
target.data[i] = rgbx
|
|
||||||
else:
|
|
||||||
for i in i ..< target.data.len:
|
|
||||||
target.data[i] = ((target.data[i] * opacity) div 255).uint8
|
|
||||||
|
|
||||||
when defined(release):
|
when defined(release):
|
||||||
{.pop.}
|
{.pop.}
|
||||||
|
|
|
@ -74,6 +74,7 @@ timeIt "invert":
|
||||||
reset()
|
reset()
|
||||||
|
|
||||||
timeIt "applyOpacity":
|
timeIt "applyOpacity":
|
||||||
|
reset()
|
||||||
image.applyOpacity(0.5)
|
image.applyOpacity(0.5)
|
||||||
|
|
||||||
reset()
|
reset()
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
import benchy, chroma, pixie
|
import benchy, pixie
|
||||||
|
|
||||||
let mask = newMask(2560, 1440)
|
let mask = newMask(2560, 1440)
|
||||||
|
|
||||||
|
@ -25,6 +25,7 @@ timeIt "invert":
|
||||||
reset()
|
reset()
|
||||||
|
|
||||||
timeIt "applyOpacity":
|
timeIt "applyOpacity":
|
||||||
|
reset()
|
||||||
mask.applyOpacity(0.5)
|
mask.applyOpacity(0.5)
|
||||||
|
|
||||||
reset()
|
reset()
|
||||||
|
|
Loading…
Reference in a new issue