neon procs
This commit is contained in:
parent
0f93769ef1
commit
8670e0edec
|
@ -10,7 +10,7 @@ requires "vmath >= 1.1.4"
|
|||
requires "chroma >= 0.2.6"
|
||||
requires "zippy >= 0.10.3"
|
||||
requires "flatty >= 0.3.4"
|
||||
requires "nimsimd >= 1.1.7"
|
||||
requires "nimsimd >= 1.1.8"
|
||||
requires "bumpy >= 1.1.1"
|
||||
|
||||
task bindings, "Generate bindings":
|
||||
|
|
|
@ -20,6 +20,7 @@ when allowSimd:
|
|||
|
||||
elif defined(arm64):
|
||||
import simd/neon
|
||||
export neon
|
||||
|
||||
import nimsimd/neon as nimsimdneon
|
||||
export nimsimdneon
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
import chroma, internal, nimsimd/neon, pixie/common
|
||||
import chroma, internal, nimsimd/neon, pixie/common, system/memory, vmath
|
||||
|
||||
when defined(release):
|
||||
{.push checks: off.}
|
||||
|
@ -150,7 +150,7 @@ proc toPremultipliedAlphaNeon*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
|
|||
channels.val[2] = premultiply(channels.val[2], channels.val[3])
|
||||
vst4_u8(cast[pointer](p), channels)
|
||||
p += 32
|
||||
i += 8
|
||||
i += 8 * iterations
|
||||
|
||||
for i in i ..< data.len:
|
||||
var c = data[i]
|
||||
|
@ -194,5 +194,151 @@ proc newMaskNeon*(image: Image): Mask {.simd.} =
|
|||
for i in i ..< image.data.len:
|
||||
result.data[i] = image.data[i].a
|
||||
|
||||
proc invertNeon*(image: Image) {.simd.} =
  ## Inverts every channel (r, g, b and a) of every pixel in `image.data`
  ## in place using NEON, then passes the data through
  ## `toPremultipliedAlphaNeon`.
  ##
  ## Work is split into three phases:
  ## 1. a scalar loop that advances until the data pointer is 16-byte aligned,
  ## 2. a vector loop that inverts 16 pixels (64 bytes) per iteration,
  ## 3. a scalar loop for whatever pixels remain.
  var
    i: int
    p = cast[uint](image.data[0].addr)
  # Align to 16 bytes
  while i < image.data.len and (p and 15) != 0:
    var rgbx = image.data[i]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
    image.data[i] = rgbx
    inc i
    p += 4

  let
    vec255 = vmovq_n_u8(255)
    # Only the pixels that remain after the alignment loop may be processed
    # here; counting from the full length would run past the end of the
    # buffer whenever the alignment loop consumed any pixels.
    iterations = (image.data.len - i) div 16
  for _ in 0 ..< iterations:
    # De-interleaving load: val[0..3] hold 16 r, g, b and a bytes each.
    var channels = vld4q_u8(cast[pointer](p))
    channels.val[0] = vsubq_u8(vec255, channels.val[0])
    channels.val[1] = vsubq_u8(vec255, channels.val[1])
    channels.val[2] = vsubq_u8(vec255, channels.val[2])
    channels.val[3] = vsubq_u8(vec255, channels.val[3])
    vst4q_u8(cast[pointer](p), channels)
    p += 64
  i += 16 * iterations

  # Scalar tail for the pixels that did not fill a whole vector iteration.
  for i in i ..< image.data.len:
    var rgbx = image.data[i]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
    image.data[i] = rgbx

  # NOTE(review): presumably needed because inverting a premultiplied pixel
  # can leave color channels larger than alpha - confirm against the scalar
  # implementation.
  toPremultipliedAlphaNeon(image.data)
|
||||
|
||||
proc invertNeon*(mask: Mask) {.simd.} =
  ## Replaces every value `v` in `mask.data` with `255 - v`, in place,
  ## using NEON.
  ##
  ## A scalar loop first advances to a 16-byte aligned address, a vector
  ## loop then handles 16 values per iteration, and a scalar loop finishes
  ## the remaining values.
  var
    i: int
    p = cast[uint](mask.data[0].addr)
  # Align to 16 bytes
  while i < mask.data.len and (p and 15) != 0:
    mask.data[i] = 255 - mask.data[i]
    inc i
    inc p

  let
    vec255 = vmovq_n_u8(255)
    # Count only the values left after the alignment loop; using the full
    # length would make the vector loop run past the end of the buffer
    # whenever the alignment loop consumed any values.
    iterations = (mask.data.len - i) div 16
  for _ in 0 ..< iterations:
    let values = vld1q_u8(cast[pointer](p))
    vst1q_u8(cast[pointer](p), vsubq_u8(vec255, values))
    p += 16
  i += 16 * iterations

  # Scalar tail for the values that did not fill a whole vector.
  for i in i ..< mask.data.len:
    mask.data[i] = 255 - mask.data[i]
|
||||
|
||||
proc ceilNeon*(mask: Mask) {.simd.} =
  ## Sets every non-zero value in `mask.data` to 255, in place, leaving
  ## zero values untouched. Values are processed 16 at a time with NEON,
  ## followed by a scalar loop for the remainder.
  let
    allZero = vmovq_n_u8(0)
    allOnes = vmovq_n_u8(255)
    blocks = mask.data.len div 16

  var address = cast[uint](mask.data[0].addr)
  for _ in 0 ..< blocks:
    # Lanes equal to zero compare to 0xFF; clearing those bits from the
    # all-ones vector leaves 255 exactly in the lanes that were non-zero.
    let isZero = vceqq_u8(vld1q_u8(cast[pointer](address)), allZero)
    vst1q_u8(cast[pointer](address), vbicq_u8(allOnes, isZero))
    address += 16

  # Scalar remainder, starting right after the last full vector.
  var idx = 16 * blocks
  while idx < mask.data.len:
    if mask.data[idx] != 0:
      mask.data[idx] = 255
    inc idx
|
||||
|
||||
proc applyOpacityNeon*(image: Image, opacity: float32) {.simd.} =
  ## Multiplies every channel of every pixel in `image.data` by `opacity`,
  ## in place, using NEON.
  ##
  ## `opacity` is converted to an 8-bit value with `round(255 * opacity)`.
  ## A converted value of 255 leaves the image untouched and a converted
  ## value of 0 fills the image with `rgbx(0, 0, 0, 0)`.
  let opacity = round(255 * opacity).uint8
  if opacity == 255:
    return

  if opacity == 0:
    fillUnsafeNeon(image.data, rgbx(0, 0, 0, 0), 0, image.data.len)
    return

  var
    i: int
    p = cast[uint](image.data[0].addr)

  proc apply(c, o: uint8x8): uint8x8 {.inline.} =
    # Widening multiply to 16 bits, then divide by 255 using the
    # add-shifted-copy-and-narrow approximation.
    let co = vmull_u8(c, o)
    vraddhn_u16(co, vrshrq_n_u16(co, 8))

  let
    opacityVec = vmov_n_u8(opacity)
    iterations = image.data.len div 8
  for _ in 0 ..< iterations:
    # De-interleaving load: val[0..3] hold 8 r, g, b and a bytes each.
    var channels = vld4_u8(cast[pointer](p))
    channels.val[0] = apply(channels.val[0], opacityVec)
    channels.val[1] = apply(channels.val[1], opacityVec)
    channels.val[2] = apply(channels.val[2], opacityVec)
    channels.val[3] = apply(channels.val[3], opacityVec)
    vst4_u8(cast[pointer](p), channels)
    p += 32
  i += 8 * iterations

  # Scalar tail. The product of two uint8 values wraps modulo 256 in Nim,
  # so the multiplication has to happen in a wider type before dividing.
  # NOTE(review): this tail truncates (`div 255`) while the vector path
  # above rounds; results may differ by one between the two - confirm
  # which is intended.
  let wideOpacity = opacity.uint32
  for i in i ..< image.data.len:
    var rgbx = image.data[i]
    rgbx.r = ((rgbx.r.uint32 * wideOpacity) div 255).uint8
    rgbx.g = ((rgbx.g.uint32 * wideOpacity) div 255).uint8
    rgbx.b = ((rgbx.b.uint32 * wideOpacity) div 255).uint8
    rgbx.a = ((rgbx.a.uint32 * wideOpacity) div 255).uint8
    image.data[i] = rgbx
|
||||
|
||||
proc applyOpacityNeon*(mask: Mask, opacity: float32) {.simd.} =
  ## Multiplies every value in `mask.data` by `opacity`, in place, using
  ## NEON.
  ##
  ## `opacity` is converted to an 8-bit value with `round(255 * opacity)`.
  ## A converted value of 255 leaves the mask untouched and a converted
  ## value of 0 zeroes the whole mask.
  let opacity = round(255 * opacity).uint8
  if opacity == 255:
    return

  if opacity == 0:
    nimSetMem(mask.data[0].addr, 0.cint, mask.data.len)
    # Nothing left to do once the mask is cleared; without this return the
    # loops below would needlessly process the whole buffer again.
    return

  var
    i: int
    p = cast[uint](mask.data[0].addr)

  let
    opacityVec = vmov_n_u8(opacity)
    iterations = mask.data.len div 8
  for _ in 0 ..< iterations:
    let
      values = vld1_u8(cast[pointer](p))
      # Widening multiply to 16 bits, then divide by 255 using the
      # add-shifted-copy-and-narrow approximation.
      multiplied = vmull_u8(values, opacityVec)
      rounded = vraddhn_u16(multiplied, vrshrq_n_u16(multiplied, 8))
    vst1_u8(cast[pointer](p), rounded)
    p += 8
  i += 8 * iterations

  # Scalar tail. The product of two uint8 values wraps modulo 256 in Nim,
  # so the multiplication has to happen in a wider type before dividing.
  # NOTE(review): this tail truncates (`div 255`) while the vector path
  # above rounds; results may differ by one between the two - confirm
  # which is intended.
  let wideOpacity = opacity.uint32
  for i in i ..< mask.data.len:
    mask.data[i] = ((mask.data[i].uint32 * wideOpacity) div 255).uint8
|
||||
|
||||
when defined(release):
|
||||
{.pop.}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
import benchy, chroma, pixie
|
||||
import benchy, pixie
|
||||
|
||||
let mask = newMask(2560, 1440)
|
||||
|
||||
|
@ -25,6 +25,7 @@ timeIt "invert":
|
|||
reset()
|
||||
|
||||
timeIt "applyOpacity":
|
||||
reset()
|
||||
mask.applyOpacity(0.5)
|
||||
|
||||
reset()
|
||||
|
|
Loading…
Reference in a new issue