neon procs

This commit is contained in:
Ryan Oldenburg 2022-07-23 00:13:19 -05:00
parent 0f93769ef1
commit 8670e0edec
4 changed files with 152 additions and 4 deletions

View file

@ -10,7 +10,7 @@ requires "vmath >= 1.1.4"
requires "chroma >= 0.2.6"
requires "zippy >= 0.10.3"
requires "flatty >= 0.3.4"
requires "nimsimd >= 1.1.7"
requires "nimsimd >= 1.1.8"
requires "bumpy >= 1.1.1"
task bindings, "Generate bindings":

View file

@ -20,6 +20,7 @@ when allowSimd:
elif defined(arm64):
import simd/neon
export neon
import nimsimd/neon as nimsimdneon
export nimsimdneon

View file

@ -1,4 +1,4 @@
import chroma, internal, nimsimd/neon, pixie/common
import chroma, internal, nimsimd/neon, pixie/common, system/memory, vmath
when defined(release):
{.push checks: off.}
@ -150,7 +150,7 @@ proc toPremultipliedAlphaNeon*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
channels.val[2] = premultiply(channels.val[2], channels.val[3])
vst4_u8(cast[pointer](p), channels)
p += 32
i += 8
i += 8 * iterations
for i in i ..< data.len:
var c = data[i]
@ -194,5 +194,151 @@ proc newMaskNeon*(image: Image): Mask {.simd.} =
for i in i ..< image.data.len:
result.data[i] = image.data[i].a
proc invertNeon*(image: Image) {.simd.} =
  ## Inverts all four channels (r, g, b and a) of every pixel in place using
  ## NEON, then calls toPremultipliedAlphaNeon on the pixel data.
  if image.data.len == 0:
    # Nothing to invert, and taking the address of data[0] would be invalid.
    return

  var
    i: int
    p = cast[uint](image.data[0].addr)
  # Align to 16 bytes, inverting one pixel (4 bytes) at a time.
  while i < image.data.len and (p and 15) != 0:
    var rgbx = image.data[i]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
    image.data[i] = rgbx
    inc i
    p += 4

  let
    vec255 = vmovq_n_u8(255)
    # Only the pixels remaining after the alignment loop may be vectorized.
    # Using the full length here would let the loop below run past the end
    # of the buffer whenever the alignment loop consumed any pixels.
    iterations = (image.data.len - i) div 16
  for _ in 0 ..< iterations:
    # Each step loads 16 pixels (64 bytes) split into four channel vectors.
    var channels = vld4q_u8(cast[pointer](p))
    channels.val[0] = vsubq_u8(vec255, channels.val[0])
    channels.val[1] = vsubq_u8(vec255, channels.val[1])
    channels.val[2] = vsubq_u8(vec255, channels.val[2])
    channels.val[3] = vsubq_u8(vec255, channels.val[3])
    vst4q_u8(cast[pointer](p), channels)
    p += 64
  i += 16 * iterations

  # Scalar pass over whatever did not fill a whole 16-pixel step.
  for i in i ..< image.data.len:
    var rgbx = image.data[i]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
    image.data[i] = rgbx

  toPremultipliedAlphaNeon(image.data)
proc invertNeon*(mask: Mask) {.simd.} =
  ## Replaces every mask value v with 255 - v, in place, using NEON.
  if mask.data.len == 0:
    # Nothing to invert, and taking the address of data[0] would be invalid.
    return

  var
    i: int
    p = cast[uint](mask.data[0].addr)
  # Align to 16 bytes, inverting one value at a time.
  while i < mask.data.len and (p and 15) != 0:
    mask.data[i] = 255 - mask.data[i]
    inc i
    inc p

  let
    vec255 = vmovq_n_u8(255)
    # Only the values remaining after the alignment loop may be vectorized.
    # Using the full length here would let the loop below run past the end
    # of the buffer whenever the alignment loop consumed any values.
    iterations = (mask.data.len - i) div 16
  for _ in 0 ..< iterations:
    let values = vld1q_u8(cast[pointer](p))
    vst1q_u8(cast[pointer](p), vsubq_u8(vec255, values))
    p += 16
  i += 16 * iterations

  # Scalar pass over whatever did not fill a whole 16-value step.
  for i in i ..< mask.data.len:
    mask.data[i] = 255 - mask.data[i]
proc ceilNeon*(mask: Mask) {.simd.} =
  ## Snaps every mask value to 0 or 255 in place: zero stays zero and any
  ## non-zero value becomes 255.
  var address = cast[uint](mask.data[0].addr)

  let
    allZero = vmovq_n_u8(0)
    allOnes = vmovq_n_u8(255)
    chunks = mask.data.len div 16

  # Vector pass: 16 mask values per step.
  for _ in 0 ..< chunks:
    # Lanes that equal zero compare to all-ones; clearing those bits out of
    # 255 leaves 0 for the zero lanes and 255 for every other lane.
    let isZero = vceqq_u8(vld1q_u8(cast[pointer](address)), allZero)
    vst1q_u8(cast[pointer](address), vbicq_u8(allOnes, isZero))
    address += 16

  # Scalar pass over the values that did not fill a whole 16-value step.
  for idx in 16 * chunks ..< mask.data.len:
    if mask.data[idx] != 0:
      mask.data[idx] = 255
proc applyOpacityNeon*(image: Image, opacity: float32) {.simd.} =
  ## Scales all four channels of every pixel in place by `opacity`, which is
  ## first converted to an 8-bit factor via round(255 * opacity).
  ## A factor of 255 leaves the image untouched; a factor of 0 fills the
  ## image with rgbx(0, 0, 0, 0).
  if image.data.len == 0:
    # Nothing to scale, and taking the address of data[0] would be invalid.
    return

  let opacity = round(255 * opacity).uint8
  if opacity == 255:
    return

  if opacity == 0:
    fillUnsafeNeon(image.data, rgbx(0, 0, 0, 0), 0, image.data.len)
    return

  var
    i: int
    p = cast[uint](image.data[0].addr)

  proc apply(c, o: uint8x8): uint8x8 {.inline.} =
    # Widening multiply to 16 bits, then narrow back to 8 bits with rounding
    # (a rounded division of the product by 255).
    let co = vmull_u8(c, o)
    vraddhn_u16(co, vrshrq_n_u16(co, 8))

  let
    opacityVec = vmov_n_u8(opacity)
    iterations = image.data.len div 8
  for _ in 0 ..< iterations:
    # Each step loads 8 pixels (32 bytes) split into four channel vectors.
    var channels = vld4_u8(cast[pointer](p))
    channels.val[0] = apply(channels.val[0], opacityVec)
    channels.val[1] = apply(channels.val[1], opacityVec)
    channels.val[2] = apply(channels.val[2], opacityVec)
    channels.val[3] = apply(channels.val[3], opacityVec)
    vst4_u8(cast[pointer](p), channels)
    p += 32
  i += 8 * iterations

  # The product of two 8-bit values needs 16 bits. Multiplying as uint8
  # wraps modulo 256 before the division and yields only 0 or 1.
  # NOTE(review): this tail truncates (div) while the vector path rounds;
  # confirm which of the two is intended.
  let wideOpacity = opacity.uint16
  for i in i ..< image.data.len:
    var rgbx = image.data[i]
    rgbx.r = ((rgbx.r.uint16 * wideOpacity) div 255).uint8
    rgbx.g = ((rgbx.g.uint16 * wideOpacity) div 255).uint8
    rgbx.b = ((rgbx.b.uint16 * wideOpacity) div 255).uint8
    rgbx.a = ((rgbx.a.uint16 * wideOpacity) div 255).uint8
    image.data[i] = rgbx
proc applyOpacityNeon*(mask: Mask, opacity: float32) {.simd.} =
  ## Scales every mask value in place by `opacity`, which is first converted
  ## to an 8-bit factor via round(255 * opacity).
  ## A factor of 255 leaves the mask untouched; a factor of 0 zeroes it.
  if mask.data.len == 0:
    # Nothing to scale, and taking the address of data[0] would be invalid.
    return

  let opacity = round(255 * opacity).uint8
  if opacity == 255:
    return

  if opacity == 0:
    nimSetMem(mask.data[0].addr, 0.cint, mask.data.len)
    # The buffer is already fully cleared; without this return the code
    # below would needlessly walk the whole mask again.
    return

  var
    i: int
    p = cast[uint](mask.data[0].addr)

  let
    opacityVec = vmov_n_u8(opacity)
    iterations = mask.data.len div 8
  for _ in 0 ..< iterations:
    let
      values = vld1_u8(cast[pointer](p))
      # Widening multiply to 16 bits, then narrow back to 8 bits with
      # rounding (a rounded division of the product by 255).
      multiplied = vmull_u8(values, opacityVec)
      rounded = vraddhn_u16(multiplied, vrshrq_n_u16(multiplied, 8))
    vst1_u8(cast[pointer](p), rounded)
    p += 8
  i += 8 * iterations

  # The product of two 8-bit values needs 16 bits. Multiplying as uint8
  # wraps modulo 256 before the division and yields only 0 or 1.
  # NOTE(review): this tail truncates (div) while the vector path rounds;
  # confirm which of the two is intended.
  let wideOpacity = opacity.uint16
  for i in i ..< mask.data.len:
    mask.data[i] = ((mask.data[i].uint16 * wideOpacity) div 255).uint8
when defined(release):
{.pop.}

View file

@ -1,4 +1,4 @@
import benchy, chroma, pixie
import benchy, pixie
let mask = newMask(2560, 1440)
@ -25,6 +25,7 @@ timeIt "invert":
reset()
timeIt "applyOpacity":
reset()
mask.applyOpacity(0.5)
reset()