neon procs
This commit is contained in:
parent
0f93769ef1
commit
8670e0edec
4 changed files with 152 additions and 4 deletions
|
@ -10,7 +10,7 @@ requires "vmath >= 1.1.4"
|
||||||
requires "chroma >= 0.2.6"
|
requires "chroma >= 0.2.6"
|
||||||
requires "zippy >= 0.10.3"
|
requires "zippy >= 0.10.3"
|
||||||
requires "flatty >= 0.3.4"
|
requires "flatty >= 0.3.4"
|
||||||
requires "nimsimd >= 1.1.7"
|
requires "nimsimd >= 1.1.8"
|
||||||
requires "bumpy >= 1.1.1"
|
requires "bumpy >= 1.1.1"
|
||||||
|
|
||||||
task bindings, "Generate bindings":
|
task bindings, "Generate bindings":
|
||||||
|
|
|
@ -20,6 +20,7 @@ when allowSimd:
|
||||||
|
|
||||||
elif defined(arm64):
|
elif defined(arm64):
|
||||||
import simd/neon
|
import simd/neon
|
||||||
|
export neon
|
||||||
|
|
||||||
import nimsimd/neon as nimsimdneon
|
import nimsimd/neon as nimsimdneon
|
||||||
export nimsimdneon
|
export nimsimdneon
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
import chroma, internal, nimsimd/neon, pixie/common
|
import chroma, internal, nimsimd/neon, pixie/common, system/memory, vmath
|
||||||
|
|
||||||
when defined(release):
|
when defined(release):
|
||||||
{.push checks: off.}
|
{.push checks: off.}
|
||||||
|
@ -150,7 +150,7 @@ proc toPremultipliedAlphaNeon*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
|
||||||
channels.val[2] = premultiply(channels.val[2], channels.val[3])
|
channels.val[2] = premultiply(channels.val[2], channels.val[3])
|
||||||
vst4_u8(cast[pointer](p), channels)
|
vst4_u8(cast[pointer](p), channels)
|
||||||
p += 32
|
p += 32
|
||||||
i += 8
|
i += 8 * iterations
|
||||||
|
|
||||||
for i in i ..< data.len:
|
for i in i ..< data.len:
|
||||||
var c = data[i]
|
var c = data[i]
|
||||||
|
@ -194,5 +194,151 @@ proc newMaskNeon*(image: Image): Mask {.simd.} =
|
||||||
for i in i ..< image.data.len:
|
for i in i ..< image.data.len:
|
||||||
result.data[i] = image.data[i].a
|
result.data[i] = image.data[i].a
|
||||||
|
|
||||||
|
proc invertNeon*(image: Image) {.simd.} =
  ## Inverts the r, g, b and a channels of every pixel of `image` in place
  ## (each channel `c` becomes `255 - c`), then passes the data through
  ## `toPremultipliedAlphaNeon`.
  if image.data.len == 0:
    # Nothing to invert; also avoids taking the address of `data[0]`.
    return

  var
    i: int
    p = cast[uint](image.data[0].addr)

  # Scalar head: handle one pixel (4 bytes) at a time until `p` is
  # aligned to 16 bytes or the data runs out.
  while i < image.data.len and (p and 15) != 0:
    var rgbx = image.data[i]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
    image.data[i] = rgbx
    inc i
    p += 4

  let
    vec255 = vmovq_n_u8(255)
    # Only the pixels that remain after the head loop may be vectorized.
    # Using the full length here would let the loop below read and write
    # up to `i` pixels past the end of the buffer.
    iterations = (image.data.len - i) div 16

  # Vector body: 16 pixels (64 bytes) per pass, de-interleaved by channel.
  for _ in 0 ..< iterations:
    var channels = vld4q_u8(cast[pointer](p))
    channels.val[0] = vsubq_u8(vec255, channels.val[0])
    channels.val[1] = vsubq_u8(vec255, channels.val[1])
    channels.val[2] = vsubq_u8(vec255, channels.val[2])
    channels.val[3] = vsubq_u8(vec255, channels.val[3])
    vst4q_u8(cast[pointer](p), channels)
    p += 64
  i += 16 * iterations

  # Scalar tail: whatever did not fill a whole 16-pixel pass.
  for i in i ..< image.data.len:
    var rgbx = image.data[i]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
    image.data[i] = rgbx

  # Run the inverted channel values through the premultiply pass.
  toPremultipliedAlphaNeon(image.data)
|
||||||
|
|
||||||
|
proc invertNeon*(mask: Mask) {.simd.} =
  ## Replaces every value `v` of `mask` with `255 - v`, in place.
  if mask.data.len == 0:
    # Nothing to invert; also avoids taking the address of `data[0]`.
    return

  var
    i: int
    p = cast[uint](mask.data[0].addr)

  # Scalar head: one byte at a time until `p` is aligned to 16 bytes or
  # the data runs out.
  while i < mask.data.len and (p and 15) != 0:
    mask.data[i] = 255 - mask.data[i]
    inc i
    inc p

  let
    vec255 = vmovq_n_u8(255)
    # Count only the values left after the head loop; using the full
    # length would let the vector loop run past the end of the buffer.
    iterations = (mask.data.len - i) div 16

  # Vector body: 16 values per pass.
  for _ in 0 ..< iterations:
    let values = vld1q_u8(cast[pointer](p))
    vst1q_u8(cast[pointer](p), vsubq_u8(vec255, values))
    p += 16
  i += 16 * iterations

  # Scalar tail: whatever did not fill a whole 16-value pass.
  for i in i ..< mask.data.len:
    mask.data[i] = 255 - mask.data[i]
|
||||||
|
|
||||||
|
proc ceilNeon*(mask: Mask) {.simd.} =
  ## Sets every non-zero value of `mask` to 255 in place; zero values are
  ## left as zero.
  var
    done: int
    cursor = cast[uint](mask.data[0].addr)

  let
    allZero = vmovq_n_u8(0)
    allOnes = vmovq_n_u8(255)
    passes = mask.data.len div 16

  # Vector body: 16 values per pass.
  for _ in 0 ..< passes:
    # Lanes that equal zero compare to all-ones, every other lane to zero.
    let wasZero = vceqq_u8(vld1q_u8(cast[pointer](cursor)), allZero)
    # 255 AND NOT wasZero: zero lanes stay 0, all others become 255.
    vst1q_u8(cast[pointer](cursor), vbicq_u8(allOnes, wasZero))
    cursor += 16
  done += 16 * passes

  # Scalar tail: whatever did not fill a whole 16-value pass.
  for idx in done ..< mask.data.len:
    if mask.data[idx] != 0:
      mask.data[idx] = 255
|
||||||
|
|
||||||
|
proc applyOpacityNeon*(image: Image, opacity: float32) {.simd.} =
  ## Scales the r, g, b and a channels of every pixel of `image` in place
  ## by `opacity`.
  ##
  ## `opacity` is converted to an 8-bit factor with
  ## `round(255 * opacity).uint8`. A factor of 255 leaves the image
  ## untouched and a factor of 0 fills it with `rgbx(0, 0, 0, 0)`.
  ## NOTE(review): presumably `opacity` is expected to lie in 0.0 .. 1.0;
  ## confirm against callers.
  let opacity = round(255 * opacity).uint8
  if opacity == 255:
    return

  if opacity == 0:
    fillUnsafeNeon(image.data, rgbx(0, 0, 0, 0), 0, image.data.len)
    return

  if image.data.len == 0:
    # Nothing to scale; also avoids taking the address of `data[0]`.
    return

  var
    i: int
    p = cast[uint](image.data[0].addr)

  proc apply(c, o: uint8x8): uint8x8 {.inline.} =
    # Widening multiply to 16 bits, then narrow back to 8 bits with
    # rounding: (co + ((co + 128) shr 8) + 128) shr 8.
    let co = vmull_u8(c, o)
    vraddhn_u16(co, vrshrq_n_u16(co, 8))

  let
    opacityVec = vmov_n_u8(opacity)
    iterations = image.data.len div 8

  # Vector body: 8 pixels (32 bytes) per pass, de-interleaved by channel.
  for _ in 0 ..< iterations:
    var channels = vld4_u8(cast[pointer](p))
    channels.val[0] = apply(channels.val[0], opacityVec)
    channels.val[1] = apply(channels.val[1], opacityVec)
    channels.val[2] = apply(channels.val[2], opacityVec)
    channels.val[3] = apply(channels.val[3], opacityVec)
    vst4_u8(cast[pointer](p), channels)
    p += 32
  i += 8 * iterations

  # Scalar tail. The product must be formed in 16 bits: a `uint8 * uint8`
  # multiplication wraps modulo 256 before the division and yields wrong
  # channel values.
  let wideOpacity = opacity.uint16
  for i in i ..< image.data.len:
    var rgbx = image.data[i]
    rgbx.r = ((rgbx.r.uint16 * wideOpacity) div 255).uint8
    rgbx.g = ((rgbx.g.uint16 * wideOpacity) div 255).uint8
    rgbx.b = ((rgbx.b.uint16 * wideOpacity) div 255).uint8
    rgbx.a = ((rgbx.a.uint16 * wideOpacity) div 255).uint8
    image.data[i] = rgbx
|
||||||
|
|
||||||
|
proc applyOpacityNeon*(mask: Mask, opacity: float32) {.simd.} =
  ## Scales every value of `mask` in place by `opacity`.
  ##
  ## `opacity` is converted to an 8-bit factor with
  ## `round(255 * opacity).uint8`. A factor of 255 leaves the mask
  ## untouched and a factor of 0 zeroes it.
  ## NOTE(review): presumably `opacity` is expected to lie in 0.0 .. 1.0;
  ## confirm against callers.
  let opacity = round(255 * opacity).uint8
  if opacity == 255:
    return

  if mask.data.len == 0:
    # Nothing to scale; also avoids taking the address of `data[0]`.
    return

  if opacity == 0:
    nimSetMem(mask.data[0].addr, 0.cint, mask.data.len)
    # The mask is already fully zeroed; do not fall through and process
    # the whole buffer a second time.
    return

  var
    i: int
    p = cast[uint](mask.data[0].addr)

  let
    opacityVec = vmov_n_u8(opacity)
    iterations = mask.data.len div 8

  # Vector body: 8 values per pass.
  for _ in 0 ..< iterations:
    let
      values = vld1_u8(cast[pointer](p))
      # Widening multiply to 16 bits, then narrow back to 8 bits with
      # rounding: (m + ((m + 128) shr 8) + 128) shr 8.
      multiplied = vmull_u8(values, opacityVec)
      rounded = vraddhn_u16(multiplied, vrshrq_n_u16(multiplied, 8))
    vst1_u8(cast[pointer](p), rounded)
    p += 8
  i += 8 * iterations

  # Scalar tail. The product must be formed in 16 bits: a `uint8 * uint8`
  # multiplication wraps modulo 256 before the division and yields wrong
  # values.
  let wideOpacity = opacity.uint16
  for i in i ..< mask.data.len:
    mask.data[i] = ((mask.data[i].uint16 * wideOpacity) div 255).uint8
|
||||||
|
|
||||||
when defined(release):
|
when defined(release):
|
||||||
{.pop.}
|
{.pop.}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
import benchy, chroma, pixie
|
import benchy, pixie
|
||||||
|
|
||||||
let mask = newMask(2560, 1440)
|
let mask = newMask(2560, 1440)
|
||||||
|
|
||||||
|
@ -25,6 +25,7 @@ timeIt "invert":
|
||||||
reset()
|
reset()
|
||||||
|
|
||||||
timeIt "applyOpacity":
|
timeIt "applyOpacity":
|
||||||
|
reset()
|
||||||
mask.applyOpacity(0.5)
|
mask.applyOpacity(0.5)
|
||||||
|
|
||||||
reset()
|
reset()
|
||||||
|
|
Loading…
Reference in a new issue