From 8670e0edec6feb4c6ad94b8c21f0af2dae649f8a Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg
Date: Sat, 23 Jul 2022 00:13:19 -0500
Subject: [PATCH] neon procs

---
 pixie.nimble            |   2 +-
 src/pixie/simd.nim      |   1 +
 src/pixie/simd/neon.nim | 171 +++++++++++++++++++++++++++++++++++++++-
 tests/bench_masks.nim   |   3 +-
 4 files changed, 173 insertions(+), 4 deletions(-)

diff --git a/pixie.nimble b/pixie.nimble
index 4b819c3..b899c8d 100644
--- a/pixie.nimble
+++ b/pixie.nimble
@@ -10,7 +10,7 @@ requires "vmath >= 1.1.4"
 requires "chroma >= 0.2.6"
 requires "zippy >= 0.10.3"
 requires "flatty >= 0.3.4"
-requires "nimsimd >= 1.1.7"
+requires "nimsimd >= 1.1.8"
 requires "bumpy >= 1.1.1"
 
 task bindings, "Generate bindings":
diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim
index ecf74da..57b0b8d 100644
--- a/src/pixie/simd.nim
+++ b/src/pixie/simd.nim
@@ -20,6 +20,7 @@ when allowSimd:
 
   elif defined(arm64):
     import simd/neon
+    export neon
 
     import nimsimd/neon as nimsimdneon
     export nimsimdneon
diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim
index 739a224..19fa0a3 100644
--- a/src/pixie/simd/neon.nim
+++ b/src/pixie/simd/neon.nim
@@ -1,4 +1,4 @@
-import chroma, internal, nimsimd/neon, pixie/common
+import chroma, internal, nimsimd/neon, pixie/common, system/memory, vmath
 
 when defined(release):
   {.push checks: off.}
@@ -150,7 +150,7 @@ proc toPremultipliedAlphaNeon*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
     channels.val[2] = premultiply(channels.val[2], channels.val[3])
     vst4_u8(cast[pointer](p), channels)
     p += 32
-    i += 8
+  i += 8 * iterations
 
   for i in i ..< data.len:
     var c = data[i]
@@ -194,5 +194,172 @@ proc newMaskNeon*(image: Image): Mask {.simd.} =
   for i in i ..< image.data.len:
     result.data[i] = image.data[i].a
 
+proc invertNeon*(image: Image) {.simd.} =
+  ## Inverts all four channels of every pixel in place (`255 - value`),
+  ## then runs `toPremultipliedAlphaNeon` over the result.
+  var
+    i: int
+    p = cast[uint](image.data[0].addr)
+  # Align to 16 bytes
+  while i < image.data.len and (p and 15) != 0:
+    var rgbx = image.data[i]
+    rgbx.r = 255 - rgbx.r
+    rgbx.g = 255 - rgbx.g
+    rgbx.b = 255 - rgbx.b
+    rgbx.a = 255 - rgbx.a
+    image.data[i] = rgbx
+    inc i
+    p += 4
+
+  # Only the pixels left after the alignment loop may be vectorized; using
+  # the full length here would run past the end of the buffer.
+  let
+    vec255 = vmovq_n_u8(255)
+    iterations = (image.data.len - i) div 16
+  for _ in 0 ..< iterations:
+    var channels = vld4q_u8(cast[pointer](p))
+    channels.val[0] = vsubq_u8(vec255, channels.val[0])
+    channels.val[1] = vsubq_u8(vec255, channels.val[1])
+    channels.val[2] = vsubq_u8(vec255, channels.val[2])
+    channels.val[3] = vsubq_u8(vec255, channels.val[3])
+    vst4q_u8(cast[pointer](p), channels)
+    p += 64
+  i += 16 * iterations
+
+  for i in i ..< image.data.len:
+    var rgbx = image.data[i]
+    rgbx.r = 255 - rgbx.r
+    rgbx.g = 255 - rgbx.g
+    rgbx.b = 255 - rgbx.b
+    rgbx.a = 255 - rgbx.a
+    image.data[i] = rgbx
+
+  toPremultipliedAlphaNeon(image.data)
+
+proc invertNeon*(mask: Mask) {.simd.} =
+  ## Inverts every value of the mask in place (`255 - value`).
+  var
+    i: int
+    p = cast[uint](mask.data[0].addr)
+  # Align to 16 bytes
+  while i < mask.data.len and (p and 15) != 0:
+    mask.data[i] = 255 - mask.data[i]
+    inc i
+    inc p
+
+  # Only the bytes left after the alignment loop may be vectorized; using
+  # the full length here would run past the end of the buffer.
+  let
+    vec255 = vmovq_n_u8(255)
+    iterations = (mask.data.len - i) div 16
+  for _ in 0 ..< iterations:
+    let values = vld1q_u8(cast[pointer](p))
+    vst1q_u8(cast[pointer](p), vsubq_u8(vec255, values))
+    p += 16
+  i += 16 * iterations
+
+  for i in i ..< mask.data.len:
+    mask.data[i] = 255 - mask.data[i]
+
+proc ceilNeon*(mask: Mask) {.simd.} =
+  ## Sets every non-zero value of the mask to 255 in place.
+  var
+    i: int
+    p = cast[uint](mask.data[0].addr)
+
+  let
+    zeroVec = vmovq_n_u8(0)
+    vec255 = vmovq_n_u8(255)
+    iterations = mask.data.len div 16
+  for _ in 0 ..< iterations:
+    # Lanes equal to zero compare to all ones; clearing those bits from
+    # 255 leaves 255 only in the lanes that were non-zero.
+    var values = vld1q_u8(cast[pointer](p))
+    values = vceqq_u8(values, zeroVec)
+    values = vbicq_u8(vec255, values)
+    vst1q_u8(cast[pointer](p), values)
+    p += 16
+  i += 16 * iterations
+
+  for i in i ..< mask.data.len:
+    if mask.data[i] != 0:
+      mask.data[i] = 255
+
+proc applyOpacityNeon*(image: Image, opacity: float32) {.simd.} =
+  ## Scales all four channels of every pixel by `opacity` in place.
+  ## `opacity` is converted to an 8-bit factor with `round(255 * opacity)`.
+  let opacity = round(255 * opacity).uint8
+  if opacity == 255:
+    return
+
+  if opacity == 0:
+    fillUnsafeNeon(image.data, rgbx(0, 0, 0, 0), 0, image.data.len)
+    return
+
+  var
+    i: int
+    p = cast[uint](image.data[0].addr)
+
+  proc apply(c, o: uint8x8): uint8x8 {.inline.} =
+    # Divides the widened 16-bit product by 255 using rounding shifts.
+    let co = vmull_u8(c, o)
+    vraddhn_u16(co, vrshrq_n_u16(co, 8))
+
+  let
+    opacityVec = vmov_n_u8(opacity)
+    iterations = image.data.len div 8
+  for _ in 0 ..< iterations:
+    var channels = vld4_u8(cast[pointer](p))
+    channels.val[0] = apply(channels.val[0], opacityVec)
+    channels.val[1] = apply(channels.val[1], opacityVec)
+    channels.val[2] = apply(channels.val[2], opacityVec)
+    channels.val[3] = apply(channels.val[3], opacityVec)
+    vst4_u8(cast[pointer](p), channels)
+    p += 32
+  i += 8 * iterations
+
+  # Widen before multiplying so the product of two 8-bit values cannot wrap.
+  let opacity16 = opacity.uint16
+  for i in i ..< image.data.len:
+    var rgbx = image.data[i]
+    rgbx.r = ((rgbx.r.uint16 * opacity16) div 255).uint8
+    rgbx.g = ((rgbx.g.uint16 * opacity16) div 255).uint8
+    rgbx.b = ((rgbx.b.uint16 * opacity16) div 255).uint8
+    rgbx.a = ((rgbx.a.uint16 * opacity16) div 255).uint8
+    image.data[i] = rgbx
+
+proc applyOpacityNeon*(mask: Mask, opacity: float32) {.simd.} =
+  ## Scales every value of the mask by `opacity` in place.
+  ## `opacity` is converted to an 8-bit factor with `round(255 * opacity)`.
+  let opacity = round(255 * opacity).uint8
+  if opacity == 255:
+    return
+
+  if opacity == 0:
+    nimSetMem(mask.data[0].addr, 0.cint, mask.data.len)
+    # The mask is already fully cleared, nothing is left to scale.
+    return
+
+  var
+    i: int
+    p = cast[uint](mask.data[0].addr)
+
+  let
+    opacityVec = vmov_n_u8(opacity)
+    iterations = mask.data.len div 8
+  for _ in 0 ..< iterations:
+    let
+      values = vld1_u8(cast[pointer](p))
+      multiplied = vmull_u8(values, opacityVec)
+      rounded = vraddhn_u16(multiplied, vrshrq_n_u16(multiplied, 8))
+    vst1_u8(cast[pointer](p), rounded)
+    p += 8
+  i += 8 * iterations
+
+  # Widen before multiplying so the product of two 8-bit values cannot wrap.
+  let opacity16 = opacity.uint16
+  for i in i ..< mask.data.len:
+    mask.data[i] = ((mask.data[i].uint16 * opacity16) div 255).uint8
+
 when defined(release):
   {.pop.}
diff --git a/tests/bench_masks.nim b/tests/bench_masks.nim
index cd7a527..abf15de 100644
--- a/tests/bench_masks.nim
+++ b/tests/bench_masks.nim
@@ -1,4 +1,4 @@
-import benchy, chroma, pixie
+import benchy, pixie
 
 let mask = newMask(2560, 1440)
 
@@ -25,6 +25,7 @@ timeIt "invert":
 reset()
 
 timeIt "applyOpacity":
+  reset()
   mask.applyOpacity(0.5)
 
 reset()