From 7f62ac5810dfda0c6ab189058bfe2cb1ae42f7c7 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sun, 10 Jul 2022 18:40:51 -0500 Subject: [PATCH 1/2] start on neon --- pixie.nimble | 2 +- src/pixie/simd.nim | 27 +++--- src/pixie/simd/internal.nim | 7 +- src/pixie/simd/neon.nim | 161 ++++++++++++++++++++++++++++++++++++ 4 files changed, 185 insertions(+), 12 deletions(-) create mode 100644 src/pixie/simd/neon.nim diff --git a/pixie.nimble b/pixie.nimble index ef57b10..c139064 100644 --- a/pixie.nimble +++ b/pixie.nimble @@ -10,7 +10,7 @@ requires "vmath >= 1.1.4" requires "chroma >= 0.2.5" requires "zippy >= 0.10.2" requires "flatty >= 0.3.4" -requires "nimsimd >= 1.1.5" +requires "nimsimd >= 1.1.6" requires "bumpy >= 1.1.1" task bindings, "Generate bindings": diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index 4988bd1..ecf74da 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -4,15 +4,22 @@ export internal const allowSimd* = not defined(pixieNoSimd) and not defined(tcc) -when allowSimd and defined(amd64): - import simd/sse2, simd/avx, simd/avx2 - export sse2, avx, avx2 +when allowSimd: + when defined(amd64): + import simd/sse2, simd/avx, simd/avx2 + export sse2, avx, avx2 - when not defined(pixieNoAvx): - import nimsimd/runtimecheck - let - cpuHasAvx* = checkInstructionSets({AVX}) - cpuHasAvx2* = checkInstructionSets({AVX, AVX2}) + when not defined(pixieNoAvx): + import nimsimd/runtimecheck + let + cpuHasAvx* = checkInstructionSets({AVX}) + cpuHasAvx2* = checkInstructionSets({AVX, AVX2}) - import nimsimd/sse2 as nimsimdsse2 - export nimsimdsse2 + import nimsimd/sse2 as nimsimdsse2 + export nimsimdsse2 + + elif defined(arm64): + import simd/neon + + import nimsimd/neon as nimsimdneon + export nimsimdneon diff --git a/src/pixie/simd/internal.nim b/src/pixie/simd/internal.nim index 25885f9..18870ed 100644 --- a/src/pixie/simd/internal.nim +++ b/src/pixie/simd/internal.nim @@ -46,6 +46,7 @@ macro hasSimd*(procedure: untyped) = let name = procedure.procName() originalBody = procedure[6] + nameNeon = name & "Neon" nameSse2 = name & "Sse2" nameAvx = name & "Avx" nameAvx2 = name & "Avx2" @@ -54,7 +55,7 @@ macro hasSimd*(procedure: untyped) = var body = newStmtList() - when not defined(pixieNoAvx): + when defined(amd64) and not defined(pixieNoAvx): if nameAvx2 in simdProcs: body.add quote do: if cpuHasAvx2: @@ -69,6 +70,10 @@ macro hasSimd*(procedure: untyped) = let bodySse2 = simdProcs[nameSse2][6] body.add quote do: `bodySse2` + elif nameNeon in simdProcs: + let bodyNeon = simdProcs[nameNeon][6] + body.add quote do: + `bodyNeon` else: body.add quote do: `originalBody` diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim new file mode 100644 index 0000000..48523db --- /dev/null +++ b/src/pixie/simd/neon.nim @@ -0,0 +1,161 @@ +import chroma, internal, nimsimd/neon, pixie/common + +when defined(release): + {.push checks: off.} + +proc fillUnsafeNeon*( + data: var seq[ColorRGBX], + color: SomeColor, + start, len: int +) {.simd.} = + let rgbx = color.asRgbx() + + var + i = start + p = cast[uint](data[i].addr) + # Align to 16 bytes + while i < (start + len) and (p and 15) != 0: + data[i] = rgbx + inc i + p += 4 + + let + colors = vmovq_n_u32(cast[uint32](rgbx)) + x4 = vld4q_dup_u32(colors.unsafeAddr) + iterations = (start + len - i) div 16 + for _ in 0 ..< iterations: + vst1q_u32_x4(data[i].addr, x4) + i += 16 + + for i in i ..< start + len: + data[i] = rgbx + +proc isOneColorNeon*(image: Image): bool {.simd.} = + result = true + + let color = image.data[0] + + var + i: int + p = cast[uint](image.data[0].addr) + # Align to 16 bytes + while i < image.data.len and (p and 15) != 0: + if image.data[i] != color: + return false + inc i + p += 4 + + let + colorVecs = vld4q_dup_u8(color.unsafeAddr) + iterations = (image.data.len - i) div 16 + for _ in 0 ..< iterations: + let + deinterleved = vld4q_u8(image.data[i].addr) + rEq = vceqq_u8(deinterleved.val[0], colorVecs.val[0]) + gEq = vceqq_u8(deinterleved.val[1], colorVecs.val[1]) + bEq = vceqq_u8(deinterleved.val[2], colorVecs.val[2]) + aEq = vceqq_u8(deinterleved.val[3], colorVecs.val[3]) + rgEq = vandq_u8(rEq, gEq) + baEq = vandq_u8(bEq, aEq) + rgbaEq = vandq_u8(rgEq, baEq) + mask = + cast[uint64](vget_low_u64(cast[uint64x2](rgbaEq))) and + cast[uint64](vget_high_u64(cast[uint64x2](rgbaEq))) + if mask != uint64.high: + return false + i += 16 + + for i in i ..< image.data.len: + if image.data[i] != color: + return false + +proc isTransparentNeon*(image: Image): bool {.simd.} = + var + i: int + p = cast[uint](image.data[0].addr) + # Align to 16 bytes + while i < image.data.len and (p and 15) != 0: + if image.data[i].a != 0: + return false + inc i + p += 4 + + result = true + + let iterations = (image.data.len - i) div 16 + for _ in 0 ..< iterations: + let + alphas = vld4q_u8(image.data[i].addr).val[3] + eq = vceqq_u64(cast[uint64x2](alphas), vmovq_n_u64(0)) + mask = cast[uint64](vget_low_u64(eq)) and cast[uint64](vget_high_u64(eq)) + if mask != uint64.high: + return false + i += 16 + + for i in i ..< image.data.len: + if image.data[i].a != 0: + return false + +proc isOpaqueNeon*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} = + result = true + + var + i = start + p = cast[uint](data[0].addr) + # Align to 16 bytes + while i < (start + len) and (p and 15) != 0: + if data[i].a != 255: + return false + inc i + p += 4 + + let iterations = (start + len - i) div 16 + for _ in 0 ..< iterations: + let + alphas = vld4q_u8(data[i].addr).val[3] + eq = vceqq_u64(cast[uint64x2](alphas), vmovq_n_u64(uint64.high)) + mask = cast[uint64](vget_low_u64(eq)) and cast[uint64](vget_high_u64(eq)) + if mask != uint64.high: + return false + i += 16 + + for i in i ..< start + len: + if data[i].a != 255: + return false + +proc newImageNeon*(mask: Mask): Image {.simd.} = + result = newImage(mask.width, mask.height) + + var i: int + for _ in 0 ..< mask.data.len div 16: + let alphas = vld1q_u8(mask.data[i].addr) + template doLane(lane: int) = + let packed = vgetq_lane_u32(cast[uint32x4](alphas), lane) + var unpacked = cast[uint8x16](vmovq_n_u32(packed)) + unpacked = vzip1q_u8(vmovq_n_u8(0), unpacked) + unpacked = vzip1q_u8(vmovq_n_u8(0), unpacked) + vst1q_u8(result.data[i + lane * 4].addr, unpacked) + doLane(0) + doLane(1) + doLane(2) + doLane(3) + i += 16 + + for i in i ..< mask.data.len: + let v = mask.data[i] + result.data[i] = rgbx(v, v, v, v) + +proc newMaskNeon*(image: Image): Mask {.simd.} = + result = newMask(image.width, image.height) + + var i: int + for _ in 0 ..< image.data.len div 16: + let alphas = vld4q_u8(image.data[i].addr).val[3] + vst1q_u8(result.data[i].addr, alphas) + i += 16 + + for i in i ..< image.data.len: + result.data[i] = image.data[i].a + +when defined(release): + {.pop.} From f36a162e9b45a41bafddc86eed41f7c489fd4811 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sun, 10 Jul 2022 19:19:48 -0500 Subject: [PATCH 2/2] fix newImageNeon --- src/pixie/simd/neon.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim index 48523db..1386ec0 100644 --- a/src/pixie/simd/neon.nim +++ b/src/pixie/simd/neon.nim @@ -132,8 +132,8 @@ proc newImageNeon*(mask: Mask): Image {.simd.} = template doLane(lane: int) = let packed = vgetq_lane_u32(cast[uint32x4](alphas), lane) var unpacked = cast[uint8x16](vmovq_n_u32(packed)) - unpacked = vzip1q_u8(vmovq_n_u8(0), unpacked) - unpacked = vzip1q_u8(vmovq_n_u8(0), unpacked) + unpacked = vzip1q_u8(unpacked, unpacked) + unpacked = vzip1q_u8(unpacked, unpacked) vst1q_u8(result.data[i + lane * 4].addr, unpacked) doLane(0) doLane(1)