start on neon
This commit is contained in:
parent
4a8894b664
commit
7f62ac5810
|
@ -10,7 +10,7 @@ requires "vmath >= 1.1.4"
|
|||
requires "chroma >= 0.2.5"
|
||||
requires "zippy >= 0.10.2"
|
||||
requires "flatty >= 0.3.4"
|
||||
requires "nimsimd >= 1.1.5"
|
||||
requires "nimsimd >= 1.1.6"
|
||||
requires "bumpy >= 1.1.1"
|
||||
|
||||
task bindings, "Generate bindings":
|
||||
|
|
|
@ -4,15 +4,22 @@ export internal
|
|||
|
||||
const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
|
||||
|
||||
when allowSimd and defined(amd64):
|
||||
import simd/sse2, simd/avx, simd/avx2
|
||||
export sse2, avx, avx2
|
||||
when allowSimd:
|
||||
when defined(amd64):
|
||||
import simd/sse2, simd/avx, simd/avx2
|
||||
export sse2, avx, avx2
|
||||
|
||||
when not defined(pixieNoAvx):
|
||||
import nimsimd/runtimecheck
|
||||
let
|
||||
cpuHasAvx* = checkInstructionSets({AVX})
|
||||
cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
|
||||
when not defined(pixieNoAvx):
|
||||
import nimsimd/runtimecheck
|
||||
let
|
||||
cpuHasAvx* = checkInstructionSets({AVX})
|
||||
cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
|
||||
|
||||
import nimsimd/sse2 as nimsimdsse2
|
||||
export nimsimdsse2
|
||||
import nimsimd/sse2 as nimsimdsse2
|
||||
export nimsimdsse2
|
||||
|
||||
elif defined(arm64):
|
||||
import simd/neon
|
||||
|
||||
import nimsimd/neon as nimsimdneon
|
||||
export nimsimdneon
|
||||
|
|
|
@ -46,6 +46,7 @@ macro hasSimd*(procedure: untyped) =
|
|||
let
|
||||
name = procedure.procName()
|
||||
originalBody = procedure[6]
|
||||
nameNeon = name & "Neon"
|
||||
nameSse2 = name & "Sse2"
|
||||
nameAvx = name & "Avx"
|
||||
nameAvx2 = name & "Avx2"
|
||||
|
@ -54,7 +55,7 @@ macro hasSimd*(procedure: untyped) =
|
|||
|
||||
var body = newStmtList()
|
||||
|
||||
when not defined(pixieNoAvx):
|
||||
when defined(amd64) and not defined(pixieNoAvx):
|
||||
if nameAvx2 in simdProcs:
|
||||
body.add quote do:
|
||||
if cpuHasAvx2:
|
||||
|
@ -69,6 +70,10 @@ macro hasSimd*(procedure: untyped) =
|
|||
let bodySse2 = simdProcs[nameSse2][6]
|
||||
body.add quote do:
|
||||
`bodySse2`
|
||||
elif nameNeon in simdProcs:
|
||||
let bodyNeon = simdProcs[nameNeon][6]
|
||||
body.add quote do:
|
||||
`bodyNeon`
|
||||
else:
|
||||
body.add quote do:
|
||||
`originalBody`
|
||||
|
|
161
src/pixie/simd/neon.nim
Normal file
161
src/pixie/simd/neon.nim
Normal file
|
@ -0,0 +1,161 @@
|
|||
import chroma, internal, nimsimd/neon, pixie/common
|
||||
|
||||
when defined(release):
|
||||
{.push checks: off.}
|
||||
|
||||
proc fillUnsafeNeon*(
|
||||
data: var seq[ColorRGBX],
|
||||
color: SomeColor,
|
||||
start, len: int
|
||||
) {.simd.} =
|
||||
let rgbx = color.asRgbx()
|
||||
|
||||
var
|
||||
i = start
|
||||
p = cast[uint](data[i].addr)
|
||||
# Align to 16 bytes
|
||||
while i < (start + len) and (p and 15) != 0:
|
||||
data[i] = rgbx
|
||||
inc i
|
||||
p += 4
|
||||
|
||||
let
|
||||
colors = vmovq_n_u32(cast[uint32](rgbx))
|
||||
x4 = vld4q_dup_u32(colors.unsafeAddr)
|
||||
iterations = (start + len - i) div 16
|
||||
for _ in 0 ..< iterations:
|
||||
vst1q_u32_x4(data[i].addr, x4)
|
||||
i += 16
|
||||
|
||||
for i in i ..< start + len:
|
||||
data[i] = rgbx
|
||||
|
||||
proc isOneColorNeon*(image: Image): bool {.simd.} =
|
||||
result = true
|
||||
|
||||
let color = image.data[0]
|
||||
|
||||
var
|
||||
i: int
|
||||
p = cast[uint](image.data[0].addr)
|
||||
# Align to 16 bytes
|
||||
while i < image.data.len and (p and 15) != 0:
|
||||
if image.data[i] != color:
|
||||
return false
|
||||
inc i
|
||||
p += 4
|
||||
|
||||
let
|
||||
colorVecs = vld4q_dup_u8(color.unsafeAddr)
|
||||
iterations = (image.data.len - i) div 16
|
||||
for _ in 0 ..< iterations:
|
||||
let
|
||||
deinterleved = vld4q_u8(image.data[i].addr)
|
||||
rEq = vceqq_u8(deinterleved.val[0], colorVecs.val[0])
|
||||
gEq = vceqq_u8(deinterleved.val[1], colorVecs.val[1])
|
||||
bEq = vceqq_u8(deinterleved.val[2], colorVecs.val[2])
|
||||
aEq = vceqq_u8(deinterleved.val[3], colorVecs.val[3])
|
||||
rgEq = vandq_u8(rEq, gEq)
|
||||
baEq = vandq_u8(bEq, aEq)
|
||||
rgbaEq = vandq_u8(rgEq, baEq)
|
||||
mask =
|
||||
cast[uint64](vget_low_u64(cast[uint64x2](rgbaEq))) and
|
||||
cast[uint64](vget_high_u64(cast[uint64x2](rgbaEq)))
|
||||
if mask != uint64.high:
|
||||
return false
|
||||
i += 16
|
||||
|
||||
for i in i ..< image.data.len:
|
||||
if image.data[i] != color:
|
||||
return false
|
||||
|
||||
proc isTransparentNeon*(image: Image): bool {.simd.} =
|
||||
var
|
||||
i: int
|
||||
p = cast[uint](image.data[0].addr)
|
||||
# Align to 16 bytes
|
||||
while i < image.data.len and (p and 15) != 0:
|
||||
if image.data[i].a != 0:
|
||||
return false
|
||||
inc i
|
||||
p += 4
|
||||
|
||||
result = true
|
||||
|
||||
let iterations = (image.data.len - i) div 16
|
||||
for _ in 0 ..< iterations:
|
||||
let
|
||||
alphas = vld4q_u8(image.data[i].addr).val[3]
|
||||
eq = vceqq_u64(cast[uint64x2](alphas), vmovq_n_u64(0))
|
||||
mask = cast[uint64](vget_low_u64(eq)) and cast[uint64](vget_high_u64(eq))
|
||||
if mask != uint64.high:
|
||||
return false
|
||||
i += 16
|
||||
|
||||
for i in i ..< image.data.len:
|
||||
if image.data[i].a != 0:
|
||||
return false
|
||||
|
||||
proc isOpaqueNeon*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} =
|
||||
result = true
|
||||
|
||||
var
|
||||
i = start
|
||||
p = cast[uint](data[0].addr)
|
||||
# Align to 16 bytes
|
||||
while i < (start + len) and (p and 15) != 0:
|
||||
if data[i].a != 255:
|
||||
return false
|
||||
inc i
|
||||
p += 4
|
||||
|
||||
let iterations = (start + len - i) div 16
|
||||
for _ in 0 ..< iterations:
|
||||
let
|
||||
alphas = vld4q_u8(data[i].addr).val[3]
|
||||
eq = vceqq_u64(cast[uint64x2](alphas), vmovq_n_u64(uint64.high))
|
||||
mask = cast[uint64](vget_low_u64(eq)) and cast[uint64](vget_high_u64(eq))
|
||||
if mask != uint64.high:
|
||||
return false
|
||||
i += 16
|
||||
|
||||
for i in i ..< start + len:
|
||||
if data[i].a != 255:
|
||||
return false
|
||||
|
||||
proc newImageNeon*(mask: Mask): Image {.simd.} =
|
||||
result = newImage(mask.width, mask.height)
|
||||
|
||||
var i: int
|
||||
for _ in 0 ..< mask.data.len div 16:
|
||||
let alphas = vld1q_u8(mask.data[i].addr)
|
||||
template doLane(lane: int) =
|
||||
let packed = vgetq_lane_u32(cast[uint32x4](alphas), lane)
|
||||
var unpacked = cast[uint8x16](vmovq_n_u32(packed))
|
||||
unpacked = vzip1q_u8(vmovq_n_u8(0), unpacked)
|
||||
unpacked = vzip1q_u8(vmovq_n_u8(0), unpacked)
|
||||
vst1q_u8(result.data[i + lane * 4].addr, unpacked)
|
||||
doLane(0)
|
||||
doLane(1)
|
||||
doLane(2)
|
||||
doLane(3)
|
||||
i += 16
|
||||
|
||||
for i in i ..< mask.data.len:
|
||||
let v = mask.data[i]
|
||||
result.data[i] = rgbx(v, v, v, v)
|
||||
|
||||
proc newMaskNeon*(image: Image): Mask {.simd.} =
|
||||
result = newMask(image.width, image.height)
|
||||
|
||||
var i: int
|
||||
for _ in 0 ..< image.data.len div 16:
|
||||
let alphas = vld4q_u8(image.data[i].addr).val[3]
|
||||
vst1q_u8(result.data[i].addr, alphas)
|
||||
i += 16
|
||||
|
||||
for i in i ..< image.data.len:
|
||||
result.data[i] = image.data[i].a
|
||||
|
||||
when defined(release):
|
||||
{.pop.}
|
Loading…
Reference in a new issue