diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim
index e8d380a..576c7ed 100644
--- a/src/pixie/masks.nim
+++ b/src/pixie/masks.nim
@@ -308,10 +308,7 @@ proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
 proc ceil*(mask: Mask) {.raises: [].} =
   ## A value of 0 stays 0. Anything else turns into 255.
-  when allowSimd and compiles(invertImageSimd):
-    ceilMaskSimd(
-      cast[ptr UncheckedArray[uint8]](mask.data[0].addr),
-      mask.data.len
-    )
+  when allowSimd and compiles(ceilMaskSimd):
+    ceilMaskSimd(mask.data)
     return
 
   for i in 0 ..< mask.data.len:
diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim
index 454c925..11b1da4 100644
--- a/src/pixie/simd.nim
+++ b/src/pixie/simd.nim
@@ -317,19 +317,32 @@ when defined(amd64):
     for i in i ..< data.len:
       data[i] = 255 - data[i]
 
-  proc ceilMaskSimd*(data: ptr UncheckedArray[uint8], len: int) =
-    var i: int
+  proc ceilMaskSimd*(data: var seq[uint8]) =
+    ## Sets every non-zero byte of `data` to 255 in place; 0 stays 0.
+    if data.len == 0:
+      # Nothing to convert, and `data[0]` below is out of bounds when empty.
+      return
+
+    var
+      i: int
+      p = cast[uint](data[0].addr)
+
     let
       zeroVec = mm_setzero_si128()
       vec255 = mm_set1_epi8(255)
-    for _ in 0 ..< len div 16:
-      var values = mm_loadu_si128(data[i].addr)
+      iterations = data.len div 16
+    for _ in 0 ..< iterations:
+      # cmpeq marks the zero bytes with 0xFF; andnot against 255 flips that
+      # mask, so zero bytes stay 0 and every other byte becomes 255.
+      var values = mm_loadu_si128(cast[pointer](p))
       values = mm_cmpeq_epi8(values, zeroVec)
       values = mm_andnot_si128(values, vec255)
-      mm_storeu_si128(data[i].addr, values)
-      i += 16
+      mm_storeu_si128(cast[pointer](p), values)
+      p += 16
+    i += 16 * iterations
 
-    for i in i ..< len:
+    # Scalar pass over the bytes left after the last full 16-byte vector.
+    for i in i ..< data.len:
       if data[i] != 0:
         data[i] = 255
 