invertImageSimd invertMaskSimd

This commit is contained in:
Ryan Oldenburg 2022-06-30 10:12:16 -05:00
parent 9644894903
commit af5045ccb8
4 changed files with 71 additions and 41 deletions

View file

@ -386,10 +386,7 @@ proc applyOpacity*(image: Image, opacity: float32) {.raises: [].} =
proc invert*(image: Image) {.raises: [].} = proc invert*(image: Image) {.raises: [].} =
## Inverts all of the colors and alpha. ## Inverts all of the colors and alpha.
when allowSimd and compiles(invertImageSimd): when allowSimd and compiles(invertImageSimd):
invertImageSimd( invertImageSimd(image.data)
cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
image.data.len
)
return return
for i in 0 ..< image.data.len: for i in 0 ..< image.data.len:

View file

@ -1,6 +1,9 @@
import common, internal, vmath import common, internal, vmath
when defined(amd64) and allowSimd: when allowSimd:
import simd
when defined(amd64):
import nimsimd/sse2 import nimsimd/sse2
type type
@ -234,11 +237,8 @@ proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
proc invert*(mask: Mask) {.raises: [].} = proc invert*(mask: Mask) {.raises: [].} =
## Inverts all of the values - creates a negative of the mask. ## Inverts all of the values - creates a negative of the mask.
when allowSimd and compiles(invertImageSimd): when allowSimd and compiles(invertMaskSimd):
invertMaskSimd( invertMaskSimd(mask.data)
cast[ptr UncheckedArray[uint8]](mask.data[0].addr),
mask.data.len
)
return return
for i in 0 ..< mask.data.len: for i in 0 ..< mask.data.len:

View file

@ -23,8 +23,8 @@ proc fillUnsafeAvx*(
p += 4 p += 4
let let
iterations = (start + len - i) div 8
colorVec = mm256_set1_epi32(cast[int32](rgbx)) colorVec = mm256_set1_epi32(cast[int32](rgbx))
iterations = (start + len - i) div 8
for _ in 0 ..< iterations: for _ in 0 ..< iterations:
mm256_store_si256(cast[pointer](p), colorVec) mm256_store_si256(cast[pointer](p), colorVec)
p += 32 p += 32

View file

@ -243,22 +243,38 @@ when defined(amd64):
for i in i ..< len: for i in i ..< len:
dst[i] = src[i].a dst[i] = src[i].a
proc invertImageSimd*(data: var seq[ColorRGBX]) =
  ## Inverts every channel of every pixel in `data` in place
  ## (`255 - value` for r, g, b and a), then runs
  ## `toPremultipliedAlphaSimd` over the whole buffer.
  ##
  ## The bulk of the work is done 16 pixels (64 bytes) at a time with
  ## aligned SSE2 loads/stores; pixels before the first 16-byte boundary
  ## and any leftover pixels at the end are handled one by one.
  if data.len == 0:
    # Nothing to invert, and `data[0].addr` below would be out of bounds.
    return

  var
    i: int
    p = cast[uint](data[0].addr)
  # Align to 16 bytes so the aligned load/store intrinsics can be used.
  while i < data.len and (p and 15) != 0:
    var rgbx = data[i]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
    data[i] = rgbx
    inc i
    p += 4

  let
    vec255 = mm_set1_epi8(255)
    # Count only the pixels that remain after the alignment loop above;
    # using `data.len` here would let the vector loop run past the end
    # of the buffer whenever `i > 0`.
    iterations = (data.len - i) div 16
  for _ in 0 ..< iterations:
    let
      a = mm_load_si128(cast[pointer](p))
      b = mm_load_si128(cast[pointer](p + 16))
      c = mm_load_si128(cast[pointer](p + 32))
      d = mm_load_si128(cast[pointer](p + 48))
    mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
    mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
    mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
    mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
    p += 64
  i += 16 * iterations

  # Scalar tail for whatever did not fill a whole 16-pixel batch.
  for j in i ..< data.len:
    var rgbx = data[j]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
    data[j] = rgbx

  # NOTE(review): presumably needed because inverting alpha leaves the
  # color channels no longer premultiplied by the new alpha - confirm.
  toPremultipliedAlphaSimd(
    cast[ptr UncheckedArray[uint32]](data[0].addr),
    data.len
  )
proc invertMaskSimd*(data: var seq[uint8]) =
  ## Replaces every value in `data` with `255 - value`, in place.
  ##
  ## The bulk of the work is done 64 bytes at a time with aligned SSE2
  ## loads/stores; bytes before the first 16-byte boundary and any
  ## leftover bytes at the end are handled one by one.
  if data.len == 0:
    # Nothing to invert, and `data[0].addr` below would be out of bounds.
    return

  var
    i: int
    p = cast[uint](data[0].addr)
  # Align to 16 bytes so the aligned load/store intrinsics can be used.
  while i < data.len and (p and 15) != 0:
    data[i] = 255 - data[i]
    inc i
    inc p

  let
    vec255 = mm_set1_epi8(255)
    # Count only the bytes that remain after the alignment loop above;
    # using `data.len` here would let the vector loop run past the end
    # of the buffer whenever `i > 0`.
    iterations = (data.len - i) div 64
  for _ in 0 ..< iterations:
    let
      a = mm_load_si128(cast[pointer](p))
      b = mm_load_si128(cast[pointer](p + 16))
      c = mm_load_si128(cast[pointer](p + 32))
      d = mm_load_si128(cast[pointer](p + 48))
    mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
    mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
    mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
    mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
    p += 64
  i += 64 * iterations

  # Scalar tail for whatever did not fill a whole 64-byte batch.
  for j in i ..< data.len:
    data[j] = 255 - data[j]
proc ceilMaskSimd*(data: ptr UncheckedArray[uint8], len: int) = proc ceilMaskSimd*(data: ptr UncheckedArray[uint8], len: int) =
var i: int var i: int
@ -303,10 +336,10 @@ when defined(amd64):
) = ) =
var i: int var i: int
let let
oddMask = mm_set1_epi16(cast[int16](0xff00)) oddMask = mm_set1_epi16(0xff00)
div255 = mm_set1_epi16(cast[int16](0x8081)) div255 = mm_set1_epi16(0x8081)
zeroVec = mm_setzero_si128() zeroVec = mm_setzero_si128()
opacityVec = mm_slli_epi16(mm_set1_epi16(cast[int16](opacity)), 8) opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
for _ in 0 ..< len div 16: for _ in 0 ..< len div 16:
let values = mm_loadu_si128(data[i].addr) let values = mm_loadu_si128(data[i].addr)
if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff: if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff: