Merge pull request #459 from guzba/varseqsimd

var seq simd
This commit is contained in:
Andre von Houck 2022-06-30 11:09:27 -07:00 committed by GitHub
commit 66d5535ae9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 293 additions and 242 deletions

View file

@ -33,11 +33,7 @@ proc newImage*(mask: Mask): Image {.raises: [PixieError].} =
result = newImage(mask.width, mask.height)
when allowSimd and compiles(newImageFromMaskSimd):
newImageFromMaskSimd(
cast[ptr UncheckedArray[ColorRGBX]](result.data[0].addr),
cast[ptr UncheckedArray[uint8]](mask.data[0].addr),
mask.data.len
)
newImageFromMaskSimd(result.data, mask.data)
return
for i in 0 ..< mask.data.len:
@ -102,10 +98,7 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} =
proc isOneColor*(image: Image): bool {.raises: [].} =
## Checks if the entire image is the same color.
when allowSimd and compiles(isOneColorSimd):
return isOneColorSimd(
cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
image.data.len
)
return isOneColorSimd(image.data)
result = true
@ -117,10 +110,7 @@ proc isOneColor*(image: Image): bool {.raises: [].} =
proc isTransparent*(image: Image): bool {.raises: [].} =
## Checks if this image is fully transparent or not.
when allowSimd and compiles(isTransparentSimd):
return isTransparentSimd(
cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
image.data.len
)
return isTransparentSimd(image.data)
result = true
@ -368,11 +358,7 @@ proc applyOpacity*(image: Image, opacity: float32) {.raises: [].} =
return
when allowSimd and compiles(applyOpacitySimd):
applyOpacitySimd(
cast[ptr UncheckedArray[uint8]](image.data[0].addr),
image.data.len * 4,
opacity
)
applyOpacitySimd(image.data, opacity)
return
for i in 0 ..< image.data.len:
@ -386,10 +372,7 @@ proc applyOpacity*(image: Image, opacity: float32) {.raises: [].} =
proc invert*(image: Image) {.raises: [].} =
## Inverts all of the colors and alpha.
when allowSimd and compiles(invertImageSimd):
invertImageSimd(
cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
image.data.len
)
invertImageSimd(image.data)
return
for i in 0 ..< image.data.len:
@ -471,11 +454,7 @@ proc newMask*(image: Image): Mask {.raises: [PixieError].} =
result = newMask(image.width, image.height)
when allowSimd and compiles(newMaskFromImageSimd):
newMaskFromImageSimd(
cast[ptr UncheckedArray[uint8]](result.data[0].addr),
cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
image.data.len
)
newMaskFromImageSimd(result.data, image.data)
return
for i in 0 ..< image.data.len:

View file

@ -79,16 +79,12 @@ proc fillUnsafe*(
) {.raises: [].} =
## Fills the image data with the color starting at index start and
## continuing for len indices.
let rgbx = color.asRgbx()
when allowSimd and compiles(fillUnsafeSimd):
fillUnsafeSimd(
cast[ptr UncheckedArray[ColorRGBX]](data[start].addr),
len,
rgbx
)
fillUnsafeSimd(data, start, len, color)
return
let rgbx = color.asRgbx()
# Use memset when every byte has the same value
if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
@ -117,10 +113,7 @@ proc toStraightAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
## Converts an image to premultiplied alpha from straight alpha.
when allowSimd and compiles(toPremultipliedAlphaSimd):
toPremultipliedAlphaSimd(
cast[ptr UncheckedArray[uint32]](data[0].addr),
data.len
)
toPremultipliedAlphaSimd(data)
return
for i in 0 ..< data.len:
@ -133,10 +126,7 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].}
proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool =
when allowSimd and compiles(isOpaqueSimd):
return isOpaqueSimd(
cast[ptr UncheckedArray[ColorRGBX]](data[start].addr),
len
)
return isOpaqueSimd(data, start, len)
result = true

View file

@ -1,7 +1,10 @@
import common, internal, vmath
when defined(amd64) and allowSimd:
import nimsimd/sse2
when allowSimd:
import simd
when defined(amd64):
import nimsimd/sse2
type
Mask* = ref object
@ -194,11 +197,7 @@ proc applyOpacity*(mask: Mask, opacity: float32) {.raises: [].} =
return
when allowSimd and compiles(applyOpacitySimd):
applyOpacitySimd(
cast[ptr UncheckedArray[uint8]](mask.data[0].addr),
mask.data.len,
opacity
)
applyOpacitySimd(mask.data, opacity)
return
for i in 0 ..< mask.data.len:
@ -234,11 +233,8 @@ proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
proc invert*(mask: Mask) {.raises: [].} =
## Inverts all of the values - creates a negative of the mask.
when allowSimd and compiles(invertImageSimd):
invertMaskSimd(
cast[ptr UncheckedArray[uint8]](mask.data[0].addr),
mask.data.len
)
when allowSimd and compiles(invertMaskSimd):
invertMaskSimd(mask.data)
return
for i in 0 ..< mask.data.len:
@ -308,10 +304,7 @@ proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
proc ceil*(mask: Mask) {.raises: [].} =
## A value of 0 stays 0. Anything else turns into 255.
when allowSimd and compiles(invertImageSimd):
ceilMaskSimd(
cast[ptr UncheckedArray[uint8]](mask.data[0].addr),
mask.data.len
)
ceilMaskSimd(mask.data)
return
for i in 0 ..< mask.data.len:

View file

@ -7,23 +7,30 @@ when defined(release):
{.push checks: off.}
proc fillUnsafeAvx*(
  data: var seq[ColorRGBX],
  start, len: int,
  color: SomeColor
) =
  ## Writes `color` (converted once with `asRgbx`) into the `len` pixels of
  ## `data` that begin at index `start`.
  ##
  ## The range is handled in three phases: a scalar prologue that advances
  ## until the write address is 32-byte aligned, a run of aligned 256-bit
  ## stores covering 8 pixels each, and a scalar epilogue for the remainder.
  ## No range validation is performed here; the caller is expected to pass a
  ## `start ..< start + len` range that lies inside `data`.
  if len <= 0:
    # Nothing to write, and `data[start]` may not be a valid element.
    return

  let rgbx = color.asRgbx()

  var
    i = start
    # Byte address of data[i]; kept in step with `i` (4 bytes per pixel).
    p = cast[uint](data[i].addr)
  # Align to 32 bytes
  while i < (start + len) and (p and 31) != 0:
    data[i] = rgbx
    inc i
    p += 4

  let
    colorVec = mm256_set1_epi32(cast[int32](rgbx))
    # Only the pixels left after the prologue may be covered by wide stores.
    iterations = (start + len - i) div 8
  for _ in 0 ..< iterations:
    mm256_store_si256(cast[pointer](p), colorVec)
    p += 32
  i += 8 * iterations

  # Fill whatever is left the slow way
  for i in i ..< start + len:
    data[i] = rgbx
when defined(release):

View file

@ -6,20 +6,21 @@ when defined(gcc) or defined(clang):
when defined(release):
{.push checks: off.}
proc isOneColorAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
proc isOneColorAvx2*(data: var seq[ColorRGBX]): bool =
result = true
let color = data[0]
var i: int
while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
# Align to 32 bytes
while i < data.len and (cast[uint](data[i].addr) and 31) != 0:
if data[i] != color:
return false
inc i
let
colorVec = mm256_set1_epi32(cast[int32](color))
iterations = (len - i) div 16
iterations = (data.len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm256_load_si256(data[i].addr)
@ -31,22 +32,23 @@ proc isOneColorAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
return false
i += 16
for i in i ..< len:
for i in i ..< data.len:
if data[i] != color:
return false
proc isTransparentAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
proc isTransparentAvx2*(data: var seq[ColorRGBX]): bool =
result = true
var i: int
while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
# Align to 32 bytes
while i < data.len and (cast[uint](data[i].addr) and 31) != 0:
if data[i].a != 0:
return false
inc i
let
vecZero = mm256_setzero_si256()
iterations = (len - i) div 16
iterations = (data.len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm256_load_si256(data[i].addr)
@ -57,22 +59,23 @@ proc isTransparentAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
return false
i += 16
for i in i ..< len:
for i in i ..< data.len:
if data[i].a != 0:
return false
proc isOpaqueAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
result = true
var i: int
while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
var i = start
# Align to 32 bytes
while i < (start + len) and (cast[uint](data[i].addr) and 31) != 0:
if data[i].a != 255:
return false
inc i
let
vec255 = mm256_set1_epi8(255)
iterations = (len - i) div 16
iterations = (start + len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm256_load_si256(data[i].addr)
@ -83,21 +86,21 @@ proc isOpaqueAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
return false
i += 16
for i in i ..< len:
for i in i ..< start + len:
if data[i].a != 255:
return false
proc toPremultipliedAlphaAvx2*(
data: ptr UncheckedArray[uint32],
len: int
): int =
proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) =
var i: int
let
alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
oddMask = mm256_set1_epi16(cast[int16](0xff00))
div255 = mm256_set1_epi16(cast[int16](0x8081))
for _ in 0 ..< len div 8:
oddMask = mm256_set1_epi16(0xff00)
div255 = mm256_set1_epi16(0x8081)
iterations = data.len div 8
for _ in 0 ..< iterations:
let
values = mm256_loadu_si256(data[result].addr)
values = mm256_loadu_si256(data[i].addr)
alpha = mm256_and_si256(values, alphaMask)
eq = mm256_cmpeq_epi8(values, alphaMask)
if (mm256_movemask_epi8(eq) and 0x88888888) != 0x88888888:
@ -112,10 +115,18 @@ proc toPremultipliedAlphaAvx2*(
colorsEven = mm256_srli_epi16(mm256_mulhi_epu16(colorsEven, div255), 7)
colorsOdd = mm256_srli_epi16(mm256_mulhi_epu16(colorsOdd, div255), 7)
mm256_storeu_si256(
data[result].addr,
data[i].addr,
mm256_or_si256(colorsEven, mm256_slli_epi16(colorsOdd, 8))
)
result += 8
i += 8
for i in i ..< data.len:
var c = data[i]
if c.a != 255:
c.r = ((c.r.uint32 * c.a) div 255).uint8
c.g = ((c.g.uint32 * c.a) div 255).uint8
c.b = ((c.b.uint32 * c.a) div 255).uint8
data[i] = c
when defined(release):
{.pop.}

View file

@ -1,4 +1,4 @@
import chroma, vmath
import chroma
when defined(release):
{.push checks: off.}
@ -31,52 +31,64 @@ when defined(amd64):
result = mm_unpacklo_epi8(mm_setzero_si128(), result)
proc fillUnsafeSimd*(
  data: var seq[ColorRGBX],
  start, len: int,
  color: SomeColor
) =
  ## Writes `color` into the `len` pixels of `data` that begin at index
  ## `start`.
  ##
  ## Hands the whole job to `fillUnsafeAvx` when `cpuHasAvx` is set.
  ## Otherwise it runs a scalar prologue until the write address is 16-byte
  ## aligned, then stores 8 pixels per iteration with two aligned 128-bit
  ## stores, then finishes the remainder one pixel at a time.
  ## No range validation is performed here; the caller is expected to pass a
  ## `start ..< start + len` range that lies inside `data`.
  if len <= 0:
    # Nothing to write, and `data[start]` may not be a valid element.
    return

  if cpuHasAvx:
    fillUnsafeAvx(data, start, len, color)
    return

  let rgbx = color.asRgbx()

  var
    i = start
    # Byte address of data[i]; kept in step with `i` (4 bytes per pixel).
    p = cast[uint](data[i].addr)
  # Align to 16 bytes
  while i < (start + len) and (p and 15) != 0:
    data[i] = rgbx
    inc i
    p += 4

  let
    colorVec = mm_set1_epi32(cast[int32](rgbx))
    # Only the pixels left after the prologue may be covered by wide stores.
    iterations = (start + len - i) div 8
  for _ in 0 ..< iterations:
    mm_store_si128(cast[pointer](p), colorVec)
    mm_store_si128(cast[pointer](p + 16), colorVec)
    p += 32
  i += iterations * 8

  # Remaining pixels that did not fill a whole 8-pixel group.
  for i in i ..< start + len:
    data[i] = rgbx
proc isOneColorSimd*(data: var seq[ColorRGBX]): bool =
if cpuHasAvx2:
return isOneColorAvx2(data, len)
return isOneColorAvx2(data)
result = true
let color = data[0]
var i: int
while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes
var
i: int
p = cast[uint](data[0].addr)
# Align to 16 bytes
while i < data.len and (p and 15) != 0:
if data[i] != color:
return false
inc i
p += 4
let
colorVec = mm_set1_epi32(cast[int32](color))
iterations = (len - i) div 16
iterations = (data.len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm_load_si128(data[i].addr)
values1 = mm_load_si128(data[i + 4].addr)
values2 = mm_load_si128(data[i + 8].addr)
values3 = mm_load_si128(data[i + 12].addr)
values0 = mm_load_si128(cast[pointer](p))
values1 = mm_load_si128(cast[pointer](p + 16))
values2 = mm_load_si128(cast[pointer](p + 32))
values3 = mm_load_si128(cast[pointer](p + 48))
eq0 = mm_cmpeq_epi8(values0, colorVec)
eq1 = mm_cmpeq_epi8(values1, colorVec)
eq2 = mm_cmpeq_epi8(values2, colorVec)
@ -84,123 +96,133 @@ when defined(amd64):
eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
if mm_movemask_epi8(eq0123) != 0xffff:
return false
i += 16
p += 64
i += 16 * iterations
for i in i ..< len:
for i in i ..< data.len:
if data[i] != color:
return false
proc isTransparentSimd*(data: var seq[ColorRGBX]): bool =
  ## Returns true when no pixel in `data` is visible.
  ##
  ## Hands the work to `isTransparentAvx2` when `cpuHasAvx2` is set.
  ## The scalar prologue and epilogue test only the alpha field; the vector
  ## loop ORs four 16-byte groups together and requires every byte to be
  ## zero.
  ## NOTE(review): the two tests agree only if a zero alpha implies zero
  ## colour channels (premultiplied data) - confirm against callers.
  if data.len == 0:
    # No pixels to inspect, and `data[0]` is not a valid element.
    return true

  if cpuHasAvx2:
    return isTransparentAvx2(data)

  var
    i: int
    # Byte address of data[i]; kept in step with `i` (4 bytes per pixel).
    p = cast[uint](data[0].addr)
  # Align to 16 bytes
  while i < data.len and (p and 15) != 0:
    if data[i].a != 0:
      return false
    inc i
    p += 4

  result = true

  let
    vecZero = mm_setzero_si128()
    # Only the pixels left after the prologue may be covered by wide loads.
    iterations = (data.len - i) div 16
  for _ in 0 ..< iterations:
    let
      values0 = mm_load_si128(cast[pointer](p))
      values1 = mm_load_si128(cast[pointer](p + 16))
      values2 = mm_load_si128(cast[pointer](p + 32))
      values3 = mm_load_si128(cast[pointer](p + 48))
      values01 = mm_or_si128(values0, values1)
      values23 = mm_or_si128(values2, values3)
      values0123 = mm_or_si128(values01, values23)
    # Any set bit in any of the 64 bytes survives the ORs and breaks the
    # all-bytes-equal-zero mask.
    if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff:
      return false
    p += 64
  i += 16 * iterations

  # Remaining pixels that did not fill a whole 16-pixel group.
  for i in i ..< data.len:
    if data[i].a != 0:
      return false
proc isOpaqueSimd*(data: var seq[ColorRGBX], start, len: int): bool =
  ## Returns true when every pixel in `data[start ..< start + len]` has an
  ## alpha of 255.
  ##
  ## Hands the work to `isOpaqueAvx2` when `cpuHasAvx2` is set. Otherwise it
  ## checks pixels one by one until the read address is 16-byte aligned,
  ## then checks 16 pixels per iteration with aligned 128-bit loads, then
  ## checks the remainder one by one.
  ## No range validation is performed here; the caller is expected to pass a
  ## `start ..< start + len` range that lies inside `data`.
  if len <= 0:
    # An empty range is trivially opaque, and `data[start]` may not exist.
    return true

  if cpuHasAvx2:
    return isOpaqueAvx2(data, start, len)

  result = true

  var
    i = start
    # Byte address of data[i]. It has to start at data[start] (not data[0])
    # so that the alignment test and the vector loads below look at the
    # same pixels that `i` indexes.
    p = cast[uint](data[start].addr)
  # Align to 16 bytes
  while i < (start + len) and (p and 15) != 0:
    if data[i].a != 255:
      return false
    inc i
    p += 4

  let
    vec255 = mm_set1_epi8(255)
    # Only the pixels left after the prologue may be covered by wide loads.
    iterations = (start + len - i) div 16
  for _ in 0 ..< iterations:
    let
      values0 = mm_load_si128(cast[pointer](p))
      values1 = mm_load_si128(cast[pointer](p + 16))
      values2 = mm_load_si128(cast[pointer](p + 32))
      values3 = mm_load_si128(cast[pointer](p + 48))
      values01 = mm_and_si128(values0, values1)
      values23 = mm_and_si128(values2, values3)
      values0123 = mm_and_si128(values01, values23)
      eq = mm_cmpeq_epi8(values0123, vec255)
    # 0x8888 keeps mask bits 3, 7, 11 and 15: the fourth byte of each
    # 32-bit pixel, which is presumably where the alpha channel is stored.
    if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
      return false
    p += 64
  i += 16 * iterations

  # Remaining pixels that did not fill a whole 16-pixel group.
  for i in i ..< start + len:
    if data[i].a != 255:
      return false
proc toPremultipliedAlphaSimd*(data: var seq[ColorRGBA | ColorRGBX]) =
  ## Multiplies the r, g and b fields of every pixel in `data` by its alpha
  ## and divides by 255, in place.
  ##
  ## Hands the work to `toPremultipliedAlphaAvx2` when `cpuHasAvx2` is set.
  ## Otherwise four pixels are processed per iteration with unaligned
  ## 128-bit loads and stores, and the leftover pixels are done in scalar
  ## code.
  if cpuHasAvx2:
    toPremultipliedAlphaAvx2(data)
    return

  var i: int

  let
    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
    oddMask = mm_set1_epi16(0xff00)
    div255 = mm_set1_epi16(0x8081)
    iterations = data.len div 4
  for _ in 0 ..< iterations:
    let
      values = mm_loadu_si128(data[i].addr)
      alpha = mm_and_si128(values, alphaMask)
      eq = mm_cmpeq_epi8(values, alphaMask)
    # Skip the arithmetic (and the store) when all four top bytes are 255.
    if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
      let
        # Copy each pixel's top byte into the high byte of both of its
        # 16-bit halves.
        evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
        # Force the multiplier for the top byte itself to 0xff.
        oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
      var
        colorsEven = mm_slli_epi16(values, 8)
        colorsOdd = mm_and_si128(values, oddMask)
      colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
      colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
      # Multiply-high by 0x8081 followed by a shift of 7 is the fixed-point
      # form of `div 255` used for 16-bit lanes.
      colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
      colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
      mm_storeu_si128(
        data[i].addr,
        mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
      )
    i += 4

  # Remaining pixels that did not fill a whole 4-pixel group.
  for i in i ..< data.len:
    var c = data[i]
    if c.a != 255:
      c.r = ((c.r.uint32 * c.a) div 255).uint8
      c.g = ((c.g.uint32 * c.a) div 255).uint8
      c.b = ((c.b.uint32 * c.a) div 255).uint8
      data[i] = c
proc newImageFromMaskSimd*(dst: var seq[ColorRGBX], src: var seq[uint8]) =
var i: int
for _ in 0 ..< src.len div 16:
var alphas = mm_loadu_si128(src[i].addr)
for j in 0 ..< 4:
var unpacked = unpackAlphaValues(alphas)
@ -210,17 +232,13 @@ when defined(amd64):
alphas = mm_srli_si128(alphas, 4)
i += 16
for i in i ..< len:
for i in i ..< src.len:
let v = src[i]
dst[i] = rgbx(v, v, v, v)
proc newMaskFromImageSimd*(
dst: ptr UncheckedArray[uint8],
src: ptr UncheckedArray[ColorRGBX],
len: int
) =
proc newMaskFromImageSimd*(dst: var seq[uint8], src: var seq[ColorRGBX]) =
var i: int
for _ in 0 ..< len div 16:
for _ in 0 ..< src.len div 16:
let
a = mm_loadu_si128(src[i + 0].addr)
b = mm_loadu_si128(src[i + 4].addr)
@ -232,25 +250,41 @@ when defined(amd64):
)
i += 16
for i in i ..< len:
for i in i ..< src.len:
dst[i] = src[i].a
proc invertImageSimd*(data: var seq[ColorRGBX]) =
  ## Replaces every channel `v` of every pixel in `data` with `255 - v`, in
  ## place, then runs `toPremultipliedAlphaSimd` over the result.
  ##
  ## Pixels are inverted one by one until the address is 16-byte aligned,
  ## then 16 pixels per iteration with aligned 128-bit loads and stores,
  ## then one by one for the remainder.
  if data.len == 0:
    # Nothing to invert, and `data[0]` is not a valid element.
    return

  var
    i: int
    # Byte address of data[i]; kept in step with `i` (4 bytes per pixel).
    p = cast[uint](data[0].addr)
  # Align to 16 bytes
  while i < data.len and (p and 15) != 0:
    var rgbx = data[i]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
    data[i] = rgbx
    inc i
    p += 4

  let
    vec255 = mm_set1_epi8(255)
    # Count only the pixels left after the alignment prologue. Using
    # `data.len` here would let the last iteration run past the end of the
    # seq by as many pixels as the prologue consumed.
    iterations = (data.len - i) div 16
  for _ in 0 ..< iterations:
    let
      a = mm_load_si128(cast[pointer](p))
      b = mm_load_si128(cast[pointer](p + 16))
      c = mm_load_si128(cast[pointer](p + 32))
      d = mm_load_si128(cast[pointer](p + 48))
    mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
    mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
    mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
    mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
    p += 64
  i += 16 * iterations

  # Remaining pixels that did not fill a whole 16-pixel group.
  for i in i ..< data.len:
    var rgbx = data[i]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
    data[i] = rgbx

  # NOTE(review): presumably needed because inverting all four channels of
  # a premultiplied pixel can leave colour values above the new alpha.
  toPremultipliedAlphaSimd(data)
proc invertMaskSimd*(data: var seq[uint8]) =
  ## Replaces every value `v` in `data` with `255 - v`, in place.
  ##
  ## Values are inverted one by one until the address is 16-byte aligned,
  ## then 64 bytes per iteration with aligned 128-bit loads and stores,
  ## then one by one for the remainder.
  if data.len == 0:
    # Nothing to invert, and `data[0]` is not a valid element.
    return

  var
    i: int
    # Byte address of data[i]; kept in step with `i` (1 byte per value).
    p = cast[uint](data[0].addr)
  # Align to 16 bytes
  while i < data.len and (p and 15) != 0:
    data[i] = 255 - data[i]
    inc i
    inc p

  let
    vec255 = mm_set1_epi8(255)
    # Count only the bytes left after the alignment prologue. Using
    # `data.len` here would let the last iteration run past the end of the
    # seq by as many bytes as the prologue consumed.
    iterations = (data.len - i) div 64
  for _ in 0 ..< iterations:
    let
      a = mm_load_si128(cast[pointer](p))
      b = mm_load_si128(cast[pointer](p + 16))
      c = mm_load_si128(cast[pointer](p + 32))
      d = mm_load_si128(cast[pointer](p + 48))
    mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
    mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
    mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
    mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
    p += 64
  i += 64 * iterations

  # Remaining bytes that did not fill a whole 64-byte group.
  for i in i ..< data.len:
    data[i] = 255 - data[i]
proc ceilMaskSimd*(data: var seq[uint8]) =
  ## Turns every non-zero value in `data` into 255, in place; zeros stay 0.
  ##
  ## Sixteen bytes are processed per iteration with unaligned 128-bit loads
  ## and stores (so no alignment prologue is needed), and the leftover bytes
  ## are handled one at a time.
  if data.len == 0:
    # Nothing to do, and `data[0]` is not a valid element.
    return

  var p = cast[uint](data[0].addr)

  let
    zeroVec = mm_setzero_si128()
    vec255 = mm_set1_epi8(255)
    iterations = data.len div 16
  for _ in 0 ..< iterations:
    var values = mm_loadu_si128(cast[pointer](p))
    # 0xff where the byte was zero, 0x00 elsewhere ...
    values = mm_cmpeq_epi8(values, zeroVec)
    # ... then (not mask) and 255: zero bytes stay 0, the rest become 255.
    values = mm_andnot_si128(values, vec255)
    mm_storeu_si128(cast[pointer](p), values)
    p += 16

  # Index of the first byte not covered by the vector loop.
  let tailStart = 16 * iterations
  for i in tailStart ..< data.len:
    if data[i] != 0:
      data[i] = 255
proc applyOpacitySimd*(
data: ptr UncheckedArray[uint8],
len: int,
opacity: uint16
) =
var i: int
proc applyOpacitySimd*(data: var seq[uint8 | ColorRGBX], opacity: uint16) =
var
i: int
p = cast[uint](data[0].addr)
len =
when data is seq[ColorRGBX]:
data.len * 4
else:
data.len
let
oddMask = mm_set1_epi16(cast[int16](0xff00))
div255 = mm_set1_epi16(cast[int16](0x8081))
oddMask = mm_set1_epi16(0xff00)
div255 = mm_set1_epi16(0x8081)
zeroVec = mm_setzero_si128()
opacityVec = mm_slli_epi16(mm_set1_epi16(cast[int16](opacity)), 8)
opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
iterations = len div 16
for _ in 0 ..< len div 16:
let values = mm_loadu_si128(data[i].addr)
let values = mm_loadu_si128(cast[pointer](p))
if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
var
valuesEven = mm_slli_epi16(values, 8)
@ -310,13 +371,23 @@ when defined(amd64):
valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7)
valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7)
mm_storeu_si128(
data[i].addr,
cast[pointer](p),
mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8))
)
i += 16
p += 16
i += 16 * iterations
for i in i ..< len:
data[i] = ((data[i] * opacity) div 255).uint8
when data is seq[ColorRGBX]:
for i in i div 4 ..< data.len:
var rgbx = data[i]
rgbx.r = ((rgbx.r * opacity) div 255).uint8
rgbx.g = ((rgbx.g * opacity) div 255).uint8
rgbx.b = ((rgbx.b * opacity) div 255).uint8
rgbx.a = ((rgbx.a * opacity) div 255).uint8
data[i] = rgbx
else:
for i in i ..< data.len:
data[i] = ((data[i] * opacity) div 255).uint8
when defined(release):
{.pop.}