Aligned SSE2 + AVX2 versions of isOneColor, isOpaque, and isTransparent

This commit is contained in:
Ryan Oldenburg 2022-06-28 17:59:50 -05:00
parent c244b8cb81
commit ffc2b5b4d5
4 changed files with 190 additions and 37 deletions

View file

@ -10,7 +10,7 @@ requires "vmath >= 1.1.4"
requires "chroma >= 0.2.5"
requires "zippy >= 0.10.2"
requires "flatty >= 0.3.4"
requires "nimsimd >= 1.1.1"
requires "nimsimd >= 1.1.5"
requires "bumpy >= 1.1.1"
task bindings, "Generate bindings":

View file

@ -1,7 +1,7 @@
import blends, bumpy, chroma, common, masks, pixie/internal, vmath
when defined(amd64) and allowSimd:
import nimsimd/sse2
import nimsimd/sse2, runtimechecked/avx2
const h = 0.5.float32
@ -101,54 +101,84 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} =
proc isOneColor*(image: Image): bool {.raises: [].} =
## Checks if the entire image is the same color.
# NOTE(review): this span looks like a flattened diff hunk. Both the replaced
# and the replacement statements appear without +/- markers or indentation
# (mm_loadu_si128 next to mm_load_si128, `eq` next to `eq0123`, two tail
# loops), so it does not compile as shown. Confirm against the real file
# before editing.
when defined(amd64) and allowSimd:
# Hand the whole buffer to the AVX2 routine when cpuHasAvx2 is set.
if cpuHasAvx2:
return isOneColorAvx2(image.data, 0, image.data.len)
result = true
# Every pixel is compared against the first pixel of the image.
let color = image.data[0]
var i: int
when defined(amd64) and allowSimd:
let colorVec = mm_set1_epi32(cast[int32](color))
for _ in 0 ..< image.data.len div 16:
# Align to 16 bytes
var p = cast[uint](image.data[i].addr)
while i < image.data.len and (p and 15) != 0:
if image.data[i] != color:
return false
inc i
p += 4
let
values0 = mm_loadu_si128(image.data[i + 0].addr)
values1 = mm_loadu_si128(image.data[i + 4].addr)
values2 = mm_loadu_si128(image.data[i + 8].addr)
values3 = mm_loadu_si128(image.data[i + 12].addr)
colorVec = mm_set1_epi32(cast[int32](color))
iterations = (image.data.len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm_load_si128(cast[pointer](p))
values1 = mm_load_si128(cast[pointer](p + 16))
values2 = mm_load_si128(cast[pointer](p + 32))
values3 = mm_load_si128(cast[pointer](p + 48))
eq0 = mm_cmpeq_epi8(values0, colorVec)
eq1 = mm_cmpeq_epi8(values1, colorVec)
eq2 = mm_cmpeq_epi8(values2, colorVec)
eq3 = mm_cmpeq_epi8(values3, colorVec)
eq = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
# A mask of 0xffff means all 16 bytes of the combined compare matched.
if mm_movemask_epi8(eq) != 0xffff:
eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
if mm_movemask_epi8(eq0123) != 0xffff:
return false
i += 16
p += 64
i += 16 * iterations
# Scalar pass over whatever the 16-pixel steps did not cover.
for j in i ..< image.data.len:
if image.data[j] != color:
for i in i ..< image.data.len:
if image.data[i] != color:
return false
proc isTransparent*(image: Image): bool {.raises: [].} =
## Checks if this image is fully transparent or not.
# NOTE(review): this span looks like a flattened diff hunk. Both the replaced
# and the replacement statements appear without +/- markers or indentation
# (mm_loadu_si128 next to mm_load_si128, `values` next to `values0123`, two
# tail loops), so it does not compile as shown. Confirm against the real file
# before editing.
when defined(amd64) and allowSimd:
# Hand the whole buffer to the AVX2 routine when cpuHasAvx2 is set.
if cpuHasAvx2:
return isTransparentAvx2(image.data, 0, image.data.len)
result = true
var i: int
when defined(amd64) and allowSimd:
let vecZero = mm_setzero_si128()
for _ in 0 ..< image.data.len div 16:
# Align to 16 bytes
var p = cast[uint](image.data[i].addr)
while i < image.data.len and (p and 15) != 0:
if image.data[i].a != 0:
return false
inc i
p += 4
let
values0 = mm_loadu_si128(image.data[i + 0].addr)
values1 = mm_loadu_si128(image.data[i + 4].addr)
values2 = mm_loadu_si128(image.data[i + 8].addr)
values3 = mm_loadu_si128(image.data[i + 12].addr)
vecZero = mm_setzero_si128()
iterations = (image.data.len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm_load_si128(cast[pointer](p))
values1 = mm_load_si128(cast[pointer](p + 16))
values2 = mm_load_si128(cast[pointer](p + 32))
values3 = mm_load_si128(cast[pointer](p + 48))
# OR-ing the loads leaves a zero byte only where all four loads were zero.
values01 = mm_or_si128(values0, values1)
values23 = mm_or_si128(values2, values3)
values = mm_or_si128(values01, values23)
# The vector test requires every byte to be zero, not only the alpha bytes
# that the scalar loops look at.
if mm_movemask_epi8(mm_cmpeq_epi8(values, vecZero)) != 0xffff:
values0123 = mm_or_si128(values01, values23)
if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff:
return false
i += 16
p += 64
i += 16 * iterations
# Scalar pass over whatever the 16-pixel steps did not cover.
for j in i ..< image.data.len:
if image.data[j].a != 0:
for i in i ..< image.data.len:
if image.data[i].a != 0:
return false
proc isOpaque*(image: Image): bool {.raises: [].} =

View file

@ -3,8 +3,10 @@ import bumpy, chroma, common, system/memory, vmath
const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
when defined(amd64) and allowSimd:
import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx
let cpuHasAvx* = checkInstructionSets({AVX})
import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx, runtimechecked/avx2
let
cpuHasAvx* = checkInstructionSets({AVX})
cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
template currentExceptionAsPixieError*(): untyped =
## Gets the current exception and returns it as a PixieError with stack trace.
@ -178,27 +180,42 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].}
data[i] = c
proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool =
# Reports whether every pixel in data[start ..< start + len] has alpha 255.
# NOTE(review): this span looks like a flattened diff hunk. Both the replaced
# and the replacement statements appear without +/- markers or indentation
# (two vec255 definitions, mm_loadu_si128 next to mm_load_si128, two tail
# loops), so it does not compile as shown. Confirm against the real file
# before editing.
when defined(amd64) and allowSimd:
# The AVX2 routine is only used for ranges of at least 64 pixels.
if cpuHasAvx2 and len >= 64:
return isOpaqueAvx2(data, start, len)
result = true
var i = start
when defined(amd64) and allowSimd:
let vec255 = mm_set1_epi32(cast[int32](uint32.high))
for _ in start ..< (start + len) div 16:
# Align to 16 bytes
var p = cast[uint](data[i].addr)
while i < (start + len) and (p and 15) != 0:
if data[i].a != 255:
return false
inc i
p += 4
let
values0 = mm_loadu_si128(data[i + 0].addr)
values1 = mm_loadu_si128(data[i + 4].addr)
values2 = mm_loadu_si128(data[i + 8].addr)
values3 = mm_loadu_si128(data[i + 12].addr)
vec255 = mm_set1_epi8(255)
iterations = (start + len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm_load_si128(cast[pointer](p))
values1 = mm_load_si128(cast[pointer](p + 16))
values2 = mm_load_si128(cast[pointer](p + 32))
values3 = mm_load_si128(cast[pointer](p + 48))
# AND-ing the loads keeps a byte at 255 only where all four loads had 255.
values01 = mm_and_si128(values0, values1)
values23 = mm_and_si128(values2, values3)
values = mm_and_si128(values01, values23)
eq = mm_cmpeq_epi8(values, vec255)
values0123 = mm_and_si128(values01, values23)
eq = mm_cmpeq_epi8(values0123, vec255)
# 0x8888 keeps one comparison bit per 4-byte pixel: byte 3, presumably the
# alpha channel that the scalar loops test via `.a` -- TODO confirm layout.
if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
return false
i += 16
p += 64
i += 16 * iterations
# Scalar pass over whatever the 16-pixel steps did not cover.
for j in i ..< start + len:
if data[j].a != 255:
for i in i ..< start + len:
if data[i].a != 255:
return false
when defined(amd64) and allowSimd:

View file

@ -0,0 +1,106 @@
import chroma, nimsimd/avx2
# Pass -mavx2 to the C compiler for this module when building with gcc or
# clang.
when defined(gcc) or defined(clang):
{.localPassc: "-mavx2".}
# In release builds, switch runtime checks off from here until the matching
# `{.pop.}`.
when defined(release):
{.push checks: off.}
proc isOneColorAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
  ## Returns true when every pixel in `data[start ..< start + len]` equals the
  ## first pixel of that range. An empty range is reported as one color.
  ##
  ## The range is walked in three phases: a scalar head until the read address
  ## is 32-byte aligned, an AVX2 body that consumes 16 pixels (two 32-byte
  ## loads) per step, and a scalar tail for the remaining pixels.
  result = true

  # Nothing to compare; this also avoids indexing an empty seq below.
  if len <= 0:
    return

  let
    stop = start + len
    # The reference color comes from the start of the requested range, so the
    # answer does not depend on pixels outside of it.
    color = data[start]

  var
    i = start
    p = cast[uint](data[i].addr)
  # Align to 32 bytes, advancing one 4-byte pixel at a time.
  while i < stop and (p and 31) != 0:
    if data[i] != color:
      return false
    inc i
    p += 4

  let
    colorVec = mm256_set1_epi32(cast[int32](color))
    iterations = (stop - i) div 16
  for _ in 0 ..< iterations:
    let
      values0 = mm256_load_si256(cast[pointer](p))
      values1 = mm256_load_si256(cast[pointer](p + 32))
      eq0 = mm256_cmpeq_epi8(values0, colorVec)
      eq1 = mm256_cmpeq_epi8(values1, colorVec)
      eq01 = mm256_and_si256(eq0, eq1)
    # All 32 mask bits set means every byte of both loads matched the color.
    if mm256_movemask_epi8(eq01) != cast[int32](0xffffffff):
      return false
    p += 64
  i += 16 * iterations

  # Scalar tail: fewer than 16 pixels remain.
  for j in i ..< stop:
    if data[j] != color:
      return false
proc isTransparentAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
  ## Reports whether `data[start ..< start + len]` is fully transparent.
  ##
  ## Head and tail pixels are tested on their alpha channel only, while the
  ## vectorized middle requires every byte of each pixel to be zero.
  let stop = start + len
  var
    idx = start
    address = cast[uint](data[idx].addr)

  # Scalar head: step one 4-byte pixel at a time up to a 32-byte boundary.
  while idx < stop and (address and 31) != 0:
    if data[idx].a != 0:
      return false
    address += 4
    inc idx

  let
    zeros = mm256_setzero_si256()
    blocks = (stop - idx) div 16
  var blocksLeft = blocks
  while blocksLeft > 0:
    let
      lower = mm256_load_si256(cast[pointer](address))
      upper = mm256_load_si256(cast[pointer](address + 32))
      # A byte of the OR is zero only when it is zero in both loads.
      merged = mm256_or_si256(lower, upper)
      mask = mm256_movemask_epi8(mm256_cmpeq_epi8(merged, zeros))
    if mask != cast[int32](0xffffffff):
      return false
    address += 64
    dec blocksLeft
  idx += 16 * blocks

  # Scalar tail: whatever the 16-pixel blocks did not cover.
  for j in idx ..< stop:
    if data[j].a != 0:
      return false

  return true
proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
  ## Reports whether every pixel in `data[start ..< start + len]` has an alpha
  ## value of 255.
  let finish = start + len
  var
    pos = start
    cursor = cast[uint](data[pos].addr)

  # Scalar head: step one 4-byte pixel at a time up to a 32-byte boundary.
  while pos < finish and (cursor and 31) != 0:
    if data[pos].a != 255:
      return false
    cursor += 4
    inc pos

  let
    allOnes = mm256_set1_epi8(255)
    chunks = (finish - pos) div 16
  for _ in 1 .. chunks:
    let
      first = mm256_load_si256(cast[pointer](cursor))
      second = mm256_load_si256(cast[pointer](cursor + 32))
      # A byte of the AND is 255 only when it is 255 in both loads.
      both = mm256_and_si256(first, second)
      hits = mm256_cmpeq_epi8(both, allOnes)
    # 0x88888888 keeps one comparison bit per 4-byte pixel: byte 3, presumably
    # the alpha channel that the scalar loops test via `.a`.
    if (mm256_movemask_epi8(hits) and 0x88888888) != 0x88888888:
      return false
    cursor += 64
  pos += 16 * chunks

  # Scalar tail: whatever the 16-pixel chunks did not cover.
  for k in pos ..< finish:
    if data[k].a != 255:
      return false

  return true
# Undo the earlier `{.push checks: off.}` that release builds apply to this
# module.
when defined(release):
{.pop.}