new simd approach
This commit is contained in:
parent
28a880b2b7
commit
9f2c018c9e
5 changed files with 255 additions and 229 deletions
|
@ -1,7 +1,10 @@
|
||||||
import blends, bumpy, chroma, common, masks, pixie/internal, vmath
|
import blends, bumpy, chroma, common, masks, internal, vmath
|
||||||
|
|
||||||
when defined(amd64) and allowSimd:
|
when allowSimd:
|
||||||
import nimsimd/sse2, runtimechecked/avx2
|
import simd
|
||||||
|
|
||||||
|
when defined(amd64):
|
||||||
|
import nimsimd/sse2
|
||||||
|
|
||||||
const h = 0.5.float32
|
const h = 0.5.float32
|
||||||
|
|
||||||
|
@ -101,83 +104,30 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} =
|
||||||
|
|
||||||
proc isOneColor*(image: Image): bool {.raises: [].} =
  ## Returns true when every pixel in `image.data` holds the same color
  ## value as the first pixel.
  ##
  ## When SIMD is allowed and `isOneColorSimd` is available for this
  ## target, the check is delegated to it; otherwise pixels are compared
  ## one by one as 32-bit values.
  when allowSimd and compiles(isOneColorSimd):
    let pixels = cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr)
    return isOneColorSimd(pixels, image.data.len)

  result = true

  # Compare whole pixels as a single uint32 rather than channel by channel.
  let reference = cast[uint32](image.data[0])
  for pixel in image.data:
    if cast[uint32](pixel) != reference:
      return false
|
||||||
|
|
||||||
proc isTransparent*(image: Image): bool {.raises: [].} =
  ## Checks if this image is fully transparent or not.
  ##
  ## Returns true when the alpha channel of every pixel in `image.data`
  ## is 0. An image without pixel data is reported as transparent.
  ##
  ## When SIMD is allowed and `isTransparentSimd` is available for this
  ## target, the check is delegated to it; otherwise a scalar loop is used.

  # The SIMD dispatch below takes the address of `image.data[0]`, which is
  # not a valid index for an empty seq. The scalar loop returns true in
  # that case, so answer the same way here before touching the data.
  if image.data.len == 0:
    return true

  when allowSimd and compiles(isTransparentSimd):
    return isTransparentSimd(
      cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
      image.data.len
    )

  result = true

  for i in 0 ..< image.data.len:
    if image.data[i].a != 0:
      return false
|
||||||
|
|
||||||
|
|
|
@ -2,11 +2,11 @@ import bumpy, chroma, common, system/memory, vmath
|
||||||
|
|
||||||
const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
|
const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
|
||||||
|
|
||||||
when defined(amd64) and allowSimd:
|
when allowSimd:
|
||||||
import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx, runtimechecked/avx2
|
import simd
|
||||||
let
|
|
||||||
cpuHasAvx* = checkInstructionSets({AVX})
|
when defined(amd64):
|
||||||
cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
|
import nimsimd/sse2
|
||||||
|
|
||||||
template currentExceptionAsPixieError*(): untyped =
|
template currentExceptionAsPixieError*(): untyped =
|
||||||
## Gets the current exception and returns it as a PixieError with stack trace.
|
## Gets the current exception and returns it as a PixieError with stack trace.
|
||||||
|
@ -81,45 +81,20 @@ proc fillUnsafe*(
|
||||||
## continuing for len indices.
|
## continuing for len indices.
|
||||||
let rgbx = color.asRgbx()
|
let rgbx = color.asRgbx()
|
||||||
|
|
||||||
# If we can use AVX, do so
|
when allowSimd and compiles(fillUnsafeSimd):
|
||||||
when defined(amd64) and allowSimd:
|
fillUnsafeSimd(
|
||||||
if cpuHasAvx and len >= 64:
|
cast[ptr UncheckedArray[ColorRGBX]](data[start].addr),
|
||||||
fillUnsafeAvx(data, rgbx, start, len)
|
len,
|
||||||
return
|
rgbx
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
# Use memset when every byte has the same value
|
# Use memset when every byte has the same value
|
||||||
if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
|
if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
|
||||||
nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
|
nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
|
||||||
else:
|
else:
|
||||||
var i = start
|
for color in data.mitems:
|
||||||
when defined(amd64) and allowSimd:
|
color = rgbx
|
||||||
# Align to 16 bytes
|
|
||||||
var p = cast[uint](data[i].addr)
|
|
||||||
while i < (start + len) and (p and 15) != 0:
|
|
||||||
data[i] = rgbx
|
|
||||||
inc i
|
|
||||||
p += 4
|
|
||||||
# When supported, SIMD fill until we run out of room
|
|
||||||
let
|
|
||||||
colorVec = mm_set1_epi32(cast[int32](rgbx))
|
|
||||||
iterations = (start + len - i) div 8
|
|
||||||
for _ in 0 ..< iterations:
|
|
||||||
mm_store_si128(cast[pointer](p), colorVec)
|
|
||||||
mm_store_si128(cast[pointer](p + 16), colorVec)
|
|
||||||
p += 32
|
|
||||||
i += iterations * 8
|
|
||||||
else:
|
|
||||||
when sizeof(int) == 8:
|
|
||||||
# Fill 8 bytes at a time when possible
|
|
||||||
var
|
|
||||||
u32 = cast[uint32](rgbx)
|
|
||||||
u64 = cast[uint64]([u32, u32])
|
|
||||||
for _ in 0 ..< len div 2:
|
|
||||||
copyMem(data[i].addr, u64.addr, 8)
|
|
||||||
i += 2
|
|
||||||
# Fill whatever is left the slow way
|
|
||||||
for i in i ..< start + len:
|
|
||||||
data[i] = rgbx
|
|
||||||
|
|
||||||
const straightAlphaTable = block:
|
const straightAlphaTable = block:
|
||||||
var table: array[256, array[256, uint8]]
|
var table: array[256, array[256, uint8]]
|
||||||
|
@ -141,39 +116,14 @@ proc toStraightAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
|
||||||
|
|
||||||
proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
|
proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
|
||||||
## Converts an image to premultiplied alpha from straight alpha.
|
## Converts an image to premultiplied alpha from straight alpha.
|
||||||
var i: int
|
when allowSimd and compiles(toPremultipliedAlphaSimd):
|
||||||
when defined(amd64) and allowSimd:
|
toPremultipliedAlphaSimd(
|
||||||
if cpuHasAvx2:
|
cast[ptr UncheckedArray[uint32]](data[0].addr),
|
||||||
i = toPremultipliedAlphaAvx2(data)
|
data.len
|
||||||
else:
|
)
|
||||||
let
|
return
|
||||||
alphaMask = mm_set1_epi32(cast[int32](0xff000000))
|
|
||||||
oddMask = mm_set1_epi16(cast[int16](0xff00))
|
|
||||||
div255 = mm_set1_epi16(cast[int16](0x8081))
|
|
||||||
for _ in 0 ..< data.len div 4:
|
|
||||||
let
|
|
||||||
values = mm_loadu_si128(data[i].addr)
|
|
||||||
alpha = mm_and_si128(values, alphaMask)
|
|
||||||
eq = mm_cmpeq_epi8(values, alphaMask)
|
|
||||||
if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
|
|
||||||
let
|
|
||||||
evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
|
|
||||||
oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
|
|
||||||
var
|
|
||||||
colorsEven = mm_slli_epi16(values, 8)
|
|
||||||
colorsOdd = mm_and_si128(values, oddMask)
|
|
||||||
colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
|
|
||||||
colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
|
|
||||||
colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
|
|
||||||
colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
|
|
||||||
mm_storeu_si128(
|
|
||||||
data[i].addr,
|
|
||||||
mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
|
|
||||||
)
|
|
||||||
i += 4
|
|
||||||
|
|
||||||
# Convert whatever is left
|
for i in 0 ..< data.len:
|
||||||
for i in i ..< data.len:
|
|
||||||
var c = data[i]
|
var c = data[i]
|
||||||
if c.a != 255:
|
if c.a != 255:
|
||||||
c.r = ((c.r.uint32 * c.a) div 255).uint8
|
c.r = ((c.r.uint32 * c.a) div 255).uint8
|
||||||
|
@ -182,41 +132,15 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].}
|
||||||
data[i] = c
|
data[i] = c
|
||||||
|
|
||||||
proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool =
  ## Returns true when every pixel in `data[start ..< start + len]` has an
  ## alpha value of 255. An empty span (`len <= 0`) is reported as opaque.
  ##
  ## When SIMD is allowed and `isOpaqueSimd` is available for this target,
  ## the check is delegated to it; otherwise a scalar loop is used.

  # The SIMD dispatch below evaluates `data[start].addr` before looking at
  # `len`, so an empty span positioned at the end of the seq would index
  # out of range. The scalar loop returns true for such a span; do the
  # same up front so both paths agree.
  if len <= 0:
    return true

  when allowSimd and compiles(isOpaqueSimd):
    return isOpaqueSimd(
      cast[ptr UncheckedArray[ColorRGBX]](data[start].addr),
      len
    )

  result = true

  for i in start ..< start + len:
    if data[i].a != 255:
      return false
|
||||||
|
|
||||||
|
|
|
@ -7,28 +7,23 @@ when defined(release):
|
||||||
{.push checks: off.}
|
{.push checks: off.}
|
||||||
|
|
||||||
proc fillUnsafeAvx*(
  data: ptr UncheckedArray[ColorRGBX],
  len: int,
  rgbx: ColorRGBX
) =
  ## Writes `rgbx` into the first `len` entries of `data`.
  ##
  ## Works in three phases: single-pixel writes until the current address
  ## is a multiple of 32 bytes, then 8 pixels per 256-bit store, then
  ## single-pixel writes for whatever remains.
  var idx = 0

  # Scalar prologue: advance until data[idx] sits on a 32-byte boundary.
  while idx < len:
    if (cast[uint](data[idx].addr) and 31) == 0:
      break
    data[idx] = rgbx
    inc idx

  let
    splat = mm256_set1_epi32(cast[int32](rgbx))
    # Index one past the last pixel covered by whole 8-pixel stores.
    vectorEnd = idx + ((len - idx) div 8) * 8
  while idx < vectorEnd:
    mm256_store_si256(data[idx].addr, splat)
    idx += 8

  # Fill whatever is left the slow way
  while idx < len:
    data[idx] = rgbx
    inc idx
|
||||||
|
|
||||||
when defined(release):
|
when defined(release):
|
||||||
|
|
|
@ -6,108 +6,93 @@ when defined(gcc) or defined(clang):
|
||||||
when defined(release):
|
when defined(release):
|
||||||
{.push checks: off.}
|
{.push checks: off.}
|
||||||
|
|
||||||
proc isOneColorAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
  ## Returns true when each of the first `len` entries of `data` equals
  ## `data[0]`.
  ##
  ## Entries are compared one at a time until the current address is a
  ## multiple of 32 bytes, then 16 at a time using two 256-bit loads, then
  ## one at a time for the remainder.
  let reference = data[0]

  var idx = 0
  # Scalar prologue: advance until data[idx] sits on a 32-byte boundary.
  while idx < len and (cast[uint](data[idx].addr) and 31) != 0:
    if data[idx] != reference:
      return false
    inc idx

  let
    splat = mm256_set1_epi32(cast[int32](reference))
    # Index one past the last pixel covered by whole 16-pixel steps.
    vectorEnd = idx + ((len - idx) div 16) * 16
  while idx < vectorEnd:
    let
      matchLo = mm256_cmpeq_epi8(mm256_load_si256(data[idx].addr), splat)
      matchHi = mm256_cmpeq_epi8(mm256_load_si256(data[idx + 8].addr), splat)
      matchBoth = mm256_and_si256(matchLo, matchHi)
    # Every byte of both vectors must have compared equal.
    if mm256_movemask_epi8(matchBoth) != cast[int32](0xffffffff):
      return false
    idx += 16

  # Scalar epilogue for the pixels that did not fill a 16-pixel step.
  while idx < len:
    if data[idx] != reference:
      return false
    inc idx

  result = true
|
||||||
|
|
||||||
proc isTransparentAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
  ## Scans the first `len` entries of `data` and returns false as soon as a
  ## non-transparent one is found, true otherwise.
  ##
  ## The scalar loops test only the alpha channel. The vector loop ORs two
  ## 256-bit loads (16 pixels) together and requires every byte of the
  ## result to be zero.
  var idx = 0
  # Scalar prologue: advance until data[idx] sits on a 32-byte boundary.
  while idx < len and (cast[uint](data[idx].addr) and 31) != 0:
    if data[idx].a != 0:
      return false
    inc idx

  let
    zeros = mm256_setzero_si256()
    # Index one past the last pixel covered by whole 16-pixel steps.
    vectorEnd = idx + ((len - idx) div 16) * 16
  while idx < vectorEnd:
    let
      merged = mm256_or_si256(
        mm256_load_si256(data[idx].addr),
        mm256_load_si256(data[idx + 8].addr)
      )
      zeroBytes = mm256_cmpeq_epi8(merged, zeros)
    if mm256_movemask_epi8(zeroBytes) != cast[int32](0xffffffff):
      return false
    idx += 16

  # Scalar epilogue for the pixels that did not fill a 16-pixel step.
  while idx < len:
    if data[idx].a != 0:
      return false
    inc idx

  result = true
|
||||||
|
|
||||||
proc isOpaqueAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
  ## Returns true when the alpha channel of each of the first `len`
  ## entries of `data` is 255.
  ##
  ## The vector loop ANDs two 256-bit loads (16 pixels) together and then
  ## inspects only the mask bits selected by 0x88888888, i.e. every fourth
  ## byte of the combined vector.
  var idx = 0
  # Scalar prologue: advance until data[idx] sits on a 32-byte boundary.
  while idx < len and (cast[uint](data[idx].addr) and 31) != 0:
    if data[idx].a != 255:
      return false
    inc idx

  let
    allOnes = mm256_set1_epi8(255)
    # Index one past the last pixel covered by whole 16-pixel steps.
    vectorEnd = idx + ((len - idx) div 16) * 16
  while idx < vectorEnd:
    let
      merged = mm256_and_si256(
        mm256_load_si256(data[idx].addr),
        mm256_load_si256(data[idx + 8].addr)
      )
      fullBytes = mm256_cmpeq_epi8(merged, allOnes)
    if (mm256_movemask_epi8(fullBytes) and 0x88888888) != 0x88888888:
      return false
    idx += 16

  # Scalar epilogue for the pixels that did not fill a 16-pixel step.
  while idx < len:
    if data[idx].a != 255:
      return false
    inc idx

  result = true
|
||||||
|
|
||||||
proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]): int =
|
proc toPremultipliedAlphaAvx2*(data: ptr UncheckedArray[uint32], len: int): int =
|
||||||
let
|
let
|
||||||
alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
|
alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
|
||||||
oddMask = mm256_set1_epi16(cast[int16](0xff00))
|
oddMask = mm256_set1_epi16(cast[int16](0xff00))
|
||||||
div255 = mm256_set1_epi16(cast[int16](0x8081))
|
div255 = mm256_set1_epi16(cast[int16](0x8081))
|
||||||
for _ in 0 ..< data.len div 8:
|
for _ in 0 ..< len div 8:
|
||||||
let
|
let
|
||||||
values = mm256_loadu_si256(data[result].addr)
|
values = mm256_loadu_si256(data[result].addr)
|
||||||
alpha = mm256_and_si256(values, alphaMask)
|
alpha = mm256_and_si256(values, alphaMask)
|
||||||
|
|
172
src/pixie/simd.nim
Normal file
172
src/pixie/simd.nim
Normal file
|
@ -0,0 +1,172 @@
|
||||||
|
import chroma
|
||||||
|
|
||||||
|
when defined(amd64):
  import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx, runtimechecked/avx2

  # Instruction-set availability, evaluated once at module initialisation.
  let
    cpuHasAvx* = checkInstructionSets({AVX})
    cpuHasAvx2* = checkInstructionSets({AVX, AVX2})

  proc fillUnsafeSimd*(
    data: ptr UncheckedArray[ColorRGBX],
    len: int,
    rgbx: ColorRGBX
  ) =
    ## Writes `rgbx` into the first `len` entries of `data`.
    ##
    ## Uses `fillUnsafeAvx` when AVX is available and `len >= 64`;
    ## otherwise fills with 128-bit stores, 8 pixels per step, after
    ## reaching a 16-byte boundary with single-pixel writes.
    if cpuHasAvx and len >= 64:
      fillUnsafeAvx(data, len, rgbx)
      return

    var idx = 0
    # Scalar prologue: advance until data[idx] sits on a 16-byte boundary.
    while idx < len and (cast[uint](data[idx].addr) and 15) != 0:
      data[idx] = rgbx
      inc idx

    let
      splat = mm_set1_epi32(cast[int32](rgbx))
      # Index one past the last pixel covered by whole 8-pixel steps.
      vectorEnd = idx + ((len - idx) div 8) * 8
    while idx < vectorEnd:
      mm_store_si128(data[idx].addr, splat)
      mm_store_si128(data[idx + 4].addr, splat)
      idx += 8

    # Scalar epilogue for the remaining pixels.
    while idx < len:
      data[idx] = rgbx
      inc idx

  proc isOneColorSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
    ## Returns true when each of the first `len` entries of `data` equals
    ## `data[0]`. Delegates to `isOneColorAvx2` when AVX2 is available;
    ## otherwise compares 16 pixels per step with four 128-bit loads.
    if cpuHasAvx2:
      return isOneColorAvx2(data, len)

    let reference = data[0]

    var idx = 0
    # Scalar prologue: advance until data[idx] sits on a 16-byte boundary.
    while idx < len and (cast[uint](data[idx].addr) and 15) != 0:
      if data[idx] != reference:
        return false
      inc idx

    let
      splat = mm_set1_epi32(cast[int32](reference))
      # Index one past the last pixel covered by whole 16-pixel steps.
      vectorEnd = idx + ((len - idx) div 16) * 16
    while idx < vectorEnd:
      let
        match0 = mm_cmpeq_epi8(mm_load_si128(data[idx].addr), splat)
        match1 = mm_cmpeq_epi8(mm_load_si128(data[idx + 4].addr), splat)
        match2 = mm_cmpeq_epi8(mm_load_si128(data[idx + 8].addr), splat)
        match3 = mm_cmpeq_epi8(mm_load_si128(data[idx + 12].addr), splat)
        matchAll = mm_and_si128(
          mm_and_si128(match0, match1),
          mm_and_si128(match2, match3)
        )
      # Every byte of all four vectors must have compared equal.
      if mm_movemask_epi8(matchAll) != 0xffff:
        return false
      idx += 16

    # Scalar epilogue for the remaining pixels.
    while idx < len:
      if data[idx] != reference:
        return false
      inc idx

    result = true

  proc isTransparentSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
    ## Scans the first `len` entries of `data` and returns false as soon
    ## as a non-transparent one is found, true otherwise. Delegates to
    ## `isTransparentAvx2` when AVX2 is available.
    ##
    ## The scalar loops test only the alpha channel. The vector loop ORs
    ## four 128-bit loads (16 pixels) together and requires every byte of
    ## the result to be zero.
    if cpuHasAvx2:
      return isTransparentAvx2(data, len)

    var idx = 0
    # Scalar prologue: advance until data[idx] sits on a 16-byte boundary.
    while idx < len and (cast[uint](data[idx].addr) and 15) != 0:
      if data[idx].a != 0:
        return false
      inc idx

    let
      zeros = mm_setzero_si128()
      # Index one past the last pixel covered by whole 16-pixel steps.
      vectorEnd = idx + ((len - idx) div 16) * 16
    while idx < vectorEnd:
      let
        lowPair = mm_or_si128(
          mm_load_si128(data[idx].addr),
          mm_load_si128(data[idx + 4].addr)
        )
        highPair = mm_or_si128(
          mm_load_si128(data[idx + 8].addr),
          mm_load_si128(data[idx + 12].addr)
        )
        merged = mm_or_si128(lowPair, highPair)
      if mm_movemask_epi8(mm_cmpeq_epi8(merged, zeros)) != 0xffff:
        return false
      idx += 16

    # Scalar epilogue for the remaining pixels.
    while idx < len:
      if data[idx].a != 0:
        return false
      inc idx

    result = true

  proc isOpaqueSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
    ## Returns true when the alpha channel of each of the first `len`
    ## entries of `data` is 255. Delegates to `isOpaqueAvx2` when AVX2 is
    ## available.
    ##
    ## The vector loop ANDs four 128-bit loads (16 pixels) together and
    ## inspects only the mask bits selected by 0x8888, i.e. every fourth
    ## byte of the combined vector.
    if cpuHasAvx2:
      return isOpaqueAvx2(data, len)

    var idx = 0
    # Scalar prologue: advance until data[idx] sits on a 16-byte boundary.
    while idx < len and (cast[uint](data[idx].addr) and 15) != 0:
      if data[idx].a != 255:
        return false
      inc idx

    let
      allOnes = mm_set1_epi8(255)
      # Index one past the last pixel covered by whole 16-pixel steps.
      vectorEnd = idx + ((len - idx) div 16) * 16
    while idx < vectorEnd:
      let
        lowPair = mm_and_si128(
          mm_load_si128(data[idx].addr),
          mm_load_si128(data[idx + 4].addr)
        )
        highPair = mm_and_si128(
          mm_load_si128(data[idx + 8].addr),
          mm_load_si128(data[idx + 12].addr)
        )
        merged = mm_and_si128(lowPair, highPair)
        fullBytes = mm_cmpeq_epi8(merged, allOnes)
      if (mm_movemask_epi8(fullBytes) and 0x00008888) != 0x00008888:
        return false
      idx += 16

    # Scalar epilogue for the remaining pixels.
    while idx < len:
      if data[idx].a != 255:
        return false
      inc idx

    result = true

  proc toPremultipliedAlphaSimd*(data: ptr UncheckedArray[uint32], len: int) =
    ## Multiplies the r, g and b channels of the first `len` packed pixels
    ## in `data` by their alpha channel (divided by 255), in place.
    ##
    ## With AVX2 the bulk is handled by `toPremultipliedAlphaAvx2`, whose
    ## return value is used as the index to continue from. Otherwise four
    ## pixels are processed per 128-bit step. Remaining pixels are
    ## converted one at a time.
    var idx = 0
    if cpuHasAvx2:
      idx = toPremultipliedAlphaAvx2(data, len)
    else:
      let
        alphaMask = mm_set1_epi32(cast[int32](0xff000000))
        oddMask = mm_set1_epi16(cast[int16](0xff00))
        div255 = mm_set1_epi16(cast[int16](0x8081))
        # Index one past the last pixel covered by whole 4-pixel steps.
        vectorEnd = (len div 4) * 4
      while idx < vectorEnd:
        let
          pixels = mm_loadu_si128(data[idx].addr)
          opaqueBits =
            mm_movemask_epi8(mm_cmpeq_epi8(pixels, alphaMask)) and 0x00008888
        # Skip the arithmetic when the top byte of all four pixels is 0xff.
        if opaqueBits != 0x00008888:
          let
            alpha = mm_and_si128(pixels, alphaMask)
            evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
            oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
            # Even and odd bytes are scaled separately in 16-bit lanes.
            evenScaled = mm_mulhi_epu16(mm_slli_epi16(pixels, 8), evenMultiplier)
            oddScaled = mm_mulhi_epu16(mm_and_si128(pixels, oddMask), oddMultiplier)
            evenOut = mm_srli_epi16(mm_mulhi_epu16(evenScaled, div255), 7)
            oddOut = mm_srli_epi16(mm_mulhi_epu16(oddScaled, div255), 7)
          mm_storeu_si128(
            data[idx].addr,
            mm_or_si128(evenOut, mm_slli_epi16(oddOut, 8))
          )
        idx += 4

    # Scalar epilogue: convert whatever the vector path did not cover.
    while idx < len:
      var pixel: ColorRGBX
      copyMem(pixel.addr, data[idx].addr, 4)
      pixel.r = ((pixel.r.uint32 * pixel.a) div 255).uint8
      pixel.g = ((pixel.g.uint32 * pixel.a) div 255).uint8
      pixel.b = ((pixel.b.uint32 * pixel.a) div 255).uint8
      copyMem(data[idx].addr, pixel.addr, 4)
      inc idx
|
Loading…
Reference in a new issue