avx fill
This commit is contained in:
parent
3bdc6c3266
commit
1cfaea9357
2 changed files with 49 additions and 5 deletions
|
@ -3,7 +3,8 @@ import chroma, common, system/memory, vmath
|
|||
const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
|
||||
|
||||
when defined(amd64) and allowSimd:
|
||||
import nimsimd/sse2
|
||||
import nimsimd/runtimecheck, nimsimd/sse2, simd/avx
|
||||
let cpuHasAvx* = checkInstructionSets({AVX})
|
||||
|
||||
template currentExceptionAsPixieError*(): untyped =
|
||||
## Gets the current exception and returns it as a PixieError with stack trace.
|
||||
|
@ -63,6 +64,13 @@ proc fillUnsafe*(
|
|||
## Fills the image data with the color starting at index start and
|
||||
## continuing for len indices.
|
||||
let rgbx = color.asRgbx()
|
||||
|
||||
# If we can use AVX, do so
|
||||
when defined(amd64) and allowSimd:
|
||||
if cpuHasAvx and len >= 64:
|
||||
fillUnsafeAvx(data, rgbx, start, len)
|
||||
return
|
||||
|
||||
# Use memset when every byte has the same value
|
||||
if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
|
||||
nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
|
||||
|
@ -70,14 +78,15 @@ proc fillUnsafe*(
|
|||
var i = start
|
||||
when defined(amd64) and allowSimd:
|
||||
# Align to 16 bytes
|
||||
while i < (start + len) and (cast[uint](data[i].addr) and 15) != 0:
|
||||
var p = cast[uint](data[i].addr)
|
||||
while i < (start + len) and (p and 15) != 0:
|
||||
data[i] = rgbx
|
||||
inc i
|
||||
p += 4
|
||||
# When supported, SIMD fill until we run out of room
|
||||
let
|
||||
colorVec = mm_set1_epi32(cast[int32](rgbx))
|
||||
iterations = (start + len - i) div 8
|
||||
var p = cast[uint](data[i].addr)
|
||||
for _ in 0 ..< iterations:
|
||||
mm_store_si128(cast[pointer](p), colorVec)
|
||||
mm_store_si128(cast[pointer](p + 16), colorVec)
|
||||
|
@ -93,8 +102,8 @@ proc fillUnsafe*(
|
|||
copyMem(data[i].addr, u64.addr, 8)
|
||||
i += 2
|
||||
# Fill whatever is left the slow way
|
||||
for j in i ..< start + len:
|
||||
data[j] = rgbx
|
||||
for i in i ..< start + len:
|
||||
data[i] = rgbx
|
||||
|
||||
const straightAlphaTable = block:
|
||||
var table: array[256, array[256, uint8]]
|
||||
|
|
35
src/pixie/simd/avx.nim
Normal file
35
src/pixie/simd/avx.nim
Normal file
|
@ -0,0 +1,35 @@
|
|||
import chroma, nimsimd/avx
|
||||
|
||||
when defined(gcc) or defined(clang):
  # The AVX intrinsics used below need AVX code generation enabled in the C
  # compiler. localPassc applies the flag only to the C file generated for
  # this module, so the rest of the build is not compiled with -mavx.
  {.localPassc: "-mavx".}

when defined(release):
  # Release builds drop runtime checks for everything up to the matching pop.
  {.push checks: off.}

proc fillUnsafeAvx*(
  data: var seq[ColorRGBX],
  rgbx: ColorRGBX,
  start, len: int
) =
  ## Fills `len` entries of `data`, beginning at index `start`, with `rgbx`,
  ## using 32-byte aligned AVX stores for the bulk of the range.
  ##
  ## The bulk of the range is written through raw pointers, so the caller
  ## must ensure that `start ..< start + len` lies inside `data` and that the
  ## CPU supports AVX. A `len` of zero or less is a no-op.
  if len <= 0:
    # Nothing to fill. Returning here also avoids indexing data[start] when
    # start == data.len (for example an empty seq).
    return

  let stop = start + len
  var
    i = start
    p = cast[uint](data[i].addr)
  # Write single entries (4 bytes each) until p is 32-byte aligned, which the
  # aligned store below requires.
  while i < stop and (p and 31) != 0:
    data[i] = rgbx
    inc i
    p += 4
  # AVX fill: each store writes 32 bytes, i.e. 8 entries at a time.
  let
    iterations = (stop - i) div 8
    colorVec = mm256_set1_epi32(cast[int32](rgbx))
  for _ in 0 ..< iterations:
    mm256_store_si256(cast[pointer](p), colorVec)
    p += 32
  i += iterations * 8
  # Fill whatever is left (fewer than 8 entries) the slow way
  for j in i ..< stop:
    data[j] = rgbx

when defined(release):
  {.pop.}
|
Loading…
Reference in a new issue