Merge pull request #450 from guzba/master
a little faster partitioning, runtime-checked avx fill
This commit is contained in:
commit
4afb0e58c4
3 changed files with 80 additions and 18 deletions
|
@ -3,7 +3,8 @@ import chroma, common, system/memory, vmath
|
|||
const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
|
||||
|
||||
when defined(amd64) and allowSimd:
|
||||
import nimsimd/sse2
|
||||
import nimsimd/runtimecheck, nimsimd/sse2, simd/avx
|
||||
let cpuHasAvx* = checkInstructionSets({AVX})
|
||||
|
||||
template currentExceptionAsPixieError*(): untyped =
|
||||
## Gets the current exception and returns it as a PixieError with stack trace.
|
||||
|
@ -63,6 +64,13 @@ proc fillUnsafe*(
|
|||
## Fills the image data with the color starting at index start and
|
||||
## continuing for len indices.
|
||||
let rgbx = color.asRgbx()
|
||||
|
||||
# If we can use AVX, do so
|
||||
when defined(amd64) and allowSimd:
|
||||
if cpuHasAvx and len >= 64:
|
||||
fillUnsafeAvx(data, rgbx, start, len)
|
||||
return
|
||||
|
||||
# Use memset when every byte has the same value
|
||||
if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
|
||||
nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
|
||||
|
@ -70,14 +78,15 @@ proc fillUnsafe*(
|
|||
var i = start
|
||||
when defined(amd64) and allowSimd:
|
||||
# Align to 16 bytes
|
||||
while i < (start + len) and (cast[uint](data[i].addr) and 15) != 0:
|
||||
var p = cast[uint](data[i].addr)
|
||||
while i < (start + len) and (p and 15) != 0:
|
||||
data[i] = rgbx
|
||||
inc i
|
||||
p += 4
|
||||
# When supported, SIMD fill until we run out of room
|
||||
let
|
||||
colorVec = mm_set1_epi32(cast[int32](rgbx))
|
||||
iterations = (start + len - i) div 8
|
||||
var p = cast[uint](data[i].addr)
|
||||
for _ in 0 ..< iterations:
|
||||
mm_store_si128(cast[pointer](p), colorVec)
|
||||
mm_store_si128(cast[pointer](p + 16), colorVec)
|
||||
|
@ -93,8 +102,8 @@ proc fillUnsafe*(
|
|||
copyMem(data[i].addr, u64.addr, 8)
|
||||
i += 2
|
||||
# Fill whatever is left the slow way
|
||||
for j in i ..< start + len:
|
||||
data[j] = rgbx
|
||||
for i in i ..< start + len:
|
||||
data[i] = rgbx
|
||||
|
||||
const straightAlphaTable = block:
|
||||
var table: array[256, array[256, uint8]]
|
||||
|
|
|
@ -1131,25 +1131,43 @@ proc partitionSegments(
|
|||
startY = top.uint32
|
||||
partitionHeight = height.uint32 div numPartitions
|
||||
|
||||
for (segment, winding) in segments:
|
||||
var entry = initPartitionEntry(segment, winding)
|
||||
if partitionHeight == 0:
|
||||
result[0].entries.add(move entry)
|
||||
else:
|
||||
var entries = newSeq[PartitionEntry](segments.len)
|
||||
for i, (segment, winding) in segments:
|
||||
entries[i] = initPartitionEntry(segment, winding)
|
||||
|
||||
if numPartitions == 1:
|
||||
result[0].entries = move entries
|
||||
else:
|
||||
iterator partitionRange(
|
||||
segment: Segment,
|
||||
numPartitions, startY, partitionHeight: uint32
|
||||
): uint32 =
|
||||
var
|
||||
atPartition = max(0, segment.at.y - startY.float32).uint32
|
||||
toPartition = max(0, segment.to.y - startY.float32).uint32
|
||||
atPartition = atPartition div partitionHeight
|
||||
toPartition = toPartition div partitionHeight
|
||||
atPartition = min(atPartition, result.high.uint32)
|
||||
toPartition = min(toPartition, result.high.uint32)
|
||||
for i in atPartition .. toPartition:
|
||||
result[i].entries.add(entry)
|
||||
atPartition = min(atPartition, numPartitions - 1)
|
||||
toPartition = min(toPartition, numPartitions - 1)
|
||||
for partitionIndex in atPartition .. toPartition:
|
||||
yield partitionIndex
|
||||
|
||||
var entryCounts = newSeq[int](numPartitions)
|
||||
for (segment, _) in segments:
|
||||
for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight):
|
||||
inc entryCounts[partitionIndex]
|
||||
|
||||
for partitionIndex, entryCounts in entryCounts:
|
||||
result[partitionIndex].entries.setLen(entryCounts)
|
||||
|
||||
var indexes = newSeq[int](numPartitions)
|
||||
for i, (segment, winding) in segments:
|
||||
for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight):
|
||||
result[partitionIndex].entries[indexes[partitionIndex]] = entries[i]
|
||||
inc indexes[partitionIndex]
|
||||
|
||||
# Set the bottom values for the partitions (y value where this partition ends)
|
||||
|
||||
var partitionBottom = top + partitionHeight.int
|
||||
|
||||
for partition in result.mitems:
|
||||
partition.bottom = partitionBottom
|
||||
partition.requiresAntiAliasing =
|
||||
|
@ -1313,7 +1331,7 @@ proc computeCoverage(
|
|||
if fillLen > 0:
|
||||
var i = fillStart
|
||||
when defined(amd64) and allowSimd:
|
||||
let sampleCoverageVec = mm_set1_epi8(cast[int8](sampleCoverage))
|
||||
let sampleCoverageVec = mm_set1_epi8(sampleCoverage)
|
||||
for _ in 0 ..< fillLen div 16:
|
||||
var coverageVec = mm_loadu_si128(coverages[i - startX].addr)
|
||||
coverageVec = mm_add_epi8(coverageVec, sampleCoverageVec)
|
||||
|
@ -1354,7 +1372,7 @@ proc fillCoverage(
|
|||
let
|
||||
coverageVec = mm_loadu_si128(coverages[x - startX].unsafeAddr)
|
||||
eqZero = mm_cmpeq_epi8(coverageVec, mm_setzero_si128())
|
||||
eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(cast[int8](255)))
|
||||
eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(255))
|
||||
allZeroes = mm_movemask_epi8(eqZero) == 0xffff
|
||||
all255 = mm_movemask_epi8(eq255) == 0xffff
|
||||
yield (coverageVec, allZeroes, all255)
|
||||
|
|
35
src/pixie/simd/avx.nim
Normal file
35
src/pixie/simd/avx.nim
Normal file
|
@ -0,0 +1,35 @@
|
|||
import chroma, nimsimd/avx
|
||||
|
||||
when defined(gcc) or defined(clang):
|
||||
{.localPassc: "-mavx".}
|
||||
|
||||
when defined(release):
|
||||
{.push checks: off.}
|
||||
|
||||
proc fillUnsafeAvx*(
  data: var seq[ColorRGBX],
  rgbx: ColorRGBX,
  start, len: int
) =
  ## Writes `rgbx` into `len` consecutive entries of `data`, beginning at
  ## index `start`. Entries are written one at a time until the write
  ## address is 32-byte aligned, then in groups of eight with aligned
  ## 256-bit stores, and finally one at a time for what is left over.
  var
    idx = start
    address = cast[uint](data[idx].addr)
  let stop = start + len

  # Step entry by entry (4 bytes each) until the address reaches a
  # 32-byte boundary or the range is exhausted.
  while idx < stop and (address and 31) != 0:
    data[idx] = rgbx
    idx += 1
    address += 4

  # Every aligned store covers 32 bytes, i.e. eight entries.
  let
    vecStores = (stop - idx) div 8
    splat = mm256_set1_epi32(cast[int32](rgbx))
  for _ in 0 ..< vecStores:
    mm256_store_si256(cast[pointer](address), splat)
    address += 32
  idx += vecStores * 8

  # Fewer than eight entries remain; finish them individually.
  while idx < stop:
    data[idx] = rgbx
    inc idx
|
||||
|
||||
when defined(release):
|
||||
{.pop.}
|
Loading…
Reference in a new issue