Merge pull request #450 from guzba/master
a little faster partitioning, runtime-checked avx fill
Commit 4afb0e58c4
3 changed files with 80 additions and 18 deletions

@@ -3,7 +3,8 @@ import chroma, common, system/memory, vmath
 const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
 
 when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+  import nimsimd/runtimecheck, nimsimd/sse2, simd/avx
+  let cpuHasAvx* = checkInstructionSets({AVX})
 
 template currentExceptionAsPixieError*(): untyped =
   ## Gets the current exception and returns it as a PixieError with stack trace.
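
The new `cpuHasAvx` global is the runtime half of the dispatch: the CPU is queried once at module load and the flag is branched on per call. A minimal standalone sketch of the same pattern, not part of the PR; only `checkInstructionSets({AVX})` is taken from the diff, the helper name and threshold wording are illustrative:

import nimsimd/runtimecheck

# Query the CPU once (CPUID on x86); the boolean is then cheap to branch on.
let hasAvx = checkInstructionSets({AVX})

proc avxFillWorthwhile(len: int): bool =
  # Mirrors the gate fillUnsafe uses below: AVX only for fills of at least
  # 64 pixels, and only on CPUs that actually report AVX support.
  hasAvx and len >= 64

when isMainModule:
  echo "use AVX fill for 256 pixels: ", avxFillWorthwhile(256)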

@@ -63,6 +64,13 @@ proc fillUnsafe*(
   ## Fills the image data with the color starting at index start and
   ## continuing for len indices.
   let rgbx = color.asRgbx()
 
+  # If we can use AVX, do so
+  when defined(amd64) and allowSimd:
+    if cpuHasAvx and len >= 64:
+      fillUnsafeAvx(data, rgbx, start, len)
+      return
+
   # Use memset when every byte has the same value
   if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
     nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
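
A note on the memset branch kept above: it is valid because when all four channels hold the same byte, the 4-byte RGBX pattern is that byte repeated, so a byte-wise fill produces identical memory. A tiny illustration, assuming chroma's `rgbx` constructor; not part of the PR:

import chroma

let gray = rgbx(128, 128, 128, 128)
# All four bytes are equal, so the 32-bit pixel is one byte repeated four
# times and nimSetMem/memset can fill it byte by byte.
doAssert cast[uint32](gray) == 0x80808080'u32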

@@ -70,14 +78,15 @@ proc fillUnsafe*(
     var i = start
     when defined(amd64) and allowSimd:
       # Align to 16 bytes
-      while i < (start + len) and (cast[uint](data[i].addr) and 15) != 0:
+      var p = cast[uint](data[i].addr)
+      while i < (start + len) and (p and 15) != 0:
         data[i] = rgbx
         inc i
+        p += 4
       # When supported, SIMD fill until we run out of room
       let
         colorVec = mm_set1_epi32(cast[int32](rgbx))
         iterations = (start + len - i) div 8
-      var p = cast[uint](data[i].addr)
       for _ in 0 ..< iterations:
         mm_store_si128(cast[pointer](p), colorVec)
         mm_store_si128(cast[pointer](p + 16), colorVec)
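
The reworked loop above keeps the write address in `p` instead of re-deriving it from `data[i].addr` on every iteration, and the same pointer then feeds the aligned SSE stores. For reference, the alignment arithmetic the while loop performs, pulled out into a standalone helper; the helper is illustrative and not part of the PR:

proc scalarPrefixLen(address: uint): uint =
  ## How many 4-byte pixels must be written one at a time before the write
  ## pointer reaches a 16-byte boundary, i.e. what the alignment loop does.
  ((16'u - (address and 15)) and 15) div 4

doAssert scalarPrefixLen(0x1000'u) == 0 # already aligned
doAssert scalarPrefixLen(0x1004'u) == 3 # 12 more bytes, i.e. 3 pixels
doAssert scalarPrefixLen(0x100c'u) == 1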

@@ -93,8 +102,8 @@ proc fillUnsafe*(
           copyMem(data[i].addr, u64.addr, 8)
           i += 2
     # Fill whatever is left the slow way
-    for j in i ..< start + len:
-      data[j] = rgbx
+    for i in i ..< start + len:
+      data[i] = rgbx
 
 const straightAlphaTable = block:
   var table: array[256, array[256, uint8]]

@@ -1131,25 +1131,43 @@ proc partitionSegments(
     startY = top.uint32
     partitionHeight = height.uint32 div numPartitions
 
-  for (segment, winding) in segments:
-    var entry = initPartitionEntry(segment, winding)
-    if partitionHeight == 0:
-      result[0].entries.add(move entry)
-    else:
+  var entries = newSeq[PartitionEntry](segments.len)
+  for i, (segment, winding) in segments:
+    entries[i] = initPartitionEntry(segment, winding)
+
+  if numPartitions == 1:
+    result[0].entries = move entries
+  else:
+    iterator partitionRange(
+      segment: Segment,
+      numPartitions, startY, partitionHeight: uint32
+    ): uint32 =
       var
         atPartition = max(0, segment.at.y - startY.float32).uint32
         toPartition = max(0, segment.to.y - startY.float32).uint32
       atPartition = atPartition div partitionHeight
       toPartition = toPartition div partitionHeight
-      atPartition = min(atPartition, result.high.uint32)
-      toPartition = min(toPartition, result.high.uint32)
-      for i in atPartition .. toPartition:
-        result[i].entries.add(entry)
+      atPartition = min(atPartition, numPartitions - 1)
+      toPartition = min(toPartition, numPartitions - 1)
+      for partitionIndex in atPartition .. toPartition:
+        yield partitionIndex
+
+    var entryCounts = newSeq[int](numPartitions)
+    for (segment, _) in segments:
+      for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight):
+        inc entryCounts[partitionIndex]
+
+    for partitionIndex, entryCount in entryCounts:
+      result[partitionIndex].entries.setLen(entryCount)
+
+    var indexes = newSeq[int](numPartitions)
+    for i, (segment, winding) in segments:
+      for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight):
+        result[partitionIndex].entries[indexes[partitionIndex]] = entries[i]
+        inc indexes[partitionIndex]
 
   # Set the bottom values for the partitions (y value where this partition ends)
   var partitionBottom = top + partitionHeight.int
   for partition in result.mitems:
     partition.bottom = partitionBottom
     partition.requiresAntiAliasing =
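
The partitioning change replaces per-segment `entries.add` calls with a two-pass scheme: count how many entries each partition will receive, `setLen` each partition's seq exactly once, then place entries by index. A minimal sketch of that pattern on a toy bucket example; the names and the modulo bucketing are illustrative, not from the PR:

proc bucketize(values: seq[int], numBuckets: int): seq[seq[int]] =
  result.setLen(numBuckets)

  var counts = newSeq[int](numBuckets)
  for v in values:
    inc counts[v mod numBuckets]      # pass 1: count per bucket

  for i, count in counts:
    result[i].setLen(count)           # size each bucket exactly once

  var indexes = newSeq[int](numBuckets)
  for v in values:
    let b = v mod numBuckets
    result[b][indexes[b]] = v         # pass 2: place without reallocating
    inc indexes[b]

doAssert bucketize(@[1, 2, 3, 4, 5, 6], 2) == @[@[2, 4, 6], @[1, 3, 5]]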

@@ -1313,7 +1331,7 @@ proc computeCoverage(
     if fillLen > 0:
       var i = fillStart
       when defined(amd64) and allowSimd:
-        let sampleCoverageVec = mm_set1_epi8(cast[int8](sampleCoverage))
+        let sampleCoverageVec = mm_set1_epi8(sampleCoverage)
         for _ in 0 ..< fillLen div 16:
           var coverageVec = mm_loadu_si128(coverages[i - startX].addr)
           coverageVec = mm_add_epi8(coverageVec, sampleCoverageVec)

@@ -1354,7 +1372,7 @@ proc fillCoverage(
       let
         coverageVec = mm_loadu_si128(coverages[x - startX].unsafeAddr)
         eqZero = mm_cmpeq_epi8(coverageVec, mm_setzero_si128())
-        eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(cast[int8](255)))
+        eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(255))
         allZeroes = mm_movemask_epi8(eqZero) == 0xffff
         all255 = mm_movemask_epi8(eq255) == 0xffff
       yield (coverageVec, allZeroes, all255)
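
Both coverage hunks rely on the same SSE2 idiom: compare all 16 coverage bytes against 0 and 255 at once and use `mm_movemask_epi8` to test whether every lane matched, which selects a skip path or a fully-opaque fast path. A scalar equivalent of that classification, illustrative only and not part of the PR:

proc classifyCoverage(block16: openArray[uint8]): tuple[allZeroes, all255: bool] =
  ## What eqZero/eq255 plus mm_movemask_epi8 == 0xffff compute, byte by byte.
  result = (allZeroes: true, all255: true)
  for c in block16:
    if c != 0: result.allZeroes = false
    if c != 255: result.all255 = false

doAssert classifyCoverage([0'u8, 0, 0, 0]).allZeroes
doAssert classifyCoverage([255'u8, 255, 255, 255]).all255
doAssert not classifyCoverage([0'u8, 128, 255, 7]).allZeroes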

src/pixie/simd/avx.nim (new file, +35)
@@ -0,0 +1,35 @@
+import chroma, nimsimd/avx
+
+when defined(gcc) or defined(clang):
+  {.localPassc: "-mavx".}
+
+when defined(release):
+  {.push checks: off.}
+
+proc fillUnsafeAvx*(
+  data: var seq[ColorRGBX],
+  rgbx: ColorRGBX,
+  start, len: int
+) =
+  var
+    i = start
+    p = cast[uint](data[i].addr)
+  # Align to 32 bytes
+  while i < (start + len) and (p and 31) != 0:
+    data[i] = rgbx
+    inc i
+    p += 4
+  # When supported, SIMD fill until we run out of room
+  let
+    iterations = (start + len - i) div 8
+    colorVec = mm256_set1_epi32(cast[int32](rgbx))
+  for _ in 0 ..< iterations:
+    mm256_store_si256(cast[pointer](p), colorVec)
+    p += 32
+  i += iterations * 8
+  # Fill whatever is left the slow way
+  for i in i ..< start + len:
+    data[i] = rgbx
+
+when defined(release):
+  {.pop.}
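
Because `{.localPassc: "-mavx".}` applies only to this module, AVX codegen stays out of the rest of the build, and callers reach the kernel only through the runtime check in `fillUnsafe`. Nothing changes for users of the public API; a small usage sketch with standard pixie calls, where the 16x16 size is arbitrary but large enough to clear the 64-pixel threshold:

import chroma, pixie

# Image.fill ends up in fillUnsafe, which after this PR transparently
# dispatches to fillUnsafeAvx when cpuHasAvx is true and the fill is large.
var image = newImage(16, 16)
image.fill(rgbx(255, 0, 0, 255))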