Merge pull request #450 from guzba/master

a little faster partitioning, runtime-checked avx fill
This commit is contained in:
Andre von Houck 2022-06-23 14:45:16 -07:00 committed by GitHub
commit 4afb0e58c4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 80 additions and 18 deletions

View file

@ -3,7 +3,8 @@ import chroma, common, system/memory, vmath
const allowSimd* = not defined(pixieNoSimd) and not defined(tcc) const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
when defined(amd64) and allowSimd: when defined(amd64) and allowSimd:
import nimsimd/sse2 import nimsimd/runtimecheck, nimsimd/sse2, simd/avx
let cpuHasAvx* = checkInstructionSets({AVX})
template currentExceptionAsPixieError*(): untyped = template currentExceptionAsPixieError*(): untyped =
## Gets the current exception and returns it as a PixieError with stack trace. ## Gets the current exception and returns it as a PixieError with stack trace.
@ -63,6 +64,13 @@ proc fillUnsafe*(
## Fills the image data with the color starting at index start and ## Fills the image data with the color starting at index start and
## continuing for len indices. ## continuing for len indices.
let rgbx = color.asRgbx() let rgbx = color.asRgbx()
# If we can use AVX, do so
when defined(amd64) and allowSimd:
if cpuHasAvx and len >= 64:
fillUnsafeAvx(data, rgbx, start, len)
return
# Use memset when every byte has the same value # Use memset when every byte has the same value
if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a: if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
nimSetMem(data[start].addr, rgbx.r.cint, len * 4) nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
@ -70,14 +78,15 @@ proc fillUnsafe*(
var i = start var i = start
when defined(amd64) and allowSimd: when defined(amd64) and allowSimd:
# Align to 16 bytes # Align to 16 bytes
while i < (start + len) and (cast[uint](data[i].addr) and 15) != 0: var p = cast[uint](data[i].addr)
while i < (start + len) and (p and 15) != 0:
data[i] = rgbx data[i] = rgbx
inc i inc i
p += 4
# When supported, SIMD fill until we run out of room # When supported, SIMD fill until we run out of room
let let
colorVec = mm_set1_epi32(cast[int32](rgbx)) colorVec = mm_set1_epi32(cast[int32](rgbx))
iterations = (start + len - i) div 8 iterations = (start + len - i) div 8
var p = cast[uint](data[i].addr)
for _ in 0 ..< iterations: for _ in 0 ..< iterations:
mm_store_si128(cast[pointer](p), colorVec) mm_store_si128(cast[pointer](p), colorVec)
mm_store_si128(cast[pointer](p + 16), colorVec) mm_store_si128(cast[pointer](p + 16), colorVec)
@ -93,8 +102,8 @@ proc fillUnsafe*(
copyMem(data[i].addr, u64.addr, 8) copyMem(data[i].addr, u64.addr, 8)
i += 2 i += 2
# Fill whatever is left the slow way # Fill whatever is left the slow way
for j in i ..< start + len: for i in i ..< start + len:
data[j] = rgbx data[i] = rgbx
const straightAlphaTable = block: const straightAlphaTable = block:
var table: array[256, array[256, uint8]] var table: array[256, array[256, uint8]]

View file

@ -1131,25 +1131,43 @@ proc partitionSegments(
startY = top.uint32 startY = top.uint32
partitionHeight = height.uint32 div numPartitions partitionHeight = height.uint32 div numPartitions
for (segment, winding) in segments: var entries = newSeq[PartitionEntry](segments.len)
var entry = initPartitionEntry(segment, winding) for i, (segment, winding) in segments:
if partitionHeight == 0: entries[i] = initPartitionEntry(segment, winding)
result[0].entries.add(move entry)
else: if numPartitions == 1:
result[0].entries = move entries
else:
iterator partitionRange(
segment: Segment,
numPartitions, startY, partitionHeight: uint32
): uint32 =
var var
atPartition = max(0, segment.at.y - startY.float32).uint32 atPartition = max(0, segment.at.y - startY.float32).uint32
toPartition = max(0, segment.to.y - startY.float32).uint32 toPartition = max(0, segment.to.y - startY.float32).uint32
atPartition = atPartition div partitionHeight atPartition = atPartition div partitionHeight
toPartition = toPartition div partitionHeight toPartition = toPartition div partitionHeight
atPartition = min(atPartition, result.high.uint32) atPartition = min(atPartition, numPartitions - 1)
toPartition = min(toPartition, result.high.uint32) toPartition = min(toPartition, numPartitions - 1)
for i in atPartition .. toPartition: for partitionIndex in atPartition .. toPartition:
result[i].entries.add(entry) yield partitionIndex
var entryCounts = newSeq[int](numPartitions)
for (segment, _) in segments:
for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight):
inc entryCounts[partitionIndex]
for partitionIndex, entryCounts in entryCounts:
result[partitionIndex].entries.setLen(entryCounts)
var indexes = newSeq[int](numPartitions)
for i, (segment, winding) in segments:
for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight):
result[partitionIndex].entries[indexes[partitionIndex]] = entries[i]
inc indexes[partitionIndex]
# Set the bottom values for the partitions (y value where this partition ends) # Set the bottom values for the partitions (y value where this partition ends)
var partitionBottom = top + partitionHeight.int var partitionBottom = top + partitionHeight.int
for partition in result.mitems: for partition in result.mitems:
partition.bottom = partitionBottom partition.bottom = partitionBottom
partition.requiresAntiAliasing = partition.requiresAntiAliasing =
@ -1313,7 +1331,7 @@ proc computeCoverage(
if fillLen > 0: if fillLen > 0:
var i = fillStart var i = fillStart
when defined(amd64) and allowSimd: when defined(amd64) and allowSimd:
let sampleCoverageVec = mm_set1_epi8(cast[int8](sampleCoverage)) let sampleCoverageVec = mm_set1_epi8(sampleCoverage)
for _ in 0 ..< fillLen div 16: for _ in 0 ..< fillLen div 16:
var coverageVec = mm_loadu_si128(coverages[i - startX].addr) var coverageVec = mm_loadu_si128(coverages[i - startX].addr)
coverageVec = mm_add_epi8(coverageVec, sampleCoverageVec) coverageVec = mm_add_epi8(coverageVec, sampleCoverageVec)
@ -1354,7 +1372,7 @@ proc fillCoverage(
let let
coverageVec = mm_loadu_si128(coverages[x - startX].unsafeAddr) coverageVec = mm_loadu_si128(coverages[x - startX].unsafeAddr)
eqZero = mm_cmpeq_epi8(coverageVec, mm_setzero_si128()) eqZero = mm_cmpeq_epi8(coverageVec, mm_setzero_si128())
eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(cast[int8](255))) eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(255))
allZeroes = mm_movemask_epi8(eqZero) == 0xffff allZeroes = mm_movemask_epi8(eqZero) == 0xffff
all255 = mm_movemask_epi8(eq255) == 0xffff all255 = mm_movemask_epi8(eq255) == 0xffff
yield (coverageVec, allZeroes, all255) yield (coverageVec, allZeroes, all255)

35
src/pixie/simd/avx.nim Normal file
View file

@ -0,0 +1,35 @@
import chroma, nimsimd/avx
# Ask the C compiler to enable AVX code generation. `localPassc` applies the
# flag only to the C file generated from this module, and it is only requested
# for the GCC and Clang backends.
when defined(gcc) or defined(clang):
{.localPassc: "-mavx".}
# In release builds, turn runtime checks off for the code that follows. The
# matching `{.pop.}` is at the end of this file.
when defined(release):
{.push checks: off.}
proc fillUnsafeAvx*(
  data: var seq[ColorRGBX],
  rgbx: ColorRGBX,
  start, len: int
) =
  ## Writes `rgbx` into `len` consecutive elements of `data`, beginning at
  ## index `start`. Elements are stored one at a time until the write address
  ## is a multiple of 32, then eight at a time with 256-bit stores, then one
  ## at a time again for whatever is left over.
  ## The proc performs no range validation of its own.
  var
    idx = start
    address = cast[uint](data[idx].addr)
  let stop = start + len

  # Scalar stores until the address reaches a 32-byte boundary. The address
  # moves forward 4 bytes for every element written.
  while idx < stop and (address and 31) != 0:
    data[idx] = rgbx
    idx += 1
    address += 4

  # Every vector store covers 32 bytes, which is 8 elements.
  let
    vecStores = (stop - idx) div 8
    fillVec = mm256_set1_epi32(cast[int32](rgbx))
  for _ in 0 ..< vecStores:
    mm256_store_si256(cast[pointer](address), fillVec)
    address += 32
  idx += vecStores * 8

  # Scalar stores for the tail that did not fill a whole vector.
  while idx < stop:
    data[idx] = rgbx
    inc idx
# Undo the `checks: off` push made before the proc (release builds only).
when defined(release):
{.pop.}