From f93da30d085891ef83237fd9f75f8a906d70550f Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Wed, 22 Jun 2022 11:10:54 -0500 Subject: [PATCH 1/3] simpler --- src/pixie/paths.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index 1c8efe1..f92a43f 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1313,7 +1313,7 @@ proc computeCoverage( if fillLen > 0: var i = fillStart when defined(amd64) and allowSimd: - let sampleCoverageVec = mm_set1_epi8(cast[int8](sampleCoverage)) + let sampleCoverageVec = mm_set1_epi8(sampleCoverage) for _ in 0 ..< fillLen div 16: var coverageVec = mm_loadu_si128(coverages[i - startX].addr) coverageVec = mm_add_epi8(coverageVec, sampleCoverageVec) @@ -1354,7 +1354,7 @@ proc fillCoverage( let coverageVec = mm_loadu_si128(coverages[x - startX].unsafeAddr) eqZero = mm_cmpeq_epi8(coverageVec, mm_setzero_si128()) - eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(cast[int8](255))) + eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(255)) allZeroes = mm_movemask_epi8(eqZero) == 0xffff all255 = mm_movemask_epi8(eq255) == 0xffff yield (coverageVec, allZeroes, all255) From 3bdc6c32663e45b6378b7d58d8988c8c409370b9 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Wed, 22 Jun 2022 13:21:11 -0500 Subject: [PATCH 2/3] faster partitioning by pre-sizing partition entries --- src/pixie/paths.nim | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index f92a43f..527016c 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1131,25 +1131,43 @@ proc partitionSegments( startY = top.uint32 partitionHeight = height.uint32 div numPartitions - for (segment, winding) in segments: - var entry = initPartitionEntry(segment, winding) - if partitionHeight == 0: - result[0].entries.add(move entry) - else: + var entries = newSeq[PartitionEntry](segments.len) + for i, (segment, winding) 
in segments: + entries[i] = initPartitionEntry(segment, winding) + + if numPartitions == 1: + result[0].entries = move entries + else: + iterator partitionRange( + segment: Segment, + numPartitions, startY, partitionHeight: uint32 + ): uint32 = var atPartition = max(0, segment.at.y - startY.float32).uint32 toPartition = max(0, segment.to.y - startY.float32).uint32 atPartition = atPartition div partitionHeight toPartition = toPartition div partitionHeight - atPartition = min(atPartition, result.high.uint32) - toPartition = min(toPartition, result.high.uint32) - for i in atPartition .. toPartition: - result[i].entries.add(entry) + atPartition = min(atPartition, numPartitions - 1) + toPartition = min(toPartition, numPartitions - 1) + for partitionIndex in atPartition .. toPartition: + yield partitionIndex + + var entryCounts = newSeq[int](numPartitions) + for (segment, _) in segments: + for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight): + inc entryCounts[partitionIndex] + + for partitionIndex, entryCounts in entryCounts: + result[partitionIndex].entries.setLen(entryCounts) + + var indexes = newSeq[int](numPartitions) + for i, (segment, winding) in segments: + for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight): + result[partitionIndex].entries[indexes[partitionIndex]] = entries[i] + inc indexes[partitionIndex] # Set the bottom values for the partitions (y value where this partition ends) - var partitionBottom = top + partitionHeight.int - for partition in result.mitems: partition.bottom = partitionBottom partition.requiresAntiAliasing = From 1cfaea935796b42ef79f24bd2c274590874ba6cb Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Wed, 22 Jun 2022 14:57:04 -0500 Subject: [PATCH 3/3] avx fill --- src/pixie/internal.nim | 19 ++++++++++++++----- src/pixie/simd/avx.nim | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 5 deletions(-) create mode 100644 src/pixie/simd/avx.nim 
diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index f9b255d..448d955 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -3,7 +3,8 @@ import chroma, common, system/memory, vmath const allowSimd* = not defined(pixieNoSimd) and not defined(tcc) when defined(amd64) and allowSimd: - import nimsimd/sse2 + import nimsimd/runtimecheck, nimsimd/sse2, simd/avx + let cpuHasAvx* = checkInstructionSets({AVX}) template currentExceptionAsPixieError*(): untyped = ## Gets the current exception and returns it as a PixieError with stack trace. @@ -63,6 +64,13 @@ proc fillUnsafe*( ## Fills the image data with the color starting at index start and ## continuing for len indices. let rgbx = color.asRgbx() + + # If we can use AVX, do so + when defined(amd64) and allowSimd: + if cpuHasAvx and len >= 64: + fillUnsafeAvx(data, rgbx, start, len) + return + # Use memset when every byte has the same value if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a: nimSetMem(data[start].addr, rgbx.r.cint, len * 4) @@ -70,14 +78,15 @@ proc fillUnsafe*( var i = start when defined(amd64) and allowSimd: # Align to 16 bytes - while i < (start + len) and (cast[uint](data[i].addr) and 15) != 0: + var p = cast[uint](data[i].addr) + while i < (start + len) and (p and 15) != 0: data[i] = rgbx inc i + p += 4 # When supported, SIMD fill until we run out of room let colorVec = mm_set1_epi32(cast[int32](rgbx)) iterations = (start + len - i) div 8 - var p = cast[uint](data[i].addr) for _ in 0 ..< iterations: mm_store_si128(cast[pointer](p), colorVec) mm_store_si128(cast[pointer](p + 16), colorVec) @@ -93,8 +102,8 @@ proc fillUnsafe*( copyMem(data[i].addr, u64.addr, 8) i += 2 # Fill whatever is left the slow way - for j in i ..< start + len: - data[j] = rgbx + for i in i ..< start + len: + data[i] = rgbx const straightAlphaTable = block: var table: array[256, array[256, uint8]] diff --git a/src/pixie/simd/avx.nim b/src/pixie/simd/avx.nim new file mode 100644 index 
0000000..2a3b9d2
--- /dev/null
+++ b/src/pixie/simd/avx.nim
@@ -0,0 +1,40 @@
+import chroma, nimsimd/avx
+
+when defined(gcc) or defined(clang):
+  {.localPassc: "-mavx".}
+
+when defined(release):
+  {.push checks: off.}
+
+proc fillUnsafeAvx*(
+  data: var seq[ColorRGBX],
+  rgbx: ColorRGBX,
+  start, len: int
+) =
+  ## Writes rgbx into data[start ..< start + len]: single pixels until the
+  ## write address is 32-byte aligned, then aligned 256-bit stores of 8
+  ## pixels each, then single pixels for whatever remains.
+  let stop = start + len
+  var
+    index = start
+    address = cast[uint](data[index].addr)
+  # Scalar head: step one 4-byte pixel at a time up to a 32-byte boundary
+  while index < stop and (address and 31) != 0:
+    data[index] = rgbx
+    index += 1
+    address += 4
+  # Vector body: every aligned store covers 32 bytes (8 pixels)
+  let
+    colorVec = mm256_set1_epi32(cast[int32](rgbx))
+    vectorStores = (stop - index) div 8
+  for _ in 0 ..< vectorStores:
+    mm256_store_si256(cast[pointer](address), colorVec)
+    address += 32
+  index += vectorStores * 8
+  # Scalar tail: fewer than 8 pixels are left
+  while index < stop:
+    data[index] = rgbx
+    inc index
+
+when defined(release):
+  {.pop.}