Merge pull request #450 from guzba/master

a little faster partitioning, runtime-checked avx fill
2022-06-23 14:45:16 -07:00 · 2022-06-23 14:45:16 -07:00 · 4afb0e58c4
commit 4afb0e58c4
parent b2e145130a 1cfaea9357
3 changed files with 80 additions and 18 deletions
--- a/src/pixie/internal.nim
+++ b/src/pixie/internal.nim
@ -3,7 +3,8 @@ import chroma, common, system/memory, vmath
 const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)

 when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+  import nimsimd/runtimecheck, nimsimd/sse2, simd/avx
+  let cpuHasAvx* = checkInstructionSets({AVX})

 template currentExceptionAsPixieError*(): untyped =
  ## Gets the current exception and returns it as a PixieError with stack trace.
@ -63,6 +64,13 @@ proc fillUnsafe*(
  ## Fills the image data with the color starting at index start and
  ## continuing for len indices.
  let rgbx = color.asRgbx()
+
+  # If we can use AVX, do so
+  when defined(amd64) and allowSimd:
+    if cpuHasAvx and len >= 64:
+      fillUnsafeAvx(data, rgbx, start, len)
+      return
+
  # Use memset when every byte has the same value
  if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
    nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
@ -70,14 +78,15 @@ proc fillUnsafe*(
    var i = start
    when defined(amd64) and allowSimd:
      # Align to 16 bytes
-      while i < (start + len) and (cast[uint](data[i].addr) and 15) != 0:
+      var p = cast[uint](data[i].addr)
+      while i < (start + len) and (p and 15) != 0:
        data[i] = rgbx
        inc i
+        p += 4
      # When supported, SIMD fill until we run out of room
      let
        colorVec = mm_set1_epi32(cast[int32](rgbx))
        iterations = (start + len - i) div 8
-      var p = cast[uint](data[i].addr)
      for _ in 0 ..< iterations:
        mm_store_si128(cast[pointer](p), colorVec)
        mm_store_si128(cast[pointer](p + 16), colorVec)
@ -93,8 +102,8 @@ proc fillUnsafe*(
          copyMem(data[i].addr, u64.addr, 8)
          i += 2
    # Fill whatever is left the slow way
-    for j in i ..< start + len:
-      data[j] = rgbx
+    for i in i ..< start + len:
+      data[i] = rgbx

 const straightAlphaTable = block:
  var table: array[256, array[256, uint8]]
--- a/src/pixie/paths.nim
+++ b/src/pixie/paths.nim
@ -1131,25 +1131,43 @@ proc partitionSegments(
    startY = top.uint32
    partitionHeight = height.uint32 div numPartitions

-  for (segment, winding) in segments:
-    var entry = initPartitionEntry(segment, winding)
-    if partitionHeight == 0:
-      result[0].entries.add(move entry)
-    else:
+  var entries = newSeq[PartitionEntry](segments.len)
+  for i, (segment, winding) in segments:
+    entries[i] = initPartitionEntry(segment, winding)
+
+  if numPartitions == 1:
+    result[0].entries = move entries
+  else:
+    iterator partitionRange(
+      segment: Segment,
+      numPartitions, startY, partitionHeight: uint32
+    ): uint32 =
      var
        atPartition = max(0, segment.at.y - startY.float32).uint32
        toPartition = max(0, segment.to.y - startY.float32).uint32
      atPartition = atPartition div partitionHeight
      toPartition = toPartition div partitionHeight
-      atPartition = min(atPartition, result.high.uint32)
-      toPartition = min(toPartition, result.high.uint32)
-      for i in atPartition .. toPartition:
-        result[i].entries.add(entry)
+      atPartition = min(atPartition, numPartitions - 1)
+      toPartition = min(toPartition, numPartitions - 1)
+      for partitionIndex in atPartition .. toPartition:
+        yield partitionIndex
+
+    var entryCounts = newSeq[int](numPartitions)
+    for (segment, _) in segments:
+      for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight):
+        inc entryCounts[partitionIndex]
+
+    for partitionIndex, entryCounts in entryCounts:
+      result[partitionIndex].entries.setLen(entryCounts)
+
+    var indexes = newSeq[int](numPartitions)
+    for i, (segment, winding) in segments:
+      for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight):
+        result[partitionIndex].entries[indexes[partitionIndex]] = entries[i]
+        inc indexes[partitionIndex]

  # Set the bottom values for the partitions (y value where this partition ends)
-
  var partitionBottom = top + partitionHeight.int
-
  for partition in result.mitems:
    partition.bottom = partitionBottom
    partition.requiresAntiAliasing =
@ -1313,7 +1331,7 @@ proc computeCoverage(
        if fillLen > 0:
          var i = fillStart
          when defined(amd64) and allowSimd:
-            let sampleCoverageVec = mm_set1_epi8(cast[int8](sampleCoverage))
+            let sampleCoverageVec = mm_set1_epi8(sampleCoverage)
            for _ in 0 ..< fillLen div 16:
              var coverageVec = mm_loadu_si128(coverages[i - startX].addr)
              coverageVec = mm_add_epi8(coverageVec, sampleCoverageVec)
@ -1354,7 +1372,7 @@ proc fillCoverage(
          let
            coverageVec = mm_loadu_si128(coverages[x - startX].unsafeAddr)
            eqZero = mm_cmpeq_epi8(coverageVec, mm_setzero_si128())
-            eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(cast[int8](255)))
+            eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(255))
            allZeroes = mm_movemask_epi8(eqZero) == 0xffff
            all255 = mm_movemask_epi8(eq255) == 0xffff
          yield (coverageVec, allZeroes, all255)
--- a/src/pixie/simd/avx.nim
+++ b/src/pixie/simd/avx.nim
@ -0,0 +1,35 @@
+import chroma, nimsimd/avx
+
+when defined(gcc) or defined(clang):
+  {.localPassc: "-mavx".}
+
+when defined(release):
+  {.push checks: off.}
+
+proc fillUnsafeAvx*(
+  data: var seq[ColorRGBX],
+  rgbx: ColorRGBX,
+  start, len: int
+) =
+  var
+    i = start
+    p = cast[uint](data[i].addr)
+  # Fill one pixel (4 bytes) at a time until p is 32-byte aligned for the store
+  while i < (start + len) and (p and 31) != 0:
+    data[i] = rgbx
+    inc i
+    p += 4
+  # Each AVX store writes 8 pixels (32 bytes), so count whole groups of 8 left
+  let
+    iterations = (start + len - i) div 8
+    colorVec = mm256_set1_epi32(cast[int32](rgbx))
+  for _ in 0 ..< iterations:
+    mm256_store_si256(cast[pointer](p), colorVec)
+    p += 32
+  i += iterations * 8
+  # Fill the remaining pixels (fewer than 8) one at a time
+  for i in i ..< start + len:
+    data[i] = rgbx
+
+when defined(release):
+  {.pop.}