Merge pull request #450 from guzba/master

a little faster partitioning, runtime-checked avx fill
2022-06-23 14:45:16 -07:00 · 2022-06-23 14:45:16 -07:00 · 4afb0e58c4
commit 4afb0e58c4
parent b2e145130a 1cfaea9357
3 changed files with 80 additions and 18 deletions
--- a/src/pixie/internal.nim
+++ b/src/pixie/internal.nim
@ -3,7 +3,8 @@ import chroma, common, system/memory, vmath
 const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
 when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+  import nimsimd/runtimecheck, nimsimd/sse2, simd/avx
  let cpuHasAvx* = checkInstructionSets({AVX})
 template currentExceptionAsPixieError*(): untyped =
  ## Gets the current exception and returns it as a PixieError with stack trace.
@ -63,6 +64,13 @@ proc fillUnsafe*(
  ## Fills the image data with the color starting at index start and
  ## continuing for len indices.
  let rgbx = color.asRgbx()
  # If we can use AVX, do so
  when defined(amd64) and allowSimd:
    if cpuHasAvx and len >= 64:
      fillUnsafeAvx(data, rgbx, start, len)
      return
  # Use memset when every byte has the same value
  if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
    nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
@ -70,14 +78,15 @@ proc fillUnsafe*(
    var i = start
    when defined(amd64) and allowSimd:
      # Align to 16 bytes
-      while i < (start + len) and (cast[uint](data[i].addr) and 15) != 0:
+      var p = cast[uint](data[i].addr)
      while i < (start + len) and (p and 15) != 0:
        data[i] = rgbx
        inc i
        p += 4
      # When supported, SIMD fill until we run out of room
      let
        colorVec = mm_set1_epi32(cast[int32](rgbx))
        iterations = (start + len - i) div 8
      var p = cast[uint](data[i].addr)
      for _ in 0 ..< iterations:
        mm_store_si128(cast[pointer](p), colorVec)
        mm_store_si128(cast[pointer](p + 16), colorVec)
@ -93,8 +102,8 @@ proc fillUnsafe*(
          copyMem(data[i].addr, u64.addr, 8)
          i += 2
    # Fill whatever is left the slow way
-    for j in i ..< start + len:
+    for i in i ..< start + len:
-      data[j] = rgbx
+      data[i] = rgbx
 const straightAlphaTable = block:
  var table: array[256, array[256, uint8]]
--- a/src/pixie/paths.nim
+++ b/src/pixie/paths.nim
@ -1131,25 +1131,43 @@ proc partitionSegments(
    startY = top.uint32
    partitionHeight = height.uint32 div numPartitions
-  for (segment, winding) in segments:
+  var entries = newSeq[PartitionEntry](segments.len)
-    var entry = initPartitionEntry(segment, winding)
+  for i, (segment, winding) in segments:
-    if partitionHeight == 0:
+    entries[i] = initPartitionEntry(segment, winding)
-      result[0].entries.add(move entry)
+
-    else:
+  if numPartitions == 1:
    result[0].entries = move entries
  else:
    iterator partitionRange(
      segment: Segment,
      numPartitions, startY, partitionHeight: uint32
    ): uint32 =
      var
        atPartition = max(0, segment.at.y - startY.float32).uint32
        toPartition = max(0, segment.to.y - startY.float32).uint32
      atPartition = atPartition div partitionHeight
      toPartition = toPartition div partitionHeight
-      atPartition = min(atPartition, result.high.uint32)
+      atPartition = min(atPartition, numPartitions - 1)
-      toPartition = min(toPartition, result.high.uint32)
+      toPartition = min(toPartition, numPartitions - 1)
-      for i in atPartition .. toPartition:
+      for partitionIndex in atPartition .. toPartition:
-        result[i].entries.add(entry)
+        yield partitionIndex
    var entryCounts = newSeq[int](numPartitions)
    for (segment, _) in segments:
      for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight):
        inc entryCounts[partitionIndex]
    for partitionIndex, entryCounts in entryCounts:
      result[partitionIndex].entries.setLen(entryCounts)
    var indexes = newSeq[int](numPartitions)
    for i, (segment, winding) in segments:
      for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight):
        result[partitionIndex].entries[indexes[partitionIndex]] = entries[i]
        inc indexes[partitionIndex]
  # Set the bottom values for the partitions (y value where this partition ends)
  var partitionBottom = top + partitionHeight.int
  for partition in result.mitems:
    partition.bottom = partitionBottom
    partition.requiresAntiAliasing =
@ -1313,7 +1331,7 @@ proc computeCoverage(
        if fillLen > 0:
          var i = fillStart
          when defined(amd64) and allowSimd:
-            let sampleCoverageVec = mm_set1_epi8(cast[int8](sampleCoverage))
+            let sampleCoverageVec = mm_set1_epi8(sampleCoverage)
            for _ in 0 ..< fillLen div 16:
              var coverageVec = mm_loadu_si128(coverages[i - startX].addr)
              coverageVec = mm_add_epi8(coverageVec, sampleCoverageVec)
@ -1354,7 +1372,7 @@ proc fillCoverage(
          let
            coverageVec = mm_loadu_si128(coverages[x - startX].unsafeAddr)
            eqZero = mm_cmpeq_epi8(coverageVec, mm_setzero_si128())
-            eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(cast[int8](255)))
+            eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(255))
            allZeroes = mm_movemask_epi8(eqZero) == 0xffff
            all255 = mm_movemask_epi8(eq255) == 0xffff
          yield (coverageVec, allZeroes, all255)
--- a/src/pixie/simd/avx.nim
+++ b/src/pixie/simd/avx.nim
@ -0,0 +1,35 @@
 import chroma, nimsimd/avx
 # On gcc/clang, pass -mavx to the C compiler for the file generated from this
 # module so the AVX intrinsics used below can be compiled.
 when defined(gcc) or defined(clang):
  {.localPassc: "-mavx".}
 # In release builds, switch runtime checks off for the code that follows;
 # the matching pop is at the end of this module.
 when defined(release):
  {.push checks: off.}
 proc fillUnsafeAvx*(
  data: var seq[ColorRGBX],
  rgbx: ColorRGBX,
  start, len: int
 ) =
  ## Writes `rgbx` into `data[start ..< start + len]`.
  ## Elements are written one at a time until the write address is 32-byte
  ## aligned, then in groups of 8 with aligned 256-bit stores, and finally
  ## one at a time for whatever is left over.
  var
    idx = start
    address = cast[uint](data[idx].addr)
  let stop = start + len
  # Scalar prologue: advance 4 bytes per element until 32-byte aligned
  while idx < stop and (address and 31) != 0:
    data[idx] = rgbx
    address += 4
    inc idx
  # Vector body: every store covers 32 bytes, i.e. 8 elements
  let
    vectorStores = (stop - idx) div 8
    splat = mm256_set1_epi32(cast[int32](rgbx))
  for _ in 0 ..< vectorStores:
    mm256_store_si256(cast[pointer](address), splat)
    address += 32
  idx += vectorStores * 8
  # Scalar epilogue: the fewer-than-8 elements that remain
  while idx < stop:
    data[idx] = rgbx
    inc idx
 # Undo the `{.push checks: off.}` that release builds apply at the top of
 # this module.
 when defined(release):
  {.pop.}