faster

2022-06-22 01:03:32 -05:00 · 2022-06-22 01:03:32 -05:00 · 82c7c8b864
commit 82c7c8b864
parent 34158f3f28
2 changed files with 28 additions and 30 deletions
--- a/src/pixie/internal.nim
+++ b/src/pixie/internal.nim
@ -52,7 +52,7 @@ proc `*`*(color: ColorRGBX, opacity: float32): ColorRGBX {.raises: [].} =
 proc fillUnsafe*(
  data: var seq[uint8], value: uint8, start, len: int
-) {.raises: [].} =
+) {.inline, raises: [].} =
  ## Fills the mask data with the value starting at index start and
  ## continuing for len indices.
  nimSetMem(data[start].addr, value.cint, len)
@ -62,9 +62,7 @@ proc fillUnsafe*(
 ) {.raises: [].} =
  ## Fills the image data with the color starting at index start and
  ## continuing for len indices.
  let rgbx = color.asRgbx()
  # Use memset when every byte has the same value
  if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
    nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
@ -78,11 +76,13 @@ proc fillUnsafe*(
      # When supported, SIMD fill until we run out of room
      let
        colorVec = mm_set1_epi32(cast[int32](rgbx))
-        remaining = start + len - i
+        iterations = (start + len - i) div 8
-      for _ in 0 ..< remaining div 8:
+      var p = cast[uint](data[i].addr)
-        mm_store_si128(data[i + 0].addr, colorVec)
+      for _ in 0 ..< iterations:
-        mm_store_si128(data[i + 4].addr, colorVec)
+        mm_store_si128(cast[pointer](p), colorVec)
-        i += 8
+        mm_store_si128(cast[pointer](p + 16), colorVec)
+        p += 32
+      i += iterations * 8
    else:
      when sizeof(int) == 8:
        # Fill 8 bytes at a time when possible
--- a/src/pixie/paths.nim
+++ b/src/pixie/paths.nim
@ -1133,9 +1133,9 @@ proc partitionSegments(
  result.partitionHeight = height.uint32 div numPartitions
  for (segment, winding) in segments:
-    let entry = initPartitionEntry(segment, winding)
+    var entry = initPartitionEntry(segment, winding)
    if result.partitionHeight == 0:
-      result.partitions[0].entries.add(entry)
+      result.partitions[0].entries.add(move entry)
    else:
      var
        atPartition = max(0, segment.at.y - result.startY.float32).uint32
@ -1619,16 +1619,15 @@ proc fillHits(
  template simdBlob(image: Image, x: var int, len: int, blendProc: untyped) =
    when allowSimd:
      when defined(amd64):
-        let colorVec = mm_set1_epi32(cast[int32](rgbx))
+        var p = cast[uint](image.data[image.dataIndex(x, y)].addr)
-        var dataIndex = image.dataIndex(x, y)
+        let
-        for _ in 0 ..< len div 4:
+          iterations = len div 4
-          let backdrop = mm_loadu_si128(image.data[dataIndex].addr)
+          colorVec = mm_set1_epi32(cast[int32](rgbx))
-          mm_storeu_si128(
+        for _ in 0 ..< iterations:
-            image.data[dataIndex].addr,
+          let backdrop = mm_loadu_si128(cast[pointer](p))
-            blendProc(backdrop, colorVec)
+          mm_storeu_si128(cast[pointer](p), blendProc(backdrop, colorVec))
-          )
+          p += 16
-          x += 4
+        x += iterations * 4
-          dataIndex += 4
  case blendMode:
  of OverwriteBlend:
@ -1714,16 +1713,15 @@ proc fillHits(
  template simdBlob(mask: Mask, x: var int, len: int, blendProc: untyped) =
    when allowSimd:
      when defined(amd64):
-        let vec255 = mm_set1_epi8(255)
+        var p = cast[uint](mask.data[mask.dataIndex(x, y)].addr)
-        var dataIndex = mask.dataIndex(x, y)
+        let
-        for _ in 0 ..< len div 16:
+          iterations = len div 16
-          let backdrop = mm_loadu_si128(mask.data[dataIndex].addr)
+          vec255 = mm_set1_epi8(255)
-          mm_storeu_si128(
+        for _ in 0 ..< iterations:
-            mask.data[dataIndex].addr,
+          let backdrop = mm_loadu_si128(cast[pointer](p))
-            blendProc(backdrop, vec255)
+          mm_storeu_si128(cast[pointer](p), blendProc(backdrop, vec255))
-          )
+          p += 16
-          x += 16
+        x += iterations * 16
-          dataIndex += 16
  case blendMode:
  of NormalBlend, OverwriteBlend: