Merge pull request #453 from guzba/master

aligned sse2 + avx2 isOneColor isOpaque isTransparent, toPremultipliedAlphaAvx2
2022-06-28 19:07:33 -07:00 · 2022-06-28 19:07:33 -07:00 · 21c15a2680
commit 21c15a2680
parent 3faba86b95 f41f895e24
8 changed files with 403 additions and 236 deletions
--- a/experiments/benchmark_cairo.nim
+++ b/experiments/benchmark_cairo.nim
@ -194,6 +194,8 @@ block:
      surface = imageSurfaceCreate(FORMAT_ARGB32, 900, 900)
      ctx = surface.create()

+    ctx.setLineWidth(1)
+
    timeIt "[cairo] " & benchmark.name:
      for fill in benchmark.fills:
        if fill.shapes.len > 0:
@ -221,6 +223,7 @@ block:
              FillRuleEvenOdd
          )
          ctx.fill()
+          # ctx.stroke()

    # discard surface.writeToPng(("cairo_" & benchmark.name & ".png").cstring)

@ -242,5 +245,11 @@ block:
            fill.transform,
            fill.windingRule
          )
+          # image.strokePath(
+          #   p,
+          #   fill.paint,
+          #   fill.transform,
+          #   1
+          # )

    # image.writeFile("pixie_" & benchmark.name & ".png")
--- a/pixie.nimble
+++ b/pixie.nimble
@ -10,7 +10,7 @@ requires "vmath >= 1.1.4"
 requires "chroma >= 0.2.5"
 requires "zippy >= 0.10.2"
 requires "flatty >= 0.3.4"
-requires "nimsimd >= 1.1.1"
+requires "nimsimd >= 1.1.5"
 requires "bumpy >= 1.1.1"

 task bindings, "Generate bindings":
--- a/src/pixie/fileformats/jpeg.nim
+++ b/src/pixie/fileformats/jpeg.nim
@ -1,5 +1,5 @@
 import chroma, flatty/binny, pixie/common, pixie/images, pixie/internal,
-    pixie/masks, sequtils, std/decls, strutils
+    pixie/masks, std/decls, std/sequtils, std/strutils

 when defined(amd64) and allowSimd:
  import nimsimd/sse2
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@ -1,7 +1,7 @@
 import blends, bumpy, chroma, common, masks, pixie/internal, vmath

 when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+  import nimsimd/sse2, runtimechecked/avx2

 const h = 0.5.float32

@ -101,54 +101,84 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} =

 proc isOneColor*(image: Image): bool {.raises: [].} =
  ## Checks if the entire image is the same color.
+  when defined(amd64) and allowSimd:
+    if cpuHasAvx2:
+      return isOneColorAvx2(image.data, 0, image.data.len)
+
  result = true

  let color = image.data[0]

  var i: int
  when defined(amd64) and allowSimd:
-    let colorVec = mm_set1_epi32(cast[int32](color))
-    for _ in 0 ..< image.data.len div 16:
+    # Align to 16 bytes
+    var p = cast[uint](image.data[i].addr)
+    while i < image.data.len and (p and 15) != 0:
+      if image.data[i] != color:
+        return false
+      inc i
+      p += 4
+
+    let
+      colorVec = mm_set1_epi32(cast[int32](color))
+      iterations = (image.data.len - i) div 16
+    for _ in 0 ..< iterations:
      let
-        values0 = mm_loadu_si128(image.data[i + 0].addr)
-        values1 = mm_loadu_si128(image.data[i + 4].addr)
-        values2 = mm_loadu_si128(image.data[i + 8].addr)
-        values3 = mm_loadu_si128(image.data[i + 12].addr)
+        values0 = mm_load_si128(cast[pointer](p))
+        values1 = mm_load_si128(cast[pointer](p + 16))
+        values2 = mm_load_si128(cast[pointer](p + 32))
+        values3 = mm_load_si128(cast[pointer](p + 48))
        eq0 = mm_cmpeq_epi8(values0, colorVec)
        eq1 = mm_cmpeq_epi8(values1, colorVec)
        eq2 = mm_cmpeq_epi8(values2, colorVec)
        eq3 = mm_cmpeq_epi8(values3, colorVec)
-        eq = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
-      if mm_movemask_epi8(eq) != 0xffff:
+        eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
+      if mm_movemask_epi8(eq0123) != 0xffff:
        return false
-      i += 16
+      p += 64
+    i += 16 * iterations

-  for j in i ..< image.data.len:
-    if image.data[j] != color:
+  for i in i ..< image.data.len:
+    if image.data[i] != color:
      return false

 proc isTransparent*(image: Image): bool {.raises: [].} =
  ## Checks if this image is fully transparent or not.
+  when defined(amd64) and allowSimd:
+    if cpuHasAvx2:
+      return isTransparentAvx2(image.data, 0, image.data.len)
+
  result = true

  var i: int
  when defined(amd64) and allowSimd:
-    let vecZero = mm_setzero_si128()
-    for _ in 0 ..< image.data.len div 16:
+    # Align to 16 bytes
+    var p = cast[uint](image.data[i].addr)
+    while i < image.data.len and (p and 15) != 0:
+      if image.data[i].a != 0:
+        return false
+      inc i
+      p += 4
+
+    let
+      vecZero = mm_setzero_si128()
+      iterations = (image.data.len - i) div 16
+    for _ in 0 ..< iterations:
      let
-        values0 = mm_loadu_si128(image.data[i + 0].addr)
-        values1 = mm_loadu_si128(image.data[i + 4].addr)
-        values2 = mm_loadu_si128(image.data[i + 8].addr)
-        values3 = mm_loadu_si128(image.data[i + 12].addr)
+        values0 = mm_load_si128(cast[pointer](p))
+        values1 = mm_load_si128(cast[pointer](p + 16))
+        values2 = mm_load_si128(cast[pointer](p + 32))
+        values3 = mm_load_si128(cast[pointer](p + 48))
        values01 = mm_or_si128(values0, values1)
        values23 = mm_or_si128(values2, values3)
-        values = mm_or_si128(values01, values23)
-      if mm_movemask_epi8(mm_cmpeq_epi8(values, vecZero)) != 0xffff:
+        values0123 = mm_or_si128(values01, values23)
+      if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff:
        return false
-      i += 16
+      p += 64
+    i += 16 * iterations

-  for j in i ..< image.data.len:
-    if image.data[j].a != 0:
+  for i in i ..< image.data.len:
+    if image.data[i].a != 0:
      return false

 proc isOpaque*(image: Image): bool {.raises: [].} =
--- a/src/pixie/internal.nim
+++ b/src/pixie/internal.nim
@ -3,8 +3,10 @@ import bumpy, chroma, common, system/memory, vmath
 const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)

 when defined(amd64) and allowSimd:
-  import nimsimd/runtimecheck, nimsimd/sse2, simd/avx
-  let cpuHasAvx* = checkInstructionSets({AVX})
+  import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx, runtimechecked/avx2
+  let
+    cpuHasAvx* = checkInstructionSets({AVX})
+    cpuHasAvx2* = checkInstructionSets({AVX, AVX2})

 template currentExceptionAsPixieError*(): untyped =
  ## Gets the current exception and returns it as a PixieError with stack trace.
@ -141,70 +143,87 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].}
  ## Converts an image to premultiplied alpha from straight alpha.
  var i: int
  when defined(amd64) and allowSimd:
-    # When supported, SIMD convert as much as possible
-    let
-      alphaMask = mm_set1_epi32(cast[int32](0xff000000))
-      oddMask = mm_set1_epi16(cast[int16](0xff00))
-      div255 = mm_set1_epi16(cast[int16](0x8081))
-    for _ in 0 ..< data.len div 4:
+    if cpuHasAvx2:
+      i = toPremultipliedAlphaAvx2(data)
+    else:
      let
-        values = mm_loadu_si128(data[i].addr)
-        alpha = mm_and_si128(values, alphaMask)
-        eq = mm_cmpeq_epi8(values, alphaMask)
-      if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
+        alphaMask = mm_set1_epi32(cast[int32](0xff000000))
+        oddMask = mm_set1_epi16(cast[int16](0xff00))
+        div255 = mm_set1_epi16(cast[int16](0x8081))
+      for _ in 0 ..< data.len div 4:
        let
-          evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
-          oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
-        var
-          colorsEven = mm_slli_epi16(values, 8)
-          colorsOdd = mm_and_si128(values, oddMask)
-        colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
-        colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
-        colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
-        colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
-        mm_storeu_si128(
-          data[i].addr,
-          mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
-        )
-      i += 4
+          values = mm_loadu_si128(data[i].addr)
+          alpha = mm_and_si128(values, alphaMask)
+          eq = mm_cmpeq_epi8(values, alphaMask)
+        if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
+          let
+            evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
+            oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
+          var
+            colorsEven = mm_slli_epi16(values, 8)
+            colorsOdd = mm_and_si128(values, oddMask)
+          colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
+          colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
+          colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
+          colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
+          mm_storeu_si128(
+            data[i].addr,
+            mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
+          )
+        i += 4

  # Convert whatever is left
-  for j in i ..< data.len:
-    var c = data[j]
+  for i in i ..< data.len:
+    var c = data[i]
    if c.a != 255:
-      c.r = ((c.r.uint32 * c.a.uint32) div 255).uint8
-      c.g = ((c.g.uint32 * c.a.uint32) div 255).uint8
-      c.b = ((c.b.uint32 * c.a.uint32) div 255).uint8
-      data[j] = c
+      c.r = ((c.r.uint32 * c.a) div 255).uint8
+      c.g = ((c.g.uint32 * c.a) div 255).uint8
+      c.b = ((c.b.uint32 * c.a) div 255).uint8
+      data[i] = c

 proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool =
+  when defined(amd64) and allowSimd:
+    if cpuHasAvx2 and len >= 64:
+      return isOpaqueAvx2(data, start, len)
+
  result = true

  var i = start
  when defined(amd64) and allowSimd:
-    let vec255 = mm_set1_epi32(cast[int32](uint32.high))
-    for _ in start ..< (start + len) div 16:
+    # Align to 16 bytes
+    var p = cast[uint](data[i].addr)
+    while i < (start + len) and (p and 15) != 0:
+      if data[i].a != 255:
+        return false
+      inc i
+      p += 4
+
+    let
+      vec255 = mm_set1_epi8(255)
+      iterations = (start + len - i) div 16
+    for _ in 0 ..< iterations:
      let
-        values0 = mm_loadu_si128(data[i + 0].addr)
-        values1 = mm_loadu_si128(data[i + 4].addr)
-        values2 = mm_loadu_si128(data[i + 8].addr)
-        values3 = mm_loadu_si128(data[i + 12].addr)
+        values0 = mm_load_si128(cast[pointer](p))
+        values1 = mm_load_si128(cast[pointer](p + 16))
+        values2 = mm_load_si128(cast[pointer](p + 32))
+        values3 = mm_load_si128(cast[pointer](p + 48))
        values01 = mm_and_si128(values0, values1)
        values23 = mm_and_si128(values2, values3)
-        values = mm_and_si128(values01, values23)
-        eq = mm_cmpeq_epi8(values, vec255)
+        values0123 = mm_and_si128(values01, values23)
+        eq = mm_cmpeq_epi8(values0123, vec255)
      if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
        return false
-      i += 16
+      p += 64
+    i += 16 * iterations

-  for j in i ..< start + len:
-    if data[j].a != 255:
+  for i in i ..< start + len:
+    if data[i].a != 255:
      return false

 when defined(amd64) and allowSimd:
  proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
-    let opacityVec =  mm_set1_ps(opacity)
-    var finalColor =  mm_cvtps_epi32(mm_mul_ps(color, opacityVec))
+    let opacityVec = mm_set1_ps(opacity)
+    var finalColor = mm_cvtps_epi32(mm_mul_ps(color, opacityVec))
    finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
    finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
    cast[ColorRGBX](mm_cvtsi128_si32(finalColor))
--- a/src/pixie/paths.nim
+++ b/src/pixie/paths.nim
@ -1,5 +1,5 @@
-import blends, bumpy, chroma, common, fenv, images, internal, masks, paints,
-    strutils, vmath
+import blends, bumpy, chroma, common, images, internal, masks, paints, std/fenv,
+    std/strutils, vmath

 when defined(amd64) and allowSimd:
  import nimsimd/sse2
@ -1171,7 +1171,9 @@ proc partitionSegments(

    var entryCounts = newSeq[int](numPartitions)
    for (segment, _) in segments:
-      for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight):
+      for partitionIndex in segment.partitionRange(
+        numPartitions, startY, partitionHeight
+      ):
        inc entryCounts[partitionIndex]

    for partitionIndex, entryCounts in entryCounts:
@ -1179,7 +1181,9 @@ proc partitionSegments(

    var indexes = newSeq[int](numPartitions)
    for i, (segment, winding) in segments:
-      for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight):
+      for partitionIndex in segment.partitionRange(
+        numPartitions, startY, partitionHeight
+      ):
        result[partitionIndex].entries[indexes[partitionIndex]] = entries[i]
        inc indexes[partitionIndex]

@ -1915,175 +1919,147 @@ proc fillShapes(
          break

    if allEntriesInScanlineSpanIt and tmp == 2:
-      var at: Vec2
-      if not intersectsInside(
-        partitions[partitionIndex].entries[entryIndices[0]].segment,
-        partitions[partitionIndex].entries[entryIndices[1]].segment,
-        at
-      ):
-        # We have 2 non-intersecting lines
-        var
-          left = partitions[partitionIndex].entries[entryIndices[0]]
-          right = partitions[partitionIndex].entries[entryIndices[1]]
-        block:
-          # Ensure left is actually on the left
-          let
-            maybeLeftMaxX = max(left.segment.at.x, left.segment.to.x)
-            maybeRightMaxX = max(right.segment.at.x, right.segment.to.x)
-          if maybeLeftMaxX > maybeRightMaxX:
-            swap left, right
+      var
+        left = partitions[partitionIndex].entries[entryIndices[0]]
+        right = partitions[partitionIndex].entries[entryIndices[1]]
+      block:
+        # Ensure left is actually on the left
+        let
+          maybeLeftMaxX = max(left.segment.at.x, left.segment.to.x)
+          maybeRightMaxX = max(right.segment.at.x, right.segment.to.x)
+        if maybeLeftMaxX > maybeRightMaxX:
+          swap left, right

-        let requiresAntiAliasing =
-          left.segment.requiresAntiAliasing or
-          right.segment.requiresAntiAliasing
+      # Use trapezoid coverage at the edges and fill in the middle

-        if requiresAntiAliasing:
-          # We have 2 non-intersecting lines that require anti-aliasing
-          # Use trapezoid coverage at the edges and fill in the middle
-
-          when allowSimd and defined(amd64):
-            let vecRgbx = mm_set_ps(
-              rgbx.a.float32,
-              rgbx.b.float32,
-              rgbx.g.float32,
-              rgbx.r.float32
-            )
-
-          proc solveX(entry: PartitionEntry, y: float32): float32 =
-            if entry.m == 0:
-              entry.b
-            else:
-              (y - entry.b) / entry.m
-
-          proc solveY(entry: PartitionEntry, x: float32): float32 =
-            entry.m * x + entry.b
-
-          var
-            leftTop = vec2(0, y.float32)
-            leftBottom = vec2(0, (y + 1).float32)
-          leftTop.x = left.solveX(leftTop.y.float32)
-          leftBottom.x = left.solveX(leftBottom.y)
-
-          var
-            rightTop = vec2(0, y.float32)
-            rightBottom = vec2(0, (y + 1).float32)
-          rightTop.x = right.solveX(rightTop.y)
-          rightBottom.x = right.solveX(rightBottom.y)
-
-          let
-            # leftMinX = min(leftTop.x, leftBottom.x)
-            leftMaxX = max(leftTop.x, leftBottom.x)
-            rightMinX = min(rightTop.x, rightBottom.x)
-            # rightMaxX = max(rightTop.x, rightBottom.x)
-            # leftCoverBegin = leftMinX.trunc
-            leftCoverEnd = leftMaxX.ceil.int
-            rightCoverBegin = rightMinX.trunc.int
-            # rightCoverEnd = rightMaxX.ceil
-
-          if leftCoverEnd < rightCoverBegin:
-            # Only take this shortcut if the partial coverage areas on the
-            # left and the right do not overlap
-
-            let blender = blendMode.blender()
-
-            block: # Left-side partial coverage
-              let
-                inverted = leftTop.x < leftBottom.x
-                sliverStart = min(leftTop.x, leftBottom.x)
-                rectStart = max(leftTop.x, leftBottom.x)
-              var
-                pen = sliverStart
-                prevPen = pen
-                penY = if inverted: y.float32 else: (y + 1).float32
-                prevPenY = penY
-              for x in sliverStart.int ..< rectStart.ceil.int:
-                prevPen = pen
-                pen = (x + 1).float32
-                var rightRectArea = 0.float32
-                if pen > rectStart:
-                  rightRectArea = pen - rectStart
-                  pen = rectStart
-                prevPenY = penY
-                penY = left.solveY(pen)
-                if x < 0 or x >= image.width:
-                  continue
-                let
-                  run = pen - prevPen
-                  triangleArea = 0.5.float32 * run * abs(penY - prevPenY)
-                  rectArea =
-                    if inverted:
-                      (prevPenY - y.float32) * run
-                    else:
-                      ((y + 1).float32 - prevPenY) * run
-                  area = triangleArea + rectArea + rightRectArea
-                  dataIndex = image.dataIndex(x, y)
-                  backdrop = image.data[dataIndex]
-                  source =
-                    when allowSimd and defined(amd64):
-                      applyOpacity(vecRgbx, area)
-                    else:
-                      rgbx * area
-                image.data[dataIndex] = blender(backdrop, source)
-
-            block: # Right-side partial coverage
-              let
-                inverted = rightTop.x > rightBottom.x
-                rectEnd = min(rightTop.x, rightBottom.x)
-                sliverEnd = max(rightTop.x, rightBottom.x)
-              var
-                pen = rectEnd
-                prevPen = pen
-                penY = if inverted: (y + 1).float32 else: y.float32
-                prevPenY = penY
-              for x in rectEnd.int ..< sliverEnd.ceil.int:
-                prevPen = pen
-                pen = (x + 1).float32
-                let leftRectArea = prevPen.fractional
-                if pen > sliverEnd:
-                  pen = sliverEnd
-                prevPenY = penY
-                penY = right.solveY(pen)
-                if x < 0 or x >= image.width:
-                  continue
-                let
-                  run = pen - prevPen
-                  triangleArea = 0.5.float32 * run * abs(penY - prevPenY)
-                  rectArea =
-                    if inverted:
-                      (penY - y.float32) * run
-                    else:
-                      ((y + 1).float32 - penY) * run
-                  area = leftRectArea + triangleArea + rectArea
-                  dataIndex = image.dataIndex(x, y)
-                  backdrop = image.data[dataIndex]
-                  source =
-                    when allowSimd and defined(amd64):
-                      applyOpacity(vecRgbx, area)
-                    else:
-                      rgbx * area
-                image.data[dataIndex] = blender(backdrop, source)
-
-            let
-              fillBegin = leftCoverEnd.clamp(0, image.width)
-              fillEnd = rightCoverBegin.clamp(0, image.width)
-            if fillEnd - fillBegin > 0:
-              hits[0] = (fixed32(fillBegin.float32), 1.int16)
-              hits[1] = (fixed32(fillEnd.float32), -1.int16)
-              image.fillHits(rgbx, 0, y, hits, 2, NonZero, blendMode)
-
-            inc y
-            continue
+      when allowSimd and defined(amd64):
+        let vecRgbx = mm_set_ps(
+          rgbx.a.float32,
+          rgbx.b.float32,
+          rgbx.g.float32,
+          rgbx.r.float32
+        )

+      proc solveX(entry: PartitionEntry, y: float32): float32 =
+        if entry.m == 0:
+          entry.b
        else:
+          (y - entry.b) / entry.m
+
+      proc solveY(entry: PartitionEntry, x: float32): float32 =
+        entry.m * x + entry.b
+
+      var
+        leftTop = vec2(0, y.float32)
+        leftBottom = vec2(0, (y + 1).float32)
+      leftTop.x = left.solveX(leftTop.y.float32)
+      leftBottom.x = left.solveX(leftBottom.y)
+
+      var
+        rightTop = vec2(0, y.float32)
+        rightBottom = vec2(0, (y + 1).float32)
+      rightTop.x = right.solveX(rightTop.y)
+      rightBottom.x = right.solveX(rightBottom.y)
+
+      let
+        leftMaxX = max(leftTop.x, leftBottom.x)
+        rightMinX = min(rightTop.x, rightBottom.x)
+        leftCoverEnd = leftMaxX.ceil.int
+        rightCoverBegin = rightMinX.trunc.int
+
+      if leftCoverEnd < rightCoverBegin:
+        # Only take this shortcut if the partial coverage areas on the
+        # left and the right do not overlap
+
+        let blender = blendMode.blender()
+
+        block: # Left-side partial coverage
          let
-            minX = left.segment.at.x.int.clamp(0, image.width)
-            maxX = right.segment.at.x.int.clamp(0, image.width)
-          hits[0] = (cast[Fixed32](minX * 256), 1.int16)
-          hits[1] = (cast[Fixed32](maxX * 256), -1.int16)
+            inverted = leftTop.x < leftBottom.x
+            sliverStart = min(leftTop.x, leftBottom.x)
+            rectStart = max(leftTop.x, leftBottom.x)
+          var
+            pen = sliverStart
+            prevPen = pen
+            penY = if inverted: y.float32 else: (y + 1).float32
+            prevPenY = penY
+          for x in sliverStart.int ..< rectStart.ceil.int:
+            prevPen = pen
+            pen = (x + 1).float32
+            var rightRectArea = 0.float32
+            if pen > rectStart:
+              rightRectArea = pen - rectStart
+              pen = rectStart
+            prevPenY = penY
+            penY = left.solveY(pen)
+            if x < 0 or x >= image.width:
+              continue
+            let
+              run = pen - prevPen
+              triangleArea = 0.5.float32 * run * abs(penY - prevPenY)
+              rectArea =
+                if inverted:
+                  (prevPenY - y.float32) * run
+                else:
+                  ((y + 1).float32 - prevPenY) * run
+              area = triangleArea + rectArea + rightRectArea
+              dataIndex = image.dataIndex(x, y)
+              backdrop = image.data[dataIndex]
+              source =
+                when allowSimd and defined(amd64):
+                  applyOpacity(vecRgbx, area)
+                else:
+                  rgbx * area
+            image.data[dataIndex] = blender(backdrop, source)
+
+        block: # Right-side partial coverage
+          let
+            inverted = rightTop.x > rightBottom.x
+            rectEnd = min(rightTop.x, rightBottom.x)
+            sliverEnd = max(rightTop.x, rightBottom.x)
+          var
+            pen = rectEnd
+            prevPen = pen
+            penY = if inverted: (y + 1).float32 else: y.float32
+            prevPenY = penY
+          for x in rectEnd.int ..< sliverEnd.ceil.int:
+            prevPen = pen
+            pen = (x + 1).float32
+            let leftRectArea = prevPen.fractional
+            if pen > sliverEnd:
+              pen = sliverEnd
+            prevPenY = penY
+            penY = right.solveY(pen)
+            if x < 0 or x >= image.width:
+              continue
+            let
+              run = pen - prevPen
+              triangleArea = 0.5.float32 * run * abs(penY - prevPenY)
+              rectArea =
+                if inverted:
+                  (penY - y.float32) * run
+                else:
+                  ((y + 1).float32 - penY) * run
+              area = leftRectArea + triangleArea + rectArea
+              dataIndex = image.dataIndex(x, y)
+              backdrop = image.data[dataIndex]
+              source =
+                when allowSimd and defined(amd64):
+                  applyOpacity(vecRgbx, area)
+                else:
+                  rgbx * area
+            image.data[dataIndex] = blender(backdrop, source)
+
+        let
+          fillBegin = leftCoverEnd.clamp(0, image.width)
+          fillEnd = rightCoverBegin.clamp(0, image.width)
+        if fillEnd - fillBegin > 0:
+          hits[0] = (fixed32(fillBegin.float32), 1.int16)
+          hits[1] = (fixed32(fillEnd.float32), -1.int16)
          image.fillHits(rgbx, 0, y, hits, 2, NonZero, blendMode)

-          inc y
-          continue
+        inc y
+        continue

    computeCoverage(
      cast[ptr UncheckedArray[uint8]](coverages[0].addr),
--- a/src/pixie/runtimechecked/avx.nim
+++ b/src/pixie/runtimechecked/avx.nim
--- a/src/pixie/runtimechecked/avx2.nim
+++ b/src/pixie/runtimechecked/avx2.nim
@ -0,0 +1,133 @@
+import chroma, nimsimd/avx2
+
+when defined(gcc) or defined(clang):
+  {.localPassc: "-mavx2".}
+
+when defined(release):
+  {.push checks: off.}
+
+proc isOneColorAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
+  ## Returns true when every pixel in `data[start ..< start + len]` is
+  ## identical to the first pixel of that range. An empty range is
+  ## trivially one color.
+  ##
+  ## Pixels are compared one at a time until the read address is 32-byte
+  ## aligned, then 16 pixels (two aligned 256-bit loads) are tested per
+  ## iteration, and the remaining tail is compared pixel by pixel.
+  result = true
+
+  if len <= 0:
+    # Nothing to compare, and `data[start]` may not exist.
+    return
+
+  # Compare against the first pixel of the requested range rather than
+  # `data[0]`, so a non-zero `start` is judged on its own pixels.
+  let color = data[start]
+
+  var
+    i = start
+    p = cast[uint](data[i].addr)
+  # Align to 32 bytes
+  while i < (start + len) and (p and 31) != 0:
+    if data[i] != color:
+      return false
+    inc i
+    p += 4
+
+  let
+    colorVec = mm256_set1_epi32(cast[int32](color))
+    iterations = (start + len - i) div 16
+  for _ in 0 ..< iterations:
+    let
+      values0 = mm256_load_si256(cast[pointer](p))
+      values1 = mm256_load_si256(cast[pointer](p + 32))
+      eq0 = mm256_cmpeq_epi8(values0, colorVec)
+      eq1 = mm256_cmpeq_epi8(values1, colorVec)
+      eq01 = mm256_and_si256(eq0, eq1)
+    # Every one of the 32 byte lanes must have matched in both loads.
+    if mm256_movemask_epi8(eq01) != cast[int32](0xffffffff):
+      return false
+    p += 64
+  i += 16 * iterations
+
+  # Tail: pixels that did not fill a whole 16-pixel iteration.
+  for i in i ..< start + len:
+    if data[i] != color:
+      return false
+
+proc isTransparentAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
+  ## Reports whether the pixels in `data[start ..< start + len]` are fully
+  ## transparent. Leading and trailing pixels are tested on their alpha
+  ## channel alone; the aligned middle section is tested 16 pixels at a time
+  ## and requires every byte of those pixels to be zero.
+  let stop = start + len
+  var
+    idx = start
+    address = cast[uint](data[idx].addr)
+
+  # Scalar head: advance until the address is a multiple of 32 bytes.
+  while idx < stop and (address and 31) != 0:
+    if data[idx].a != 0:
+      return false
+    inc idx
+    address += 4
+
+  # Vector body: OR two aligned 256-bit loads together and compare with zero.
+  let
+    zeros = mm256_setzero_si256()
+    blocks = (stop - idx) div 16
+  for _ in 0 ..< blocks:
+    let
+      lo = mm256_load_si256(cast[pointer](address))
+      hi = mm256_load_si256(cast[pointer](address + 32))
+      merged = mm256_or_si256(lo, hi)
+      zeroLanes = mm256_movemask_epi8(mm256_cmpeq_epi8(merged, zeros))
+    if zeroLanes != cast[int32](0xffffffff):
+      return false
+    address += 64
+  idx += 16 * blocks
+
+  # Scalar tail: whatever did not fill a whole 16-pixel block.
+  while idx < stop:
+    if data[idx].a != 0:
+      return false
+    inc idx
+
+  true
+
+proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
+  ## Reports whether every pixel in `data[start ..< start + len]` has an
+  ## alpha of 255. Color channels are not inspected.
+  # Selects bit 3 of every 4-bit group of the byte mask, i.e. the 4th byte
+  # of each pixel -- presumably the alpha byte; confirm against ColorRGBX.
+  const alphaLanes = 0x88888888
+  let stop = start + len
+  var
+    idx = start
+    address = cast[uint](data[idx].addr)
+
+  # Scalar head: step one pixel at a time up to a 32-byte boundary.
+  while idx < stop and (address and 31) != 0:
+    if data[idx].a != 255:
+      return false
+    inc idx
+    address += 4
+
+  # Vector body: AND two aligned 256-bit loads so a byte stays 0xff only if
+  # it was 0xff in both, then check the selected lanes of the byte mask.
+  let
+    allOnes = mm256_set1_epi8(255)
+    blocks = (stop - idx) div 16
+  for _ in 0 ..< blocks:
+    let
+      lo = mm256_load_si256(cast[pointer](address))
+      hi = mm256_load_si256(cast[pointer](address + 32))
+      matches = mm256_cmpeq_epi8(mm256_and_si256(lo, hi), allOnes)
+    if (mm256_movemask_epi8(matches) and alphaLanes) != alphaLanes:
+      return false
+    address += 64
+  idx += 16 * blocks
+
+  # Scalar tail: whatever did not fill a whole 16-pixel block.
+  while idx < stop:
+    if data[idx].a != 255:
+      return false
+    inc idx
+
+  true
+
+proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]): int =
+  ## Premultiplies pixels by their alpha in place, 8 pixels per step, and
+  ## returns the number of leading pixels that were visited (a multiple
+  ## of 8). Pixels at or past the returned index are not touched here.
+  let
+    alphaOnly = mm256_set1_epi32(cast[int32](0xff000000))
+    highBytes = mm256_set1_epi16(cast[int16](0xff00))
+    recip255 = mm256_set1_epi16(cast[int16](0x8081))
+    steps = data.len div 8
+  for step in 0 ..< steps:
+    let
+      offset = step * 8
+      pixels = mm256_loadu_si256(data[offset].addr)
+      alphas = mm256_and_si256(pixels, alphaOnly)
+      opaque = mm256_cmpeq_epi8(pixels, alphaOnly)
+    # Skip the arithmetic when the top byte of all 8 pixels is already 0xff.
+    if (mm256_movemask_epi8(opaque) and 0x88888888) == 0x88888888:
+      continue
+    let
+      # Place the top byte in the high byte of both 16-bit halves of a pixel.
+      scaleEven = mm256_or_si256(alphas, mm256_srli_epi32(alphas, 16))
+      # Force the upper half's multiplier to 0xff so that byte is preserved.
+      scaleOdd = mm256_or_si256(scaleEven, alphaOnly)
+    var
+      evens = mm256_slli_epi16(pixels, 8)
+      odds = mm256_and_si256(pixels, highBytes)
+    # High 16 bits of (channel << 8) * (alpha << 8) is channel * alpha.
+    evens = mm256_mulhi_epu16(evens, scaleEven)
+    odds = mm256_mulhi_epu16(odds, scaleOdd)
+    # Multiply-high by 0x8081 followed by a shift of 7 divides by 255.
+    evens = mm256_srli_epi16(mm256_mulhi_epu16(evens, recip255), 7)
+    odds = mm256_srli_epi16(mm256_mulhi_epu16(odds, recip255), 7)
+    mm256_storeu_si256(
+      data[offset].addr,
+      mm256_or_si256(evens, mm256_slli_epi16(odds, 8))
+    )
+  result = steps * 8
+
+when defined(release):
+  {.pop.}