From 00f2741aa889ccd1f0da005413ab803556c40652 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 12:12:10 -0500
Subject: [PATCH 01/15] combine fuzzing

---
 tests/fuzz_image_draw.nim        | 28 ++++++++++++++++++++++++++++
 tests/fuzz_image_draw_smooth.nim | 31 -------------------------------
 2 files changed, 28 insertions(+), 31 deletions(-)
 delete mode 100644 tests/fuzz_image_draw_smooth.nim

diff --git a/tests/fuzz_image_draw.nim b/tests/fuzz_image_draw.nim
index d8d43b8..1271156 100644
--- a/tests/fuzz_image_draw.nim
+++ b/tests/fuzz_image_draw.nim
@@ -25,3 +25,31 @@ for i in 0 ..< 250:
 
     a.draw(b, translate(vec2(translation.x.trunc, translation.y.trunc)))
     a.draw(b, translate(translation))
+
+for i in 0 ..< 25: # Small-image pass: every side is 1 .. 20 px
+  let a = newImage(rand(1 .. 20), rand(1 .. 20)) # Image that is drawn onto
+  for j in 0 ..< 25:
+    let b = newImage(rand(1 .. 20), rand(1 .. 20)) # Image that gets drawn
+
+    let
+      translation = vec2(rand(25.0), rand(25.0)) - vec2(5, 5) # -5 .. 20
+      rotation = rand(2 * PI).float32 # 0 .. 2 * PI
+
+    echo a, " ", b, " ", translation, " ", rotation # Print this case's inputs
+
+    a.draw(b, translate(vec2(translation.x, translation.y))) # Not truncated, unlike the loop above - TODO confirm
+    a.draw(b, translate(translation) * rotate(rotation)) # Translate and rotate
+
+for i in 0 ..< 25: # Large-image pass: `a` sides 1 .. 2000, `b` sides 1 .. 1000
+  let a = newImage(rand(1 .. 2000), rand(1 .. 2000)) # Image that is drawn onto
+  for j in 0 ..< 25:
+    let b = newImage(rand(1 .. 1000), rand(1 .. 1000)) # Image that gets drawn
+
+    let
+      translation = vec2(rand(2500.0), rand(2500.0)) - vec2(500, 500) # -500 .. 2000
+      rotation = rand(2 * PI).float32 # 0 .. 2 * PI
+
+    echo a, " ", b, " ", translation, " ", rotation # Print this case's inputs
+
+    a.draw(b, translate(vec2(translation.x, translation.y))) # Not truncated, unlike the first loop - TODO confirm
+    a.draw(b, translate(translation) * rotate(rotation)) # Translate and rotate
diff --git a/tests/fuzz_image_draw_smooth.nim b/tests/fuzz_image_draw_smooth.nim
deleted file mode 100644
index 0a80cd8..0000000
--- a/tests/fuzz_image_draw_smooth.nim
+++ /dev/null
@@ -1,31 +0,0 @@
-import pixie, random
-
-randomize()
-
-for i in 0 ..< 25:
-  let a = newImage(rand(1 .. 20), rand(1 .. 20))
-  for j in 0 ..< 25:
-    let b = newImage(rand(1 .. 20), rand(1 .. 20))
-
-    let
-      translation = vec2(rand(25.0), rand(25.0)) - vec2(5, 5)
-      rotation = rand(2 * PI).float32
-
-    echo a, " ", b, " ", translation, " ", rotation
-
-    a.draw(b, translate(vec2(translation.x.trunc, translation.y.trunc)))
-    a.draw(b, translate(translation) * rotate(rotation))
-
-for i in 0 ..< 25:
-  let a = newImage(rand(1 .. 2000), rand(1 .. 2000))
-  for j in 0 ..< 25:
-    let b = newImage(rand(1 .. 1000), rand(1 .. 1000))
-
-    let
-      translation = vec2(rand(2500.0), rand(2500.0)) - vec2(500, 500)
-      rotation = rand(2 * PI).float32
-
-    echo a, " ", b, " ", translation, " ", rotation
-
-    a.draw(b, translate(vec2(translation.x.trunc, translation.y.trunc)))
-    a.draw(b, translate(translation) * rotate(rotation))

From 77b5df9d00862707bad60bd2aaaf466450bfef02 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 12:23:43 -0500
Subject: [PATCH 02/15] morepretty

---
 src/pixie/images.nim | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index ddc1153..68d5f67 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -1,6 +1,6 @@
 import blends, bumpy, chroma, common, internal, simd, vmath
 
-export Image, newImage, copy, dataIndex
+export Image, copy, dataIndex, newImage
 
 const h = 0.5.float32
 
@@ -441,11 +441,15 @@ template getUncheckedArray(
 ): ptr UncheckedArray[ColorRGBX] =
   cast[ptr UncheckedArray[ColorRGBX]](image.data[image.dataIndex(x, y)].addr)
 
-proc blitLine(a, b: ptr UncheckedArray[ColorRGBX], len: int, blender: Blender) {.inline.} =
+proc blitLine(
+  a, b: ptr UncheckedArray[ColorRGBX], len: int, blender: Blender
+) {.inline.} =
   for i in 0 ..< len:
     a[i] = blender(a[i], b[i])
 
-proc blitLineOverwrite(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.inline.} =
+proc blitLineOverwrite(
+  a, b: ptr UncheckedArray[ColorRGBX], len: int
+) {.inline.} =
   copyMem(a[0].addr, b[0].addr, len * 4)
 
 proc blitLineNormal(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} =

From 04fc992dc48eb61c4fa87c9ee694fe035d8f97db Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 12:51:37 -0500
Subject: [PATCH 03/15] simpler for now

---
 src/pixie/paths.nim | 38 --------------------------------------
 1 file changed, 38 deletions(-)

diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim
index d296bbb..7ffabd2 100644
--- a/src/pixie/paths.nim
+++ b/src/pixie/paths.nim
@@ -1588,24 +1588,6 @@ proc fillCoverage(
     image.clearUnsafe(0, y, startX, y)
     image.clearUnsafe(startX + coverages.len, y, image.width, y)
 
-  of SubtractMaskBlend:
-    for x in x ..< startX + coverages.len:
-      let coverage = coverages[x - startX]
-      if coverage == 255 and rgbx.a == 255:
-        image.data[dataIndex] = rgbx(0, 0, 0, 0)
-      elif coverage != 0:
-        let backdrop = image.data[dataIndex]
-        image.data[dataIndex] = blendSubtractMask(backdrop, source(rgbx, coverage))
-      inc dataIndex
-
-  of ExcludeMaskBlend:
-    for x in x ..< startX + coverages.len:
-      let
-        coverage = coverages[x - startX]
-        backdrop = image.data[dataIndex]
-      image.data[dataIndex] = blendExcludeMask(backdrop, source(rgbx, coverage))
-      inc dataIndex
-
   else:
     let blender = blendMode.blender()
     for x in x ..< startX + coverages.len:
@@ -1658,7 +1640,6 @@ proc fillHits(
 
   of MaskBlend:
     {.linearScanEnd.}
-
     var filledTo = startX
     for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width):
       if maskClears: # Clear any gap between this fill and the previous fill
@@ -1684,25 +1665,6 @@ proc fillHits(
       image.clearUnsafe(0, y, startX, y)
       image.clearUnsafe(filledTo, y, image.width, y)
 
-  of SubtractMaskBlend:
-    for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width):
-      var dataIndex = image.dataIndex(start, y)
-      for _ in 0 ..< len:
-        if rgbx.a == 255:
-          image.data[dataIndex] = rgbx(0, 0, 0, 0)
-        else:
-          let backdrop = image.data[dataIndex]
-          image.data[dataIndex] = blendSubtractMask(backdrop, rgbx)
-        inc dataIndex
-
-  of ExcludeMaskBlend:
-    for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width):
-      var dataIndex = image.dataIndex(start, y)
-      for _ in 0 ..< len:
-        let backdrop = image.data[dataIndex]
-        image.data[dataIndex] = blendExcludeMask(backdrop, rgbx)
-        inc dataIndex
-
   else:
     let blender = blendMode.blender()
     for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width):

From dd7bf9f210a53d1982d6c388fbcff55686ad826f Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 13:31:13 -0500
Subject: [PATCH 04/15] blendLine rgbx

---
 src/pixie/images.nim    |  5 ---
 src/pixie/internal.nim  |  5 +++
 src/pixie/paths.nim     | 40 ++++++++-------------
 src/pixie/simd/sse2.nim | 77 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 96 insertions(+), 31 deletions(-)

diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index 68d5f67..475e8f0 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -436,11 +436,6 @@ proc drawCorrect(
         blended = blender(backdrop, sample)
       a.unsafe[x, y] = blended
 
-template getUncheckedArray(
-  image: Image, x, y: int
-): ptr UncheckedArray[ColorRGBX] =
-  cast[ptr UncheckedArray[ColorRGBX]](image.data[image.dataIndex(x, y)].addr)
-
 proc blitLine(
   a, b: ptr UncheckedArray[ColorRGBX], len: int, blender: Blender
 ) {.inline.} =
diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim
index 0120333..a4e9938 100644
--- a/src/pixie/internal.nim
+++ b/src/pixie/internal.nim
@@ -47,6 +47,11 @@ proc intersectsInside*(a, b: Segment, at: var Vec2): bool {.inline.} =
     at = a.at + (t * s1)
     return true
 
+template getUncheckedArray*(
+  image: Image, x, y: int
+): ptr UncheckedArray[ColorRGBX] = # Pixel (x, y) of `image.data` as an unchecked-array pointer
+  cast[ptr UncheckedArray[ColorRGBX]](image.data[image.dataIndex(x, y)].addr)
+
 proc fillUnsafe*(
   data: var seq[ColorRGBX], color: SomeColor, start, len: int
 ) {.hasSimd, raises: [].} =
diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim
index 7ffabd2..2cf7b09 100644
--- a/src/pixie/paths.nim
+++ b/src/pixie/paths.nim
@@ -1597,6 +1597,18 @@ proc fillCoverage(
         image.data[dataIndex] = blender(backdrop, source(rgbx, coverage))
       inc dataIndex
 
+proc blendLineNormal(
+  line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
+) {.hasSimd.} = # blendNormal `rgbx` onto line[0 ..< len], in place
+  for i in 0 ..< len: # Scalar loop; presumably swapped out via hasSimd - confirm
+    line[i] = blendNormal(line[i], rgbx)
+
+proc blendLineMask(
+  line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
+) {.hasSimd.} = # blendMask `rgbx` onto line[0 ..< len], in place
+  for i in 0 ..< len: # Scalar loop; presumably swapped out via hasSimd - confirm
+    line[i] = blendMask(line[i], rgbx)
+
 proc fillHits(
   image: Image,
   rgbx: ColorRGBX,
@@ -1607,19 +1619,6 @@ proc fillHits(
   blendMode: BlendMode,
   maskClears = true
 ) =
-  template simdBlob(image: Image, x: var int, len: int, blendProc: untyped) =
-    when allowSimd:
-      when defined(amd64):
-        var p = cast[uint](image.data[image.dataIndex(x, y)].addr)
-        let
-          iterations = len div 4
-          colorVec = mm_set1_epi32(cast[int32](rgbx))
-        for _ in 0 ..< iterations:
-          let backdrop = mm_loadu_si128(cast[pointer](p))
-          mm_storeu_si128(cast[pointer](p), blendProc(backdrop, colorVec))
-          p += 16
-        x += iterations * 4
-
   case blendMode:
   of OverwriteBlend:
     for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width):
@@ -1630,13 +1629,7 @@ proc fillHits(
       if rgbx.a == 255:
         fillUnsafe(image.data, rgbx, image.dataIndex(start, y), len)
       else:
-        var x = start
-        simdBlob(image, x, len, blendNormalSimd)
-        var dataIndex = image.dataIndex(x, y)
-        for _ in x ..< start + len:
-          let backdrop = image.data[dataIndex]
-          image.data[dataIndex] = blendNormal(backdrop, rgbx)
-          inc dataIndex
+        blendLineNormal(image.getUncheckedArray(start, y), rgbx, len)
 
   of MaskBlend:
     {.linearScanEnd.}
@@ -1653,12 +1646,7 @@ proc fillHits(
           )
       block: # Handle this fill
         if rgbx.a != 255:
-          var x = start
-          simdBlob(image, x, len, blendMaskSimd)
-          var dataIndex = image.dataIndex(x, y)
-          for _ in x ..< start + len:
-            let backdrop = image.data[dataIndex]
-            image.data[dataIndex] = blendMask(backdrop, rgbx)
+          blendLineMask(image.getUncheckedArray(start, y), rgbx, len)
         filledTo = start + len
 
     if maskClears:
diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
index cc77910..1eead85 100644
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -527,6 +527,47 @@ proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
         result.width * 4
       )
 
+proc blendLineNormalSse2*(
+  line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
+) {.simd.} = # blendNormal `rgbx` onto line[0 ..< len], in place
+  var i: int # Scalar steps until line[i] is 16-byte aligned or `len` is reached
+  while i < len and (cast[uint](line[i].addr) and 15) != 0:
+    line[i] = blendNormal(line[i], rgbx)
+    inc i
+
+  let
+    source = mm_set1_epi32(cast[uint32](rgbx))
+    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
+    oddMask = mm_set1_epi16(cast[int16](0xff00))
+    div255 = mm_set1_epi16(cast[int16](0x8081))
+    vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))
+  while i < len - 4:
+    let backdrop = mm_load_si128(line[i].addr)
+    var
+      sourceAlpha = mm_and_si128(source, alphaMask)
+      backdropEven = mm_slli_epi16(backdrop, 8)
+      backdropOdd = mm_and_si128(backdrop, oddMask)
+
+    sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
+
+    let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha)
+
+    backdropEven = mm_mulhi_epu16(backdropEven, multiplier)
+    backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier)
+    backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
+    backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
+
+    let added = mm_add_epi8(
+      source,
+      mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
+    )
+
+    mm_store_si128(line[i].addr, added)
+    i += 4
+
+  for i in i ..< len:
+    line[i] = blendNormal(line[i], rgbx)
+
 proc blitLineNormalSse2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
@@ -576,6 +617,42 @@ proc blitLineNormalSse2*(
   for i in i ..< len:
     a[i] = blendNormal(a[i], b[i])
 
+proc blendLineMaskSse2*(
+  line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
+) {.simd.} = # blendMask `rgbx` onto line[0 ..< len], in place
+  var i: int # Scalar steps until line[i] is 16-byte aligned or `len` is reached
+  while i < len and (cast[uint](line[i].addr) and 15) != 0:
+    line[i] = blendMask(line[i], rgbx)
+    inc i
+
+  let
+    source = mm_set1_epi32(cast[uint32](rgbx))
+    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
+    oddMask = mm_set1_epi16(cast[int16](0xff00))
+    div255 = mm_set1_epi16(cast[int16](0x8081))
+  while i < len - 4:
+    let backdrop = mm_load_si128(line[i].addr)
+    var
+      sourceAlpha = mm_and_si128(source, alphaMask)
+      backdropEven = mm_slli_epi16(backdrop, 8)
+      backdropOdd = mm_and_si128(backdrop, oddMask)
+
+    sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
+
+    backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha)
+    backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha)
+    backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
+    backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
+
+    mm_store_si128(
+      line[i].addr,
+      mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
+    )
+    i += 4
+
+  for i in i ..< len:
+    line[i] = blendMask(line[i], rgbx)
+
 proc blitLineMaskSse2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =

From 6582f7c4ca15de3ab0b7e8ece08b3c7a0ceb8e58 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 13:32:12 -0500
Subject: [PATCH 05/15] rename

---
 src/pixie/images.nim    | 20 ++++++++++----------
 src/pixie/simd/sse2.nim |  4 ++--
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index 475e8f0..f444328 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -436,26 +436,26 @@ proc drawCorrect(
         blended = blender(backdrop, sample)
       a.unsafe[x, y] = blended
 
-proc blitLine(
+proc blendLine(
   a, b: ptr UncheckedArray[ColorRGBX], len: int, blender: Blender
 ) {.inline.} =
   for i in 0 ..< len:
     a[i] = blender(a[i], b[i])
 
-proc blitLineOverwrite(
+proc blendLineOverwrite(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.inline.} =
   copyMem(a[0].addr, b[0].addr, len * 4)
 
-proc blitLineNormal(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} =
+proc blendLineNormal(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} =
   for i in 0 ..< len:
     a[i] = blendNormal(a[i], b[i])
 
-proc blitLineMask(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} =
+proc blendLineMask(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} =
   for i in 0 ..< len:
     a[i] = blendMask(a[i], b[i])
 
-proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) =
+proc blendRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) =
   let
     px = pos.x.int
     py = pos.y.int
@@ -474,14 +474,14 @@ proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) =
   case blendMode:
   of NormalBlend:
     for y in yStart ..< yEnd:
-      blitLineNormal(
+      blendLineNormal(
         a.getUncheckedArray(xStart + px, y + py),
         b.getUncheckedArray(xStart, y),
         xEnd - xStart
       )
   of OverwriteBlend:
     for y in yStart ..< yEnd:
-      blitLineOverwrite(
+      blendLineOverwrite(
         a.getUncheckedArray(xStart + px, y + py),
         b.getUncheckedArray(xStart, y),
         xEnd - xStart
@@ -493,7 +493,7 @@ proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) =
     for y in yStart ..< yEnd:
       if xStart + px > 0:
         zeroMem(a.data[a.dataIndex(0, y + py)].addr, (xStart + px) * 4)
-      blitLineMask(
+      blendLineMask(
         a.getUncheckedArray(xStart + px, y + py),
         b.getUncheckedArray(xStart, y),
         xEnd - xStart
@@ -511,7 +511,7 @@ proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) =
   else:
     let blender = blendMode.blender()
     for y in yStart ..< yEnd:
-      blitLine(
+      blendLine(
         a.getUncheckedArray(xStart + px, y + py),
         b.getUncheckedArray(xStart, y),
         xEnd - xStart,
@@ -559,7 +559,7 @@ proc draw*(
   if hasRotationOrScaling or smooth:
     a.drawCorrect(b, inverseTransform.inverse(), blendMode, false)
   else:
-    a.blitRect(b, ivec2(transform[2, 0].int32, transform[2, 1].int32), blendMode)
+    a.blendRect(b, ivec2(transform[2, 0].int32, transform[2, 1].int32), blendMode)
 
 proc drawTiled*(
   dst, src: Image, mat: Mat3, blendMode = NormalBlend
diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
index 1eead85..d611962 100644
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -568,7 +568,7 @@ proc blendLineNormalSse2*(
   for i in i ..< len:
     line[i] = blendNormal(line[i], rgbx)
 
-proc blitLineNormalSse2*(
+proc blendLineNormalSse2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
   var i: int
@@ -653,7 +653,7 @@ proc blendLineMaskSse2*(
   for i in i ..< len:
     line[i] = blendMask(line[i], rgbx)
 
-proc blitLineMaskSse2*(
+proc blendLineMaskSse2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
   var i: int

From a56fba39a7ae09eab370ab1da097911407c85769 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 13:47:43 -0500
Subject: [PATCH 06/15] avx2 versions

---
 src/pixie/simd/avx2.nim | 87 ++++++++++++++++++++++++++++++++++++++++-
 src/pixie/simd/sse2.nim |  2 -
 2 files changed, 85 insertions(+), 4 deletions(-)

diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim
index 9375075..f4f4ecc 100644
--- a/src/pixie/simd/avx2.nim
+++ b/src/pixie/simd/avx2.nim
@@ -380,6 +380,51 @@ proc minifyBy2Avx2*(image: Image, power = 1): Image {.simd.} =
     # Set src as this result for if we do another power
     src = result
 
+proc blendLineNormalAvx2*(
+  line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
+) {.simd.} = # blendNormal `rgbx` onto line[0 ..< len], in place
+  var i: int # Scalar steps until line[i] is 32-byte aligned or `len` is reached
+  while i < len and (cast[uint](line[i].addr) and 31) != 0:
+    line[i] = blendNormal(line[i], rgbx)
+    inc i
+
+  let
+    source = mm256_set1_epi32(cast[uint32](rgbx))
+    alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
+    oddMask = mm256_set1_epi16(cast[int16](0xff00))
+    div255 = mm256_set1_epi16(cast[int16](0x8081))
+    vecAlpha255 = mm256_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))
+    shuffleControl = mm256_set_epi8(
+      15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1,
+      15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1
+    )
+  while i < len - 8:
+    let backdrop = mm256_load_si256(line[i].addr)
+    var
+      sourceAlpha = mm256_and_si256(source, alphaMask)
+      backdropEven = mm256_slli_epi16(backdrop, 8)
+      backdropOdd = mm256_and_si256(backdrop, oddMask)
+
+    sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
+
+    let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha)
+
+    backdropEven = mm256_mulhi_epu16(backdropEven, multiplier)
+    backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier)
+    backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
+    backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
+
+    let added = mm256_add_epi8(
+      source,
+      mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
+    )
+
+    mm256_store_si256(line[i].addr, added)
+    i += 8
+
+  for i in i ..< len:
+    line[i] = blendNormal(line[i], rgbx)
+
 proc blitLineNormalAvx2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
@@ -406,7 +451,6 @@ proc blitLineNormalAvx2*(
       mm256_storeu_si256(a[i].addr, source)
     else:
       let backdrop = mm256_load_si256(a[i].addr)
-
       var
         sourceAlpha = mm256_and_si256(source, alphaMask)
         backdropEven = mm256_slli_epi16(backdrop, 8)
@@ -433,6 +477,46 @@ proc blitLineNormalAvx2*(
   for i in i ..< len:
     a[i] = blendNormal(a[i], b[i])
 
+proc blendLineMaskAvx2*(
+  line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
+) {.simd.} = # blendMask `rgbx` onto line[0 ..< len], in place
+  var i: int # Scalar steps until line[i] is 32-byte aligned or `len` is reached
+  while i < len and (cast[uint](line[i].addr) and 31) != 0:
+    line[i] = blendMask(line[i], rgbx)
+    inc i
+
+  let
+    source = mm256_set1_epi32(cast[uint32](rgbx))
+    alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
+    oddMask = mm256_set1_epi16(cast[int16](0xff00))
+    div255 = mm256_set1_epi16(cast[int16](0x8081))
+    shuffleControl = mm256_set_epi8(
+      15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1,
+      15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1
+    )
+  while i < len - 8:
+    let backdrop = mm256_load_si256(line[i].addr)
+    var
+      sourceAlpha = mm256_and_si256(source, alphaMask)
+      backdropEven = mm256_slli_epi16(backdrop, 8)
+      backdropOdd = mm256_and_si256(backdrop, oddMask)
+
+    sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
+
+    backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha)
+    backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha)
+    backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
+    backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
+
+    mm256_store_si256(
+      line[i].addr,
+      mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
+    )
+    i += 8
+
+  for i in i ..< len:
+    line[i] = blendMask(line[i], rgbx)
+
 proc blitLineMaskAvx2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
@@ -458,7 +542,6 @@ proc blitLineMaskAvx2*(
       discard
     else:
       let backdrop = mm256_load_si256(a[i].addr)
-
       var
         sourceAlpha = mm256_and_si256(source, alphaMask)
         backdropEven = mm256_slli_epi16(backdrop, 8)
diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
index d611962..f2913ff 100644
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -590,7 +590,6 @@ proc blendLineNormalSse2*(
       mm_storeu_si128(a[i].addr, source)
     else:
       let backdrop = mm_load_si128(a[i].addr)
-
       var
         sourceAlpha = mm_and_si128(source, alphaMask)
         backdropEven = mm_slli_epi16(backdrop, 8)
@@ -674,7 +673,6 @@ proc blendLineMaskSse2*(
       discard
     else:
       let backdrop = mm_load_si128(a[i].addr)
-
       var
         sourceAlpha = mm_and_si128(source, alphaMask)
         backdropEven = mm_slli_epi16(backdrop, 8)

From 2a2dd4b23119127c3c91d29651e0ea92d370ad02 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 13:51:30 -0500
Subject: [PATCH 07/15] rename

---
 src/pixie/simd/avx2.nim | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim
index f4f4ecc..a60af01 100644
--- a/src/pixie/simd/avx2.nim
+++ b/src/pixie/simd/avx2.nim
@@ -425,7 +425,7 @@ proc blendLineNormalAvx2*(
   for i in i ..< len:
     line[i] = blendNormal(line[i], rgbx)
 
-proc blitLineNormalAvx2*(
+proc blendLineNormalAvx2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
   var i: int
@@ -517,7 +517,7 @@ proc blendLineMaskAvx2*(
   for i in i ..< len:
     line[i] = blendMask(line[i], rgbx)
 
-proc blitLineMaskAvx2*(
+proc blendLineMaskAvx2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
   var i: int

From e0cb5c2b1191be66b5d965506cef4acea548ef45 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 13:54:09 -0500
Subject: [PATCH 08/15] f

---
 src/pixie/paths.nim | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim
index 2cf7b09..814a6ad 100644
--- a/src/pixie/paths.nim
+++ b/src/pixie/paths.nim
@@ -1550,7 +1550,6 @@ proc fillCoverage(
 
   of MaskBlend:
     {.linearScanEnd.}
-
     when allowSimd:
       when defined(amd64):
         for (coverageVec, allZeroes, all255) in simd(coverages, x, startX):

From 24b36b077e49e6b4bf4f6ec24314c77c66fd1793 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 15:08:27 -0500
Subject: [PATCH 09/15] tmp

---
 src/pixie/common.nim    |  13 ++
 src/pixie/paths.nim     | 197 +++++++++--------------------
 src/pixie/simd/avx2.nim | 113 ++++++-----------
 src/pixie/simd/sse2.nim | 274 +++++++++++++++++++++++++---------------
 4 files changed, 287 insertions(+), 310 deletions(-)

diff --git a/src/pixie/common.nim b/src/pixie/common.nim
index 902d55f..b8da007 100644
--- a/src/pixie/common.nim
+++ b/src/pixie/common.nim
@@ -76,6 +76,19 @@ proc `*`*(color: ColorRGBX, opacity: float32): ColorRGBX {.raises: [].} =
       a = ((color.a * x + 127) div 255).uint8
     rgbx(r, g, b, a)
 
+proc `*`*(rgbx: ColorRGBX, coverage: uint8): ColorRGBX {.inline.} = # Scale every channel by coverage / 255
+  if coverage == 0:
+    discard # result keeps its zero default
+  elif coverage == 255:
+    result = rgbx # Full coverage: color returned unchanged
+  else: # Each channel becomes (channel * coverage + 127) div 255
+    result = rgbx(
+      ((rgbx.r.uint32 * coverage + 127) div 255).uint8,
+      ((rgbx.g.uint32 * coverage + 127) div 255).uint8,
+      ((rgbx.b.uint32 * coverage + 127) div 255).uint8,
+      ((rgbx.a.uint32 * coverage + 127) div 255).uint8
+    )
+
 proc snapToPixels*(rect: Rect): Rect {.raises: [].} =
   let
     xMin = rect.x
diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim
index 814a6ad..7c9bf0c 100644
--- a/src/pixie/paths.nim
+++ b/src/pixie/paths.nim
@@ -1429,6 +1429,47 @@ proc clearUnsafe(image: Image, startX, startY, toX, toY: int) =
     len = image.dataIndex(toX, toY) - start
   fillUnsafe(image.data, rgbx(0, 0, 0, 0), start, len)
 
+proc blendLineCoverageOverwrite(
+  line: ptr UncheckedArray[ColorRGBX],
+  coverages: ptr UncheckedArray[uint8],
+  rgbx: ColorRGBX,
+  len: int
+ ) {.hasSimd.} = # Overwrite line[i] with `rgbx` scaled by coverages[i]
+  for i in 0 ..< len:
+    let coverage = coverages[i]
+    if coverage != 0: # Zero coverage leaves the pixel untouched
+      line[i] = rgbx * coverage
+
+proc blendLineCoverageNormal(
+  line: ptr UncheckedArray[ColorRGBX],
+  coverages: ptr UncheckedArray[uint8],
+  rgbx: ColorRGBX,
+  len: int
+) {.hasSimd.} = # blendNormal coverage-scaled `rgbx` onto line[0 ..< len]
+  for i in 0 ..< len:
+    let coverage = coverages[i]
+    if coverage == 255 and rgbx.a == 255: # Fully covered by an opaque color
+      line[i] = rgbx # Plain store instead of a blend
+    elif coverage == 0:
+      discard # Pixel left as it is
+    else:
+      line[i] = blendNormal(line[i], rgbx * coverage)
+
+proc blendLineCoverageMask(
+  line: ptr UncheckedArray[ColorRGBX],
+  coverages: ptr UncheckedArray[uint8],
+  rgbx: ColorRGBX,
+  len: int
+) {.hasSimd.} = # blendMask coverage-scaled `rgbx` onto line[0 ..< len]
+  for i in 0 ..< len:
+    let coverage = coverages[i]
+    if coverage == 0:
+      line[i] = rgbx(0, 0, 0, 0) # Uncovered pixels are cleared
+    elif coverage == 255:
+      discard # Pixel kept; NOTE(review): rgbx.a is not consulted here - confirm
+    else:
+      line[i] = blendMask(line[i], rgbx * coverage)
+
 proc fillCoverage(
   image: Image,
   rgbx: ColorRGBX,
@@ -1440,149 +1481,31 @@ proc fillCoverage(
     x = startX
     dataIndex = image.dataIndex(x, y)
 
-  when allowSimd:
-    when defined(amd64):
-      iterator simd(
-        coverages: seq[uint8], x: var int, startX: int
-      ): (M128i, bool, bool) =
-        for _ in 0 ..< coverages.len div 16:
-          let
-            coverageVec = mm_loadu_si128(coverages[x - startX].unsafeAddr)
-            eqZero = mm_cmpeq_epi8(coverageVec, mm_setzero_si128())
-            eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(255))
-            allZeroes = mm_movemask_epi8(eqZero) == 0xffff
-            all255 = mm_movemask_epi8(eq255) == 0xffff
-          yield (coverageVec, allZeroes, all255)
-          x += 16
-
-      proc source(colorVec, coverageVec: M128i): M128i {.inline.} =
-        let
-          oddMask = mm_set1_epi16(0xff00)
-          div255 = mm_set1_epi16(0x8081)
-
-        var unpacked = unpackAlphaValues(coverageVec)
-        unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
-
-        var
-          sourceEven = mm_slli_epi16(colorVec, 8)
-          sourceOdd = mm_and_si128(colorVec, oddMask)
-        sourceEven = mm_mulhi_epu16(sourceEven, unpacked)
-        sourceOdd = mm_mulhi_epu16(sourceOdd, unpacked)
-        sourceEven = mm_srli_epi16(mm_mulhi_epu16(sourceEven, div255), 7)
-        sourceOdd = mm_srli_epi16(mm_mulhi_epu16(sourceOdd, div255), 7)
-        result = mm_or_si128(sourceEven, mm_slli_epi16(sourceOdd, 8))
-
-      let colorVec = mm_set1_epi32(cast[int32](rgbx))
-
-  proc source(rgbx: ColorRGBX, coverage: uint8): ColorRGBX {.inline.} =
-    if coverage == 0:
-      discard
-    elif coverage == 255:
-      result = rgbx
-    else:
-      result = rgbx(
-        ((rgbx.r.uint32 * coverage) div 255).uint8,
-        ((rgbx.g.uint32 * coverage) div 255).uint8,
-        ((rgbx.b.uint32 * coverage) div 255).uint8,
-        ((rgbx.a.uint32 * coverage) div 255).uint8
-      )
-
   case blendMode:
   of OverwriteBlend:
-    when allowSimd:
-      when defined(amd64):
-        for (coverageVec, allZeroes, all255) in simd(coverages, x, startX):
-          if allZeroes:
-            dataIndex += 16
-          else:
-            if all255:
-              for i in 0 ..< 4:
-                mm_storeu_si128(image.data[dataIndex].addr, colorVec)
-                dataIndex += 4
-            else:
-              var coverageVec = coverageVec
-              for i in 0 ..< 4:
-                let source = source(colorVec, coverageVec)
-                mm_storeu_si128(image.data[dataIndex].addr, source)
-                coverageVec = mm_srli_si128(coverageVec, 4)
-                dataIndex += 4
-
-    for x in x ..< startX + coverages.len:
-      let coverage = coverages[x - startX]
-      if coverage != 0:
-        image.data[dataIndex] = source(rgbx, coverage)
-      inc dataIndex
+    blendLineCoverageOverwrite(
+      image.getUncheckedArray(startX, y),
+      cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr),
+      rgbx,
+      coverages.len
+    )
 
   of NormalBlend:
-    when allowSimd:
-      when defined(amd64):
-        for (coverageVec, allZeroes, all255) in simd(coverages, x, startX):
-          if allZeroes:
-            dataIndex += 16
-          else:
-            if all255 and rgbx.a == 255:
-              for i in 0 ..< 4:
-                mm_storeu_si128(image.data[dataIndex].addr, colorVec)
-                dataIndex += 4
-            else:
-              var coverageVec = coverageVec
-              for i in 0 ..< 4:
-                let
-                  backdrop = mm_loadu_si128(image.data[dataIndex].addr)
-                  source = source(colorVec, coverageVec)
-                mm_storeu_si128(
-                  image.data[dataIndex].addr,
-                  blendNormalSimd(backdrop, source)
-                )
-                coverageVec = mm_srli_si128(coverageVec, 4)
-                dataIndex += 4
-
-    for x in x ..< startX + coverages.len:
-      let coverage = coverages[x - startX]
-      if coverage == 255 and rgbx.a == 255:
-        image.data[dataIndex] = rgbx
-      elif coverage == 0:
-        discard
-      else:
-        let backdrop = image.data[dataIndex]
-        image.data[dataIndex] = blendNormal(backdrop, source(rgbx, coverage))
-      inc dataIndex
+    blendLineCoverageNormal(
+      image.getUncheckedArray(startX, y),
+      cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr),
+      rgbx,
+      coverages.len
+    )
 
   of MaskBlend:
     {.linearScanEnd.}
-    when allowSimd:
-      when defined(amd64):
-        for (coverageVec, allZeroes, all255) in simd(coverages, x, startX):
-          if not allZeroes:
-            if all255:
-              dataIndex += 16
-            else:
-              var coverageVec = coverageVec
-              for i in 0 ..< 4:
-                let
-                  backdrop = mm_loadu_si128(image.data[dataIndex].addr)
-                  source = source(colorVec, coverageVec)
-                mm_storeu_si128(
-                  image.data[dataIndex].addr,
-                  blendMaskSimd(backdrop, source)
-                )
-                coverageVec = mm_srli_si128(coverageVec, 4)
-                dataIndex += 4
-          else:
-            for i in 0 ..< 4:
-              mm_storeu_si128(image.data[dataIndex].addr, mm_setzero_si128())
-              dataIndex += 4
-
-    for x in x ..< startX + coverages.len:
-      let coverage = coverages[x - startX]
-      if coverage == 0:
-        image.data[dataIndex] = rgbx(0, 0, 0, 0)
-      elif coverage == 255:
-        discard
-      else:
-        let backdrop = image.data[dataIndex]
-        image.data[dataIndex] = blendMask(backdrop, source(rgbx, coverage))
-      inc dataIndex
+    blendLineCoverageMask(
+      image.getUncheckedArray(startX, y),
+      cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr),
+      rgbx,
+      coverages.len
+    )
 
     image.clearUnsafe(0, y, startX, y)
     image.clearUnsafe(startX + coverages.len, y, image.width, y)
@@ -1593,7 +1516,7 @@ proc fillCoverage(
       let coverage = coverages[x - startX]
       if coverage != 0:
         let backdrop = image.data[dataIndex]
-        image.data[dataIndex] = blender(backdrop, source(rgbx, coverage))
+        image.data[dataIndex] = blender(backdrop, rgbx * coverage)
       inc dataIndex
 
 proc blendLineNormal(
diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim
index a60af01..4ddc87d 100644
--- a/src/pixie/simd/avx2.nim
+++ b/src/pixie/simd/avx2.nim
@@ -6,6 +6,41 @@ when defined(gcc) or defined(clang):
 when defined(release):
   {.push checks: off.}
 
+template blendNormalSimd(backdrop, source: M256i): M256i =
+  var
+    sourceAlpha = mm256_and_si256(source, alphaMask)
+    backdropEven = mm256_slli_epi16(backdrop, 8)
+    backdropOdd = mm256_and_si256(backdrop, oddMask)
+
+  sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
+
+  let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha)
+
+  backdropEven = mm256_mulhi_epu16(backdropEven, multiplier)
+  backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier)
+  backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
+  backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
+
+  mm256_add_epi8(
+    source,
+    mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
+  )
+
+template blendMaskSimd(backdrop, source: M256i): M256i =
+  var
+    sourceAlpha = mm256_and_si256(source, alphaMask)
+    backdropEven = mm256_slli_epi16(backdrop, 8)
+    backdropOdd = mm256_and_si256(backdrop, oddMask)
+
+  sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
+
+  backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha)
+  backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha)
+  backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
+  backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
+
+  mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
+
 proc isOneColorAvx2*(image: Image): bool {.simd.} =
   result = true
 
@@ -400,26 +435,7 @@ proc blendLineNormalAvx2*(
     )
   while i < len - 8:
     let backdrop = mm256_load_si256(line[i].addr)
-    var
-      sourceAlpha = mm256_and_si256(source, alphaMask)
-      backdropEven = mm256_slli_epi16(backdrop, 8)
-      backdropOdd = mm256_and_si256(backdrop, oddMask)
-
-    sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
-
-    let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha)
-
-    backdropEven = mm256_mulhi_epu16(backdropEven, multiplier)
-    backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier)
-    backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
-    backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
-
-    let added = mm256_add_epi8(
-      source,
-      mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
-    )
-
-    mm256_store_si256(line[i].addr, added)
+    mm256_store_si256(line[i].addr, blendNormalSimd(backdrop, source))
     i += 8
 
   for i in i ..< len:
@@ -451,27 +467,7 @@ proc blendLineNormalAvx2*(
       mm256_storeu_si256(a[i].addr, source)
     else:
       let backdrop = mm256_load_si256(a[i].addr)
-      var
-        sourceAlpha = mm256_and_si256(source, alphaMask)
-        backdropEven = mm256_slli_epi16(backdrop, 8)
-        backdropOdd = mm256_and_si256(backdrop, oddMask)
-
-      sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
-
-      let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha)
-
-      backdropEven = mm256_mulhi_epu16(backdropEven, multiplier)
-      backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier)
-      backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
-      backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
-
-      let added = mm256_add_epi8(
-        source,
-        mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
-      )
-
-      mm256_store_si256(a[i].addr, added)
-
+      mm256_store_si256(a[i].addr, blendNormalSimd(backdrop, source))
     i += 8
 
   for i in i ..< len:
@@ -496,22 +492,7 @@ proc blendLineMaskAvx2*(
     )
   while i < len - 8:
     let backdrop = mm256_load_si256(line[i].addr)
-    var
-      sourceAlpha = mm256_and_si256(source, alphaMask)
-      backdropEven = mm256_slli_epi16(backdrop, 8)
-      backdropOdd = mm256_and_si256(backdrop, oddMask)
-
-    sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
-
-    backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha)
-    backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha)
-    backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
-    backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
-
-    mm256_store_si256(
-      line[i].addr,
-      mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
-    )
+    mm256_store_si256(line[i].addr, blendMaskSimd(backdrop, source))
     i += 8
 
   for i in i ..< len:
@@ -542,23 +523,7 @@ proc blendLineMaskAvx2*(
       discard
     else:
       let backdrop = mm256_load_si256(a[i].addr)
-      var
-        sourceAlpha = mm256_and_si256(source, alphaMask)
-        backdropEven = mm256_slli_epi16(backdrop, 8)
-        backdropOdd = mm256_and_si256(backdrop, oddMask)
-
-      sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
-
-      backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha)
-      backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha)
-      backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
-      backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
-
-      mm256_store_si256(
-        a[i].addr,
-        mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
-      )
-
+      mm256_store_si256(a[i].addr, blendMaskSimd(backdrop, source))
     i += 8
 
   for i in i ..< len:
diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
index f2913ff..87b4bce 100644
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -10,17 +10,7 @@ proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
   finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
   cast[ColorRGBX](mm_cvtsi128_si32(finalColor))
 
-proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
-  ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value).
-  result = mm_unpacklo_epi8(mm_setzero_si128(), v)
-  result = mm_unpacklo_epi8(mm_setzero_si128(), result)
-
-proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} =
-  let
-    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
-    oddMask = mm_set1_epi16(cast[int16](0xff00))
-    div255 = mm_set1_epi16(cast[int16](0x8081))
-
+template blendNormalSimd*(backdrop, source: M128i): M128i =
   var
     sourceAlpha = mm_and_si128(source, alphaMask)
     backdropEven = mm_slli_epi16(backdrop, 8)
@@ -28,14 +18,10 @@ proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} =
 
   sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
 
-  let k = mm_sub_epi32(
-    mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])),
-    sourceAlpha
-  )
-
-  backdropEven = mm_mulhi_epu16(backdropEven, k)
-  backdropOdd = mm_mulhi_epu16(backdropOdd, k)
+  let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha)
 
+  backdropEven = mm_mulhi_epu16(backdropEven, multiplier)
+  backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier)
   backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
   backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
 
@@ -44,12 +30,7 @@ proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} =
     mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
   )
 
-proc blendMaskSimd*(backdrop, source: M128i): M128i {.inline.} =
-  let
-    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
-    oddMask = mm_set1_epi16(cast[int16](0xff00))
-    div255 = mm_set1_epi16(cast[int16](0x8081))
-
+template blendMaskSimd*(backdrop, source: M128i): M128i =
   var
     sourceAlpha = mm_and_si128(source, alphaMask)
     backdropEven = mm_slli_epi16(backdrop, 8)
@@ -59,7 +40,6 @@ proc blendMaskSimd*(backdrop, source: M128i): M128i {.inline.} =
 
   backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha)
   backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha)
-
   backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
   backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
 
@@ -527,6 +507,67 @@ proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
         result.width * 4
       )
 
+proc applyCoverage*(rgbxVec, coverage: M128i): M128i {.inline.} =
+
+  proc unpackAlphaValues(v: M128i): M128i {.inline.} =
+    ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value).
+    result = mm_unpacklo_epi8(mm_setzero_si128(), v)
+    result = mm_unpacklo_epi8(mm_setzero_si128(), result)
+
+  let
+    oddMask = mm_set1_epi16(0xff00)
+    div255 = mm_set1_epi16(0x8081)
+
+  var unpacked = unpackAlphaValues(coverage)
+  unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
+
+  var
+    rgbxEven = mm_slli_epi16(rgbxVec, 8)
+    rgbxOdd = mm_and_si128(rgbxVec, oddMask)
+  rgbxEven = mm_mulhi_epu16(rgbxEven, unpacked)
+  rgbxOdd = mm_mulhi_epu16(rgbxOdd, unpacked)
+  rgbxEven = mm_srli_epi16(mm_mulhi_epu16(rgbxEven, div255), 7)
+  rgbxOdd = mm_srli_epi16(mm_mulhi_epu16(rgbxOdd, div255), 7)
+
+  mm_or_si128(rgbxEven, mm_slli_epi16(rgbxOdd, 8))
+
+proc blendLineCoverageOverwriteSse2*(
+  line: ptr UncheckedArray[ColorRGBX],
+  coverages: ptr UncheckedArray[uint8],
+  rgbx: ColorRGBX,
+  len: int
+ ) {.simd.} =
+  var i: int
+  while (cast[uint](line[i].addr) and 15) != 0:
+    let coverage = coverages[i]
+    if coverage != 0:
+      line[i] = rgbx * coverage
+    inc i
+
+  let rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
+  while i < len - 16:
+    let
+      coverage = mm_loadu_si128(coverages[i].addr)
+      eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128())
+      eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255))
+    if mm_movemask_epi8(eqZero) == 0xffff:
+      i += 16
+    elif mm_movemask_epi8(eq255) == 0xffff:
+      for _ in 0 ..< 4:
+        mm_store_si128(line[i].addr, rgbxVec)
+        i += 4
+    else:
+      var coverage = coverage
+      for _ in 0 ..< 4:
+        mm_storeu_si128(line[i].addr, rgbxVec.applyCoverage(coverage))
+        coverage = mm_srli_si128(coverage, 4)
+        i += 4
+
+  for i in i ..< len:
+    let coverage = coverages[i]
+    if coverage != 0:
+      line[i] = rgbx * coverage
+
 proc blendLineNormalSse2*(
   line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
 ) {.simd.} =
@@ -543,26 +584,7 @@ proc blendLineNormalSse2*(
     vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))
   while i < len - 4:
     let backdrop = mm_load_si128(line[i].addr)
-    var
-      sourceAlpha = mm_and_si128(source, alphaMask)
-      backdropEven = mm_slli_epi16(backdrop, 8)
-      backdropOdd = mm_and_si128(backdrop, oddMask)
-
-    sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
-
-    let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha)
-
-    backdropEven = mm_mulhi_epu16(backdropEven, multiplier)
-    backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier)
-    backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
-    backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
-
-    let added = mm_add_epi8(
-      source,
-      mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
-    )
-
-    mm_store_si128(line[i].addr, added)
+    mm_store_si128(line[i].addr, blendNormalSimd(backdrop, source))
     i += 4
 
   for i in i ..< len:
@@ -590,32 +612,65 @@ proc blendLineNormalSse2*(
       mm_storeu_si128(a[i].addr, source)
     else:
       let backdrop = mm_load_si128(a[i].addr)
-      var
-        sourceAlpha = mm_and_si128(source, alphaMask)
-        backdropEven = mm_slli_epi16(backdrop, 8)
-        backdropOdd = mm_and_si128(backdrop, oddMask)
-
-      sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
-
-      let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha)
-
-      backdropEven = mm_mulhi_epu16(backdropEven, multiplier)
-      backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier)
-      backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
-      backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
-
-      let added = mm_add_epi8(
-        source,
-        mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
-      )
-
-      mm_store_si128(a[i].addr, added)
-
+      mm_store_si128(a[i].addr, blendNormalSimd(backdrop, source))
     i += 4
 
   for i in i ..< len:
     a[i] = blendNormal(a[i], b[i])
 
+proc blendLineCoverageNormalSse2*(
+  line: ptr UncheckedArray[ColorRGBX],
+  coverages: ptr UncheckedArray[uint8],
+  rgbx: ColorRGBX,
+  len: int
+) {.simd.} =
+  var i: int
+  while (cast[uint](line[i].addr) and 15) != 0:
+    let coverage = coverages[i]
+    if coverage == 255 and rgbx.a == 255:
+      line[i] = rgbx
+    elif coverage == 0:
+      discard
+    else:
+      line[i] = blendNormal(line[i], rgbx * coverage)
+    inc i
+
+  let
+    rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
+    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
+    oddMask = mm_set1_epi16(cast[int16](0xff00))
+    div255 = mm_set1_epi16(cast[int16](0x8081))
+    vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))
+  while i < len - 16:
+    let
+      coverage = mm_loadu_si128(coverages[i].addr)
+      eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128())
+      eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255))
+    if mm_movemask_epi8(eqZero) == 0xffff:
+      i += 16
+    elif mm_movemask_epi8(eq255) == 0xffff and rgbx.a == 255:
+      for _ in 0 ..< 4:
+        mm_store_si128(line[i].addr, rgbxVec)
+        i += 4
+    else:
+      var coverage = coverage
+      for _ in 0 ..< 4:
+        let
+          backdrop = mm_loadu_si128(line[i].addr)
+          source = rgbxVec.applyCoverage(coverage)
+        mm_storeu_si128(line[i].addr, blendNormalSimd(backdrop, source))
+        coverage = mm_srli_si128(coverage, 4)
+        i += 4
+
+  for i in i ..< len:
+    let coverage = coverages[i]
+    if coverage == 255 and rgbx.a == 255:
+      line[i] = rgbx
+    elif coverage == 0:
+      discard
+    else:
+      line[i] = blendNormal(line[i], rgbx * coverage)
+
 proc blendLineMaskSse2*(
   line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
 ) {.simd.} =
@@ -631,22 +686,7 @@ proc blendLineMaskSse2*(
     div255 = mm_set1_epi16(cast[int16](0x8081))
   while i < len - 4:
     let backdrop = mm_load_si128(line[i].addr)
-    var
-      sourceAlpha = mm_and_si128(source, alphaMask)
-      backdropEven = mm_slli_epi16(backdrop, 8)
-      backdropOdd = mm_and_si128(backdrop, oddMask)
-
-    sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
-
-    backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha)
-    backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha)
-    backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
-    backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
-
-    mm_store_si128(
-      line[i].addr,
-      mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
-    )
+    mm_store_si128(line[i].addr, blendMaskSimd(backdrop, source))
     i += 4
 
   for i in i ..< len:
@@ -673,27 +713,63 @@ proc blendLineMaskSse2*(
       discard
     else:
       let backdrop = mm_load_si128(a[i].addr)
-      var
-        sourceAlpha = mm_and_si128(source, alphaMask)
-        backdropEven = mm_slli_epi16(backdrop, 8)
-        backdropOdd = mm_and_si128(backdrop, oddMask)
-
-      sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
-
-      backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha)
-      backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha)
-      backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
-      backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
-
-      mm_store_si128(
-        a[i].addr,
-        mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
-      )
-
+      mm_store_si128(a[i].addr, blendMaskSimd(backdrop, source))
     i += 4
 
   for i in i ..< len:
     a[i] = blendMask(a[i], b[i])
 
+proc blendLineCoverageMaskSse2*(
+  line: ptr UncheckedArray[ColorRGBX],
+  coverages: ptr UncheckedArray[uint8],
+  rgbx: ColorRGBX,
+  len: int
+) {.simd.} =
+  var i: int
+  while (cast[uint](line[i].addr) and 15) != 0:
+    let coverage = coverages[i]
+    if coverage == 0:
+      line[i] = rgbx(0, 0, 0, 0)
+    elif coverage == 255:
+      discard
+    else:
+      line[i] = blendMask(line[i], rgbx * coverage)
+    inc i
+
+  let
+    rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
+    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
+    oddMask = mm_set1_epi16(cast[int16](0xff00))
+    div255 = mm_set1_epi16(cast[int16](0x8081))
+  while i < len - 16:
+    let
+      coverage = mm_loadu_si128(coverages[i].addr)
+      eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128())
+      eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255))
+    if mm_movemask_epi8(eqZero) == 0xffff:
+      for _ in 0 ..< 4:
+        mm_store_si128(line[i].addr, mm_setzero_si128())
+        i += 4
+    elif mm_movemask_epi8(eq255) == 0xffff and rgbx.a == 255:
+      i += 16
+    else:
+      var coverage = coverage
+      for _ in 0 ..< 4:
+        let
+          backdrop = mm_loadu_si128(line[i].addr)
+          source = rgbxVec.applyCoverage(coverage)
+        mm_storeu_si128(line[i].addr, blendMaskSimd(backdrop, source))
+        coverage = mm_srli_si128(coverage, 4)
+        i += 4
+
+  for i in i ..< len:
+    let coverage = coverages[i]
+    if coverage == 0:
+      line[i] = rgbx(0, 0, 0, 0)
+    elif coverage == 255:
+      discard
+    else:
+      line[i] = blendMask(line[i], rgbx * coverage)
+
 when defined(release):
   {.pop.}

From a92e289e366e369707b01fa5c430ca4692dbf6da Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 15:15:48 -0500
Subject: [PATCH 10/15] hoist zero/255 comparison vectors out of SSE2 coverage loops

---
 src/pixie/simd/sse2.nim | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
index 87b4bce..966efbf 100644
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -544,12 +544,15 @@ proc blendLineCoverageOverwriteSse2*(
       line[i] = rgbx * coverage
     inc i
 
-  let rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
+  let
+    rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
+    vecZero = mm_setzero_si128()
+    vec255 = mm_set1_epi8(255)
   while i < len - 16:
     let
       coverage = mm_loadu_si128(coverages[i].addr)
-      eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128())
-      eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255))
+      eqZero = mm_cmpeq_epi8(coverage, vecZero)
+      eq255 = mm_cmpeq_epi8(coverage, vec255)
     if mm_movemask_epi8(eqZero) == 0xffff:
       i += 16
     elif mm_movemask_epi8(eq255) == 0xffff:
@@ -637,6 +640,8 @@ proc blendLineCoverageNormalSse2*(
 
   let
     rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
+    vecZero = mm_setzero_si128()
+    vec255 = mm_set1_epi8(255)
     alphaMask = mm_set1_epi32(cast[int32](0xff000000))
     oddMask = mm_set1_epi16(cast[int16](0xff00))
     div255 = mm_set1_epi16(cast[int16](0x8081))
@@ -644,8 +649,8 @@ proc blendLineCoverageNormalSse2*(
   while i < len - 16:
     let
       coverage = mm_loadu_si128(coverages[i].addr)
-      eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128())
-      eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255))
+      eqZero = mm_cmpeq_epi8(coverage, vecZero)
+      eq255 = mm_cmpeq_epi8(coverage, vec255)
     if mm_movemask_epi8(eqZero) == 0xffff:
       i += 16
     elif mm_movemask_epi8(eq255) == 0xffff and rgbx.a == 255:
@@ -738,17 +743,19 @@ proc blendLineCoverageMaskSse2*(
 
   let
     rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
+    vecZero = mm_setzero_si128()
+    vec255 = mm_set1_epi8(255)
     alphaMask = mm_set1_epi32(cast[int32](0xff000000))
     oddMask = mm_set1_epi16(cast[int16](0xff00))
     div255 = mm_set1_epi16(cast[int16](0x8081))
   while i < len - 16:
     let
       coverage = mm_loadu_si128(coverages[i].addr)
-      eqZero = mm_cmpeq_epi8(coverage, mm_setzero_si128())
-      eq255 = mm_cmpeq_epi8(coverage, mm_set1_epi8(255))
+      eqZero = mm_cmpeq_epi8(coverage, vecZero)
+      eq255 = mm_cmpeq_epi8(coverage, vec255)
     if mm_movemask_epi8(eqZero) == 0xffff:
       for _ in 0 ..< 4:
-        mm_store_si128(line[i].addr, mm_setzero_si128())
+        mm_store_si128(line[i].addr, vecZero)
         i += 4
     elif mm_movemask_epi8(eq255) == 0xffff and rgbx.a == 255:
       i += 16

From 58887e8eb6e021096bead26a7b14db869ebbd857 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 15:20:18 -0500
Subject: [PATCH 11/15] use aligned store (bugfix)

---
 src/pixie/simd/avx2.nim |  2 +-
 src/pixie/simd/sse2.nim | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim
index 4ddc87d..5ef6591 100644
--- a/src/pixie/simd/avx2.nim
+++ b/src/pixie/simd/avx2.nim
@@ -464,7 +464,7 @@ proc blendLineNormalAvx2*(
       source = mm256_loadu_si256(b[i].addr)
       eq255 = mm256_cmpeq_epi8(source, vec255)
     if (mm256_movemask_epi8(eq255) and 0x88888888) == 0x88888888: # Opaque source
-      mm256_storeu_si256(a[i].addr, source)
+      mm256_store_si256(a[i].addr, source)
     else:
       let backdrop = mm256_load_si256(a[i].addr)
       mm256_store_si256(a[i].addr, blendNormalSimd(backdrop, source))
diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
index 966efbf..e8fd2f7 100644
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -305,7 +305,7 @@ proc applyOpacitySse2*(image: Image, opacity: float32) {.simd.} =
       valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec)
       valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7)
       valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7)
-      mm_storeu_si128(
+      mm_store_si128(
         cast[pointer](p),
         mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8))
       )
@@ -347,8 +347,8 @@ proc ceilSse2*(image: Image) {.simd.} =
     values1 = mm_cmpeq_epi8(values1, vecZero)
     values0 = mm_andnot_si128(values0, vec255)
     values1 = mm_andnot_si128(values1, vec255)
-    mm_storeu_si128(cast[pointer](p), values0)
-    mm_storeu_si128(cast[pointer](p + 16), values1)
+    mm_store_si128(cast[pointer](p), values0)
+    mm_store_si128(cast[pointer](p + 16), values1)
     p += 32
   i += 8 * iterations
 
@@ -562,7 +562,7 @@ proc blendLineCoverageOverwriteSse2*(
     else:
       var coverage = coverage
       for _ in 0 ..< 4:
-        mm_storeu_si128(line[i].addr, rgbxVec.applyCoverage(coverage))
+        mm_store_si128(line[i].addr, rgbxVec.applyCoverage(coverage))
         coverage = mm_srli_si128(coverage, 4)
         i += 4
 
@@ -612,7 +612,7 @@ proc blendLineNormalSse2*(
       source = mm_loadu_si128(b[i].addr)
       eq255 = mm_cmpeq_epi8(source, vec255)
     if (mm_movemask_epi8(eq255) and 0x00008888) == 0x00008888: # Opaque source
-      mm_storeu_si128(a[i].addr, source)
+      mm_store_si128(a[i].addr, source)
     else:
       let backdrop = mm_load_si128(a[i].addr)
       mm_store_si128(a[i].addr, blendNormalSimd(backdrop, source))
@@ -663,7 +663,7 @@ proc blendLineCoverageNormalSse2*(
         let
           backdrop = mm_loadu_si128(line[i].addr)
           source = rgbxVec.applyCoverage(coverage)
-        mm_storeu_si128(line[i].addr, blendNormalSimd(backdrop, source))
+        mm_store_si128(line[i].addr, blendNormalSimd(backdrop, source))
         coverage = mm_srli_si128(coverage, 4)
         i += 4
 
@@ -765,7 +765,7 @@ proc blendLineCoverageMaskSse2*(
         let
           backdrop = mm_loadu_si128(line[i].addr)
           source = rgbxVec.applyCoverage(coverage)
-        mm_storeu_si128(line[i].addr, blendMaskSimd(backdrop, source))
+        mm_store_si128(line[i].addr, blendMaskSimd(backdrop, source))
         coverage = mm_srli_si128(coverage, 4)
         i += 4
 

From 17cfb62ab370c1e8fbbc43bfc126d72a1bd71345 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 17:00:10 -0500
Subject: [PATCH 12/15] rename coverage param to opacity in ColorRGBX * uint8

---
 src/pixie/common.nim | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/pixie/common.nim b/src/pixie/common.nim
index b8da007..3e4bc40 100644
--- a/src/pixie/common.nim
+++ b/src/pixie/common.nim
@@ -76,17 +76,17 @@ proc `*`*(color: ColorRGBX, opacity: float32): ColorRGBX {.raises: [].} =
       a = ((color.a * x + 127) div 255).uint8
     rgbx(r, g, b, a)
 
-proc `*`*(rgbx: ColorRGBX, coverage: uint8): ColorRGBX {.inline.} =
-  if coverage == 0:
+proc `*`*(rgbx: ColorRGBX, opacity: uint8): ColorRGBX {.inline.} =
+  if opacity == 0:
     discard
-  elif coverage == 255:
+  elif opacity == 255:
     result = rgbx
   else:
     result = rgbx(
-      ((rgbx.r.uint32 * coverage + 127) div 255).uint8,
-      ((rgbx.g.uint32 * coverage + 127) div 255).uint8,
-      ((rgbx.b.uint32 * coverage + 127) div 255).uint8,
-      ((rgbx.a.uint32 * coverage + 127) div 255).uint8
+      ((rgbx.r.uint32 * opacity + 127) div 255).uint8,
+      ((rgbx.g.uint32 * opacity + 127) div 255).uint8,
+      ((rgbx.b.uint32 * opacity + 127) div 255).uint8,
+      ((rgbx.a.uint32 * opacity + 127) div 255).uint8
     )
 
 proc snapToPixels*(rect: Rect): Rect {.raises: [].} =

From e107f85fb008c9f391ed675d68d1cc09fe9fdd1a Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 17:16:35 -0500
Subject: [PATCH 13/15] check bounds when aligning

---
 src/pixie/paths.nim     |  8 ++------
 src/pixie/simd/avx2.nim |  8 ++++----
 src/pixie/simd/sse2.nim | 22 +++++++++-------------
 3 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim
index 7c9bf0c..e738e54 100644
--- a/src/pixie/paths.nim
+++ b/src/pixie/paths.nim
@@ -1448,9 +1448,7 @@ proc blendLineCoverageNormal(
 ) {.hasSimd.} =
   for i in 0 ..< len:
     let coverage = coverages[i]
-    if coverage == 255 and rgbx.a == 255:
-      line[i] = rgbx
-    elif coverage == 0:
+    if coverage == 0:
       discard
     else:
       line[i] = blendNormal(line[i], rgbx * coverage)
@@ -1463,9 +1461,7 @@ proc blendLineCoverageMask(
 ) {.hasSimd.} =
   for i in 0 ..< len:
     let coverage = coverages[i]
-    if coverage == 0:
-      line[i] = rgbx(0, 0, 0, 0)
-    elif coverage == 255:
+    if coverage == 255:
       discard
     else:
       line[i] = blendMask(line[i], rgbx * coverage)
diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim
index 5ef6591..97807c3 100644
--- a/src/pixie/simd/avx2.nim
+++ b/src/pixie/simd/avx2.nim
@@ -419,7 +419,7 @@ proc blendLineNormalAvx2*(
   line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
 ) {.simd.} =
   var i: int
-  while (cast[uint](line[i].addr) and 31) != 0:
+  while i < len and (cast[uint](line[i].addr) and 31) != 0:
     line[i] = blendNormal(line[i], rgbx)
     inc i
 
@@ -445,7 +445,7 @@ proc blendLineNormalAvx2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
   var i: int
-  while (cast[uint](a[i].addr) and 31) != 0:
+  while i < len and (cast[uint](a[i].addr) and 31) != 0:
     a[i] = blendNormal(a[i], b[i])
     inc i
 
@@ -477,7 +477,7 @@ proc blendLineMaskAvx2*(
   line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
 ) {.simd.} =
   var i: int
-  while (cast[uint](line[i].addr) and 31) != 0:
+  while i < len and (cast[uint](line[i].addr) and 31) != 0:
     line[i] = blendMask(line[i], rgbx)
     inc i
 
@@ -502,7 +502,7 @@ proc blendLineMaskAvx2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
   var i: int
-  while (cast[uint](a[i].addr) and 31) != 0:
+  while i < len and (cast[uint](a[i].addr) and 31) != 0:
     a[i] = blendMask(a[i], b[i])
     inc i
 
diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
index e8fd2f7..df849dd 100644
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -538,7 +538,7 @@ proc blendLineCoverageOverwriteSse2*(
   len: int
  ) {.simd.} =
   var i: int
-  while (cast[uint](line[i].addr) and 15) != 0:
+  while i < len and (cast[uint](line[i].addr) and 15) != 0:
     let coverage = coverages[i]
     if coverage != 0:
       line[i] = rgbx * coverage
@@ -575,7 +575,7 @@ proc blendLineNormalSse2*(
   line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
 ) {.simd.} =
   var i: int
-  while (cast[uint](line[i].addr) and 15) != 0:
+  while i < len and (cast[uint](line[i].addr) and 15) != 0:
     line[i] = blendNormal(line[i], rgbx)
     inc i
 
@@ -597,7 +597,7 @@ proc blendLineNormalSse2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
   var i: int
-  while (cast[uint](a[i].addr) and 15) != 0:
+  while i < len and (cast[uint](a[i].addr) and 15) != 0:
     a[i] = blendNormal(a[i], b[i])
     inc i
 
@@ -628,11 +628,9 @@ proc blendLineCoverageNormalSse2*(
   len: int
 ) {.simd.} =
   var i: int
-  while (cast[uint](line[i].addr) and 15) != 0:
+  while i < len and (cast[uint](line[i].addr) and 15) != 0:
     let coverage = coverages[i]
-    if coverage == 255 and rgbx.a == 255:
-      line[i] = rgbx
-    elif coverage == 0:
+    if coverage == 0:
       discard
     else:
       line[i] = blendNormal(line[i], rgbx * coverage)
@@ -669,9 +667,7 @@ proc blendLineCoverageNormalSse2*(
 
   for i in i ..< len:
     let coverage = coverages[i]
-    if coverage == 255 and rgbx.a == 255:
-      line[i] = rgbx
-    elif coverage == 0:
+    if coverage == 0:
       discard
     else:
       line[i] = blendNormal(line[i], rgbx * coverage)
@@ -680,7 +676,7 @@ proc blendLineMaskSse2*(
   line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
 ) {.simd.} =
   var i: int
-  while (cast[uint](line[i].addr) and 15) != 0:
+  while i < len and (cast[uint](line[i].addr) and 15) != 0:
     line[i] = blendMask(line[i], rgbx)
     inc i
 
@@ -701,7 +697,7 @@ proc blendLineMaskSse2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
   var i: int
-  while (cast[uint](a[i].addr) and 15) != 0:
+  while  i < len and (cast[uint](a[i].addr) and 15) != 0:
     a[i] = blendMask(a[i], b[i])
     inc i
 
@@ -731,7 +727,7 @@ proc blendLineCoverageMaskSse2*(
   len: int
 ) {.simd.} =
   var i: int
-  while (cast[uint](line[i].addr) and 15) != 0:
+  while  i < len and (cast[uint](line[i].addr) and 15) != 0:
     let coverage = coverages[i]
     if coverage == 0:
       line[i] = rgbx(0, 0, 0, 0)

From 31bd588b172c6bff9422d2220b62318b726742fe Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 19:28:05 -0500
Subject: [PATCH 14/15] simplify applyCoverage into a template, hoist its constants to caller

---
 src/pixie/simd/sse2.nim | 18 ++++++------------
 tests/bench_fonts.nim   |  3 +--
 2 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
index df849dd..a5880ed 100644
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -507,18 +507,10 @@ proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
         result.width * 4
       )
 
-proc applyCoverage*(rgbxVec, coverage: M128i): M128i {.inline.} =
-
-  proc unpackAlphaValues(v: M128i): M128i {.inline.} =
-    ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value).
-    result = mm_unpacklo_epi8(mm_setzero_si128(), v)
-    result = mm_unpacklo_epi8(mm_setzero_si128(), result)
-
-  let
-    oddMask = mm_set1_epi16(0xff00)
-    div255 = mm_set1_epi16(0x8081)
-
-  var unpacked = unpackAlphaValues(coverage)
+template applyCoverage*(rgbxVec, coverage: M128i): M128i =
+  ## Unpack the first 4 coverage bytes.
+  var unpacked = mm_unpacklo_epi8(mm_setzero_si128(), coverage)
+  unpacked = mm_unpacklo_epi8(mm_setzero_si128(), unpacked)
   unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
 
   var
@@ -548,6 +540,8 @@ proc blendLineCoverageOverwriteSse2*(
     rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
     vecZero = mm_setzero_si128()
     vec255 = mm_set1_epi8(255)
+    oddMask = mm_set1_epi16(0xff00)
+    div255 = mm_set1_epi16(0x8081)
   while i < len - 16:
     let
       coverage = mm_loadu_si128(coverages[i].addr)
diff --git a/tests/bench_fonts.nim b/tests/bench_fonts.nim
index 32bbc01..544ca12 100644
--- a/tests/bench_fonts.nim
+++ b/tests/bench_fonts.nim
@@ -5,8 +5,7 @@ const text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis in q
 var font = readFont("tests/fonts/Roboto-Regular_1.ttf")
 font.size = 16
 
-let
-  image = newImage(500, 300)
+let image = newImage(500, 300)
 
 timeIt "typeset":
   discard font.typeset(text, bounds = vec2(image.width.float32, 0))

From 36675576188d95f0a64dbd6e53bf5df1fe81ee30 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sun, 31 Jul 2022 21:14:21 -0500
Subject: [PATCH 15/15] rename blitLine*Neon procs to blendLine*Neon

---
 src/pixie/simd/neon.nim | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim
index bb43213..8beca4f 100644
--- a/src/pixie/simd/neon.nim
+++ b/src/pixie/simd/neon.nim
@@ -414,7 +414,7 @@ proc magnifyBy2Neon*(image: Image, power = 1): Image {.simd.} =
         result.width * 4
       )
 
-proc blitLineNormalNeon*(
+proc blendLineNormalNeon*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
   var i: int
@@ -463,7 +463,7 @@ proc blitLineNormalNeon*(
   for i in i ..< len:
     a[i] = blendNormal(a[i], b[i])
 
-proc blitLineMaskNeon*(
+proc blendLineMaskNeon*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
   var i: int