From 7401ceb3d1838ab669910a06a4de38282f60a95b Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Sat, 2 Jul 2022 01:07:03 -0500
Subject: [PATCH 01/13] gitignore: ignore *.dylib files (macOS)

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index d6e47da..c3939ff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,4 @@ bindings/generated
 dump.txt
 tests/fileformats/jpeg/generated
 tests/fileformats/jpeg/diffs
+*.dylib

From 06005de85c2dd5955653182db00e75eef5540f2a Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Thu, 7 Jul 2022 16:14:04 -0500
Subject: [PATCH 02/13] bugfix: fillUnsafe non-memset path must honor start/len

---
 src/pixie/internal.nim | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim
index 18ee742..f854dc1 100644
--- a/src/pixie/internal.nim
+++ b/src/pixie/internal.nim
@@ -89,8 +89,8 @@ proc fillUnsafe*(
   if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
     nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
   else:
-    for color in data.mitems:
-      color = rgbx
+    for i in start ..< start + len:
+          data[i] = rgbx
 
 const straightAlphaTable = block:
   var table: array[256, array[256, uint8]]

From dabe456af94e28ebd5825a0652e592e9de68da5b Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Thu, 7 Jul 2022 16:16:07 -0500
Subject: [PATCH 03/13] move image, mask to common

---
 src/pixie/common.nim | 10 ++++++++++
 src/pixie/images.nim |  8 +-------
 src/pixie/masks.nim  |  8 +-------
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/pixie/common.nim b/src/pixie/common.nim
index 7441461..e289bb3 100644
--- a/src/pixie/common.nim
+++ b/src/pixie/common.nim
@@ -31,6 +31,16 @@ type
   ImageDimensions* = object
     width*, height*: int
 
+  Image* = ref object
+    ## Image object that holds bitmap data in premultiplied alpha RGBA format.
+    width*, height*: int
+    data*: seq[ColorRGBX]
+
+  Mask* = ref object
+    ## Mask object that holds mask opacity data.
+    width*, height*: int
+    data*: seq[uint8]
+
 proc mix*(a, b: uint8, t: float32): uint8 {.inline, raises: [].} =
   ## Linearly interpolate between a and b using t.
   let t = round(t * 255).uint32
diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index a094545..35d8fbd 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -8,13 +8,7 @@ when allowSimd:
 
 const h = 0.5.float32
 
-type
-  Image* = ref object
-    ## Image object that holds bitmap data in RGBA format.
-    width*, height*: int
-    data*: seq[ColorRGBX]
-
-  UnsafeImage = distinct Image
+type UnsafeImage = distinct Image
 
 when defined(release):
   {.push checks: off.}
diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim
index 9797ff0..7efc04e 100644
--- a/src/pixie/masks.nim
+++ b/src/pixie/masks.nim
@@ -6,13 +6,7 @@ when allowSimd:
   when defined(amd64):
     import nimsimd/sse2
 
-type
-  Mask* = ref object
-    ## Mask object that holds mask opacity data.
-    width*, height*: int
-    data*: seq[uint8]
-
-  UnsafeMask = distinct Mask
+type UnsafeMask = distinct Mask
 
 when defined(release):
   {.push checks: off.}

From 8ca14006992a57e30958e4f7a9e303e2efecda41 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Thu, 7 Jul 2022 16:18:10 -0500
Subject: [PATCH 04/13] rm unimportant simd (blendSoftLight SSE2 path) for now

---
 src/pixie/blends.nim | 69 ++++++--------------------------------------
 1 file changed, 9 insertions(+), 60 deletions(-)

diff --git a/src/pixie/blends.nim b/src/pixie/blends.nim
index 18edbd7..8200ff7 100644
--- a/src/pixie/blends.nim
+++ b/src/pixie/blends.nim
@@ -273,67 +273,16 @@ proc blendSoftLight*(backdrop, source: ColorRGBX): ColorRGBX =
     backdrop = backdrop.rgba()
     source = source.rgba()
 
-  var rgba: ColorRGBA
-  when defined(amd64) and allowSimd:
-    let
-      vb = mm_setr_ps(
-        backdrop.r.float32,
-        backdrop.g.float32,
-        backdrop.b.float32,
-        0
-      )
-      vs = mm_setr_ps(source.r.float32, source.g.float32, source.b.float32, 0)
-      v2 = mm_set1_ps(2)
-      v255 = mm_set1_ps(255)
-      v255sq = mm_set1_ps(255 * 255)
-      vm = ((v255 - v2 * vs) * vb * vb) / v255sq + (v2 * vs * vb) / v255
-      values = cast[array[4, uint32]](mm_cvtps_epi32(vm))
+  let
+    b = backdrop.color
+    s = source.color
+  var blended: Color
+  blended.r = softLight(b.r, s.r)
+  blended.g = softLight(b.g, s.g)
+  blended.b = softLight(b.b, s.b)
+  blended = alphaFix(b, s, blended)
 
-    rgba.r = values[0].uint8
-    rgba.g = values[1].uint8
-    rgba.b = values[2].uint8
-
-    # proc alphaFix(backdrop, source, mixed: ColorRGBX): ColorRGBX {.inline.} =
-    #   if backdrop.a == 0 and source.a == 0:
-    #     return
-    #   let
-    #     vb = mm_setr_ps(backdrop.r.float32, backdrop.g.float32, backdrop.b.float32, 0)
-    #     vs = mm_setr_ps(source.r.float32, source.g.float32, source.b.float32, 0)
-    #     vm = mm_setr_ps(mixed.r.float32, mixed.g.float32, mixed.b.float32, 0)
-    #   alphaFix(backdrop, source, vb, vs, vm)
-
-    let
-      sa = source.a.float32
-      ba = backdrop.a.float32
-      a = sa + ba * (255 - sa) / 255
-    if a == 0:
-      return
-
-    let
-      t0 = mm_set1_ps(sa * (255 - ba))
-      t1 = mm_set1_ps(sa * ba)
-      t2 = mm_set1_ps((255 - sa) * ba)
-      va = mm_set1_ps(a)
-      final = cast[array[4, uint32]](
-        mm_cvtps_epi32((t0 * vs + t1 * vm + t2 * vb) / va / v255)
-      )
-
-    rgba.r = final[0].uint8
-    rgba.g = final[1].uint8
-    rgba.b = final[2].uint8
-    rgba.a = a.uint8
-  else:
-    let
-      b = backdrop.color
-      s = source.color
-    var blended: Color
-    blended.r = softLight(b.r, s.r)
-    blended.g = softLight(b.g, s.g)
-    blended.b = softLight(b.b, s.b)
-    blended = alphaFix(b, s, blended)
-    rgba = blended.rgba
-
-  result = rgba.rgbx()
+  result = blended.rgbx()
 
 proc blendHardLight*(backdrop, source: ColorRGBX): ColorRGBX =
   result.r = hardLight(backdrop.r, backdrop.a, source.r, source.a)

From 138d88a9bc8b6bbcc43d43f3e5b3b82a3a1fbbe4 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Thu, 7 Jul 2022 16:22:06 -0500
Subject: [PATCH 05/13] rename runtimechecked/ to simd/

---
 src/pixie/simd.nim                          | 3 +--
 src/pixie/{runtimechecked => simd}/avx.nim  | 0
 src/pixie/{runtimechecked => simd}/avx2.nim | 0
 3 files changed, 1 insertion(+), 2 deletions(-)
 rename src/pixie/{runtimechecked => simd}/avx.nim (100%)
 rename src/pixie/{runtimechecked => simd}/avx2.nim (100%)

diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim
index 1ed5baf..4230366 100644
--- a/src/pixie/simd.nim
+++ b/src/pixie/simd.nim
@@ -4,8 +4,7 @@ when defined(release):
   {.push checks: off.}
 
 when defined(amd64):
-  import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx,
-      runtimechecked/avx2
+  import nimsimd/runtimecheck, nimsimd/sse2, simd/avx, simd/avx2
 
   let
     cpuHasAvx* = checkInstructionSets({AVX})
diff --git a/src/pixie/runtimechecked/avx.nim b/src/pixie/simd/avx.nim
similarity index 100%
rename from src/pixie/runtimechecked/avx.nim
rename to src/pixie/simd/avx.nim
diff --git a/src/pixie/runtimechecked/avx2.nim b/src/pixie/simd/avx2.nim
similarity index 100%
rename from src/pixie/runtimechecked/avx2.nim
rename to src/pixie/simd/avx2.nim

From fd52dfecb46d3e23305020ad845b8ecc5e10049a Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Thu, 7 Jul 2022 16:24:36 -0500
Subject: [PATCH 06/13] unused now that mask type is in common

---
 src/pixie/fileformats/png.nim | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pixie/fileformats/png.nim b/src/pixie/fileformats/png.nim
index f7af75b..79c694d 100644
--- a/src/pixie/fileformats/png.nim
+++ b/src/pixie/fileformats/png.nim
@@ -1,5 +1,5 @@
 import chroma, flatty/binny, math, pixie/common, pixie/images, pixie/internal,
-    pixie/masks, zippy, zippy/crc
+    zippy, zippy/crc
 
 when defined(amd64) and allowSimd:
   import nimsimd/sse2

From 8bb6957fe9815a8a4e72b3117f15fbf408b00104 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Thu, 7 Jul 2022 18:47:32 -0500
Subject: [PATCH 07/13] simd, hasSimd pragmas

---
 src/pixie/blends.nim           |   5 +-
 src/pixie/common.nim           |  20 ++
 src/pixie/fileformats/jpeg.nim |   5 +-
 src/pixie/fileformats/png.nim  |   5 +-
 src/pixie/images.nim           |  82 ++-----
 src/pixie/internal.nim         |  36 +--
 src/pixie/masks.nim            |  46 +---
 src/pixie/paints.nim           |   5 +-
 src/pixie/paths.nim            |   7 +-
 src/pixie/simd.nim             | 435 ++++-----------------------------
 src/pixie/simd/avx.nim         |   8 +-
 src/pixie/simd/avx2.nim        |  40 +--
 src/pixie/simd/internal.nim    |  39 +++
 src/pixie/simd/sse2.nim        | 351 ++++++++++++++++++++++++++
 src/pixie/simd/todo.nim        |  33 +++
 15 files changed, 560 insertions(+), 557 deletions(-)
 create mode 100644 src/pixie/simd/internal.nim
 create mode 100644 src/pixie/simd/sse2.nim
 create mode 100644 src/pixie/simd/todo.nim

diff --git a/src/pixie/blends.nim b/src/pixie/blends.nim
index 8200ff7..e6a38be 100644
--- a/src/pixie/blends.nim
+++ b/src/pixie/blends.nim
@@ -1,9 +1,6 @@
 ## Blending modes.
 
-import chroma, common, internal, std/math
-
-when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+import chroma, common, simd, std/math
 
 # See https://www.w3.org/TR/compositing-1/
 # See https://www.khronos.org/registry/OpenGL/extensions/KHR/KHR_blend_equation_advanced.txt
diff --git a/src/pixie/common.nim b/src/pixie/common.nim
index e289bb3..94f4563 100644
--- a/src/pixie/common.nim
+++ b/src/pixie/common.nim
@@ -41,6 +41,26 @@ type
     width*, height*: int
     data*: seq[uint8]
 
+proc newImage*(width, height: int): Image {.raises: [PixieError].} =
+  ## Creates a new image with the parameter dimensions.
+  if width <= 0 or height <= 0:
+    raise newException(PixieError, "Image width and height must be > 0")
+
+  result = Image()
+  result.width = width
+  result.height = height
+  result.data = newSeq[ColorRGBX](width * height)
+
+proc newMask*(width, height: int): Mask {.raises: [PixieError].} =
+  ## Creates a new mask with the parameter dimensions.
+  if width <= 0 or height <= 0:
+    raise newException(PixieError, "Mask width and height must be > 0")
+
+  result = Mask()
+  result.width = width
+  result.height = height
+  result.data = newSeq[uint8](width * height)
+
 proc mix*(a, b: uint8, t: float32): uint8 {.inline, raises: [].} =
   ## Linearly interpolate between a and b using t.
   let t = round(t * 255).uint32
diff --git a/src/pixie/fileformats/jpeg.nim b/src/pixie/fileformats/jpeg.nim
index 4078d74..4fe5980 100644
--- a/src/pixie/fileformats/jpeg.nim
+++ b/src/pixie/fileformats/jpeg.nim
@@ -1,8 +1,5 @@
 import chroma, flatty/binny, pixie/common, pixie/images, pixie/internal,
-    pixie/masks, std/decls, std/sequtils, std/strutils
-
-when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+    pixie/masks, pixie/simd, std/decls, std/sequtils, std/strutils
 
 # This JPEG decoder is loosely based on stb_image which is public domain.
 
diff --git a/src/pixie/fileformats/png.nim b/src/pixie/fileformats/png.nim
index 79c694d..9877d7a 100644
--- a/src/pixie/fileformats/png.nim
+++ b/src/pixie/fileformats/png.nim
@@ -1,8 +1,5 @@
 import chroma, flatty/binny, math, pixie/common, pixie/images, pixie/internal,
-    zippy, zippy/crc
-
-when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+    pixie/simd, zippy, zippy/crc
 
 # See http://www.libpng.org/pub/png/spec/1.2/PNG-Contents.html
 
diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index 35d8fbd..53f969d 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -1,10 +1,4 @@
-import blends, bumpy, chroma, common, internal, masks, vmath
-
-when allowSimd:
-  import simd
-
-  when defined(amd64):
-    import nimsimd/sse2
+import blends, bumpy, chroma, common, internal, masks, simd, vmath
 
 const h = 0.5.float32
 
@@ -13,27 +7,18 @@ type UnsafeImage = distinct Image
 when defined(release):
   {.push checks: off.}
 
-proc newImage*(width, height: int): Image {.raises: [PixieError].} =
-  ## Creates a new image with the parameter dimensions.
-  if width <= 0 or height <= 0:
-    raise newException(PixieError, "Image width and height must be > 0")
-
-  result = Image()
-  result.width = width
-  result.height = height
-  result.data = newSeq[ColorRGBX](width * height)
-
-proc newImage*(mask: Mask): Image {.raises: [PixieError].} =
+proc newImage*(mask: Mask): Image {.hasSimd, raises: [PixieError].} =
   result = newImage(mask.width, mask.height)
-
-  when allowSimd and compiles(newImageFromMaskSimd):
-    newImageFromMaskSimd(result.data, mask.data)
-    return
-
   for i in 0 ..< mask.data.len:
     let v = mask.data[i]
     result.data[i] = rgbx(v, v, v, v)
 
+proc newMask*(image: Image): Mask {.hasSimd, raises: [PixieError].} =
+  ## Returns a new mask using the alpha values of the image.
+  result = newMask(image.width, image.height)
+  for i in 0 ..< image.data.len:
+    result.data[i] = image.data[i].a
+
 proc copy*(image: Image): Image {.raises: [PixieError].} =
   ## Copies the image data into a new image.
   result = newImage(image.width, image.height)
@@ -89,25 +74,17 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} =
   ## Fills the image with the color.
   fillUnsafe(image.data, color, 0, image.data.len)
 
-proc isOneColor*(image: Image): bool {.raises: [].} =
+proc isOneColor*(image: Image): bool {.hasSimd, raises: [].} =
   ## Checks if the entire image is the same color.
-  when allowSimd and compiles(isOneColorSimd):
-    return isOneColorSimd(image.data)
-
   result = true
-
   let color = cast[uint32](image.data[0])
   for i in 0 ..< image.data.len:
     if cast[uint32](image.data[i]) != color:
       return false
 
-proc isTransparent*(image: Image): bool {.raises: [].} =
+proc isTransparent*(image: Image): bool {.hasSimd, raises: [].} =
   ## Checks if this image is fully transparent or not.
-  when allowSimd and compiles(isTransparentSimd):
-    return isTransparentSimd(image.data)
-
   result = true
-
   for i in 0 ..< image.data.len:
     if image.data[i].a != 0:
       return false
@@ -341,46 +318,38 @@ proc magnifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} =
         result.width * 4
       )
 
-proc applyOpacity*(image: Image, opacity: float32) {.raises: [].} =
+proc applyOpacity*(target: Image, opacity: float32) {.hasSimd, raises: [].} =
   ## Multiplies alpha of the image by opacity.
   let opacity = round(255 * opacity).uint16
   if opacity == 255:
     return
 
   if opacity == 0:
-    image.fill(rgbx(0, 0, 0, 0))
+    target.fill(rgbx(0, 0, 0, 0))
     return
 
-  when allowSimd and compiles(applyOpacitySimd):
-    applyOpacitySimd(image.data, opacity)
-    return
-
-  for i in 0 ..< image.data.len:
-    var rgbx = image.data[i]
+  for i in 0 ..< target.data.len:
+    var rgbx = target.data[i]
     rgbx.r = ((rgbx.r * opacity) div 255).uint8
     rgbx.g = ((rgbx.g * opacity) div 255).uint8
     rgbx.b = ((rgbx.b * opacity) div 255).uint8
     rgbx.a = ((rgbx.a * opacity) div 255).uint8
-    image.data[i] = rgbx
+    target.data[i] = rgbx
 
-proc invert*(image: Image) {.raises: [].} =
+proc invert*(target: Image) {.hasSimd, raises: [].} =
   ## Inverts all of the colors and alpha.
-  when allowSimd and compiles(invertImageSimd):
-    invertImageSimd(image.data)
-    return
-
-  for i in 0 ..< image.data.len:
-    var rgbx = image.data[i]
+  for i in 0 ..< target.data.len:
+    var rgbx = target.data[i]
     rgbx.r = 255 - rgbx.r
     rgbx.g = 255 - rgbx.g
     rgbx.b = 255 - rgbx.b
     rgbx.a = 255 - rgbx.a
-    image.data[i] = rgbx
+    target.data[i] = rgbx
 
   # Inverting rgbx(50, 100, 150, 200) becomes rgbx(205, 155, 105, 55). This
   # is not a valid premultiplied alpha color.
   # We need to convert back to premultiplied alpha after inverting.
-  image.data.toPremultipliedAlpha()
+  target.data.toPremultipliedAlpha()
 
 proc blur*(
   image: Image, radius: float32, outOfBounds: SomeColor = color(0, 0, 0, 0)
@@ -443,17 +412,6 @@ proc blur*(
         values += outOfBounds * kernel[yy - y + radius]
       image.unsafe[x, y] = rgbx(values)
 
-proc newMask*(image: Image): Mask {.raises: [PixieError].} =
-  ## Returns a new mask using the alpha values of the image.
-  result = newMask(image.width, image.height)
-
-  when allowSimd and compiles(newMaskFromImageSimd):
-    newMaskFromImageSimd(result.data, image.data)
-    return
-
-  for i in 0 ..< image.data.len:
-    result.data[i] = image.data[i].a
-
 proc getRgbaSmooth*(
   image: Image, x, y: float32, wrapped = false
 ): ColorRGBX {.raises: [].} =
diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim
index f854dc1..1cc5b55 100644
--- a/src/pixie/internal.nim
+++ b/src/pixie/internal.nim
@@ -1,12 +1,4 @@
-import bumpy, chroma, common, system/memory, vmath
-
-const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
-
-when allowSimd:
-  import simd
-
-  when defined(amd64):
-    import nimsimd/sse2
+import bumpy, chroma, common, simd, system/memory, vmath
 
 template currentExceptionAsPixieError*(): untyped =
   ## Gets the current exception and returns it as a PixieError with stack trace.
@@ -76,7 +68,7 @@ proc fillUnsafe*(
 
 proc fillUnsafe*(
   data: var seq[ColorRGBX], color: SomeColor, start, len: int
-) {.raises: [].} =
+) {.hasSimd, raises: [].} =
   ## Fills the image data with the color starting at index start and
   ## continuing for len indices.
   when allowSimd and compiles(fillUnsafeSimd):
@@ -110,12 +102,10 @@ proc toStraightAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
     c.b = straightAlphaTable[c.a][c.b]
     data[i] = c
 
-proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
+proc toPremultipliedAlpha*(
+  data: var seq[ColorRGBA | ColorRGBX]
+) {.hasSimd, raises: [].} =
   ## Converts an image to premultiplied alpha from straight alpha.
-  when allowSimd and compiles(toPremultipliedAlphaSimd):
-    toPremultipliedAlphaSimd(data)
-    return
-
   for i in 0 ..< data.len:
     var c = data[i]
     if c.a != 255:
@@ -124,25 +114,15 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].}
       c.b = ((c.b.uint32 * c.a) div 255).uint8
       data[i] = c
 
-proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool =
-  when allowSimd and compiles(isOpaqueSimd):
-    return isOpaqueSimd(data, start, len)
-
+proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool {.hasSimd.} =
   result = true
-
   for i in start ..< start + len:
     if data[i].a != 255:
       return false
 
 when defined(amd64) and allowSimd:
-  proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
-    let opacityVec = mm_set1_ps(opacity)
-    var finalColor = mm_cvtps_epi32(mm_mul_ps(color, opacityVec))
-    finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
-    finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
-    cast[ColorRGBX](mm_cvtsi128_si32(finalColor))
-
-  export pack4xAlphaValues, unpackAlphaValues
+  import simd/todo
+  export todo
 
 when defined(release):
   {.pop.}
diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim
index 7efc04e..e214f8e 100644
--- a/src/pixie/masks.nim
+++ b/src/pixie/masks.nim
@@ -1,26 +1,10 @@
-import common, internal, vmath
-
-when allowSimd:
-  import simd
-
-  when defined(amd64):
-    import nimsimd/sse2
+import common, internal, simd, vmath
 
 type UnsafeMask = distinct Mask
 
 when defined(release):
   {.push checks: off.}
 
-proc newMask*(width, height: int): Mask {.raises: [PixieError].} =
-  ## Creates a new mask with the parameter dimensions.
-  if width <= 0 or height <= 0:
-    raise newException(PixieError, "Mask width and height must be > 0")
-
-  result = Mask()
-  result.width = width
-  result.height = height
-  result.data = newSeq[uint8](width * height)
-
 proc copy*(mask: Mask): Mask {.raises: [PixieError].} =
   ## Copies the image data into a new image.
   result = newMask(mask.width, mask.height)
@@ -180,22 +164,18 @@ proc magnifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} =
         result.width * 4
       )
 
-proc applyOpacity*(mask: Mask, opacity: float32) {.raises: [].} =
+proc applyOpacity*(target: Mask, opacity: float32) {.hasSimd, raises: [].} =
   ## Multiplies alpha of the image by opacity.
   let opacity = round(255 * opacity).uint16
   if opacity == 255:
     return
 
   if opacity == 0:
-    mask.fill(0)
+    target.fill(0)
     return
 
-  when allowSimd and compiles(applyOpacitySimd):
-    applyOpacitySimd(mask.data, opacity)
-    return
-
-  for i in 0 ..< mask.data.len:
-    mask.data[i] = ((mask.data[i] * opacity) div 255).uint8
+  for i in 0 ..< target.data.len:
+    target.data[i] = ((target.data[i] * opacity) div 255).uint8
 
 proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
   ## Gets a interpolated value with float point coordinates.
@@ -225,14 +205,10 @@ proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
   else:
     topMix
 
-proc invert*(mask: Mask) {.raises: [].} =
+proc invert*(target: Mask) {.hasSimd, raises: [].} =
   ## Inverts all of the values - creates a negative of the mask.
-  when allowSimd and compiles(invertMaskSimd):
-    invertMaskSimd(mask.data)
-    return
-
-  for i in 0 ..< mask.data.len:
-    mask.data[i] = 255 - mask.data[i]
+  for i in 0 ..< target.data.len:
+    target.data[i] = 255 - target.data[i]
 
 proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
   ## Grows the mask by spread.
@@ -295,12 +271,8 @@ proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
             break
         mask.unsafe[x, y] = maxValue
 
-proc ceil*(mask: Mask) {.raises: [].} =
+proc ceil*(mask: Mask) {.hasSimd, raises: [].} =
   ## A value of 0 stays 0. Anything else turns into 255.
-  when allowSimd and compiles(invertImageSimd):
-    ceilMaskSimd(mask.data)
-    return
-
   for i in 0 ..< mask.data.len:
     if mask.data[i] != 0:
       mask.data[i] = 255
diff --git a/src/pixie/paints.nim b/src/pixie/paints.nim
index 133a367..1c7d312 100644
--- a/src/pixie/paints.nim
+++ b/src/pixie/paints.nim
@@ -1,7 +1,4 @@
-import chroma, common, images, internal, vmath
-
-when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+import chroma, common, images, simd, vmath
 
 type
   PaintKind* = enum
diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim
index b8fa5a1..4ead1c8 100644
--- a/src/pixie/paths.nim
+++ b/src/pixie/paths.nim
@@ -1,8 +1,5 @@
-import blends, bumpy, chroma, common, images, internal, masks, paints, std/fenv,
-    std/strutils, vmath
-
-when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+import blends, bumpy, chroma, common, images, internal, masks, paints, simd,
+    std/fenv, std/strutils, vmath
 
 type
   WindingRule* = enum
diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim
index 4230366..14207d6 100644
--- a/src/pixie/simd.nim
+++ b/src/pixie/simd.nim
@@ -1,392 +1,57 @@
-import chroma
+import simd/internal, std/macros, std/tables
 
-when defined(release):
-  {.push checks: off.}
-
-when defined(amd64):
-  import nimsimd/runtimecheck, nimsimd/sse2, simd/avx, simd/avx2
+const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
 
+macro hasSimd*(procedure: untyped) =
   let
-    cpuHasAvx* = checkInstructionSets({AVX})
-    cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
+    name = procedure.procName()
+    args = procedure.procArguments()
+    originalBody = procedure[6]
+    nameSse2 = name & "Sse2"
+    nameAvx = name & "Avx"
+    nameAvx2 = name & "Avx2"
+    callAvx = call(ident(nameAvx), args)
+    callAvx2 = call(ident(nameAvx2), args)
 
-  proc packAlphaValues(v: M128i): M128i {.inline.} =
-    ## Shuffle the alpha values for these 4 colors to the first 4 bytes.
-    result = mm_srli_epi32(v, 24)
-    result = mm_packus_epi16(result, mm_setzero_si128())
-    result = mm_packus_epi16(result, mm_setzero_si128())
+  var body = newStmtList()
 
-  proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline.} =
+  when not defined(pixieNoAvx):
+    if nameAvx2 in simdProcs:
+      body.add quote do:
+        if cpuHasAvx2:
+          forceReturn `callAvx2`
+
+    if nameAvx in simdProcs:
+      body.add quote do:
+        if cpuHasAvx:
+          forceReturn `callAvx`
+
+  if nameSse2 in simdProcs:
+    let bodySse2 = simdProcs[nameSse2][6]
+    body.add quote do:
+      `bodySse2`
+  else:
+    body.add quote do:
+      echo "using ", `name`, " scalar"
+      `originalBody`
+
+  procedure[6] = body
+
+  return procedure
+
+when allowSimd and defined(amd64):
+  import simd/sse2, simd/avx, simd/avx2
+  export sse2, avx, avx2
+
+  when defined(pixieNoAvx):
+    const
+      cpuHasAvx* = false
+      cpuHasAvx2* = false
+  else:
+    import nimsimd/runtimecheck
     let
-      i = packAlphaValues(i)
-      j = mm_slli_si128(packAlphaValues(j), 4)
-      k = mm_slli_si128(packAlphaValues(k), 8)
-      l = mm_slli_si128(packAlphaValues(l), 12)
-    mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l))
+      cpuHasAvx* = checkInstructionSets({AVX})
+      cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
 
-  proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
-    ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value).
-    result = mm_unpacklo_epi8(mm_setzero_si128(), v)
-    result = mm_unpacklo_epi8(mm_setzero_si128(), result)
-
-  proc fillUnsafeSimd*(
-    data: var seq[ColorRGBX],
-    start, len: int,
-    color: SomeColor
-  ) =
-    if cpuHasAvx:
-      fillUnsafeAvx(data, start, len, color)
-      return
-
-    let rgbx = color.asRgbx()
-
-    var
-      i = start
-      p = cast[uint](data[i].addr)
-    # Align to 16 bytes
-    while i < (start + len) and (p and 15) != 0:
-      data[i] = rgbx
-      inc i
-      p += 4
-
-    let
-      colorVec = mm_set1_epi32(cast[int32](rgbx))
-      iterations = (start + len - i) div 8
-    for _ in 0 ..< iterations:
-      mm_store_si128(cast[pointer](p), colorVec)
-      mm_store_si128(cast[pointer](p + 16), colorVec)
-      p += 32
-    i += iterations * 8
-
-    for i in i ..< start + len:
-      data[i] = rgbx
-
-  proc isOneColorSimd*(data: var seq[ColorRGBX]): bool =
-    if cpuHasAvx2:
-      return isOneColorAvx2(data)
-
-    result = true
-
-    let color = data[0]
-
-    var
-      i: int
-      p = cast[uint](data[0].addr)
-    # Align to 16 bytes
-    while i < data.len and (p and 15) != 0:
-      if data[i] != color:
-        return false
-      inc i
-      p += 4
-
-    let
-      colorVec = mm_set1_epi32(cast[int32](color))
-      iterations = (data.len - i) div 16
-    for _ in 0 ..< iterations:
-      let
-        values0 = mm_load_si128(cast[pointer](p))
-        values1 = mm_load_si128(cast[pointer](p + 16))
-        values2 = mm_load_si128(cast[pointer](p + 32))
-        values3 = mm_load_si128(cast[pointer](p + 48))
-        eq0 = mm_cmpeq_epi8(values0, colorVec)
-        eq1 = mm_cmpeq_epi8(values1, colorVec)
-        eq2 = mm_cmpeq_epi8(values2, colorVec)
-        eq3 = mm_cmpeq_epi8(values3, colorVec)
-        eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
-      if mm_movemask_epi8(eq0123) != 0xffff:
-        return false
-      p += 64
-    i += 16 * iterations
-
-    for i in i ..< data.len:
-      if data[i] != color:
-        return false
-
-  proc isTransparentSimd*(data: var seq[ColorRGBX]): bool =
-    if cpuHasAvx2:
-      return isTransparentAvx2(data)
-
-    var
-      i: int
-      p = cast[uint](data[0].addr)
-    # Align to 16 bytes
-    while i < data.len and (p and 15) != 0:
-      if data[i].a != 0:
-        return false
-      inc i
-      p += 4
-
-    result = true
-
-    let
-      vecZero = mm_setzero_si128()
-      iterations = (data.len - i) div 16
-    for _ in 0 ..< iterations:
-      let
-        values0 = mm_load_si128(cast[pointer](p))
-        values1 = mm_load_si128(cast[pointer](p + 16))
-        values2 = mm_load_si128(cast[pointer](p + 32))
-        values3 = mm_load_si128(cast[pointer](p + 48))
-        values01 = mm_or_si128(values0, values1)
-        values23 = mm_or_si128(values2, values3)
-        values0123 = mm_or_si128(values01, values23)
-      if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff:
-        return false
-      p += 64
-    i += 16 * iterations
-
-    for i in i ..< data.len:
-      if data[i].a != 0:
-        return false
-
-  proc isOpaqueSimd*(data: var seq[ColorRGBX], start, len: int): bool =
-    if cpuHasAvx2:
-      return isOpaqueAvx2(data, start, len)
-
-    result = true
-
-    var
-      i = start
-      p = cast[uint](data[0].addr)
-    # Align to 16 bytes
-    while i < (start + len) and (p and 15) != 0:
-      if data[i].a != 255:
-        return false
-      inc i
-      p += 4
-
-    let
-      vec255 = mm_set1_epi8(255)
-      iterations = (start + len - i) div 16
-    for _ in 0 ..< iterations:
-      let
-        values0 = mm_load_si128(cast[pointer](p))
-        values1 = mm_load_si128(cast[pointer](p + 16))
-        values2 = mm_load_si128(cast[pointer](p + 32))
-        values3 = mm_load_si128(cast[pointer](p + 48))
-        values01 = mm_and_si128(values0, values1)
-        values23 = mm_and_si128(values2, values3)
-        values0123 = mm_and_si128(values01, values23)
-        eq = mm_cmpeq_epi8(values0123, vec255)
-      if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
-        return false
-      p += 64
-    i += 16 * iterations
-
-    for i in i ..< start + len:
-      if data[i].a != 255:
-        return false
-
-  proc toPremultipliedAlphaSimd*(data: var seq[ColorRGBA | ColorRGBX]) =
-    if cpuHasAvx2:
-      toPremultipliedAlphaAvx2(data)
-      return
-
-    var i: int
-
-    let
-      alphaMask = mm_set1_epi32(cast[int32](0xff000000))
-      oddMask = mm_set1_epi16(0xff00)
-      div255 = mm_set1_epi16(0x8081)
-      iterations = data.len div 4
-    for _ in 0 ..< iterations:
-      let
-        values = mm_loadu_si128(data[i].addr)
-        alpha = mm_and_si128(values, alphaMask)
-        eq = mm_cmpeq_epi8(values, alphaMask)
-      if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
-        let
-          evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
-          oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
-        var
-          colorsEven = mm_slli_epi16(values, 8)
-          colorsOdd = mm_and_si128(values, oddMask)
-        colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
-        colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
-        colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
-        colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
-        mm_storeu_si128(
-          data[i].addr,
-          mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
-        )
-      i += 4
-
-    for i in i ..< data.len:
-      var c = data[i]
-      if c.a != 255:
-        c.r = ((c.r.uint32 * c.a) div 255).uint8
-        c.g = ((c.g.uint32 * c.a) div 255).uint8
-        c.b = ((c.b.uint32 * c.a) div 255).uint8
-        data[i] = c
-
-  proc newImageFromMaskSimd*(dst: var seq[ColorRGBX], src: var seq[uint8]) =
-    var i: int
-    for _ in 0 ..< src.len div 16:
-      var alphas = mm_loadu_si128(src[i].addr)
-      for j in 0 ..< 4:
-        var unpacked = unpackAlphaValues(alphas)
-        unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 8))
-        unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
-        mm_storeu_si128(dst[i + j * 4].addr, unpacked)
-        alphas = mm_srli_si128(alphas, 4)
-      i += 16
-
-    for i in i ..< src.len:
-      let v = src[i]
-      dst[i] = rgbx(v, v, v, v)
-
-  proc newMaskFromImageSimd*(dst: var seq[uint8], src: var seq[ColorRGBX]) =
-    var i: int
-    for _ in 0 ..< src.len div 16:
-      let
-        a = mm_loadu_si128(src[i + 0].addr)
-        b = mm_loadu_si128(src[i + 4].addr)
-        c = mm_loadu_si128(src[i + 8].addr)
-        d = mm_loadu_si128(src[i + 12].addr)
-      mm_storeu_si128(
-        dst[i].addr,
-        pack4xAlphaValues(a, b, c, d)
-      )
-      i += 16
-
-    for i in i ..< src.len:
-      dst[i] = src[i].a
-
-  proc invertImageSimd*(data: var seq[ColorRGBX]) =
-    var
-      i: int
-      p = cast[uint](data[0].addr)
-    # Align to 16 bytes
-    while i < data.len and (p and 15) != 0:
-      var rgbx = data[i]
-      rgbx.r = 255 - rgbx.r
-      rgbx.g = 255 - rgbx.g
-      rgbx.b = 255 - rgbx.b
-      rgbx.a = 255 - rgbx.a
-      data[i] = rgbx
-      inc i
-      p += 4
-
-    let
-      vec255 = mm_set1_epi8(255)
-      iterations = data.len div 16
-    for _ in 0 ..< iterations:
-      let
-        a = mm_load_si128(cast[pointer](p))
-        b = mm_load_si128(cast[pointer](p + 16))
-        c = mm_load_si128(cast[pointer](p + 32))
-        d = mm_load_si128(cast[pointer](p + 48))
-      mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
-      mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
-      mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
-      mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
-      p += 64
-    i += 16 * iterations
-
-    for i in i ..< data.len:
-      var rgbx = data[i]
-      rgbx.r = 255 - rgbx.r
-      rgbx.g = 255 - rgbx.g
-      rgbx.b = 255 - rgbx.b
-      rgbx.a = 255 - rgbx.a
-      data[i] = rgbx
-
-    toPremultipliedAlphaSimd(data)
-
-  proc invertMaskSimd*(data: var seq[uint8]) =
-    var
-      i: int
-      p = cast[uint](data[0].addr)
-    # Align to 16 bytes
-    while i < data.len and (p and 15) != 0:
-      data[i] = 255 - data[i]
-      inc i
-      inc p
-
-    let
-      vec255 = mm_set1_epi8(255)
-      iterations = data.len div 64
-    for _ in 0 ..< iterations:
-      let
-        a = mm_load_si128(cast[pointer](p))
-        b = mm_load_si128(cast[pointer](p + 16))
-        c = mm_load_si128(cast[pointer](p + 32))
-        d = mm_load_si128(cast[pointer](p + 48))
-      mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
-      mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
-      mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
-      mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
-      p += 64
-    i += 64 * iterations
-
-    for i in i ..< data.len:
-      data[i] = 255 - data[i]
-
-  proc ceilMaskSimd*(data: var seq[uint8]) =
-    var
-      i: int
-      p = cast[uint](data[0].addr)
-
-    let
-      zeroVec = mm_setzero_si128()
-      vec255 = mm_set1_epi8(255)
-      iterations = data.len div 16
-    for _ in 0 ..< iterations:
-      var values = mm_loadu_si128(cast[pointer](p))
-      values = mm_cmpeq_epi8(values, zeroVec)
-      values = mm_andnot_si128(values, vec255)
-      mm_storeu_si128(cast[pointer](p), values)
-      p += 16
-    i += 16 * iterations
-
-    for i in i ..< data.len:
-      if data[i] != 0:
-        data[i] = 255
-
-  proc applyOpacitySimd*(data: var seq[uint8 | ColorRGBX], opacity: uint16) =
-    var
-      i: int
-      p = cast[uint](data[0].addr)
-      len =
-        when data is seq[ColorRGBX]:
-          data.len * 4
-        else:
-          data.len
-
-    let
-      oddMask = mm_set1_epi16(0xff00)
-      div255 = mm_set1_epi16(0x8081)
-      zeroVec = mm_setzero_si128()
-      opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
-      iterations = len div 16
-    for _ in 0 ..< len div 16:
-      let values = mm_loadu_si128(cast[pointer](p))
-      if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
-        var
-          valuesEven = mm_slli_epi16(values, 8)
-          valuesOdd = mm_and_si128(values, oddMask)
-        valuesEven = mm_mulhi_epu16(valuesEven, opacityVec)
-        valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec)
-        valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7)
-        valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7)
-        mm_storeu_si128(
-          cast[pointer](p),
-          mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8))
-        )
-      p += 16
-    i += 16 * iterations
-
-    when data is seq[ColorRGBX]:
-      for i in i div 4 ..< data.len:
-        var rgbx = data[i]
-        rgbx.r = ((rgbx.r * opacity) div 255).uint8
-        rgbx.g = ((rgbx.g * opacity) div 255).uint8
-        rgbx.b = ((rgbx.b * opacity) div 255).uint8
-        rgbx.a = ((rgbx.a * opacity) div 255).uint8
-        data[i] = rgbx
-    else:
-      for i in i ..< data.len:
-        data[i] = ((data[i] * opacity) div 255).uint8
-
-when defined(release):
-  {.pop.}
+  import nimsimd/sse2 as nimsimdsse2
+  export nimsimdsse2
diff --git a/src/pixie/simd/avx.nim b/src/pixie/simd/avx.nim
index c18e9c6..82b4333 100644
--- a/src/pixie/simd/avx.nim
+++ b/src/pixie/simd/avx.nim
@@ -1,4 +1,4 @@
-import chroma, nimsimd/avx
+import chroma, internal, nimsimd/avx
 
 when defined(gcc) or defined(clang):
   {.localPassc: "-mavx".}
@@ -8,9 +8,9 @@ when defined(release):
 
 proc fillUnsafeAvx*(
   data: var seq[ColorRGBX],
-  start, len: int,
-  color: SomeColor
-) =
+  color: SomeColor,
+  start, len: int
+) {.simd.} =
   let rgbx = color.asRgbx()
 
   var
diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim
index 3539931..a692692 100644
--- a/src/pixie/simd/avx2.nim
+++ b/src/pixie/simd/avx2.nim
@@ -1,4 +1,4 @@
-import chroma, nimsimd/avx2
+import chroma, internal, nimsimd/avx2, pixie/common
 
 when defined(gcc) or defined(clang):
   {.localPassc: "-mavx2".}
@@ -6,25 +6,25 @@ when defined(gcc) or defined(clang):
 when defined(release):
   {.push checks: off.}
 
-proc isOneColorAvx2*(data: var seq[ColorRGBX]): bool =
+proc isOneColorAvx2*(image: Image): bool {.simd.} =
   result = true
 
-  let color = data[0]
+  let color = image.data[0]
 
   var i: int
   # Align to 32 bytes
-  while i < data.len and (cast[uint](data[i].addr) and 31) != 0:
-    if data[i] != color:
+  while i < image.data.len and (cast[uint](image.data[i].addr) and 31) != 0:
+    if image.data[i] != color:
       return false
     inc i
 
   let
     colorVec = mm256_set1_epi32(cast[int32](color))
-    iterations = (data.len - i) div 16
+    iterations = (image.data.len - i) div 16
   for _ in 0 ..< iterations:
     let
-      values0 = mm256_load_si256(data[i].addr)
-      values1 = mm256_load_si256(data[i + 8].addr)
+      values0 = mm256_load_si256(image.data[i].addr)
+      values1 = mm256_load_si256(image.data[i + 8].addr)
       eq0 = mm256_cmpeq_epi8(values0, colorVec)
       eq1 = mm256_cmpeq_epi8(values1, colorVec)
       eq01 = mm256_and_si256(eq0, eq1)
@@ -32,38 +32,38 @@ proc isOneColorAvx2*(data: var seq[ColorRGBX]): bool =
       return false
     i += 16
 
-  for i in i ..< data.len:
-    if data[i] != color:
+  for i in i ..< image.data.len:
+    if image.data[i] != color:
       return false
 
-proc isTransparentAvx2*(data: var seq[ColorRGBX]): bool =
+proc isTransparentAvx2*(image: Image): bool {.simd.} =
   result = true
 
   var i: int
   # Align to 32 bytes
-  while i < data.len and (cast[uint](data[i].addr) and 31) != 0:
-    if data[i].a != 0:
+  while i < image.data.len and (cast[uint](image.data[i].addr) and 31) != 0:
+    if image.data[i].a != 0:
       return false
     inc i
 
   let
     vecZero = mm256_setzero_si256()
-    iterations = (data.len - i) div 16
+    iterations = (image.data.len - i) div 16
   for _ in 0 ..< iterations:
     let
-      values0 = mm256_load_si256(data[i].addr)
-      values1 = mm256_load_si256(data[i + 8].addr)
+      values0 = mm256_load_si256(image.data[i].addr)
+      values1 = mm256_load_si256(image.data[i + 8].addr)
       values01 = mm256_or_si256(values0, values1)
       eq = mm256_cmpeq_epi8(values01, vecZero)
     if mm256_movemask_epi8(eq) != cast[int32](0xffffffff):
       return false
     i += 16
 
-  for i in i ..< data.len:
-    if data[i].a != 0:
+  for i in i ..< image.data.len:
+    if image.data[i].a != 0:
       return false
 
-proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
+proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} =
   result = true
 
   var i = start
@@ -90,7 +90,7 @@ proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
     if data[i].a != 255:
       return false
 
-proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) =
+proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
   var i: int
 
   let
diff --git a/src/pixie/simd/internal.nim b/src/pixie/simd/internal.nim
new file mode 100644
index 0000000..2165e72
--- /dev/null
+++ b/src/pixie/simd/internal.nim
@@ -0,0 +1,39 @@
+import std/macros, std/tables
+
+var simdProcs* {.compiletime.}: Table[string, NimNode]
+
+template forceReturn*(procedure: untyped) =
+  ## Produce `return procedure()` when procedure returns something otherwise
+  ## `procedure(); return` if it procedure returns nothing.
+  when compiles(block: return procedure):
+    return procedure
+  else:
+    procedure
+    return
+
+proc procName*(procedure: NimNode): string =
+  ## Given a procedure signature returns only name string.
+  let nameNode = procedure[0]
+  if nameNode.kind == nnkPostfix:
+    nameNode[1].strVal
+  else:
+    nameNode.strVal
+
+proc procArguments*(procedure: NimNode): seq[NimNode] =
+  ## Given a procedure signature gets the arguments as a list.
+  for i, arg in procedure[3]:
+    if i > 0:
+      for j in 0 ..< arg.len - 2:
+        result.add(arg[j])
+
+proc call*(name: NimNode, args: seq[NimNode]): NimNode =
+  ## Produces a procedure call with arguments.
+  result = newNimNode(nnkCall)
+  result.add(name)
+  for arg in args:
+    result.add(arg)
+
+macro simd*(procedure: untyped) =
+  let name = procedure.procName()
+  simdProcs[name] = procedure.copy()
+  return procedure
diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
new file mode 100644
index 0000000..b862fbc
--- /dev/null
+++ b/src/pixie/simd/sse2.nim
@@ -0,0 +1,351 @@
+import chroma, internal, nimsimd/sse2, pixie/common, todo, vmath
+
+when defined(release):
+  {.push checks: off.}
+
+proc fillUnsafeSse2*(
+  data: var seq[ColorRGBX],
+  color: SomeColor,
+  start, len: int
+) {.simd.} =
+  let rgbx = color.asRgbx()
+  # Writes rgbx to data[start ..< start + len]: scalar head, SIMD body, tail.
+  var
+    i = start
+    p = cast[uint](data[i].addr) # raw address of the next pixel to write
+  # Scalar stores until p is 16-byte aligned, as mm_store_si128 requires
+  while i < (start + len) and (p and 15) != 0:
+    data[i] = rgbx
+    inc i
+    p += 4 # one 4-byte pixel
+
+  let
+    colorVec = mm_set1_epi32(cast[int32](rgbx)) # rgbx repeated in 4 lanes
+    iterations = (start + len - i) div 8 # 8 pixels (32 bytes) per pass
+  for _ in 0 ..< iterations:
+    mm_store_si128(cast[pointer](p), colorVec)
+    mm_store_si128(cast[pointer](p + 16), colorVec)
+    p += 32
+  i += iterations * 8
+  # Scalar tail for the pixels left after the last full 8-pixel pass
+  for i in i ..< start + len:
+    data[i] = rgbx
+
+proc isOneColorSse2*(image: Image): bool {.simd.} =
+  result = true
+  # True when every pixel equals image.data[0]; returns false at first mismatch.
+  let color = image.data[0] # NOTE(review): assumes non-empty data - confirm
+
+  var
+    i: int
+    p = cast[uint](image.data[0].addr)
+  # Scalar compares until p is 16-byte aligned, as mm_load_si128 requires
+  while i < image.data.len and (p and 15) != 0:
+    if image.data[i] != color:
+      return false
+    inc i
+    p += 4
+
+  let
+    colorVec = mm_set1_epi32(cast[int32](color))
+    iterations = (image.data.len - i) div 16 # 16 pixels (64 bytes) per pass
+  for _ in 0 ..< iterations:
+    let
+      values0 = mm_load_si128(cast[pointer](p))
+      values1 = mm_load_si128(cast[pointer](p + 16))
+      values2 = mm_load_si128(cast[pointer](p + 32))
+      values3 = mm_load_si128(cast[pointer](p + 48))
+      eq0 = mm_cmpeq_epi8(values0, colorVec)
+      eq1 = mm_cmpeq_epi8(values1, colorVec)
+      eq2 = mm_cmpeq_epi8(values2, colorVec)
+      eq3 = mm_cmpeq_epi8(values3, colorVec)
+      eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
+    if mm_movemask_epi8(eq0123) != 0xffff: # some byte differed from color
+      return false
+    p += 64
+  i += 16 * iterations
+
+  for i in i ..< image.data.len:
+    if image.data[i] != color:
+      return false
+
+proc isTransparentSse2*(image: Image): bool {.simd.} =
+  var
+    i: int
+    p = cast[uint](image.data[0].addr)
+  # Scalar alpha checks until p is 16-byte aligned, as mm_load_si128 requires
+  while i < image.data.len and (p and 15) != 0:
+    if image.data[i].a != 0:
+      return false
+    inc i
+    p += 4
+  # Returns true only if no check below finds a non-transparent pixel.
+  result = true
+  # SIMD body: OR 16 pixels together, then require every byte to be zero.
+  let
+    vecZero = mm_setzero_si128()
+    iterations = (image.data.len - i) div 16
+  for _ in 0 ..< iterations:
+    let
+      values0 = mm_load_si128(cast[pointer](p))
+      values1 = mm_load_si128(cast[pointer](p + 16))
+      values2 = mm_load_si128(cast[pointer](p + 32))
+      values3 = mm_load_si128(cast[pointer](p + 48))
+      values01 = mm_or_si128(values0, values1)
+      values23 = mm_or_si128(values2, values3)
+      values0123 = mm_or_si128(values01, values23)
+    # NOTE(review): tests all channels, not just .a; presumably equal for premultiplied data
+    if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff:
+      return false
+    p += 64
+  i += 16 * iterations
+
+  for i in i ..< image.data.len:
+    if image.data[i].a != 0:
+      return false
+
+proc isOpaqueSse2*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} =
+  result = true
+  # True when every pixel in data[start ..< start + len] has alpha == 255.
+  var
+    i = start
+    p = cast[uint](data[i].addr) # must track data[i], not data[0]
+  # Scalar checks until p is 16-byte aligned, as mm_load_si128 requires
+  while i < (start + len) and (p and 15) != 0:
+    if data[i].a != 255:
+      return false
+    inc i
+    p += 4
+
+  let
+    vec255 = mm_set1_epi8(255)
+    iterations = (start + len - i) div 16 # 16 pixels (64 bytes) per pass
+  for _ in 0 ..< iterations:
+    let
+      values0 = mm_load_si128(cast[pointer](p))
+      values1 = mm_load_si128(cast[pointer](p + 16))
+      values2 = mm_load_si128(cast[pointer](p + 32))
+      values3 = mm_load_si128(cast[pointer](p + 48))
+      values01 = mm_and_si128(values0, values1)
+      values23 = mm_and_si128(values2, values3)
+      values0123 = mm_and_si128(values01, values23) # 0xff only if all four are
+      eq = mm_cmpeq_epi8(values0123, vec255)
+    if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: # alpha bytes only
+      return false
+    p += 64
+  i += 16 * iterations
+
+  for i in i ..< start + len:
+    if data[i].a != 255:
+      return false
+
+proc toPremultipliedAlphaSse2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
+  var i: int
+  # Multiplies r, g, b of each pixel by a / 255 in place, 4 pixels per SIMD pass.
+  let
+    alphaMask = mm_set1_epi32(cast[int32](0xff000000)) # byte 3 of each pixel
+    oddMask = mm_set1_epi16(0xff00) # high byte of each 16-bit lane
+    div255 = mm_set1_epi16(0x8081) # mulhi by this, then >> 7, divides by 255
+    iterations = data.len div 4
+  for _ in 0 ..< iterations:
+    let
+      values = mm_loadu_si128(data[i].addr)
+      alpha = mm_and_si128(values, alphaMask)
+      eq = mm_cmpeq_epi8(values, alphaMask) # byte 3 matches when alpha == 255
+    if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: # not all opaque
+      let
+        evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
+        oddMultiplier = mm_or_si128(evenMultiplier, alphaMask) # alpha lane x255
+      var
+        colorsEven = mm_slli_epi16(values, 8)
+        colorsOdd = mm_and_si128(values, oddMask)
+      colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
+      colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
+      colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
+      colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
+      mm_storeu_si128(
+        data[i].addr,
+        mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
+      )
+    i += 4
+  # Scalar tail for the last data.len mod 4 pixels
+  for i in i ..< data.len:
+    var c = data[i]
+    if c.a != 255:
+      c.r = ((c.r.uint32 * c.a) div 255).uint8
+      c.g = ((c.g.uint32 * c.a) div 255).uint8
+      c.b = ((c.b.uint32 * c.a) div 255).uint8
+      data[i] = c
+
+proc newImageSse2*(mask: Mask): Image {.simd.} =
+  result = newImage(mask.width, mask.height)
+  # Builds an image where every channel of pixel k is mask.data[k].
+  var i: int
+  for _ in 0 ..< mask.data.len div 16:
+    var alphas = mm_loadu_si128(mask.data[i].addr) # 16 mask values
+    for j in 0 ..< 4:
+      var unpacked = unpackAlphaValues(alphas)
+      unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 8))
+      unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16)) # all 4 bytes
+      mm_storeu_si128(result.data[i + j * 4].addr, unpacked)
+      alphas = mm_srli_si128(alphas, 4) # bring the next 4 mask values down
+    i += 16
+  # Scalar tail for the last mask.data.len mod 16 values
+  for i in i ..< mask.data.len:
+    let v = mask.data[i]
+    result.data[i] = rgbx(v, v, v, v)
+
+proc newMaskSse2*(image: Image): Mask {.simd.} =
+  result = newMask(image.width, image.height)
+  # Builds a mask whose value k is the alpha of image.data[k].
+  var i: int
+  for _ in 0 ..< image.data.len div 16:
+    let
+      a = mm_loadu_si128(image.data[i + 0].addr)
+      b = mm_loadu_si128(image.data[i + 4].addr)
+      c = mm_loadu_si128(image.data[i + 8].addr)
+      d = mm_loadu_si128(image.data[i + 12].addr)
+    mm_storeu_si128(
+      result.data[i].addr,
+      pack4xAlphaValues(a, b, c, d) # 16 pixels in, 16 alpha bytes out
+    )
+    i += 16
+  # Scalar tail for the last image.data.len mod 16 pixels
+  for i in i ..< image.data.len:
+    result.data[i] = image.data[i].a
+
+proc invertSse2*(target: Image | Mask) {.simd.} =
+  var
+    i: int
+    p = cast[uint](target.data[0].addr)
+  # Scalar inversion until p is 16-byte aligned, as mm_load_si128 requires
+  while i < target.data.len and (p and 15) != 0:
+    when target is Image:
+      var rgbx = target.data[i]
+      rgbx.r = 255 - rgbx.r
+      rgbx.g = 255 - rgbx.g
+      rgbx.b = 255 - rgbx.b
+      rgbx.a = 255 - rgbx.a
+      target.data[i] = rgbx
+      inc i
+      p += 4
+    else:
+      target.data[i] = 255 - target.data[i]
+      inc i
+      inc p
+
+  let vec255 = mm_set1_epi8(255)
+  # Count only elements left after the scalar head so the loop stays in bounds
+  when target is Image:
+    let iterations = (target.data.len - i) div 16
+  else:
+    let iterations = (target.data.len - i) div 64
+
+  for _ in 0 ..< iterations: # 64 bytes per pass: every byte becomes 255 - byte
+    let
+      a = mm_load_si128(cast[pointer](p))
+      b = mm_load_si128(cast[pointer](p + 16))
+      c = mm_load_si128(cast[pointer](p + 32))
+      d = mm_load_si128(cast[pointer](p + 48))
+    mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
+    mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
+    mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
+    mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
+    p += 64
+
+  when target is Image:
+    i += 16 * iterations # 16 pixels per pass
+
+    for i in i ..< target.data.len:
+      var rgbx = target.data[i]
+      rgbx.r = 255 - rgbx.r
+      rgbx.g = 255 - rgbx.g
+      rgbx.b = 255 - rgbx.b
+      rgbx.a = 255 - rgbx.a
+      target.data[i] = rgbx
+
+    toPremultipliedAlphaSse2(target.data) # re-multiply by the inverted alpha
+  else:
+    i += 64 * iterations # 64 mask bytes per pass
+
+    for i in i ..< target.data.len:
+      target.data[i] = 255 - target.data[i]
+
+proc ceilSse2*(mask: Mask) {.simd.} =
+  var
+    i: int
+    p = cast[uint](mask.data[0].addr)
+  # Sets every nonzero mask value to 255 in place; zeros stay zero.
+  let
+    zeroVec = mm_setzero_si128()
+    vec255 = mm_set1_epi8(255)
+    iterations = mask.data.len div 16 # 16 mask bytes per pass
+  for _ in 0 ..< iterations:
+    var values = mm_loadu_si128(cast[pointer](p))
+    values = mm_cmpeq_epi8(values, zeroVec) # 0xff where the byte was 0
+    values = mm_andnot_si128(values, vec255) # (not values) and 255
+    mm_storeu_si128(cast[pointer](p), values)
+    p += 16
+  i += 16 * iterations
+  # Scalar tail for the last mask.data.len mod 16 values
+  for i in i ..< mask.data.len:
+    if mask.data[i] != 0:
+      mask.data[i] = 255
+
+proc applyOpacitySse2*(target: Image | Mask, opacity: float32) {.simd.} =
+  let opacity = round(255 * opacity).uint16 # opacity scaled by 255 and rounded
+  if opacity == 255:
+    return
+  # 255 leaves the data untouched (above); 0 clears it with fill (below).
+  if opacity == 0:
+    when target is Image:
+      target.fill(rgbx(0, 0, 0, 0))
+    else:
+      target.fill(0)
+    return
+
+  var
+    i: int
+    p = cast[uint](target.data[0].addr)
+    len = # length in bytes: 4 per Image pixel, 1 per Mask value
+      when target is Image:
+        target.data.len * 4
+      else:
+        target.data.len
+  # Scales every byte by opacity / 255, 16 bytes per SIMD pass.
+  let
+    oddMask = mm_set1_epi16(0xff00)
+    div255 = mm_set1_epi16(0x8081) # mulhi by this, then >> 7, divides by 255
+    zeroVec = mm_setzero_si128()
+    opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
+    iterations = len div 16
+  for _ in 0 ..< len div 16:
+    let values = mm_loadu_si128(cast[pointer](p))
+    if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff: # not all 0
+      var
+        valuesEven = mm_slli_epi16(values, 8)
+        valuesOdd = mm_and_si128(values, oddMask)
+      valuesEven = mm_mulhi_epu16(valuesEven, opacityVec)
+      valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec)
+      valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7)
+      valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7)
+      mm_storeu_si128(
+        cast[pointer](p),
+        mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8))
+      )
+    p += 16
+  i += 16 * iterations # i counts bytes here
+
+  when target is Image:
+    for i in i div 4 ..< target.data.len: # byte offset -> pixel index
+      var rgbx = target.data[i]
+      rgbx.r = ((rgbx.r * opacity) div 255).uint8
+      rgbx.g = ((rgbx.g * opacity) div 255).uint8
+      rgbx.b = ((rgbx.b * opacity) div 255).uint8
+      rgbx.a = ((rgbx.a * opacity) div 255).uint8
+      target.data[i] = rgbx
+  else:
+    for i in i ..< target.data.len:
+      target.data[i] = ((target.data[i] * opacity) div 255).uint8
+
+when defined(release):
+  {.pop.}
diff --git a/src/pixie/simd/todo.nim b/src/pixie/simd/todo.nim
new file mode 100644
index 0000000..e7cafb9
--- /dev/null
+++ b/src/pixie/simd/todo.nim
@@ -0,0 +1,33 @@
+import chroma, nimsimd/sse2
+
+when defined(release):
+  {.push checks: off.}
+
+proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
+  let opacityVec = mm_set1_ps(opacity)
+  var finalColor = mm_cvtps_epi32(mm_mul_ps(color, opacityVec))
+  finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
+  finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
+  cast[ColorRGBX](mm_cvtsi128_si32(finalColor))
+
+proc packAlphaValues(v: M128i): M128i {.inline.} =
+  ## Shuffle the alpha values for these 4 colors to the first 4 bytes.
+  result = mm_srli_epi32(v, 24)
+  result = mm_packus_epi16(result, mm_setzero_si128())
+  result = mm_packus_epi16(result, mm_setzero_si128())
+
+proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline.} =
+  let
+    i = packAlphaValues(i)
+    j = mm_slli_si128(packAlphaValues(j), 4)
+    k = mm_slli_si128(packAlphaValues(k), 8)
+    l = mm_slli_si128(packAlphaValues(l), 12)
+  mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l))
+
+proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
+  ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value).
+  result = mm_unpacklo_epi8(mm_setzero_si128(), v)
+  result = mm_unpacklo_epi8(mm_setzero_si128(), result)
+
+when defined(release):
+  {.pop.}

From 83c7527474793f0f957ad7cf464ea6e2e0e6520e Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Thu, 7 Jul 2022 19:11:21 -0500
Subject: [PATCH 08/13] rm

---
 src/pixie/simd.nim | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim
index 14207d6..a61f751 100644
--- a/src/pixie/simd.nim
+++ b/src/pixie/simd.nim
@@ -32,7 +32,6 @@ macro hasSimd*(procedure: untyped) =
       `bodySse2`
   else:
     body.add quote do:
-      echo "using ", `name`, " scalar"
       `originalBody`
 
   procedure[6] = body

From 902dda2a2e7edb96c25d862db619dba88cc9d6ef Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Thu, 7 Jul 2022 19:39:39 -0500
Subject: [PATCH 09/13] simpler

---
 src/pixie/simd.nim | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim
index a61f751..8a6eddf 100644
--- a/src/pixie/simd.nim
+++ b/src/pixie/simd.nim
@@ -42,11 +42,7 @@ when allowSimd and defined(amd64):
   import simd/sse2, simd/avx, simd/avx2
   export sse2, avx, avx2
 
-  when defined(pixieNoAvx):
-    const
-      cpuHasAvx* = false
-      cpuHasAvx2* = false
-  else:
+  when not defined(pixieNoAvx):
     import nimsimd/runtimecheck
     let
       cpuHasAvx* = checkInstructionSets({AVX})

From abc2dd653f790d77dce8e153a03dfa935924e30a Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Thu, 7 Jul 2022 19:52:22 -0500
Subject: [PATCH 10/13] rm todo

---
 src/pixie/internal.nim  |  4 ----
 src/pixie/simd/sse2.nim | 28 +++++++++++++++++++++++++++-
 src/pixie/simd/todo.nim | 33 ---------------------------------
 3 files changed, 27 insertions(+), 38 deletions(-)
 delete mode 100644 src/pixie/simd/todo.nim

diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim
index 1cc5b55..df4e639 100644
--- a/src/pixie/internal.nim
+++ b/src/pixie/internal.nim
@@ -120,9 +120,5 @@ proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool {.hasSimd.} =
     if data[i].a != 255:
       return false
 
-when defined(amd64) and allowSimd:
-  import simd/todo
-  export todo
-
 when defined(release):
   {.pop.}
diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
index b862fbc..7421b9d 100644
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -1,8 +1,34 @@
-import chroma, internal, nimsimd/sse2, pixie/common, todo, vmath
+import chroma, internal, nimsimd/sse2, pixie/common, vmath
 
 when defined(release):
   {.push checks: off.}
 
+proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
+  let opacityVec = mm_set1_ps(opacity) # opacity in all 4 float lanes
+  var finalColor = mm_cvtps_epi32(mm_mul_ps(color, opacityVec)) # scale, to int32
+  finalColor = mm_packus_epi16(finalColor, mm_setzero_si128()) # narrow lanes
+  finalColor = mm_packus_epi16(finalColor, mm_setzero_si128()) # 4 bytes in low dword
+  cast[ColorRGBX](mm_cvtsi128_si32(finalColor)) # presumably lanes are 0..255 - confirm
+
+proc packAlphaValues(v: M128i): M128i {.inline.} =
+  ## Shuffle the alpha values for these 4 colors to the first 4 bytes.
+  result = mm_srli_epi32(v, 24) # alpha (top byte) moved to the low byte
+  result = mm_packus_epi16(result, mm_setzero_si128()) # one alpha per 16 bits
+  result = mm_packus_epi16(result, mm_setzero_si128()) # one alpha per byte
+
+proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline.} =
+  let # each input holds 4 pixels; together they yield 16 alpha bytes
+    i = packAlphaValues(i) # bytes 0..3
+    j = mm_slli_si128(packAlphaValues(j), 4) # shifted to bytes 4..7
+    k = mm_slli_si128(packAlphaValues(k), 8) # shifted to bytes 8..11
+    l = mm_slli_si128(packAlphaValues(l), 12) # shifted to bytes 12..15
+  mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l))
+
+proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
+  ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value).
+  result = mm_unpacklo_epi8(mm_setzero_si128(), v) # value in high byte of 16 bits
+  result = mm_unpacklo_epi8(mm_setzero_si128(), result) # then of 32 bits
+
 proc fillUnsafeSse2*(
   data: var seq[ColorRGBX],
   color: SomeColor,
diff --git a/src/pixie/simd/todo.nim b/src/pixie/simd/todo.nim
deleted file mode 100644
index e7cafb9..0000000
--- a/src/pixie/simd/todo.nim
+++ /dev/null
@@ -1,33 +0,0 @@
-import chroma, nimsimd/sse2
-
-when defined(release):
-  {.push checks: off.}
-
-proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
-  let opacityVec = mm_set1_ps(opacity)
-  var finalColor = mm_cvtps_epi32(mm_mul_ps(color, opacityVec))
-  finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
-  finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
-  cast[ColorRGBX](mm_cvtsi128_si32(finalColor))
-
-proc packAlphaValues(v: M128i): M128i {.inline.} =
-  ## Shuffle the alpha values for these 4 colors to the first 4 bytes.
-  result = mm_srli_epi32(v, 24)
-  result = mm_packus_epi16(result, mm_setzero_si128())
-  result = mm_packus_epi16(result, mm_setzero_si128())
-
-proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline.} =
-  let
-    i = packAlphaValues(i)
-    j = mm_slli_si128(packAlphaValues(j), 4)
-    k = mm_slli_si128(packAlphaValues(k), 8)
-    l = mm_slli_si128(packAlphaValues(l), 12)
-  mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l))
-
-proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
-  ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value).
-  result = mm_unpacklo_epi8(mm_setzero_si128(), v)
-  result = mm_unpacklo_epi8(mm_setzero_si128(), result)
-
-when defined(release):
-  {.pop.}

From 9099df4715d2d66f70f88c459e714ca2593908ac Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Thu, 7 Jul 2022 20:10:14 -0500
Subject: [PATCH 11/13] treeform better macro

---
 src/pixie/simd.nim          | 40 ++-------------------
 src/pixie/simd/internal.nim | 71 ++++++++++++++++++++++++++++---------
 2 files changed, 58 insertions(+), 53 deletions(-)

diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim
index 8a6eddf..4988bd1 100644
--- a/src/pixie/simd.nim
+++ b/src/pixie/simd.nim
@@ -1,43 +1,9 @@
-import simd/internal, std/macros, std/tables
+import simd/internal
+
+export internal
 
 const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
 
-macro hasSimd*(procedure: untyped) =
-  let
-    name = procedure.procName()
-    args = procedure.procArguments()
-    originalBody = procedure[6]
-    nameSse2 = name & "Sse2"
-    nameAvx = name & "Avx"
-    nameAvx2 = name & "Avx2"
-    callAvx = call(ident(nameAvx), args)
-    callAvx2 = call(ident(nameAvx2), args)
-
-  var body = newStmtList()
-
-  when not defined(pixieNoAvx):
-    if nameAvx2 in simdProcs:
-      body.add quote do:
-        if cpuHasAvx2:
-          forceReturn `callAvx2`
-
-    if nameAvx in simdProcs:
-      body.add quote do:
-        if cpuHasAvx:
-          forceReturn `callAvx`
-
-  if nameSse2 in simdProcs:
-    let bodySse2 = simdProcs[nameSse2][6]
-    body.add quote do:
-      `bodySse2`
-  else:
-    body.add quote do:
-      `originalBody`
-
-  procedure[6] = body
-
-  return procedure
-
 when allowSimd and defined(amd64):
   import simd/sse2, simd/avx, simd/avx2
   export sse2, avx, avx2
diff --git a/src/pixie/simd/internal.nim b/src/pixie/simd/internal.nim
index 2165e72..25885f9 100644
--- a/src/pixie/simd/internal.nim
+++ b/src/pixie/simd/internal.nim
@@ -2,16 +2,7 @@ import std/macros, std/tables
 
 var simdProcs* {.compiletime.}: Table[string, NimNode]
 
-template forceReturn*(procedure: untyped) =
-  ## Produce `return procedure()` when procedure returns something otherwise
-  ## `procedure(); return` if it procedure returns nothing.
-  when compiles(block: return procedure):
-    return procedure
-  else:
-    procedure
-    return
-
-proc procName*(procedure: NimNode): string =
+proc procName(procedure: NimNode): string =
   ## Given a procedure signature returns only name string.
   let nameNode = procedure[0]
   if nameNode.kind == nnkPostfix:
@@ -19,21 +10,69 @@ proc procName*(procedure: NimNode): string =
   else:
     nameNode.strVal
 
-proc procArguments*(procedure: NimNode): seq[NimNode] =
+proc procArguments(procedure: NimNode): seq[NimNode] =
   ## Given a procedure signature gets the arguments as a list.
   for i, arg in procedure[3]:
     if i > 0:
       for j in 0 ..< arg.len - 2:
         result.add(arg[j])
 
-proc call*(name: NimNode, args: seq[NimNode]): NimNode =
+proc procReturnType(procedure: NimNode): NimNode =
+  ## Given a procedure signature gets the return type.
+  procedure[3][0] # child 3 is the formal params node; its child 0 is the return type
+
+proc callAndReturn(name: NimNode, procedure: NimNode): NimNode =
   ## Produces a procedure call with arguments.
-  result = newNimNode(nnkCall)
-  result.add(name)
-  for arg in args:
-    result.add(arg)
+  let
+    retType = procedure.procReturnType()
+    call = newNimNode(nnkCall)
+  call.add(name)
+  for arg in procedure.procArguments():
+    call.add(arg)
+  if retType.kind == nnkEmpty:
+    result = quote do:
+      `call`
+      return
+  else:
+    result = quote do:
+      return `call`
 
 macro simd*(procedure: untyped) =
   let name = procedure.procName()
   simdProcs[name] = procedure.copy()
   return procedure
+
+macro hasSimd*(procedure: untyped) =
+  let
+    name = procedure.procName()
+    originalBody = procedure[6] # child 6 is the proc body
+    nameSse2 = name & "Sse2"
+    nameAvx = name & "Avx"
+    nameAvx2 = name & "Avx2"
+    callAvx = callAndReturn(ident(nameAvx), procedure)
+    callAvx2 = callAndReturn(ident(nameAvx2), procedure)
+  # New body: try AVX2, then AVX, then fall through to the SSE2 or scalar body.
+  var body = newStmtList()
+
+  when not defined(pixieNoAvx):
+    if nameAvx2 in simdProcs: # only procs registered through {.simd.}
+      body.add quote do:
+        if cpuHasAvx2:
+          `callAvx2`
+
+    if nameAvx in simdProcs:
+      body.add quote do:
+        if cpuHasAvx: # the AVX variant needs AVX only, not AVX2
+          `callAvx`
+
+  if nameSse2 in simdProcs:
+    let bodySse2 = simdProcs[nameSse2][6] # SSE2 body is inlined, not called
+    body.add quote do:
+      `bodySse2`
+  else:
+    body.add quote do:
+      `originalBody`
+
+  procedure[6] = body
+
+  return procedure

From 0b8d8029f3bebc2373d58906cf361f1cbc9bfe12 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Thu, 7 Jul 2022 20:16:39 -0500
Subject: [PATCH 12/13] restore expected procs/types

---
 src/pixie/images.nim | 2 ++
 src/pixie/masks.nim  | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index 53f969d..11f5c15 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -1,5 +1,7 @@
 import blends, bumpy, chroma, common, internal, masks, simd, vmath
 
+export Image, newImage
+
 const h = 0.5.float32
 
 type UnsafeImage = distinct Image
diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim
index e214f8e..6856ba6 100644
--- a/src/pixie/masks.nim
+++ b/src/pixie/masks.nim
@@ -1,5 +1,7 @@
 import common, internal, simd, vmath
 
+export Mask, newMask
+
 type UnsafeMask = distinct Mask
 
 when defined(release):

From 316bf1ce4f68f03313005b2001415d013f550606 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg <ryan@guzba.com>
Date: Thu, 7 Jul 2022 21:40:02 -0500
Subject: [PATCH 13/13] rm

---
 src/pixie/internal.nim | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim
index df4e639..8a6e2ef 100644
--- a/src/pixie/internal.nim
+++ b/src/pixie/internal.nim
@@ -71,12 +71,7 @@ proc fillUnsafe*(
 ) {.hasSimd, raises: [].} =
   ## Fills the image data with the color starting at index start and
   ## continuing for len indices.
-  when allowSimd and compiles(fillUnsafeSimd):
-    fillUnsafeSimd(data, start, len, color)
-    return
-
   let rgbx = color.asRgbx()
-
   # Use memset when every byte has the same value
   if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
     nimSetMem(data[start].addr, rgbx.r.cint, len * 4)