Merge pull request #461 from guzba/master

simd, hasSimd pragmas
2022-07-07 20:30:09 -07:00 · 2022-07-07 20:30:09 -07:00 · 595ddeaa50
commit 595ddeaa50
parent 66d5535ae9 316bf1ce4f
15 changed files with 578 additions and 640 deletions
--- a/.gitignore
+++ b/.gitignore
@ -16,3 +16,4 @@ bindings/generated
 dump.txt
 tests/fileformats/jpeg/generated
 tests/fileformats/jpeg/diffs
+*.dylib
--- a/src/pixie/blends.nim
+++ b/src/pixie/blends.nim
@ -1,9 +1,6 @@
 ## Blending modes.

-import chroma, common, internal, std/math
-
-when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+import chroma, common, simd, std/math

 # See https://www.w3.org/TR/compositing-1/
 # See https://www.khronos.org/registry/OpenGL/extensions/KHR/KHR_blend_equation_advanced.txt
@ -273,67 +270,16 @@ proc blendSoftLight*(backdrop, source: ColorRGBX): ColorRGBX =
    backdrop = backdrop.rgba()
    source = source.rgba()

-  var rgba: ColorRGBA
-  when defined(amd64) and allowSimd:
-    let
-      vb = mm_setr_ps(
-        backdrop.r.float32,
-        backdrop.g.float32,
-        backdrop.b.float32,
-        0
-      )
-      vs = mm_setr_ps(source.r.float32, source.g.float32, source.b.float32, 0)
-      v2 = mm_set1_ps(2)
-      v255 = mm_set1_ps(255)
-      v255sq = mm_set1_ps(255 * 255)
-      vm = ((v255 - v2 * vs) * vb * vb) / v255sq + (v2 * vs * vb) / v255
-      values = cast[array[4, uint32]](mm_cvtps_epi32(vm))
+  let
+    b = backdrop.color
+    s = source.color
+  var blended: Color
+  blended.r = softLight(b.r, s.r)
+  blended.g = softLight(b.g, s.g)
+  blended.b = softLight(b.b, s.b)
+  blended = alphaFix(b, s, blended)

-    rgba.r = values[0].uint8
-    rgba.g = values[1].uint8
-    rgba.b = values[2].uint8
-
-    # proc alphaFix(backdrop, source, mixed: ColorRGBX): ColorRGBX {.inline.} =
-    #   if backdrop.a == 0 and source.a == 0:
-    #     return
-    #   let
-    #     vb = mm_setr_ps(backdrop.r.float32, backdrop.g.float32, backdrop.b.float32, 0)
-    #     vs = mm_setr_ps(source.r.float32, source.g.float32, source.b.float32, 0)
-    #     vm = mm_setr_ps(mixed.r.float32, mixed.g.float32, mixed.b.float32, 0)
-    #   alphaFix(backdrop, source, vb, vs, vm)
-
-    let
-      sa = source.a.float32
-      ba = backdrop.a.float32
-      a = sa + ba * (255 - sa) / 255
-    if a == 0:
-      return
-
-    let
-      t0 = mm_set1_ps(sa * (255 - ba))
-      t1 = mm_set1_ps(sa * ba)
-      t2 = mm_set1_ps((255 - sa) * ba)
-      va = mm_set1_ps(a)
-      final = cast[array[4, uint32]](
-        mm_cvtps_epi32((t0 * vs + t1 * vm + t2 * vb) / va / v255)
-      )
-
-    rgba.r = final[0].uint8
-    rgba.g = final[1].uint8
-    rgba.b = final[2].uint8
-    rgba.a = a.uint8
-  else:
-    let
-      b = backdrop.color
-      s = source.color
-    var blended: Color
-    blended.r = softLight(b.r, s.r)
-    blended.g = softLight(b.g, s.g)
-    blended.b = softLight(b.b, s.b)
-    blended = alphaFix(b, s, blended)
-    rgba = blended.rgba
-
-  result = rgba.rgbx()
+  result = blended.rgbx()

 proc blendHardLight*(backdrop, source: ColorRGBX): ColorRGBX =
  result.r = hardLight(backdrop.r, backdrop.a, source.r, source.a)
--- a/src/pixie/common.nim
+++ b/src/pixie/common.nim
@ -31,6 +31,36 @@ type
  ImageDimensions* = object
    width*, height*: int

+  Image* = ref object
+    ## Image object that holds bitmap data in premultiplied alpha RGBA format.
+    width*, height*: int
+    data*: seq[ColorRGBX]
+
+  Mask* = ref object
+    ## Mask object that holds mask opacity data.
+    width*, height*: int
+    data*: seq[uint8]
+
+proc newImage*(width, height: int): Image {.raises: [PixieError].} =
+  ## Creates a new image with the parameter dimensions.
+  if width <= 0 or height <= 0:
+    raise newException(PixieError, "Image width and height must be > 0")
+
+  result = Image()
+  result.width = width
+  result.height = height
+  result.data = newSeq[ColorRGBX](width * height)
+
+proc newMask*(width, height: int): Mask {.raises: [PixieError].} =
+  ## Creates a new mask with the parameter dimensions.
+  if width <= 0 or height <= 0:
+    raise newException(PixieError, "Mask width and height must be > 0")
+
+  result = Mask()
+  result.width = width
+  result.height = height
+  result.data = newSeq[uint8](width * height)
+
 proc mix*(a, b: uint8, t: float32): uint8 {.inline, raises: [].} =
  ## Linearly interpolate between a and b using t.
  let t = round(t * 255).uint32
--- a/src/pixie/fileformats/jpeg.nim
+++ b/src/pixie/fileformats/jpeg.nim
@ -1,8 +1,5 @@
 import chroma, flatty/binny, pixie/common, pixie/images, pixie/internal,
-    pixie/masks, std/decls, std/sequtils, std/strutils
-
-when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+    pixie/masks, pixie/simd, std/decls, std/sequtils, std/strutils

 # This JPEG decoder is loosely based on stb_image which is public domain.

--- a/src/pixie/fileformats/png.nim
+++ b/src/pixie/fileformats/png.nim
@ -1,8 +1,5 @@
 import chroma, flatty/binny, math, pixie/common, pixie/images, pixie/internal,
-    pixie/masks, zippy, zippy/crc
-
-when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+    pixie/simd, zippy, zippy/crc

 # See http://www.libpng.org/pub/png/spec/1.2/PNG-Contents.html

--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@ -1,45 +1,26 @@
-import blends, bumpy, chroma, common, internal, masks, vmath
+import blends, bumpy, chroma, common, internal, masks, simd, vmath

-when allowSimd:
-  import simd
-
-  when defined(amd64):
-    import nimsimd/sse2
+export Image, newImage

 const h = 0.5.float32

-type
-  Image* = ref object
-    ## Image object that holds bitmap data in RGBA format.
-    width*, height*: int
-    data*: seq[ColorRGBX]
-
-  UnsafeImage = distinct Image
+type UnsafeImage = distinct Image

 when defined(release):
  {.push checks: off.}

-proc newImage*(width, height: int): Image {.raises: [PixieError].} =
-  ## Creates a new image with the parameter dimensions.
-  if width <= 0 or height <= 0:
-    raise newException(PixieError, "Image width and height must be > 0")
-
-  result = Image()
-  result.width = width
-  result.height = height
-  result.data = newSeq[ColorRGBX](width * height)
-
-proc newImage*(mask: Mask): Image {.raises: [PixieError].} =
+proc newImage*(mask: Mask): Image {.hasSimd, raises: [PixieError].} =
  result = newImage(mask.width, mask.height)
-
-  when allowSimd and compiles(newImageFromMaskSimd):
-    newImageFromMaskSimd(result.data, mask.data)
-    return
-
  for i in 0 ..< mask.data.len:
    let v = mask.data[i]
    result.data[i] = rgbx(v, v, v, v)

+proc newMask*(image: Image): Mask {.hasSimd, raises: [PixieError].} =
+  ## Returns a new mask using the alpha values of the image.
+  result = newMask(image.width, image.height)
+  for i in 0 ..< image.data.len:
+    result.data[i] = image.data[i].a
+
 proc copy*(image: Image): Image {.raises: [PixieError].} =
  ## Copies the image data into a new image.
  result = newImage(image.width, image.height)
@ -95,25 +76,17 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} =
  ## Fills the image with the color.
  fillUnsafe(image.data, color, 0, image.data.len)

-proc isOneColor*(image: Image): bool {.raises: [].} =
+proc isOneColor*(image: Image): bool {.hasSimd, raises: [].} =
  ## Checks if the entire image is the same color.
-  when allowSimd and compiles(isOneColorSimd):
-    return isOneColorSimd(image.data)
-
  result = true
-
  let color = cast[uint32](image.data[0])
  for i in 0 ..< image.data.len:
    if cast[uint32](image.data[i]) != color:
      return false

-proc isTransparent*(image: Image): bool {.raises: [].} =
+proc isTransparent*(image: Image): bool {.hasSimd, raises: [].} =
  ## Checks if this image is fully transparent or not.
-  when allowSimd and compiles(isTransparentSimd):
-    return isTransparentSimd(image.data)
-
  result = true
-
  for i in 0 ..< image.data.len:
    if image.data[i].a != 0:
      return false
@ -347,46 +320,38 @@ proc magnifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} =
        result.width * 4
      )

-proc applyOpacity*(image: Image, opacity: float32) {.raises: [].} =
+proc applyOpacity*(target: Image, opacity: float32) {.hasSimd, raises: [].} =
  ## Multiplies alpha of the image by opacity.
  let opacity = round(255 * opacity).uint16
  if opacity == 255:
    return

  if opacity == 0:
-    image.fill(rgbx(0, 0, 0, 0))
+    target.fill(rgbx(0, 0, 0, 0))
    return

-  when allowSimd and compiles(applyOpacitySimd):
-    applyOpacitySimd(image.data, opacity)
-    return
-
-  for i in 0 ..< image.data.len:
-    var rgbx = image.data[i]
+  for i in 0 ..< target.data.len:
+    var rgbx = target.data[i]
    rgbx.r = ((rgbx.r * opacity) div 255).uint8
    rgbx.g = ((rgbx.g * opacity) div 255).uint8
    rgbx.b = ((rgbx.b * opacity) div 255).uint8
    rgbx.a = ((rgbx.a * opacity) div 255).uint8
-    image.data[i] = rgbx
+    target.data[i] = rgbx

-proc invert*(image: Image) {.raises: [].} =
+proc invert*(target: Image) {.hasSimd, raises: [].} =
  ## Inverts all of the colors and alpha.
-  when allowSimd and compiles(invertImageSimd):
-    invertImageSimd(image.data)
-    return
-
-  for i in 0 ..< image.data.len:
-    var rgbx = image.data[i]
+  for i in 0 ..< target.data.len:
+    var rgbx = target.data[i]
    rgbx.r = 255 - rgbx.r
    rgbx.g = 255 - rgbx.g
    rgbx.b = 255 - rgbx.b
    rgbx.a = 255 - rgbx.a
-    image.data[i] = rgbx
+    target.data[i] = rgbx

  # Inverting rgbx(50, 100, 150, 200) becomes rgbx(205, 155, 105, 55). This
  # is not a valid premultiplied alpha color.
  # We need to convert back to premultiplied alpha after inverting.
-  image.data.toPremultipliedAlpha()
+  target.data.toPremultipliedAlpha()

 proc blur*(
  image: Image, radius: float32, outOfBounds: SomeColor = color(0, 0, 0, 0)
@ -449,17 +414,6 @@ proc blur*(
        values += outOfBounds * kernel[yy - y + radius]
      image.unsafe[x, y] = rgbx(values)

-proc newMask*(image: Image): Mask {.raises: [PixieError].} =
-  ## Returns a new mask using the alpha values of the image.
-  result = newMask(image.width, image.height)
-
-  when allowSimd and compiles(newMaskFromImageSimd):
-    newMaskFromImageSimd(result.data, image.data)
-    return
-
-  for i in 0 ..< image.data.len:
-    result.data[i] = image.data[i].a
-
 proc getRgbaSmooth*(
  image: Image, x, y: float32, wrapped = false
 ): ColorRGBX {.raises: [].} =
--- a/src/pixie/internal.nim
+++ b/src/pixie/internal.nim
@ -1,12 +1,4 @@
-import bumpy, chroma, common, system/memory, vmath
-
-const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
-
-when allowSimd:
-  import simd
-
-  when defined(amd64):
-    import nimsimd/sse2
+import bumpy, chroma, common, simd, system/memory, vmath

 template currentExceptionAsPixieError*(): untyped =
  ## Gets the current exception and returns it as a PixieError with stack trace.
@ -76,21 +68,16 @@ proc fillUnsafe*(

 proc fillUnsafe*(
  data: var seq[ColorRGBX], color: SomeColor, start, len: int
-) {.raises: [].} =
+) {.hasSimd, raises: [].} =
  ## Fills the image data with the color starting at index start and
  ## continuing for len indices.
-  when allowSimd and compiles(fillUnsafeSimd):
-    fillUnsafeSimd(data, start, len, color)
-    return
-
  let rgbx = color.asRgbx()
-
  # Use memset when every byte has the same value
  if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
    nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
  else:
-    for color in data.mitems:
-      color = rgbx
+    for i in start ..< start + len:
+          data[i] = rgbx

 const straightAlphaTable = block:
  var table: array[256, array[256, uint8]]
@ -110,12 +97,10 @@ proc toStraightAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
    c.b = straightAlphaTable[c.a][c.b]
    data[i] = c

-proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
+proc toPremultipliedAlpha*(
+  data: var seq[ColorRGBA | ColorRGBX]
+) {.hasSimd, raises: [].} =
  ## Converts an image to premultiplied alpha from straight alpha.
-  when allowSimd and compiles(toPremultipliedAlphaSimd):
-    toPremultipliedAlphaSimd(data)
-    return
-
  for i in 0 ..< data.len:
    var c = data[i]
    if c.a != 255:
@ -124,25 +109,11 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].}
      c.b = ((c.b.uint32 * c.a) div 255).uint8
      data[i] = c

-proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool =
-  when allowSimd and compiles(isOpaqueSimd):
-    return isOpaqueSimd(data, start, len)
-
+proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool {.hasSimd.} =
  result = true
-
  for i in start ..< start + len:
    if data[i].a != 255:
      return false

-when defined(amd64) and allowSimd:
-  proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
-    let opacityVec = mm_set1_ps(opacity)
-    var finalColor = mm_cvtps_epi32(mm_mul_ps(color, opacityVec))
-    finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
-    finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
-    cast[ColorRGBX](mm_cvtsi128_si32(finalColor))
-
-  export pack4xAlphaValues, unpackAlphaValues
-
 when defined(release):
  {.pop.}
--- a/src/pixie/masks.nim
+++ b/src/pixie/masks.nim
@ -1,32 +1,12 @@
-import common, internal, vmath
+import common, internal, simd, vmath

-when allowSimd:
-  import simd
+export Mask, newMask

-  when defined(amd64):
-    import nimsimd/sse2
-
-type
-  Mask* = ref object
-    ## Mask object that holds mask opacity data.
-    width*, height*: int
-    data*: seq[uint8]
-
-  UnsafeMask = distinct Mask
+type UnsafeMask = distinct Mask

 when defined(release):
  {.push checks: off.}

-proc newMask*(width, height: int): Mask {.raises: [PixieError].} =
-  ## Creates a new mask with the parameter dimensions.
-  if width <= 0 or height <= 0:
-    raise newException(PixieError, "Mask width and height must be > 0")
-
-  result = Mask()
-  result.width = width
-  result.height = height
-  result.data = newSeq[uint8](width * height)
-
 proc copy*(mask: Mask): Mask {.raises: [PixieError].} =
  ## Copies the image data into a new image.
  result = newMask(mask.width, mask.height)
@ -186,22 +166,18 @@ proc magnifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} =
        result.width * 4
      )

-proc applyOpacity*(mask: Mask, opacity: float32) {.raises: [].} =
+proc applyOpacity*(target: Mask, opacity: float32) {.hasSimd, raises: [].} =
  ## Multiplies alpha of the image by opacity.
  let opacity = round(255 * opacity).uint16
  if opacity == 255:
    return

  if opacity == 0:
-    mask.fill(0)
+    target.fill(0)
    return

-  when allowSimd and compiles(applyOpacitySimd):
-    applyOpacitySimd(mask.data, opacity)
-    return
-
-  for i in 0 ..< mask.data.len:
-    mask.data[i] = ((mask.data[i] * opacity) div 255).uint8
+  for i in 0 ..< target.data.len:
+    target.data[i] = ((target.data[i] * opacity) div 255).uint8

 proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
  ## Gets a interpolated value with float point coordinates.
@ -231,14 +207,10 @@ proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
  else:
    topMix

-proc invert*(mask: Mask) {.raises: [].} =
+proc invert*(target: Mask) {.hasSimd, raises: [].} =
  ## Inverts all of the values - creates a negative of the mask.
-  when allowSimd and compiles(invertMaskSimd):
-    invertMaskSimd(mask.data)
-    return
-
-  for i in 0 ..< mask.data.len:
-    mask.data[i] = 255 - mask.data[i]
+  for i in 0 ..< target.data.len:
+    target.data[i] = 255 - target.data[i]

 proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
  ## Grows the mask by spread.
@ -301,12 +273,8 @@ proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
            break
        mask.unsafe[x, y] = maxValue

-proc ceil*(mask: Mask) {.raises: [].} =
+proc ceil*(mask: Mask) {.hasSimd, raises: [].} =
  ## A value of 0 stays 0. Anything else turns into 255.
-  when allowSimd and compiles(invertImageSimd):
-    ceilMaskSimd(mask.data)
-    return
-
  for i in 0 ..< mask.data.len:
    if mask.data[i] != 0:
      mask.data[i] = 255
--- a/src/pixie/paints.nim
+++ b/src/pixie/paints.nim
@ -1,7 +1,4 @@
-import chroma, common, images, internal, vmath
-
-when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+import chroma, common, images, simd, vmath

 type
  PaintKind* = enum
--- a/src/pixie/paths.nim
+++ b/src/pixie/paths.nim
@ -1,8 +1,5 @@
-import blends, bumpy, chroma, common, images, internal, masks, paints, std/fenv,
-    std/strutils, vmath
-
-when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+import blends, bumpy, chroma, common, images, internal, masks, paints, simd,
+    std/fenv, std/strutils, vmath

 type
  WindingRule* = enum
--- a/src/pixie/simd.nim
+++ b/src/pixie/simd.nim
@ -1,393 +1,18 @@
-import chroma
+import simd/internal

-when defined(release):
-  {.push checks: off.}
+export internal

-when defined(amd64):
-  import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx,
-      runtimechecked/avx2
+const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)

-  let
-    cpuHasAvx* = checkInstructionSets({AVX})
-    cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
+when allowSimd and defined(amd64):
+  import simd/sse2, simd/avx, simd/avx2
+  export sse2, avx, avx2

-  proc packAlphaValues(v: M128i): M128i {.inline.} =
-    ## Shuffle the alpha values for these 4 colors to the first 4 bytes.
-    result = mm_srli_epi32(v, 24)
-    result = mm_packus_epi16(result, mm_setzero_si128())
-    result = mm_packus_epi16(result, mm_setzero_si128())
-
-  proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline.} =
+  when not defined(pixieNoAvx):
+    import nimsimd/runtimecheck
    let
-      i = packAlphaValues(i)
-      j = mm_slli_si128(packAlphaValues(j), 4)
-      k = mm_slli_si128(packAlphaValues(k), 8)
-      l = mm_slli_si128(packAlphaValues(l), 12)
-    mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l))
+      cpuHasAvx* = checkInstructionSets({AVX})
+      cpuHasAvx2* = checkInstructionSets({AVX, AVX2})

-  proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
-    ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value).
-    result = mm_unpacklo_epi8(mm_setzero_si128(), v)
-    result = mm_unpacklo_epi8(mm_setzero_si128(), result)
-
-  proc fillUnsafeSimd*(
-    data: var seq[ColorRGBX],
-    start, len: int,
-    color: SomeColor
-  ) =
-    if cpuHasAvx:
-      fillUnsafeAvx(data, start, len, color)
-      return
-
-    let rgbx = color.asRgbx()
-
-    var
-      i = start
-      p = cast[uint](data[i].addr)
-    # Align to 16 bytes
-    while i < (start + len) and (p and 15) != 0:
-      data[i] = rgbx
-      inc i
-      p += 4
-
-    let
-      colorVec = mm_set1_epi32(cast[int32](rgbx))
-      iterations = (start + len - i) div 8
-    for _ in 0 ..< iterations:
-      mm_store_si128(cast[pointer](p), colorVec)
-      mm_store_si128(cast[pointer](p + 16), colorVec)
-      p += 32
-    i += iterations * 8
-
-    for i in i ..< start + len:
-      data[i] = rgbx
-
-  proc isOneColorSimd*(data: var seq[ColorRGBX]): bool =
-    if cpuHasAvx2:
-      return isOneColorAvx2(data)
-
-    result = true
-
-    let color = data[0]
-
-    var
-      i: int
-      p = cast[uint](data[0].addr)
-    # Align to 16 bytes
-    while i < data.len and (p and 15) != 0:
-      if data[i] != color:
-        return false
-      inc i
-      p += 4
-
-    let
-      colorVec = mm_set1_epi32(cast[int32](color))
-      iterations = (data.len - i) div 16
-    for _ in 0 ..< iterations:
-      let
-        values0 = mm_load_si128(cast[pointer](p))
-        values1 = mm_load_si128(cast[pointer](p + 16))
-        values2 = mm_load_si128(cast[pointer](p + 32))
-        values3 = mm_load_si128(cast[pointer](p + 48))
-        eq0 = mm_cmpeq_epi8(values0, colorVec)
-        eq1 = mm_cmpeq_epi8(values1, colorVec)
-        eq2 = mm_cmpeq_epi8(values2, colorVec)
-        eq3 = mm_cmpeq_epi8(values3, colorVec)
-        eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
-      if mm_movemask_epi8(eq0123) != 0xffff:
-        return false
-      p += 64
-    i += 16 * iterations
-
-    for i in i ..< data.len:
-      if data[i] != color:
-        return false
-
-  proc isTransparentSimd*(data: var seq[ColorRGBX]): bool =
-    if cpuHasAvx2:
-      return isTransparentAvx2(data)
-
-    var
-      i: int
-      p = cast[uint](data[0].addr)
-    # Align to 16 bytes
-    while i < data.len and (p and 15) != 0:
-      if data[i].a != 0:
-        return false
-      inc i
-      p += 4
-
-    result = true
-
-    let
-      vecZero = mm_setzero_si128()
-      iterations = (data.len - i) div 16
-    for _ in 0 ..< iterations:
-      let
-        values0 = mm_load_si128(cast[pointer](p))
-        values1 = mm_load_si128(cast[pointer](p + 16))
-        values2 = mm_load_si128(cast[pointer](p + 32))
-        values3 = mm_load_si128(cast[pointer](p + 48))
-        values01 = mm_or_si128(values0, values1)
-        values23 = mm_or_si128(values2, values3)
-        values0123 = mm_or_si128(values01, values23)
-      if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff:
-        return false
-      p += 64
-    i += 16 * iterations
-
-    for i in i ..< data.len:
-      if data[i].a != 0:
-        return false
-
-  proc isOpaqueSimd*(data: var seq[ColorRGBX], start, len: int): bool =
-    if cpuHasAvx2:
-      return isOpaqueAvx2(data, start, len)
-
-    result = true
-
-    var
-      i = start
-      p = cast[uint](data[0].addr)
-    # Align to 16 bytes
-    while i < (start + len) and (p and 15) != 0:
-      if data[i].a != 255:
-        return false
-      inc i
-      p += 4
-
-    let
-      vec255 = mm_set1_epi8(255)
-      iterations = (start + len - i) div 16
-    for _ in 0 ..< iterations:
-      let
-        values0 = mm_load_si128(cast[pointer](p))
-        values1 = mm_load_si128(cast[pointer](p + 16))
-        values2 = mm_load_si128(cast[pointer](p + 32))
-        values3 = mm_load_si128(cast[pointer](p + 48))
-        values01 = mm_and_si128(values0, values1)
-        values23 = mm_and_si128(values2, values3)
-        values0123 = mm_and_si128(values01, values23)
-        eq = mm_cmpeq_epi8(values0123, vec255)
-      if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
-        return false
-      p += 64
-    i += 16 * iterations
-
-    for i in i ..< start + len:
-      if data[i].a != 255:
-        return false
-
-  proc toPremultipliedAlphaSimd*(data: var seq[ColorRGBA | ColorRGBX]) =
-    if cpuHasAvx2:
-      toPremultipliedAlphaAvx2(data)
-      return
-
-    var i: int
-
-    let
-      alphaMask = mm_set1_epi32(cast[int32](0xff000000))
-      oddMask = mm_set1_epi16(0xff00)
-      div255 = mm_set1_epi16(0x8081)
-      iterations = data.len div 4
-    for _ in 0 ..< iterations:
-      let
-        values = mm_loadu_si128(data[i].addr)
-        alpha = mm_and_si128(values, alphaMask)
-        eq = mm_cmpeq_epi8(values, alphaMask)
-      if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
-        let
-          evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
-          oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
-        var
-          colorsEven = mm_slli_epi16(values, 8)
-          colorsOdd = mm_and_si128(values, oddMask)
-        colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
-        colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
-        colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
-        colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
-        mm_storeu_si128(
-          data[i].addr,
-          mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
-        )
-      i += 4
-
-    for i in i ..< data.len:
-      var c = data[i]
-      if c.a != 255:
-        c.r = ((c.r.uint32 * c.a) div 255).uint8
-        c.g = ((c.g.uint32 * c.a) div 255).uint8
-        c.b = ((c.b.uint32 * c.a) div 255).uint8
-        data[i] = c
-
-  proc newImageFromMaskSimd*(dst: var seq[ColorRGBX], src: var seq[uint8]) =
-    var i: int
-    for _ in 0 ..< src.len div 16:
-      var alphas = mm_loadu_si128(src[i].addr)
-      for j in 0 ..< 4:
-        var unpacked = unpackAlphaValues(alphas)
-        unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 8))
-        unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
-        mm_storeu_si128(dst[i + j * 4].addr, unpacked)
-        alphas = mm_srli_si128(alphas, 4)
-      i += 16
-
-    for i in i ..< src.len:
-      let v = src[i]
-      dst[i] = rgbx(v, v, v, v)
-
-  proc newMaskFromImageSimd*(dst: var seq[uint8], src: var seq[ColorRGBX]) =
-    var i: int
-    for _ in 0 ..< src.len div 16:
-      let
-        a = mm_loadu_si128(src[i + 0].addr)
-        b = mm_loadu_si128(src[i + 4].addr)
-        c = mm_loadu_si128(src[i + 8].addr)
-        d = mm_loadu_si128(src[i + 12].addr)
-      mm_storeu_si128(
-        dst[i].addr,
-        pack4xAlphaValues(a, b, c, d)
-      )
-      i += 16
-
-    for i in i ..< src.len:
-      dst[i] = src[i].a
-
-  proc invertImageSimd*(data: var seq[ColorRGBX]) =
-    var
-      i: int
-      p = cast[uint](data[0].addr)
-    # Align to 16 bytes
-    while i < data.len and (p and 15) != 0:
-      var rgbx = data[i]
-      rgbx.r = 255 - rgbx.r
-      rgbx.g = 255 - rgbx.g
-      rgbx.b = 255 - rgbx.b
-      rgbx.a = 255 - rgbx.a
-      data[i] = rgbx
-      inc i
-      p += 4
-
-    let
-      vec255 = mm_set1_epi8(255)
-      iterations = data.len div 16
-    for _ in 0 ..< iterations:
-      let
-        a = mm_load_si128(cast[pointer](p))
-        b = mm_load_si128(cast[pointer](p + 16))
-        c = mm_load_si128(cast[pointer](p + 32))
-        d = mm_load_si128(cast[pointer](p + 48))
-      mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
-      mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
-      mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
-      mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
-      p += 64
-    i += 16 * iterations
-
-    for i in i ..< data.len:
-      var rgbx = data[i]
-      rgbx.r = 255 - rgbx.r
-      rgbx.g = 255 - rgbx.g
-      rgbx.b = 255 - rgbx.b
-      rgbx.a = 255 - rgbx.a
-      data[i] = rgbx
-
-    toPremultipliedAlphaSimd(data)
-
-  proc invertMaskSimd*(data: var seq[uint8]) =
-    var
-      i: int
-      p = cast[uint](data[0].addr)
-    # Align to 16 bytes
-    while i < data.len and (p and 15) != 0:
-      data[i] = 255 - data[i]
-      inc i
-      inc p
-
-    let
-      vec255 = mm_set1_epi8(255)
-      iterations = data.len div 64
-    for _ in 0 ..< iterations:
-      let
-        a = mm_load_si128(cast[pointer](p))
-        b = mm_load_si128(cast[pointer](p + 16))
-        c = mm_load_si128(cast[pointer](p + 32))
-        d = mm_load_si128(cast[pointer](p + 48))
-      mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
-      mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
-      mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
-      mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
-      p += 64
-    i += 64 * iterations
-
-    for i in i ..< data.len:
-      data[i] = 255 - data[i]
-
-  proc ceilMaskSimd*(data: var seq[uint8]) =
-    var
-      i: int
-      p = cast[uint](data[0].addr)
-
-    let
-      zeroVec = mm_setzero_si128()
-      vec255 = mm_set1_epi8(255)
-      iterations = data.len div 16
-    for _ in 0 ..< iterations:
-      var values = mm_loadu_si128(cast[pointer](p))
-      values = mm_cmpeq_epi8(values, zeroVec)
-      values = mm_andnot_si128(values, vec255)
-      mm_storeu_si128(cast[pointer](p), values)
-      p += 16
-    i += 16 * iterations
-
-    for i in i ..< data.len:
-      if data[i] != 0:
-        data[i] = 255
-
-  proc applyOpacitySimd*(data: var seq[uint8 | ColorRGBX], opacity: uint16) =
-    var
-      i: int
-      p = cast[uint](data[0].addr)
-      len =
-        when data is seq[ColorRGBX]:
-          data.len * 4
-        else:
-          data.len
-
-    let
-      oddMask = mm_set1_epi16(0xff00)
-      div255 = mm_set1_epi16(0x8081)
-      zeroVec = mm_setzero_si128()
-      opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
-      iterations = len div 16
-    for _ in 0 ..< len div 16:
-      let values = mm_loadu_si128(cast[pointer](p))
-      if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
-        var
-          valuesEven = mm_slli_epi16(values, 8)
-          valuesOdd = mm_and_si128(values, oddMask)
-        valuesEven = mm_mulhi_epu16(valuesEven, opacityVec)
-        valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec)
-        valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7)
-        valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7)
-        mm_storeu_si128(
-          cast[pointer](p),
-          mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8))
-        )
-      p += 16
-    i += 16 * iterations
-
-    when data is seq[ColorRGBX]:
-      for i in i div 4 ..< data.len:
-        var rgbx = data[i]
-        rgbx.r = ((rgbx.r * opacity) div 255).uint8
-        rgbx.g = ((rgbx.g * opacity) div 255).uint8
-        rgbx.b = ((rgbx.b * opacity) div 255).uint8
-        rgbx.a = ((rgbx.a * opacity) div 255).uint8
-        data[i] = rgbx
-    else:
-      for i in i ..< data.len:
-        data[i] = ((data[i] * opacity) div 255).uint8
-
-when defined(release):
-  {.pop.}
+  import nimsimd/sse2 as nimsimdsse2
+  export nimsimdsse2
--- a/src/pixie/runtimechecked/avx.nim
+++ b/src/pixie/runtimechecked/avx.nim
@ -1,4 +1,4 @@
-import chroma, nimsimd/avx
+import chroma, internal, nimsimd/avx

 when defined(gcc) or defined(clang):
  {.localPassc: "-mavx".}
@ -8,9 +8,9 @@ when defined(release):

 proc fillUnsafeAvx*(
  data: var seq[ColorRGBX],
-  start, len: int,
-  color: SomeColor
-) =
+  color: SomeColor,
+  start, len: int
+) {.simd.} =
  let rgbx = color.asRgbx()

  var
--- a/src/pixie/runtimechecked/avx2.nim
+++ b/src/pixie/runtimechecked/avx2.nim
@ -1,4 +1,4 @@
-import chroma, nimsimd/avx2
+import chroma, internal, nimsimd/avx2, pixie/common

 when defined(gcc) or defined(clang):
  {.localPassc: "-mavx2".}
@ -6,25 +6,25 @@ when defined(gcc) or defined(clang):
 when defined(release):
  {.push checks: off.}

-proc isOneColorAvx2*(data: var seq[ColorRGBX]): bool =
+proc isOneColorAvx2*(image: Image): bool {.simd.} =
  result = true

-  let color = data[0]
+  let color = image.data[0]

  var i: int
  # Align to 32 bytes
-  while i < data.len and (cast[uint](data[i].addr) and 31) != 0:
-    if data[i] != color:
+  while i < image.data.len and (cast[uint](image.data[i].addr) and 31) != 0:
+    if image.data[i] != color:
      return false
    inc i

  let
    colorVec = mm256_set1_epi32(cast[int32](color))
-    iterations = (data.len - i) div 16
+    iterations = (image.data.len - i) div 16
  for _ in 0 ..< iterations:
    let
-      values0 = mm256_load_si256(data[i].addr)
-      values1 = mm256_load_si256(data[i + 8].addr)
+      values0 = mm256_load_si256(image.data[i].addr)
+      values1 = mm256_load_si256(image.data[i + 8].addr)
      eq0 = mm256_cmpeq_epi8(values0, colorVec)
      eq1 = mm256_cmpeq_epi8(values1, colorVec)
      eq01 = mm256_and_si256(eq0, eq1)
@ -32,38 +32,38 @@ proc isOneColorAvx2*(data: var seq[ColorRGBX]): bool =
      return false
    i += 16

-  for i in i ..< data.len:
-    if data[i] != color:
+  for i in i ..< image.data.len:
+    if image.data[i] != color:
      return false

-proc isTransparentAvx2*(data: var seq[ColorRGBX]): bool =
+proc isTransparentAvx2*(image: Image): bool {.simd.} =
  result = true

  var i: int
  # Align to 32 bytes
-  while i < data.len and (cast[uint](data[i].addr) and 31) != 0:
-    if data[i].a != 0:
+  while i < image.data.len and (cast[uint](image.data[i].addr) and 31) != 0:
+    if image.data[i].a != 0:
      return false
    inc i

  let
    vecZero = mm256_setzero_si256()
-    iterations = (data.len - i) div 16
+    iterations = (image.data.len - i) div 16
  for _ in 0 ..< iterations:
    let
-      values0 = mm256_load_si256(data[i].addr)
-      values1 = mm256_load_si256(data[i + 8].addr)
+      values0 = mm256_load_si256(image.data[i].addr)
+      values1 = mm256_load_si256(image.data[i + 8].addr)
      values01 = mm256_or_si256(values0, values1)
      eq = mm256_cmpeq_epi8(values01, vecZero)
    if mm256_movemask_epi8(eq) != cast[int32](0xffffffff):
      return false
    i += 16

-  for i in i ..< data.len:
-    if data[i].a != 0:
+  for i in i ..< image.data.len:
+    if image.data[i].a != 0:
      return false

-proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
+proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} =
  result = true

  var i = start
@ -90,7 +90,7 @@ proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
    if data[i].a != 255:
      return false

-proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) =
+proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
  var i: int

  let
--- a/src/pixie/simd/internal.nim
+++ b/src/pixie/simd/internal.nim
@ -0,0 +1,78 @@
+import std/macros, std/tables
+
+var simdProcs* {.compiletime.}: Table[string, NimNode]
+
+proc procName(procedure: NimNode): string =
+  ## Extracts the bare name of a routine definition as a string, looking
+  ## through the export marker (`*`) when one is present.
+  let head = procedure[0]
+  result =
+    case head.kind
+    of nnkPostfix: head[1].strVal
+    else: head.strVal
+
+proc procArguments(procedure: NimNode): seq[NimNode] =
+  ## Collects the parameter name nodes of a routine definition in
+  ## declaration order.
+  let formalParams = procedure[3]
+  # Slot 0 of the formal params holds the return type, so start at 1.
+  for idx in 1 ..< formalParams.len:
+    let identDefs = formalParams[idx]
+    # The last two children of each definition are the type and the default
+    # value; everything before them is a parameter name.
+    for nameIdx in 0 ..< identDefs.len - 2:
+      result.add(identDefs[nameIdx])
+
+proc procReturnType(procedure: NimNode): NimNode =
+  ## Returns the return type node of a routine definition, which is the first
+  ## child of its formal parameters.
+  let formalParams = procedure[3]
+  result = formalParams[0]
+
+proc callAndReturn(name: NimNode, procedure: NimNode): NimNode =
+  ## Builds statements that call `name` with the parameters of `procedure`
+  ## forwarded unchanged and then return from the enclosing routine.
+  let forwarded = newCall(name, procedure.procArguments())
+  if procedure.procReturnType().kind != nnkEmpty:
+    # The wrapped routine yields a value, so hand the callee's result back.
+    result = quote do:
+      return `forwarded`
+  else:
+    # Nothing to return: run the callee, then leave right away.
+    result = quote do:
+      `forwarded`
+      return
+
+macro simd*(procedure: untyped) =
+  ## Pragma macro that records a copy of the annotated routine in `simdProcs`
+  ## under its name and emits the routine itself unchanged.
+  simdProcs[procedure.procName()] = procedure.copy()
+  result = procedure
+
+macro hasSimd*(procedure: untyped) =
+  ## Pragma macro that rewrites the body of the annotated routine so that it
+  ## uses SIMD variants previously registered through `simd`.
+  ##
+  ## Variants are looked up in `simdProcs` by name: the routine's own name
+  ## followed by `Avx2`, `Avx` or `Sse2`. The generated body:
+  ## * calls the AVX2 variant and returns when `cpuHasAvx2` is true,
+  ## * otherwise calls the AVX variant and returns when `cpuHasAvx` is true,
+  ## * otherwise runs the SSE2 variant's body inline, or the original body
+  ##   when no SSE2 variant is registered.
+  ##
+  ## The AVX and AVX2 dispatch is left out when `pixieNoAvx` is defined.
+  ## `cpuHasAvx` and `cpuHasAvx2` are not declared in this module, so they
+  ## must be visible wherever the pragma is applied.
+  let
+    name = procedure.procName()
+    originalBody = procedure[6]
+    nameSse2 = name & "Sse2"
+    nameAvx = name & "Avx"
+    nameAvx2 = name & "Avx2"
+    callAvx = callAndReturn(ident(nameAvx), procedure)
+    callAvx2 = callAndReturn(ident(nameAvx2), procedure)
+
+  var body = newStmtList()
+
+  when not defined(pixieNoAvx):
+    if nameAvx2 in simdProcs:
+      body.add quote do:
+        if cpuHasAvx2:
+          `callAvx2`
+
+    if nameAvx in simdProcs:
+      # The AVX variant only needs AVX. Guarding it with the AVX2 flag would
+      # keep CPUs that have AVX but not AVX2 from ever reaching it.
+      # NOTE(review): assumes `cpuHasAvx` is declared next to `cpuHasAvx2` -
+      # confirm.
+      body.add quote do:
+        if cpuHasAvx:
+          `callAvx`
+
+  if nameSse2 in simdProcs:
+    # The SSE2 variant is spliced in as the body instead of being called.
+    let bodySse2 = simdProcs[nameSse2][6]
+    body.add quote do:
+      `bodySse2`
+  else:
+    body.add quote do:
+      `originalBody`
+
+  procedure[6] = body
+
+  return procedure
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@ -0,0 +1,377 @@
+import chroma, internal, nimsimd/sse2, pixie/common, vmath
+
+when defined(release):
+  {.push checks: off.}
+
+proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
+  ## Multiplies the four float lanes of `color` by `opacity`, converts them
+  ## to integers and narrows them into the four bytes of a `ColorRGBX`.
+  let
+    zero = mm_setzero_si128()
+    scaled = mm_cvtps_epi32(mm_mul_ps(color, mm_set1_ps(opacity)))
+    # Packing twice narrows the four lanes down to the four lowest bytes.
+    narrowedOnce = mm_packus_epi16(scaled, zero)
+    narrowedTwice = mm_packus_epi16(narrowedOnce, zero)
+  cast[ColorRGBX](mm_cvtsi128_si32(narrowedTwice))
+
+proc packAlphaValues(v: M128i): M128i {.inline.} =
+  ## Gathers the alpha bytes of the four pixels in `v` into the lowest four
+  ## bytes of the result; the remaining bytes are zero.
+  let
+    zero = mm_setzero_si128()
+    # Move each pixel's top byte down to the bottom of its 32-bit lane.
+    alphas32 = mm_srli_epi32(v, 24)
+    alphas16 = mm_packus_epi16(alphas32, zero)
+  mm_packus_epi16(alphas16, zero)
+
+proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline.} =
+  ## Packs the alpha bytes of 16 pixels (four vectors of four) into one
+  ## vector: `i` fills bytes 0-3, `j` bytes 4-7, `k` bytes 8-11 and `l`
+  ## bytes 12-15.
+  let
+    lowHalf = mm_or_si128(
+      packAlphaValues(i),
+      mm_slli_si128(packAlphaValues(j), 4)
+    )
+    highHalf = mm_or_si128(
+      mm_slli_si128(packAlphaValues(k), 8),
+      mm_slli_si128(packAlphaValues(l), 12)
+    )
+  mm_or_si128(lowHalf, highHalf)
+
+proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
+  ## Expands the low four bytes of `v` into four rgba(0, 0, 0, value) pixels.
+  let
+    zero = mm_setzero_si128()
+    # Interleaving with zero twice moves each byte to the top of a 32-bit lane.
+    widened = mm_unpacklo_epi8(zero, v)
+  mm_unpacklo_epi8(zero, widened)
+
+proc fillUnsafeSse2*(
+  data: var seq[ColorRGBX],
+  color: SomeColor,
+  start, len: int
+) {.simd.} =
+  ## Writes `color` (converted with `asRgbx`) into the `len` entries of
+  ## `data` beginning at index `start`.
+  let
+    rgbx = color.asRgbx()
+    stop = start + len
+
+  var
+    idx = start
+    address = cast[uint](data[idx].addr)
+  # Scalar stores until the write address is 16-byte aligned.
+  while idx < stop and (address and 15) != 0:
+    data[idx] = rgbx
+    address += 4
+    inc idx
+
+  let
+    splat = mm_set1_epi32(cast[int32](rgbx))
+    blocks = (stop - idx) div 8
+  # Each pass writes 8 pixels (32 bytes) with two aligned stores.
+  for _ in 1 .. blocks:
+    mm_store_si128(cast[pointer](address), splat)
+    mm_store_si128(cast[pointer](address + 16), splat)
+    address += 32
+  idx += blocks * 8
+
+  # Leftover pixels that do not fill a whole 8-pixel pass.
+  for tail in idx ..< stop:
+    data[tail] = rgbx
+
+proc isOneColorSse2*(image: Image): bool {.simd.} =
+  ## Reports whether every pixel of `image` equals its first pixel.
+  let
+    first = image.data[0]
+    total = image.data.len
+
+  var
+    idx: int
+    address = cast[uint](image.data[0].addr)
+  # Compare pixel by pixel until the read address is 16-byte aligned.
+  while idx < total and (address and 15) != 0:
+    if image.data[idx] != first:
+      return false
+    address += 4
+    inc idx
+
+  let
+    wanted = mm_set1_epi32(cast[int32](first))
+    blocks = (total - idx) div 16
+  # Each pass checks 16 pixels (64 bytes) using four aligned loads.
+  for _ in 1 .. blocks:
+    let
+      chunk0 = mm_load_si128(cast[pointer](address))
+      chunk1 = mm_load_si128(cast[pointer](address + 16))
+      chunk2 = mm_load_si128(cast[pointer](address + 32))
+      chunk3 = mm_load_si128(cast[pointer](address + 48))
+      same01 = mm_and_si128(
+        mm_cmpeq_epi8(chunk0, wanted),
+        mm_cmpeq_epi8(chunk1, wanted)
+      )
+      same23 = mm_and_si128(
+        mm_cmpeq_epi8(chunk2, wanted),
+        mm_cmpeq_epi8(chunk3, wanted)
+      )
+    # All 16 mask bits are set only when every byte matched.
+    if mm_movemask_epi8(mm_and_si128(same01, same23)) != 0xffff:
+      return false
+    address += 64
+  idx += 16 * blocks
+
+  # Leftover pixels that do not fill a whole 16-pixel pass.
+  for tail in idx ..< total:
+    if image.data[tail] != first:
+      return false
+
+  result = true
+
+proc isTransparentSse2*(image: Image): bool {.simd.} =
+  ## Reports whether `image` is fully transparent. The scalar paths test the
+  ## alpha channel only; the vector path requires all four channel bytes of
+  ## every pixel in a block to be zero.
+  let total = image.data.len
+
+  var
+    idx: int
+    address = cast[uint](image.data[0].addr)
+  # Scalar checks until the read address is 16-byte aligned.
+  while idx < total and (address and 15) != 0:
+    if image.data[idx].a != 0:
+      return false
+    address += 4
+    inc idx
+
+  let
+    zeros = mm_setzero_si128()
+    blocks = (total - idx) div 16
+  # Each pass ORs 16 pixels (64 bytes) together; any set bit in the merged
+  # vector means some byte in the block was non-zero.
+  for _ in 1 .. blocks:
+    let
+      chunk0 = mm_load_si128(cast[pointer](address))
+      chunk1 = mm_load_si128(cast[pointer](address + 16))
+      chunk2 = mm_load_si128(cast[pointer](address + 32))
+      chunk3 = mm_load_si128(cast[pointer](address + 48))
+      merged = mm_or_si128(
+        mm_or_si128(chunk0, chunk1),
+        mm_or_si128(chunk2, chunk3)
+      )
+    if mm_movemask_epi8(mm_cmpeq_epi8(merged, zeros)) != 0xffff:
+      return false
+    address += 64
+  idx += 16 * blocks
+
+  # Leftover pixels that do not fill a whole 16-pixel pass.
+  for tail in idx ..< total:
+    if image.data[tail].a != 0:
+      return false
+
+  result = true
+
+proc isOpaqueSse2*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} =
+  ## Reports whether all `len` pixels of `data` beginning at index `start`
+  ## have an alpha of 255.
+  result = true
+
+  var
+    i = start
+    # The raw pointer has to track `data[i]`, so it begins at `start` rather
+    # than at the front of the seq.
+    p = cast[uint](data[i].addr)
+  # Align to 16 bytes
+  while i < (start + len) and (p and 15) != 0:
+    if data[i].a != 255:
+      return false
+    inc i
+    p += 4
+
+  let
+    vec255 = mm_set1_epi8(255)
+    iterations = (start + len - i) div 16
+  # Each pass ANDs 16 pixels (64 bytes) together; an alpha byte of the result
+  # is 255 only when it was 255 in all four vectors.
+  for _ in 0 ..< iterations:
+    let
+      values0 = mm_load_si128(cast[pointer](p))
+      values1 = mm_load_si128(cast[pointer](p + 16))
+      values2 = mm_load_si128(cast[pointer](p + 32))
+      values3 = mm_load_si128(cast[pointer](p + 48))
+      values01 = mm_and_si128(values0, values1)
+      values23 = mm_and_si128(values2, values3)
+      values0123 = mm_and_si128(values01, values23)
+      eq = mm_cmpeq_epi8(values0123, vec255)
+    # Bits 3, 7, 11 and 15 of the mask belong to the four alpha bytes.
+    if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
+      return false
+    p += 64
+  i += 16 * iterations
+
+  # Leftover pixels that do not fill a whole 16-pixel pass.
+  for i in i ..< start + len:
+    if data[i].a != 255:
+      return false
+
+proc toPremultipliedAlphaSse2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
+  ## Multiplies the r, g and b channels of every pixel in `data` by that
+  ## pixel's alpha divided by 255, in place. Alpha itself is left unchanged.
+  var i: int
+
+  let
+    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
+    oddMask = mm_set1_epi16(0xff00)
+    # Multiplying by 0x8081, keeping the high 16 bits and shifting right by 7
+    # divides an unsigned 16-bit value by 255.
+    div255 = mm_set1_epi16(0x8081)
+    iterations = data.len div 4
+  # Each pass handles 4 pixels (16 bytes) with unaligned loads and stores.
+  for _ in 0 ..< iterations:
+    let
+      values = mm_loadu_si128(data[i].addr)
+      alpha = mm_and_si128(values, alphaMask)
+      eq = mm_cmpeq_epi8(values, alphaMask)
+    # Bits 3, 7, 11 and 15 of the mask belong to the alpha bytes; when all of
+    # them are set the four pixels are opaque and need no work.
+    if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
+      let
+        # Alpha in the high byte of both 16-bit halves of each pixel.
+        evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
+        # Same, but the upper half holds 255 so alpha comes out unchanged.
+        oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
+      var
+        # Even bytes (r, b) and odd bytes (g, a), each in the high byte of
+        # its 16-bit lane.
+        colorsEven = mm_slli_epi16(values, 8)
+        colorsOdd = mm_and_si128(values, oddMask)
+      # High half of (c shl 8) * (m shl 8) is c * m.
+      colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
+      colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
+      colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
+      colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
+      # Put the odd bytes back into the high byte of each lane and merge.
+      mm_storeu_si128(
+        data[i].addr,
+        mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
+      )
+    i += 4
+
+  # Leftover pixels that do not fill a whole 4-pixel pass.
+  for i in i ..< data.len:
+    var c = data[i]
+    if c.a != 255:
+      c.r = ((c.r.uint32 * c.a) div 255).uint8
+      c.g = ((c.g.uint32 * c.a) div 255).uint8
+      c.b = ((c.b.uint32 * c.a) div 255).uint8
+      data[i] = c
+
+proc newImageSse2*(mask: Mask): Image {.simd.} =
+  ## Creates an image with the dimensions of `mask` in which each pixel has
+  ## all four channels set to the matching mask value.
+  result = newImage(mask.width, mask.height)
+
+  let blocks = mask.data.len div 16
+  for blockIdx in 0 ..< blocks:
+    let base = blockIdx * 16
+    var remaining = mm_loadu_si128(mask.data[base].addr)
+    # 16 mask values become four groups of four pixels.
+    for group in 0 ..< 4:
+      let
+        alphaOnly = unpackAlphaValues(remaining)
+        # Copy the top byte of each pixel down into the other three bytes.
+        upperTwo = mm_or_si128(alphaOnly, mm_srli_epi32(alphaOnly, 8))
+        allFour = mm_or_si128(upperTwo, mm_srli_epi32(upperTwo, 16))
+      mm_storeu_si128(result.data[base + group * 4].addr, allFour)
+      # Shift the next four mask values into the low bytes.
+      remaining = mm_srli_si128(remaining, 4)
+
+  # Leftover values that do not fill a whole 16-value pass.
+  for tail in blocks * 16 ..< mask.data.len:
+    let v = mask.data[tail]
+    result.data[tail] = rgbx(v, v, v, v)
+
+proc newMaskSse2*(image: Image): Mask {.simd.} =
+  ## Creates a mask with the dimensions of `image` whose values are the alpha
+  ## channel of the corresponding pixels.
+  result = newMask(image.width, image.height)
+
+  let blocks = image.data.len div 16
+  # Each pass turns 16 pixels (four loads) into 16 mask bytes (one store).
+  for blockIdx in 0 ..< blocks:
+    let
+      base = blockIdx * 16
+      px0 = mm_loadu_si128(image.data[base + 0].addr)
+      px1 = mm_loadu_si128(image.data[base + 4].addr)
+      px2 = mm_loadu_si128(image.data[base + 8].addr)
+      px3 = mm_loadu_si128(image.data[base + 12].addr)
+    mm_storeu_si128(
+      result.data[base].addr,
+      pack4xAlphaValues(px0, px1, px2, px3)
+    )
+
+  # Leftover pixels that do not fill a whole 16-pixel pass.
+  for tail in blocks * 16 ..< image.data.len:
+    result.data[tail] = image.data[tail].a
+
+proc invertSse2*(target: Image | Mask) {.simd.} =
+  ## Replaces every byte of `target.data` with 255 minus its value, in place.
+  ## For an `Image` this covers all four channels of each pixel and is
+  ## followed by `toPremultipliedAlphaSse2`; for a `Mask` it covers each
+  ## mask value.
+  var
+    i: int
+    p = cast[uint](target.data[0].addr)
+  # Align to 16 bytes
+  while i < target.data.len and (p and 15) != 0:
+    when target is Image:
+      var rgbx = target.data[i]
+      rgbx.r = 255 - rgbx.r
+      rgbx.g = 255 - rgbx.g
+      rgbx.b = 255 - rgbx.b
+      rgbx.a = 255 - rgbx.a
+      target.data[i] = rgbx
+      inc i
+      p += 4
+    else:
+      target.data[i] = 255 - target.data[i]
+      inc i
+      inc p
+
+  let vec255 = mm_set1_epi8(255)
+
+  # Only the elements left after the alignment loop may be processed here;
+  # counting from the full length would let the stores below run past the
+  # end of the buffer whenever the data did not start out aligned.
+  when target is Image:
+    let iterations = (target.data.len - i) div 16
+  else:
+    let iterations = (target.data.len - i) div 64
+
+  # Each pass inverts 64 bytes (16 pixels or 64 mask values).
+  for _ in 0 ..< iterations:
+    let
+      a = mm_load_si128(cast[pointer](p))
+      b = mm_load_si128(cast[pointer](p + 16))
+      c = mm_load_si128(cast[pointer](p + 32))
+      d = mm_load_si128(cast[pointer](p + 48))
+    mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
+    mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
+    mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
+    mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
+    p += 64
+
+  when target is Image:
+    i += 16 * iterations
+
+    # Leftover pixels that do not fill a whole 16-pixel pass.
+    for i in i ..< target.data.len:
+      var rgbx = target.data[i]
+      rgbx.r = 255 - rgbx.r
+      rgbx.g = 255 - rgbx.g
+      rgbx.b = 255 - rgbx.b
+      rgbx.a = 255 - rgbx.a
+      target.data[i] = rgbx
+
+    toPremultipliedAlphaSse2(target.data)
+  else:
+    i += 64 * iterations
+
+    # Leftover values that do not fill a whole 64-value pass.
+    for i in i ..< target.data.len:
+      target.data[i] = 255 - target.data[i]
+
+proc ceilSse2*(mask: Mask) {.simd.} =
+  ## Sets every non-zero value of `mask` to 255, leaving zeros untouched.
+  var address = cast[uint](mask.data[0].addr)
+
+  let
+    total = mask.data.len
+    blocks = total div 16
+    zeros = mm_setzero_si128()
+    ones = mm_set1_epi8(255)
+  # Each pass handles 16 values with an unaligned load and store.
+  for _ in 1 .. blocks:
+    # 0xff where the value was zero; and-not then yields 0 there and 255
+    # everywhere else.
+    let isZero = mm_cmpeq_epi8(mm_loadu_si128(cast[pointer](address)), zeros)
+    mm_storeu_si128(cast[pointer](address), mm_andnot_si128(isZero, ones))
+    address += 16
+
+  # Leftover values that do not fill a whole 16-value pass.
+  for tail in 16 * blocks ..< total:
+    if mask.data[tail] != 0:
+      mask.data[tail] = 255
+
+proc applyOpacitySse2*(target: Image | Mask, opacity: float32) {.simd.} =
+  ## Scales every byte of `target.data` by `opacity`, in place. `opacity` is
+  ## first converted to an integer with `round(255 * opacity)`; each byte
+  ## then becomes `(byte * opacity) div 255`.
+  let opacity = round(255 * opacity).uint16
+  # Full opacity changes nothing.
+  if opacity == 255:
+    return
+
+  # Zero opacity clears the target outright.
+  if opacity == 0:
+    when target is Image:
+      target.fill(rgbx(0, 0, 0, 0))
+    else:
+      target.fill(0)
+    return
+
+  var
+    i: int
+    p = cast[uint](target.data[0].addr)
+    # Byte count: the vector loop advances `p` by 4 per image entry
+    # elsewhere in this file and by 1 per mask entry.
+    len =
+      when target is Image:
+        target.data.len * 4
+      else:
+        target.data.len
+
+  let
+    oddMask = mm_set1_epi16(0xff00)
+    # Multiplying by 0x8081, keeping the high 16 bits and shifting right by 7
+    # divides an unsigned 16-bit value by 255.
+    div255 = mm_set1_epi16(0x8081)
+    zeroVec = mm_setzero_si128()
+    # Opacity in the high byte of every 16-bit lane.
+    opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
+    iterations = len div 16
+  # Each pass handles 16 bytes with an unaligned load and store.
+  for _ in 0 ..< len div 16:
+    let values = mm_loadu_si128(cast[pointer](p))
+    # Skip the work when all 16 bytes are already zero.
+    if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
+      var
+        # Even and odd bytes, each in the high byte of its 16-bit lane.
+        valuesEven = mm_slli_epi16(values, 8)
+        valuesOdd = mm_and_si128(values, oddMask)
+      # High half of (v shl 8) * (opacity shl 8) is v * opacity.
+      valuesEven = mm_mulhi_epu16(valuesEven, opacityVec)
+      valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec)
+      valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7)
+      valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7)
+      # Put the odd bytes back into the high byte of each lane and merge.
+      mm_storeu_si128(
+        cast[pointer](p),
+        mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8))
+      )
+    p += 16
+  # `i` counts bytes here, hence the `div 4` for images below.
+  i += 16 * iterations
+
+  when target is Image:
+    # Leftover pixels that do not fill a whole 16-byte pass.
+    for i in i div 4 ..< target.data.len:
+      var rgbx = target.data[i]
+      rgbx.r = ((rgbx.r * opacity) div 255).uint8
+      rgbx.g = ((rgbx.g * opacity) div 255).uint8
+      rgbx.b = ((rgbx.b * opacity) div 255).uint8
+      rgbx.a = ((rgbx.a * opacity) div 255).uint8
+      target.data[i] = rgbx
+  else:
+    # Leftover values that do not fill a whole 16-byte pass.
+    for i in i ..< target.data.len:
+      target.data[i] = ((target.data[i] * opacity) div 255).uint8
+
+when defined(release):
+  {.pop.}