diff --git a/src/pixie/blends.nim b/src/pixie/blends.nim index 8200ff7..e6a38be 100644 --- a/src/pixie/blends.nim +++ b/src/pixie/blends.nim @@ -1,9 +1,6 @@ ## Blending modes. -import chroma, common, internal, std/math - -when defined(amd64) and allowSimd: - import nimsimd/sse2 +import chroma, common, simd, std/math # See https://www.w3.org/TR/compositing-1/ # See https://www.khronos.org/registry/OpenGL/extensions/KHR/KHR_blend_equation_advanced.txt diff --git a/src/pixie/common.nim b/src/pixie/common.nim index e289bb3..94f4563 100644 --- a/src/pixie/common.nim +++ b/src/pixie/common.nim @@ -41,6 +41,26 @@ type width*, height*: int data*: seq[uint8] +proc newImage*(width, height: int): Image {.raises: [PixieError].} = + ## Creates a new image with the parameter dimensions. + if width <= 0 or height <= 0: + raise newException(PixieError, "Image width and height must be > 0") + + result = Image() + result.width = width + result.height = height + result.data = newSeq[ColorRGBX](width * height) + +proc newMask*(width, height: int): Mask {.raises: [PixieError].} = + ## Creates a new mask with the parameter dimensions. + if width <= 0 or height <= 0: + raise newException(PixieError, "Mask width and height must be > 0") + + result = Mask() + result.width = width + result.height = height + result.data = newSeq[uint8](width * height) + proc mix*(a, b: uint8, t: float32): uint8 {.inline, raises: [].} = ## Linearly interpolate between a and b using t. 
let t = round(t * 255).uint32 diff --git a/src/pixie/fileformats/jpeg.nim b/src/pixie/fileformats/jpeg.nim index 4078d74..4fe5980 100644 --- a/src/pixie/fileformats/jpeg.nim +++ b/src/pixie/fileformats/jpeg.nim @@ -1,8 +1,5 @@ import chroma, flatty/binny, pixie/common, pixie/images, pixie/internal, - pixie/masks, std/decls, std/sequtils, std/strutils - -when defined(amd64) and allowSimd: - import nimsimd/sse2 + pixie/masks, pixie/simd, std/decls, std/sequtils, std/strutils # This JPEG decoder is loosely based on stb_image which is public domain. diff --git a/src/pixie/fileformats/png.nim b/src/pixie/fileformats/png.nim index 79c694d..9877d7a 100644 --- a/src/pixie/fileformats/png.nim +++ b/src/pixie/fileformats/png.nim @@ -1,8 +1,5 @@ import chroma, flatty/binny, math, pixie/common, pixie/images, pixie/internal, - zippy, zippy/crc - -when defined(amd64) and allowSimd: - import nimsimd/sse2 + pixie/simd, zippy, zippy/crc # See http://www.libpng.org/pub/png/spec/1.2/PNG-Contents.html diff --git a/src/pixie/images.nim b/src/pixie/images.nim index 35d8fbd..53f969d 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -1,10 +1,4 @@ -import blends, bumpy, chroma, common, internal, masks, vmath - -when allowSimd: - import simd - - when defined(amd64): - import nimsimd/sse2 +import blends, bumpy, chroma, common, internal, masks, simd, vmath const h = 0.5.float32 @@ -13,27 +7,18 @@ type UnsafeImage = distinct Image when defined(release): {.push checks: off.} -proc newImage*(width, height: int): Image {.raises: [PixieError].} = - ## Creates a new image with the parameter dimensions. 
- if width <= 0 or height <= 0: - raise newException(PixieError, "Image width and height must be > 0") - - result = Image() - result.width = width - result.height = height - result.data = newSeq[ColorRGBX](width * height) - -proc newImage*(mask: Mask): Image {.raises: [PixieError].} = +proc newImage*(mask: Mask): Image {.hasSimd, raises: [PixieError].} = result = newImage(mask.width, mask.height) - - when allowSimd and compiles(newImageFromMaskSimd): - newImageFromMaskSimd(result.data, mask.data) - return - for i in 0 ..< mask.data.len: let v = mask.data[i] result.data[i] = rgbx(v, v, v, v) +proc newMask*(image: Image): Mask {.hasSimd, raises: [PixieError].} = + ## Returns a new mask using the alpha values of the image. + result = newMask(image.width, image.height) + for i in 0 ..< image.data.len: + result.data[i] = image.data[i].a + proc copy*(image: Image): Image {.raises: [PixieError].} = ## Copies the image data into a new image. result = newImage(image.width, image.height) @@ -89,25 +74,17 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} = ## Fills the image with the color. fillUnsafe(image.data, color, 0, image.data.len) -proc isOneColor*(image: Image): bool {.raises: [].} = +proc isOneColor*(image: Image): bool {.hasSimd, raises: [].} = ## Checks if the entire image is the same color. - when allowSimd and compiles(isOneColorSimd): - return isOneColorSimd(image.data) - result = true - let color = cast[uint32](image.data[0]) for i in 0 ..< image.data.len: if cast[uint32](image.data[i]) != color: return false -proc isTransparent*(image: Image): bool {.raises: [].} = +proc isTransparent*(image: Image): bool {.hasSimd, raises: [].} = ## Checks if this image is fully transparent or not. 
- when allowSimd and compiles(isTransparentSimd): - return isTransparentSimd(image.data) - result = true - for i in 0 ..< image.data.len: if image.data[i].a != 0: return false @@ -341,46 +318,38 @@ proc magnifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} = result.width * 4 ) -proc applyOpacity*(image: Image, opacity: float32) {.raises: [].} = +proc applyOpacity*(target: Image, opacity: float32) {.hasSimd, raises: [].} = ## Multiplies alpha of the image by opacity. let opacity = round(255 * opacity).uint16 if opacity == 255: return if opacity == 0: - image.fill(rgbx(0, 0, 0, 0)) + target.fill(rgbx(0, 0, 0, 0)) return - when allowSimd and compiles(applyOpacitySimd): - applyOpacitySimd(image.data, opacity) - return - - for i in 0 ..< image.data.len: - var rgbx = image.data[i] + for i in 0 ..< target.data.len: + var rgbx = target.data[i] rgbx.r = ((rgbx.r * opacity) div 255).uint8 rgbx.g = ((rgbx.g * opacity) div 255).uint8 rgbx.b = ((rgbx.b * opacity) div 255).uint8 rgbx.a = ((rgbx.a * opacity) div 255).uint8 - image.data[i] = rgbx + target.data[i] = rgbx -proc invert*(image: Image) {.raises: [].} = +proc invert*(target: Image) {.hasSimd, raises: [].} = ## Inverts all of the colors and alpha. - when allowSimd and compiles(invertImageSimd): - invertImageSimd(image.data) - return - - for i in 0 ..< image.data.len: - var rgbx = image.data[i] + for i in 0 ..< target.data.len: + var rgbx = target.data[i] rgbx.r = 255 - rgbx.r rgbx.g = 255 - rgbx.g rgbx.b = 255 - rgbx.b rgbx.a = 255 - rgbx.a - image.data[i] = rgbx + target.data[i] = rgbx # Inverting rgbx(50, 100, 150, 200) becomes rgbx(205, 155, 105, 55). This # is not a valid premultiplied alpha color. # We need to convert back to premultiplied alpha after inverting. 
- image.data.toPremultipliedAlpha() + target.data.toPremultipliedAlpha() proc blur*( image: Image, radius: float32, outOfBounds: SomeColor = color(0, 0, 0, 0) @@ -443,17 +412,6 @@ proc blur*( values += outOfBounds * kernel[yy - y + radius] image.unsafe[x, y] = rgbx(values) -proc newMask*(image: Image): Mask {.raises: [PixieError].} = - ## Returns a new mask using the alpha values of the image. - result = newMask(image.width, image.height) - - when allowSimd and compiles(newMaskFromImageSimd): - newMaskFromImageSimd(result.data, image.data) - return - - for i in 0 ..< image.data.len: - result.data[i] = image.data[i].a - proc getRgbaSmooth*( image: Image, x, y: float32, wrapped = false ): ColorRGBX {.raises: [].} = diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index f854dc1..1cc5b55 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -1,12 +1,4 @@ -import bumpy, chroma, common, system/memory, vmath - -const allowSimd* = not defined(pixieNoSimd) and not defined(tcc) - -when allowSimd: - import simd - - when defined(amd64): - import nimsimd/sse2 +import bumpy, chroma, common, simd, system/memory, vmath template currentExceptionAsPixieError*(): untyped = ## Gets the current exception and returns it as a PixieError with stack trace. @@ -76,7 +68,7 @@ proc fillUnsafe*( proc fillUnsafe*( data: var seq[ColorRGBX], color: SomeColor, start, len: int -) {.raises: [].} = +) {.hasSimd, raises: [].} = ## Fills the image data with the color starting at index start and ## continuing for len indices. when allowSimd and compiles(fillUnsafeSimd): @@ -110,12 +102,10 @@ proc toStraightAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} = c.b = straightAlphaTable[c.a][c.b] data[i] = c -proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} = +proc toPremultipliedAlpha*( + data: var seq[ColorRGBA | ColorRGBX] +) {.hasSimd, raises: [].} = ## Converts an image to premultiplied alpha from straight alpha. 
- when allowSimd and compiles(toPremultipliedAlphaSimd): - toPremultipliedAlphaSimd(data) - return - for i in 0 ..< data.len: var c = data[i] if c.a != 255: @@ -124,25 +114,15 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} c.b = ((c.b.uint32 * c.a) div 255).uint8 data[i] = c -proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool = - when allowSimd and compiles(isOpaqueSimd): - return isOpaqueSimd(data, start, len) - +proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool {.hasSimd.} = result = true - for i in start ..< start + len: if data[i].a != 255: return false when defined(amd64) and allowSimd: - proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} = - let opacityVec = mm_set1_ps(opacity) - var finalColor = mm_cvtps_epi32(mm_mul_ps(color, opacityVec)) - finalColor = mm_packus_epi16(finalColor, mm_setzero_si128()) - finalColor = mm_packus_epi16(finalColor, mm_setzero_si128()) - cast[ColorRGBX](mm_cvtsi128_si32(finalColor)) - - export pack4xAlphaValues, unpackAlphaValues + import simd/todo + export todo when defined(release): {.pop.} diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim index 7efc04e..e214f8e 100644 --- a/src/pixie/masks.nim +++ b/src/pixie/masks.nim @@ -1,26 +1,10 @@ -import common, internal, vmath - -when allowSimd: - import simd - - when defined(amd64): - import nimsimd/sse2 +import common, internal, simd, vmath type UnsafeMask = distinct Mask when defined(release): {.push checks: off.} -proc newMask*(width, height: int): Mask {.raises: [PixieError].} = - ## Creates a new mask with the parameter dimensions. - if width <= 0 or height <= 0: - raise newException(PixieError, "Mask width and height must be > 0") - - result = Mask() - result.width = width - result.height = height - result.data = newSeq[uint8](width * height) - proc copy*(mask: Mask): Mask {.raises: [PixieError].} = ## Copies the image data into a new image. 
result = newMask(mask.width, mask.height) @@ -180,22 +164,18 @@ proc magnifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} = result.width * 4 ) -proc applyOpacity*(mask: Mask, opacity: float32) {.raises: [].} = +proc applyOpacity*(target: Mask, opacity: float32) {.hasSimd, raises: [].} = ## Multiplies alpha of the image by opacity. let opacity = round(255 * opacity).uint16 if opacity == 255: return if opacity == 0: - mask.fill(0) + target.fill(0) return - when allowSimd and compiles(applyOpacitySimd): - applyOpacitySimd(mask.data, opacity) - return - - for i in 0 ..< mask.data.len: - mask.data[i] = ((mask.data[i] * opacity) div 255).uint8 + for i in 0 ..< target.data.len: + target.data[i] = ((target.data[i] * opacity) div 255).uint8 proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} = ## Gets a interpolated value with float point coordinates. @@ -225,14 +205,10 @@ proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} = else: topMix -proc invert*(mask: Mask) {.raises: [].} = +proc invert*(target: Mask) {.hasSimd, raises: [].} = ## Inverts all of the values - creates a negative of the mask. - when allowSimd and compiles(invertMaskSimd): - invertMaskSimd(mask.data) - return - - for i in 0 ..< mask.data.len: - mask.data[i] = 255 - mask.data[i] + for i in 0 ..< target.data.len: + target.data[i] = 255 - target.data[i] proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} = ## Grows the mask by spread. @@ -295,12 +271,8 @@ proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} = break mask.unsafe[x, y] = maxValue -proc ceil*(mask: Mask) {.raises: [].} = +proc ceil*(mask: Mask) {.hasSimd, raises: [].} = ## A value of 0 stays 0. Anything else turns into 255. 
- when allowSimd and compiles(invertImageSimd): - ceilMaskSimd(mask.data) - return - for i in 0 ..< mask.data.len: if mask.data[i] != 0: mask.data[i] = 255 diff --git a/src/pixie/paints.nim b/src/pixie/paints.nim index 133a367..1c7d312 100644 --- a/src/pixie/paints.nim +++ b/src/pixie/paints.nim @@ -1,7 +1,4 @@ -import chroma, common, images, internal, vmath - -when defined(amd64) and allowSimd: - import nimsimd/sse2 +import chroma, common, images, simd, vmath type PaintKind* = enum diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index b8fa5a1..4ead1c8 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1,8 +1,5 @@ -import blends, bumpy, chroma, common, images, internal, masks, paints, std/fenv, - std/strutils, vmath - -when defined(amd64) and allowSimd: - import nimsimd/sse2 +import blends, bumpy, chroma, common, images, internal, masks, paints, simd, + std/fenv, std/strutils, vmath type WindingRule* = enum diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index 4230366..14207d6 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -1,392 +1,57 @@ -import chroma +import simd/internal, std/macros, std/tables -when defined(release): - {.push checks: off.} - -when defined(amd64): - import nimsimd/runtimecheck, nimsimd/sse2, simd/avx, simd/avx2 +const allowSimd* = not defined(pixieNoSimd) and not defined(tcc) +macro hasSimd*(procedure: untyped) = let - cpuHasAvx* = checkInstructionSets({AVX}) - cpuHasAvx2* = checkInstructionSets({AVX, AVX2}) + name = procedure.procName() + args = procedure.procArguments() + originalBody = procedure[6] + nameSse2 = name & "Sse2" + nameAvx = name & "Avx" + nameAvx2 = name & "Avx2" + callAvx = call(ident(nameAvx), args) + callAvx2 = call(ident(nameAvx2), args) - proc packAlphaValues(v: M128i): M128i {.inline.} = - ## Shuffle the alpha values for these 4 colors to the first 4 bytes. 
- result = mm_srli_epi32(v, 24) - result = mm_packus_epi16(result, mm_setzero_si128()) - result = mm_packus_epi16(result, mm_setzero_si128()) + var body = newStmtList() - proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline.} = + when not defined(pixieNoAvx): + if nameAvx2 in simdProcs: + body.add quote do: + if cpuHasAvx2: + forceReturn `callAvx2` + + if nameAvx in simdProcs: + body.add quote do: + if cpuHasAvx: + forceReturn `callAvx` + + if nameSse2 in simdProcs: + let bodySse2 = simdProcs[nameSse2][6] + body.add quote do: + `bodySse2` + else: + body.add quote do: + echo "using ", `name`, " scalar" + `originalBody` + + procedure[6] = body + + return procedure + +when allowSimd and defined(amd64): + import simd/sse2, simd/avx, simd/avx2 + export sse2, avx, avx2 + + when defined(pixieNoAvx): + const + cpuHasAvx* = false + cpuHasAvx2* = false + else: + import nimsimd/runtimecheck let - i = packAlphaValues(i) - j = mm_slli_si128(packAlphaValues(j), 4) - k = mm_slli_si128(packAlphaValues(k), 8) - l = mm_slli_si128(packAlphaValues(l), 12) - mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l)) + cpuHasAvx* = checkInstructionSets({AVX}) + cpuHasAvx2* = checkInstructionSets({AVX, AVX2}) - proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} = - ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value). 
- result = mm_unpacklo_epi8(mm_setzero_si128(), v) - result = mm_unpacklo_epi8(mm_setzero_si128(), result) - - proc fillUnsafeSimd*( - data: var seq[ColorRGBX], - start, len: int, - color: SomeColor - ) = - if cpuHasAvx: - fillUnsafeAvx(data, start, len, color) - return - - let rgbx = color.asRgbx() - - var - i = start - p = cast[uint](data[i].addr) - # Align to 16 bytes - while i < (start + len) and (p and 15) != 0: - data[i] = rgbx - inc i - p += 4 - - let - colorVec = mm_set1_epi32(cast[int32](rgbx)) - iterations = (start + len - i) div 8 - for _ in 0 ..< iterations: - mm_store_si128(cast[pointer](p), colorVec) - mm_store_si128(cast[pointer](p + 16), colorVec) - p += 32 - i += iterations * 8 - - for i in i ..< start + len: - data[i] = rgbx - - proc isOneColorSimd*(data: var seq[ColorRGBX]): bool = - if cpuHasAvx2: - return isOneColorAvx2(data) - - result = true - - let color = data[0] - - var - i: int - p = cast[uint](data[0].addr) - # Align to 16 bytes - while i < data.len and (p and 15) != 0: - if data[i] != color: - return false - inc i - p += 4 - - let - colorVec = mm_set1_epi32(cast[int32](color)) - iterations = (data.len - i) div 16 - for _ in 0 ..< iterations: - let - values0 = mm_load_si128(cast[pointer](p)) - values1 = mm_load_si128(cast[pointer](p + 16)) - values2 = mm_load_si128(cast[pointer](p + 32)) - values3 = mm_load_si128(cast[pointer](p + 48)) - eq0 = mm_cmpeq_epi8(values0, colorVec) - eq1 = mm_cmpeq_epi8(values1, colorVec) - eq2 = mm_cmpeq_epi8(values2, colorVec) - eq3 = mm_cmpeq_epi8(values3, colorVec) - eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3)) - if mm_movemask_epi8(eq0123) != 0xffff: - return false - p += 64 - i += 16 * iterations - - for i in i ..< data.len: - if data[i] != color: - return false - - proc isTransparentSimd*(data: var seq[ColorRGBX]): bool = - if cpuHasAvx2: - return isTransparentAvx2(data) - - var - i: int - p = cast[uint](data[0].addr) - # Align to 16 bytes - while i < data.len and (p and 15) != 
0: - if data[i].a != 0: - return false - inc i - p += 4 - - result = true - - let - vecZero = mm_setzero_si128() - iterations = (data.len - i) div 16 - for _ in 0 ..< iterations: - let - values0 = mm_load_si128(cast[pointer](p)) - values1 = mm_load_si128(cast[pointer](p + 16)) - values2 = mm_load_si128(cast[pointer](p + 32)) - values3 = mm_load_si128(cast[pointer](p + 48)) - values01 = mm_or_si128(values0, values1) - values23 = mm_or_si128(values2, values3) - values0123 = mm_or_si128(values01, values23) - if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff: - return false - p += 64 - i += 16 * iterations - - for i in i ..< data.len: - if data[i].a != 0: - return false - - proc isOpaqueSimd*(data: var seq[ColorRGBX], start, len: int): bool = - if cpuHasAvx2: - return isOpaqueAvx2(data, start, len) - - result = true - - var - i = start - p = cast[uint](data[0].addr) - # Align to 16 bytes - while i < (start + len) and (p and 15) != 0: - if data[i].a != 255: - return false - inc i - p += 4 - - let - vec255 = mm_set1_epi8(255) - iterations = (start + len - i) div 16 - for _ in 0 ..< iterations: - let - values0 = mm_load_si128(cast[pointer](p)) - values1 = mm_load_si128(cast[pointer](p + 16)) - values2 = mm_load_si128(cast[pointer](p + 32)) - values3 = mm_load_si128(cast[pointer](p + 48)) - values01 = mm_and_si128(values0, values1) - values23 = mm_and_si128(values2, values3) - values0123 = mm_and_si128(values01, values23) - eq = mm_cmpeq_epi8(values0123, vec255) - if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: - return false - p += 64 - i += 16 * iterations - - for i in i ..< start + len: - if data[i].a != 255: - return false - - proc toPremultipliedAlphaSimd*(data: var seq[ColorRGBA | ColorRGBX]) = - if cpuHasAvx2: - toPremultipliedAlphaAvx2(data) - return - - var i: int - - let - alphaMask = mm_set1_epi32(cast[int32](0xff000000)) - oddMask = mm_set1_epi16(0xff00) - div255 = mm_set1_epi16(0x8081) - iterations = data.len div 4 - for _ in 0 ..< 
iterations: - let - values = mm_loadu_si128(data[i].addr) - alpha = mm_and_si128(values, alphaMask) - eq = mm_cmpeq_epi8(values, alphaMask) - if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: - let - evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16)) - oddMultiplier = mm_or_si128(evenMultiplier, alphaMask) - var - colorsEven = mm_slli_epi16(values, 8) - colorsOdd = mm_and_si128(values, oddMask) - colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier) - colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier) - colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7) - colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7) - mm_storeu_si128( - data[i].addr, - mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8)) - ) - i += 4 - - for i in i ..< data.len: - var c = data[i] - if c.a != 255: - c.r = ((c.r.uint32 * c.a) div 255).uint8 - c.g = ((c.g.uint32 * c.a) div 255).uint8 - c.b = ((c.b.uint32 * c.a) div 255).uint8 - data[i] = c - - proc newImageFromMaskSimd*(dst: var seq[ColorRGBX], src: var seq[uint8]) = - var i: int - for _ in 0 ..< src.len div 16: - var alphas = mm_loadu_si128(src[i].addr) - for j in 0 ..< 4: - var unpacked = unpackAlphaValues(alphas) - unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 8)) - unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16)) - mm_storeu_si128(dst[i + j * 4].addr, unpacked) - alphas = mm_srli_si128(alphas, 4) - i += 16 - - for i in i ..< src.len: - let v = src[i] - dst[i] = rgbx(v, v, v, v) - - proc newMaskFromImageSimd*(dst: var seq[uint8], src: var seq[ColorRGBX]) = - var i: int - for _ in 0 ..< src.len div 16: - let - a = mm_loadu_si128(src[i + 0].addr) - b = mm_loadu_si128(src[i + 4].addr) - c = mm_loadu_si128(src[i + 8].addr) - d = mm_loadu_si128(src[i + 12].addr) - mm_storeu_si128( - dst[i].addr, - pack4xAlphaValues(a, b, c, d) - ) - i += 16 - - for i in i ..< src.len: - dst[i] = src[i].a - - proc invertImageSimd*(data: var seq[ColorRGBX]) = - var - i: int - p = 
cast[uint](data[0].addr) - # Align to 16 bytes - while i < data.len and (p and 15) != 0: - var rgbx = data[i] - rgbx.r = 255 - rgbx.r - rgbx.g = 255 - rgbx.g - rgbx.b = 255 - rgbx.b - rgbx.a = 255 - rgbx.a - data[i] = rgbx - inc i - p += 4 - - let - vec255 = mm_set1_epi8(255) - iterations = data.len div 16 - for _ in 0 ..< iterations: - let - a = mm_load_si128(cast[pointer](p)) - b = mm_load_si128(cast[pointer](p + 16)) - c = mm_load_si128(cast[pointer](p + 32)) - d = mm_load_si128(cast[pointer](p + 48)) - mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a)) - mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b)) - mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c)) - mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d)) - p += 64 - i += 16 * iterations - - for i in i ..< data.len: - var rgbx = data[i] - rgbx.r = 255 - rgbx.r - rgbx.g = 255 - rgbx.g - rgbx.b = 255 - rgbx.b - rgbx.a = 255 - rgbx.a - data[i] = rgbx - - toPremultipliedAlphaSimd(data) - - proc invertMaskSimd*(data: var seq[uint8]) = - var - i: int - p = cast[uint](data[0].addr) - # Align to 16 bytes - while i < data.len and (p and 15) != 0: - data[i] = 255 - data[i] - inc i - inc p - - let - vec255 = mm_set1_epi8(255) - iterations = data.len div 64 - for _ in 0 ..< iterations: - let - a = mm_load_si128(cast[pointer](p)) - b = mm_load_si128(cast[pointer](p + 16)) - c = mm_load_si128(cast[pointer](p + 32)) - d = mm_load_si128(cast[pointer](p + 48)) - mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a)) - mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b)) - mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c)) - mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d)) - p += 64 - i += 64 * iterations - - for i in i ..< data.len: - data[i] = 255 - data[i] - - proc ceilMaskSimd*(data: var seq[uint8]) = - var - i: int - p = cast[uint](data[0].addr) - - let - zeroVec = mm_setzero_si128() - vec255 = mm_set1_epi8(255) - iterations = data.len div 16 - for 
_ in 0 ..< iterations: - var values = mm_loadu_si128(cast[pointer](p)) - values = mm_cmpeq_epi8(values, zeroVec) - values = mm_andnot_si128(values, vec255) - mm_storeu_si128(cast[pointer](p), values) - p += 16 - i += 16 * iterations - - for i in i ..< data.len: - if data[i] != 0: - data[i] = 255 - - proc applyOpacitySimd*(data: var seq[uint8 | ColorRGBX], opacity: uint16) = - var - i: int - p = cast[uint](data[0].addr) - len = - when data is seq[ColorRGBX]: - data.len * 4 - else: - data.len - - let - oddMask = mm_set1_epi16(0xff00) - div255 = mm_set1_epi16(0x8081) - zeroVec = mm_setzero_si128() - opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8) - iterations = len div 16 - for _ in 0 ..< len div 16: - let values = mm_loadu_si128(cast[pointer](p)) - if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff: - var - valuesEven = mm_slli_epi16(values, 8) - valuesOdd = mm_and_si128(values, oddMask) - valuesEven = mm_mulhi_epu16(valuesEven, opacityVec) - valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec) - valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7) - valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7) - mm_storeu_si128( - cast[pointer](p), - mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8)) - ) - p += 16 - i += 16 * iterations - - when data is seq[ColorRGBX]: - for i in i div 4 ..< data.len: - var rgbx = data[i] - rgbx.r = ((rgbx.r * opacity) div 255).uint8 - rgbx.g = ((rgbx.g * opacity) div 255).uint8 - rgbx.b = ((rgbx.b * opacity) div 255).uint8 - rgbx.a = ((rgbx.a * opacity) div 255).uint8 - data[i] = rgbx - else: - for i in i ..< data.len: - data[i] = ((data[i] * opacity) div 255).uint8 - -when defined(release): - {.pop.} + import nimsimd/sse2 as nimsimdsse2 + export nimsimdsse2 diff --git a/src/pixie/simd/avx.nim b/src/pixie/simd/avx.nim index c18e9c6..82b4333 100644 --- a/src/pixie/simd/avx.nim +++ b/src/pixie/simd/avx.nim @@ -1,4 +1,4 @@ -import chroma, nimsimd/avx +import chroma, internal, nimsimd/avx when 
defined(gcc) or defined(clang): {.localPassc: "-mavx".} @@ -8,9 +8,9 @@ when defined(release): proc fillUnsafeAvx*( data: var seq[ColorRGBX], - start, len: int, - color: SomeColor -) = + color: SomeColor, + start, len: int +) {.simd.} = let rgbx = color.asRgbx() var diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index 3539931..a692692 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -1,4 +1,4 @@ -import chroma, nimsimd/avx2 +import chroma, internal, nimsimd/avx2, pixie/common when defined(gcc) or defined(clang): {.localPassc: "-mavx2".} @@ -6,25 +6,25 @@ when defined(gcc) or defined(clang): when defined(release): {.push checks: off.} -proc isOneColorAvx2*(data: var seq[ColorRGBX]): bool = +proc isOneColorAvx2*(image: Image): bool {.simd.} = result = true - let color = data[0] + let color = image.data[0] var i: int # Align to 32 bytes - while i < data.len and (cast[uint](data[i].addr) and 31) != 0: - if data[i] != color: + while i < image.data.len and (cast[uint](image.data[i].addr) and 31) != 0: + if image.data[i] != color: return false inc i let colorVec = mm256_set1_epi32(cast[int32](color)) - iterations = (data.len - i) div 16 + iterations = (image.data.len - i) div 16 for _ in 0 ..< iterations: let - values0 = mm256_load_si256(data[i].addr) - values1 = mm256_load_si256(data[i + 8].addr) + values0 = mm256_load_si256(image.data[i].addr) + values1 = mm256_load_si256(image.data[i + 8].addr) eq0 = mm256_cmpeq_epi8(values0, colorVec) eq1 = mm256_cmpeq_epi8(values1, colorVec) eq01 = mm256_and_si256(eq0, eq1) @@ -32,38 +32,38 @@ proc isOneColorAvx2*(data: var seq[ColorRGBX]): bool = return false i += 16 - for i in i ..< data.len: - if data[i] != color: + for i in i ..< image.data.len: + if image.data[i] != color: return false -proc isTransparentAvx2*(data: var seq[ColorRGBX]): bool = +proc isTransparentAvx2*(image: Image): bool {.simd.} = result = true var i: int # Align to 32 bytes - while i < data.len and (cast[uint](data[i].addr) 
and 31) != 0: - if data[i].a != 0: + while i < image.data.len and (cast[uint](image.data[i].addr) and 31) != 0: + if image.data[i].a != 0: return false inc i let vecZero = mm256_setzero_si256() - iterations = (data.len - i) div 16 + iterations = (image.data.len - i) div 16 for _ in 0 ..< iterations: let - values0 = mm256_load_si256(data[i].addr) - values1 = mm256_load_si256(data[i + 8].addr) + values0 = mm256_load_si256(image.data[i].addr) + values1 = mm256_load_si256(image.data[i + 8].addr) values01 = mm256_or_si256(values0, values1) eq = mm256_cmpeq_epi8(values01, vecZero) if mm256_movemask_epi8(eq) != cast[int32](0xffffffff): return false i += 16 - for i in i ..< data.len: - if data[i].a != 0: + for i in i ..< image.data.len: + if image.data[i].a != 0: return false -proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool = +proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} = result = true var i = start @@ -90,7 +90,7 @@ proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool = if data[i].a != 255: return false -proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) = +proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = var i: int let diff --git a/src/pixie/simd/internal.nim b/src/pixie/simd/internal.nim new file mode 100644 index 0000000..2165e72 --- /dev/null +++ b/src/pixie/simd/internal.nim @@ -0,0 +1,39 @@ +import std/macros, std/tables + +var simdProcs* {.compiletime.}: Table[string, NimNode] + +template forceReturn*(procedure: untyped) = + ## Produce `return procedure()` when procedure returns something otherwise + ## `procedure(); return` if it procedure returns nothing. + when compiles(block: return procedure): + return procedure + else: + procedure + return + +proc procName*(procedure: NimNode): string = + ## Given a procedure signature returns only name string. 
+ let nameNode = procedure[0] + if nameNode.kind == nnkPostfix: + nameNode[1].strVal + else: + nameNode.strVal + +proc procArguments*(procedure: NimNode): seq[NimNode] = + ## Given a procedure signature gets the arguments as a list. + for i, arg in procedure[3]: + if i > 0: + for j in 0 ..< arg.len - 2: + result.add(arg[j]) + +proc call*(name: NimNode, args: seq[NimNode]): NimNode = + ## Produces a procedure call with arguments. + result = newNimNode(nnkCall) + result.add(name) + for arg in args: + result.add(arg) + +macro simd*(procedure: untyped) = + let name = procedure.procName() + simdProcs[name] = procedure.copy() + return procedure diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim new file mode 100644 index 0000000..b862fbc --- /dev/null +++ b/src/pixie/simd/sse2.nim @@ -0,0 +1,351 @@ +import chroma, internal, nimsimd/sse2, pixie/common, todo, vmath + +when defined(release): + {.push checks: off.} + +proc fillUnsafeSse2*( + data: var seq[ColorRGBX], + color: SomeColor, + start, len: int +) {.simd.} = + let rgbx = color.asRgbx() + + var + i = start + p = cast[uint](data[i].addr) + # Align to 16 bytes + while i < (start + len) and (p and 15) != 0: + data[i] = rgbx + inc i + p += 4 + + let + colorVec = mm_set1_epi32(cast[int32](rgbx)) + iterations = (start + len - i) div 8 + for _ in 0 ..< iterations: + mm_store_si128(cast[pointer](p), colorVec) + mm_store_si128(cast[pointer](p + 16), colorVec) + p += 32 + i += iterations * 8 + + for i in i ..< start + len: + data[i] = rgbx + +proc isOneColorSse2*(image: Image): bool {.simd.} = + result = true + + let color = image.data[0] + + var + i: int + p = cast[uint](image.data[0].addr) + # Align to 16 bytes + while i < image.data.len and (p and 15) != 0: + if image.data[i] != color: + return false + inc i + p += 4 + + let + colorVec = mm_set1_epi32(cast[int32](color)) + iterations = (image.data.len - i) div 16 + for _ in 0 ..< iterations: + let + values0 = mm_load_si128(cast[pointer](p)) + values1 = 
proc isOpaqueSse2*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} =
  ## Returns true when every pixel in `data[start ..< start + len]` has an
  ## alpha of 255. An empty range (`len <= 0`) is reported as opaque.
  ##
  ## * `data`  - pixel buffer to inspect (not modified).
  ## * `start` - index of the first pixel to test.
  ## * `len`   - number of pixels to test.
  result = true

  if len <= 0:
    # Nothing to inspect; also avoids taking the address of an element that
    # does not exist when the range is empty.
    return

  let stop = start + len

  var
    i = start
    # The pointer must track `i`, so it starts at `data[start]`. Basing it
    # on `data[0]` would make the vector loop read the wrong pixels whenever
    # `start != 0`.
    p = cast[uint](data[start].addr)
  # Align to 16 bytes
  while i < stop and (p and 15) != 0:
    if data[i].a != 255:
      return false
    inc i
    p += 4

  let
    vec255 = mm_set1_epi8(255)
    iterations = (stop - i) div 16
  for _ in 0 ..< iterations:
    let
      values0 = mm_load_si128(cast[pointer](p))
      values1 = mm_load_si128(cast[pointer](p + 16))
      values2 = mm_load_si128(cast[pointer](p + 32))
      values3 = mm_load_si128(cast[pointer](p + 48))
      # AND-ing the 16 pixels together keeps a byte at 255 only if it is
      # 255 in all four vectors.
      values01 = mm_and_si128(values0, values1)
      values23 = mm_and_si128(values2, values3)
      values0123 = mm_and_si128(values01, values23)
      eq = mm_cmpeq_epi8(values0123, vec255)
    # 0x8888 selects every fourth byte of the mask, i.e. the alpha channel.
    if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
      return false
    p += 64
  i += 16 * iterations

  # Scalar tail for the pixels that did not fill a whole 16-pixel batch.
  for i in i ..< stop:
    if data[i].a != 255:
      return false
proc invertSse2*(target: Image | Mask) {.simd.} =
  ## Inverts `target` in place: every byte `v` becomes `255 - v`.
  ##
  ## For an `Image` all four channels of each pixel are inverted and the
  ## buffer is then passed through `toPremultipliedAlphaSse2`. For a `Mask`
  ## each value is inverted directly. An empty buffer is left untouched.
  if target.data.len == 0:
    # No element to take the address of.
    return

  var
    i: int
    p = cast[uint](target.data[0].addr)
  # Align to 16 bytes
  while i < target.data.len and (p and 15) != 0:
    when target is Image:
      var rgbx = target.data[i]
      rgbx.r = 255 - rgbx.r
      rgbx.g = 255 - rgbx.g
      rgbx.b = 255 - rgbx.b
      rgbx.a = 255 - rgbx.a
      target.data[i] = rgbx
      inc i
      p += 4
    else:
      target.data[i] = 255 - target.data[i]
      inc i
      inc p

  let vec255 = mm_set1_epi8(255)

  # Each iteration handles 64 bytes: 16 pixels of an image or 64 mask values.
  # Only the elements left after the alignment prefix may be counted,
  # otherwise the aligned loads/stores below run past the end of the buffer.
  when target is Image:
    let iterations = (target.data.len - i) div 16
  else:
    let iterations = (target.data.len - i) div 64

  for _ in 0 ..< iterations:
    let
      a = mm_load_si128(cast[pointer](p))
      b = mm_load_si128(cast[pointer](p + 16))
      c = mm_load_si128(cast[pointer](p + 32))
      d = mm_load_si128(cast[pointer](p + 48))
    mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
    mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
    mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
    mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
    p += 64

  when target is Image:
    i += 16 * iterations

    # Scalar tail for the pixels that did not fill a whole batch.
    for i in i ..< target.data.len:
      var rgbx = target.data[i]
      rgbx.r = 255 - rgbx.r
      rgbx.g = 255 - rgbx.g
      rgbx.b = 255 - rgbx.b
      rgbx.a = 255 - rgbx.a
      target.data[i] = rgbx

    # Channel-wise inversion can leave colour channels larger than alpha,
    # so the buffer is premultiplied again afterwards.
    toPremultipliedAlphaSse2(target.data)
  else:
    i += 64 * iterations

    # Scalar tail for the remaining mask values.
    for i in i ..< target.data.len:
      target.data[i] = 255 - target.data[i]
import chroma, nimsimd/sse2

when defined(release):
  {.push checks: off.}

proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
  ## Multiplies the four float lanes of `color` by `opacity`, converts them
  ## to integers and narrows them into the four bytes of a `ColorRGBX`.
  let
    zero = mm_setzero_si128()
    scaled = mm_cvtps_epi32(mm_mul_ps(color, mm_set1_ps(opacity)))
    # Two saturating packs squeeze the 32-bit lanes down to the low 4 bytes.
    packedOnce = mm_packus_epi16(scaled, zero)
    packedTwice = mm_packus_epi16(packedOnce, zero)
  cast[ColorRGBX](mm_cvtsi128_si32(packedTwice))

proc packAlphaValues(v: M128i): M128i {.inline.} =
  ## Moves the alpha byte of each of the 4 colors in `v` into the first
  ## 4 bytes of the result.
  let
    zero = mm_setzero_si128()
    alphas = mm_srli_epi32(v, 24)
  mm_packus_epi16(mm_packus_epi16(alphas, zero), zero)

proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline.} =
  ## Gathers the alpha bytes of 16 colors (4 per input vector) into one
  ## vector, in the order `i`, `j`, `k`, `l`.
  let
    first = packAlphaValues(i)
    second = mm_slli_si128(packAlphaValues(j), 4)
    third = mm_slli_si128(packAlphaValues(k), 8)
    fourth = mm_slli_si128(packAlphaValues(l), 12)
  mm_or_si128(mm_or_si128(first, second), mm_or_si128(third, fourth))

proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
  ## Spreads the first 32 bits of `v` into 4 rgba(0, 0, 0, value).
  let zero = mm_setzero_si128()
  mm_unpacklo_epi8(zero, mm_unpacklo_epi8(zero, v))

when defined(release):
  {.pop.}