simd, hasSimd pragmas

This commit is contained in:
Ryan Oldenburg 2022-07-07 18:47:32 -05:00
parent fd52dfecb4
commit 8bb6957fe9
15 changed files with 560 additions and 557 deletions

View file

@ -1,9 +1,6 @@
## Blending modes.
import chroma, common, internal, std/math
when defined(amd64) and allowSimd:
import nimsimd/sse2
import chroma, common, simd, std/math
# See https://www.w3.org/TR/compositing-1/
# See https://www.khronos.org/registry/OpenGL/extensions/KHR/KHR_blend_equation_advanced.txt

View file

@ -41,6 +41,26 @@ type
width*, height*: int
data*: seq[uint8]
proc newImage*(width, height: int): Image {.raises: [PixieError].} =
  ## Allocates an image of `width` x `height` pixels.
  ## Raises `PixieError` unless both dimensions are positive.
  if not (width > 0 and height > 0):
    raise newException(PixieError, "Image width and height must be > 0")

  result = Image(width: width, height: height)
  # One ColorRGBX entry per pixel.
  result.data = newSeq[ColorRGBX](width * height)
proc newMask*(width, height: int): Mask {.raises: [PixieError].} =
  ## Allocates a mask of `width` x `height` values.
  ## Raises `PixieError` unless both dimensions are positive.
  if not (width > 0 and height > 0):
    raise newException(PixieError, "Mask width and height must be > 0")

  result = Mask(width: width, height: height)
  # One uint8 entry per mask position.
  result.data = newSeq[uint8](width * height)
proc mix*(a, b: uint8, t: float32): uint8 {.inline, raises: [].} =
## Linearly interpolate between a and b using t.
let t = round(t * 255).uint32

View file

@ -1,8 +1,5 @@
import chroma, flatty/binny, pixie/common, pixie/images, pixie/internal,
pixie/masks, std/decls, std/sequtils, std/strutils
when defined(amd64) and allowSimd:
import nimsimd/sse2
pixie/masks, pixie/simd, std/decls, std/sequtils, std/strutils
# This JPEG decoder is loosely based on stb_image which is public domain.

View file

@ -1,8 +1,5 @@
import chroma, flatty/binny, math, pixie/common, pixie/images, pixie/internal,
zippy, zippy/crc
when defined(amd64) and allowSimd:
import nimsimd/sse2
pixie/simd, zippy, zippy/crc
# See http://www.libpng.org/pub/png/spec/1.2/PNG-Contents.html

View file

@ -1,10 +1,4 @@
import blends, bumpy, chroma, common, internal, masks, vmath
when allowSimd:
import simd
when defined(amd64):
import nimsimd/sse2
import blends, bumpy, chroma, common, internal, masks, simd, vmath
const h = 0.5.float32
@ -13,27 +7,18 @@ type UnsafeImage = distinct Image
when defined(release):
{.push checks: off.}
proc newImage*(width, height: int): Image {.raises: [PixieError].} =
## Creates a new image with the parameter dimensions.
if width <= 0 or height <= 0:
raise newException(PixieError, "Image width and height must be > 0")
result = Image()
result.width = width
result.height = height
result.data = newSeq[ColorRGBX](width * height)
proc newImage*(mask: Mask): Image {.raises: [PixieError].} =
proc newImage*(mask: Mask): Image {.hasSimd, raises: [PixieError].} =
result = newImage(mask.width, mask.height)
when allowSimd and compiles(newImageFromMaskSimd):
newImageFromMaskSimd(result.data, mask.data)
return
for i in 0 ..< mask.data.len:
let v = mask.data[i]
result.data[i] = rgbx(v, v, v, v)
proc newMask*(image: Image): Mask {.hasSimd, raises: [PixieError].} =
  ## Builds a mask with the same dimensions as `image` whose values are the
  ## alpha components of the image's pixels.
  result = newMask(image.width, image.height)
  for index, pixel in image.data:
    result.data[index] = pixel.a
proc copy*(image: Image): Image {.raises: [PixieError].} =
## Copies the image data into a new image.
result = newImage(image.width, image.height)
@ -89,25 +74,17 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} =
## Fills the image with the color.
fillUnsafe(image.data, color, 0, image.data.len)
proc isOneColor*(image: Image): bool {.raises: [].} =
proc isOneColor*(image: Image): bool {.hasSimd, raises: [].} =
## Checks if the entire image is the same color.
when allowSimd and compiles(isOneColorSimd):
return isOneColorSimd(image.data)
result = true
let color = cast[uint32](image.data[0])
for i in 0 ..< image.data.len:
if cast[uint32](image.data[i]) != color:
return false
proc isTransparent*(image: Image): bool {.raises: [].} =
proc isTransparent*(image: Image): bool {.hasSimd, raises: [].} =
## Checks if this image is fully transparent or not.
when allowSimd and compiles(isTransparentSimd):
return isTransparentSimd(image.data)
result = true
for i in 0 ..< image.data.len:
if image.data[i].a != 0:
return false
@ -341,46 +318,38 @@ proc magnifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} =
result.width * 4
)
proc applyOpacity*(image: Image, opacity: float32) {.raises: [].} =
proc applyOpacity*(target: Image, opacity: float32) {.hasSimd, raises: [].} =
## Multiplies alpha of the image by opacity.
let opacity = round(255 * opacity).uint16
if opacity == 255:
return
if opacity == 0:
image.fill(rgbx(0, 0, 0, 0))
target.fill(rgbx(0, 0, 0, 0))
return
when allowSimd and compiles(applyOpacitySimd):
applyOpacitySimd(image.data, opacity)
return
for i in 0 ..< image.data.len:
var rgbx = image.data[i]
for i in 0 ..< target.data.len:
var rgbx = target.data[i]
rgbx.r = ((rgbx.r * opacity) div 255).uint8
rgbx.g = ((rgbx.g * opacity) div 255).uint8
rgbx.b = ((rgbx.b * opacity) div 255).uint8
rgbx.a = ((rgbx.a * opacity) div 255).uint8
image.data[i] = rgbx
target.data[i] = rgbx
proc invert*(image: Image) {.raises: [].} =
proc invert*(target: Image) {.hasSimd, raises: [].} =
## Inverts all of the colors and alpha.
when allowSimd and compiles(invertImageSimd):
invertImageSimd(image.data)
return
for i in 0 ..< image.data.len:
var rgbx = image.data[i]
for i in 0 ..< target.data.len:
var rgbx = target.data[i]
rgbx.r = 255 - rgbx.r
rgbx.g = 255 - rgbx.g
rgbx.b = 255 - rgbx.b
rgbx.a = 255 - rgbx.a
image.data[i] = rgbx
target.data[i] = rgbx
# Inverting rgbx(50, 100, 150, 200) becomes rgbx(205, 155, 105, 55). This
# is not a valid premultiplied alpha color.
# We need to convert back to premultiplied alpha after inverting.
image.data.toPremultipliedAlpha()
target.data.toPremultipliedAlpha()
proc blur*(
image: Image, radius: float32, outOfBounds: SomeColor = color(0, 0, 0, 0)
@ -443,17 +412,6 @@ proc blur*(
values += outOfBounds * kernel[yy - y + radius]
image.unsafe[x, y] = rgbx(values)
proc newMask*(image: Image): Mask {.raises: [PixieError].} =
## Returns a new mask using the alpha values of the image.
result = newMask(image.width, image.height)
when allowSimd and compiles(newMaskFromImageSimd):
newMaskFromImageSimd(result.data, image.data)
return
for i in 0 ..< image.data.len:
result.data[i] = image.data[i].a
proc getRgbaSmooth*(
image: Image, x, y: float32, wrapped = false
): ColorRGBX {.raises: [].} =

View file

@ -1,12 +1,4 @@
import bumpy, chroma, common, system/memory, vmath
const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
when allowSimd:
import simd
when defined(amd64):
import nimsimd/sse2
import bumpy, chroma, common, simd, system/memory, vmath
template currentExceptionAsPixieError*(): untyped =
## Gets the current exception and returns it as a PixieError with stack trace.
@ -76,7 +68,7 @@ proc fillUnsafe*(
proc fillUnsafe*(
data: var seq[ColorRGBX], color: SomeColor, start, len: int
) {.raises: [].} =
) {.hasSimd, raises: [].} =
## Fills the image data with the color starting at index start and
## continuing for len indices.
when allowSimd and compiles(fillUnsafeSimd):
@ -110,12 +102,10 @@ proc toStraightAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
c.b = straightAlphaTable[c.a][c.b]
data[i] = c
proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
proc toPremultipliedAlpha*(
data: var seq[ColorRGBA | ColorRGBX]
) {.hasSimd, raises: [].} =
## Converts an image to premultiplied alpha from straight alpha.
when allowSimd and compiles(toPremultipliedAlphaSimd):
toPremultipliedAlphaSimd(data)
return
for i in 0 ..< data.len:
var c = data[i]
if c.a != 255:
@ -124,25 +114,15 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].}
c.b = ((c.b.uint32 * c.a) div 255).uint8
data[i] = c
proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool =
when allowSimd and compiles(isOpaqueSimd):
return isOpaqueSimd(data, start, len)
proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool {.hasSimd.} =
  ## Reports whether every pixel in `data[start ..< start + len]` has an
  ## alpha of 255. An empty range yields true.
  result = true
  let stop = start + len
  var index = start
  while index < stop:
    if data[index].a != 255:
      return false
    inc index
when defined(amd64) and allowSimd:
proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
let opacityVec = mm_set1_ps(opacity)
var finalColor = mm_cvtps_epi32(mm_mul_ps(color, opacityVec))
finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
cast[ColorRGBX](mm_cvtsi128_si32(finalColor))
export pack4xAlphaValues, unpackAlphaValues
import simd/todo
export todo
when defined(release):
{.pop.}

View file

@ -1,26 +1,10 @@
import common, internal, vmath
when allowSimd:
import simd
when defined(amd64):
import nimsimd/sse2
import common, internal, simd, vmath
type UnsafeMask = distinct Mask
when defined(release):
{.push checks: off.}
proc newMask*(width, height: int): Mask {.raises: [PixieError].} =
## Creates a new mask with the parameter dimensions.
if width <= 0 or height <= 0:
raise newException(PixieError, "Mask width and height must be > 0")
result = Mask()
result.width = width
result.height = height
result.data = newSeq[uint8](width * height)
proc copy*(mask: Mask): Mask {.raises: [PixieError].} =
## Copies the image data into a new image.
result = newMask(mask.width, mask.height)
@ -180,22 +164,18 @@ proc magnifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} =
result.width * 4
)
proc applyOpacity*(mask: Mask, opacity: float32) {.raises: [].} =
proc applyOpacity*(target: Mask, opacity: float32) {.hasSimd, raises: [].} =
## Multiplies alpha of the image by opacity.
let opacity = round(255 * opacity).uint16
if opacity == 255:
return
if opacity == 0:
mask.fill(0)
target.fill(0)
return
when allowSimd and compiles(applyOpacitySimd):
applyOpacitySimd(mask.data, opacity)
return
for i in 0 ..< mask.data.len:
mask.data[i] = ((mask.data[i] * opacity) div 255).uint8
for i in 0 ..< target.data.len:
target.data[i] = ((target.data[i] * opacity) div 255).uint8
proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
## Gets a interpolated value with float point coordinates.
@ -225,14 +205,10 @@ proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
else:
topMix
proc invert*(mask: Mask) {.raises: [].} =
proc invert*(target: Mask) {.hasSimd, raises: [].} =
## Inverts all of the values - creates a negative of the mask.
when allowSimd and compiles(invertMaskSimd):
invertMaskSimd(mask.data)
return
for i in 0 ..< mask.data.len:
mask.data[i] = 255 - mask.data[i]
for i in 0 ..< target.data.len:
target.data[i] = 255 - target.data[i]
proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
## Grows the mask by spread.
@ -295,12 +271,8 @@ proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
break
mask.unsafe[x, y] = maxValue
proc ceil*(mask: Mask) {.raises: [].} =
proc ceil*(mask: Mask) {.hasSimd, raises: [].} =
## A value of 0 stays 0. Anything else turns into 255.
when allowSimd and compiles(invertImageSimd):
ceilMaskSimd(mask.data)
return
for i in 0 ..< mask.data.len:
if mask.data[i] != 0:
mask.data[i] = 255

View file

@ -1,7 +1,4 @@
import chroma, common, images, internal, vmath
when defined(amd64) and allowSimd:
import nimsimd/sse2
import chroma, common, images, simd, vmath
type
PaintKind* = enum

View file

@ -1,8 +1,5 @@
import blends, bumpy, chroma, common, images, internal, masks, paints, std/fenv,
std/strutils, vmath
when defined(amd64) and allowSimd:
import nimsimd/sse2
import blends, bumpy, chroma, common, images, internal, masks, paints, simd,
std/fenv, std/strutils, vmath
type
WindingRule* = enum

View file

@ -1,392 +1,57 @@
import chroma
import simd/internal, std/macros, std/tables
when defined(release):
{.push checks: off.}
when defined(amd64):
import nimsimd/runtimecheck, nimsimd/sse2, simd/avx, simd/avx2
const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
macro hasSimd*(procedure: untyped) =
let
cpuHasAvx* = checkInstructionSets({AVX})
cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
name = procedure.procName()
args = procedure.procArguments()
originalBody = procedure[6]
nameSse2 = name & "Sse2"
nameAvx = name & "Avx"
nameAvx2 = name & "Avx2"
callAvx = call(ident(nameAvx), args)
callAvx2 = call(ident(nameAvx2), args)
proc packAlphaValues(v: M128i): M128i {.inline.} =
## Shuffle the alpha values for these 4 colors to the first 4 bytes.
result = mm_srli_epi32(v, 24)
result = mm_packus_epi16(result, mm_setzero_si128())
result = mm_packus_epi16(result, mm_setzero_si128())
var body = newStmtList()
proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline.} =
when not defined(pixieNoAvx):
if nameAvx2 in simdProcs:
body.add quote do:
if cpuHasAvx2:
forceReturn `callAvx2`
if nameAvx in simdProcs:
body.add quote do:
if cpuHasAvx:
forceReturn `callAvx`
if nameSse2 in simdProcs:
let bodySse2 = simdProcs[nameSse2][6]
body.add quote do:
`bodySse2`
else:
body.add quote do:
echo "using ", `name`, " scalar"
`originalBody`
procedure[6] = body
return procedure
when allowSimd and defined(amd64):
import simd/sse2, simd/avx, simd/avx2
export sse2, avx, avx2
when defined(pixieNoAvx):
const
cpuHasAvx* = false
cpuHasAvx2* = false
else:
import nimsimd/runtimecheck
let
i = packAlphaValues(i)
j = mm_slli_si128(packAlphaValues(j), 4)
k = mm_slli_si128(packAlphaValues(k), 8)
l = mm_slli_si128(packAlphaValues(l), 12)
mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l))
cpuHasAvx* = checkInstructionSets({AVX})
cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
## Unpack the first 32 bits into 4 rgba(0, 0, 0, value).
result = mm_unpacklo_epi8(mm_setzero_si128(), v)
result = mm_unpacklo_epi8(mm_setzero_si128(), result)
proc fillUnsafeSimd*(
data: var seq[ColorRGBX],
start, len: int,
color: SomeColor
) =
if cpuHasAvx:
fillUnsafeAvx(data, start, len, color)
return
let rgbx = color.asRgbx()
var
i = start
p = cast[uint](data[i].addr)
# Align to 16 bytes
while i < (start + len) and (p and 15) != 0:
data[i] = rgbx
inc i
p += 4
let
colorVec = mm_set1_epi32(cast[int32](rgbx))
iterations = (start + len - i) div 8
for _ in 0 ..< iterations:
mm_store_si128(cast[pointer](p), colorVec)
mm_store_si128(cast[pointer](p + 16), colorVec)
p += 32
i += iterations * 8
for i in i ..< start + len:
data[i] = rgbx
proc isOneColorSimd*(data: var seq[ColorRGBX]): bool =
if cpuHasAvx2:
return isOneColorAvx2(data)
result = true
let color = data[0]
var
i: int
p = cast[uint](data[0].addr)
# Align to 16 bytes
while i < data.len and (p and 15) != 0:
if data[i] != color:
return false
inc i
p += 4
let
colorVec = mm_set1_epi32(cast[int32](color))
iterations = (data.len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm_load_si128(cast[pointer](p))
values1 = mm_load_si128(cast[pointer](p + 16))
values2 = mm_load_si128(cast[pointer](p + 32))
values3 = mm_load_si128(cast[pointer](p + 48))
eq0 = mm_cmpeq_epi8(values0, colorVec)
eq1 = mm_cmpeq_epi8(values1, colorVec)
eq2 = mm_cmpeq_epi8(values2, colorVec)
eq3 = mm_cmpeq_epi8(values3, colorVec)
eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
if mm_movemask_epi8(eq0123) != 0xffff:
return false
p += 64
i += 16 * iterations
for i in i ..< data.len:
if data[i] != color:
return false
proc isTransparentSimd*(data: var seq[ColorRGBX]): bool =
if cpuHasAvx2:
return isTransparentAvx2(data)
var
i: int
p = cast[uint](data[0].addr)
# Align to 16 bytes
while i < data.len and (p and 15) != 0:
if data[i].a != 0:
return false
inc i
p += 4
result = true
let
vecZero = mm_setzero_si128()
iterations = (data.len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm_load_si128(cast[pointer](p))
values1 = mm_load_si128(cast[pointer](p + 16))
values2 = mm_load_si128(cast[pointer](p + 32))
values3 = mm_load_si128(cast[pointer](p + 48))
values01 = mm_or_si128(values0, values1)
values23 = mm_or_si128(values2, values3)
values0123 = mm_or_si128(values01, values23)
if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff:
return false
p += 64
i += 16 * iterations
for i in i ..< data.len:
if data[i].a != 0:
return false
proc isOpaqueSimd*(data: var seq[ColorRGBX], start, len: int): bool =
if cpuHasAvx2:
return isOpaqueAvx2(data, start, len)
result = true
var
i = start
p = cast[uint](data[0].addr)
# Align to 16 bytes
while i < (start + len) and (p and 15) != 0:
if data[i].a != 255:
return false
inc i
p += 4
let
vec255 = mm_set1_epi8(255)
iterations = (start + len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm_load_si128(cast[pointer](p))
values1 = mm_load_si128(cast[pointer](p + 16))
values2 = mm_load_si128(cast[pointer](p + 32))
values3 = mm_load_si128(cast[pointer](p + 48))
values01 = mm_and_si128(values0, values1)
values23 = mm_and_si128(values2, values3)
values0123 = mm_and_si128(values01, values23)
eq = mm_cmpeq_epi8(values0123, vec255)
if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
return false
p += 64
i += 16 * iterations
for i in i ..< start + len:
if data[i].a != 255:
return false
proc toPremultipliedAlphaSimd*(data: var seq[ColorRGBA | ColorRGBX]) =
if cpuHasAvx2:
toPremultipliedAlphaAvx2(data)
return
var i: int
let
alphaMask = mm_set1_epi32(cast[int32](0xff000000))
oddMask = mm_set1_epi16(0xff00)
div255 = mm_set1_epi16(0x8081)
iterations = data.len div 4
for _ in 0 ..< iterations:
let
values = mm_loadu_si128(data[i].addr)
alpha = mm_and_si128(values, alphaMask)
eq = mm_cmpeq_epi8(values, alphaMask)
if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
let
evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
var
colorsEven = mm_slli_epi16(values, 8)
colorsOdd = mm_and_si128(values, oddMask)
colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
mm_storeu_si128(
data[i].addr,
mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
)
i += 4
for i in i ..< data.len:
var c = data[i]
if c.a != 255:
c.r = ((c.r.uint32 * c.a) div 255).uint8
c.g = ((c.g.uint32 * c.a) div 255).uint8
c.b = ((c.b.uint32 * c.a) div 255).uint8
data[i] = c
proc newImageFromMaskSimd*(dst: var seq[ColorRGBX], src: var seq[uint8]) =
var i: int
for _ in 0 ..< src.len div 16:
var alphas = mm_loadu_si128(src[i].addr)
for j in 0 ..< 4:
var unpacked = unpackAlphaValues(alphas)
unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 8))
unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
mm_storeu_si128(dst[i + j * 4].addr, unpacked)
alphas = mm_srli_si128(alphas, 4)
i += 16
for i in i ..< src.len:
let v = src[i]
dst[i] = rgbx(v, v, v, v)
proc newMaskFromImageSimd*(dst: var seq[uint8], src: var seq[ColorRGBX]) =
var i: int
for _ in 0 ..< src.len div 16:
let
a = mm_loadu_si128(src[i + 0].addr)
b = mm_loadu_si128(src[i + 4].addr)
c = mm_loadu_si128(src[i + 8].addr)
d = mm_loadu_si128(src[i + 12].addr)
mm_storeu_si128(
dst[i].addr,
pack4xAlphaValues(a, b, c, d)
)
i += 16
for i in i ..< src.len:
dst[i] = src[i].a
proc invertImageSimd*(data: var seq[ColorRGBX]) =
var
i: int
p = cast[uint](data[0].addr)
# Align to 16 bytes
while i < data.len and (p and 15) != 0:
var rgbx = data[i]
rgbx.r = 255 - rgbx.r
rgbx.g = 255 - rgbx.g
rgbx.b = 255 - rgbx.b
rgbx.a = 255 - rgbx.a
data[i] = rgbx
inc i
p += 4
let
vec255 = mm_set1_epi8(255)
iterations = data.len div 16
for _ in 0 ..< iterations:
let
a = mm_load_si128(cast[pointer](p))
b = mm_load_si128(cast[pointer](p + 16))
c = mm_load_si128(cast[pointer](p + 32))
d = mm_load_si128(cast[pointer](p + 48))
mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
p += 64
i += 16 * iterations
for i in i ..< data.len:
var rgbx = data[i]
rgbx.r = 255 - rgbx.r
rgbx.g = 255 - rgbx.g
rgbx.b = 255 - rgbx.b
rgbx.a = 255 - rgbx.a
data[i] = rgbx
toPremultipliedAlphaSimd(data)
proc invertMaskSimd*(data: var seq[uint8]) =
var
i: int
p = cast[uint](data[0].addr)
# Align to 16 bytes
while i < data.len and (p and 15) != 0:
data[i] = 255 - data[i]
inc i
inc p
let
vec255 = mm_set1_epi8(255)
iterations = data.len div 64
for _ in 0 ..< iterations:
let
a = mm_load_si128(cast[pointer](p))
b = mm_load_si128(cast[pointer](p + 16))
c = mm_load_si128(cast[pointer](p + 32))
d = mm_load_si128(cast[pointer](p + 48))
mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
p += 64
i += 64 * iterations
for i in i ..< data.len:
data[i] = 255 - data[i]
proc ceilMaskSimd*(data: var seq[uint8]) =
var
i: int
p = cast[uint](data[0].addr)
let
zeroVec = mm_setzero_si128()
vec255 = mm_set1_epi8(255)
iterations = data.len div 16
for _ in 0 ..< iterations:
var values = mm_loadu_si128(cast[pointer](p))
values = mm_cmpeq_epi8(values, zeroVec)
values = mm_andnot_si128(values, vec255)
mm_storeu_si128(cast[pointer](p), values)
p += 16
i += 16 * iterations
for i in i ..< data.len:
if data[i] != 0:
data[i] = 255
proc applyOpacitySimd*(data: var seq[uint8 | ColorRGBX], opacity: uint16) =
var
i: int
p = cast[uint](data[0].addr)
len =
when data is seq[ColorRGBX]:
data.len * 4
else:
data.len
let
oddMask = mm_set1_epi16(0xff00)
div255 = mm_set1_epi16(0x8081)
zeroVec = mm_setzero_si128()
opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
iterations = len div 16
for _ in 0 ..< len div 16:
let values = mm_loadu_si128(cast[pointer](p))
if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
var
valuesEven = mm_slli_epi16(values, 8)
valuesOdd = mm_and_si128(values, oddMask)
valuesEven = mm_mulhi_epu16(valuesEven, opacityVec)
valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec)
valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7)
valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7)
mm_storeu_si128(
cast[pointer](p),
mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8))
)
p += 16
i += 16 * iterations
when data is seq[ColorRGBX]:
for i in i div 4 ..< data.len:
var rgbx = data[i]
rgbx.r = ((rgbx.r * opacity) div 255).uint8
rgbx.g = ((rgbx.g * opacity) div 255).uint8
rgbx.b = ((rgbx.b * opacity) div 255).uint8
rgbx.a = ((rgbx.a * opacity) div 255).uint8
data[i] = rgbx
else:
for i in i ..< data.len:
data[i] = ((data[i] * opacity) div 255).uint8
when defined(release):
{.pop.}
import nimsimd/sse2 as nimsimdsse2
export nimsimdsse2

View file

@ -1,4 +1,4 @@
import chroma, nimsimd/avx
import chroma, internal, nimsimd/avx
when defined(gcc) or defined(clang):
{.localPassc: "-mavx".}
@ -8,9 +8,9 @@ when defined(release):
proc fillUnsafeAvx*(
data: var seq[ColorRGBX],
start, len: int,
color: SomeColor
) =
color: SomeColor,
start, len: int
) {.simd.} =
let rgbx = color.asRgbx()
var

View file

@ -1,4 +1,4 @@
import chroma, nimsimd/avx2
import chroma, internal, nimsimd/avx2, pixie/common
when defined(gcc) or defined(clang):
{.localPassc: "-mavx2".}
@ -6,25 +6,25 @@ when defined(gcc) or defined(clang):
when defined(release):
{.push checks: off.}
proc isOneColorAvx2*(data: var seq[ColorRGBX]): bool =
proc isOneColorAvx2*(image: Image): bool {.simd.} =
result = true
let color = data[0]
let color = image.data[0]
var i: int
# Align to 32 bytes
while i < data.len and (cast[uint](data[i].addr) and 31) != 0:
if data[i] != color:
while i < image.data.len and (cast[uint](image.data[i].addr) and 31) != 0:
if image.data[i] != color:
return false
inc i
let
colorVec = mm256_set1_epi32(cast[int32](color))
iterations = (data.len - i) div 16
iterations = (image.data.len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm256_load_si256(data[i].addr)
values1 = mm256_load_si256(data[i + 8].addr)
values0 = mm256_load_si256(image.data[i].addr)
values1 = mm256_load_si256(image.data[i + 8].addr)
eq0 = mm256_cmpeq_epi8(values0, colorVec)
eq1 = mm256_cmpeq_epi8(values1, colorVec)
eq01 = mm256_and_si256(eq0, eq1)
@ -32,38 +32,38 @@ proc isOneColorAvx2*(data: var seq[ColorRGBX]): bool =
return false
i += 16
for i in i ..< data.len:
if data[i] != color:
for i in i ..< image.data.len:
if image.data[i] != color:
return false
proc isTransparentAvx2*(data: var seq[ColorRGBX]): bool =
proc isTransparentAvx2*(image: Image): bool {.simd.} =
result = true
var i: int
# Align to 32 bytes
while i < data.len and (cast[uint](data[i].addr) and 31) != 0:
if data[i].a != 0:
while i < image.data.len and (cast[uint](image.data[i].addr) and 31) != 0:
if image.data[i].a != 0:
return false
inc i
let
vecZero = mm256_setzero_si256()
iterations = (data.len - i) div 16
iterations = (image.data.len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm256_load_si256(data[i].addr)
values1 = mm256_load_si256(data[i + 8].addr)
values0 = mm256_load_si256(image.data[i].addr)
values1 = mm256_load_si256(image.data[i + 8].addr)
values01 = mm256_or_si256(values0, values1)
eq = mm256_cmpeq_epi8(values01, vecZero)
if mm256_movemask_epi8(eq) != cast[int32](0xffffffff):
return false
i += 16
for i in i ..< data.len:
if data[i].a != 0:
for i in i ..< image.data.len:
if image.data[i].a != 0:
return false
proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} =
result = true
var i = start
@ -90,7 +90,7 @@ proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
if data[i].a != 255:
return false
proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) =
proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
var i: int
let

View file

@ -0,0 +1,39 @@
import std/macros, std/tables
# Compile-time table of proc definitions, keyed by proc name. The `simd`
# macro in this module stores a copy of each annotated proc here.
var simdProcs* {.compiletime.}: Table[string, NimNode]
template forceReturn*(procedure: untyped) =
  ## Leaves the enclosing proc by way of `procedure`. Expands to
  ## `return procedure` when that compiles; otherwise runs `procedure` as a
  ## statement and then issues a bare `return`.
  when not compiles(block: return procedure):
    procedure
    return
  else:
    return procedure
proc procName*(procedure: NimNode): string =
  ## Returns the name of a proc definition node as a string, without the
  ## export marker if the name is written as `name*`.
  var node = procedure[0]
  if node.kind == nnkPostfix:
    # A postfix node holds the operator first and the identifier second.
    node = node[1]
  node.strVal
proc procArguments*(procedure: NimNode): seq[NimNode] =
  ## Collects the parameter name nodes of a proc definition, in declaration
  ## order. Child 0 of the parameter list is skipped; in each remaining
  ## entry the last two children are skipped.
  let params = procedure[3]
  for paramIndex in 1 ..< params.len:
    let identDefs = params[paramIndex]
    for nameIndex in 0 ..< identDefs.len - 2:
      result.add(identDefs[nameIndex])
proc call*(name: NimNode, args: seq[NimNode]): NimNode =
  ## Builds an `nnkCall` node that invokes `name` with `args`, in order.
  newCall(name, args)
macro simd*(procedure: untyped) =
  ## Pragma macro: stores a copy of the annotated proc in `simdProcs` under
  ## the proc's name and returns the proc unchanged.
  simdProcs[procedure.procName()] = procedure.copy()
  result = procedure

351
src/pixie/simd/sse2.nim Normal file
View file

@ -0,0 +1,351 @@
import chroma, internal, nimsimd/sse2, pixie/common, todo, vmath
when defined(release):
{.push checks: off.}
proc fillUnsafeSse2*(
  data: var seq[ColorRGBX],
  color: SomeColor,
  start, len: int
) {.simd.} =
  ## SSE2 fill: writes `color` (converted with `asRgbx`) to
  ## `data[start ..< start + len]`.
  let
    fillColor = color.asRgbx()
    stop = start + len

  var
    index = start
    address = cast[uint](data[index].addr)

  # Write single pixels until `address` is a multiple of 16, which the
  # aligned stores below rely on.
  while index < stop and (address and 15) != 0:
    data[index] = fillColor
    index += 1
    address += 4

  let
    fillVec = mm_set1_epi32(cast[int32](fillColor))
    chunks = (stop - index) div 8
  for _ in 1 .. chunks:
    # Two 16-byte stores = 8 pixels per pass.
    mm_store_si128(cast[pointer](address), fillVec)
    mm_store_si128(cast[pointer](address + 16), fillVec)
    address += 32
  index += chunks * 8

  # Leftover pixels that did not fill a whole chunk.
  while index < stop:
    data[index] = fillColor
    index += 1
proc isOneColorSse2*(image: Image): bool {.simd.} =
  ## SSE2 check that every pixel of `image` equals its first pixel.
  ## Reads `image.data[0]` unconditionally.
  result = true

  let
    reference = image.data[0]
    total = image.data.len
  var
    index: int
    address = cast[uint](image.data[0].addr)

  # Compare single pixels until `address` is a multiple of 16, which the
  # aligned loads below rely on.
  while index < total and (address and 15) != 0:
    if image.data[index] != reference:
      return false
    index += 1
    address += 4

  let
    referenceVec = mm_set1_epi32(cast[int32](reference))
    chunks = (total - index) div 16
  for _ in 1 .. chunks:
    # Four 16-byte loads = 16 pixels per pass.
    let
      first = mm_load_si128(cast[pointer](address))
      second = mm_load_si128(cast[pointer](address + 16))
      third = mm_load_si128(cast[pointer](address + 32))
      fourth = mm_load_si128(cast[pointer](address + 48))
    var allEqual = mm_cmpeq_epi8(first, referenceVec)
    allEqual = mm_and_si128(allEqual, mm_cmpeq_epi8(second, referenceVec))
    allEqual = mm_and_si128(allEqual, mm_cmpeq_epi8(third, referenceVec))
    allEqual = mm_and_si128(allEqual, mm_cmpeq_epi8(fourth, referenceVec))
    # Every one of the 16 byte lanes must have compared equal.
    if mm_movemask_epi8(allEqual) != 0xffff:
      return false
    address += 64
  index += 16 * chunks

  # Leftover pixels that did not fill a whole chunk.
  while index < total:
    if image.data[index] != reference:
      return false
    index += 1
proc isTransparentSse2*(image: Image): bool {.simd.} =
  ## SSE2 transparency check for `image`. The scalar head and tail loops
  ## test only the alpha component; the vector loop requires all bytes of
  ## each 64-byte chunk to be zero.
  ## Reads `image.data[0]` unconditionally.
  result = true

  let total = image.data.len
  var
    index: int
    address = cast[uint](image.data[0].addr)

  # Check single pixels until `address` is a multiple of 16, which the
  # aligned loads below rely on.
  while index < total and (address and 15) != 0:
    if image.data[index].a != 0:
      return false
    index += 1
    address += 4

  let
    zeroVec = mm_setzero_si128()
    chunks = (total - index) div 16
  for _ in 1 .. chunks:
    # OR four 16-byte loads together; any set bit survives into `merged`.
    var merged = mm_load_si128(cast[pointer](address))
    merged = mm_or_si128(merged, mm_load_si128(cast[pointer](address + 16)))
    merged = mm_or_si128(merged, mm_load_si128(cast[pointer](address + 32)))
    merged = mm_or_si128(merged, mm_load_si128(cast[pointer](address + 48)))
    if mm_movemask_epi8(mm_cmpeq_epi8(merged, zeroVec)) != 0xffff:
      return false
    address += 64
  index += 16 * chunks

  # Leftover pixels that did not fill a whole chunk.
  while index < total:
    if image.data[index].a != 0:
      return false
    index += 1
proc isOpaqueSse2*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} =
  ## SSE2 check that every pixel in `data[start ..< start + len]` has an
  ## alpha of 255. An empty range yields true.
  result = true

  if len <= 0:
    # Nothing to inspect. Returning here also avoids taking the address of
    # an element that may not exist.
    return

  var
    i = start
    # `p` must be the address of the pixel at `i`. Deriving it from data[0]
    # would make the vector loop inspect pixels before `start` whenever
    # `start != 0`.
    p = cast[uint](data[i].addr)
  # Align to 16 bytes
  while i < (start + len) and (p and 15) != 0:
    if data[i].a != 255:
      return false
    inc i
    p += 4

  let
    vec255 = mm_set1_epi8(255)
    iterations = (start + len - i) div 16
  for _ in 0 ..< iterations:
    # Four aligned 16-byte loads = 16 pixels per pass. AND-ing them keeps a
    # byte at 255 only if it is 255 in all four vectors.
    let
      values0 = mm_load_si128(cast[pointer](p))
      values1 = mm_load_si128(cast[pointer](p + 16))
      values2 = mm_load_si128(cast[pointer](p + 32))
      values3 = mm_load_si128(cast[pointer](p + 48))
      values01 = mm_and_si128(values0, values1)
      values23 = mm_and_si128(values2, values3)
      values0123 = mm_and_si128(values01, values23)
      eq = mm_cmpeq_epi8(values0123, vec255)
    # 0x8888 selects byte lanes 3, 7, 11 and 15: the alpha byte of each pixel.
    if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
      return false
    p += 64
  i += 16 * iterations

  # Leftover pixels that did not fill a whole 16-pixel pass.
  for i in i ..< start + len:
    if data[i].a != 255:
      return false
proc toPremultipliedAlphaSse2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
  ## SSE2 conversion of `data` in place: each pixel's r, g and b are
  ## multiplied by its alpha and divided by 255. Pixels are processed four
  ## at a time, with a scalar loop for the remainder.
  let
    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
    oddMask = mm_set1_epi16(0xff00)
    div255 = mm_set1_epi16(0x8081)
    quads = data.len div 4

  var index: int
  for _ in 1 .. quads:
    let
      pixels = mm_loadu_si128(data[index].addr)
      alphas = mm_and_si128(pixels, alphaMask)
      opaqueBytes = mm_cmpeq_epi8(pixels, alphaMask)
    # 0x8888 selects the alpha byte of each of the four pixels; skip the
    # arithmetic when all four alphas are already 255.
    if (mm_movemask_epi8(opaqueBytes) and 0x00008888) != 0x00008888:
      let
        evenMultiplier = mm_or_si128(alphas, mm_srli_epi32(alphas, 16))
        oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
      var
        evens = mm_slli_epi16(pixels, 8)
        odds = mm_and_si128(pixels, oddMask)
      evens = mm_mulhi_epu16(evens, evenMultiplier)
      odds = mm_mulhi_epu16(odds, oddMultiplier)
      evens = mm_srli_epi16(mm_mulhi_epu16(evens, div255), 7)
      odds = mm_srli_epi16(mm_mulhi_epu16(odds, div255), 7)
      mm_storeu_si128(
        data[index].addr,
        mm_or_si128(evens, mm_slli_epi16(odds, 8))
      )
    index += 4

  # Leftover pixels that did not fill a group of four.
  while index < data.len:
    var c = data[index]
    if c.a != 255:
      c.r = ((c.r.uint32 * c.a) div 255).uint8
      c.g = ((c.g.uint32 * c.a) div 255).uint8
      c.b = ((c.b.uint32 * c.a) div 255).uint8
      data[index] = c
    index += 1
proc newImageSse2*(mask: Mask): Image {.simd.} =
  ## SSE2 construction of an image from `mask`. The scalar tail sets each
  ## pixel to `rgbx(v, v, v, v)` for mask value `v`; the vector loop handles
  ## 16 mask values per pass.
  result = newImage(mask.width, mask.height)

  let total = mask.data.len
  var index: int
  for _ in 1 .. total div 16:
    var remaining = mm_loadu_si128(mask.data[index].addr)
    for group in 0 ..< 4:
      # NOTE(review): presumably unpackAlphaValues places the low four
      # values in the alpha byte of four pixels; confirm against its
      # definition. The two shift/OR steps then copy each 32-bit lane's
      # top byte into its lower three bytes.
      var spread = unpackAlphaValues(remaining)
      spread = mm_or_si128(spread, mm_srli_epi32(spread, 8))
      spread = mm_or_si128(spread, mm_srli_epi32(spread, 16))
      mm_storeu_si128(result.data[index + group * 4].addr, spread)
      # Drop the four values just consumed.
      remaining = mm_srli_si128(remaining, 4)
    index += 16

  # Leftover values that did not fill a group of 16.
  while index < total:
    let value = mask.data[index]
    result.data[index] = rgbx(value, value, value, value)
    index += 1
proc newMaskSse2*(image: Image): Mask {.simd.} =
  ## SSE2 construction of a mask from `image`. The scalar tail copies each
  ## pixel's alpha; the vector loop handles 16 pixels per pass through
  ## `pack4xAlphaValues`.
  result = newMask(image.width, image.height)

  let total = image.data.len
  var index: int
  for _ in 1 .. total div 16:
    # Four unaligned loads of four pixels each, packed into 16 mask bytes.
    let packedAlphas = pack4xAlphaValues(
      mm_loadu_si128(image.data[index + 0].addr),
      mm_loadu_si128(image.data[index + 4].addr),
      mm_loadu_si128(image.data[index + 8].addr),
      mm_loadu_si128(image.data[index + 12].addr)
    )
    mm_storeu_si128(result.data[index].addr, packedAlphas)
    index += 16

  # Leftover pixels that did not fill a group of 16.
  while index < total:
    result.data[index] = image.data[index].a
    index += 1
proc invertSse2*(target: Image | Mask) {.simd.} =
  ## Inverts `target` in place by replacing every byte `v` with `255 - v`.
  ## For an Image all four channels are inverted and the data is then passed
  ## through `toPremultipliedAlphaSse2`; for a Mask the values are simply
  ## inverted.
  ##
  ## The work is split into three phases: a scalar prologue that advances
  ## until the data pointer is 16-byte aligned, an aligned SSE2 loop that
  ## handles 64 bytes per step, and a scalar tail for whatever is left.
  var
    i: int
    p = cast[uint](target.data[0].addr)
  # Align to 16 bytes
  while i < target.data.len and (p and 15) != 0:
    when target is Image:
      var rgbx = target.data[i]
      rgbx.r = 255 - rgbx.r
      rgbx.g = 255 - rgbx.g
      rgbx.b = 255 - rgbx.b
      rgbx.a = 255 - rgbx.a
      target.data[i] = rgbx
      inc i
      p += 4
    else:
      target.data[i] = 255 - target.data[i]
      inc i
      inc p

  let
    vec255 = mm_set1_epi8(255)
    # Only the elements NOT consumed by the alignment prologue may be handed
    # to the vector loop. Counting from the full length would make the loop
    # run past the end of the buffer whenever the prologue advanced `i`.
    remaining = target.data.len - i
  when target is Image:
    # 16 pixels * 4 bytes = 64 bytes per step.
    let iterations = remaining div 16
  else:
    let iterations = remaining div 64
  for _ in 0 ..< iterations:
    let
      a = mm_load_si128(cast[pointer](p))
      b = mm_load_si128(cast[pointer](p + 16))
      c = mm_load_si128(cast[pointer](p + 32))
      d = mm_load_si128(cast[pointer](p + 48))
    mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
    mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
    mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
    mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
    p += 64

  when target is Image:
    i += 16 * iterations

    # Scalar tail for the pixels that did not fill a whole 64-byte step.
    for i in i ..< target.data.len:
      var rgbx = target.data[i]
      rgbx.r = 255 - rgbx.r
      rgbx.g = 255 - rgbx.g
      rgbx.b = 255 - rgbx.b
      rgbx.a = 255 - rgbx.a
      target.data[i] = rgbx

    # Alpha was inverted too, so the colors must be premultiplied again.
    toPremultipliedAlphaSse2(target.data)
  else:
    i += 64 * iterations

    # Scalar tail for the values that did not fill a whole 64-byte step.
    for i in i ..< target.data.len:
      target.data[i] = 255 - target.data[i]
proc ceilSse2*(mask: Mask) {.simd.} =
  ## Raises every nonzero value in the mask to 255, in place. Zero values are
  ## left as zero.
  var cursor = cast[uint](mask.data[0].addr)

  let
    zeroVec = mm_setzero_si128()
    vec255 = mm_set1_epi8(255)
    chunks = mask.data.len div 16
  for _ in 0 ..< chunks:
    let
      values = mm_loadu_si128(cast[pointer](cursor))
      isZero = mm_cmpeq_epi8(values, zeroVec)
    # (not isZero) and 255: zero bytes stay 0, every other byte becomes 255.
    mm_storeu_si128(cast[pointer](cursor), mm_andnot_si128(isZero, vec255))
    cursor += 16

  # Scalar tail for the values that did not fill a whole vector.
  for j in 16 * chunks ..< mask.data.len:
    if mask.data[j] != 0:
      mask.data[j] = 255
proc applyOpacitySse2*(target: Image | Mask, opacity: float32) {.simd.} =
  ## Multiplies every byte of `target` by `opacity`, in place. The opacity is
  ## first rounded to an integer scale out of 255; a scale of 255 leaves the
  ## target untouched and a scale of 0 fills it with zeros.
  let scale = round(255 * opacity).uint16
  if scale == 255:
    return

  if scale == 0:
    when target is Image:
      target.fill(rgbx(0, 0, 0, 0))
    else:
      target.fill(0)
    return

  let
    # Length of the data in bytes (an image pixel is 4 bytes).
    byteLen =
      when target is Image:
        target.data.len * 4
      else:
        target.data.len
    chunks = byteLen div 16
    oddMask = mm_set1_epi16(0xff00)
    div255 = mm_set1_epi16(0x8081)
    zeroVec = mm_setzero_si128()
    scaleVec = mm_slli_epi16(mm_set1_epi16(scale), 8)

  var cursor = cast[uint](target.data[0].addr)
  for _ in 0 ..< chunks:
    let values = mm_loadu_si128(cast[pointer](cursor))
    # A vector that is entirely zero stays zero, so skip the math for it.
    if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
      var
        evenBytes = mm_slli_epi16(values, 8)
        oddBytes = mm_and_si128(values, oddMask)
      evenBytes = mm_mulhi_epu16(evenBytes, scaleVec)
      oddBytes = mm_mulhi_epu16(oddBytes, scaleVec)
      # Multiply-high by 0x8081 followed by a shift of 7 divides by 255.
      evenBytes = mm_srli_epi16(mm_mulhi_epu16(evenBytes, div255), 7)
      oddBytes = mm_srli_epi16(mm_mulhi_epu16(oddBytes, div255), 7)
      let merged = mm_or_si128(evenBytes, mm_slli_epi16(oddBytes, 8))
      mm_storeu_si128(cast[pointer](cursor), merged)
    cursor += 16

  # Scalar tail for the bytes that did not fill a whole vector.
  when target is Image:
    # 16 bytes per chunk is 4 pixels per chunk.
    for j in 4 * chunks ..< target.data.len:
      var px = target.data[j]
      px.r = ((px.r * scale) div 255).uint8
      px.g = ((px.g * scale) div 255).uint8
      px.b = ((px.b * scale) div 255).uint8
      px.a = ((px.a * scale) div 255).uint8
      target.data[j] = px
  else:
    for j in 16 * chunks ..< target.data.len:
      target.data[j] = ((target.data[j] * scale) div 255).uint8
# In release builds, undo a pragma push made earlier in this file.
# NOTE(review): the matching push is outside this view - presumably
# `{.push checks: off.}`; confirm.
when defined(release):
  {.pop.}

33
src/pixie/simd/todo.nim Normal file
View file

@ -0,0 +1,33 @@
import chroma, nimsimd/sse2
# In release builds, compile the procs below with runtime checks turned off.
# The push is undone by the `{.pop.}` at the end of the file.
when defined(release):
  {.push checks: off.}
proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
  ## Multiplies the four float lanes of `color` by `opacity`, converts them to
  ## integers and packs one byte per lane into a ColorRGBX.
  ## NOTE(review): presumably each lane holds a channel value in 0..255 -
  ## confirm against callers.
  let
    zero = mm_setzero_si128()
    scaled = mm_cvtps_epi32(mm_mul_ps(color, mm_set1_ps(opacity)))
    # Two pack steps narrow each 32-bit lane down to a single byte.
    narrowed = mm_packus_epi16(scaled, zero)
    bytes = mm_packus_epi16(narrowed, zero)
  cast[ColorRGBX](mm_cvtsi128_si32(bytes))
proc packAlphaValues(v: M128i): M128i {.inline.} =
  ## Moves the alpha byte of each of the 4 colors in `v` into the first 4
  ## bytes of the result.
  let
    zero = mm_setzero_si128()
    # Shift each 32-bit color right so only its top (alpha) byte remains.
    alphas = mm_srli_epi32(v, 24)
  # Two pack steps narrow each 32-bit lane down to a single byte.
  mm_packus_epi16(mm_packus_epi16(alphas, zero), zero)
proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline.} =
  ## Packs the alpha bytes of four vectors of 4 colors into one vector of 16
  ## bytes: the alphas of `i` come first, followed by `j`, `k` and `l`.
  let
    # Each packed group of 4 bytes is shifted left to its own byte offset.
    lowHalf = mm_or_si128(
      packAlphaValues(i),
      mm_slli_si128(packAlphaValues(j), 4)
    )
    highHalf = mm_or_si128(
      mm_slli_si128(packAlphaValues(k), 8),
      mm_slli_si128(packAlphaValues(l), 12)
    )
  mm_or_si128(lowHalf, highHalf)
proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
  ## Expands the first 4 bytes of `v` into 4 colors of the form
  ## rgba(0, 0, 0, value).
  let zero = mm_setzero_si128()
  # Each interleave with zero doubles the lane width and keeps the value in
  # the high byte: 8-bit to 16-bit, then 16-bit to 32-bit.
  mm_unpacklo_epi8(zero, mm_unpacklo_epi8(zero, v))
# In release builds, undo the `{.push checks: off.}` made at the top of this
# file.
when defined(release):
  {.pop.}