Merge pull request #454 from guzba/master

simd changes started as discussed
This commit is contained in:
Andre von Houck 2022-06-29 17:48:47 -07:00 committed by GitHub
commit fc3b834e62
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 508 additions and 384 deletions

View file

@ -1,7 +1,10 @@
import blends, bumpy, chroma, common, masks, pixie/internal, vmath import blends, bumpy, chroma, common, internal, masks, vmath
when defined(amd64) and allowSimd: when allowSimd:
import nimsimd/sse2, runtimechecked/avx2 import simd
when defined(amd64):
import nimsimd/sse2
const h = 0.5.float32 const h = 0.5.float32
@ -28,21 +31,18 @@ proc newImage*(width, height: int): Image {.raises: [PixieError].} =
proc newImage*(mask: Mask): Image {.raises: [PixieError].} = proc newImage*(mask: Mask): Image {.raises: [PixieError].} =
result = newImage(mask.width, mask.height) result = newImage(mask.width, mask.height)
var i: int
when defined(amd64) and allowSimd:
for _ in 0 ..< mask.data.len div 16:
var alphas = mm_loadu_si128(mask.data[i].addr)
for j in 0 ..< 4:
var unpacked = unpackAlphaValues(alphas)
unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 8))
unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
mm_storeu_si128(result.data[i + j * 4].addr, unpacked)
alphas = mm_srli_si128(alphas, 4)
i += 16
for j in i ..< mask.data.len: when allowSimd and compiles(newImageFromMaskSimd):
let v = mask.data[j] newImageFromMaskSimd(
result.data[j] = rgbx(v, v, v, v) cast[ptr UncheckedArray[ColorRGBX]](result.data[0].addr),
cast[ptr UncheckedArray[uint8]](mask.data[0].addr),
mask.data.len
)
return
for i in 0 ..< mask.data.len:
let v = mask.data[i]
result.data[i] = rgbx(v, v, v, v)
proc copy*(image: Image): Image {.raises: [PixieError].} = proc copy*(image: Image): Image {.raises: [PixieError].} =
## Copies the image data into a new image. ## Copies the image data into a new image.
@ -101,83 +101,30 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} =
proc isOneColor*(image: Image): bool {.raises: [].} = proc isOneColor*(image: Image): bool {.raises: [].} =
## Checks if the entire image is the same color. ## Checks if the entire image is the same color.
when defined(amd64) and allowSimd: when allowSimd and compiles(isOneColorSimd):
if cpuHasAvx2: return isOneColorSimd(
return isOneColorAvx2(image.data, 0, image.data.len) cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
image.data.len
)
result = true result = true
let color = image.data[0] let color = cast[uint32](image.data[0])
for i in 0 ..< image.data.len:
var i: int if cast[uint32](image.data[i]) != color:
when defined(amd64) and allowSimd:
# Align to 16 bytes
var p = cast[uint](image.data[i].addr)
while i < image.data.len and (p and 15) != 0:
if image.data[i] != color:
return false
inc i
p += 4
let
colorVec = mm_set1_epi32(cast[int32](color))
iterations = (image.data.len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm_load_si128(cast[pointer](p))
values1 = mm_load_si128(cast[pointer](p + 16))
values2 = mm_load_si128(cast[pointer](p + 32))
values3 = mm_load_si128(cast[pointer](p + 48))
eq0 = mm_cmpeq_epi8(values0, colorVec)
eq1 = mm_cmpeq_epi8(values1, colorVec)
eq2 = mm_cmpeq_epi8(values2, colorVec)
eq3 = mm_cmpeq_epi8(values3, colorVec)
eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
if mm_movemask_epi8(eq0123) != 0xffff:
return false
p += 64
i += 16 * iterations
for i in i ..< image.data.len:
if image.data[i] != color:
return false return false
proc isTransparent*(image: Image): bool {.raises: [].} = proc isTransparent*(image: Image): bool {.raises: [].} =
## Checks if this image is fully transparent or not. ## Checks if this image is fully transparent or not.
when defined(amd64) and allowSimd: when allowSimd and compiles(isTransparentSimd):
if cpuHasAvx2: return isTransparentSimd(
return isTransparentAvx2(image.data, 0, image.data.len) cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
image.data.len
)
result = true result = true
var i: int for i in 0 ..< image.data.len:
when defined(amd64) and allowSimd:
# Align to 16 bytes
var p = cast[uint](image.data[i].addr)
while i < image.data.len and (p and 15) != 0:
if image.data[i].a != 0:
return false
inc i
p += 4
let
vecZero = mm_setzero_si128()
iterations = (image.data.len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm_load_si128(cast[pointer](p))
values1 = mm_load_si128(cast[pointer](p + 16))
values2 = mm_load_si128(cast[pointer](p + 32))
values3 = mm_load_si128(cast[pointer](p + 48))
values01 = mm_or_si128(values0, values1)
values23 = mm_or_si128(values2, values3)
values0123 = mm_or_si128(values01, values23)
if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff:
return false
p += 64
i += 16 * iterations
for i in i ..< image.data.len:
if image.data[i].a != 0: if image.data[i].a != 0:
return false return false
@ -410,89 +357,48 @@ proc magnifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} =
result.width * 4 result.width * 4
) )
proc applyOpacity*(target: Image | Mask, opacity: float32) {.raises: [].} = proc applyOpacity*(image: Image, opacity: float32) {.raises: [].} =
## Multiplies alpha of the image by opacity. ## Multiplies alpha of the image by opacity.
let opacity = round(255 * opacity).uint16 let opacity = round(255 * opacity).uint16
if opacity == 255: if opacity == 255:
return return
if opacity == 0: if opacity == 0:
when type(target) is Image: image.fill(rgbx(0, 0, 0, 0))
target.fill(rgbx(0, 0, 0, 0))
else:
target.fill(0)
return return
var i: int when allowSimd and compiles(applyOpacitySimd):
when defined(amd64) and allowSimd: applyOpacitySimd(
when type(target) is Image: cast[ptr UncheckedArray[uint8]](image.data[0].addr),
let byteLen = target.data.len * 4 image.data.len * 4,
else: opacity
let byteLen = target.data.len )
return
let for i in 0 ..< image.data.len:
oddMask = mm_set1_epi16(cast[int16](0xff00)) var rgbx = image.data[i]
div255 = mm_set1_epi16(cast[int16](0x8081)) rgbx.r = ((rgbx.r * opacity) div 255).uint8
zeroVec = mm_setzero_si128() rgbx.g = ((rgbx.g * opacity) div 255).uint8
opacityVec = mm_slli_epi16(mm_set1_epi16(cast[int16](opacity)), 8) rgbx.b = ((rgbx.b * opacity) div 255).uint8
for _ in 0 ..< byteLen div 16: rgbx.a = ((rgbx.a * opacity) div 255).uint8
when type(target) is Image: image.data[i] = rgbx
let index = i div 4
else:
let index = i
let values = mm_loadu_si128(target.data[index].addr)
if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
var
valuesEven = mm_slli_epi16(values, 8)
valuesOdd = mm_and_si128(values, oddMask)
valuesEven = mm_mulhi_epu16(valuesEven, opacityVec)
valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec)
valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7)
valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7)
mm_storeu_si128(
target.data[index].addr,
mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8))
)
i += 16
when type(target) is Image:
for j in i div 4 ..< target.data.len:
var rgbx = target.data[j]
rgbx.r = ((rgbx.r * opacity) div 255).uint8
rgbx.g = ((rgbx.g * opacity) div 255).uint8
rgbx.b = ((rgbx.b * opacity) div 255).uint8
rgbx.a = ((rgbx.a * opacity) div 255).uint8
target.data[j] = rgbx
else:
for j in i ..< target.data.len:
target.data[j] = ((target.data[j] * opacity) div 255).uint8
proc invert*(image: Image) {.raises: [].} = proc invert*(image: Image) {.raises: [].} =
## Inverts all of the colors and alpha. ## Inverts all of the colors and alpha.
var i: int when allowSimd and compiles(invertImageSimd):
when defined(amd64) and allowSimd: invertImageSimd(
let vec255 = mm_set1_epi8(cast[int8](255)) cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
for _ in 0 ..< image.data.len div 16: image.data.len
let )
a = mm_loadu_si128(image.data[i + 0].addr) return
b = mm_loadu_si128(image.data[i + 4].addr)
c = mm_loadu_si128(image.data[i + 8].addr)
d = mm_loadu_si128(image.data[i + 12].addr)
mm_storeu_si128(image.data[i + 0].addr, mm_sub_epi8(vec255, a))
mm_storeu_si128(image.data[i + 4].addr, mm_sub_epi8(vec255, b))
mm_storeu_si128(image.data[i + 8].addr, mm_sub_epi8(vec255, c))
mm_storeu_si128(image.data[i + 12].addr, mm_sub_epi8(vec255, d))
i += 16
for j in i ..< image.data.len: for i in 0 ..< image.data.len:
var rgbx = image.data[j] var rgbx = image.data[i]
rgbx.r = 255 - rgbx.r rgbx.r = 255 - rgbx.r
rgbx.g = 255 - rgbx.g rgbx.g = 255 - rgbx.g
rgbx.b = 255 - rgbx.b rgbx.b = 255 - rgbx.b
rgbx.a = 255 - rgbx.a rgbx.a = 255 - rgbx.a
image.data[j] = rgbx image.data[i] = rgbx
# Inverting rgbx(50, 100, 150, 200) becomes rgbx(205, 155, 105, 55). This # Inverting rgbx(50, 100, 150, 200) becomes rgbx(205, 155, 105, 55). This
# is not a valid premultiplied alpha color. # is not a valid premultiplied alpha color.
@ -564,22 +470,16 @@ proc newMask*(image: Image): Mask {.raises: [PixieError].} =
## Returns a new mask using the alpha values of the image. ## Returns a new mask using the alpha values of the image.
result = newMask(image.width, image.height) result = newMask(image.width, image.height)
var i: int when allowSimd and compiles(newMaskFromImageSimd):
when defined(amd64) and allowSimd: newMaskFromImageSimd(
for _ in 0 ..< image.data.len div 16: cast[ptr UncheckedArray[uint8]](result.data[0].addr),
let cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
a = mm_loadu_si128(image.data[i + 0].addr) image.data.len
b = mm_loadu_si128(image.data[i + 4].addr) )
c = mm_loadu_si128(image.data[i + 8].addr) return
d = mm_loadu_si128(image.data[i + 12].addr)
mm_storeu_si128(
result.data[i].addr,
pack4xAlphaValues(a, b, c, d)
)
i += 16
for j in i ..< image.data.len: for i in 0 ..< image.data.len:
result.data[j] = image.data[j].a result.data[i] = image.data[i].a
proc getRgbaSmooth*( proc getRgbaSmooth*(
image: Image, x, y: float32, wrapped = false image: Image, x, y: float32, wrapped = false

View file

@ -2,11 +2,11 @@ import bumpy, chroma, common, system/memory, vmath
const allowSimd* = not defined(pixieNoSimd) and not defined(tcc) const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
when defined(amd64) and allowSimd: when allowSimd:
import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx, runtimechecked/avx2 import simd
let
cpuHasAvx* = checkInstructionSets({AVX}) when defined(amd64):
cpuHasAvx2* = checkInstructionSets({AVX, AVX2}) import nimsimd/sse2
template currentExceptionAsPixieError*(): untyped = template currentExceptionAsPixieError*(): untyped =
## Gets the current exception and returns it as a PixieError with stack trace. ## Gets the current exception and returns it as a PixieError with stack trace.
@ -81,45 +81,20 @@ proc fillUnsafe*(
## continuing for len indices. ## continuing for len indices.
let rgbx = color.asRgbx() let rgbx = color.asRgbx()
# If we can use AVX, do so when allowSimd and compiles(fillUnsafeSimd):
when defined(amd64) and allowSimd: fillUnsafeSimd(
if cpuHasAvx and len >= 64: cast[ptr UncheckedArray[ColorRGBX]](data[start].addr),
fillUnsafeAvx(data, rgbx, start, len) len,
return rgbx
)
return
# Use memset when every byte has the same value # Use memset when every byte has the same value
if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a: if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
nimSetMem(data[start].addr, rgbx.r.cint, len * 4) nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
else: else:
var i = start for color in data.mitems:
when defined(amd64) and allowSimd: color = rgbx
# Align to 16 bytes
var p = cast[uint](data[i].addr)
while i < (start + len) and (p and 15) != 0:
data[i] = rgbx
inc i
p += 4
# When supported, SIMD fill until we run out of room
let
colorVec = mm_set1_epi32(cast[int32](rgbx))
iterations = (start + len - i) div 8
for _ in 0 ..< iterations:
mm_store_si128(cast[pointer](p), colorVec)
mm_store_si128(cast[pointer](p + 16), colorVec)
p += 32
i += iterations * 8
else:
when sizeof(int) == 8:
# Fill 8 bytes at a time when possible
var
u32 = cast[uint32](rgbx)
u64 = cast[uint64]([u32, u32])
for _ in 0 ..< len div 2:
copyMem(data[i].addr, u64.addr, 8)
i += 2
# Fill whatever is left the slow way
for i in i ..< start + len:
data[i] = rgbx
const straightAlphaTable = block: const straightAlphaTable = block:
var table: array[256, array[256, uint8]] var table: array[256, array[256, uint8]]
@ -141,39 +116,14 @@ proc toStraightAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} = proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
## Converts an image to premultiplied alpha from straight alpha. ## Converts an image to premultiplied alpha from straight alpha.
var i: int when allowSimd and compiles(toPremultipliedAlphaSimd):
when defined(amd64) and allowSimd: toPremultipliedAlphaSimd(
if cpuHasAvx2: cast[ptr UncheckedArray[uint32]](data[0].addr),
i = toPremultipliedAlphaAvx2(data) data.len
else: )
let return
alphaMask = mm_set1_epi32(cast[int32](0xff000000))
oddMask = mm_set1_epi16(cast[int16](0xff00))
div255 = mm_set1_epi16(cast[int16](0x8081))
for _ in 0 ..< data.len div 4:
let
values = mm_loadu_si128(data[i].addr)
alpha = mm_and_si128(values, alphaMask)
eq = mm_cmpeq_epi8(values, alphaMask)
if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
let
evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
var
colorsEven = mm_slli_epi16(values, 8)
colorsOdd = mm_and_si128(values, oddMask)
colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
mm_storeu_si128(
data[i].addr,
mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
)
i += 4
# Convert whatever is left for i in 0 ..< data.len:
for i in i ..< data.len:
var c = data[i] var c = data[i]
if c.a != 255: if c.a != 255:
c.r = ((c.r.uint32 * c.a) div 255).uint8 c.r = ((c.r.uint32 * c.a) div 255).uint8
@ -182,41 +132,15 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].}
data[i] = c data[i] = c
proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool = proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool =
when defined(amd64) and allowSimd: when allowSimd and compiles(isOpaqueSimd):
if cpuHasAvx2 and len >= 64: return isOpaqueSimd(
return isOpaqueAvx2(data, start, len) cast[ptr UncheckedArray[ColorRGBX]](data[start].addr),
len
)
result = true result = true
var i = start for i in start ..< start + len:
when defined(amd64) and allowSimd:
# Align to 16 bytes
var p = cast[uint](data[i].addr)
while i < (start + len) and (p and 15) != 0:
if data[i].a != 255:
return false
inc i
p += 4
let
vec255 = mm_set1_epi8(255)
iterations = (start + len - i) div 16
for _ in 0 ..< iterations:
let
values0 = mm_load_si128(cast[pointer](p))
values1 = mm_load_si128(cast[pointer](p + 16))
values2 = mm_load_si128(cast[pointer](p + 32))
values3 = mm_load_si128(cast[pointer](p + 48))
values01 = mm_and_si128(values0, values1)
values23 = mm_and_si128(values2, values3)
values0123 = mm_and_si128(values01, values23)
eq = mm_cmpeq_epi8(values0123, vec255)
if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
return false
p += 64
i += 16 * iterations
for i in i ..< start + len:
if data[i].a != 255: if data[i].a != 255:
return false return false
@ -228,24 +152,7 @@ when defined(amd64) and allowSimd:
finalColor = mm_packus_epi16(finalColor, mm_setzero_si128()) finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
cast[ColorRGBX](mm_cvtsi128_si32(finalColor)) cast[ColorRGBX](mm_cvtsi128_si32(finalColor))
proc packAlphaValues(v: M128i): M128i {.inline, raises: [].} = export pack4xAlphaValues, unpackAlphaValues
## Shuffle the alpha values for these 4 colors to the first 4 bytes
result = mm_srli_epi32(v, 24)
result = mm_packus_epi16(result, mm_setzero_si128())
result = mm_packus_epi16(result, mm_setzero_si128())
proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline, raises: [].} =
let
i = packAlphaValues(i)
j = mm_slli_si128(packAlphaValues(j), 4)
k = mm_slli_si128(packAlphaValues(k), 8)
l = mm_slli_si128(packAlphaValues(l), 12)
mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l))
proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
## Unpack the first 32 bits into 4 rgba(0, 0, 0, value)
result = mm_unpacklo_epi8(mm_setzero_si128(), v)
result = mm_unpacklo_epi8(mm_setzero_si128(), result)
when defined(release): when defined(release):
{.pop.} {.pop.}

View file

@ -75,6 +75,10 @@ proc setValue*(mask: Mask, x, y: int, value: uint8) {.inline, raises: [].} =
## Sets a value at (x, y) or does nothing if outside of bounds. ## Sets a value at (x, y) or does nothing if outside of bounds.
mask[x, y] = value mask[x, y] = value
proc fill*(mask: Mask, value: uint8) {.inline, raises: [].} =
## Fills the mask with the value.
fillUnsafe(mask.data, value, 0, mask.data.len)
proc minifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} = proc minifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} =
## Scales the mask down by an integer scale. ## Scales the mask down by an integer scale.
if power < 0: if power < 0:
@ -179,9 +183,26 @@ proc magnifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} =
result.width * 4 result.width * 4
) )
proc fill*(mask: Mask, value: uint8) {.inline, raises: [].} = proc applyOpacity*(mask: Mask, opacity: float32) {.raises: [].} =
## Fills the mask with the value. ## Multiplies alpha of the image by opacity.
fillUnsafe(mask.data, value, 0, mask.data.len) let opacity = round(255 * opacity).uint16
if opacity == 255:
return
if opacity == 0:
mask.fill(0)
return
when allowSimd and compiles(applyOpacitySimd):
applyOpacitySimd(
cast[ptr UncheckedArray[uint8]](mask.data[0].addr),
mask.data.len,
opacity
)
return
for i in 0 ..< mask.data.len:
mask.data[i] = ((mask.data[i] * opacity) div 255).uint8
proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} = proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
## Gets a interpolated value with float point coordinates. ## Gets a interpolated value with float point coordinates.
@ -213,17 +234,15 @@ proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
proc invert*(mask: Mask) {.raises: [].} =
  ## Inverts all of the values - creates a negative of the mask.
  ## Every value `v` in `mask.data` is replaced in place by `255 - v`.
  # The guard must test the symbol that is actually called below. Testing a
  # different proc (invertImageSimd) would select this branch even when
  # invertMaskSimd is unavailable, turning a missing fast path into a
  # compile error instead of falling back to the scalar loop.
  when allowSimd and compiles(invertMaskSimd):
    invertMaskSimd(
      cast[ptr UncheckedArray[uint8]](mask.data[0].addr),
      mask.data.len
    )
    return

  # Scalar fallback.
  for i in 0 ..< mask.data.len:
    mask.data[i] = 255 - mask.data[i]
proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} = proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
## Grows the mask by spread. ## Grows the mask by spread.
@ -288,21 +307,16 @@ proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
proc ceil*(mask: Mask) {.raises: [].} =
  ## A value of 0 stays 0. Anything else turns into 255.
  ## The mask is modified in place.
  # Guard on the proc that is actually invoked (ceilMaskSimd), not on an
  # unrelated SIMD symbol, so the scalar fallback is used whenever the
  # dedicated fast path does not exist.
  when allowSimd and compiles(ceilMaskSimd):
    ceilMaskSimd(
      cast[ptr UncheckedArray[uint8]](mask.data[0].addr),
      mask.data.len
    )
    return

  # Scalar fallback.
  for i in 0 ..< mask.data.len:
    if mask.data[i] != 0:
      mask.data[i] = 255
proc blur*(mask: Mask, radius: float32, outOfBounds: uint8 = 0) {.raises: [PixieError].} = proc blur*(mask: Mask, radius: float32, outOfBounds: uint8 = 0) {.raises: [PixieError].} =
## Applies Gaussian blur to the image given a radius. ## Applies Gaussian blur to the image given a radius.

View file

@ -1823,7 +1823,7 @@ proc fillHits(
proc fillShapes( proc fillShapes(
image: Image, image: Image,
shapes: seq[Polygon], shapes: var seq[Polygon],
color: SomeColor, color: SomeColor,
windingRule: WindingRule, windingRule: WindingRule,
blendMode: BlendMode blendMode: BlendMode
@ -1852,8 +1852,10 @@ proc fillShapes(
var var
partitions = partitionSegments(segments, startY, pathHeight - startY) partitions = partitionSegments(segments, startY, pathHeight - startY)
partitionIndex: int partitionIndex: int
entryIndices = newSeq[int](partitions.maxEntryCount)
numEntryIndices: int
coverages = newSeq[uint8](pathWidth) coverages = newSeq[uint8](pathWidth)
hits = newSeq[(Fixed32, int16)](partitions.maxEntryCount) hits = newSeq[(Fixed32, int16)](entryIndices.len)
numHits: int numHits: int
aa: bool aa: bool
@ -1895,13 +1897,13 @@ proc fillShapes(
y += partitionHeight y += partitionHeight
continue continue
var var allEntriesInScanlineSpanIt = true
allEntriesInScanlineSpanIt = true numEntryIndices = 0
tmp: int
entryIndices: array[2, int]
if partitions[partitionIndex].twoNonintersectingSpanningSegments: if partitions[partitionIndex].twoNonintersectingSpanningSegments:
tmp = 2 numEntryIndices = 2
entryIndices = [0, 1] entryIndices[0] = 0
entryIndices[1] = 1
else: else:
for i in 0 ..< partitions[partitionIndex].entries.len: for i in 0 ..< partitions[partitionIndex].entries.len:
if partitions[partitionIndex].entries[i].segment.to.y < y.float32 or if partitions[partitionIndex].entries[i].segment.to.y < y.float32 or
@ -1911,14 +1913,10 @@ proc fillShapes(
partitions[partitionIndex].entries[i].segment.to.y < (y + 1).float32: partitions[partitionIndex].entries[i].segment.to.y < (y + 1).float32:
allEntriesInScanlineSpanIt = false allEntriesInScanlineSpanIt = false
break break
if tmp < 2: entryIndices[numEntryIndices] = i
entryIndices[tmp] = i inc numEntryIndices
inc tmp
else:
tmp = 0
break
if allEntriesInScanlineSpanIt and tmp == 2: if allEntriesInScanlineSpanIt and numEntryIndices == 2:
var var
left = partitions[partitionIndex].entries[entryIndices[0]] left = partitions[partitionIndex].entries[entryIndices[0]]
right = partitions[partitionIndex].entries[entryIndices[1]] right = partitions[partitionIndex].entries[entryIndices[1]]

View file

@ -7,28 +7,23 @@ when defined(release):
{.push checks: off.} {.push checks: off.}
proc fillUnsafeAvx*( proc fillUnsafeAvx*(
data: var seq[ColorRGBX], data: ptr UncheckedArray[ColorRGBX],
rgbx: ColorRGBX, len: int,
start, len: int rgbx: ColorRGBX
) = ) =
var var i: int
i = start while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
p = cast[uint](data[i].addr)
# Align to 32 bytes
while i < (start + len) and (p and 31) != 0:
data[i] = rgbx data[i] = rgbx
inc i inc i
p += 4
# When supported, SIMD fill until we run out of room
let let
iterations = (start + len - i) div 8 iterations = (len - i) div 8
colorVec = mm256_set1_epi32(cast[int32](rgbx)) colorVec = mm256_set1_epi32(cast[int32](rgbx))
for _ in 0 ..< iterations: for _ in 0 ..< iterations:
mm256_store_si256(cast[pointer](p), colorVec) mm256_store_si256(data[i].addr, colorVec)
p += 32 i += 8
i += iterations * 8
# Fill whatever is left the slow way # Fill whatever is left the slow way
for i in i ..< start + len: for i in i ..< len:
data[i] = rgbx data[i] = rgbx
when defined(release): when defined(release):

View file

@ -6,108 +6,96 @@ when defined(gcc) or defined(clang):
when defined(release): when defined(release):
{.push checks: off.} {.push checks: off.}
proc isOneColorAvx2*(data: var seq[ColorRGBX], start, len: int): bool = proc isOneColorAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
result = true result = true
let color = data[0] let color = data[0]
var var i: int
i = start while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
p = cast[uint](data[i].addr)
# Align to 32 bytes
while i < (start + len) and (p and 31) != 0:
if data[i] != color: if data[i] != color:
return false return false
inc i inc i
p += 4
let let
colorVec = mm256_set1_epi32(cast[int32](color)) colorVec = mm256_set1_epi32(cast[int32](color))
iterations = (start + len - i) div 16 iterations = (len - i) div 16
for _ in 0 ..< iterations: for _ in 0 ..< iterations:
let let
values0 = mm256_load_si256(cast[pointer](p)) values0 = mm256_load_si256(data[i].addr)
values1 = mm256_load_si256(cast[pointer](p + 32)) values1 = mm256_load_si256(data[i + 8].addr)
eq0 = mm256_cmpeq_epi8(values0, colorVec) eq0 = mm256_cmpeq_epi8(values0, colorVec)
eq1 = mm256_cmpeq_epi8(values1, colorVec) eq1 = mm256_cmpeq_epi8(values1, colorVec)
eq01 = mm256_and_si256(eq0, eq1) eq01 = mm256_and_si256(eq0, eq1)
if mm256_movemask_epi8(eq01) != cast[int32](0xffffffff): if mm256_movemask_epi8(eq01) != cast[int32](0xffffffff):
return false return false
p += 64 i += 16
i += 16 * iterations
for i in i ..< start + len: for i in i ..< len:
if data[i] != color: if data[i] != color:
return false return false
proc isTransparentAvx2*(data: var seq[ColorRGBX], start, len: int): bool = proc isTransparentAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
result = true result = true
var var i: int
i = start while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
p = cast[uint](data[i].addr)
# Align to 32 bytes
while i < (start + len) and (p and 31) != 0:
if data[i].a != 0: if data[i].a != 0:
return false return false
inc i inc i
p += 4
let let
vecZero = mm256_setzero_si256() vecZero = mm256_setzero_si256()
iterations = (start + len - i) div 16 iterations = (len - i) div 16
for _ in 0 ..< iterations: for _ in 0 ..< iterations:
let let
values0 = mm256_load_si256(cast[pointer](p)) values0 = mm256_load_si256(data[i].addr)
values1 = mm256_load_si256(cast[pointer](p + 32)) values1 = mm256_load_si256(data[i + 8].addr)
values01 = mm256_or_si256(values0, values1) values01 = mm256_or_si256(values0, values1)
eq = mm256_cmpeq_epi8(values01, vecZero) eq = mm256_cmpeq_epi8(values01, vecZero)
if mm256_movemask_epi8(eq) != cast[int32](0xffffffff): if mm256_movemask_epi8(eq) != cast[int32](0xffffffff):
return false return false
p += 64 i += 16
i += 16 * iterations
for i in i ..< start + len: for i in i ..< len:
if data[i].a != 0: if data[i].a != 0:
return false return false
proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool = proc isOpaqueAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
result = true result = true
var var i: int
i = start while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
p = cast[uint](data[i].addr)
# Align to 32 bytes
while i < (start + len) and (p and 31) != 0:
if data[i].a != 255: if data[i].a != 255:
return false return false
inc i inc i
p += 4
let let
vec255 = mm256_set1_epi8(255) vec255 = mm256_set1_epi8(255)
iterations = (start + len - i) div 16 iterations = (len - i) div 16
for _ in 0 ..< iterations: for _ in 0 ..< iterations:
let let
values0 = mm256_load_si256(cast[pointer](p)) values0 = mm256_load_si256(data[i].addr)
values1 = mm256_load_si256(cast[pointer](p + 32)) values1 = mm256_load_si256(data[i + 8].addr)
values01 = mm256_and_si256(values0, values1) values01 = mm256_and_si256(values0, values1)
eq = mm256_cmpeq_epi8(values01, vec255) eq = mm256_cmpeq_epi8(values01, vec255)
if (mm256_movemask_epi8(eq) and 0x88888888) != 0x88888888: if (mm256_movemask_epi8(eq) and 0x88888888) != 0x88888888:
return false return false
p += 64 i += 16
i += 16 * iterations
for i in i ..< start + len: for i in i ..< len:
if data[i].a != 255: if data[i].a != 255:
return false return false
proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]): int = proc toPremultipliedAlphaAvx2*(
data: ptr UncheckedArray[uint32],
len: int
): int =
let let
alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
oddMask = mm256_set1_epi16(cast[int16](0xff00)) oddMask = mm256_set1_epi16(cast[int16](0xff00))
div255 = mm256_set1_epi16(cast[int16](0x8081)) div255 = mm256_set1_epi16(cast[int16](0x8081))
for _ in 0 ..< data.len div 8: for _ in 0 ..< len div 8:
let let
values = mm256_loadu_si256(data[result].addr) values = mm256_loadu_si256(data[result].addr)
alpha = mm256_and_si256(values, alphaMask) alpha = mm256_and_si256(values, alphaMask)

322
src/pixie/simd.nim Normal file
View file

@ -0,0 +1,322 @@
import chroma, vmath
when defined(release):
{.push checks: off.}
when defined(amd64):
import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx,
runtimechecked/avx2
let
cpuHasAvx* = checkInstructionSets({AVX})
cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
proc packAlphaValues(v: M128i): M128i {.inline.} =
  ## Gathers the alpha byte of each of the 4 colors in `v` into the first
  ## 4 bytes of the returned vector.
  let
    zero = mm_setzero_si128()
    # Shift each 32-bit lane right by 24 bits so only its top byte remains.
    alphas32 = mm_srli_epi32(v, 24)
    # First narrowing pass (saturating 16-bit -> 8-bit pack).
    alphas16 = mm_packus_epi16(alphas32, zero)
  # Second narrowing pass leaves the four alpha bytes adjacent.
  mm_packus_epi16(alphas16, zero)
proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline.} =
  ## Packs the alpha values of four 4-color vectors into a single vector.
  ## The packed alphas of `i` are used unshifted; those of `j`, `k` and `l`
  ## are shifted left by 4, 8 and 12 bytes before everything is OR-ed.
  let
    lowHalf = mm_or_si128(
      packAlphaValues(i),
      mm_slli_si128(packAlphaValues(j), 4)
    )
    highHalf = mm_or_si128(
      mm_slli_si128(packAlphaValues(k), 8),
      mm_slli_si128(packAlphaValues(l), 12)
    )
  mm_or_si128(lowHalf, highHalf)
proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
  ## Expands the first 32 bits of `v` into 4 rgba(0, 0, 0, value).
  let zero = mm_setzero_si128()
  # Two interleaves with zero bytes: 8-bit -> 16-bit -> 32-bit, with the
  # source byte ending up as the highest byte of each 32-bit lane.
  mm_unpacklo_epi8(zero, mm_unpacklo_epi8(zero, v))
proc fillUnsafeSimd*(
  data: ptr UncheckedArray[ColorRGBX],
  len: int,
  rgbx: ColorRGBX
) =
  ## Writes `rgbx` into `data[0 ..< len]`.
  ## Delegates to `fillUnsafeAvx` when `cpuHasAvx` is set and `len >= 64`;
  ## otherwise uses 128-bit stores.
  if cpuHasAvx and len >= 64:
    fillUnsafeAvx(data, len, rgbx)
    return

  # Scalar writes until the next element address is 16-byte aligned,
  # which the aligned stores below require.
  var pos = 0
  while pos < len and (cast[uint](data[pos].addr) and 15) != 0:
    data[pos] = rgbx
    inc pos

  # Aligned part: two 128-bit stores (8 pixels) per step.
  let
    colorVec = mm_set1_epi32(cast[int32](rgbx))
    simdEnd = pos + ((len - pos) div 8) * 8
  while pos < simdEnd:
    mm_store_si128(data[pos].addr, colorVec)
    mm_store_si128(data[pos + 4].addr, colorVec)
    pos += 8

  # Whatever did not fit into a full 8-pixel step.
  while pos < len:
    data[pos] = rgbx
    inc pos
proc isOneColorSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
  ## Returns true if all `len` pixels starting at `data` equal the first
  ## pixel. An empty range (`len <= 0`) is reported as one color without
  ## reading from `data`.
  if len <= 0:
    # `data[0]` is read below as the reference color; with no elements that
    # read would be out of bounds on an unchecked array.
    return true

  if cpuHasAvx2:
    return isOneColorAvx2(data, len)

  result = true

  let color = data[0]

  var i: int
  while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes
    if data[i] != color:
      return false
    inc i

  let
    colorVec = mm_set1_epi32(cast[int32](color))
    iterations = (len - i) div 16
  for _ in 0 ..< iterations:
    # Compare 16 pixels (4 aligned vectors) per iteration.
    let
      values0 = mm_load_si128(data[i].addr)
      values1 = mm_load_si128(data[i + 4].addr)
      values2 = mm_load_si128(data[i + 8].addr)
      values3 = mm_load_si128(data[i + 12].addr)
      eq0 = mm_cmpeq_epi8(values0, colorVec)
      eq1 = mm_cmpeq_epi8(values1, colorVec)
      eq2 = mm_cmpeq_epi8(values2, colorVec)
      eq3 = mm_cmpeq_epi8(values3, colorVec)
      eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
    # All 16 mask bits are set only if every byte matched in all 4 vectors.
    if mm_movemask_epi8(eq0123) != 0xffff:
      return false
    i += 16

  # Remaining pixels that did not fill a 16-pixel step.
  for i in i ..< len:
    if data[i] != color:
      return false
proc isTransparentSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
  ## Reports whether the `len` pixels at `data` are all transparent.
  ## Delegates to `isTransparentAvx2` when `cpuHasAvx2` is set.
  if cpuHasAvx2:
    return isTransparentAvx2(data, len)

  result = true

  # Scalar alpha checks until the element address is 16-byte aligned.
  var pos = 0
  while pos < len and (cast[uint](data[pos].addr) and 15) != 0:
    if data[pos].a != 0:
      return false
    inc pos

  let
    vecZero = mm_setzero_si128()
    simdEnd = pos + ((len - pos) div 16) * 16
  while pos < simdEnd:
    # OR 16 pixels together; the result is zero only if every byte is zero.
    # NOTE(review): this tests all four channels, not just alpha, unlike the
    # scalar loops - presumably relies on premultiplied data; confirm.
    let merged = mm_or_si128(
      mm_or_si128(
        mm_load_si128(data[pos].addr),
        mm_load_si128(data[pos + 4].addr)
      ),
      mm_or_si128(
        mm_load_si128(data[pos + 8].addr),
        mm_load_si128(data[pos + 12].addr)
      )
    )
    if mm_movemask_epi8(mm_cmpeq_epi8(merged, vecZero)) != 0xffff:
      return false
    pos += 16

  # Leftover pixels after the last full 16-pixel step.
  while pos < len:
    if data[pos].a != 0:
      return false
    inc pos
proc isOpaqueSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
  ## Reports whether the `len` pixels at `data` all have an alpha of 255.
  ## Delegates to `isOpaqueAvx2` when `cpuHasAvx2` is set.
  if cpuHasAvx2:
    return isOpaqueAvx2(data, len)

  result = true

  # Scalar alpha checks until the element address is 16-byte aligned.
  var pos = 0
  while pos < len and (cast[uint](data[pos].addr) and 15) != 0:
    if data[pos].a != 255:
      return false
    inc pos

  let
    vec255 = mm_set1_epi8(255)
    simdEnd = pos + ((len - pos) div 16) * 16
  while pos < simdEnd:
    # AND 16 pixels together; a byte stays 255 only if it was 255 in all
    # four vectors.
    let
      merged = mm_and_si128(
        mm_and_si128(
          mm_load_si128(data[pos].addr),
          mm_load_si128(data[pos + 4].addr)
        ),
        mm_and_si128(
          mm_load_si128(data[pos + 8].addr),
          mm_load_si128(data[pos + 12].addr)
        )
      )
      eq = mm_cmpeq_epi8(merged, vec255)
    # 0x8888 keeps only the mask bit of the 4th byte of each pixel.
    if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
      return false
    pos += 16

  # Leftover pixels after the last full 16-pixel step.
  while pos < len:
    if data[pos].a != 255:
      return false
    inc pos
proc toPremultipliedAlphaSimd*(data: ptr UncheckedArray[uint32], len: int) =
  ## Multiplies the r, g and b bytes of each of the `len` 4-byte pixels at
  ## `data` by that pixel's alpha byte and divides by 255, in place.
  ##
  ## When `cpuHasAvx2` is set, `toPremultipliedAlphaAvx2` runs first and its
  ## return value is used as the index where the scalar tail resumes.
  ## Otherwise pixels are processed four at a time with SSE2.
  var pos: int
  if cpuHasAvx2:
    pos = toPremultipliedAlphaAvx2(data, len)
  else:
    let
      alphaBits = mm_set1_epi32(cast[int32](0xff000000))
      highBytes = mm_set1_epi16(cast[int16](0xff00))
      recip255 = mm_set1_epi16(cast[int16](0x8081))
    var groupsLeft = len div 4
    while groupsLeft > 0:
      let
        pixels = mm_loadu_si128(data[pos].addr)
        opaqueBytes = mm_movemask_epi8(mm_cmpeq_epi8(pixels, alphaBits))
      # Skip the arithmetic when all four alpha bytes are already 255.
      if (opaqueBytes and 0x00008888) != 0x00008888:
        let
          alpha = mm_and_si128(pixels, alphaBits)
          # Alpha copied into the high byte of both 16-bit halves of a pixel.
          mulEven = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
          # Same, but the lane holding alpha itself is multiplied by 255 so
          # that alpha comes out unchanged.
          mulOdd = mm_or_si128(mulEven, alphaBits)
          prodEven = mm_mulhi_epu16(mm_slli_epi16(pixels, 8), mulEven)
          prodOdd = mm_mulhi_epu16(mm_and_si128(pixels, highBytes), mulOdd)
          # Multiply-high by 0x8081 followed by a shift of 7 divides by 255.
          outEven = mm_srli_epi16(mm_mulhi_epu16(prodEven, recip255), 7)
          outOdd = mm_srli_epi16(mm_mulhi_epu16(prodOdd, recip255), 7)
        mm_storeu_si128(
          data[pos].addr,
          mm_or_si128(outEven, mm_slli_epi16(outOdd, 8))
        )
      pos += 4
      dec groupsLeft

  # Scalar tail for whatever the vector paths left unprocessed.
  while pos < len:
    var c: ColorRGBX
    copyMem(c.addr, data[pos].addr, 4)
    c.r = ((c.r.uint32 * c.a) div 255).uint8
    c.g = ((c.g.uint32 * c.a) div 255).uint8
    c.b = ((c.b.uint32 * c.a) div 255).uint8
    copyMem(data[pos].addr, c.addr, 4)
    inc pos
proc newImageFromMaskSimd*(
  dst: ptr UncheckedArray[ColorRGBX],
  src: ptr UncheckedArray[uint8],
  len: int
) =
  ## Expands `len` single-byte values from `src` into `len` pixels in `dst`,
  ## writing each value `v` as `rgbx(v, v, v, v)`.
  var pos = 0

  var groupsLeft = len div 16
  while groupsLeft > 0:
    # 16 source bytes are loaded at once and consumed four at a time.
    var packed = mm_loadu_si128(src[pos].addr)
    for offset in countup(0, 12, 4):
      var spread = unpackAlphaValues(packed)
      # Smear the unpacked byte across the remaining bytes of each pixel.
      spread = mm_or_si128(spread, mm_srli_epi32(spread, 8))
      spread = mm_or_si128(spread, mm_srli_epi32(spread, 16))
      mm_storeu_si128(dst[pos + offset].addr, spread)
      # Drop the four bytes just consumed.
      packed = mm_srli_si128(packed, 4)
    pos += 16
    dec groupsLeft

  # Scalar tail: values that do not fill a whole group of 16.
  while pos < len:
    let v = src[pos]
    dst[pos] = rgbx(v, v, v, v)
    inc pos
proc newMaskFromImageSimd*(
  dst: ptr UncheckedArray[uint8],
  src: ptr UncheckedArray[ColorRGBX],
  len: int
) =
  ## Writes one byte per source pixel into `dst` for `len` pixels: the scalar
  ## tail copies the `a` field, and the vector loop packs 16 pixels at a time
  ## through `pack4xAlphaValues`.
  var pos = 0

  var groupsLeft = len div 16
  while groupsLeft > 0:
    # Four vectors of four pixels each collapse into 16 output bytes.
    let packed = pack4xAlphaValues(
      mm_loadu_si128(src[pos + 0].addr),
      mm_loadu_si128(src[pos + 4].addr),
      mm_loadu_si128(src[pos + 8].addr),
      mm_loadu_si128(src[pos + 12].addr)
    )
    mm_storeu_si128(dst[pos].addr, packed)
    pos += 16
    dec groupsLeft

  # Scalar tail: pixels that do not fill a whole group of 16.
  while pos < len:
    dst[pos] = src[pos].a
    inc pos
proc invertImageSimd*(data: ptr UncheckedArray[ColorRGBX], len: int) =
  ## Replaces every channel byte (r, g, b and a) of the `len` pixels at `data`
  ## with `255 - value`, then passes the buffer through
  ## `toPremultipliedAlphaSimd`.
  let allOnes = mm_set1_epi8(cast[int8](255))
  var pos = 0

  var groupsLeft = len div 16
  while groupsLeft > 0:
    # 16 pixels per group, handled as four independent 4-pixel vectors.
    for offset in countup(0, 12, 4):
      let target = data[pos + offset].addr
      mm_storeu_si128(target, mm_sub_epi8(allOnes, mm_loadu_si128(target)))
    pos += 16
    dec groupsLeft

  # Scalar tail: pixels that do not fill a whole group of 16.
  while pos < len:
    var px = data[pos]
    px.r = 255 - px.r
    px.g = 255 - px.g
    px.b = 255 - px.b
    px.a = 255 - px.a
    data[pos] = px
    inc pos

  # Re-run premultiplication over the inverted channel values.
  toPremultipliedAlphaSimd(cast[ptr UncheckedArray[uint32]](data), len)
proc invertMaskSimd*(data: ptr UncheckedArray[uint8], len: int) =
  ## Replaces each of the `len` bytes at `data` with `255 - value`, in place.
  let allOnes = mm_set1_epi8(255)
  var pos = 0

  var groupsLeft = len div 16
  while groupsLeft > 0:
    # 16 bytes per vector.
    let inverted = mm_sub_epi8(allOnes, mm_loadu_si128(data[pos].addr))
    mm_storeu_si128(data[pos].addr, inverted)
    pos += 16
    dec groupsLeft

  # Scalar tail: bytes that do not fill a whole vector.
  while pos < len:
    data[pos] = 255 - data[pos]
    inc pos
proc ceilMaskSimd*(data: ptr UncheckedArray[uint8], len: int) =
  ## Rewrites the `len` bytes at `data` in place: 0 stays 0 and every other
  ## value becomes 255.
  let
    zeros = mm_setzero_si128()
    allOnes = mm_set1_epi8(255)
  var pos = 0

  var groupsLeft = len div 16
  while groupsLeft > 0:
    let
      chunk = mm_loadu_si128(data[pos].addr)
      # 0xff in every byte position that held zero.
      wasZero = mm_cmpeq_epi8(chunk, zeros)
    # (not wasZero) and 255: 255 where the byte was non-zero, otherwise 0.
    mm_storeu_si128(data[pos].addr, mm_andnot_si128(wasZero, allOnes))
    pos += 16
    dec groupsLeft

  # Scalar tail: bytes that do not fill a whole vector.
  while pos < len:
    if data[pos] != 0:
      data[pos] = 255
    inc pos
proc applyOpacitySimd*(
  data: ptr UncheckedArray[uint8],
  len: int,
  opacity: uint16
) =
  ## Scales each of the `len` bytes at `data` in place to
  ## `(value * opacity) div 255`.
  let
    upperByte = mm_set1_epi16(cast[int16](0xff00))
    recip255 = mm_set1_epi16(cast[int16](0x8081))
    zeros = mm_setzero_si128()
    # Opacity moved into the high byte of every 16-bit lane.
    scale = mm_slli_epi16(mm_set1_epi16(cast[int16](opacity)), 8)
  var pos = 0

  var groupsLeft = len div 16
  while groupsLeft > 0:
    let chunk = mm_loadu_si128(data[pos].addr)
    # An all-zero vector stays all zero, so skip the arithmetic and store.
    if mm_movemask_epi8(mm_cmpeq_epi16(chunk, zeros)) != 0xffff:
      let
        # Even and odd bytes are each placed in the high byte of a 16-bit
        # lane and multiplied separately.
        prodEven = mm_mulhi_epu16(mm_slli_epi16(chunk, 8), scale)
        prodOdd = mm_mulhi_epu16(mm_and_si128(chunk, upperByte), scale)
        # Multiply-high by 0x8081 followed by a shift of 7 divides by 255.
        outEven = mm_srli_epi16(mm_mulhi_epu16(prodEven, recip255), 7)
        outOdd = mm_srli_epi16(mm_mulhi_epu16(prodOdd, recip255), 7)
      mm_storeu_si128(
        data[pos].addr,
        mm_or_si128(outEven, mm_slli_epi16(outOdd, 8))
      )
    pos += 16
    dec groupsLeft

  # Scalar tail: bytes that do not fill a whole vector.
  while pos < len:
    data[pos] = ((data[pos] * opacity) div 255).uint8
    inc pos
# In release builds, pop the most recently pushed pragma set. The matching
# `push` is not visible in this part of the file -- presumably it sits near
# the top under the same `release` condition; confirm before moving this.
when defined(release):
  {.pop.}