Merge pull request #453 from guzba/master
aligned sse2 + avx2 isOneColor isOpaque isTransparent, toPremultipliedAlphaAvx2
This commit is contained in:
commit
21c15a2680
8 changed files with 403 additions and 236 deletions
|
@ -194,6 +194,8 @@ block:
|
||||||
surface = imageSurfaceCreate(FORMAT_ARGB32, 900, 900)
|
surface = imageSurfaceCreate(FORMAT_ARGB32, 900, 900)
|
||||||
ctx = surface.create()
|
ctx = surface.create()
|
||||||
|
|
||||||
|
ctx.setLineWidth(1)
|
||||||
|
|
||||||
timeIt "[cairo] " & benchmark.name:
|
timeIt "[cairo] " & benchmark.name:
|
||||||
for fill in benchmark.fills:
|
for fill in benchmark.fills:
|
||||||
if fill.shapes.len > 0:
|
if fill.shapes.len > 0:
|
||||||
|
@ -221,6 +223,7 @@ block:
|
||||||
FillRuleEvenOdd
|
FillRuleEvenOdd
|
||||||
)
|
)
|
||||||
ctx.fill()
|
ctx.fill()
|
||||||
|
# ctx.stroke()
|
||||||
|
|
||||||
# discard surface.writeToPng(("cairo_" & benchmark.name & ".png").cstring)
|
# discard surface.writeToPng(("cairo_" & benchmark.name & ".png").cstring)
|
||||||
|
|
||||||
|
@ -242,5 +245,11 @@ block:
|
||||||
fill.transform,
|
fill.transform,
|
||||||
fill.windingRule
|
fill.windingRule
|
||||||
)
|
)
|
||||||
|
# image.strokePath(
|
||||||
|
# p,
|
||||||
|
# fill.paint,
|
||||||
|
# fill.transform,
|
||||||
|
# 1
|
||||||
|
# )
|
||||||
|
|
||||||
# image.writeFile("pixie_" & benchmark.name & ".png")
|
# image.writeFile("pixie_" & benchmark.name & ".png")
|
||||||
|
|
|
@ -10,7 +10,7 @@ requires "vmath >= 1.1.4"
|
||||||
requires "chroma >= 0.2.5"
|
requires "chroma >= 0.2.5"
|
||||||
requires "zippy >= 0.10.2"
|
requires "zippy >= 0.10.2"
|
||||||
requires "flatty >= 0.3.4"
|
requires "flatty >= 0.3.4"
|
||||||
requires "nimsimd >= 1.1.1"
|
requires "nimsimd >= 1.1.5"
|
||||||
requires "bumpy >= 1.1.1"
|
requires "bumpy >= 1.1.1"
|
||||||
|
|
||||||
task bindings, "Generate bindings":
|
task bindings, "Generate bindings":
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import chroma, flatty/binny, pixie/common, pixie/images, pixie/internal,
|
import chroma, flatty/binny, pixie/common, pixie/images, pixie/internal,
|
||||||
pixie/masks, sequtils, std/decls, strutils
|
pixie/masks, std/decls, std/sequtils, std/strutils
|
||||||
|
|
||||||
when defined(amd64) and allowSimd:
|
when defined(amd64) and allowSimd:
|
||||||
import nimsimd/sse2
|
import nimsimd/sse2
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import blends, bumpy, chroma, common, masks, pixie/internal, vmath
|
import blends, bumpy, chroma, common, masks, pixie/internal, vmath
|
||||||
|
|
||||||
when defined(amd64) and allowSimd:
|
when defined(amd64) and allowSimd:
|
||||||
import nimsimd/sse2
|
import nimsimd/sse2, runtimechecked/avx2
|
||||||
|
|
||||||
const h = 0.5.float32
|
const h = 0.5.float32
|
||||||
|
|
||||||
|
@ -101,54 +101,84 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} =
|
||||||
|
|
||||||
proc isOneColor*(image: Image): bool {.raises: [].} =
|
proc isOneColor*(image: Image): bool {.raises: [].} =
|
||||||
## Checks if the entire image is the same color.
|
## Checks if the entire image is the same color.
|
||||||
|
when defined(amd64) and allowSimd:
|
||||||
|
if cpuHasAvx2:
|
||||||
|
return isOneColorAvx2(image.data, 0, image.data.len)
|
||||||
|
|
||||||
result = true
|
result = true
|
||||||
|
|
||||||
let color = image.data[0]
|
let color = image.data[0]
|
||||||
|
|
||||||
var i: int
|
var i: int
|
||||||
when defined(amd64) and allowSimd:
|
when defined(amd64) and allowSimd:
|
||||||
let colorVec = mm_set1_epi32(cast[int32](color))
|
# Align to 16 bytes
|
||||||
for _ in 0 ..< image.data.len div 16:
|
var p = cast[uint](image.data[i].addr)
|
||||||
|
while i < image.data.len and (p and 15) != 0:
|
||||||
|
if image.data[i] != color:
|
||||||
|
return false
|
||||||
|
inc i
|
||||||
|
p += 4
|
||||||
|
|
||||||
|
let
|
||||||
|
colorVec = mm_set1_epi32(cast[int32](color))
|
||||||
|
iterations = (image.data.len - i) div 16
|
||||||
|
for _ in 0 ..< iterations:
|
||||||
let
|
let
|
||||||
values0 = mm_loadu_si128(image.data[i + 0].addr)
|
values0 = mm_load_si128(cast[pointer](p))
|
||||||
values1 = mm_loadu_si128(image.data[i + 4].addr)
|
values1 = mm_load_si128(cast[pointer](p + 16))
|
||||||
values2 = mm_loadu_si128(image.data[i + 8].addr)
|
values2 = mm_load_si128(cast[pointer](p + 32))
|
||||||
values3 = mm_loadu_si128(image.data[i + 12].addr)
|
values3 = mm_load_si128(cast[pointer](p + 48))
|
||||||
eq0 = mm_cmpeq_epi8(values0, colorVec)
|
eq0 = mm_cmpeq_epi8(values0, colorVec)
|
||||||
eq1 = mm_cmpeq_epi8(values1, colorVec)
|
eq1 = mm_cmpeq_epi8(values1, colorVec)
|
||||||
eq2 = mm_cmpeq_epi8(values2, colorVec)
|
eq2 = mm_cmpeq_epi8(values2, colorVec)
|
||||||
eq3 = mm_cmpeq_epi8(values3, colorVec)
|
eq3 = mm_cmpeq_epi8(values3, colorVec)
|
||||||
eq = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
|
eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
|
||||||
if mm_movemask_epi8(eq) != 0xffff:
|
if mm_movemask_epi8(eq0123) != 0xffff:
|
||||||
return false
|
return false
|
||||||
i += 16
|
p += 64
|
||||||
|
i += 16 * iterations
|
||||||
|
|
||||||
for j in i ..< image.data.len:
|
for i in i ..< image.data.len:
|
||||||
if image.data[j] != color:
|
if image.data[i] != color:
|
||||||
return false
|
return false
|
||||||
|
|
||||||
proc isTransparent*(image: Image): bool {.raises: [].} =
|
proc isTransparent*(image: Image): bool {.raises: [].} =
|
||||||
## Checks if this image is fully transparent or not.
|
## Checks if this image is fully transparent or not.
|
||||||
|
when defined(amd64) and allowSimd:
|
||||||
|
if cpuHasAvx2:
|
||||||
|
return isTransparentAvx2(image.data, 0, image.data.len)
|
||||||
|
|
||||||
result = true
|
result = true
|
||||||
|
|
||||||
var i: int
|
var i: int
|
||||||
when defined(amd64) and allowSimd:
|
when defined(amd64) and allowSimd:
|
||||||
let vecZero = mm_setzero_si128()
|
# Align to 16 bytes
|
||||||
for _ in 0 ..< image.data.len div 16:
|
var p = cast[uint](image.data[i].addr)
|
||||||
|
while i < image.data.len and (p and 15) != 0:
|
||||||
|
if image.data[i].a != 0:
|
||||||
|
return false
|
||||||
|
inc i
|
||||||
|
p += 4
|
||||||
|
|
||||||
|
let
|
||||||
|
vecZero = mm_setzero_si128()
|
||||||
|
iterations = (image.data.len - i) div 16
|
||||||
|
for _ in 0 ..< iterations:
|
||||||
let
|
let
|
||||||
values0 = mm_loadu_si128(image.data[i + 0].addr)
|
values0 = mm_load_si128(cast[pointer](p))
|
||||||
values1 = mm_loadu_si128(image.data[i + 4].addr)
|
values1 = mm_load_si128(cast[pointer](p + 16))
|
||||||
values2 = mm_loadu_si128(image.data[i + 8].addr)
|
values2 = mm_load_si128(cast[pointer](p + 32))
|
||||||
values3 = mm_loadu_si128(image.data[i + 12].addr)
|
values3 = mm_load_si128(cast[pointer](p + 48))
|
||||||
values01 = mm_or_si128(values0, values1)
|
values01 = mm_or_si128(values0, values1)
|
||||||
values23 = mm_or_si128(values2, values3)
|
values23 = mm_or_si128(values2, values3)
|
||||||
values = mm_or_si128(values01, values23)
|
values0123 = mm_or_si128(values01, values23)
|
||||||
if mm_movemask_epi8(mm_cmpeq_epi8(values, vecZero)) != 0xffff:
|
if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff:
|
||||||
return false
|
return false
|
||||||
i += 16
|
p += 64
|
||||||
|
i += 16 * iterations
|
||||||
|
|
||||||
for j in i ..< image.data.len:
|
for i in i ..< image.data.len:
|
||||||
if image.data[j].a != 0:
|
if image.data[i].a != 0:
|
||||||
return false
|
return false
|
||||||
|
|
||||||
proc isOpaque*(image: Image): bool {.raises: [].} =
|
proc isOpaque*(image: Image): bool {.raises: [].} =
|
||||||
|
|
|
@ -3,8 +3,10 @@ import bumpy, chroma, common, system/memory, vmath
|
||||||
const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
|
const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
|
||||||
|
|
||||||
when defined(amd64) and allowSimd:
|
when defined(amd64) and allowSimd:
|
||||||
import nimsimd/runtimecheck, nimsimd/sse2, simd/avx
|
import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx, runtimechecked/avx2
|
||||||
let cpuHasAvx* = checkInstructionSets({AVX})
|
let
|
||||||
|
cpuHasAvx* = checkInstructionSets({AVX})
|
||||||
|
cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
|
||||||
|
|
||||||
template currentExceptionAsPixieError*(): untyped =
|
template currentExceptionAsPixieError*(): untyped =
|
||||||
## Gets the current exception and returns it as a PixieError with stack trace.
|
## Gets the current exception and returns it as a PixieError with stack trace.
|
||||||
|
@ -141,70 +143,87 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].}
|
||||||
## Converts an image to premultiplied alpha from straight alpha.
|
## Converts an image to premultiplied alpha from straight alpha.
|
||||||
var i: int
|
var i: int
|
||||||
when defined(amd64) and allowSimd:
|
when defined(amd64) and allowSimd:
|
||||||
# When supported, SIMD convert as much as possible
|
if cpuHasAvx2:
|
||||||
let
|
i = toPremultipliedAlphaAvx2(data)
|
||||||
alphaMask = mm_set1_epi32(cast[int32](0xff000000))
|
else:
|
||||||
oddMask = mm_set1_epi16(cast[int16](0xff00))
|
|
||||||
div255 = mm_set1_epi16(cast[int16](0x8081))
|
|
||||||
for _ in 0 ..< data.len div 4:
|
|
||||||
let
|
let
|
||||||
values = mm_loadu_si128(data[i].addr)
|
alphaMask = mm_set1_epi32(cast[int32](0xff000000))
|
||||||
alpha = mm_and_si128(values, alphaMask)
|
oddMask = mm_set1_epi16(cast[int16](0xff00))
|
||||||
eq = mm_cmpeq_epi8(values, alphaMask)
|
div255 = mm_set1_epi16(cast[int16](0x8081))
|
||||||
if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
|
for _ in 0 ..< data.len div 4:
|
||||||
let
|
let
|
||||||
evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
|
values = mm_loadu_si128(data[i].addr)
|
||||||
oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
|
alpha = mm_and_si128(values, alphaMask)
|
||||||
var
|
eq = mm_cmpeq_epi8(values, alphaMask)
|
||||||
colorsEven = mm_slli_epi16(values, 8)
|
if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
|
||||||
colorsOdd = mm_and_si128(values, oddMask)
|
let
|
||||||
colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
|
evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
|
||||||
colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
|
oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
|
||||||
colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
|
var
|
||||||
colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
|
colorsEven = mm_slli_epi16(values, 8)
|
||||||
mm_storeu_si128(
|
colorsOdd = mm_and_si128(values, oddMask)
|
||||||
data[i].addr,
|
colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
|
||||||
mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
|
colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
|
||||||
)
|
colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
|
||||||
i += 4
|
colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
|
||||||
|
mm_storeu_si128(
|
||||||
|
data[i].addr,
|
||||||
|
mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
|
||||||
|
)
|
||||||
|
i += 4
|
||||||
|
|
||||||
# Convert whatever is left
|
# Convert whatever is left
|
||||||
for j in i ..< data.len:
|
for i in i ..< data.len:
|
||||||
var c = data[j]
|
var c = data[i]
|
||||||
if c.a != 255:
|
if c.a != 255:
|
||||||
c.r = ((c.r.uint32 * c.a.uint32) div 255).uint8
|
c.r = ((c.r.uint32 * c.a) div 255).uint8
|
||||||
c.g = ((c.g.uint32 * c.a.uint32) div 255).uint8
|
c.g = ((c.g.uint32 * c.a) div 255).uint8
|
||||||
c.b = ((c.b.uint32 * c.a.uint32) div 255).uint8
|
c.b = ((c.b.uint32 * c.a) div 255).uint8
|
||||||
data[j] = c
|
data[i] = c
|
||||||
|
|
||||||
proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool =
|
proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool =
|
||||||
|
when defined(amd64) and allowSimd:
|
||||||
|
if cpuHasAvx2 and len >= 64:
|
||||||
|
return isOpaqueAvx2(data, start, len)
|
||||||
|
|
||||||
result = true
|
result = true
|
||||||
|
|
||||||
var i = start
|
var i = start
|
||||||
when defined(amd64) and allowSimd:
|
when defined(amd64) and allowSimd:
|
||||||
let vec255 = mm_set1_epi32(cast[int32](uint32.high))
|
# Align to 16 bytes
|
||||||
for _ in start ..< (start + len) div 16:
|
var p = cast[uint](data[i].addr)
|
||||||
|
while i < (start + len) and (p and 15) != 0:
|
||||||
|
if data[i].a != 255:
|
||||||
|
return false
|
||||||
|
inc i
|
||||||
|
p += 4
|
||||||
|
|
||||||
|
let
|
||||||
|
vec255 = mm_set1_epi8(255)
|
||||||
|
iterations = (start + len - i) div 16
|
||||||
|
for _ in 0 ..< iterations:
|
||||||
let
|
let
|
||||||
values0 = mm_loadu_si128(data[i + 0].addr)
|
values0 = mm_load_si128(cast[pointer](p))
|
||||||
values1 = mm_loadu_si128(data[i + 4].addr)
|
values1 = mm_load_si128(cast[pointer](p + 16))
|
||||||
values2 = mm_loadu_si128(data[i + 8].addr)
|
values2 = mm_load_si128(cast[pointer](p + 32))
|
||||||
values3 = mm_loadu_si128(data[i + 12].addr)
|
values3 = mm_load_si128(cast[pointer](p + 48))
|
||||||
values01 = mm_and_si128(values0, values1)
|
values01 = mm_and_si128(values0, values1)
|
||||||
values23 = mm_and_si128(values2, values3)
|
values23 = mm_and_si128(values2, values3)
|
||||||
values = mm_and_si128(values01, values23)
|
values0123 = mm_and_si128(values01, values23)
|
||||||
eq = mm_cmpeq_epi8(values, vec255)
|
eq = mm_cmpeq_epi8(values0123, vec255)
|
||||||
if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
|
if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
|
||||||
return false
|
return false
|
||||||
i += 16
|
p += 64
|
||||||
|
i += 16 * iterations
|
||||||
|
|
||||||
for j in i ..< start + len:
|
for i in i ..< start + len:
|
||||||
if data[j].a != 255:
|
if data[i].a != 255:
|
||||||
return false
|
return false
|
||||||
|
|
||||||
when defined(amd64) and allowSimd:
|
when defined(amd64) and allowSimd:
|
||||||
proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
|
proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
|
||||||
let opacityVec = mm_set1_ps(opacity)
|
let opacityVec = mm_set1_ps(opacity)
|
||||||
var finalColor = mm_cvtps_epi32(mm_mul_ps(color, opacityVec))
|
var finalColor = mm_cvtps_epi32(mm_mul_ps(color, opacityVec))
|
||||||
finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
|
finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
|
||||||
finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
|
finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
|
||||||
cast[ColorRGBX](mm_cvtsi128_si32(finalColor))
|
cast[ColorRGBX](mm_cvtsi128_si32(finalColor))
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import blends, bumpy, chroma, common, fenv, images, internal, masks, paints,
|
import blends, bumpy, chroma, common, images, internal, masks, paints, std/fenv,
|
||||||
strutils, vmath
|
std/strutils, vmath
|
||||||
|
|
||||||
when defined(amd64) and allowSimd:
|
when defined(amd64) and allowSimd:
|
||||||
import nimsimd/sse2
|
import nimsimd/sse2
|
||||||
|
@ -1171,7 +1171,9 @@ proc partitionSegments(
|
||||||
|
|
||||||
var entryCounts = newSeq[int](numPartitions)
|
var entryCounts = newSeq[int](numPartitions)
|
||||||
for (segment, _) in segments:
|
for (segment, _) in segments:
|
||||||
for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight):
|
for partitionIndex in segment.partitionRange(
|
||||||
|
numPartitions, startY, partitionHeight
|
||||||
|
):
|
||||||
inc entryCounts[partitionIndex]
|
inc entryCounts[partitionIndex]
|
||||||
|
|
||||||
for partitionIndex, entryCounts in entryCounts:
|
for partitionIndex, entryCounts in entryCounts:
|
||||||
|
@ -1179,7 +1181,9 @@ proc partitionSegments(
|
||||||
|
|
||||||
var indexes = newSeq[int](numPartitions)
|
var indexes = newSeq[int](numPartitions)
|
||||||
for i, (segment, winding) in segments:
|
for i, (segment, winding) in segments:
|
||||||
for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight):
|
for partitionIndex in segment.partitionRange(
|
||||||
|
numPartitions, startY, partitionHeight
|
||||||
|
):
|
||||||
result[partitionIndex].entries[indexes[partitionIndex]] = entries[i]
|
result[partitionIndex].entries[indexes[partitionIndex]] = entries[i]
|
||||||
inc indexes[partitionIndex]
|
inc indexes[partitionIndex]
|
||||||
|
|
||||||
|
@ -1915,175 +1919,147 @@ proc fillShapes(
|
||||||
break
|
break
|
||||||
|
|
||||||
if allEntriesInScanlineSpanIt and tmp == 2:
|
if allEntriesInScanlineSpanIt and tmp == 2:
|
||||||
var at: Vec2
|
var
|
||||||
if not intersectsInside(
|
left = partitions[partitionIndex].entries[entryIndices[0]]
|
||||||
partitions[partitionIndex].entries[entryIndices[0]].segment,
|
right = partitions[partitionIndex].entries[entryIndices[1]]
|
||||||
partitions[partitionIndex].entries[entryIndices[1]].segment,
|
block:
|
||||||
at
|
# Ensure left is actually on the left
|
||||||
):
|
let
|
||||||
# We have 2 non-intersecting lines
|
maybeLeftMaxX = max(left.segment.at.x, left.segment.to.x)
|
||||||
var
|
maybeRightMaxX = max(right.segment.at.x, right.segment.to.x)
|
||||||
left = partitions[partitionIndex].entries[entryIndices[0]]
|
if maybeLeftMaxX > maybeRightMaxX:
|
||||||
right = partitions[partitionIndex].entries[entryIndices[1]]
|
swap left, right
|
||||||
block:
|
|
||||||
# Ensure left is actually on the left
|
|
||||||
let
|
|
||||||
maybeLeftMaxX = max(left.segment.at.x, left.segment.to.x)
|
|
||||||
maybeRightMaxX = max(right.segment.at.x, right.segment.to.x)
|
|
||||||
if maybeLeftMaxX > maybeRightMaxX:
|
|
||||||
swap left, right
|
|
||||||
|
|
||||||
let requiresAntiAliasing =
|
# Use trapezoid coverage at the edges and fill in the middle
|
||||||
left.segment.requiresAntiAliasing or
|
|
||||||
right.segment.requiresAntiAliasing
|
|
||||||
|
|
||||||
if requiresAntiAliasing:
|
when allowSimd and defined(amd64):
|
||||||
# We have 2 non-intersecting lines that require anti-aliasing
|
let vecRgbx = mm_set_ps(
|
||||||
# Use trapezoid coverage at the edges and fill in the middle
|
rgbx.a.float32,
|
||||||
|
rgbx.b.float32,
|
||||||
when allowSimd and defined(amd64):
|
rgbx.g.float32,
|
||||||
let vecRgbx = mm_set_ps(
|
rgbx.r.float32
|
||||||
rgbx.a.float32,
|
)
|
||||||
rgbx.b.float32,
|
|
||||||
rgbx.g.float32,
|
|
||||||
rgbx.r.float32
|
|
||||||
)
|
|
||||||
|
|
||||||
proc solveX(entry: PartitionEntry, y: float32): float32 =
|
|
||||||
if entry.m == 0:
|
|
||||||
entry.b
|
|
||||||
else:
|
|
||||||
(y - entry.b) / entry.m
|
|
||||||
|
|
||||||
proc solveY(entry: PartitionEntry, x: float32): float32 =
|
|
||||||
entry.m * x + entry.b
|
|
||||||
|
|
||||||
var
|
|
||||||
leftTop = vec2(0, y.float32)
|
|
||||||
leftBottom = vec2(0, (y + 1).float32)
|
|
||||||
leftTop.x = left.solveX(leftTop.y.float32)
|
|
||||||
leftBottom.x = left.solveX(leftBottom.y)
|
|
||||||
|
|
||||||
var
|
|
||||||
rightTop = vec2(0, y.float32)
|
|
||||||
rightBottom = vec2(0, (y + 1).float32)
|
|
||||||
rightTop.x = right.solveX(rightTop.y)
|
|
||||||
rightBottom.x = right.solveX(rightBottom.y)
|
|
||||||
|
|
||||||
let
|
|
||||||
# leftMinX = min(leftTop.x, leftBottom.x)
|
|
||||||
leftMaxX = max(leftTop.x, leftBottom.x)
|
|
||||||
rightMinX = min(rightTop.x, rightBottom.x)
|
|
||||||
# rightMaxX = max(rightTop.x, rightBottom.x)
|
|
||||||
# leftCoverBegin = leftMinX.trunc
|
|
||||||
leftCoverEnd = leftMaxX.ceil.int
|
|
||||||
rightCoverBegin = rightMinX.trunc.int
|
|
||||||
# rightCoverEnd = rightMaxX.ceil
|
|
||||||
|
|
||||||
if leftCoverEnd < rightCoverBegin:
|
|
||||||
# Only take this shortcut if the partial coverage areas on the
|
|
||||||
# left and the right do not overlap
|
|
||||||
|
|
||||||
let blender = blendMode.blender()
|
|
||||||
|
|
||||||
block: # Left-side partial coverage
|
|
||||||
let
|
|
||||||
inverted = leftTop.x < leftBottom.x
|
|
||||||
sliverStart = min(leftTop.x, leftBottom.x)
|
|
||||||
rectStart = max(leftTop.x, leftBottom.x)
|
|
||||||
var
|
|
||||||
pen = sliverStart
|
|
||||||
prevPen = pen
|
|
||||||
penY = if inverted: y.float32 else: (y + 1).float32
|
|
||||||
prevPenY = penY
|
|
||||||
for x in sliverStart.int ..< rectStart.ceil.int:
|
|
||||||
prevPen = pen
|
|
||||||
pen = (x + 1).float32
|
|
||||||
var rightRectArea = 0.float32
|
|
||||||
if pen > rectStart:
|
|
||||||
rightRectArea = pen - rectStart
|
|
||||||
pen = rectStart
|
|
||||||
prevPenY = penY
|
|
||||||
penY = left.solveY(pen)
|
|
||||||
if x < 0 or x >= image.width:
|
|
||||||
continue
|
|
||||||
let
|
|
||||||
run = pen - prevPen
|
|
||||||
triangleArea = 0.5.float32 * run * abs(penY - prevPenY)
|
|
||||||
rectArea =
|
|
||||||
if inverted:
|
|
||||||
(prevPenY - y.float32) * run
|
|
||||||
else:
|
|
||||||
((y + 1).float32 - prevPenY) * run
|
|
||||||
area = triangleArea + rectArea + rightRectArea
|
|
||||||
dataIndex = image.dataIndex(x, y)
|
|
||||||
backdrop = image.data[dataIndex]
|
|
||||||
source =
|
|
||||||
when allowSimd and defined(amd64):
|
|
||||||
applyOpacity(vecRgbx, area)
|
|
||||||
else:
|
|
||||||
rgbx * area
|
|
||||||
image.data[dataIndex] = blender(backdrop, source)
|
|
||||||
|
|
||||||
block: # Right-side partial coverage
|
|
||||||
let
|
|
||||||
inverted = rightTop.x > rightBottom.x
|
|
||||||
rectEnd = min(rightTop.x, rightBottom.x)
|
|
||||||
sliverEnd = max(rightTop.x, rightBottom.x)
|
|
||||||
var
|
|
||||||
pen = rectEnd
|
|
||||||
prevPen = pen
|
|
||||||
penY = if inverted: (y + 1).float32 else: y.float32
|
|
||||||
prevPenY = penY
|
|
||||||
for x in rectEnd.int ..< sliverEnd.ceil.int:
|
|
||||||
prevPen = pen
|
|
||||||
pen = (x + 1).float32
|
|
||||||
let leftRectArea = prevPen.fractional
|
|
||||||
if pen > sliverEnd:
|
|
||||||
pen = sliverEnd
|
|
||||||
prevPenY = penY
|
|
||||||
penY = right.solveY(pen)
|
|
||||||
if x < 0 or x >= image.width:
|
|
||||||
continue
|
|
||||||
let
|
|
||||||
run = pen - prevPen
|
|
||||||
triangleArea = 0.5.float32 * run * abs(penY - prevPenY)
|
|
||||||
rectArea =
|
|
||||||
if inverted:
|
|
||||||
(penY - y.float32) * run
|
|
||||||
else:
|
|
||||||
((y + 1).float32 - penY) * run
|
|
||||||
area = leftRectArea + triangleArea + rectArea
|
|
||||||
dataIndex = image.dataIndex(x, y)
|
|
||||||
backdrop = image.data[dataIndex]
|
|
||||||
source =
|
|
||||||
when allowSimd and defined(amd64):
|
|
||||||
applyOpacity(vecRgbx, area)
|
|
||||||
else:
|
|
||||||
rgbx * area
|
|
||||||
image.data[dataIndex] = blender(backdrop, source)
|
|
||||||
|
|
||||||
let
|
|
||||||
fillBegin = leftCoverEnd.clamp(0, image.width)
|
|
||||||
fillEnd = rightCoverBegin.clamp(0, image.width)
|
|
||||||
if fillEnd - fillBegin > 0:
|
|
||||||
hits[0] = (fixed32(fillBegin.float32), 1.int16)
|
|
||||||
hits[1] = (fixed32(fillEnd.float32), -1.int16)
|
|
||||||
image.fillHits(rgbx, 0, y, hits, 2, NonZero, blendMode)
|
|
||||||
|
|
||||||
inc y
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
proc solveX(entry: PartitionEntry, y: float32): float32 =
|
||||||
|
if entry.m == 0:
|
||||||
|
entry.b
|
||||||
else:
|
else:
|
||||||
|
(y - entry.b) / entry.m
|
||||||
|
|
||||||
|
proc solveY(entry: PartitionEntry, x: float32): float32 =
|
||||||
|
entry.m * x + entry.b
|
||||||
|
|
||||||
|
var
|
||||||
|
leftTop = vec2(0, y.float32)
|
||||||
|
leftBottom = vec2(0, (y + 1).float32)
|
||||||
|
leftTop.x = left.solveX(leftTop.y.float32)
|
||||||
|
leftBottom.x = left.solveX(leftBottom.y)
|
||||||
|
|
||||||
|
var
|
||||||
|
rightTop = vec2(0, y.float32)
|
||||||
|
rightBottom = vec2(0, (y + 1).float32)
|
||||||
|
rightTop.x = right.solveX(rightTop.y)
|
||||||
|
rightBottom.x = right.solveX(rightBottom.y)
|
||||||
|
|
||||||
|
let
|
||||||
|
leftMaxX = max(leftTop.x, leftBottom.x)
|
||||||
|
rightMinX = min(rightTop.x, rightBottom.x)
|
||||||
|
leftCoverEnd = leftMaxX.ceil.int
|
||||||
|
rightCoverBegin = rightMinX.trunc.int
|
||||||
|
|
||||||
|
if leftCoverEnd < rightCoverBegin:
|
||||||
|
# Only take this shortcut if the partial coverage areas on the
|
||||||
|
# left and the right do not overlap
|
||||||
|
|
||||||
|
let blender = blendMode.blender()
|
||||||
|
|
||||||
|
block: # Left-side partial coverage
|
||||||
let
|
let
|
||||||
minX = left.segment.at.x.int.clamp(0, image.width)
|
inverted = leftTop.x < leftBottom.x
|
||||||
maxX = right.segment.at.x.int.clamp(0, image.width)
|
sliverStart = min(leftTop.x, leftBottom.x)
|
||||||
hits[0] = (cast[Fixed32](minX * 256), 1.int16)
|
rectStart = max(leftTop.x, leftBottom.x)
|
||||||
hits[1] = (cast[Fixed32](maxX * 256), -1.int16)
|
var
|
||||||
|
pen = sliverStart
|
||||||
|
prevPen = pen
|
||||||
|
penY = if inverted: y.float32 else: (y + 1).float32
|
||||||
|
prevPenY = penY
|
||||||
|
for x in sliverStart.int ..< rectStart.ceil.int:
|
||||||
|
prevPen = pen
|
||||||
|
pen = (x + 1).float32
|
||||||
|
var rightRectArea = 0.float32
|
||||||
|
if pen > rectStart:
|
||||||
|
rightRectArea = pen - rectStart
|
||||||
|
pen = rectStart
|
||||||
|
prevPenY = penY
|
||||||
|
penY = left.solveY(pen)
|
||||||
|
if x < 0 or x >= image.width:
|
||||||
|
continue
|
||||||
|
let
|
||||||
|
run = pen - prevPen
|
||||||
|
triangleArea = 0.5.float32 * run * abs(penY - prevPenY)
|
||||||
|
rectArea =
|
||||||
|
if inverted:
|
||||||
|
(prevPenY - y.float32) * run
|
||||||
|
else:
|
||||||
|
((y + 1).float32 - prevPenY) * run
|
||||||
|
area = triangleArea + rectArea + rightRectArea
|
||||||
|
dataIndex = image.dataIndex(x, y)
|
||||||
|
backdrop = image.data[dataIndex]
|
||||||
|
source =
|
||||||
|
when allowSimd and defined(amd64):
|
||||||
|
applyOpacity(vecRgbx, area)
|
||||||
|
else:
|
||||||
|
rgbx * area
|
||||||
|
image.data[dataIndex] = blender(backdrop, source)
|
||||||
|
|
||||||
|
block: # Right-side partial coverage
|
||||||
|
let
|
||||||
|
inverted = rightTop.x > rightBottom.x
|
||||||
|
rectEnd = min(rightTop.x, rightBottom.x)
|
||||||
|
sliverEnd = max(rightTop.x, rightBottom.x)
|
||||||
|
var
|
||||||
|
pen = rectEnd
|
||||||
|
prevPen = pen
|
||||||
|
penY = if inverted: (y + 1).float32 else: y.float32
|
||||||
|
prevPenY = penY
|
||||||
|
for x in rectEnd.int ..< sliverEnd.ceil.int:
|
||||||
|
prevPen = pen
|
||||||
|
pen = (x + 1).float32
|
||||||
|
let leftRectArea = prevPen.fractional
|
||||||
|
if pen > sliverEnd:
|
||||||
|
pen = sliverEnd
|
||||||
|
prevPenY = penY
|
||||||
|
penY = right.solveY(pen)
|
||||||
|
if x < 0 or x >= image.width:
|
||||||
|
continue
|
||||||
|
let
|
||||||
|
run = pen - prevPen
|
||||||
|
triangleArea = 0.5.float32 * run * abs(penY - prevPenY)
|
||||||
|
rectArea =
|
||||||
|
if inverted:
|
||||||
|
(penY - y.float32) * run
|
||||||
|
else:
|
||||||
|
((y + 1).float32 - penY) * run
|
||||||
|
area = leftRectArea + triangleArea + rectArea
|
||||||
|
dataIndex = image.dataIndex(x, y)
|
||||||
|
backdrop = image.data[dataIndex]
|
||||||
|
source =
|
||||||
|
when allowSimd and defined(amd64):
|
||||||
|
applyOpacity(vecRgbx, area)
|
||||||
|
else:
|
||||||
|
rgbx * area
|
||||||
|
image.data[dataIndex] = blender(backdrop, source)
|
||||||
|
|
||||||
|
let
|
||||||
|
fillBegin = leftCoverEnd.clamp(0, image.width)
|
||||||
|
fillEnd = rightCoverBegin.clamp(0, image.width)
|
||||||
|
if fillEnd - fillBegin > 0:
|
||||||
|
hits[0] = (fixed32(fillBegin.float32), 1.int16)
|
||||||
|
hits[1] = (fixed32(fillEnd.float32), -1.int16)
|
||||||
image.fillHits(rgbx, 0, y, hits, 2, NonZero, blendMode)
|
image.fillHits(rgbx, 0, y, hits, 2, NonZero, blendMode)
|
||||||
|
|
||||||
inc y
|
inc y
|
||||||
continue
|
continue
|
||||||
|
|
||||||
computeCoverage(
|
computeCoverage(
|
||||||
cast[ptr UncheckedArray[uint8]](coverages[0].addr),
|
cast[ptr UncheckedArray[uint8]](coverages[0].addr),
|
||||||
|
|
133
src/pixie/runtimechecked/avx2.nim
Normal file
133
src/pixie/runtimechecked/avx2.nim
Normal file
|
@ -0,0 +1,133 @@
|
||||||
|
import chroma, nimsimd/avx2
|
||||||
|
|
||||||
|
when defined(gcc) or defined(clang):
|
||||||
|
{.localPassc: "-mavx2".}
|
||||||
|
|
||||||
|
when defined(release):
|
||||||
|
{.push checks: off.}
|
||||||
|
|
||||||
|
proc isOneColorAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
  ## Checks whether every pixel in `data[start ..< start + len]` has the same
  ## value as the first pixel of that range.
  ##
  ## Returns true for an empty range (`len <= 0`) and false as soon as a
  ## differing pixel is found.
  result = true

  if len <= 0:
    # Nothing to compare, and `data[start]` may not exist.
    return

  let
    stop = start + len
    # The reference color comes from the start of the range being checked,
    # not from the start of the whole seq.
    color = data[start]

  var
    i = start
    p = cast[uint](data[i].addr)

  # Compare one pixel at a time (the pointer moves 4 bytes per pixel) until
  # `p` is 32 byte aligned, which the aligned loads below rely on.
  while i < stop and (p and 31) != 0:
    if data[i] != color:
      return false
    inc i
    p += 4

  let
    colorVec = mm256_set1_epi32(cast[int32](color))
    iterations = (stop - i) div 16

  # Each iteration covers 16 pixels (64 bytes) with two 256-bit loads.
  for _ in 0 ..< iterations:
    let
      values0 = mm256_load_si256(cast[pointer](p))
      values1 = mm256_load_si256(cast[pointer](p + 32))
      eq0 = mm256_cmpeq_epi8(values0, colorVec)
      eq1 = mm256_cmpeq_epi8(values1, colorVec)
      eq01 = mm256_and_si256(eq0, eq1)
    # All 32 mask bits are set only when every byte matched in both loads.
    if mm256_movemask_epi8(eq01) != cast[int32](0xffffffff):
      return false
    p += 64
  i += 16 * iterations

  # Pixels left over after the last full 16 pixel chunk.
  for j in i ..< stop:
    if data[j] != color:
      return false
|
||||||
|
|
||||||
|
proc isTransparentAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
  ## Checks whether the pixels in `data[start ..< start + len]` are fully
  ## transparent.
  ##
  ## Returns true for an empty range (`len <= 0`) and false as soon as a
  ## non-transparent pixel is found.
  ##
  ## The scalar loops test only the alpha channel, while the vectorized loop
  ## requires every byte of the 16 pixels it covers to be zero.
  ## NOTE(review): these two checks agree only when zero alpha implies zero
  ## color channels (premultiplied data) - confirm against callers.
  result = true

  if len <= 0:
    # Nothing to inspect, and `data[start]` may not exist.
    return

  let stop = start + len

  var
    i = start
    p = cast[uint](data[i].addr)

  # Check one pixel at a time (the pointer moves 4 bytes per pixel) until
  # `p` is 32 byte aligned, which the aligned loads below rely on.
  while i < stop and (p and 31) != 0:
    if data[i].a != 0:
      return false
    inc i
    p += 4

  let
    vecZero = mm256_setzero_si256()
    iterations = (stop - i) div 16

  # Each iteration covers 16 pixels (64 bytes) with two 256-bit loads.
  for _ in 0 ..< iterations:
    let
      values0 = mm256_load_si256(cast[pointer](p))
      values1 = mm256_load_si256(cast[pointer](p + 32))
      # OR-ing the loads leaves a zero byte only where both inputs were zero.
      values01 = mm256_or_si256(values0, values1)
      eq = mm256_cmpeq_epi8(values01, vecZero)
    if mm256_movemask_epi8(eq) != cast[int32](0xffffffff):
      return false
    p += 64
  i += 16 * iterations

  # Pixels left over after the last full 16 pixel chunk.
  for j in i ..< stop:
    if data[j].a != 0:
      return false
|
||||||
|
|
||||||
|
proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
  ## Checks whether every pixel in `data[start ..< start + len]` has an alpha
  ## value of 255.
  ##
  ## Returns true for an empty range (`len <= 0`) and false as soon as a
  ## pixel with a different alpha is found.
  result = true

  if len <= 0:
    # Nothing to inspect, and `data[start]` may not exist.
    return

  let stop = start + len

  var
    i = start
    p = cast[uint](data[i].addr)

  # Check one pixel at a time (the pointer moves 4 bytes per pixel) until
  # `p` is 32 byte aligned, which the aligned loads below rely on.
  while i < stop and (p and 31) != 0:
    if data[i].a != 255:
      return false
    inc i
    p += 4

  let
    vec255 = mm256_set1_epi8(255)
    iterations = (stop - i) div 16

  # Each iteration covers 16 pixels (64 bytes) with two 256-bit loads.
  for _ in 0 ..< iterations:
    let
      values0 = mm256_load_si256(cast[pointer](p))
      values1 = mm256_load_si256(cast[pointer](p + 32))
      # AND-ing the loads keeps a byte at 255 only if it was 255 in both.
      values01 = mm256_and_si256(values0, values1)
      eq = mm256_cmpeq_epi8(values01, vec255)
    # 0x88888888 keeps the mask bit of every fourth byte (byte 3 of each
    # pixel), presumably the alpha channel that the scalar loops test.
    if (mm256_movemask_epi8(eq) and 0x88888888) != 0x88888888:
      return false
    p += 64
  i += 16 * iterations

  # Pixels left over after the last full 16 pixel chunk.
  for j in i ..< stop:
    if data[j].a != 255:
      return false
|
||||||
|
|
||||||
|
proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]): int =
  ## Multiplies the color channels of `data` by alpha in place, 8 pixels
  ## (32 bytes) at a time.
  ##
  ## Returns the number of pixels handled, which is `data.len` rounded down
  ## to a multiple of 8. Pixels from that index onward are left untouched.
  let
    maskAlpha = mm256_set1_epi32(cast[int32](0xff000000))
    maskOdd = mm256_set1_epi16(cast[int16](0xff00))
    vecDiv255 = mm256_set1_epi16(cast[int16](0x8081))
    chunks = data.len div 8

  for chunk in 0 ..< chunks:
    let
      offset = chunk * 8
      pixels = mm256_loadu_si256(data[offset].addr)
      alphaOnly = mm256_and_si256(pixels, maskAlpha)
      opaqueCmp = mm256_cmpeq_epi8(pixels, maskAlpha)

    # Skip the chunk when the top byte of all 8 pixels is already 0xff.
    if (mm256_movemask_epi8(opaqueCmp) and 0x88888888) == 0x88888888:
      continue

    let
      # Alpha copied into the high byte of both 16-bit halves of each pixel.
      multEven = mm256_or_si256(alphaOnly, mm256_srli_epi32(alphaOnly, 16))
      # Same, but the upper half is forced to 0xff00 so the alpha byte itself
      # is scaled by 255.
      multOdd = mm256_or_si256(multEven, maskAlpha)

    var
      # Even bytes moved into the high byte of each 16-bit lane.
      even = mm256_slli_epi16(pixels, 8)
      # Odd bytes are already in the high byte; clear the low byte.
      odd = mm256_and_si256(pixels, maskOdd)

    even = mm256_mulhi_epu16(even, multEven)
    odd = mm256_mulhi_epu16(odd, multOdd)
    # High multiply by 0x8081 followed by a shift right of 7 divides by 255.
    even = mm256_srli_epi16(mm256_mulhi_epu16(even, vecDiv255), 7)
    odd = mm256_srli_epi16(mm256_mulhi_epu16(odd, vecDiv255), 7)

    # Recombine: even results in the low byte, odd results in the high byte.
    mm256_storeu_si256(
      data[offset].addr,
      mm256_or_si256(even, mm256_slli_epi16(odd, 8))
    )

  result = chunks * 8
|
||||||
|
|
||||||
|
when defined(release):
|
||||||
|
{.pop.}
|
Loading…
Reference in a new issue