Merge pull request #79 from guzba/master
tiger 2x faster, uint8 coverage (less mem), small things
This commit is contained in:
commit
7bcb138c6f
7 changed files with 102 additions and 85 deletions
|
@ -1,4 +1,4 @@
|
||||||
import cairo, math, benchy, pixie, chroma
|
import cairo, math, benchy, pixie, pixie/paths, chroma
|
||||||
|
|
||||||
var
|
var
|
||||||
surface = imageSurfaceCreate(FORMAT_ARGB32, 1000, 1000)
|
surface = imageSurfaceCreate(FORMAT_ARGB32, 1000, 1000)
|
||||||
|
|
|
@ -10,5 +10,5 @@ requires "vmath >= 0.4.0"
|
||||||
requires "chroma >= 0.2.1"
|
requires "chroma >= 0.2.1"
|
||||||
requires "zippy >= 0.3.5"
|
requires "zippy >= 0.3.5"
|
||||||
requires "flatty >= 0.1.3"
|
requires "flatty >= 0.1.3"
|
||||||
requires "nimsimd >= 0.4.6"
|
requires "nimsimd >= 0.4.8"
|
||||||
requires "bumpy >= 1.0.1"
|
requires "bumpy >= 1.0.1"
|
||||||
|
|
|
@ -299,16 +299,16 @@ when defined(amd64) and not defined(pixieNoSimd):
|
||||||
else:
|
else:
|
||||||
proc alphaFix(backdrop, source, mixed: ColorRGBA): ColorRGBA {.inline.} =
|
proc alphaFix(backdrop, source, mixed: ColorRGBA): ColorRGBA {.inline.} =
|
||||||
let
|
let
|
||||||
sa = source.a.int32
|
sa = source.a.uint32
|
||||||
ba = backdrop.a.int32
|
ba = backdrop.a.uint32
|
||||||
t0 = sa * (255 - ba)
|
t0 = sa * (255 - ba)
|
||||||
t1 = sa * ba
|
t1 = sa * ba
|
||||||
t2 = (255 - sa) * ba
|
t2 = (255 - sa) * ba
|
||||||
|
|
||||||
let
|
let
|
||||||
r = t0 * source.r.int32 + t1 * mixed.r.int32 + t2 * backdrop.r.int32
|
r = t0 * source.r.uint32 + t1 * mixed.r.uint32 + t2 * backdrop.r.uint32
|
||||||
g = t0 * source.g.int32 + t1 * mixed.g.int32 + t2 * backdrop.g.int32
|
g = t0 * source.g.uint32 + t1 * mixed.g.uint32 + t2 * backdrop.g.uint32
|
||||||
b = t0 * source.b.int32 + t1 * mixed.b.int32 + t2 * backdrop.b.int32
|
b = t0 * source.b.uint32 + t1 * mixed.b.uint32 + t2 * backdrop.b.uint32
|
||||||
a = sa + ba * (255 - sa) div 255
|
a = sa + ba * (255 - sa) div 255
|
||||||
|
|
||||||
if a == 0:
|
if a == 0:
|
||||||
|
|
|
@ -47,8 +47,9 @@ proc toPremultipliedAlpha*(c: Color): Color {.inline.} =
|
||||||
|
|
||||||
proc toStraightAlpha*(c: Color): Color {.inline.} =
|
proc toStraightAlpha*(c: Color): Color {.inline.} =
|
||||||
## Converts a color from premultiplied alpha to straight alpha.
|
## Converts a color from premultiplied alpha to straight alpha.
|
||||||
if c.a == 0:
|
if c.a != 0 and c.a != 1:
|
||||||
return
|
result = c
|
||||||
|
else:
|
||||||
result.r = c.r / c.a
|
result.r = c.r / c.a
|
||||||
result.g = c.g / c.a
|
result.g = c.g / c.a
|
||||||
result.b = c.b / c.a
|
result.b = c.b / c.a
|
||||||
|
|
|
@ -437,10 +437,9 @@ proc encodePng*(
|
||||||
raise newException(PixieError, "Invalid PNG number of channels")
|
raise newException(PixieError, "Invalid PNG number of channels")
|
||||||
|
|
||||||
let data = cast[ptr UncheckedArray[uint8]](data)
|
let data = cast[ptr UncheckedArray[uint8]](data)
|
||||||
const signature = [137.uint8, 80, 78, 71, 13, 10, 26, 10]
|
|
||||||
|
|
||||||
# Add the PNG file signature
|
# Add the PNG file signature
|
||||||
result.add(signature)
|
result.add(pngSignature)
|
||||||
|
|
||||||
# Add IHDR
|
# Add IHDR
|
||||||
result.addUint32(13.uint32.swap())
|
result.addUint32(13.uint32.swap())
|
||||||
|
|
|
@ -66,7 +66,7 @@ proc `[]=`*(image: Image, x, y: int, rgba: ColorRGBA) {.inline.} =
|
||||||
image.setRgbaUnsafe(x, y, rgba)
|
image.setRgbaUnsafe(x, y, rgba)
|
||||||
|
|
||||||
proc fillUnsafe(data: var seq[ColorRGBA], rgba: ColorRGBA, start, len: int) =
|
proc fillUnsafe(data: var seq[ColorRGBA], rgba: ColorRGBA, start, len: int) =
|
||||||
## Fills the image data with a solid color starting at index start and
|
## Fills the image data with the parameter color starting at index start and
|
||||||
## continuing for len indices.
|
## continuing for len indices.
|
||||||
|
|
||||||
# Use memset when every byte has the same value
|
# Use memset when every byte has the same value
|
||||||
|
@ -95,7 +95,7 @@ proc fillUnsafe(data: var seq[ColorRGBA], rgba: ColorRGBA, start, len: int) =
|
||||||
data[j] = rgba
|
data[j] = rgba
|
||||||
|
|
||||||
proc fill*(image: Image, rgba: ColorRgba) {.inline.} =
|
proc fill*(image: Image, rgba: ColorRgba) {.inline.} =
|
||||||
## Fills the image with a solid color.
|
## Fills the image with the parameter color.
|
||||||
fillUnsafe(image.data, rgba, 0, image.data.len)
|
fillUnsafe(image.data, rgba, 0, image.data.len)
|
||||||
|
|
||||||
proc flipHorizontal*(image: Image) =
|
proc flipHorizontal*(image: Image) =
|
||||||
|
@ -234,7 +234,7 @@ proc invert*(image: Image) =
|
||||||
## Inverts all of the colors and alpha.
|
## Inverts all of the colors and alpha.
|
||||||
var i: int
|
var i: int
|
||||||
when defined(amd64) and not defined(pixieNoSimd):
|
when defined(amd64) and not defined(pixieNoSimd):
|
||||||
let vec255 = mm_set1_epi8(255)
|
let vec255 = mm_set1_epi8(cast[int8](255))
|
||||||
while i < image.data.len - 4:
|
while i < image.data.len - 4:
|
||||||
var m = mm_loadu_si128(image.data[i].addr)
|
var m = mm_loadu_si128(image.data[i].addr)
|
||||||
m = mm_sub_epi8(vec255, m)
|
m = mm_sub_epi8(vec255, m)
|
||||||
|
@ -251,18 +251,18 @@ proc invert*(image: Image) =
|
||||||
proc getRgbaSmooth*(image: Image, x, y: float32): ColorRGBA {.inline.} =
|
proc getRgbaSmooth*(image: Image, x, y: float32): ColorRGBA {.inline.} =
|
||||||
let
|
let
|
||||||
minX = x.floor.int
|
minX = x.floor.int
|
||||||
difX = x - x.floor
|
diffX = x - x.floor
|
||||||
minY = y.floor.int
|
minY = y.floor.int
|
||||||
difY = y - y.floor
|
diffY = y - y.floor
|
||||||
|
|
||||||
vX0Y0 = image[minX, minY].toPremultipliedAlpha()
|
x0y0 = image[minX, minY].toPremultipliedAlpha()
|
||||||
vX1Y0 = image[minX + 1, minY].toPremultipliedAlpha()
|
x1y0 = image[minX + 1, minY].toPremultipliedAlpha()
|
||||||
vX0Y1 = image[minX, minY + 1].toPremultipliedAlpha()
|
x0y1 = image[minX, minY + 1].toPremultipliedAlpha()
|
||||||
vX1Y1 = image[minX + 1, minY + 1].toPremultipliedAlpha()
|
x1y1 = image[minX + 1, minY + 1].toPremultipliedAlpha()
|
||||||
|
|
||||||
bottomMix = lerp(vX0Y0, vX1Y0, difX)
|
bottomMix = lerp(x0y0, x1y0, diffX)
|
||||||
topMix = lerp(vX0Y1, vX1Y1, difX)
|
topMix = lerp(x0y1, x1y1, diffX)
|
||||||
finalMix = lerp(bottomMix, topMix, difY)
|
finalMix = lerp(bottomMix, topMix, diffY)
|
||||||
|
|
||||||
finalMix.toStraightAlpha()
|
finalMix.toStraightAlpha()
|
||||||
|
|
||||||
|
@ -376,6 +376,7 @@ proc blurAlpha*(image: Image, radius: float32) =
|
||||||
|
|
||||||
proc shift*(image: Image, offset: Vec2) =
|
proc shift*(image: Image, offset: Vec2) =
|
||||||
## Shifts the image by offset.
|
## Shifts the image by offset.
|
||||||
|
if offset != vec2(0, 0):
|
||||||
let copy = image.copy() # Copy to read from.
|
let copy = image.copy() # Copy to read from.
|
||||||
image.fill(rgba(0, 0, 0, 0)) # Reset this for being drawn to.
|
image.fill(rgba(0, 0, 0, 0)) # Reset this for being drawn to.
|
||||||
image.draw(copy, offset) # Draw copy into image.
|
image.draw(copy, offset) # Draw copy into image.
|
||||||
|
@ -465,7 +466,7 @@ proc drawCorrect*(a, b: Image, mat: Mat3, blendMode: BlendMode) =
|
||||||
proc drawUber(
|
proc drawUber(
|
||||||
a, b: Image,
|
a, b: Image,
|
||||||
p, dx, dy: Vec2,
|
p, dx, dy: Vec2,
|
||||||
lines: array[0..3, Segment],
|
segments: array[0..3, Segment],
|
||||||
blendMode: BlendMode,
|
blendMode: BlendMode,
|
||||||
smooth: bool
|
smooth: bool
|
||||||
) =
|
) =
|
||||||
|
@ -475,13 +476,13 @@ proc drawUber(
|
||||||
xMin = a.width
|
xMin = a.width
|
||||||
xMax = 0
|
xMax = 0
|
||||||
for yOffset in [0.float32, 1]:
|
for yOffset in [0.float32, 1]:
|
||||||
var scanLine = segment(
|
var scanLine = Line(
|
||||||
vec2(-100000, y.float32 + yOffset),
|
a: vec2(-1000, y.float32 + yOffset),
|
||||||
vec2(10000, y.float32 + yOffset)
|
b: vec2(1000, y.float32 + yOffset)
|
||||||
)
|
)
|
||||||
for l in lines:
|
for segment in segments:
|
||||||
var at: Vec2
|
var at: Vec2
|
||||||
if intersects(l, scanLine, at) and l.to != at:
|
if scanline.intersects(segment, at) and segment.to != at:
|
||||||
xMin = min(xMin, at.x.floor.int)
|
xMin = min(xMin, at.x.floor.int)
|
||||||
xMax = max(xMax, at.x.ceil.int)
|
xMax = max(xMax, at.x.ceil.int)
|
||||||
|
|
||||||
|
@ -519,7 +520,7 @@ proc draw*(a, b: Image, mat: Mat3, blendMode: BlendMode) =
|
||||||
mat * vec2(b.width.float32, b.height.float32),
|
mat * vec2(b.width.float32, b.height.float32),
|
||||||
mat * vec2(0, b.height.float32)
|
mat * vec2(0, b.height.float32)
|
||||||
]
|
]
|
||||||
lines = [
|
segments = [
|
||||||
segment(corners[0], corners[1]),
|
segment(corners[0], corners[1]),
|
||||||
segment(corners[1], corners[2]),
|
segment(corners[1], corners[2]),
|
||||||
segment(corners[2], corners[3]),
|
segment(corners[2], corners[3]),
|
||||||
|
@ -543,10 +544,14 @@ proc draw*(a, b: Image, mat: Mat3, blendMode: BlendMode) =
|
||||||
minFilterBy2 /= 2
|
minFilterBy2 /= 2
|
||||||
matInv = matInv * scale(vec2(0.5, 0.5))
|
matInv = matInv * scale(vec2(0.5, 0.5))
|
||||||
|
|
||||||
let smooth = not(dx.length == 1.0 and dy.length == 1.0 and
|
let smooth = not(
|
||||||
mat[2, 0].fractional == 0.0 and mat[2, 1].fractional == 0.0)
|
dx.length == 1.0 and
|
||||||
|
dy.length == 1.0 and
|
||||||
|
mat[2, 0].fractional == 0.0 and
|
||||||
|
mat[2, 1].fractional == 0.0
|
||||||
|
)
|
||||||
|
|
||||||
a.drawUber(b, p, dx, dy, lines, blendMode, smooth)
|
a.drawUber(b, p, dx, dy, segments, blendMode, smooth)
|
||||||
|
|
||||||
proc draw*(a, b: Image, pos = vec2(0, 0), blendMode = bmNormal) {.inline.} =
|
proc draw*(a, b: Image, pos = vec2(0, 0), blendMode = bmNormal) {.inline.} =
|
||||||
a.draw(b, translate(pos), blendMode)
|
a.draw(b, translate(pos), blendMode)
|
||||||
|
|
|
@ -744,13 +744,14 @@ proc quickSort(a: var seq[(float32, bool)], inl, inr: int) =
|
||||||
quickSort(a, inl, r)
|
quickSort(a, inl, r)
|
||||||
quickSort(a, l, inr)
|
quickSort(a, l, inr)
|
||||||
|
|
||||||
proc computeBounds(shape: seq[Vec2]): Rect =
|
proc computeBounds(shapes: seq[seq[(Segment, bool)]]): Rect =
|
||||||
var
|
var
|
||||||
xMin = float32.high
|
xMin = float32.high
|
||||||
xMax = float32.low
|
xMax = float32.low
|
||||||
yMin = float32.high
|
yMin = float32.high
|
||||||
yMax = float32.low
|
yMax = float32.low
|
||||||
for segment in shape.segments:
|
for shape in shapes:
|
||||||
|
for (segment, _) in shape:
|
||||||
xMin = min(xMin, min(segment.at.x, segment.to.x))
|
xMin = min(xMin, min(segment.at.x, segment.to.x))
|
||||||
xMax = max(xMax, max(segment.at.x, segment.to.x))
|
xMax = max(xMax, max(segment.at.x, segment.to.x))
|
||||||
yMin = min(yMin, min(segment.at.y, segment.to.y))
|
yMin = min(yMin, min(segment.at.y, segment.to.y))
|
||||||
|
@ -775,36 +776,23 @@ proc fillShapes(
|
||||||
var sortedShapes = newSeq[seq[(Segment, bool)]](shapes.len)
|
var sortedShapes = newSeq[seq[(Segment, bool)]](shapes.len)
|
||||||
for i, sorted in sortedShapes.mpairs:
|
for i, sorted in sortedShapes.mpairs:
|
||||||
for segment in shapes[i].segments:
|
for segment in shapes[i].segments:
|
||||||
if segment.at.y == segment.to.y:
|
if segment.at.y == segment.to.y: # Skip horizontal
|
||||||
# Skip horizontal and zero-length
|
|
||||||
continue
|
continue
|
||||||
var
|
let winding = segment.at.y > segment.to.y
|
||||||
segment = segment
|
|
||||||
winding = segment.at.y > segment.to.y
|
|
||||||
if winding:
|
if winding:
|
||||||
|
var segment = segment
|
||||||
swap(segment.at, segment.to)
|
swap(segment.at, segment.to)
|
||||||
sorted.add((segment, winding))
|
sorted.add((segment, winding))
|
||||||
|
else:
|
||||||
|
sorted.add((segment, winding))
|
||||||
|
|
||||||
# Compute the bounds of each shape
|
# Figure out the total bounds of all the shapes,
|
||||||
var bounds = newSeq[Rect](shapes.len)
|
# rasterize only within the total bounds
|
||||||
for i, shape in shapes:
|
|
||||||
bounds[i] = computeBounds(shape)
|
|
||||||
|
|
||||||
# Figure out the total bounds of all the shapes
|
|
||||||
var
|
|
||||||
minX = float32.high
|
|
||||||
minY = float32.high
|
|
||||||
maxY = float32.low
|
|
||||||
for bounds in bounds:
|
|
||||||
minX = min(minX, bounds.x)
|
|
||||||
minY = min(minY, bounds.y)
|
|
||||||
maxY = max(maxY, bounds.y + bounds.h)
|
|
||||||
|
|
||||||
# Rasterize only within the total bounds
|
|
||||||
let
|
let
|
||||||
startX = max(0, minX.int)
|
bounds = computeBounds(sortedShapes)
|
||||||
startY = max(0, miny.int)
|
startX = max(0, bounds.x.int)
|
||||||
stopY = min(image.height, maxY.int)
|
startY = max(0, bounds.y.int)
|
||||||
|
stopY = min(image.height, (bounds.y + bounds.h).int)
|
||||||
|
|
||||||
const
|
const
|
||||||
quality = 5 # Must divide 255 cleanly
|
quality = 5 # Must divide 255 cleanly
|
||||||
|
@ -815,12 +803,12 @@ proc fillShapes(
|
||||||
|
|
||||||
var
|
var
|
||||||
hits = newSeq[(float32, bool)](4)
|
hits = newSeq[(float32, bool)](4)
|
||||||
coverages = newSeq[uint32](image.width)
|
coverages = newSeq[uint8](image.width)
|
||||||
numHits: int
|
numHits: int
|
||||||
|
|
||||||
for y in startY ..< stopY:
|
for y in startY ..< stopY:
|
||||||
# Reset buffer for this row
|
# Reset buffer for this row
|
||||||
zeroMem(coverages[0].addr, coverages.len * 4)
|
zeroMem(coverages[0].addr, coverages.len)
|
||||||
|
|
||||||
# Do scanlines for this row
|
# Do scanlines for this row
|
||||||
for m in 0 ..< quality:
|
for m in 0 ..< quality:
|
||||||
|
@ -829,10 +817,9 @@ proc fillShapes(
|
||||||
scanline = Line(a: vec2(0, yLine), b: vec2(1000, yLine))
|
scanline = Line(a: vec2(0, yLine), b: vec2(1000, yLine))
|
||||||
numHits = 0
|
numHits = 0
|
||||||
for i, shape in sortedShapes:
|
for i, shape in sortedShapes:
|
||||||
let bounds = bounds[i]
|
|
||||||
if bounds.y > y.float32 or bounds.y + bounds.h < y.float32:
|
|
||||||
continue
|
|
||||||
for (segment, winding) in shape:
|
for (segment, winding) in shape:
|
||||||
|
if segment.at.y > yLine or segment.to.y < y.float32:
|
||||||
|
continue
|
||||||
var at: Vec2
|
var at: Vec2
|
||||||
if scanline.intersects(segment, at):# and segment.to != at:
|
if scanline.intersects(segment, at):# and segment.to != at:
|
||||||
if numHits == hits.len:
|
if numHits == hits.len:
|
||||||
|
@ -872,11 +859,14 @@ proc fillShapes(
|
||||||
if fillLen > 0 and shouldFill(windingRule, count):
|
if fillLen > 0 and shouldFill(windingRule, count):
|
||||||
var i = fillStart
|
var i = fillStart
|
||||||
when defined(amd64) and not defined(pixieNoSimd):
|
when defined(amd64) and not defined(pixieNoSimd):
|
||||||
let m = mm_set1_epi32(sampleCoverage.int32)
|
let vSampleCoverage = mm_set1_epi8(cast[int8](sampleCoverage))
|
||||||
for j in countup(i, fillStart + fillLen - 4, 4):
|
for j in countup(i, fillStart + fillLen - 16, 16):
|
||||||
let current = mm_loadu_si128(coverages[j].addr)
|
let current = mm_loadu_si128(coverages[j].addr)
|
||||||
mm_storeu_si128(coverages[j].addr, mm_add_epi32(m, current))
|
mm_storeu_si128(
|
||||||
i += 4
|
coverages[j].addr,
|
||||||
|
mm_add_epi8(current, vSampleCoverage)
|
||||||
|
)
|
||||||
|
i += 16
|
||||||
for j in i ..< fillStart + fillLen:
|
for j in i ..< fillStart + fillLen:
|
||||||
coverages[j] += sampleCoverage
|
coverages[j] += sampleCoverage
|
||||||
|
|
||||||
|
@ -889,17 +879,39 @@ proc fillShapes(
|
||||||
# When supported, SIMD blend as much as possible
|
# When supported, SIMD blend as much as possible
|
||||||
|
|
||||||
let
|
let
|
||||||
|
coverageMask1 = cast[M128i]([0xffffffff, 0, 0, 0]) # First 32 bits
|
||||||
|
coverageMask3 = mm_set1_epi32(cast[int32](0x000000ff)) # Only `r`
|
||||||
oddMask = mm_set1_epi16(cast[int16](0xff00))
|
oddMask = mm_set1_epi16(cast[int16](0xff00))
|
||||||
div255 = mm_set1_epi16(cast[int16](0x8081))
|
div255 = mm_set1_epi16(cast[int16](0x8081))
|
||||||
zero = mm_set1_epi32(0)
|
zero = mm_set1_epi32(0)
|
||||||
v255 = mm_set1_epi32(255)
|
v255 = mm_set1_epi32(255)
|
||||||
|
vColor = mm_set1_epi32(cast[int32](color))
|
||||||
|
|
||||||
for _ in countup(x, coverages.len - 4, 4):
|
for _ in countup(x, coverages.len - 16, 16):
|
||||||
var coverage = mm_loadu_si128(coverages[x].addr)
|
var coverage = mm_loadu_si128(coverages[x].addr)
|
||||||
|
coverage = mm_and_si128(coverage, coverageMask1)
|
||||||
|
|
||||||
if mm_movemask_epi8(mm_cmpeq_epi32(coverage, zero)) != 0xffff:
|
if mm_movemask_epi8(mm_cmpeq_epi16(coverage, zero)) != 0xffff:
|
||||||
# If the coverages are not all zero
|
# If the coverages are not all zero
|
||||||
var source = mm_set1_epi32(cast[int32](color))
|
var source = vColor
|
||||||
|
coverage = mm_slli_si128(coverage, 2)
|
||||||
|
coverage = mm_shuffle_epi32(coverage, MM_SHUFFLE(1, 1, 0, 0))
|
||||||
|
|
||||||
|
var
|
||||||
|
a = mm_and_si128(coverage, coverageMask1)
|
||||||
|
b = mm_and_si128(coverage, mm_slli_si128(coverageMask1, 4))
|
||||||
|
c = mm_and_si128(coverage, mm_slli_si128(coverageMask1, 8))
|
||||||
|
d = mm_and_si128(coverage, mm_slli_si128(coverageMask1, 12))
|
||||||
|
|
||||||
|
# Shift the coverages to `r`
|
||||||
|
a = mm_srli_si128(a, 2)
|
||||||
|
b = mm_srli_si128(b, 3)
|
||||||
|
d = mm_srli_si128(d, 1)
|
||||||
|
|
||||||
|
coverage = mm_and_si128(
|
||||||
|
mm_or_si128(mm_or_si128(a, b), mm_or_si128(c, d)),
|
||||||
|
coverageMask3
|
||||||
|
)
|
||||||
|
|
||||||
if mm_movemask_epi8(mm_cmpeq_epi32(coverage, v255)) != 0xffff:
|
if mm_movemask_epi8(mm_cmpeq_epi32(coverage, v255)) != 0xffff:
|
||||||
# If the coverages are not all 255
|
# If the coverages are not all 255
|
||||||
|
@ -932,10 +944,10 @@ proc fillShapes(
|
||||||
x += 4
|
x += 4
|
||||||
|
|
||||||
while x < image.width:
|
while x < image.width:
|
||||||
if x + 2 <= coverages.len:
|
if x + 8 <= coverages.len:
|
||||||
let peeked = cast[ptr uint64](coverages[x].addr)[]
|
let peeked = cast[ptr uint64](coverages[x].addr)[]
|
||||||
if peeked == 0:
|
if peeked == 0:
|
||||||
x += 2
|
x += 8
|
||||||
continue
|
continue
|
||||||
|
|
||||||
let coverage = coverages[x]
|
let coverage = coverages[x]
|
||||||
|
|
Loading…
Reference in a new issue