From 426c9766f1eee952826c2132723c5e3a74d44ece Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sat, 13 Feb 2021 17:37:59 -0600 Subject: [PATCH 1/4] f --- experiments/benchmark_cairo.nim | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/experiments/benchmark_cairo.nim b/experiments/benchmark_cairo.nim index 975c3a1..b9b6cfe 100644 --- a/experiments/benchmark_cairo.nim +++ b/experiments/benchmark_cairo.nim @@ -1,4 +1,4 @@ -import cairo, math, benchy, pixie, pixie/paths, chroma +import cairo, math, benchy, pixie, chroma var surface = imageSurfaceCreate(FORMAT_ARGB32, 1000, 1000) @@ -18,7 +18,7 @@ timeIt "cairo": ctx.fill() surface.flush() -discard surface.writeToPng("cairo.png") +# discard surface.writeToPng("cairo.png") var a = newImage(1000, 1000) a.fill(rgba(0, 0, 0, 255)) @@ -32,4 +32,4 @@ timeIt "pixie": p.closePath() a.fillPath(p, rgba(0, 0, 255, 255)) -discard surface.writeToPng("pixie.png") +# a.writeFile("pixie.png") From b800c77ab54dd3e4c16d6b5a2f38528cb1fe0d08 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sat, 13 Feb 2021 18:46:59 -0600 Subject: [PATCH 2/4] cleaner + faster simd --- src/pixie/blends.nim | 35 ++++++++++++++++++++++++++++------- src/pixie/images.nim | 24 ++---------------------- src/pixie/paths.nim | 41 ++++++++--------------------------------- 3 files changed, 38 insertions(+), 62 deletions(-) diff --git a/src/pixie/blends.nim b/src/pixie/blends.nim index 2324cdd..48543ca 100644 --- a/src/pixie/blends.nim +++ b/src/pixie/blends.nim @@ -160,7 +160,7 @@ proc SetSat(C: Color, s: float32): Color {.inline.} = if satC > 0: result = (C - min([C.r, C.g, C.b])) * s / satC -proc blendNormal(backdrop, source: ColorRGBA): ColorRGBA = +proc blendNormal*(backdrop, source: ColorRGBA): ColorRGBA = if backdrop.a == 0: return source if source.a == 255: @@ -516,6 +516,31 @@ when defined(amd64) and not defined(pixieNoSimd): result = mm_or_si128(mm_or_si128(result, i), mm_or_si128(j, k)) result = mm_and_si128(result, first32) + proc unpackAlphaValues*(v: M128i): M128i {.inline.} = + ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value) + let + first32 = cast[M128i]([uint32.high, 0, 0, 0]) # First 32 bits + alphaMask = mm_set1_epi32(cast[int32](0xff000000)) # Only `a` + + result = mm_shuffle_epi32(v, MM_SHUFFLE(0, 0, 0, 0)) + + var + i = mm_and_si128(result, first32) + j = mm_and_si128(result, mm_slli_si128(first32, 4)) + k = mm_and_si128(result, mm_slli_si128(first32, 8)) + l = mm_and_si128(result, mm_slli_si128(first32, 12)) + + # Shift the values to `a` + i = mm_slli_si128(i, 3) + j = mm_slli_si128(j, 2) + k = mm_slli_si128(k, 1) + # l = mm_slli_si128(l, 0) + + result = mm_and_si128( + mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l)), + alphaMask + ) + proc blendNormalSimd*(backdrop, source: M128i): M128i = let alphaMask = mm_set1_epi32(cast[int32](0xff000000)) @@ -615,9 +640,7 @@ when defined(amd64) and not defined(pixieNoSimd): blendedEven = mm_add_epi16(sourceEven, backdropEven) blendedOdd = mm_add_epi16(sourceOdd, backdropOdd) - blendedOdd = mm_slli_epi16(blendedOdd, 8) - - mm_or_si128(blendedEven, blendedOdd) + mm_or_si128(blendedEven, mm_slli_epi16(blendedOdd, 8)) proc maskMaskSimd*(backdrop, source: M128i): M128i = let @@ -638,9 +661,7 @@ when defined(amd64) and not defined(pixieNoSimd): backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) - backdropOdd = mm_slli_epi16(backdropOdd, 8) - - mm_or_si128(backdropEven, backdropOdd) + mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) proc maskerSimd*(blendMode: BlendMode): MaskerSimd = case blendMode: diff --git a/src/pixie/images.nim b/src/pixie/images.nim index 886bfcd..aa0d312 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -686,10 +686,7 @@ proc drawUber(a, b: Image | Mask, mat = mat3(), blendMode = bmNormal) = # Check we are not rotated before using SIMD blends when type(a) is Image: if blendMode.hasSimdBlender(): - let - blenderSimd = blendMode.blenderSimd() - first32 = cast[M128i]([uint32.high, 0, 0, 0]) # First 32 bits - alphaMask = mm_set1_epi32(cast[int32](0xff000000)) # Only `a` + let blenderSimd = blendMode.blenderSimd() for _ in countup(x, xMax - 4, 4): let srcPos = p + dx * x.float32 + dy * y.float32 @@ -701,24 +698,7 @@ proc drawUber(a, b: Image | Mask, mat = mat3(), blendMode = bmNormal) = else: # b is a Mask # Need to move 4 mask values into the alpha slots var source = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) - source = mm_slli_si128(source, 2) - source = mm_shuffle_epi32(source, MM_SHUFFLE(1, 1, 0, 0)) - - var - i = mm_and_si128(source, first32) - j = mm_and_si128(source, mm_slli_si128(first32, 4)) - k = mm_and_si128(source, mm_slli_si128(first32, 8)) - l = mm_and_si128(source, mm_slli_si128(first32, 12)) - - # Shift the values to `a` - i = mm_slli_si128(i, 1) - k = mm_slli_si128(k, 3) - l = mm_slli_si128(l, 2) - - source = mm_and_si128( - mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l)), - alphaMask - ) + source = unpackAlphaValues(source) mm_storeu_si128( a.data[a.dataIndex(x, y)].addr, diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index e8690e0..39d6056 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -966,13 +966,11 @@ proc fillShapes( # When supported, SIMD blend as much as possible let first32 = cast[M128i]([uint32.high, 0, 0, 0]) # First 32 bits - redMask = mm_set1_epi32(cast[int32](0x000000ff)) # Only `r` oddMask = mm_set1_epi16(cast[int16](0xff00)) div255 = mm_set1_epi16(cast[int16](0x8081)) - v255 = mm_set1_epi32(255) vColor = mm_set1_epi32(cast[int32](color)) - for _ in countup(x, coverages.len - 16, 16): + for _ in countup(x, image.width - 16, 4): var coverage = mm_loadu_si128(coverages[x].addr) coverage = mm_and_si128(coverage, first32) @@ -981,32 +979,11 @@ proc fillShapes( # If the coverages are not all zero var source = vColor - coverage = mm_slli_si128(coverage, 2) - coverage = mm_shuffle_epi32(coverage, MM_SHUFFLE(1, 1, 0, 0)) - - var - a = mm_and_si128(coverage, first32) - b = mm_and_si128(coverage, mm_slli_si128(first32, 4)) - c = mm_and_si128(coverage, mm_slli_si128(first32, 8)) - d = mm_and_si128(coverage, mm_slli_si128(first32, 12)) - - # Shift the coverages to `r` - a = mm_srli_si128(a, 2) - b = mm_srli_si128(b, 3) - d = mm_srli_si128(d, 1) - - coverage = mm_and_si128( - mm_or_si128(mm_or_si128(a, b), mm_or_si128(c, d)), - redMask - ) - - if mm_movemask_epi8(mm_cmpeq_epi32(coverage, v255)) != 0xffff: + if mm_movemask_epi8(mm_cmpeq_epi32(coverage, first32)) != 0xffff: # If the coverages are not all 255 - - # Shift the coverages from `r` to `g` and `a` for multiplying later - coverage = mm_or_si128( - mm_slli_epi32(coverage, 8), mm_slli_epi32(coverage, 24) - ) + coverage = unpackAlphaValues(coverage) + # Shift the coverages from `a` to `g` and `a` for multiplying + coverage = mm_or_si128(coverage, mm_srli_epi32(coverage, 16)) var colorEven = mm_slli_epi16(source, 8) @@ -1085,18 +1062,16 @@ proc fillShapes( when defined(amd64) and not defined(pixieNoSimd): # When supported, SIMD blend as much as possible for _ in countup(x, coverages.len - 16, 16): - var coverage = mm_loadu_si128(coverages[x].addr) - - let eqZero = mm_cmpeq_epi16(coverage, mm_setzero_si128()) + let + coverage = mm_loadu_si128(coverages[x].addr) + eqZero = mm_cmpeq_epi16(coverage, mm_setzero_si128()) if mm_movemask_epi8(eqZero) != 0xffff: # If the coverages are not all zero let backdrop = mm_loadu_si128(mask.data[mask.dataIndex(x, y)].addr) - mm_storeu_si128( mask.data[mask.dataIndex(x, y)].addr, maskNormalSimd(backdrop, coverage) ) - x += 16 while x < mask.width: From 28f831249fee703c0c504f2f3f4557d931a573d9 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sat, 13 Feb 2021 20:52:46 -0600 Subject: [PATCH 3/4] better segment partitioning, faster svg path fill --- src/pixie/paths.nim | 96 ++++++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 45 deletions(-) diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index 39d6056..1e5ba96 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -783,14 +783,14 @@ proc quickSort(a: var seq[(float32, int16)], inl, inr: int) = quickSort(a, inl, r) quickSort(a, l, inr) -proc computeBounds(seqs: varargs[seq[(Segment, int16)]]): Rect = +proc computeBounds(partitions: seq[seq[(Segment, int16)]]): Rect = var xMin = float32.high xMax = float32.low yMin = float32.high yMax = float32.low - for s in seqs: - for (segment, _) in s: + for partition in partitions: + for (segment, _) in partition: xMin = min(xMin, min(segment.at.x, segment.to.x)) xMax = max(xMax, max(segment.at.x, segment.to.x)) yMin = min(yMin, min(segment.at.y, segment.to.y)) @@ -813,11 +813,23 @@ proc shouldFill(windingRule: WindingRule, count: int): bool {.inline.} = of wrEvenOdd: count mod 2 != 0 -proc partitionSegments(shapes: seq[seq[Vec2]], middle: int): tuple[ - topHalf: seq[(Segment, int16)], - bottomHalf: seq[(Segment, int16)], - fullHeight: seq[(Segment, int16)] -] = +proc partitionSegments( + shapes: seq[seq[Vec2]], height: int +): seq[seq[(Segment, int16)]] = + ## Puts segments into the height partitions they intersect with. + + var segmentCount: int + for shape in shapes: + segmentCount += shape.len - 1 + + let + maxPartitions = max(1, height div 10) + numPartitions = min(maxPartitions, max(1, segmentCount div 10)) + + result.setLen(numPartitions) + + let partitionHeight = height div numPartitions + for shape in shapes: for segment in shape.segments: if segment.at.y == segment.to.y: # Skip horizontal @@ -828,19 +840,22 @@ proc partitionSegments(shapes: seq[seq[Vec2]], middle: int): tuple[ if segment.at.y > segment.to.y: swap(segment.at, segment.to) winding = -1 - if ceil(segment.to.y).int < middle: - result.topHalf.add((segment, winding)) - elif segment.at.y.int >= middle: - result.bottomHalf.add((segment, winding)) + + if partitionHeight == 0: + result[0].add((segment, winding)) else: - result.fullHeight.add((segment, winding)) + let + atPartition = max(0, segment.at.y).int div partitionHeight + toPartition = max(0, ceil(segment.to.y)).int div partitionHeight + for i in min(atPartition, result.high) .. min(toPartition, result.high): + result[i].add((segment, winding)) proc computeCoverages( coverages: var seq[uint8], hits: var seq[(float32, int16)], size: Vec2, y: int, - topHalf, bottomHalf, fullHeight: seq[(Segment, int16)], + partitions: seq[seq[(Segment, int16)]], windingRule: WindingRule ) = const @@ -850,37 +865,30 @@ proc computeCoverages( offset = 1 / quality.float32 initialOffset = offset / 2 - proc intersects( - scanline: Line, - segment: Segment, - winding: int16, - hits: var seq[(float32, int16)], - numHits: var int - ) {.inline.} = - if segment.at.y <= scanline.a.y and segment.to.y >= scanline.a.y: - var at: Vec2 - if scanline.intersects(segment, at):# and segment.to != at: - if numHits == hits.len: - hits.setLen(hits.len * 2) - hits[numHits] = (at.x.clamp(0, scanline.b.x), winding) - inc numHits - var numHits: int + let + partitionHeight = size.y.int div partitions.len + partition = + if partitionHeight == 0: + 0 + else: + min(y div partitionHeight, partitions.high) + # Do scanlines for this row for m in 0 ..< quality: let yLine = y.float32 + initialOffset + offset * m.float32 + ep scanline = Line(a: vec2(0, yLine), b: vec2(size.x, yLine)) numHits = 0 - if y < size.y.int div 2: - for (segment, winding) in topHalf: - scanline.intersects(segment, winding, hits, numHits) - else: - for (segment, winding) in bottomHalf: - scanline.intersects(segment, winding, hits, numHits) - for (segment, winding) in fullHeight: - scanline.intersects(segment, winding, hits, numHits) + for (segment, winding) in partitions[partition]: + if segment.at.y <= scanline.a.y and segment.to.y >= scanline.a.y: + var at: Vec2 + if scanline.intersects(segment, at):# and segment.to != at: + if numHits == hits.len: + hits.setLen(hits.len * 2) + hits[numHits] = (at.x.clamp(0, scanline.b.x), winding) + inc numHits quickSort(hits, 0, numHits - 1) @@ -928,13 +936,12 @@ proc fillShapes( windingRule: WindingRule, blendMode: BlendMode ) = - let (topHalf, bottomHalf, fullHeight) = - partitionSegments(shapes, image.height div 2) + let partitions = partitionSegments(shapes, image.height) # Figure out the total bounds of all the shapes, # rasterize only within the total bounds let - bounds = computeBounds(topHalf, bottomHalf, fullHeight) + bounds = computeBounds(partitions) startX = max(0, bounds.x.int) startY = max(0, bounds.y.int) stopY = min(image.height, (bounds.y + bounds.h).int) @@ -956,7 +963,7 @@ proc fillShapes( hits, image.wh, y, - topHalf, bottomHalf, fullHeight, + partitions, windingRule ) @@ -1029,13 +1036,12 @@ proc fillShapes( shapes: seq[seq[Vec2]], windingRule: WindingRule ) = - let (topHalf, bottomHalf, fullHeight) = - partitionSegments(shapes, mask.height div 2) + let partitions = partitionSegments(shapes, mask.height) # Figure out the total bounds of all the shapes, # rasterize only within the total bounds let - bounds = computeBounds(topHalf, bottomHalf, fullHeight) + bounds = computeBounds(partitions) startX = max(0, bounds.x.int) startY = max(0, bounds.y.int) stopY = min(mask.height, (bounds.y + bounds.h).int) @@ -1053,7 +1059,7 @@ proc fillShapes( hits, mask.wh, y, - topHalf, bottomHalf, fullHeight, + partitions, windingRule ) From fc23bda80f5f190a9e59d1a8038b98c01a4ebadb Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sat, 13 Feb 2021 20:57:58 -0600 Subject: [PATCH 4/4] blends exports --- src/pixie/blends.nim | 12 ++++++------ src/pixie/paths.nim | 6 +++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/pixie/blends.nim b/src/pixie/blends.nim index 48543ca..b992f6b 100644 --- a/src/pixie/blends.nim +++ b/src/pixie/blends.nim @@ -160,7 +160,7 @@ proc SetSat(C: Color, s: float32): Color {.inline.} = if satC > 0: result = (C - min([C.r, C.g, C.b])) * s / satC -proc blendNormal*(backdrop, source: ColorRGBA): ColorRGBA = +proc blendNormal(backdrop, source: ColorRGBA): ColorRGBA = if backdrop.a == 0: return source if source.a == 255: @@ -541,7 +541,7 @@ when defined(amd64) and not defined(pixieNoSimd): alphaMask ) - proc blendNormalSimd*(backdrop, source: M128i): M128i = + proc blendNormalSimd(backdrop, source: M128i): M128i = let alphaMask = mm_set1_epi32(cast[int32](0xff000000)) oddMask = mm_set1_epi16(cast[int16](0xff00)) @@ -570,7 +570,7 @@ when defined(amd64) and not defined(pixieNoSimd): mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) ) - proc blendMaskSimd*(backdrop, source: M128i): M128i = + proc blendMaskSimd(backdrop, source: M128i): M128i = let alphaMask = mm_set1_epi32(cast[int32](0xff000000)) oddMask = mm_set1_epi16(cast[int16](0xff00)) @@ -591,7 +591,7 @@ when defined(amd64) and not defined(pixieNoSimd): mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) - proc blendOverwriteSimd*(backdrop, source: M128i): M128i = + proc blendOverwriteSimd(backdrop, source: M128i): M128i = source proc blenderSimd*(blendMode: BlendMode): BlenderSimd = @@ -605,7 +605,7 @@ when defined(amd64) and not defined(pixieNoSimd): proc hasSimdBlender*(blendMode: BlendMode): bool = blendMode in {bmNormal, bmMask, bmOverwrite} - proc maskNormalSimd*(backdrop, source: M128i): M128i = + proc maskNormalSimd(backdrop, source: M128i): M128i = ## Blending masks let oddMask = mm_set1_epi16(cast[int16](0xff00)) @@ -642,7 +642,7 @@ when defined(amd64) and not defined(pixieNoSimd): mm_or_si128(blendedEven, mm_slli_epi16(blendedOdd, 8)) - proc maskMaskSimd*(backdrop, source: M128i): M128i = + proc maskMaskSimd(backdrop, source: M128i): M128i = let oddMask = mm_set1_epi16(cast[int16](0xff00)) div255 = mm_set1_epi16(cast[int16](0x8081)) diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index 1e5ba96..3242c29 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1050,6 +1050,10 @@ proc fillShapes( coverages = newSeq[uint8](mask.width) hits = newSeq[(float32, int16)](4) + + when defined(amd64) and not defined(pixieNoSimd): + let maskerSimd = bmNormal.maskerSimd() + for y in startY ..< stopY: # Reset buffer for this row zeroMem(coverages[0].addr, coverages.len) @@ -1076,7 +1080,7 @@ proc fillShapes( let backdrop = mm_loadu_si128(mask.data[mask.dataIndex(x, y)].addr) mm_storeu_si128( mask.data[mask.dataIndex(x, y)].addr, - maskNormalSimd(backdrop, coverage) + maskerSimd(backdrop, coverage) ) x += 16