From 3a289a6172a74976e0cccb06fc7423818f592f9d Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sun, 26 Jun 2022 21:53:51 -0500 Subject: [PATCH 01/11] rm --- src/pixie/paths.nim | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index 6dcafac..75c248c 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1971,14 +1971,10 @@ proc fillShapes( rightBottom.x = right.solveX(rightBottom.y) let - # leftMinX = min(leftTop.x, leftBottom.x) leftMaxX = max(leftTop.x, leftBottom.x) rightMinX = min(rightTop.x, rightBottom.x) - # rightMaxX = max(rightTop.x, rightBottom.x) - # leftCoverBegin = leftMinX.trunc leftCoverEnd = leftMaxX.ceil.int rightCoverBegin = rightMinX.trunc.int - # rightCoverEnd = rightMaxX.ceil if leftCoverEnd < rightCoverBegin: # Only take this shortcut if the partial coverage areas on the From ce062a5949cb5014284ef2733bf37a28b2171010 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sun, 26 Jun 2022 21:58:50 -0500 Subject: [PATCH 02/11] morepretty --- src/pixie/internal.nim | 4 ++-- src/pixie/paths.nim | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index bbfaff9..a53b95b 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -203,8 +203,8 @@ proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool = when defined(amd64) and allowSimd: proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} = - let opacityVec = mm_set1_ps(opacity) - var finalColor = mm_cvtps_epi32(mm_mul_ps(color, opacityVec)) + let opacityVec = mm_set1_ps(opacity) + var finalColor = mm_cvtps_epi32(mm_mul_ps(color, opacityVec)) finalColor = mm_packus_epi16(finalColor, mm_setzero_si128()) finalColor = mm_packus_epi16(finalColor, mm_setzero_si128()) cast[ColorRGBX](mm_cvtsi128_si32(finalColor)) diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index 75c248c..631d421 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1171,7 +1171,9 @@ proc partitionSegments( var entryCounts = newSeq[int](numPartitions) for (segment, _) in segments: - for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight): + for partitionIndex in segment.partitionRange( + numPartitions, startY, partitionHeight + ): inc entryCounts[partitionIndex] for partitionIndex, entryCounts in entryCounts: @@ -1179,7 +1181,9 @@ proc partitionSegments( var indexes = newSeq[int](numPartitions) for i, (segment, winding) in segments: - for partitionIndex in segment.partitionRange(numPartitions, startY, partitionHeight): + for partitionIndex in segment.partitionRange( + numPartitions, startY, partitionHeight + ): result[partitionIndex].entries[indexes[partitionIndex]] = entries[i] inc indexes[partitionIndex] From cf700221445363b114750f02159d9e39e5222346 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sun, 26 Jun 2022 22:46:49 -0500 Subject: [PATCH 03/11] stroke --- experiments/benchmark_cairo.nim | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/experiments/benchmark_cairo.nim b/experiments/benchmark_cairo.nim index daaa3fe..ee862fc 100644 --- a/experiments/benchmark_cairo.nim +++ b/experiments/benchmark_cairo.nim @@ -194,6 +194,8 @@ block: surface = imageSurfaceCreate(FORMAT_ARGB32, 900, 900) ctx = surface.create() + ctx.setLineWidth(1) + timeIt "[cairo] " & benchmark.name: for fill in benchmark.fills: if fill.shapes.len > 0: @@ -221,6 +223,7 @@ block: FillRuleEvenOdd ) ctx.fill() + # ctx.stroke() # discard surface.writeToPng(("cairo_" & benchmark.name & ".png").cstring) @@ -242,5 +245,11 @@ block: fill.transform, fill.windingRule ) + # image.strokePath( + # p, + # fill.paint, + # fill.transform, + # 1 + # ) # image.writeFile("pixie_" & benchmark.name & ".png") From 741234786c8689c4e8c0551542b739a8928ac64e Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Mon, 27 Jun 2022 00:35:09 -0500 Subject: [PATCH 04/11] rename --- src/pixie/internal.nim | 2 +- src/pixie/{simd => runtimechecked}/avx.nim | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename src/pixie/{simd => runtimechecked}/avx.nim (100%) diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index a53b95b..fc032e9 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -3,7 +3,7 @@ import bumpy, chroma, common, system/memory, vmath const allowSimd* = not defined(pixieNoSimd) and not defined(tcc) when defined(amd64) and allowSimd: - import nimsimd/runtimecheck, nimsimd/sse2, simd/avx + import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx let cpuHasAvx* = checkInstructionSets({AVX}) template currentExceptionAsPixieError*(): untyped = diff --git a/src/pixie/simd/avx.nim b/src/pixie/runtimechecked/avx.nim similarity index 100% rename from src/pixie/simd/avx.nim rename to src/pixie/runtimechecked/avx.nim From d80df8f9584c25ed9af7378c48c4a6cbcafcfd1f Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Mon, 27 Jun 2022 00:36:45 -0500 Subject: [PATCH 05/11] f --- src/pixie/internal.nim | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index fc032e9..1a5a752 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -169,13 +169,13 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} i += 4 # Convert whatever is left - for j in i ..< data.len: - var c = data[j] + for i in i ..< data.len: + var c = data[i] if c.a != 255: - c.r = ((c.r.uint32 * c.a.uint32) div 255).uint8 - c.g = ((c.g.uint32 * c.a.uint32) div 255).uint8 - c.b = ((c.b.uint32 * c.a.uint32) div 255).uint8 - data[j] = c + c.r = ((c.r.uint32 * c.a) div 255).uint8 + c.g = ((c.g.uint32 * c.a) div 255).uint8 + c.b = ((c.b.uint32 * c.a) div 255).uint8 + data[i] = c proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool = result = true From c244b8cb8185b61446019b79786d9579a7e69852 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Tue, 28 Jun 2022 15:16:22 -0500 Subject: [PATCH 06/11] std/ --- src/pixie/fileformats/jpeg.nim | 2 +- src/pixie/paths.nim | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pixie/fileformats/jpeg.nim b/src/pixie/fileformats/jpeg.nim index c346772..4078d74 100644 --- a/src/pixie/fileformats/jpeg.nim +++ b/src/pixie/fileformats/jpeg.nim @@ -1,5 +1,5 @@ import chroma, flatty/binny, pixie/common, pixie/images, pixie/internal, - pixie/masks, sequtils, std/decls, strutils + pixie/masks, std/decls, std/sequtils, std/strutils when defined(amd64) and allowSimd: import nimsimd/sse2 diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index 631d421..f579926 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1,5 +1,5 @@ -import blends, bumpy, chroma, common, fenv, images, internal, masks, paints, - strutils, vmath +import blends, bumpy, chroma, common, images, internal, masks, paints, std/fenv, + std/strutils, vmath when defined(amd64) and allowSimd: import nimsimd/sse2 From ffc2b5b4d5205fd646032acbb0caef3b6c0eea50 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Tue, 28 Jun 2022 17:59:50 -0500 Subject: [PATCH 07/11] aligned sse2 + avx2 versions isOneColor isOpaque isTransparent --- pixie.nimble | 2 +- src/pixie/images.nim | 76 ++++++++++++++------- src/pixie/internal.nim | 43 ++++++++---- src/pixie/runtimechecked/avx2.nim | 106 ++++++++++++++++++++++++++++++ 4 files changed, 190 insertions(+), 37 deletions(-) create mode 100644 src/pixie/runtimechecked/avx2.nim diff --git a/pixie.nimble b/pixie.nimble index c79cb13..ef57b10 100644 --- a/pixie.nimble +++ b/pixie.nimble @@ -10,7 +10,7 @@ requires "vmath >= 1.1.4" requires "chroma >= 0.2.5" requires "zippy >= 0.10.2" requires "flatty >= 0.3.4" -requires "nimsimd >= 1.1.1" +requires "nimsimd >= 1.1.5" requires "bumpy >= 1.1.1" task bindings, "Generate bindings": diff --git a/src/pixie/images.nim b/src/pixie/images.nim index 5f23e00..ebc2c2d 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -1,7 +1,7 @@ import blends, bumpy, chroma, common, masks, pixie/internal, vmath when defined(amd64) and allowSimd: - import nimsimd/sse2 + import nimsimd/sse2, runtimechecked/avx2 const h = 0.5.float32 @@ -101,54 +101,84 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} = proc isOneColor*(image: Image): bool {.raises: [].} = ## Checks if the entire image is the same color. + when defined(amd64) and allowSimd: + if cpuHasAvx2: + return isOneColorAvx2(image.data, 0, image.data.len) + result = true let color = image.data[0] var i: int when defined(amd64) and allowSimd: - let colorVec = mm_set1_epi32(cast[int32](color)) - for _ in 0 ..< image.data.len div 16: + # Align to 16 bytes + var p = cast[uint](image.data[i].addr) + while i < image.data.len and (p and 15) != 0: + if image.data[i] != color: + return false + inc i + p += 4 + + let + colorVec = mm_set1_epi32(cast[int32](color)) + iterations = (image.data.len - i) div 16 + for _ in 0 ..< iterations: let - values0 = mm_loadu_si128(image.data[i + 0].addr) - values1 = mm_loadu_si128(image.data[i + 4].addr) - values2 = mm_loadu_si128(image.data[i + 8].addr) - values3 = mm_loadu_si128(image.data[i + 12].addr) + values0 = mm_load_si128(cast[pointer](p)) + values1 = mm_load_si128(cast[pointer](p + 16)) + values2 = mm_load_si128(cast[pointer](p + 32)) + values3 = mm_load_si128(cast[pointer](p + 48)) eq0 = mm_cmpeq_epi8(values0, colorVec) eq1 = mm_cmpeq_epi8(values1, colorVec) eq2 = mm_cmpeq_epi8(values2, colorVec) eq3 = mm_cmpeq_epi8(values3, colorVec) - eq = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3)) - if mm_movemask_epi8(eq) != 0xffff: + eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3)) + if mm_movemask_epi8(eq0123) != 0xffff: return false - i += 16 + p += 64 + i += 16 * iterations - for j in i ..< image.data.len: - if image.data[j] != color: + for i in i ..< image.data.len: + if image.data[i] != color: return false proc isTransparent*(image: Image): bool {.raises: [].} = ## Checks if this image is fully transparent or not. + when defined(amd64) and allowSimd: + if cpuHasAvx2: + return isTransparentAvx2(image.data, 0, image.data.len) + result = true var i: int when defined(amd64) and allowSimd: - let vecZero = mm_setzero_si128() - for _ in 0 ..< image.data.len div 16: + # Align to 16 bytes + var p = cast[uint](image.data[i].addr) + while i < image.data.len and (p and 15) != 0: + if image.data[i].a != 0: + return false + inc i + p += 4 + + let + vecZero = mm_setzero_si128() + iterations = (image.data.len - i) div 16 + for _ in 0 ..< iterations: let - values0 = mm_loadu_si128(image.data[i + 0].addr) - values1 = mm_loadu_si128(image.data[i + 4].addr) - values2 = mm_loadu_si128(image.data[i + 8].addr) - values3 = mm_loadu_si128(image.data[i + 12].addr) + values0 = mm_load_si128(cast[pointer](p)) + values1 = mm_load_si128(cast[pointer](p + 16)) + values2 = mm_load_si128(cast[pointer](p + 32)) + values3 = mm_load_si128(cast[pointer](p + 48)) values01 = mm_or_si128(values0, values1) values23 = mm_or_si128(values2, values3) - values = mm_or_si128(values01, values23) - if mm_movemask_epi8(mm_cmpeq_epi8(values, vecZero)) != 0xffff: + values0123 = mm_or_si128(values01, values23) + if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff: return false - i += 16 + p += 64 + i += 16 * iterations - for j in i ..< image.data.len: - if image.data[j].a != 0: + for i in i ..< image.data.len: + if image.data[i].a != 0: return false proc isOpaque*(image: Image): bool {.raises: [].} = diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index 1a5a752..b7211f4 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -3,8 +3,10 @@ import bumpy, chroma, common, system/memory, vmath const allowSimd* = not defined(pixieNoSimd) and not defined(tcc) when defined(amd64) and allowSimd: - import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx - let cpuHasAvx* = checkInstructionSets({AVX}) + import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx, runtimechecked/avx2 + let + cpuHasAvx* = checkInstructionSets({AVX}) + cpuHasAvx2* = checkInstructionSets({AVX, AVX2}) template currentExceptionAsPixieError*(): untyped = ## Gets the current exception and returns it as a PixieError with stack trace. @@ -178,27 +180,42 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} data[i] = c proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool = + when defined(amd64) and allowSimd: + if cpuHasAvx2 and len >= 64: + return isOpaqueAvx2(data, start, len) + result = true var i = start when defined(amd64) and allowSimd: - let vec255 = mm_set1_epi32(cast[int32](uint32.high)) - for _ in start ..< (start + len) div 16: + # Align to 16 bytes + var p = cast[uint](data[i].addr) + while i < (start + len) and (p and 15) != 0: + if data[i].a != 255: + return false + inc i + p += 4 + + let + vec255 = mm_set1_epi8(255) + iterations = (start + len - i) div 16 + for _ in 0 ..< iterations: let - values0 = mm_loadu_si128(data[i + 0].addr) - values1 = mm_loadu_si128(data[i + 4].addr) - values2 = mm_loadu_si128(data[i + 8].addr) - values3 = mm_loadu_si128(data[i + 12].addr) + values0 = mm_load_si128(cast[pointer](p)) + values1 = mm_load_si128(cast[pointer](p + 16)) + values2 = mm_load_si128(cast[pointer](p + 32)) + values3 = mm_load_si128(cast[pointer](p + 48)) values01 = mm_and_si128(values0, values1) values23 = mm_and_si128(values2, values3) - values = mm_and_si128(values01, values23) - eq = mm_cmpeq_epi8(values, vec255) + values0123 = mm_and_si128(values01, values23) + eq = mm_cmpeq_epi8(values0123, vec255) if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: return false - i += 16 + p += 64 + i += 16 * iterations - for j in i ..< start + len: - if data[j].a != 255: + for i in i ..< start + len: + if data[i].a != 255: return false when defined(amd64) and allowSimd: diff --git a/src/pixie/runtimechecked/avx2.nim b/src/pixie/runtimechecked/avx2.nim new file mode 100644 index 0000000..afd68ca --- /dev/null +++ b/src/pixie/runtimechecked/avx2.nim @@ -0,0 +1,106 @@ +import chroma, nimsimd/avx2 + +when defined(gcc) or defined(clang): + {.localPassc: "-mavx2".} + +when defined(release): + {.push checks: off.} + +proc isOneColorAvx2*(data: var seq[ColorRGBX], start, len: int): bool = + result = true + + let color = data[0] + + var + i = start + p = cast[uint](data[i].addr) + # Align to 32 bytes + while i < (start + len) and (p and 31) != 0: + if data[i] != color: + return false + inc i + p += 4 + + let + colorVec = mm256_set1_epi32(cast[int32](color)) + iterations = (start + len - i) div 16 + for _ in 0 ..< iterations: + let + values0 = mm256_load_si256(cast[pointer](p)) + values1 = mm256_load_si256(cast[pointer](p + 32)) + eq0 = mm256_cmpeq_epi8(values0, colorVec) + eq1 = mm256_cmpeq_epi8(values1, colorVec) + eq01 = mm256_and_si256(eq0, eq1) + if mm256_movemask_epi8(eq01) != cast[int32](0xffffffff): + return false + p += 64 + i += 16 * iterations + + for i in i ..< start + len: + if data[i] != color: + return false + +proc isTransparentAvx2*(data: var seq[ColorRGBX], start, len: int): bool = + result = true + + var + i = start + p = cast[uint](data[i].addr) + # Align to 32 bytes + while i < (start + len) and (p and 31) != 0: + if data[i].a != 0: + return false + inc i + p += 4 + + let + vecZero = mm256_setzero_si256() + iterations = (start + len - i) div 16 + for _ in 0 ..< iterations: + let + values0 = mm256_load_si256(cast[pointer](p)) + values1 = mm256_load_si256(cast[pointer](p + 32)) + values01 = mm256_or_si256(values0, values1) + eq = mm256_cmpeq_epi8(values01, vecZero) + if mm256_movemask_epi8(eq) != cast[int32](0xffffffff): + return false + p += 64 + i += 16 * iterations + + for i in i ..< start + len: + if data[i].a != 0: + return false + +proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool = + result = true + + var + i = start + p = cast[uint](data[i].addr) + # Align to 32 bytes + while i < (start + len) and (p and 31) != 0: + if data[i].a != 255: + return false + inc i + p += 4 + + let + vec255 = mm256_set1_epi8(255) + iterations = (start + len - i) div 16 + for _ in 0 ..< iterations: + let + values0 = mm256_load_si256(cast[pointer](p)) + values1 = mm256_load_si256(cast[pointer](p + 32)) + values01 = mm256_and_si256(values0, values1) + eq = mm256_cmpeq_epi8(values01, vec255) + if (mm256_movemask_epi8(eq) and 0x88888888) != 0x88888888: + return false + p += 64 + i += 16 * iterations + + for i in i ..< start + len: + if data[i].a != 255: + return false + +when defined(release): + {.pop.} From e0ac7dc7a2585488b8f2f8ea78f9449c37fccbf2 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Tue, 28 Jun 2022 18:21:37 -0500 Subject: [PATCH 08/11] toPremultipliedAlphaAvx2 --- src/pixie/internal.nim | 5 +++++ src/pixie/runtimechecked/avx2.nim | 37 +++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index b7211f4..aea0e55 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -141,6 +141,11 @@ proc toStraightAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} = proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} = ## Converts an image to premultiplied alpha from straight alpha. + when defined(amd64) and allowSimd: + if cpuHasAvx2: + toPremultipliedAlphaAvx2(data) + return + var i: int when defined(amd64) and allowSimd: # When supported, SIMD convert as much as possible diff --git a/src/pixie/runtimechecked/avx2.nim b/src/pixie/runtimechecked/avx2.nim index afd68ca..f7b33ac 100644 --- a/src/pixie/runtimechecked/avx2.nim +++ b/src/pixie/runtimechecked/avx2.nim @@ -102,5 +102,42 @@ proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool = if data[i].a != 255: return false +proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) = + var i: int + + let + alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) + oddMask = mm256_set1_epi16(cast[int16](0xff00)) + div255 = mm256_set1_epi16(cast[int16](0x8081)) + for _ in 0 ..< data.len div 8: + let + values = mm256_loadu_si256(data[i].addr) + alpha = mm256_and_si256(values, alphaMask) + eq = mm256_cmpeq_epi8(values, alphaMask) + if (mm256_movemask_epi8(eq) and 0x88888888) != 0x88888888: + let + evenMultiplier = mm256_or_si256(alpha, mm256_srli_epi32(alpha, 16)) + oddMultiplier = mm256_or_si256(evenMultiplier, alphaMask) + var + colorsEven = mm256_slli_epi16(values, 8) + colorsOdd = mm256_and_si256(values, oddMask) + colorsEven = mm256_mulhi_epu16(colorsEven, evenMultiplier) + colorsOdd = mm256_mulhi_epu16(colorsOdd, oddMultiplier) + colorsEven = mm256_srli_epi16(mm256_mulhi_epu16(colorsEven, div255), 7) + colorsOdd = mm256_srli_epi16(mm256_mulhi_epu16(colorsOdd, div255), 7) + mm256_storeu_si256( + data[i].addr, + mm256_or_si256(colorsEven, mm256_slli_epi16(colorsOdd, 8)) + ) + i += 8 + + for i in i ..< data.len: + var c = data[i] + if c.a != 255: + c.r = ((c.r.uint32 * c.a) div 255).uint8 + c.g = ((c.g.uint32 * c.a) div 255).uint8 + c.b = ((c.b.uint32 * c.a) div 255).uint8 + data[i] = c + when defined(release): {.pop.} From fe488708cd173b76e194689b2e727543a7b793e8 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Tue, 28 Jun 2022 18:42:06 -0500 Subject: [PATCH 09/11] use return as progress marker --- src/pixie/internal.nim | 55 +++++++++++++++---------------- src/pixie/runtimechecked/avx2.nim | 18 +++------- 2 files changed, 30 insertions(+), 43 deletions(-) diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index aea0e55..cbeb522 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -141,39 +141,36 @@ proc toStraightAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} = proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} = ## Converts an image to premultiplied alpha from straight alpha. - when defined(amd64) and allowSimd: - if cpuHasAvx2: - toPremultipliedAlphaAvx2(data) - return - var i: int when defined(amd64) and allowSimd: - # When supported, SIMD convert as much as possible - let - alphaMask = mm_set1_epi32(cast[int32](0xff000000)) - oddMask = mm_set1_epi16(cast[int16](0xff00)) - div255 = mm_set1_epi16(cast[int16](0x8081)) - for _ in 0 ..< data.len div 4: + if cpuHasAvx2: + i = toPremultipliedAlphaAvx2(data) + else: let - values = mm_loadu_si128(data[i].addr) - alpha = mm_and_si128(values, alphaMask) - eq = mm_cmpeq_epi8(values, alphaMask) - if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: + alphaMask = mm_set1_epi32(cast[int32](0xff000000)) + oddMask = mm_set1_epi16(cast[int16](0xff00)) + div255 = mm_set1_epi16(cast[int16](0x8081)) + for _ in 0 ..< data.len div 4: let - evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16)) - oddMultiplier = mm_or_si128(evenMultiplier, alphaMask) - var - colorsEven = mm_slli_epi16(values, 8) - colorsOdd = mm_and_si128(values, oddMask) - colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier) - colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier) - colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7) - colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7) - mm_storeu_si128( - data[i].addr, - mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8)) - ) - i += 4 + values = mm_loadu_si128(data[i].addr) + alpha = mm_and_si128(values, alphaMask) + eq = mm_cmpeq_epi8(values, alphaMask) + if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: + let + evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16)) + oddMultiplier = mm_or_si128(evenMultiplier, alphaMask) + var + colorsEven = mm_slli_epi16(values, 8) + colorsOdd = mm_and_si128(values, oddMask) + colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier) + colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier) + colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7) + colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7) + mm_storeu_si128( + data[i].addr, + mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8)) + ) + i += 4 # Convert whatever is left for i in i ..< data.len: diff --git a/src/pixie/runtimechecked/avx2.nim b/src/pixie/runtimechecked/avx2.nim index f7b33ac..3f4a86d 100644 --- a/src/pixie/runtimechecked/avx2.nim +++ b/src/pixie/runtimechecked/avx2.nim @@ -102,16 +102,14 @@ proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool = if data[i].a != 255: return false -proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) = - var i: int - +proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]): int = let alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) oddMask = mm256_set1_epi16(cast[int16](0xff00)) div255 = mm256_set1_epi16(cast[int16](0x8081)) for _ in 0 ..< data.len div 8: let - values = mm256_loadu_si256(data[i].addr) + values = mm256_loadu_si256(data[result].addr) alpha = mm256_and_si256(values, alphaMask) eq = mm256_cmpeq_epi8(values, alphaMask) if (mm256_movemask_epi8(eq) and 0x88888888) != 0x88888888: @@ -126,18 +124,10 @@ proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) = colorsEven = mm256_srli_epi16(mm256_mulhi_epu16(colorsEven, div255), 7) colorsOdd = mm256_srli_epi16(mm256_mulhi_epu16(colorsOdd, div255), 7) mm256_storeu_si256( - data[i].addr, + data[result].addr, mm256_or_si256(colorsEven, mm256_slli_epi16(colorsOdd, 8)) ) - i += 8 - - for i in i ..< data.len: - var c = data[i] - if c.a != 255: - c.r = ((c.r.uint32 * c.a) div 255).uint8 - c.g = ((c.g.uint32 * c.a) div 255).uint8 - c.b = ((c.b.uint32 * c.a) div 255).uint8 - data[i] = c + result += 8 when defined(release): {.pop.} From 0affa5284b11555b6b6ac0496cc887826d8c941d Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Tue, 28 Jun 2022 19:53:12 -0500 Subject: [PATCH 10/11] simpler --- src/pixie/paths.nim | 246 +++++++++++++++++++++----------------------- 1 file changed, 115 insertions(+), 131 deletions(-) diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index f579926..f23a534 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1937,150 +1937,134 @@ proc fillShapes( if maybeLeftMaxX > maybeRightMaxX: swap left, right - let requiresAntiAliasing = - left.segment.requiresAntiAliasing or - right.segment.requiresAntiAliasing + # We have 2 non-intersecting lines that require anti-aliasing + # Use trapezoid coverage at the edges and fill in the middle - if requiresAntiAliasing: - # We have 2 non-intersecting lines that require anti-aliasing - # Use trapezoid coverage at the edges and fill in the middle + when allowSimd and defined(amd64): + let vecRgbx = mm_set_ps( + rgbx.a.float32, + rgbx.b.float32, + rgbx.g.float32, + rgbx.r.float32 + ) - when allowSimd and defined(amd64): - let vecRgbx = mm_set_ps( - rgbx.a.float32, - rgbx.b.float32, - rgbx.g.float32, - rgbx.r.float32 - ) + proc solveX(entry: PartitionEntry, y: float32): float32 = + if entry.m == 0: + entry.b + else: + (y - entry.b) / entry.m - proc solveX(entry: PartitionEntry, y: float32): float32 = - if entry.m == 0: - entry.b - else: - (y - entry.b) / entry.m + proc solveY(entry: PartitionEntry, x: float32): float32 = + entry.m * x + entry.b - proc solveY(entry: PartitionEntry, x: float32): float32 = - entry.m * x + entry.b + var + leftTop = vec2(0, y.float32) + leftBottom = vec2(0, (y + 1).float32) + leftTop.x = left.solveX(leftTop.y.float32) + leftBottom.x = left.solveX(leftBottom.y) - var - leftTop = vec2(0, y.float32) - leftBottom = vec2(0, (y + 1).float32) - leftTop.x = left.solveX(leftTop.y.float32) - leftBottom.x = left.solveX(leftBottom.y) + var + rightTop = vec2(0, y.float32) + rightBottom = vec2(0, (y + 1).float32) + rightTop.x = right.solveX(rightTop.y) + rightBottom.x = right.solveX(rightBottom.y) - var - rightTop = vec2(0, y.float32) - rightBottom = vec2(0, (y + 1).float32) - rightTop.x = right.solveX(rightTop.y) - rightBottom.x = right.solveX(rightBottom.y) + let + leftMaxX = max(leftTop.x, leftBottom.x) + rightMinX = min(rightTop.x, rightBottom.x) + leftCoverEnd = leftMaxX.ceil.int + rightCoverBegin = rightMinX.trunc.int - let - leftMaxX = max(leftTop.x, leftBottom.x) - rightMinX = min(rightTop.x, rightBottom.x) - leftCoverEnd = leftMaxX.ceil.int - rightCoverBegin = rightMinX.trunc.int + if leftCoverEnd < rightCoverBegin: + # Only take this shortcut if the partial coverage areas on the + # left and the right do not overlap - if leftCoverEnd < rightCoverBegin: - # Only take this shortcut if the partial coverage areas on the - # left and the right do not overlap - - let blender = blendMode.blender() - - block: # Left-side partial coverage - let - inverted = leftTop.x < leftBottom.x - sliverStart = min(leftTop.x, leftBottom.x) - rectStart = max(leftTop.x, leftBottom.x) - var - pen = sliverStart - prevPen = pen - penY = if inverted: y.float32 else: (y + 1).float32 - prevPenY = penY - for x in sliverStart.int ..< rectStart.ceil.int: - prevPen = pen - pen = (x + 1).float32 - var rightRectArea = 0.float32 - if pen > rectStart: - rightRectArea = pen - rectStart - pen = rectStart - prevPenY = penY - penY = left.solveY(pen) - if x < 0 or x >= image.width: - continue - let - run = pen - prevPen - triangleArea = 0.5.float32 * run * abs(penY - prevPenY) - rectArea = - if inverted: - (prevPenY - y.float32) * run - else: - ((y + 1).float32 - prevPenY) * run - area = triangleArea + rectArea + rightRectArea - dataIndex = image.dataIndex(x, y) - backdrop = image.data[dataIndex] - source = - when allowSimd and defined(amd64): - applyOpacity(vecRgbx, area) - else: - rgbx * area - image.data[dataIndex] = blender(backdrop, source) - - block: # Right-side partial coverage - let - inverted = rightTop.x > rightBottom.x - rectEnd = min(rightTop.x, rightBottom.x) - sliverEnd = max(rightTop.x, rightBottom.x) - var - pen = rectEnd - prevPen = pen - penY = if inverted: (y + 1).float32 else: y.float32 - prevPenY = penY - for x in rectEnd.int ..< sliverEnd.ceil.int: - prevPen = pen - pen = (x + 1).float32 - let leftRectArea = prevPen.fractional - if pen > sliverEnd: - pen = sliverEnd - prevPenY = penY - penY = right.solveY(pen) - if x < 0 or x >= image.width: - continue - let - run = pen - prevPen - triangleArea = 0.5.float32 * run * abs(penY - prevPenY) - rectArea = - if inverted: - (penY - y.float32) * run - else: - ((y + 1).float32 - penY) * run - area = leftRectArea + triangleArea + rectArea - dataIndex = image.dataIndex(x, y) - backdrop = image.data[dataIndex] - source = - when allowSimd and defined(amd64): - applyOpacity(vecRgbx, area) - else: - rgbx * area - image.data[dataIndex] = blender(backdrop, source) + let blender = blendMode.blender() + block: # Left-side partial coverage let - fillBegin = leftCoverEnd.clamp(0, image.width) - fillEnd = rightCoverBegin.clamp(0, image.width) - if fillEnd - fillBegin > 0: - hits[0] = (fixed32(fillBegin.float32), 1.int16) - hits[1] = (fixed32(fillEnd.float32), -1.int16) - image.fillHits(rgbx, 0, y, hits, 2, NonZero, blendMode) + inverted = leftTop.x < leftBottom.x + sliverStart = min(leftTop.x, leftBottom.x) + rectStart = max(leftTop.x, leftBottom.x) + var + pen = sliverStart + prevPen = pen + penY = if inverted: y.float32 else: (y + 1).float32 + prevPenY = penY + for x in sliverStart.int ..< rectStart.ceil.int: + prevPen = pen + pen = (x + 1).float32 + var rightRectArea = 0.float32 + if pen > rectStart: + rightRectArea = pen - rectStart + pen = rectStart + prevPenY = penY + penY = left.solveY(pen) + if x < 0 or x >= image.width: + continue + let + run = pen - prevPen + triangleArea = 0.5.float32 * run * abs(penY - prevPenY) + rectArea = + if inverted: + (prevPenY - y.float32) * run + else: + ((y + 1).float32 - prevPenY) * run + area = triangleArea + rectArea + rightRectArea + dataIndex = image.dataIndex(x, y) + backdrop = image.data[dataIndex] + source = + when allowSimd and defined(amd64): + applyOpacity(vecRgbx, area) + else: + rgbx * area + image.data[dataIndex] = blender(backdrop, source) - inc y - continue + block: # Right-side partial coverage + let + inverted = rightTop.x > rightBottom.x + rectEnd = min(rightTop.x, rightBottom.x) + sliverEnd = max(rightTop.x, rightBottom.x) + var + pen = rectEnd + prevPen = pen + penY = if inverted: (y + 1).float32 else: y.float32 + prevPenY = penY + for x in rectEnd.int ..< sliverEnd.ceil.int: + prevPen = pen + pen = (x + 1).float32 + let leftRectArea = prevPen.fractional + if pen > sliverEnd: + pen = sliverEnd + prevPenY = penY + penY = right.solveY(pen) + if x < 0 or x >= image.width: + continue + let + run = pen - prevPen + triangleArea = 0.5.float32 * run * abs(penY - prevPenY) + rectArea = + if inverted: + (penY - y.float32) * run + else: + ((y + 1).float32 - penY) * run + area = leftRectArea + triangleArea + rectArea + dataIndex = image.dataIndex(x, y) + backdrop = image.data[dataIndex] + source = + when allowSimd and defined(amd64): + applyOpacity(vecRgbx, area) + else: + rgbx * area + image.data[dataIndex] = blender(backdrop, source) - else: let - minX = left.segment.at.x.int.clamp(0, image.width) - maxX = right.segment.at.x.int.clamp(0, image.width) - hits[0] = (cast[Fixed32](minX * 256), 1.int16) - hits[1] = (cast[Fixed32](maxX * 256), -1.int16) - image.fillHits(rgbx, 0, y, hits, 2, NonZero, blendMode) + fillBegin = leftCoverEnd.clamp(0, image.width) + fillEnd = rightCoverBegin.clamp(0, image.width) + if fillEnd - fillBegin > 0: + hits[0] = (fixed32(fillBegin.float32), 1.int16) + hits[1] = (fixed32(fillEnd.float32), -1.int16) + image.fillHits(rgbx, 0, y, hits, 2, NonZero, blendMode) inc y continue From f41f895e2425a5380d95dada22b0946f8e427079 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Tue, 28 Jun 2022 20:05:43 -0500 Subject: [PATCH 11/11] simpler --- src/pixie/paths.nim | 280 +++++++++++++++++++++----------------------- 1 file changed, 136 insertions(+), 144 deletions(-) diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index f23a534..ef0d6e2 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1919,155 +1919,147 @@ proc fillShapes( break if allEntriesInScanlineSpanIt and tmp == 2: - var at: Vec2 - if not intersectsInside( - partitions[partitionIndex].entries[entryIndices[0]].segment, - partitions[partitionIndex].entries[entryIndices[1]].segment, - at - ): - # We have 2 non-intersecting lines - var - left = partitions[partitionIndex].entries[entryIndices[0]] - right = partitions[partitionIndex].entries[entryIndices[1]] - block: - # Ensure left is actually on the left + var + left = partitions[partitionIndex].entries[entryIndices[0]] + right = partitions[partitionIndex].entries[entryIndices[1]] + block: + # Ensure left is actually on the left + let + maybeLeftMaxX = max(left.segment.at.x, left.segment.to.x) + maybeRightMaxX = max(right.segment.at.x, right.segment.to.x) + if maybeLeftMaxX > maybeRightMaxX: + swap left, right + + # Use trapezoid coverage at the edges and fill in the middle + + when allowSimd and defined(amd64): + let vecRgbx = mm_set_ps( + rgbx.a.float32, + rgbx.b.float32, + rgbx.g.float32, + rgbx.r.float32 + ) + + proc solveX(entry: PartitionEntry, y: float32): float32 = + if entry.m == 0: + entry.b + else: + (y - entry.b) / entry.m + + proc solveY(entry: PartitionEntry, x: float32): float32 = + entry.m * x + entry.b + + var + leftTop = vec2(0, y.float32) + leftBottom = vec2(0, (y + 1).float32) + leftTop.x = left.solveX(leftTop.y.float32) + leftBottom.x = left.solveX(leftBottom.y) + + var + rightTop = vec2(0, y.float32) + rightBottom = vec2(0, (y + 1).float32) + rightTop.x = right.solveX(rightTop.y) + rightBottom.x = right.solveX(rightBottom.y) + + let + leftMaxX = max(leftTop.x, leftBottom.x) + rightMinX = min(rightTop.x, rightBottom.x) + leftCoverEnd = leftMaxX.ceil.int + rightCoverBegin = rightMinX.trunc.int + + if leftCoverEnd < rightCoverBegin: + # Only take this shortcut if the partial coverage areas on the + # left and the right do not overlap + + let blender = blendMode.blender() + + block: # Left-side partial coverage let - maybeLeftMaxX = max(left.segment.at.x, left.segment.to.x) - maybeRightMaxX = max(right.segment.at.x, right.segment.to.x) - if maybeLeftMaxX > maybeRightMaxX: - swap left, right + inverted = leftTop.x < leftBottom.x + sliverStart = min(leftTop.x, leftBottom.x) + rectStart = max(leftTop.x, leftBottom.x) + var + pen = sliverStart + prevPen = pen + penY = if inverted: y.float32 else: (y + 1).float32 + prevPenY = penY + for x in sliverStart.int ..< rectStart.ceil.int: + prevPen = pen + pen = (x + 1).float32 + var rightRectArea = 0.float32 + if pen > rectStart: + rightRectArea = pen - rectStart + pen = rectStart + prevPenY = penY + penY = left.solveY(pen) + if x < 0 or x >= image.width: + continue + let + run = pen - prevPen + triangleArea = 0.5.float32 * run * abs(penY - prevPenY) + rectArea = + if inverted: + (prevPenY - y.float32) * run + else: + ((y + 1).float32 - prevPenY) * run + area = triangleArea + rectArea + rightRectArea + dataIndex = image.dataIndex(x, y) + backdrop = image.data[dataIndex] + source = + when allowSimd and defined(amd64): + applyOpacity(vecRgbx, area) + else: + rgbx * area + image.data[dataIndex] = blender(backdrop, source) - # We have 2 non-intersecting lines that require anti-aliasing - # Use trapezoid coverage at the edges and fill in the middle - - when allowSimd and defined(amd64): - let vecRgbx = mm_set_ps( - rgbx.a.float32, - rgbx.b.float32, - rgbx.g.float32, - rgbx.r.float32 - ) - - proc solveX(entry: PartitionEntry, y: float32): float32 = - if entry.m == 0: - entry.b - else: - (y - entry.b) / entry.m - - proc solveY(entry: PartitionEntry, x: float32): float32 = - entry.m * x + entry.b - - var - leftTop = vec2(0, y.float32) - leftBottom = vec2(0, (y + 1).float32) - leftTop.x = left.solveX(leftTop.y.float32) - leftBottom.x = left.solveX(leftBottom.y) - - var - rightTop = vec2(0, y.float32) - rightBottom = vec2(0, (y + 1).float32) - rightTop.x = right.solveX(rightTop.y) - rightBottom.x = right.solveX(rightBottom.y) + block: # Right-side partial coverage + let + inverted = rightTop.x > rightBottom.x + rectEnd = min(rightTop.x, rightBottom.x) + sliverEnd = max(rightTop.x, rightBottom.x) + var + pen = rectEnd + prevPen = pen + penY = if inverted: (y + 1).float32 else: y.float32 + prevPenY = penY + for x in rectEnd.int ..< sliverEnd.ceil.int: + prevPen = pen + pen = (x + 1).float32 + let leftRectArea = prevPen.fractional + if pen > sliverEnd: + pen = sliverEnd + prevPenY = penY + penY = right.solveY(pen) + if x < 0 or x >= image.width: + continue + let + run = pen - prevPen + triangleArea = 0.5.float32 * run * abs(penY - prevPenY) + rectArea = + if inverted: + (penY - y.float32) * run + else: + ((y + 1).float32 - penY) * run + area = leftRectArea + triangleArea + rectArea + dataIndex = image.dataIndex(x, y) + backdrop = image.data[dataIndex] + source = + when allowSimd and defined(amd64): + applyOpacity(vecRgbx, area) + else: + rgbx * area + image.data[dataIndex] = blender(backdrop, source) let - leftMaxX = max(leftTop.x, leftBottom.x) - rightMinX = min(rightTop.x, rightBottom.x) - leftCoverEnd = leftMaxX.ceil.int - rightCoverBegin = rightMinX.trunc.int + fillBegin = leftCoverEnd.clamp(0, image.width) + fillEnd = rightCoverBegin.clamp(0, image.width) + if fillEnd - fillBegin > 0: + hits[0] = (fixed32(fillBegin.float32), 1.int16) + hits[1] = (fixed32(fillEnd.float32), -1.int16) + image.fillHits(rgbx, 0, y, hits, 2, NonZero, blendMode) - if leftCoverEnd < rightCoverBegin: - # Only take this shortcut if the partial coverage areas on the - # left and the right do not overlap - - let blender = blendMode.blender() - - block: # Left-side partial coverage - let - inverted = leftTop.x < leftBottom.x - sliverStart = min(leftTop.x, leftBottom.x) - rectStart = max(leftTop.x, leftBottom.x) - var - pen = sliverStart - prevPen = pen - penY = if inverted: y.float32 else: (y + 1).float32 - prevPenY = penY - for x in sliverStart.int ..< rectStart.ceil.int: - prevPen = pen - pen = (x + 1).float32 - var rightRectArea = 0.float32 - if pen > rectStart: - rightRectArea = pen - rectStart - pen = rectStart - prevPenY = penY - penY = left.solveY(pen) - if x < 0 or x >= image.width: - continue - let - run = pen - prevPen - triangleArea = 0.5.float32 * run * abs(penY - prevPenY) - rectArea = - if inverted: - (prevPenY - y.float32) * run - else: - ((y + 1).float32 - prevPenY) * run - area = triangleArea + rectArea + rightRectArea - dataIndex = image.dataIndex(x, y) - backdrop = image.data[dataIndex] - source = - when allowSimd and defined(amd64): - applyOpacity(vecRgbx, area) - else: - rgbx * area - image.data[dataIndex] = blender(backdrop, source) - - block: # Right-side partial coverage - let - inverted = rightTop.x > rightBottom.x - rectEnd = min(rightTop.x, rightBottom.x) - sliverEnd = max(rightTop.x, rightBottom.x) - var - pen = rectEnd - prevPen = pen - penY = if inverted: (y + 1).float32 else: y.float32 - prevPenY = penY - for x in rectEnd.int ..< sliverEnd.ceil.int: - prevPen = pen - pen = (x + 1).float32 - let leftRectArea = prevPen.fractional - if pen > sliverEnd: - pen = sliverEnd - prevPenY = penY - penY = right.solveY(pen) - if x < 0 or x >= image.width: - continue - let - run = pen - prevPen - triangleArea = 0.5.float32 * run * abs(penY - prevPenY) - rectArea = - if inverted: - (penY - y.float32) * run - else: - ((y + 1).float32 - penY) * run - area = leftRectArea + triangleArea + rectArea - dataIndex = image.dataIndex(x, y) - backdrop = image.data[dataIndex] - source = - when allowSimd and defined(amd64): - applyOpacity(vecRgbx, area) - else: - rgbx * area - image.data[dataIndex] = blender(backdrop, source) - - let - fillBegin = leftCoverEnd.clamp(0, image.width) - fillEnd = rightCoverBegin.clamp(0, image.width) - if fillEnd - fillBegin > 0: - hits[0] = (fixed32(fillBegin.float32), 1.int16) - hits[1] = (fixed32(fillEnd.float32), -1.int16) - image.fillHits(rgbx, 0, y, hits, 2, NonZero, blendMode) - - inc y - continue + inc y + continue computeCoverage( cast[ptr UncheckedArray[uint8]](coverages[0].addr),