From f8e8de9c5f1e8442af6ae3a637364f8f87718cb3 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Mon, 25 Jul 2022 23:50:32 -0500 Subject: [PATCH 1/4] faster --- src/pixie/images.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pixie/images.nim b/src/pixie/images.nim index 6fc99f4..ba48e7a 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -853,7 +853,7 @@ proc spread(image: Image, spread: float32) {.raises: [PixieError].} = maxValue = value if maxValue == 255: break - spreadX.unsafe[y, x] = rgbx(0, 0, 0, maxValue) + spreadX.unsafe[y, x].a = maxValue # Spread in the Y direction and modify mask. for y in 0 ..< image.height: From cc37dd9d54a8a3459a812d800ef976e48bee2c5c Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Mon, 25 Jul 2022 23:40:42 -0500 Subject: [PATCH 2/4] simd stuff --- src/pixie/simd/avx2.nim | 82 +++++++++++++++++++++++++++++++++++------ src/pixie/simd/sse2.nim | 78 +++++++++++++++++++++++++++++---------- tests/bench_images.nim | 4 ++ 3 files changed, 132 insertions(+), 32 deletions(-) diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index 4e89ea5..ea07247 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -91,7 +91,19 @@ proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} = return false proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = - var i: int + var + i: int + p = cast[uint](data[0].addr) + # Align to 32 bytes + while i < data.len and (p and 31) != 0: + var rgbx = data[i] + if rgbx.a != 255: + rgbx.r = ((rgbx.r.uint32 * rgbx.a + 127) div 255).uint8 + rgbx.g = ((rgbx.g.uint32 * rgbx.a + 127) div 255).uint8 + rgbx.b = ((rgbx.b.uint32 * rgbx.a + 127) div 255).uint8 + data[i] = rgbx + inc i + p += 4 let alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) @@ -101,7 +113,7 @@ proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = iterations = data.len div 8 for _ in 0 ..< iterations: let - values = mm256_loadu_si256(data[i].addr) + values = mm256_load_si256(cast[pointer](p)) alpha = mm256_and_si256(values, alphaMask) eq = mm256_cmpeq_epi8(values, alphaMask) if (mm256_movemask_epi8(eq) and 0x88888888) != 0x88888888: @@ -122,16 +134,17 @@ proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = colorsOdd = mm256_add_epi16(colorsOdd, tmpOdd) colorsEven = mm256_srli_epi16(colorsEven, 8) colorsOdd = mm256_and_si256(colorsOdd, hiMask) - mm256_storeu_si256(data[i].addr, mm256_or_si256(colorsEven, colorsOdd)) - i += 8 + mm256_store_si256(cast[pointer](p), mm256_or_si256(colorsEven, colorsOdd)) + p += 32 + i += 8 * iterations for i in i ..< data.len: - var c = data[i] - if c.a != 255: - c.r = ((c.r.uint32 * c.a + 127) div 255).uint8 - c.g = ((c.g.uint32 * c.a + 127) div 255).uint8 - c.b = ((c.b.uint32 * c.a + 127) div 255).uint8 - data[i] = c + var rgbx = data[i] + if rgbx.a != 255: + rgbx.r = ((rgbx.r.uint32 * rgbx.a + 127) div 255).uint8 + rgbx.g = ((rgbx.g.uint32 * rgbx.a + 127) div 255).uint8 + rgbx.b = ((rgbx.b.uint32 * rgbx.a + 127) div 255).uint8 + data[i] = rgbx proc invertAvx2*(image: Image) {.simd.} = var @@ -182,6 +195,16 @@ proc applyOpacityAvx2*(image: Image, opacity: float32) {.simd.} = var i: int p = cast[uint](image.data[0].addr) + # Align to 32 bytes + while i < image.data.len and (p and 31) != 0: + var rgbx = image.data[i] + rgbx.r = ((rgbx.r * opacity) div 255).uint8 + rgbx.g = ((rgbx.g * opacity) div 255).uint8 + rgbx.b = ((rgbx.b * opacity) div 255).uint8 + rgbx.a = ((rgbx.a * opacity) div 255).uint8 + image.data[i] = rgbx + inc i + p += 4 let oddMask = mm256_set1_epi16(0xff00) @@ -191,7 +214,7 @@ proc applyOpacityAvx2*(image: Image, opacity: float32) {.simd.} = iterations = image.data.len div 8 for _ in 0 ..< iterations: let - values = mm256_loadu_si256(cast[pointer](p)) + values = mm256_load_si256(cast[pointer](p)) eqZero = mm256_cmpeq_epi16(values, zeroVec) if mm256_movemask_epi8(eqZero) != cast[int32](0xffffffff): var @@ -201,7 +224,7 @@ proc applyOpacityAvx2*(image: Image, opacity: float32) {.simd.} = valuesOdd = mm256_mulhi_epu16(valuesOdd, opacityVec) valuesEven = mm256_srli_epi16(mm256_mulhi_epu16(valuesEven, div255), 7) valuesOdd = mm256_srli_epi16(mm256_mulhi_epu16(valuesOdd, div255), 7) - mm256_storeu_si256( + mm256_store_si256( cast[pointer](p), mm256_or_si256(valuesEven, mm256_slli_epi16(valuesOdd, 8)) ) @@ -216,5 +239,40 @@ proc applyOpacityAvx2*(image: Image, opacity: float32) {.simd.} = rgbx.a = ((rgbx.a * opacity) div 255).uint8 image.data[i] = rgbx +proc ceilAvx2*(image: Image) {.simd.} = + var + i: int + p = cast[uint](image.data[0].addr) + # Align to 32 bytes + while i < image.data.len and (p and 31) != 0: + var rgbx = image.data[i] + rgbx.r = if rgbx.r == 0: 0 else: 255 + rgbx.g = if rgbx.g == 0: 0 else: 255 + rgbx.b = if rgbx.b == 0: 0 else: 255 + rgbx.a = if rgbx.a == 0: 0 else: 255 + image.data[i] = rgbx + inc i + p += 4 + + let + vecZero = mm256_setzero_si256() + vec255 = mm256_set1_epi8(255) + iterations = image.data.len div 8 + for _ in 0 ..< iterations: + var values = mm256_load_si256(cast[pointer](p)) + values = mm256_cmpeq_epi8(values, vecZero) + values = mm256_andnot_si256(values, vec255) + mm256_store_si256(cast[pointer](p), values) + p += 32 + i += 8 * iterations + + for i in i ..< image.data.len: + var rgbx = image.data[i] + rgbx.r = if rgbx.r == 0: 0 else: 255 + rgbx.g = if rgbx.g == 0: 0 else: 255 + rgbx.b = if rgbx.b == 0: 0 else: 255 + rgbx.a = if rgbx.a == 0: 0 else: 255 + image.data[i] = rgbx + when defined(release): {.pop.} diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index 313c756..6b4f78c 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -10,20 +10,6 @@ proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} = finalColor = mm_packus_epi16(finalColor, mm_setzero_si128()) cast[ColorRGBX](mm_cvtsi128_si32(finalColor)) -proc packAlphaValues(v: M128i): M128i {.inline.} = - ## Shuffle the alpha values for these 4 colors to the first 4 bytes. - result = mm_srli_epi32(v, 24) - result = mm_packus_epi16(result, mm_setzero_si128()) - result = mm_packus_epi16(result, mm_setzero_si128()) - -proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline.} = - let - i = packAlphaValues(i) - j = mm_slli_si128(packAlphaValues(j), 4) - k = mm_slli_si128(packAlphaValues(k), 8) - l = mm_slli_si128(packAlphaValues(l), 12) - mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l)) - proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} = ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value). result = mm_unpacklo_epi8(mm_setzero_si128(), v) @@ -167,6 +153,8 @@ proc isOpaqueSse2*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} = proc toPremultipliedAlphaSse2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = var i: int + # Not worth aligning + let alphaMask = mm_set1_epi32(cast[int32](0xff000000)) oddMask = mm_set1_epi16(0xff00) @@ -200,12 +188,12 @@ proc toPremultipliedAlphaSse2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = i += 4 for i in i ..< data.len: - var c = data[i] - if c.a != 255: - c.r = ((c.r.uint32 * c.a + 127) div 255).uint8 - c.g = ((c.g.uint32 * c.a + 127) div 255).uint8 - c.b = ((c.b.uint32 * c.a + 127) div 255).uint8 - data[i] = c + var rgbx = data[i] + if rgbx.a != 255: + rgbx.r = ((rgbx.r.uint32 * rgbx.a + 127) div 255).uint8 + rgbx.g = ((rgbx.g.uint32 * rgbx.a + 127) div 255).uint8 + rgbx.b = ((rgbx.b.uint32 * rgbx.a + 127) div 255).uint8 + data[i] = rgbx proc invertSse2*(image: Image) {.simd.} = var @@ -260,6 +248,16 @@ proc applyOpacitySse2*(image: Image, opacity: float32) {.simd.} = var i: int p = cast[uint](image.data[0].addr) + # Align to 16 bytes + while i < image.data.len and (p and 15) != 0: + var rgbx = image.data[i] + rgbx.r = ((rgbx.r * opacity) div 255).uint8 + rgbx.g = ((rgbx.g * opacity) div 255).uint8 + rgbx.b = ((rgbx.b * opacity) div 255).uint8 + rgbx.a = ((rgbx.a * opacity) div 255).uint8 + image.data[i] = rgbx + inc i + p += 4 let oddMask = mm_set1_epi16(0xff00) @@ -292,6 +290,46 @@ proc applyOpacitySse2*(image: Image, opacity: float32) {.simd.} = rgbx.a = ((rgbx.a * opacity) div 255).uint8 image.data[i] = rgbx +proc ceilSse2*(image: Image) {.simd.} = + var + i: int + p = cast[uint](image.data[0].addr) + # Align to 16 bytes + while i < image.data.len and (p and 15) != 0: + var rgbx = image.data[i] + rgbx.r = if rgbx.r == 0: 0 else: 255 + rgbx.g = if rgbx.g == 0: 0 else: 255 + rgbx.b = if rgbx.b == 0: 0 else: 255 + rgbx.a = if rgbx.a == 0: 0 else: 255 + image.data[i] = rgbx + inc i + p += 4 + + let + vecZero = mm_setzero_si128() + vec255 = mm_set1_epi8(255) + iterations = image.data.len div 8 + for _ in 0 ..< iterations: + var + values0 = mm_loadu_si128(cast[pointer](p)) + values1 = mm_loadu_si128(cast[pointer](p + 16)) + values0 = mm_cmpeq_epi8(values0, vecZero) + values1 = mm_cmpeq_epi8(values1, vecZero) + values0 = mm_andnot_si128(values0, vec255) + values1 = mm_andnot_si128(values1, vec255) + mm_storeu_si128(cast[pointer](p), values0) + mm_storeu_si128(cast[pointer](p + 16), values1) + p += 32 + i += 8 * iterations + + for i in i ..< image.data.len: + var rgbx = image.data[i] + rgbx.r = if rgbx.r == 0: 0 else: 255 + rgbx.g = if rgbx.g == 0: 0 else: 255 + rgbx.b = if rgbx.b == 0: 0 else: 255 + rgbx.a = if rgbx.a == 0: 0 else: 255 + image.data[i] = rgbx + proc blitLineNormalSse2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = diff --git a/tests/bench_images.nim b/tests/bench_images.nim index 28ab7aa..423cd57 100644 --- a/tests/bench_images.nim +++ b/tests/bench_images.nim @@ -89,6 +89,10 @@ timeIt "toStraightAlpha": reset() +timeIt "ceil": + reset() + image.ceil() + block: let image = newImage(200, 200) image.fill(rgbx(255, 0, 0, 255)) From 5ca6f57aaef1f48e8e07e21c49d7695f893ac643 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Tue, 26 Jul 2022 12:47:58 -0500 Subject: [PATCH 3/4] fix --- experiments/bench_cairo.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/bench_cairo.nim b/experiments/bench_cairo.nim index ee862fc..3cb0f04 100644 --- a/experiments/bench_cairo.nim +++ b/experiments/bench_cairo.nim @@ -174,7 +174,7 @@ block: # Tiger props.strokeDashArray, props.transform.pixelScale ) - let paint = newPaint(props.stroke) + let paint = props.stroke.copy() paint.color.a *= (props.opacity * props.strokeOpacity) fills.add(Fill( shapes: strokeShapes, From e5c4ba1605416da19722ba74a6249f9a31a8b608 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Tue, 26 Jul 2022 13:56:13 -0500 Subject: [PATCH 4/4] minifyBy2 magnifyBy2 simd --- src/pixie/common.nim | 22 +++++++ src/pixie/images.nim | 124 +++++++++-------------------------- src/pixie/internal.nim | 12 ---- src/pixie/simd/avx2.nim | 103 +++++++++++++++++++++++++++++ src/pixie/simd/sse2.nim | 140 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 297 insertions(+), 104 deletions(-) diff --git a/src/pixie/common.nim b/src/pixie/common.nim index 8b04458..f558666 100644 --- a/src/pixie/common.nim +++ b/src/pixie/common.nim @@ -46,6 +46,16 @@ proc newImage*(width, height: int): Image {.raises: [PixieError].} = result.height = height result.data = newSeq[ColorRGBX](width * height) +proc copy*(image: Image): Image {.raises: [].} = + ## Copies the image data into a new image. + result = Image() + result.width = image.width + result.height = image.height + result.data = image.data + +template dataIndex*(image: Image, x, y: int): int = + image.width * y + x + proc mix*(a, b: uint8, t: float32): uint8 {.inline, raises: [].} = ## Linearly interpolate between a and b using t. let t = round(t * 255).uint32 @@ -59,6 +69,18 @@ proc mix*(a, b: ColorRGBX, t: float32): ColorRGBX {.inline, raises: [].} = result.b = ((a.b.uint32 * (255 - x) + b.b.uint32 * x) div 255).uint8 result.a = ((a.a.uint32 * (255 - x) + b.a.uint32 * x) div 255).uint8 +proc `*`*(color: ColorRGBX, opacity: float32): ColorRGBX {.raises: [].} = + if opacity == 0: + rgbx(0, 0, 0, 0) + else: + let + x = round(opacity * 255).uint32 + r = ((color.r * x) div 255).uint8 + g = ((color.g * x) div 255).uint8 + b = ((color.b * x) div 255).uint8 + a = ((color.a * x) div 255).uint8 + rgbx(r, g, b, a) + proc snapToPixels*(rect: Rect): Rect {.raises: [].} = let xMin = rect.x diff --git a/src/pixie/images.nim b/src/pixie/images.nim index ba48e7a..d2871e4 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -1,6 +1,6 @@ import blends, bumpy, chroma, common, internal, simd, vmath -export Image, newImage +export Image, newImage, copy, dataIndex const h = 0.5.float32 @@ -9,13 +9,6 @@ type UnsafeImage = distinct Image when defined(release): {.push checks: off.} -proc copy*(image: Image): Image {.raises: [].} = - ## Copies the image data into a new image. - result = Image() - result.width = image.width - result.height = image.height - result.data = image.data - proc `$`*(image: Image): string {.raises: [].} = ## Prints the image size. "" @@ -24,9 +17,6 @@ proc inside*(image: Image, x, y: int): bool {.inline, raises: [].} = ## Returns true if (x, y) is inside the image. x >= 0 and x < image.width and y >= 0 and y < image.height -proc dataIndex*(image: Image, x, y: int): int {.inline, raises: [].} = - image.width * y + x - template unsafe*(src: Image): UnsafeImage = cast[UnsafeImage](src) @@ -167,7 +157,9 @@ proc diff*(master, image: Image): (float32, Image) {.raises: [PixieError].} = (100 * diffScore.float32 / diffTotal.float32, diffImage) -proc minifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} = +proc minifyBy2*( + image: Image, power = 1 +): Image {.hasSimd, raises: [PixieError].} = ## Scales the image down by an integer scale. if power < 0: raise newException(PixieError, "Cannot minifyBy2 with negative power") @@ -188,90 +180,50 @@ proc minifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} = if srcHeightIsOdd: resultEvenHeight + 1 else: resultEvenHeight ) for y in 0 ..< resultEvenHeight: - var x: int - when defined(amd64) and allowSimd: + let + topRowStart = src.dataIndex(0, y * 2) + bottomRowStart = src.dataIndex(0, y * 2 + 1) + for x in 0 ..< resultEvenWidth: let - oddMask = mm_set1_epi16(cast[int16](0xff00)) - mergedMask = mm_set_epi32(0, uint32.high, 0, uint32.high) - for _ in countup(0, resultEvenWidth - 4, 2): - let - top = mm_loadu_si128(src.data[src.dataIndex(x * 2, y * 2 + 0)].addr) - btm = mm_loadu_si128(src.data[src.dataIndex(x * 2, y * 2 + 1)].addr) - topShifted = mm_srli_si128(top, 4) - btmShifted = mm_srli_si128(btm, 4) - - topEven = mm_andnot_si128(oddMask, top) - topOdd = mm_srli_epi16(top, 8) - btmEven = mm_andnot_si128(oddMask, btm) - btmOdd = mm_srli_epi16(btm, 8) - - topShiftedEven = mm_andnot_si128(oddMask, topShifted) - topShiftedOdd = mm_srli_epi16(topShifted, 8) - btmShiftedEven = mm_andnot_si128(oddMask, btmShifted) - btmShiftedOdd = mm_srli_epi16(btmShifted, 8) - - topAddedEven = mm_add_epi16(topEven, topShiftedEven) - btmAddedEven = mm_add_epi16(btmEven, btmShiftedEven) - topAddedOdd = mm_add_epi16(topOdd, topShiftedOdd) - btmAddedOdd = mm_add_epi16(btmOdd, btmShiftedOdd) - - addedEven = mm_add_epi16(topAddedEven, btmAddedEven) - addedOdd = mm_add_epi16(topAddedOdd, btmAddedOdd) - - addedEvenDiv4 = mm_srli_epi16(addedEven, 2) - addedOddDiv4 = mm_srli_epi16(addedOdd, 2) - - merged = mm_or_si128(addedEvenDiv4, mm_slli_epi16(addedOddDiv4, 8)) - # Merged has the correct values for the next two pixels at - # index 0 and 2 so mask the others out and shift 0 and 2 into - # position and store - masked = mm_and_si128(merged, mergedMask) - - mm_storeu_si128( - result.data[result.dataIndex(x, y)].addr, - mm_shuffle_epi32(masked, MM_SHUFFLE(0, 0, 2, 0)) - ) - x += 2 - - for x in x ..< resultEvenWidth: - let - a = src.unsafe[x * 2 + 0, y * 2 + 0] - b = src.unsafe[x * 2 + 1, y * 2 + 0] - c = src.unsafe[x * 2 + 1, y * 2 + 1] - d = src.unsafe[x * 2 + 0, y * 2 + 1] + a = src.data[topRowStart + x * 2] + b = src.data[topRowStart + x * 2 + 1] + c = src.data[bottomRowStart + x * 2 + 1] + d = src.data[bottomRowStart + x * 2] mixed = rgbx( ((a.r.uint32 + b.r + c.r + d.r) div 4).uint8, ((a.g.uint32 + b.g + c.g + d.g) div 4).uint8, ((a.b.uint32 + b.b + c.b + d.b) div 4).uint8, ((a.a.uint32 + b.a + c.a + d.a) div 4).uint8 ) - result.unsafe[x, y] = mixed + result.data[result.dataIndex(x, y)] = mixed if srcWidthIsOdd: let rgbx = mix( - src.unsafe[src.width - 1, y * 2 + 0], - src.unsafe[src.width - 1, y * 2 + 1], + src.data[src.dataIndex(src.width - 1, y * 2 + 0)], + src.data[src.dataIndex(src.width - 1, y * 2 + 1)], 0.5 ) * 0.5 - result.unsafe[result.width - 1, y] = rgbx + result.data[result.dataIndex(result.width - 1, y)] = rgbx if srcHeightIsOdd: for x in 0 ..< resultEvenWidth: let rgbx = mix( - src.unsafe[x * 2 + 0, src.height - 1], - src.unsafe[x * 2 + 1, src.height - 1], + src.data[src.dataIndex(x * 2 + 0, src.height - 1)], + src.data[src.dataIndex(x * 2 + 1, src.height - 1)], 0.5 ) * 0.5 - result.unsafe[x, result.height - 1] = rgbx + result.data[result.dataIndex(x, result.height - 1)] = rgbx if srcWidthIsOdd: - result.unsafe[result.width - 1, result.height - 1] = - src.unsafe[src.width - 1, src.height - 1] * 0.25 + result.data[result.dataIndex(result.width - 1, result.height - 1)] = + src.data[src.dataIndex(src.width - 1, src.height - 1)] * 0.25 # Set src as this result for if we do another power src = result -proc magnifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} = +proc magnifyBy2*( + image: Image, power = 1 +): Image {.hasSimd, raises: [PixieError].} = ## Scales image up by 2 ^ power. if power < 0: raise newException(PixieError, "Cannot magnifyBy2 with negative power") @@ -281,32 +233,20 @@ proc magnifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} = for y in 0 ..< image.height: # Write one row of pixels duplicated by scale - var x: int - when defined(amd64) and allowSimd: - if scale == 2: - while x <= image.width - 4: - let values = mm_loadu_si128(image.data[image.dataIndex(x, y)].addr) - mm_storeu_si128( - result.data[result.dataIndex(x * scale + 0, y * scale)].addr, - mm_unpacklo_epi32(values, values) - ) - mm_storeu_si128( - result.data[result.dataIndex(x * scale + 4, y * scale)].addr, - mm_unpackhi_epi32(values, values) - ) - x += 4 - for x in x ..< image.width: + let + sourceRowStart = image.dataIndex(0, y) + resultRowStart = result.dataIndex(0, y * scale) + for x in 0 ..< image.width: let - rgbx = image.unsafe[x, y] - resultIdx = result.dataIndex(x * scale, y * scale) + rgbx = image.data[sourceRowStart + x] + resultIdx = resultRowStart + x * scale for i in 0 ..< scale: result.data[resultIdx + i] = rgbx # Copy that row of pixels into (scale - 1) more rows - let rowStart = result.dataIndex(0, y * scale) for i in 1 ..< scale: copyMem( - result.data[rowStart + result.width * i].addr, - result.data[rowStart].addr, + result.data[resultRowStart + result.width * i].addr, + result.data[resultRowStart].addr, result.width * 4 ) diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index 3f00fca..0120333 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -33,18 +33,6 @@ proc gaussianKernel*(radius: int): seq[uint16] {.raises: [].} = for i, f in floats: result[i] = round(f * 255 * 256).uint16 -proc `*`*(color: ColorRGBX, opacity: float32): ColorRGBX {.raises: [].} = - if opacity == 0: - rgbx(0, 0, 0, 0) - else: - let - x = round(opacity * 255).uint32 - r = ((color.r * x) div 255).uint8 - g = ((color.g * x) div 255).uint8 - b = ((color.b * x) div 255).uint8 - a = ((color.a * x) div 255).uint8 - rgbx(r, g, b, a) - proc intersectsInside*(a, b: Segment, at: var Vec2): bool {.inline.} = ## Checks if the a segment intersects b segment (excluding endpoints). ## If it returns true, at will have point of intersection diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index ea07247..3e36f8f 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -274,5 +274,108 @@ proc ceilAvx2*(image: Image) {.simd.} = rgbx.a = if rgbx.a == 0: 0 else: 255 image.data[i] = rgbx +proc minifyBy2Avx2*(image: Image, power = 1): Image {.simd.} = + ## Scales the image down by an integer scale. + if power < 0: + raise newException(PixieError, "Cannot minifyBy2 with negative power") + if power == 0: + return image.copy() + + var src = image + for _ in 1 .. power: + # When minifying an image of odd size, round the result image size up + # so a 99 x 99 src image returns a 50 x 50 image. + let + srcWidthIsOdd = (src.width mod 2) != 0 + srcHeightIsOdd = (src.height mod 2) != 0 + resultEvenWidth = src.width div 2 + resultEvenHeight = src.height div 2 + result = newImage( + if srcWidthIsOdd: resultEvenWidth + 1 else: resultEvenWidth, + if srcHeightIsOdd: resultEvenHeight + 1 else: resultEvenHeight + ) + let + oddMask = mm256_set1_epi16(0xff00) + mergedMask = mm256_set_epi32( + 0, uint32.high, 0, uint32.high, 0, uint32.high, 0, uint32.high + ) + permuteControl = mm256_set_epi32(7, 7, 7, 7, 6, 4, 2, 0) + for y in 0 ..< resultEvenHeight: + let + topRowStart = src.dataIndex(0, y * 2) + bottomRowStart = src.dataIndex(0, y * 2 + 1) + + var x: int + while x <= resultEvenWidth - 8: + let + top = mm256_loadu_si256(src.data[topRowStart + x * 2].addr) + bottom = mm256_loadu_si256(src.data[bottomRowStart + x * 2].addr) + topShifted = mm256_srli_si256(top, 4) + bottomShifted = mm256_srli_si256(bottom, 4) + topEven = mm256_andnot_si256(oddMask, top) + topOdd = mm256_srli_epi16(top, 8) + bottomEven = mm256_andnot_si256(oddMask, bottom) + bottomOdd = mm256_srli_epi16(bottom, 8) + topShiftedEven = mm256_andnot_si256(oddMask, topShifted) + topShiftedOdd = mm256_srli_epi16(topShifted, 8) + bottomShiftedEven = mm256_andnot_si256(oddMask, bottomShifted) + bottomShiftedOdd = mm256_srli_epi16(bottomShifted, 8) + topAddedEven = mm256_add_epi16(topEven, topShiftedEven) + bottomAddedEven = mm256_add_epi16(bottomEven, bottomShiftedEven) + topAddedOdd = mm256_add_epi16(topOdd, topShiftedOdd) + bottomAddedOdd = mm256_add_epi16(bottomOdd, bottomShiftedOdd) + addedEven = mm256_add_epi16(topAddedEven, bottomAddedEven) + addedOdd = mm256_add_epi16(topAddedOdd, bottomAddedOdd) + addedEvenDiv4 = mm256_srli_epi16(addedEven, 2) + addedOddDiv4 = mm256_srli_epi16(addedOdd, 2) + merged = mm256_or_si256(addedEvenDiv4, mm256_slli_epi16(addedOddDiv4, 8)) + # Merged has the correct values for the next two pixels at + # index 0, 2, 4, 6 so mask the others out and permute into position + masked = mm256_and_si256(merged, mergedMask) + permuted = mm_256_permutevar8x32_epi32(masked, permuteControl) + mm_storeu_si128( + result.data[result.dataIndex(x, y)].addr, + mm256_castsi256_si128(permuted) + ) + x += 4 + + for x in x ..< resultEvenWidth: + let + a = src.data[topRowStart + x * 2] + b = src.data[topRowStart + x * 2 + 1] + c = src.data[bottomRowStart + x * 2 + 1] + d = src.data[bottomRowStart + x * 2] + mixed = rgbx( + ((a.r.uint32 + b.r + c.r + d.r) div 4).uint8, + ((a.g.uint32 + b.g + c.g + d.g) div 4).uint8, + ((a.b.uint32 + b.b + c.b + d.b) div 4).uint8, + ((a.a.uint32 + b.a + c.a + d.a) div 4).uint8 + ) + result.data[result.dataIndex(x, y)] = mixed + + if srcWidthIsOdd: + let rgbx = mix( + src.data[src.dataIndex(src.width - 1, y * 2 + 0)], + src.data[src.dataIndex(src.width - 1, y * 2 + 1)], + 0.5 + ) * 0.5 + result.data[result.dataIndex(result.width - 1, y)] = rgbx + + if srcHeightIsOdd: + for x in 0 ..< resultEvenWidth: + let rgbx = mix( + src.data[src.dataIndex(x * 2 + 0, src.height - 1)], + src.data[src.dataIndex(x * 2 + 1, src.height - 1)], + 0.5 + ) * 0.5 + result.data[result.dataIndex(x, result.height - 1)] = rgbx + + if srcWidthIsOdd: + result.data[result.dataIndex(result.width - 1, result.height - 1)] = + src.data[src.dataIndex(src.width - 1, src.height - 1)] * 0.25 + + # Set src as this result for if we do another power + src = result + when defined(release): {.pop.} diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index 6b4f78c..815b880 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -330,6 +330,146 @@ proc ceilSse2*(image: Image) {.simd.} = rgbx.a = if rgbx.a == 0: 0 else: 255 image.data[i] = rgbx +proc minifyBy2Sse2*(image: Image, power = 1): Image {.simd.} = + ## Scales the image down by an integer scale. + if power < 0: + raise newException(PixieError, "Cannot minifyBy2 with negative power") + if power == 0: + return image.copy() + + var src = image + for _ in 1 .. power: + # When minifying an image of odd size, round the result image size up + # so a 99 x 99 src image returns a 50 x 50 image. + let + srcWidthIsOdd = (src.width mod 2) != 0 + srcHeightIsOdd = (src.height mod 2) != 0 + resultEvenWidth = src.width div 2 + resultEvenHeight = src.height div 2 + result = newImage( + if srcWidthIsOdd: resultEvenWidth + 1 else: resultEvenWidth, + if srcHeightIsOdd: resultEvenHeight + 1 else: resultEvenHeight + ) + let + oddMask = mm_set1_epi16(0xff00) + mergedMask = mm_set_epi32(0, uint32.high, 0, uint32.high) + for y in 0 ..< resultEvenHeight: + let + topRowStart = src.dataIndex(0, y * 2) + bottomRowStart = src.dataIndex(0, y * 2 + 1) + + var x: int + while x <= resultEvenWidth - 4: + let + top = mm_loadu_si128(src.data[topRowStart + x * 2].addr) + bottom = mm_loadu_si128(src.data[bottomRowStart + x * 2].addr) + topShifted = mm_srli_si128(top, 4) + bottomShifted = mm_srli_si128(bottom, 4) + topEven = mm_andnot_si128(oddMask, top) + topOdd = mm_srli_epi16(top, 8) + bottomEven = mm_andnot_si128(oddMask, bottom) + bottomOdd = mm_srli_epi16(bottom, 8) + topShiftedEven = mm_andnot_si128(oddMask, topShifted) + topShiftedOdd = mm_srli_epi16(topShifted, 8) + bottomShiftedEven = mm_andnot_si128(oddMask, bottomShifted) + bottomShiftedOdd = mm_srli_epi16(bottomShifted, 8) + topAddedEven = mm_add_epi16(topEven, topShiftedEven) + bottomAddedEven = mm_add_epi16(bottomEven, bottomShiftedEven) + topAddedOdd = mm_add_epi16(topOdd, topShiftedOdd) + bottomAddedOdd = mm_add_epi16(bottomOdd, bottomShiftedOdd) + addedEven = mm_add_epi16(topAddedEven, bottomAddedEven) + addedOdd = mm_add_epi16(topAddedOdd, bottomAddedOdd) + addedEvenDiv4 = mm_srli_epi16(addedEven, 2) + addedOddDiv4 = mm_srli_epi16(addedOdd, 2) + merged = mm_or_si128(addedEvenDiv4, mm_slli_epi16(addedOddDiv4, 8)) + # Merged has the correct values for the next two pixels at + # index 0 and 2 so mask the others out and shift 0 and 2 into + # position and store + masked = mm_and_si128(merged, mergedMask) + mm_storeu_si128( + result.data[result.dataIndex(x, y)].addr, + mm_shuffle_epi32(masked, MM_SHUFFLE(3, 3, 2, 0)) + ) + x += 2 + + for x in x ..< resultEvenWidth: + let + a = src.data[topRowStart + x * 2] + b = src.data[topRowStart + x * 2 + 1] + c = src.data[bottomRowStart + x * 2 + 1] + d = src.data[bottomRowStart + x * 2] + mixed = rgbx( + ((a.r.uint32 + b.r + c.r + d.r) div 4).uint8, + ((a.g.uint32 + b.g + c.g + d.g) div 4).uint8, + ((a.b.uint32 + b.b + c.b + d.b) div 4).uint8, + ((a.a.uint32 + b.a + c.a + d.a) div 4).uint8 + ) + result.data[result.dataIndex(x, y)] = mixed + + if srcWidthIsOdd: + let rgbx = mix( + src.data[src.dataIndex(src.width - 1, y * 2 + 0)], + src.data[src.dataIndex(src.width - 1, y * 2 + 1)], + 0.5 + ) * 0.5 + result.data[result.dataIndex(result.width - 1, y)] = rgbx + + if srcHeightIsOdd: + for x in 0 ..< resultEvenWidth: + let rgbx = mix( + src.data[src.dataIndex(x * 2 + 0, src.height - 1)], + src.data[src.dataIndex(x * 2 + 1, src.height - 1)], + 0.5 + ) * 0.5 + result.data[result.dataIndex(x, result.height - 1)] = rgbx + + if srcWidthIsOdd: + result.data[result.dataIndex(result.width - 1, result.height - 1)] = + src.data[src.dataIndex(src.width - 1, src.height - 1)] * 0.25 + + # Set src as this result for if we do another power + src = result + +proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} = + ## Scales image up by 2 ^ power. + if power < 0: + raise newException(PixieError, "Cannot magnifyBy2 with negative power") + + let scale = 2 ^ power + result = newImage(image.width * scale, image.height * scale) + + for y in 0 ..< image.height: + # Write one row of pixels duplicated by scale + let + sourceRowStart = image.dataIndex(0, y) + resultRowStart = result.dataIndex(0, y * scale) + var x: int + if scale == 2: + while x <= image.width - 4: + let values = mm_loadu_si128(image.data[sourceRowStart + x].addr) + mm_storeu_si128( + result.data[resultRowStart + x * scale].addr, + mm_unpacklo_epi32(values, values) + ) + mm_storeu_si128( + result.data[resultRowStart + x * scale + 4].addr, + mm_unpackhi_epi32(values, values) + ) + x += 4 + for x in x ..< image.width: + let + rgbx = image.data[sourceRowStart + x] + resultIdx = resultRowStart + x * scale + for i in 0 ..< scale: + result.data[resultIdx + i] = rgbx + # Copy that row of pixels into (scale - 1) more rows + for i in 1 ..< scale: + copyMem( + result.data[resultRowStart + result.width * i].addr, + result.data[resultRowStart].addr, + result.width * 4 + ) + proc blitLineNormalSse2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} =