diff --git a/experiments/benchmark_cairo.nim b/experiments/bench_cairo.nim similarity index 100% rename from experiments/benchmark_cairo.nim rename to experiments/bench_cairo.nim diff --git a/experiments/benchmark_cairo_draw.nim b/experiments/bench_cairo_draw.nim similarity index 100% rename from experiments/benchmark_cairo_draw.nim rename to experiments/bench_cairo_draw.nim diff --git a/pixie.nimble b/pixie.nimble index c139064..4b819c3 100644 --- a/pixie.nimble +++ b/pixie.nimble @@ -7,10 +7,10 @@ srcDir = "src" requires "nim >= 1.4.8" requires "vmath >= 1.1.4" -requires "chroma >= 0.2.5" -requires "zippy >= 0.10.2" +requires "chroma >= 0.2.6" +requires "zippy >= 0.10.3" requires "flatty >= 0.3.4" -requires "nimsimd >= 1.1.6" +requires "nimsimd >= 1.1.7" requires "bumpy >= 1.1.1" task bindings, "Generate bindings": diff --git a/src/pixie/blends.nim b/src/pixie/blends.nim index e6a38be..2694c73 100644 --- a/src/pixie/blends.nim +++ b/src/pixie/blends.nim @@ -553,16 +553,11 @@ when defined(amd64) and allowSimd: var backdropEven = mm_slli_epi16(backdrop, 8) backdropOdd = mm_and_si128(backdrop, oddMask) - - # backdrop * k backdropEven = mm_mulhi_epu16(backdropEven, evenK) backdropOdd = mm_mulhi_epu16(backdropOdd, oddK) - - # div 255 backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) - # Shift from high to low bits sourceEven = mm_srli_epi16(sourceEven, 8) sourceOdd = mm_srli_epi16(sourceOdd, 8) @@ -582,12 +577,8 @@ when defined(amd64) and allowSimd: var backdropEven = mm_slli_epi16(backdrop, 8) backdropOdd = mm_and_si128(backdrop, oddMask) - - # backdrop * source backdropEven = mm_mulhi_epu16(backdropEven, sourceEven) backdropOdd = mm_mulhi_epu16(backdropOdd, sourceOdd) - - # div 255 backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index 8a6e2ef..e3466b2 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -104,9 +104,9 @@ proc toPremultipliedAlpha*( for i in 0 ..< data.len: var c = data[i] if c.a != 255: - c.r = ((c.r.uint32 * c.a) div 255).uint8 - c.g = ((c.g.uint32 * c.a) div 255).uint8 - c.b = ((c.b.uint32 * c.a) div 255).uint8 + c.r = ((c.r.uint32 * c.a + 127) div 255).uint8 + c.g = ((c.g.uint32 * c.a + 127) div 255).uint8 + c.b = ((c.b.uint32 * c.a + 127) div 255).uint8 data[i] = c proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool {.hasSimd.} = diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index a692692..be900bd 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -96,7 +96,8 @@ proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = let alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) oddMask = mm256_set1_epi16(0xff00) - div255 = mm256_set1_epi16(0x8081) + vec128 = mm256_set1_epi16(128) + hiMask = mm256_set1_epi16(255 shl 8) iterations = data.len div 8 for _ in 0 ..< iterations: let @@ -112,20 +113,24 @@ proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = colorsOdd = mm256_and_si256(values, oddMask) colorsEven = mm256_mulhi_epu16(colorsEven, evenMultiplier) colorsOdd = mm256_mulhi_epu16(colorsOdd, oddMultiplier) - colorsEven = mm256_srli_epi16(mm256_mulhi_epu16(colorsEven, div255), 7) - colorsOdd = mm256_srli_epi16(mm256_mulhi_epu16(colorsOdd, div255), 7) - mm256_storeu_si256( - data[i].addr, - mm256_or_si256(colorsEven, mm256_slli_epi16(colorsOdd, 8)) - ) + let + tmpEven = mm256_add_epi16(colorsEven, vec128) + tmpOdd = mm256_add_epi16(colorsOdd, vec128) + colorsEven = mm256_srli_epi16(tmpEven, 8) + colorsOdd = mm256_srli_epi16(tmpOdd, 8) + colorsEven = mm256_add_epi16(colorsEven, tmpEven) + colorsOdd = mm256_add_epi16(colorsOdd, tmpOdd) + colorsEven = mm256_srli_epi16(colorsEven, 8) + colorsOdd = mm256_and_si256(colorsOdd, hiMask) + mm256_storeu_si256(data[i].addr, mm256_or_si256(colorsEven, colorsOdd)) i += 8 for i in i ..< data.len: var c = data[i] if c.a != 255: - c.r = ((c.r.uint32 * c.a) div 255).uint8 - c.g = ((c.g.uint32 * c.a) div 255).uint8 - c.b = ((c.b.uint32 * c.a) div 255).uint8 + c.r = ((c.r.uint32 * c.a + 127) div 255).uint8 + c.g = ((c.g.uint32 * c.a + 127) div 255).uint8 + c.b = ((c.b.uint32 * c.a + 127) div 255).uint8 data[i] = c when defined(release): diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim index 1386ec0..739a224 100644 --- a/src/pixie/simd/neon.nim +++ b/src/pixie/simd/neon.nim @@ -123,6 +123,43 @@ proc isOpaqueNeon*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} = if data[i].a != 255: return false +proc toPremultipliedAlphaNeon*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = + var + i: int + p = cast[uint](data[0].addr) + # Align to 16 bytes + while i < data.len and (p and 15) != 0: + var c = data[i] + if c.a != 255: + c.r = ((c.r.uint32 * c.a + 127) div 255).uint8 + c.g = ((c.g.uint32 * c.a + 127) div 255).uint8 + c.b = ((c.b.uint32 * c.a + 127) div 255).uint8 + data[i] = c + inc i + p += 4 + + proc premultiply(c, a: uint8x8): uint8x8 {.inline.} = + let ca = vmull_u8(c, a) + vraddhn_u16(ca, vrshrq_n_u16(ca, 8)) + + let iterations = (data.len - i) div 8 + for _ in 0 ..< iterations: + var channels = vld4_u8(cast[pointer](p)) + channels.val[0] = premultiply(channels.val[0], channels.val[3]) + channels.val[1] = premultiply(channels.val[1], channels.val[3]) + channels.val[2] = premultiply(channels.val[2], channels.val[3]) + vst4_u8(cast[pointer](p), channels) + p += 32 + i += 8 + + for i in i ..< data.len: + var c = data[i] + if c.a != 255: + c.r = ((c.r.uint32 * c.a + 127) div 255).uint8 + c.g = ((c.g.uint32 * c.a + 127) div 255).uint8 + c.b = ((c.b.uint32 * c.a + 127) div 255).uint8 + data[i] = c + proc newImageNeon*(mask: Mask): Image {.simd.} = result = newImage(mask.width, mask.height) diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index 7421b9d..b5021fc 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -170,7 +170,8 @@ proc toPremultipliedAlphaSse2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = let alphaMask = mm_set1_epi32(cast[int32](0xff000000)) oddMask = mm_set1_epi16(0xff00) - div255 = mm_set1_epi16(0x8081) + vec128 = mm_set1_epi16(128) + hiMask = mm_set1_epi16(255 shl 8) iterations = data.len div 4 for _ in 0 ..< iterations: let @@ -186,20 +187,24 @@ proc toPremultipliedAlphaSse2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = colorsOdd = mm_and_si128(values, oddMask) colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier) colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier) - colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7) - colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7) - mm_storeu_si128( - data[i].addr, - mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8)) - ) + let + tmpEven = mm_add_epi16(colorsEven, vec128) + tmpOdd = mm_add_epi16(colorsOdd, vec128) + colorsEven = mm_srli_epi16(tmpEven, 8) + colorsOdd = mm_srli_epi16(tmpOdd, 8) + colorsEven = mm_add_epi16(colorsEven, tmpEven) + colorsOdd = mm_add_epi16(colorsOdd, tmpOdd) + colorsEven = mm_srli_epi16(colorsEven, 8) + colorsOdd = mm_and_si128(colorsOdd, hiMask) + mm_storeu_si128(data[i].addr, mm_or_si128(colorsEven, colorsOdd)) i += 4 for i in i ..< data.len: var c = data[i] if c.a != 255: - c.r = ((c.r.uint32 * c.a) div 255).uint8 - c.g = ((c.g.uint32 * c.a) div 255).uint8 - c.b = ((c.b.uint32 * c.a) div 255).uint8 + c.r = ((c.r.uint32 * c.a + 127) div 255).uint8 + c.g = ((c.g.uint32 * c.a + 127) div 255).uint8 + c.b = ((c.b.uint32 * c.a + 127) div 255).uint8 data[i] = c proc newImageSse2*(mask: Mask): Image {.simd.} = diff --git a/tests/test_images.nim b/tests/test_images.nim index 9351af0..1405b5c 100644 --- a/tests/test_images.nim +++ b/tests/test_images.nim @@ -121,7 +121,7 @@ block: let a = newImage(100, 100) a.fill(rgbx(50, 100, 150, 200)) a.invert() - doAssert a[0, 0] == rgbx(44, 33, 22, 55) + doAssert a[0, 0] == rgbx(44, 33, 23, 55) block: let ctx = newContext(100, 100) @@ -226,3 +226,29 @@ block: 292.0, 45.0, 1.0 ) ) + +block: + var + colors: seq[ColorRGBA] + premultiplied: seq[ColorRGBX] + for a in 0.uint8 .. 255: + for r in 0.uint8 .. 255: + let + rgba = rgba(r, 0, 0, a) + floats = rgba.color() + premul = color(floats.r * floats.a, 0, 0, floats.a) + rgbx = rgbx( + round(premul.r * 255).uint8, + 0, + 0, + round(premul.a * 255).uint8 + ) + colors.add(rgba) + premultiplied.add(rgbx) + + var converted = cast[seq[ColorRGBX]](colors) + toPremultipliedAlpha(converted) + + for i in 0 ..< premultiplied.len: + doAssert premultiplied[i] == converted[i] + doAssert colors[i].rgbx == converted[i] diff --git a/tests/test_images_draw.nim b/tests/test_images_draw.nim index c2d99f1..d6f3546 100644 --- a/tests/test_images_draw.nim +++ b/tests/test_images_draw.nim @@ -279,7 +279,7 @@ block: image.draw(strokeImage) image.xray("tests/images/fillOptimization.png") - doAssert image[10, 10] == rgbx(255, 127, 63, 255) + doAssert image[10, 10] == rgbx(255, 128, 64, 255) block: let a = newImage(100, 100)