diff --git a/src/pixie/fileformats/png.nim b/src/pixie/fileformats/png.nim index f53bfaf..cebfb80 100644 --- a/src/pixie/fileformats/png.nim +++ b/src/pixie/fileformats/png.nim @@ -129,16 +129,25 @@ proc unfilter( uncompressedStartIdx = uncompressedIdx(1, y) unfilteredStartIx = unfiteredIdx(0, y) var x: int - when allowSimd and defined(amd64): + when allowSimd and (defined(amd64) or defined(arm64)): if y - 1 >= 0: for _ in 0 ..< rowBytes div 16: - let - bytes = mm_loadu_si128(uncompressed[uncompressedStartIdx + x].addr) - up = mm_loadu_si128(result[unfilteredStartIx + x - rowBytes].addr) - mm_storeu_si128( - result[unfilteredStartIx + x].addr, - mm_add_epi8(bytes, up) - ) + when defined(amd64): + let + bytes = mm_loadu_si128(uncompressed[uncompressedStartIdx + x].addr) + up = mm_loadu_si128(result[unfilteredStartIx + x - rowBytes].addr) + mm_storeu_si128( + result[unfilteredStartIx + x].addr, + mm_add_epi8(bytes, up) + ) + else: # arm64 + let + bytes = vld1q_u8(uncompressed[uncompressedStartIdx + x].addr) + up = vld1q_u8(result[unfilteredStartIx + x - rowBytes].addr) + vst1q_u8( + result[unfilteredStartIx + x].addr, + vaddq_u8(bytes, up) + ) x += 16 for x in x ..< rowBytes: var value = uncompressed[uncompressedStartIdx + x] diff --git a/src/pixie/images.nim b/src/pixie/images.nim index f444328..5c3ac20 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -338,8 +338,9 @@ proc blur*( var values: array[4, uint32] for xx in x - radius ..< min(x + radius, 0): values += outOfBounds * kernel[xx - x + radius] + var idx = image.dataIndex(0, y) for xx in max(x - radius, 0) .. min(x + radius, image.width - 1): - values += image.unsafe[xx, y] * kernel[xx - x + radius] + values += image.data[idx + xx] * kernel[xx - x + radius] for xx in max(x - radius, image.width) .. x + radius: values += outOfBounds * kernel[xx - x + radius] blurX.unsafe[y, x] = rgbx(values) @@ -350,8 +351,9 @@ proc blur*( var values: array[4, uint32] for yy in y - radius ..< min(y + radius, 0): values += outOfBounds * kernel[yy - y + radius] + var idx = blurX.dataIndex(0, x) for yy in max(y - radius, 0) .. min(y + radius, image.height - 1): - values += blurX.unsafe[yy, x] * kernel[yy - y + radius] + values += blurX.data[idx + yy] * kernel[yy - y + radius] for yy in max(y - radius, image.height) .. y + radius: values += outOfBounds * kernel[yy - y + radius] image.unsafe[x, y] = rgbx(values) @@ -447,7 +449,9 @@ proc blendLineOverwrite( ) {.inline.} = copyMem(a[0].addr, b[0].addr, len * 4) -proc blendLineNormal(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} = +proc blendLineNormal( + a, b: ptr UncheckedArray[ColorRGBX], len: int +) {.hasSimd.} = for i in 0 ..< len: a[i] = blendNormal(a[i], b[i]) diff --git a/src/pixie/paints.nim b/src/pixie/paints.nim index 5cfd451..530932c 100644 --- a/src/pixie/paints.nim +++ b/src/pixie/paints.nim @@ -119,7 +119,7 @@ proc fillGradientLinear(image: Image, paint: Paint) = if at.y == to.y: # Horizontal gradient var x: int while x < image.width: - when defined(amd64) and allowSimd: + when allowSimd and (defined(amd64) or defined(arm64)): if x + 4 <= image.width: var colors: array[4, ColorRGBX] for i in 0 ..< 4: @@ -128,10 +128,14 @@ proc fillGradientLinear(image: Image, paint: Paint) = t = toLineSpace(at, to, xy) rgbx = paint.gradientColor(t) colors[i] = rgbx - - let colorVec = cast[M128i](colors) - for y in 0 ..< image.height: - mm_storeu_si128(image.data[image.dataIndex(x, y)].addr, colorVec) + when defined(amd64): + let colorVec = mm_loadu_si128(colors[0].addr) + for y in 0 ..< image.height: + mm_storeu_si128(image.data[image.dataIndex(x, y)].addr, colorVec) + else: # arm64 + let colorVec = vld1q_u32(colors[0].addr) + for y in 0 ..< image.height: + vst1q_u32(image.data[image.dataIndex(x, y)].addr, colorVec) x += 4 continue @@ -150,11 +154,17 @@ proc fillGradientLinear(image: Image, paint: Paint) = t = toLineSpace(at, to, xy) rgbx = paint.gradientColor(t) var x: int - when defined(amd64) and allowSimd: - let colorVec = mm_set1_epi32(cast[int32](rgbx)) - for _ in 0 ..< image.width div 4: - mm_storeu_si128(image.data[image.dataIndex(x, y)].addr, colorVec) - x += 4 + when allowSimd: + when defined(amd64): + let colorVec = mm_set1_epi32(cast[int32](rgbx)) + for _ in 0 ..< image.width div 4: + mm_storeu_si128(image.data[image.dataIndex(x, y)].addr, colorVec) + x += 4 + elif defined(arm64): + let colorVec = vmovq_n_u32(cast[uint32](rgbx)) + for _ in 0 ..< image.width div 4: + vst1q_u32(image.data[image.dataIndex(x, y)].addr, colorVec) + x += 4 for x in x ..< image.width: image.unsafe[x, y] = rgbx @@ -227,7 +237,6 @@ proc fillGradientAngular(image: Image, paint: Paint) = proc fillGradient*(image: Image, paint: Paint) {.raises: [PixieError].} = ## Fills with the Paint gradient. - case paint.kind: of LinearGradientPaint: image.fillGradientLinear(paint) diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index e738e54..bf7908c 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1410,13 +1410,21 @@ proc computeCoverage( let fillLen = at.integer - fillStart if fillLen > 0: var i = fillStart - when defined(amd64) and allowSimd: - let sampleCoverageVec = mm_set1_epi8(sampleCoverage) - for _ in 0 ..< fillLen div 16: - var coverageVec = mm_loadu_si128(coverages[i - startX].addr) - coverageVec = mm_add_epi8(coverageVec, sampleCoverageVec) - mm_storeu_si128(coverages[i - startX].addr, coverageVec) - i += 16 + when allowSimd: + when defined(amd64): + let sampleCoverageVec = mm_set1_epi8(sampleCoverage) + for _ in 0 ..< fillLen div 16: + var coverageVec = mm_loadu_si128(coverages[i - startX].addr) + coverageVec = mm_add_epi8(coverageVec, sampleCoverageVec) + mm_storeu_si128(coverages[i - startX].addr, coverageVec) + i += 16 + elif defined(arm64): + let sampleCoverageVec = vmovq_n_u8(sampleCoverage) + for _ in 0 ..< fillLen div 16: + var coverageVec = vld1q_u8(coverages[i - startX].addr) + coverageVec = vaddq_u8(coverageVec, sampleCoverageVec) + vst1q_u8(coverages[i - startX].addr, coverageVec) + i += 16 for j in i ..< fillStart + fillLen: coverages[j - startX] += sampleCoverage diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index 97807c3..0d0c325 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -415,6 +415,76 @@ proc minifyBy2Avx2*(image: Image, power = 1): Image {.simd.} = # Set src as this result for if we do another power src = result +template applyCoverage*(rgbxVec: M256i, coverage: M128i): M256i = + ## Unpack the first 8 coverage bytes. + let + unpacked0 = mm_shuffle_epi8(coverage, coverageShuffle) + unpacked1 = mm_shuffle_epi8(mm_srli_si128(coverage, 4), coverageShuffle) + unpacked = + mm256_insertf128_si256(mm256_castsi128_si256(unpacked0), unpacked1, 1) + + var + rgbxEven = mm256_slli_epi16(rgbxVec, 8) + rgbxOdd = mm256_and_si256(rgbxVec, oddMask) + rgbxEven = mm256_mulhi_epu16(rgbxEven, unpacked) + rgbxOdd = mm256_mulhi_epu16(rgbxOdd, unpacked) + rgbxEven = mm256_srli_epi16(mm256_mulhi_epu16(rgbxEven, div255), 7) + rgbxOdd = mm256_srli_epi16(mm256_mulhi_epu16(rgbxOdd, div255), 7) + + mm256_or_si256(rgbxEven, mm256_slli_epi16(rgbxOdd, 8)) + +proc blendLineCoverageOverwriteAvx2*( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int +) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 31) != 0: + let coverage = coverages[i] + if coverage != 0: + line[i] = rgbx * coverage + inc i + + let + rgbxVec = mm256_set1_epi32(cast[uint32](rgbx)) + vecZero = mm256_setzero_si256() + vec255 = mm256_set1_epi8(255) + oddMask = mm256_set1_epi16(0xff00) + div255 = mm256_set1_epi16(0x8081) + coverageShuffle = mm_set_epi8( + 3, -1, 3, -1, 2, -1, 2, -1, 1, -1, 1, -1, 0, -1, 0, -1 + ) + while i < len - 32: + let + coverage = mm256_loadu_si256(coverages[i].addr) + eqZero = mm256_cmpeq_epi8(coverage, vecZero) + eq255 = mm256_cmpeq_epi8(coverage, vec255) + if mm256_movemask_epi8(eqZero) == cast[int32](0xffffffff): + i += 32 + elif mm256_movemask_epi8(eq255) == cast[int32](0xffffffff): + for _ in 0 ..< 4: + mm256_store_si256(line[i].addr, rgbxVec) + i += 8 + else: + let + coverageLo = mm256_castsi256_si128(coverage) + coverageHi = mm256_extractf128_si256(coverage, 1) + coverages = [ + coverageLo, + mm_srli_si128(coverageLo, 8), + coverageHi, + mm_srli_si128(coverageHi, 8), + ] + for j in 0 ..< 4: + mm256_store_si256(line[i].addr, rgbxVec.applyCoverage(coverages[j])) + i += 8 + + for i in i ..< len: + let coverage = coverages[i] + if coverage != 0: + line[i] = rgbx * coverage + proc blendLineNormalAvx2*( line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int ) {.simd.} = @@ -473,6 +543,71 @@ proc blendLineNormalAvx2*( for i in i ..< len: a[i] = blendNormal(a[i], b[i]) +proc blendLineCoverageNormalAvx2*( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int +) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 31) != 0: + let coverage = coverages[i] + if coverage == 0: + discard + else: + line[i] = blendNormal(line[i], rgbx * coverage) + inc i + + let + rgbxVec = mm256_set1_epi32(cast[uint32](rgbx)) + vecZero = mm256_setzero_si256() + vec255 = mm256_set1_epi8(255) + alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) + oddMask = mm256_set1_epi16(cast[int16](0xff00)) + div255 = mm256_set1_epi16(cast[int16](0x8081)) + vecAlpha255 = mm256_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])) + coverageShuffle = mm_set_epi8( + 3, -1, 3, -1, 2, -1, 2, -1, 1, -1, 1, -1, 0, -1, 0, -1 + ) + shuffleControl = mm256_set_epi8( + 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1, + 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1 + ) + while i < len - 32: + let + coverage = mm256_loadu_si256(coverages[i].addr) + eqZero = mm256_cmpeq_epi8(coverage, vecZero) + eq255 = mm256_cmpeq_epi8(coverage, vec255) + if mm256_movemask_epi8(eqZero) == cast[int32](0xffffffff): + i += 32 + elif mm256_movemask_epi8(eq255) == cast[int32](0xffffffff) and rgbx.a == 255: + for _ in 0 ..< 4: + mm256_store_si256(line[i].addr, rgbxVec) + i += 8 + else: + let + coverageLo = mm256_castsi256_si128(coverage) + coverageHi = mm256_extractf128_si256(coverage, 1) + coverages = [ + coverageLo, + mm_srli_si128(coverageLo, 8), + coverageHi, + mm_srli_si128(coverageHi, 8), + ] + for j in 0 ..< 4: + let + backdrop = mm256_loadu_si256(line[i].addr) + source = rgbxVec.applyCoverage(coverages[j]) + mm256_store_si256(line[i].addr, blendNormalSimd(backdrop, source)) + i += 8 + + for i in i ..< len: + let coverage = coverages[i] + if coverage == 0: + discard + else: + line[i] = blendNormal(line[i], rgbx * coverage) + proc blendLineMaskAvx2*( line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int ) {.simd.} = @@ -529,5 +664,73 @@ proc blendLineMaskAvx2*( for i in i ..< len: a[i] = blendMask(a[i], b[i]) +proc blendLineCoverageMaskAvx2*( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int +) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 31) != 0: + let coverage = coverages[i] + if coverage == 0: + line[i] = rgbx(0, 0, 0, 0) + elif coverage == 255: + discard + else: + line[i] = blendMask(line[i], rgbx * coverage) + inc i + + let + rgbxVec = mm256_set1_epi32(cast[uint32](rgbx)) + vecZero = mm256_setzero_si256() + vec255 = mm256_set1_epi8(255) + alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) + oddMask = mm256_set1_epi16(cast[int16](0xff00)) + div255 = mm256_set1_epi16(cast[int16](0x8081)) + coverageShuffle = mm_set_epi8( + 3, -1, 3, -1, 2, -1, 2, -1, 1, -1, 1, -1, 0, -1, 0, -1 + ) + shuffleControl = mm256_set_epi8( + 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1, + 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1 + ) + while i < len - 16: + let + coverage = mm256_loadu_si256(coverages[i].addr) + eqZero = mm256_cmpeq_epi8(coverage, vecZero) + eq255 = mm256_cmpeq_epi8(coverage, vec255) + if mm256_movemask_epi8(eqZero) == cast[int32](0xffffffff): + for _ in 0 ..< 4: + mm256_store_si256(line[i].addr, vecZero) + i += 8 + elif mm256_movemask_epi8(eq255) == cast[int32](0xffffffff) and rgbx.a == 255: + i += 32 + else: + let + coverageLo = mm256_castsi256_si128(coverage) + coverageHi = mm256_extractf128_si256(coverage, 1) + coverages = [ + coverageLo, + mm_srli_si128(coverageLo, 8), + coverageHi, + mm_srli_si128(coverageHi, 8), + ] + for j in 0 ..< 4: + let + backdrop = mm256_loadu_si256(line[i].addr) + source = rgbxVec.applyCoverage(coverages[j]) + mm256_store_si256(line[i].addr, blendMaskSimd(backdrop, source)) + i += 8 + + for i in i ..< len: + let coverage = coverages[i] + if coverage == 0: + line[i] = rgbx(0, 0, 0, 0) + elif coverage == 255: + discard + else: + line[i] = blendMask(line[i], rgbx * coverage) + when defined(release): {.pop.} diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim index 8beca4f..de63f63 100644 --- a/src/pixie/simd/neon.nim +++ b/src/pixie/simd/neon.nim @@ -3,6 +3,30 @@ import chroma, internal, nimsimd/neon, pixie/blends, pixie/common, vmath when defined(release): {.push checks: off.} +template multiplyDiv255*(c, a: uint8x8): uint8x8 = + let ca = vmull_u8(c, a) + vraddhn_u16(ca, vrshrq_n_u16(ca, 8)) + +template multiplyDiv255*(c, a: uint8x16): uint8x16 = + vcombine_u8( + multiplyDiv255(vget_low_u8(c), vget_low_u8(a)), + multiplyDiv255(vget_high_u8(c), vget_high_u8(a)) + ) + +template blendNormalSimd*(backdrop, source: uint8x16x4): uint8x16x4 = + let multiplier = vsubq_u8(vec255, source.val[3]) + + var blended: uint8x16x4 + blended.val[0] = multiplyDiv255(backdrop.val[0], multiplier) + blended.val[1] = multiplyDiv255(backdrop.val[1], multiplier) + blended.val[2] = multiplyDiv255(backdrop.val[2], multiplier) + blended.val[3] = multiplyDiv255(backdrop.val[3], multiplier) + blended.val[0] = vaddq_u8(blended.val[0], source.val[0]) + blended.val[1] = vaddq_u8(blended.val[1], source.val[1]) + blended.val[2] = vaddq_u8(blended.val[2], source.val[2]) + blended.val[3] = vaddq_u8(blended.val[3], source.val[3]) + blended + proc fillUnsafeNeon*( data: var seq[ColorRGBX], color: SomeColor, @@ -146,22 +170,12 @@ proc toPremultipliedAlphaNeon*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = inc i p += 4 - template multiply(c, a: uint8x8): uint8x8 = - let ca = vmull_u8(c, a) - vraddhn_u16(ca, vrshrq_n_u16(ca, 8)) - - template multiply(c, a: uint8x16): uint8x16 = - vcombine_u8( - multiply(vget_low_u8(c), vget_low_u8(a)), - multiply(vget_high_u8(c), vget_high_u8(a)) - ) - let iterations = (data.len - i) div 16 for _ in 0 ..< iterations: var channels = vld4q_u8(cast[pointer](p)) - channels.val[0] = multiply(channels.val[0], channels.val[3]) - channels.val[1] = multiply(channels.val[1], channels.val[3]) - channels.val[2] = multiply(channels.val[2], channels.val[3]) + channels.val[0] = multiplyDiv255(channels.val[0], channels.val[3]) + channels.val[1] = multiplyDiv255(channels.val[1], channels.val[3]) + channels.val[2] = multiplyDiv255(channels.val[2], channels.val[3]) vst4q_u8(cast[pointer](p), channels) p += 64 i += 16 * iterations @@ -225,19 +239,15 @@ proc applyOpacityNeon*(image: Image, opacity: float32) {.simd.} = i: int p = cast[uint](image.data[0].addr) - template multiply(c, a: uint8x8): uint8x8 = - let ca = vmull_u8(c, a) - vraddhn_u16(ca, vrshrq_n_u16(ca, 8)) - let opacityVec = vmov_n_u8(opacity) iterations = image.data.len div 8 for _ in 0 ..< iterations: var channels = vld4_u8(cast[pointer](p)) - channels.val[0] = multiply(channels.val[0], opacityVec) - channels.val[1] = multiply(channels.val[1], opacityVec) - channels.val[2] = multiply(channels.val[2], opacityVec) - channels.val[3] = multiply(channels.val[3], opacityVec) + channels.val[0] = multiplyDiv255(channels.val[0], opacityVec) + channels.val[1] = multiplyDiv255(channels.val[1], opacityVec) + channels.val[2] = multiplyDiv255(channels.val[2], opacityVec) + channels.val[3] = multiplyDiv255(channels.val[3], opacityVec) vst4_u8(cast[pointer](p), channels) p += 32 i += 8 * iterations @@ -414,11 +424,86 @@ proc magnifyBy2Neon*(image: Image, power = 1): Image {.simd.} = result.width * 4 ) +proc blendLineCoverageOverwriteNeon*( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int +) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 15) != 0: + let coverage = coverages[i] + if coverage != 0: + line[i] = rgbx * coverage + inc i + + var vecRgbx: uint8x16x4 + vecRgbx.val[0] = vmovq_n_u8(rgbx.r) + vecRgbx.val[1] = vmovq_n_u8(rgbx.g) + vecRgbx.val[2] = vmovq_n_u8(rgbx.b) + vecRgbx.val[3] = vmovq_n_u8(rgbx.a) + + let + vecZero = vmovq_n_u8(0) + vec255 = vmovq_n_u8(255) + while i < len - 16: + let + coverage = vld1q_u8(coverages[i].addr) + eqZero = vceqq_u8(coverage, vecZero) + eq255 = vceqq_u8(coverage, vec255) + maskZero = vget_lane_u64(cast[uint64x1]( + vand_u8(vget_low_u8(eqZero), vget_high_u8(eqZero) + )), 0) + mask255 = vget_lane_u64(cast[uint64x1]( + vand_u8(vget_low_u8(eq255), vget_high_u8(eq255) + )), 0) + if maskZero == uint64.high: + discard + elif mask255 == uint64.high: + vst4q_u8(line[i].addr, vecRgbx) + else: + var source: uint8x16x4 + source.val[0] = multiplyDiv255(vecRgbx.val[0], coverage) + source.val[1] = multiplyDiv255(vecRgbx.val[1], coverage) + source.val[2] = multiplyDiv255(vecRgbx.val[2], coverage) + source.val[3] = multiplyDiv255(vecRgbx.val[3], coverage) + vst4q_u8(line[i].addr, source) + + i += 16 + + for i in i ..< len: + let coverage = coverages[i] + if coverage != 0: + line[i] = rgbx * coverage + +proc blendLineNormalNeon*( + line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int +) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 15) != 0: + line[i] = blendNormal(line[i], rgbx) + inc i + + var vecRgbx: uint8x16x4 + vecRgbx.val[0] = vmovq_n_u8(rgbx.r) + vecRgbx.val[1] = vmovq_n_u8(rgbx.g) + vecRgbx.val[2] = vmovq_n_u8(rgbx.b) + vecRgbx.val[3] = vmovq_n_u8(rgbx.a) + + let vec255 = vmovq_n_u8(255) + while i < len - 16: + let backdrop = vld4q_u8(line[i].addr) + vst4q_u8(line[i].addr, blendNormalSimd(backdrop, vecRgbx)) + i += 16 + + for i in i ..< len: + line[i] = blendNormal(line[i], rgbx) + proc blendLineNormalNeon*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int - while (cast[uint](a[i].addr) and 15) != 0: + while i < len and (cast[uint](a[i].addr) and 15) != 0: a[i] = blendNormal(a[i], b[i]) inc i @@ -433,41 +518,99 @@ proc blendLineNormalNeon*( if mask == uint64.high: vst4q_u8(a[i].addr, source) else: - template multiply(c, a: uint8x8): uint8x8 = - let ca = vmull_u8(c, a) - vraddhn_u16(ca, vrshrq_n_u16(ca, 8)) - - template multiply(c, a: uint8x16): uint8x16 = - vcombine_u8( - multiply(vget_low_u8(c), vget_low_u8(a)), - multiply(vget_high_u8(c), vget_high_u8(a)) - ) - - let - backdrop = vld4q_u8(a[i].addr) - multiplier = vsubq_u8(vec255, source.val[3]) - - var blended: uint8x16x4 - blended.val[0] = multiply(backdrop.val[0], multiplier) - blended.val[1] = multiply(backdrop.val[1], multiplier) - blended.val[2] = multiply(backdrop.val[2], multiplier) - blended.val[3] = multiply(backdrop.val[3], multiplier) - blended.val[0] = vaddq_u8(blended.val[0], source.val[0]) - blended.val[1] = vaddq_u8(blended.val[1], source.val[1]) - blended.val[2] = vaddq_u8(blended.val[2], source.val[2]) - blended.val[3] = vaddq_u8(blended.val[3], source.val[3]) - vst4q_u8(a[i].addr, blended) + let backdrop = vld4q_u8(a[i].addr) + vst4q_u8(a[i].addr, blendNormalSimd(backdrop, source)) i += 16 for i in i ..< len: a[i] = blendNormal(a[i], b[i]) +proc blendLineCoverageNormalNeon*( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int +) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 15) != 0: + let coverage = coverages[i] + if coverage == 0: + discard + else: + line[i] = blendNormal(line[i], rgbx * coverage) + inc i + + var vecRgbx: uint8x16x4 + vecRgbx.val[0] = vmovq_n_u8(rgbx.r) + vecRgbx.val[1] = vmovq_n_u8(rgbx.g) + vecRgbx.val[2] = vmovq_n_u8(rgbx.b) + vecRgbx.val[3] = vmovq_n_u8(rgbx.a) + + let + vecZero = vmovq_n_u8(0) + vec255 = vmovq_n_u8(255) + while i < len - 16: + let + coverage = vld1q_u8(coverages[i].addr) + eqZero = vceqq_u8(coverage, vecZero) + eq255 = vceqq_u8(coverage, vec255) + maskZero = vget_lane_u64(cast[uint64x1]( + vand_u8(vget_low_u8(eqZero), vget_high_u8(eqZero) + )), 0) + mask255 = vget_lane_u64(cast[uint64x1]( + vand_u8(vget_low_u8(eq255), vget_high_u8(eq255) + )), 0) + if maskZero == uint64.high: + discard + elif mask255 == uint64.high and rgbx.a == 255: + vst4q_u8(line[i].addr, vecRgbx) + else: + var source: uint8x16x4 + source.val[0] = multiplyDiv255(vecRgbx.val[0], coverage) + source.val[1] = multiplyDiv255(vecRgbx.val[1], coverage) + source.val[2] = multiplyDiv255(vecRgbx.val[2], coverage) + source.val[3] = multiplyDiv255(vecRgbx.val[3], coverage) + + let backdrop = vld4q_u8(line[i].addr) + vst4q_u8(line[i].addr, blendNormalSimd(backdrop, source)) + + i += 16 + + for i in i ..< len: + let coverage = coverages[i] + if coverage == 0: + discard + else: + line[i] = blendNormal(line[i], rgbx * coverage) + +proc blendLineMaskNeon*( + line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int +) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 15) != 0: + line[i] = blendMask(line[i], rgbx) + inc i + + let alpha = vmovq_n_u8(rgbx.a) + while i < len - 16: + let backdrop = vld4q_u8(line[i].addr) + var blended: uint8x16x4 + blended.val[0] = multiplyDiv255(backdrop.val[0], alpha) + blended.val[1] = multiplyDiv255(backdrop.val[1], alpha) + blended.val[2] = multiplyDiv255(backdrop.val[2], alpha) + blended.val[3] = multiplyDiv255(backdrop.val[3], alpha) + vst4q_u8(line[i].addr, blended) + i += 16 + + for i in i ..< len: + line[i] = blendMask(line[i], rgbx) + proc blendLineMaskNeon*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int - while (cast[uint](a[i].addr) and 15) != 0: + while i < len and (cast[uint](a[i].addr) and 15) != 0: a[i] = blendMask(a[i], b[i]) inc i @@ -482,22 +625,12 @@ proc blendLineMaskNeon*( if mask == uint64.high: discard else: - template multiply(c, a: uint8x8): uint8x8 = - let ca = vmull_u8(c, a) - vraddhn_u16(ca, vrshrq_n_u16(ca, 8)) - - template multiply(c, a: uint8x16): uint8x16 = - vcombine_u8( - multiply(vget_low_u8(c), vget_low_u8(a)), - multiply(vget_high_u8(c), vget_high_u8(a)) - ) - let backdrop = vld4q_u8(a[i].addr) var blended: uint8x16x4 - blended.val[0] = multiply(backdrop.val[0], source.val[3]) - blended.val[1] = multiply(backdrop.val[1], source.val[3]) - blended.val[2] = multiply(backdrop.val[2], source.val[3]) - blended.val[3] = multiply(backdrop.val[3], source.val[3]) + blended.val[0] = multiplyDiv255(backdrop.val[0], source.val[3]) + blended.val[1] = multiplyDiv255(backdrop.val[1], source.val[3]) + blended.val[2] = multiplyDiv255(backdrop.val[2], source.val[3]) + blended.val[3] = multiplyDiv255(backdrop.val[3], source.val[3]) vst4q_u8(a[i].addr, blended) i += 16 @@ -505,5 +638,66 @@ proc blendLineMaskNeon*( for i in i ..< len: a[i] = blendMask(a[i], b[i]) +proc blendLineCoverageMaskNeon*( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int +) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 15) != 0: + let coverage = coverages[i] + if coverage == 0: + line[i] = rgbx(0, 0, 0, 0) + elif coverage == 255: + discard + else: + line[i] = blendMask(line[i], rgbx * coverage) + inc i + + let + alpha = vmovq_n_u8(rgbx.a) + vecZero = vmovq_n_u8(0) + vec255 = vmovq_n_u8(255) + while i < len - 16: + let + coverage = vld1q_u8(coverages[i].addr) + eqZero = vceqq_u8(coverage, vecZero) + eq255 = vceqq_u8(coverage, vec255) + maskZero = vget_lane_u64(cast[uint64x1]( + vand_u8(vget_low_u8(eqZero), vget_high_u8(eqZero) + )), 0) + mask255 = vget_lane_u64(cast[uint64x1]( + vand_u8(vget_low_u8(eq255), vget_high_u8(eq255) + )), 0) + if maskZero == uint64.high: + vst1q_u8(line[i].addr, vecZero) + vst1q_u8(line[i + 4].addr, vecZero) + vst1q_u8(line[i + 8].addr, vecZero) + vst1q_u8(line[i + 12].addr, vecZero) + elif mask255 == uint64.high and rgbx.a == 255: + discard + else: + let + backdrop = vld4q_u8(line[i].addr) + alpha = multiplyDiv255(alpha, coverage) + var blended: uint8x16x4 + blended.val[0] = multiplyDiv255(backdrop.val[0], alpha) + blended.val[1] = multiplyDiv255(backdrop.val[1], alpha) + blended.val[2] = multiplyDiv255(backdrop.val[2], alpha) + blended.val[3] = multiplyDiv255(backdrop.val[3], alpha) + vst4q_u8(line[i].addr, blended) + + i += 16 + + for i in i ..< len: + let coverage = coverages[i] + if coverage == 0: + line[i] = rgbx(0, 0, 0, 0) + elif coverage == 255: + discard + else: + line[i] = blendMask(line[i], rgbx * coverage) + when defined(release): {.pop.} diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index a5880ed..0fbd045 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -528,7 +528,7 @@ proc blendLineCoverageOverwriteSse2*( coverages: ptr UncheckedArray[uint8], rgbx: ColorRGBX, len: int - ) {.simd.} = +) {.simd.} = var i: int while i < len and (cast[uint](line[i].addr) and 15) != 0: let coverage = coverages[i] @@ -691,7 +691,7 @@ proc blendLineMaskSse2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int - while i < len and (cast[uint](a[i].addr) and 15) != 0: + while i < len and (cast[uint](a[i].addr) and 15) != 0: a[i] = blendMask(a[i], b[i]) inc i @@ -721,7 +721,7 @@ proc blendLineCoverageMaskSse2*( len: int ) {.simd.} = var i: int - while i < len and (cast[uint](line[i].addr) and 15) != 0: + while i < len and (cast[uint](line[i].addr) and 15) != 0: let coverage = coverages[i] if coverage == 0: line[i] = rgbx(0, 0, 0, 0)