diff --git a/src/pixie/blends.nim b/src/pixie/blends.nim index 2324cdd..48543ca 100644 --- a/src/pixie/blends.nim +++ b/src/pixie/blends.nim @@ -160,7 +160,7 @@ proc SetSat(C: Color, s: float32): Color {.inline.} = if satC > 0: result = (C - min([C.r, C.g, C.b])) * s / satC -proc blendNormal(backdrop, source: ColorRGBA): ColorRGBA = +proc blendNormal*(backdrop, source: ColorRGBA): ColorRGBA = if backdrop.a == 0: return source if source.a == 255: @@ -516,6 +516,31 @@ when defined(amd64) and not defined(pixieNoSimd): result = mm_or_si128(mm_or_si128(result, i), mm_or_si128(j, k)) result = mm_and_si128(result, first32) + proc unpackAlphaValues*(v: M128i): M128i {.inline.} = + ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value) + let + first32 = cast[M128i]([uint32.high, 0, 0, 0]) # First 32 bits + alphaMask = mm_set1_epi32(cast[int32](0xff000000)) # Only `a` + + result = mm_shuffle_epi32(v, MM_SHUFFLE(0, 0, 0, 0)) + + var + i = mm_and_si128(result, first32) + j = mm_and_si128(result, mm_slli_si128(first32, 4)) + k = mm_and_si128(result, mm_slli_si128(first32, 8)) + l = mm_and_si128(result, mm_slli_si128(first32, 12)) + + # Shift the values to `a` + i = mm_slli_si128(i, 3) + j = mm_slli_si128(j, 2) + k = mm_slli_si128(k, 1) + # l = mm_slli_si128(l, 0) + + result = mm_and_si128( + mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l)), + alphaMask + ) + proc blendNormalSimd*(backdrop, source: M128i): M128i = let alphaMask = mm_set1_epi32(cast[int32](0xff000000)) @@ -615,9 +640,7 @@ when defined(amd64) and not defined(pixieNoSimd): blendedEven = mm_add_epi16(sourceEven, backdropEven) blendedOdd = mm_add_epi16(sourceOdd, backdropOdd) - blendedOdd = mm_slli_epi16(blendedOdd, 8) - - mm_or_si128(blendedEven, blendedOdd) + mm_or_si128(blendedEven, mm_slli_epi16(blendedOdd, 8)) proc maskMaskSimd*(backdrop, source: M128i): M128i = let @@ -638,9 +661,7 @@ when defined(amd64) and not defined(pixieNoSimd): backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) - backdropOdd = mm_slli_epi16(backdropOdd, 8) - - mm_or_si128(backdropEven, backdropOdd) + mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) proc maskerSimd*(blendMode: BlendMode): MaskerSimd = case blendMode: diff --git a/src/pixie/images.nim b/src/pixie/images.nim index 886bfcd..aa0d312 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -686,10 +686,7 @@ proc drawUber(a, b: Image | Mask, mat = mat3(), blendMode = bmNormal) = # Check we are not rotated before using SIMD blends when type(a) is Image: if blendMode.hasSimdBlender(): - let - blenderSimd = blendMode.blenderSimd() - first32 = cast[M128i]([uint32.high, 0, 0, 0]) # First 32 bits - alphaMask = mm_set1_epi32(cast[int32](0xff000000)) # Only `a` + let blenderSimd = blendMode.blenderSimd() for _ in countup(x, xMax - 4, 4): let srcPos = p + dx * x.float32 + dy * y.float32 @@ -701,24 +698,7 @@ proc drawUber(a, b: Image | Mask, mat = mat3(), blendMode = bmNormal) = else: # b is a Mask # Need to move 4 mask values into the alpha slots var source = mm_loadu_si128(b.data[b.dataIndex(sx, sy)].addr) - source = mm_slli_si128(source, 2) - source = mm_shuffle_epi32(source, MM_SHUFFLE(1, 1, 0, 0)) - - var - i = mm_and_si128(source, first32) - j = mm_and_si128(source, mm_slli_si128(first32, 4)) - k = mm_and_si128(source, mm_slli_si128(first32, 8)) - l = mm_and_si128(source, mm_slli_si128(first32, 12)) - - # Shift the values to `a` - i = mm_slli_si128(i, 1) - k = mm_slli_si128(k, 3) - l = mm_slli_si128(l, 2) - - source = mm_and_si128( - mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l)), - alphaMask - ) + source = unpackAlphaValues(source) mm_storeu_si128( a.data[a.dataIndex(x, y)].addr, diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index e8690e0..39d6056 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -966,13 +966,11 @@ proc fillShapes( # When supported, SIMD blend as much as possible let first32 = cast[M128i]([uint32.high, 0, 0, 0]) # First 32 bits - redMask = mm_set1_epi32(cast[int32](0x000000ff)) # Only `r` oddMask = mm_set1_epi16(cast[int16](0xff00)) div255 = mm_set1_epi16(cast[int16](0x8081)) - v255 = mm_set1_epi32(255) vColor = mm_set1_epi32(cast[int32](color)) - for _ in countup(x, coverages.len - 16, 16): + for _ in countup(x, image.width - 16, 4): var coverage = mm_loadu_si128(coverages[x].addr) coverage = mm_and_si128(coverage, first32) @@ -981,32 +979,11 @@ proc fillShapes( # If the coverages are not all zero var source = vColor - coverage = mm_slli_si128(coverage, 2) - coverage = mm_shuffle_epi32(coverage, MM_SHUFFLE(1, 1, 0, 0)) - - var - a = mm_and_si128(coverage, first32) - b = mm_and_si128(coverage, mm_slli_si128(first32, 4)) - c = mm_and_si128(coverage, mm_slli_si128(first32, 8)) - d = mm_and_si128(coverage, mm_slli_si128(first32, 12)) - - # Shift the coverages to `r` - a = mm_srli_si128(a, 2) - b = mm_srli_si128(b, 3) - d = mm_srli_si128(d, 1) - - coverage = mm_and_si128( - mm_or_si128(mm_or_si128(a, b), mm_or_si128(c, d)), - redMask - ) - - if mm_movemask_epi8(mm_cmpeq_epi32(coverage, v255)) != 0xffff: + if mm_movemask_epi8(mm_cmpeq_epi32(coverage, first32)) != 0xffff: # If the coverages are not all 255 - - # Shift the coverages from `r` to `g` and `a` for multiplying later - coverage = mm_or_si128( - mm_slli_epi32(coverage, 8), mm_slli_epi32(coverage, 24) - ) + coverage = unpackAlphaValues(coverage) + # Shift the coverages from `a` to `g` and `a` for multiplying + coverage = mm_or_si128(coverage, mm_srli_epi32(coverage, 16)) var colorEven = mm_slli_epi16(source, 8) @@ -1085,18 +1062,16 @@ proc fillShapes( when defined(amd64) and not defined(pixieNoSimd): # When supported, SIMD blend as much as possible for _ in countup(x, coverages.len - 16, 16): - var coverage = mm_loadu_si128(coverages[x].addr) - - let eqZero = mm_cmpeq_epi16(coverage, mm_setzero_si128()) + let + coverage = mm_loadu_si128(coverages[x].addr) + eqZero = mm_cmpeq_epi16(coverage, mm_setzero_si128()) if mm_movemask_epi8(eqZero) != 0xffff: # If the coverages are not all zero let backdrop = mm_loadu_si128(mask.data[mask.dataIndex(x, y)].addr) - mm_storeu_si128( mask.data[mask.dataIndex(x, y)].addr, maskNormalSimd(backdrop, coverage) ) - x += 16 while x < mask.width: