diff --git a/pixie.nimble b/pixie.nimble index fa96bd3..f845152 100644 --- a/pixie.nimble +++ b/pixie.nimble @@ -1,4 +1,4 @@ -version = "3.0.2" +version = "3.0.3" author = "Andre von Houck and Ryan Oldenburg" description = "Full-featured 2d graphics library for Nim." license = "MIT" diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index fa53219..1932819 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -56,13 +56,11 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} oddMask = mm_set1_epi16(cast[int16](0xff00)) div255 = mm_set1_epi16(cast[int16](0x8081)) - for j in countup(i, data.len - 4, 4): + for _ in countup(i, data.len - 4, 4): var - color = mm_loadu_si128(data[j].addr) + color = mm_loadu_si128(data[i].addr) alpha = mm_and_si128(color, alphaMask) - - let eqOpaque = mm_cmpeq_epi16(alpha, alphaMask) - if mm_movemask_epi8(eqOpaque) != 0xffff: + if mm_movemask_epi8(mm_cmpeq_epi16(alpha, alphaMask)) != 0xffff: # If not all of the alpha values are 255, premultiply var colorEven = mm_slli_epi16(color, 8) @@ -81,8 +79,10 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} mm_and_si128(alpha, alphaMask), mm_and_si128(color, notAlphaMask) ) - mm_storeu_si128(data[j].addr, color) + mm_storeu_si128(data[i].addr, color) + i += 4 + # Convert whatever is left for j in i ..< data.len: var c = data[j] diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index 3d96213..ad34dfa 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1164,10 +1164,10 @@ iterator walk( windingRule: WindingRule, y: int, width: float32 -): (float32, float32, int32) = +): (float32, float32, int) = var prevAt: float32 - count: int32 + count: int for i in 0 ..< numHits: let (at, winding) = hits[i] if windingRule == wrNonZero and @@ -1257,10 +1257,10 @@ proc computeCoverages( if fillLen > 0: var i = fillStart when defined(amd64) and not defined(pixieNoSimd): - let vSampleCoverage = mm_set1_epi8(cast[int8](sampleCoverage)) + let sampleCoverageVec = mm_set1_epi8(cast[int8](sampleCoverage)) for j in countup(i, fillStart + fillLen - 16, 16): var coverage = mm_loadu_si128(coverages[j - startX].addr) - coverage = mm_add_epi8(coverage, vSampleCoverage) + coverage = mm_add_epi8(coverage, sampleCoverageVec) mm_storeu_si128(coverages[j - startX].addr, coverage) i += 16 for j in i ..< fillStart + fillLen: @@ -1291,56 +1291,64 @@ proc fillCoverage( # When supported, SIMD blend as much as possible let blenderSimd = blendMode.blenderSimd() - first32 = cast[M128i]([uint32.high, 0, 0, 0]) # First 32 bits oddMask = mm_set1_epi16(cast[int16](0xff00)) div255 = mm_set1_epi16(cast[int16](0x8081)) - vColor = mm_set1_epi32(cast[int32](rgbx)) - for _ in countup(x, startX + coverages.len - 16, 4): - var coverage = mm_loadu_si128(coverages[x - startX].unsafeAddr) - coverage = mm_and_si128(coverage, first32) - + vec255 = mm_set1_epi32(cast[int32](uint32.high)) + zeroVec = mm_setzero_si128() + colorVec = mm_set1_epi32(cast[int32](rgbx)) + for _ in countup(x, startX + coverages.len - 16, 16): let index = image.dataIndex(x, y) - eqZero = mm_cmpeq_epi16(coverage, mm_setzero_si128()) - if mm_movemask_epi8(eqZero) != 0xffff: # or blendMode == bmExcludeMask: + coverage = mm_loadu_si128(coverages[x - startX].unsafeAddr) + + if mm_movemask_epi8(mm_cmpeq_epi16(coverage, zeroVec)) != 0xffff: # If the coverages are not all zero - if mm_movemask_epi8(mm_cmpeq_epi32(coverage, first32)) == 0xffff: - # Coverages are all 255 + if mm_movemask_epi8(mm_cmpeq_epi32(coverage, vec255)) == 0xffff: + # If the coverages are all 255 if blendMode == bmNormal and rgbx.a == 255: - mm_storeu_si128(image.data[index].addr, vColor) + for i in 0 ..< 4: + mm_storeu_si128(image.data[index + i * 4].addr, colorVec) else: - let backdrop = mm_loadu_si128(image.data[index].addr) - mm_storeu_si128( - image.data[index].addr, - blenderSimd(backdrop, vColor) - ) + for i in 0 ..< 4: + let backdrop = mm_loadu_si128(image.data[index + i * 4].addr) + mm_storeu_si128( + image.data[index + i * 4].addr, + blenderSimd(backdrop, colorVec) + ) else: # Coverages are not all 255 - coverage = unpackAlphaValues(coverage) - # Shift the coverages from `a` to `g` and `a` for multiplying - coverage = mm_or_si128(coverage, mm_srli_epi32(coverage, 16)) + var coverage = coverage + for i in 0 ..< 4: + var unpacked = unpackAlphaValues(coverage) + # Shift the coverages from `a` to `g` and `a` for multiplying + unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16)) - var - source = vColor - sourceEven = mm_slli_epi16(source, 8) - sourceOdd = mm_and_si128(source, oddMask) + var + source = colorVec + sourceEven = mm_slli_epi16(source, 8) + sourceOdd = mm_and_si128(source, oddMask) - sourceEven = mm_mulhi_epu16(sourceEven, coverage) - sourceOdd = mm_mulhi_epu16(sourceOdd, coverage) + sourceEven = mm_mulhi_epu16(sourceEven, unpacked) + sourceOdd = mm_mulhi_epu16(sourceOdd, unpacked) - sourceEven = mm_srli_epi16(mm_mulhi_epu16(sourceEven, div255), 7) - sourceOdd = mm_srli_epi16(mm_mulhi_epu16(sourceOdd, div255), 7) + sourceEven = mm_srli_epi16(mm_mulhi_epu16(sourceEven, div255), 7) + sourceOdd = mm_srli_epi16(mm_mulhi_epu16(sourceOdd, div255), 7) - source = mm_or_si128(sourceEven, mm_slli_epi16(sourceOdd, 8)) + source = mm_or_si128(sourceEven, mm_slli_epi16(sourceOdd, 8)) + + let backdrop = mm_loadu_si128(image.data[index + i * 4].addr) + mm_storeu_si128( + image.data[index + i * 4].addr, + blenderSimd(backdrop, source) + ) + + coverage = mm_srli_si128(coverage, 4) - let backdrop = mm_loadu_si128(image.data[index].addr) - mm_storeu_si128( - image.data[index].addr, - blenderSimd(backdrop, source) - ) elif blendMode == bmMask: - mm_storeu_si128(image.data[index].addr, mm_setzero_si128()) - x += 4 + for i in 0 ..< 4: + mm_storeu_si128(image.data[index + i * 4].addr, zeroVec) + + x += 16 let blender = blendMode.blender() while x < startX + coverages.len: @@ -1375,13 +1383,14 @@ proc fillCoverage( var x = startX when defined(amd64) and not defined(pixieNoSimd): if blendMode.hasSimdMasker(): - let maskerSimd = blendMode.maskerSimd() + let + maskerSimd = blendMode.maskerSimd() + zeroVec = mm_setzero_si128() for _ in countup(x, startX + coverages.len - 16, 16): let index = mask.dataIndex(x, y) coverage = mm_loadu_si128(coverages[x - startX].unsafeAddr) - eqZero = mm_cmpeq_epi16(coverage, mm_setzero_si128()) - if mm_movemask_epi8(eqZero) != 0xffff: # or blendMode == bmExcludeMask: + if mm_movemask_epi8(mm_cmpeq_epi16(coverage, zeroVec)) != 0xffff: # If the coverages are not all zero let backdrop = mm_loadu_si128(mask.data[index].addr) mm_storeu_si128( @@ -1389,7 +1398,7 @@ proc fillCoverage( maskerSimd(backdrop, coverage) ) elif blendMode == bmMask: - mm_storeu_si128(mask.data[index].addr, mm_setzero_si128()) + mm_storeu_si128(mask.data[index].addr, zeroVec) x += 16 let masker = blendMode.masker() @@ -1438,16 +1447,16 @@ proc fillHits( # When supported, SIMD blend as much as possible let blenderSimd = blendMode.blenderSimd() - vColor = mm_set1_epi32(cast[int32](rgbx)) - for _ in countup(fillStart, fillLen - 16, 4): - let - index = image.dataIndex(x, y) - backdrop = mm_loadu_si128(image.data[index].addr) - mm_storeu_si128( - image.data[index].addr, - blenderSimd(backdrop, vColor) - ) - x += 4 + colorVec = mm_set1_epi32(cast[int32](rgbx)) + for _ in countup(fillStart, fillLen - 16, 16): + let index = image.dataIndex(x, y) + for i in 0 ..< 4: + let backdrop = mm_loadu_si128(image.data[index + i * 4].addr) + mm_storeu_si128( + image.data[index + i * 4].addr, + blenderSimd(backdrop, colorVec) + ) + x += 16 for x in x ..< fillStart + fillLen: let backdrop = image.getRgbaUnsafe(x, y) @@ -1487,12 +1496,12 @@ proc fillHits( if blendMode.hasSimdMasker(): let maskerSimd = blendMode.maskerSimd() - vValue = mm_set1_epi8(cast[int8](255)) + valueVec = mm_set1_epi8(cast[int8](255)) for _ in countup(fillStart, fillLen - 16, 16): let backdrop = mm_loadu_si128(mask.data[mask.dataIndex(x, y)].addr) mm_storeu_si128( mask.data[mask.dataIndex(x, y)].addr, - maskerSimd(backdrop, vValue) + maskerSimd(backdrop, valueVec) ) x += 16 diff --git a/tests/contexts/draw_image_rhino.png b/tests/contexts/draw_image_rhino.png index 2fcd692..2f6a9f4 100644 Binary files a/tests/contexts/draw_image_rhino.png and b/tests/contexts/draw_image_rhino.png differ diff --git a/tests/contexts/draw_image_rhino2.png b/tests/contexts/draw_image_rhino2.png index 2fcd692..2f6a9f4 100644 Binary files a/tests/contexts/draw_image_rhino2.png and b/tests/contexts/draw_image_rhino2.png differ