diff --git a/src/pixie/common.nim b/src/pixie/common.nim index 902d55f..3e4bc40 100644 --- a/src/pixie/common.nim +++ b/src/pixie/common.nim @@ -76,6 +76,19 @@ proc `*`*(color: ColorRGBX, opacity: float32): ColorRGBX {.raises: [].} = a = ((color.a * x + 127) div 255).uint8 rgbx(r, g, b, a) +proc `*`*(rgbx: ColorRGBX, opacity: uint8): ColorRGBX {.inline.} = + if opacity == 0: + discard + elif opacity == 255: + result = rgbx + else: + result = rgbx( + ((rgbx.r.uint32 * opacity + 127) div 255).uint8, + ((rgbx.g.uint32 * opacity + 127) div 255).uint8, + ((rgbx.b.uint32 * opacity + 127) div 255).uint8, + ((rgbx.a.uint32 * opacity + 127) div 255).uint8 + ) + proc snapToPixels*(rect: Rect): Rect {.raises: [].} = let xMin = rect.x diff --git a/src/pixie/images.nim b/src/pixie/images.nim index ddc1153..f444328 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -1,6 +1,6 @@ import blends, bumpy, chroma, common, internal, simd, vmath -export Image, newImage, copy, dataIndex +export Image, copy, dataIndex, newImage const h = 0.5.float32 @@ -436,27 +436,26 @@ proc drawCorrect( blended = blender(backdrop, sample) a.unsafe[x, y] = blended -template getUncheckedArray( - image: Image, x, y: int -): ptr UncheckedArray[ColorRGBX] = - cast[ptr UncheckedArray[ColorRGBX]](image.data[image.dataIndex(x, y)].addr) - -proc blitLine(a, b: ptr UncheckedArray[ColorRGBX], len: int, blender: Blender) {.inline.} = +proc blendLine( + a, b: ptr UncheckedArray[ColorRGBX], len: int, blender: Blender +) {.inline.} = for i in 0 ..< len: a[i] = blender(a[i], b[i]) -proc blitLineOverwrite(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.inline.} = +proc blendLineOverwrite( + a, b: ptr UncheckedArray[ColorRGBX], len: int +) {.inline.} = copyMem(a[0].addr, b[0].addr, len * 4) -proc blitLineNormal(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} = +proc blendLineNormal(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} = for i in 0 ..< len: a[i] = blendNormal(a[i], b[i]) -proc blitLineMask(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} = +proc blendLineMask(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} = for i in 0 ..< len: a[i] = blendMask(a[i], b[i]) -proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) = +proc blendRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) = let px = pos.x.int py = pos.y.int @@ -475,14 +474,14 @@ proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) = case blendMode: of NormalBlend: for y in yStart ..< yEnd: - blitLineNormal( + blendLineNormal( a.getUncheckedArray(xStart + px, y + py), b.getUncheckedArray(xStart, y), xEnd - xStart ) of OverwriteBlend: for y in yStart ..< yEnd: - blitLineOverwrite( + blendLineOverwrite( a.getUncheckedArray(xStart + px, y + py), b.getUncheckedArray(xStart, y), xEnd - xStart @@ -494,7 +493,7 @@ proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) = for y in yStart ..< yEnd: if xStart + px > 0: zeroMem(a.data[a.dataIndex(0, y + py)].addr, (xStart + px) * 4) - blitLineMask( + blendLineMask( a.getUncheckedArray(xStart + px, y + py), b.getUncheckedArray(xStart, y), xEnd - xStart @@ -512,7 +511,7 @@ proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) = else: let blender = blendMode.blender() for y in yStart ..< yEnd: - blitLine( + blendLine( a.getUncheckedArray(xStart + px, y + py), b.getUncheckedArray(xStart, y), xEnd - xStart, @@ -560,7 +559,7 @@ proc draw*( if hasRotationOrScaling or smooth: a.drawCorrect(b, inverseTransform.inverse(), blendMode, false) else: - a.blitRect(b, ivec2(transform[2, 0].int32, transform[2, 1].int32), blendMode) + a.blendRect(b, ivec2(transform[2, 0].int32, transform[2, 1].int32), blendMode) proc drawTiled*( dst, src: Image, mat: Mat3, blendMode = NormalBlend diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index 0120333..a4e9938 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -47,6 +47,11 @@ proc intersectsInside*(a, b: Segment, at: var Vec2): bool {.inline.} = at = a.at + (t * s1) return true +template getUncheckedArray*( + image: Image, x, y: int +): ptr UncheckedArray[ColorRGBX] = + cast[ptr UncheckedArray[ColorRGBX]](image.data[image.dataIndex(x, y)].addr) + proc fillUnsafe*( data: var seq[ColorRGBX], color: SomeColor, start, len: int ) {.hasSimd, raises: [].} = diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index d296bbb..e738e54 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1429,6 +1429,43 @@ proc clearUnsafe(image: Image, startX, startY, toX, toY: int) = len = image.dataIndex(toX, toY) - start fillUnsafe(image.data, rgbx(0, 0, 0, 0), start, len) +proc blendLineCoverageOverwrite( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int + ) {.hasSimd.} = + for i in 0 ..< len: + let coverage = coverages[i] + if coverage != 0: + line[i] = rgbx * coverage + +proc blendLineCoverageNormal( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int +) {.hasSimd.} = + for i in 0 ..< len: + let coverage = coverages[i] + if coverage == 0: + discard + else: + line[i] = blendNormal(line[i], rgbx * coverage) + +proc blendLineCoverageMask( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int +) {.hasSimd.} = + for i in 0 ..< len: + let coverage = coverages[i] + if coverage == 255: + discard + else: + line[i] = blendMask(line[i], rgbx * coverage) + proc fillCoverage( image: Image, rgbx: ColorRGBX, @@ -1440,181 +1477,56 @@ proc fillCoverage( x = startX dataIndex = image.dataIndex(x, y) - when allowSimd: - when defined(amd64): - iterator simd( - coverages: seq[uint8], x: var int, startX: int - ): (M128i, bool, bool) = - for _ in 0 ..< coverages.len div 16: - let - coverageVec = mm_loadu_si128(coverages[x - startX].unsafeAddr) - eqZero = mm_cmpeq_epi8(coverageVec, mm_setzero_si128()) - eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(255)) - allZeroes = mm_movemask_epi8(eqZero) == 0xffff - all255 = mm_movemask_epi8(eq255) == 0xffff - yield (coverageVec, allZeroes, all255) - x += 16 - - proc source(colorVec, coverageVec: M128i): M128i {.inline.} = - let - oddMask = mm_set1_epi16(0xff00) - div255 = mm_set1_epi16(0x8081) - - var unpacked = unpackAlphaValues(coverageVec) - unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16)) - - var - sourceEven = mm_slli_epi16(colorVec, 8) - sourceOdd = mm_and_si128(colorVec, oddMask) - sourceEven = mm_mulhi_epu16(sourceEven, unpacked) - sourceOdd = mm_mulhi_epu16(sourceOdd, unpacked) - sourceEven = mm_srli_epi16(mm_mulhi_epu16(sourceEven, div255), 7) - sourceOdd = mm_srli_epi16(mm_mulhi_epu16(sourceOdd, div255), 7) - result = mm_or_si128(sourceEven, mm_slli_epi16(sourceOdd, 8)) - - let colorVec = mm_set1_epi32(cast[int32](rgbx)) - - proc source(rgbx: ColorRGBX, coverage: uint8): ColorRGBX {.inline.} = - if coverage == 0: - discard - elif coverage == 255: - result = rgbx - else: - result = rgbx( - ((rgbx.r.uint32 * coverage) div 255).uint8, - ((rgbx.g.uint32 * coverage) div 255).uint8, - ((rgbx.b.uint32 * coverage) div 255).uint8, - ((rgbx.a.uint32 * coverage) div 255).uint8 - ) - case blendMode: of OverwriteBlend: - when allowSimd: - when defined(amd64): - for (coverageVec, allZeroes, all255) in simd(coverages, x, startX): - if allZeroes: - dataIndex += 16 - else: - if all255: - for i in 0 ..< 4: - mm_storeu_si128(image.data[dataIndex].addr, colorVec) - dataIndex += 4 - else: - var coverageVec = coverageVec - for i in 0 ..< 4: - let source = source(colorVec, coverageVec) - mm_storeu_si128(image.data[dataIndex].addr, source) - coverageVec = mm_srli_si128(coverageVec, 4) - dataIndex += 4 - - for x in x ..< startX + coverages.len: - let coverage = coverages[x - startX] - if coverage != 0: - image.data[dataIndex] = source(rgbx, coverage) - inc dataIndex + blendLineCoverageOverwrite( + image.getUncheckedArray(startX, y), + cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr), + rgbx, + coverages.len + ) of NormalBlend: - when allowSimd: - when defined(amd64): - for (coverageVec, allZeroes, all255) in simd(coverages, x, startX): - if allZeroes: - dataIndex += 16 - else: - if all255 and rgbx.a == 255: - for i in 0 ..< 4: - mm_storeu_si128(image.data[dataIndex].addr, colorVec) - dataIndex += 4 - else: - var coverageVec = coverageVec - for i in 0 ..< 4: - let - backdrop = mm_loadu_si128(image.data[dataIndex].addr) - source = source(colorVec, coverageVec) - mm_storeu_si128( - image.data[dataIndex].addr, - blendNormalSimd(backdrop, source) - ) - coverageVec = mm_srli_si128(coverageVec, 4) - dataIndex += 4 - - for x in x ..< startX + coverages.len: - let coverage = coverages[x - startX] - if coverage == 255 and rgbx.a == 255: - image.data[dataIndex] = rgbx - elif coverage == 0: - discard - else: - let backdrop = image.data[dataIndex] - image.data[dataIndex] = blendNormal(backdrop, source(rgbx, coverage)) - inc dataIndex + blendLineCoverageNormal( + image.getUncheckedArray(startX, y), + cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr), + rgbx, + coverages.len + ) of MaskBlend: {.linearScanEnd.} - - when allowSimd: - when defined(amd64): - for (coverageVec, allZeroes, all255) in simd(coverages, x, startX): - if not allZeroes: - if all255: - dataIndex += 16 - else: - var coverageVec = coverageVec - for i in 0 ..< 4: - let - backdrop = mm_loadu_si128(image.data[dataIndex].addr) - source = source(colorVec, coverageVec) - mm_storeu_si128( - image.data[dataIndex].addr, - blendMaskSimd(backdrop, source) - ) - coverageVec = mm_srli_si128(coverageVec, 4) - dataIndex += 4 - else: - for i in 0 ..< 4: - mm_storeu_si128(image.data[dataIndex].addr, mm_setzero_si128()) - dataIndex += 4 - - for x in x ..< startX + coverages.len: - let coverage = coverages[x - startX] - if coverage == 0: - image.data[dataIndex] = rgbx(0, 0, 0, 0) - elif coverage == 255: - discard - else: - let backdrop = image.data[dataIndex] - image.data[dataIndex] = blendMask(backdrop, source(rgbx, coverage)) - inc dataIndex + blendLineCoverageMask( + image.getUncheckedArray(startX, y), + cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr), + rgbx, + coverages.len + ) image.clearUnsafe(0, y, startX, y) image.clearUnsafe(startX + coverages.len, y, image.width, y) - of SubtractMaskBlend: - for x in x ..< startX + coverages.len: - let coverage = coverages[x - startX] - if coverage == 255 and rgbx.a == 255: - image.data[dataIndex] = rgbx(0, 0, 0, 0) - elif coverage != 0: - let backdrop = image.data[dataIndex] - image.data[dataIndex] = blendSubtractMask(backdrop, source(rgbx, coverage)) - inc dataIndex - - of ExcludeMaskBlend: - for x in x ..< startX + coverages.len: - let - coverage = coverages[x - startX] - backdrop = image.data[dataIndex] - image.data[dataIndex] = blendExcludeMask(backdrop, source(rgbx, coverage)) - inc dataIndex - else: let blender = blendMode.blender() for x in x ..< startX + coverages.len: let coverage = coverages[x - startX] if coverage != 0: let backdrop = image.data[dataIndex] - image.data[dataIndex] = blender(backdrop, source(rgbx, coverage)) + image.data[dataIndex] = blender(backdrop, rgbx * coverage) inc dataIndex +proc blendLineNormal( + line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int +) {.hasSimd.} = + for i in 0 ..< len: + line[i] = blendNormal(line[i], rgbx) + +proc blendLineMask( + line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int +) {.hasSimd.} = + for i in 0 ..< len: + line[i] = blendMask(line[i], rgbx) + proc fillHits( image: Image, rgbx: ColorRGBX, @@ -1625,19 +1537,6 @@ proc fillHits( blendMode: BlendMode, maskClears = true ) = - template simdBlob(image: Image, x: var int, len: int, blendProc: untyped) = - when allowSimd: - when defined(amd64): - var p = cast[uint](image.data[image.dataIndex(x, y)].addr) - let - iterations = len div 4 - colorVec = mm_set1_epi32(cast[int32](rgbx)) - for _ in 0 ..< iterations: - let backdrop = mm_loadu_si128(cast[pointer](p)) - mm_storeu_si128(cast[pointer](p), blendProc(backdrop, colorVec)) - p += 16 - x += iterations * 4 - case blendMode: of OverwriteBlend: for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width): @@ -1648,17 +1547,10 @@ proc fillHits( if rgbx.a == 255: fillUnsafe(image.data, rgbx, image.dataIndex(start, y), len) else: - var x = start - simdBlob(image, x, len, blendNormalSimd) - var dataIndex = image.dataIndex(x, y) - for _ in x ..< start + len: - let backdrop = image.data[dataIndex] - image.data[dataIndex] = blendNormal(backdrop, rgbx) - inc dataIndex + blendLineNormal(image.getUncheckedArray(start, y), rgbx, len) of MaskBlend: {.linearScanEnd.} - var filledTo = startX for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width): if maskClears: # Clear any gap between this fill and the previous fill @@ -1672,37 +1564,13 @@ proc fillHits( ) block: # Handle this fill if rgbx.a != 255: - var x = start - simdBlob(image, x, len, blendMaskSimd) - var dataIndex = image.dataIndex(x, y) - for _ in x ..< start + len: - let backdrop = image.data[dataIndex] - image.data[dataIndex] = blendMask(backdrop, rgbx) + blendLineMask(image.getUncheckedArray(start, y), rgbx, len) filledTo = start + len if maskClears: image.clearUnsafe(0, y, startX, y) image.clearUnsafe(filledTo, y, image.width, y) - of SubtractMaskBlend: - for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width): - var dataIndex = image.dataIndex(start, y) - for _ in 0 ..< len: - if rgbx.a == 255: - image.data[dataIndex] = rgbx(0, 0, 0, 0) - else: - let backdrop = image.data[dataIndex] - image.data[dataIndex] = blendSubtractMask(backdrop, rgbx) - inc dataIndex - - of ExcludeMaskBlend: - for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width): - var dataIndex = image.dataIndex(start, y) - for _ in 0 ..< len: - let backdrop = image.data[dataIndex] - image.data[dataIndex] = blendExcludeMask(backdrop, rgbx) - inc dataIndex - else: let blender = blendMode.blender() for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width): diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index 9375075..97807c3 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -6,6 +6,41 @@ when defined(gcc) or defined(clang): when defined(release): {.push checks: off.} +template blendNormalSimd(backdrop, source: M256i): M256i = + var + sourceAlpha = mm256_and_si256(source, alphaMask) + backdropEven = mm256_slli_epi16(backdrop, 8) + backdropOdd = mm256_and_si256(backdrop, oddMask) + + sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl) + + let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha) + + backdropEven = mm256_mulhi_epu16(backdropEven, multiplier) + backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier) + backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7) + backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7) + + mm256_add_epi8( + source, + mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8)) + ) + +template blendMaskSimd(backdrop, source: M256i): M256i = + var + sourceAlpha = mm256_and_si256(source, alphaMask) + backdropEven = mm256_slli_epi16(backdrop, 8) + backdropOdd = mm256_and_si256(backdrop, oddMask) + + sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl) + + backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha) + backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha) + backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7) + backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7) + + mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8)) + proc isOneColorAvx2*(image: Image): bool {.simd.} = result = true @@ -380,11 +415,37 @@ proc minifyBy2Avx2*(image: Image, power = 1): Image {.simd.} = # Set src as this result for if we do another power src = result -proc blitLineNormalAvx2*( +proc blendLineNormalAvx2*( + line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int +) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 31) != 0: + line[i] = blendNormal(line[i], rgbx) + inc i + + let + source = mm256_set1_epi32(cast[uint32](rgbx)) + alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) + oddMask = mm256_set1_epi16(cast[int16](0xff00)) + div255 = mm256_set1_epi16(cast[int16](0x8081)) + vecAlpha255 = mm256_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])) + shuffleControl = mm256_set_epi8( + 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1, + 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1 + ) + while i < len - 8: + let backdrop = mm256_load_si256(line[i].addr) + mm256_store_si256(line[i].addr, blendNormalSimd(backdrop, source)) + i += 8 + + for i in i ..< len: + line[i] = blendNormal(line[i], rgbx) + +proc blendLineNormalAvx2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int - while (cast[uint](a[i].addr) and 31) != 0: + while i < len and (cast[uint](a[i].addr) and 31) != 0: a[i] = blendNormal(a[i], b[i]) inc i @@ -403,41 +464,45 @@ proc blitLineNormalAvx2*( source = mm256_loadu_si256(b[i].addr) eq255 = mm256_cmpeq_epi8(source, vec255) if (mm256_movemask_epi8(eq255) and 0x88888888) == 0x88888888: # Opaque source - mm256_storeu_si256(a[i].addr, source) + mm256_store_si256(a[i].addr, source) else: let backdrop = mm256_load_si256(a[i].addr) - - var - sourceAlpha = mm256_and_si256(source, alphaMask) - backdropEven = mm256_slli_epi16(backdrop, 8) - backdropOdd = mm256_and_si256(backdrop, oddMask) - - sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl) - - let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha) - - backdropEven = mm256_mulhi_epu16(backdropEven, multiplier) - backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier) - backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7) - backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7) - - let added = mm256_add_epi8( - source, - mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8)) - ) - - mm256_store_si256(a[i].addr, added) - + mm256_store_si256(a[i].addr, blendNormalSimd(backdrop, source)) i += 8 for i in i ..< len: a[i] = blendNormal(a[i], b[i]) -proc blitLineMaskAvx2*( +proc blendLineMaskAvx2*( + line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int +) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 31) != 0: + line[i] = blendMask(line[i], rgbx) + inc i + + let + source = mm256_set1_epi32(cast[uint32](rgbx)) + alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) + oddMask = mm256_set1_epi16(cast[int16](0xff00)) + div255 = mm256_set1_epi16(cast[int16](0x8081)) + shuffleControl = mm256_set_epi8( + 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1, + 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1 + ) + while i < len - 8: + let backdrop = mm256_load_si256(line[i].addr) + mm256_store_si256(line[i].addr, blendMaskSimd(backdrop, source)) + i += 8 + + for i in i ..< len: + line[i] = blendMask(line[i], rgbx) + +proc blendLineMaskAvx2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int - while (cast[uint](a[i].addr) and 31) != 0: + while i < len and (cast[uint](a[i].addr) and 31) != 0: a[i] = blendMask(a[i], b[i]) inc i @@ -458,24 +523,7 @@ proc blitLineMaskAvx2*( discard else: let backdrop = mm256_load_si256(a[i].addr) - - var - sourceAlpha = mm256_and_si256(source, alphaMask) - backdropEven = mm256_slli_epi16(backdrop, 8) - backdropOdd = mm256_and_si256(backdrop, oddMask) - - sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl) - - backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha) - backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha) - backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7) - backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7) - - mm256_store_si256( - a[i].addr, - mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8)) - ) - + mm256_store_si256(a[i].addr, blendMaskSimd(backdrop, source)) i += 8 for i in i ..< len: diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim index bb43213..8beca4f 100644 --- a/src/pixie/simd/neon.nim +++ b/src/pixie/simd/neon.nim @@ -414,7 +414,7 @@ proc magnifyBy2Neon*(image: Image, power = 1): Image {.simd.} = result.width * 4 ) -proc blitLineNormalNeon*( +proc blendLineNormalNeon*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int @@ -463,7 +463,7 @@ proc blitLineNormalNeon*( for i in i ..< len: a[i] = blendNormal(a[i], b[i]) -proc blitLineMaskNeon*( +proc blendLineMaskNeon*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index cc77910..a5880ed 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -10,17 +10,7 @@ proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} = finalColor = mm_packus_epi16(finalColor, mm_setzero_si128()) cast[ColorRGBX](mm_cvtsi128_si32(finalColor)) -proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} = - ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value). - result = mm_unpacklo_epi8(mm_setzero_si128(), v) - result = mm_unpacklo_epi8(mm_setzero_si128(), result) - -proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} = - let - alphaMask = mm_set1_epi32(cast[int32](0xff000000)) - oddMask = mm_set1_epi16(cast[int16](0xff00)) - div255 = mm_set1_epi16(cast[int16](0x8081)) - +template blendNormalSimd*(backdrop, source: M128i): M128i = var sourceAlpha = mm_and_si128(source, alphaMask) backdropEven = mm_slli_epi16(backdrop, 8) @@ -28,14 +18,10 @@ proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} = sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16)) - let k = mm_sub_epi32( - mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])), - sourceAlpha - ) - - backdropEven = mm_mulhi_epu16(backdropEven, k) - backdropOdd = mm_mulhi_epu16(backdropOdd, k) + let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha) + backdropEven = mm_mulhi_epu16(backdropEven, multiplier) + backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier) backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) @@ -44,12 +30,7 @@ proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} = mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) ) -proc blendMaskSimd*(backdrop, source: M128i): M128i {.inline.} = - let - alphaMask = mm_set1_epi32(cast[int32](0xff000000)) - oddMask = mm_set1_epi16(cast[int16](0xff00)) - div255 = mm_set1_epi16(cast[int16](0x8081)) - +template blendMaskSimd*(backdrop, source: M128i): M128i = var sourceAlpha = mm_and_si128(source, alphaMask) backdropEven = mm_slli_epi16(backdrop, 8) @@ -59,7 +40,6 @@ proc blendMaskSimd*(backdrop, source: M128i): M128i {.inline.} = backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha) backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha) - backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) @@ -325,7 +305,7 @@ proc applyOpacitySse2*(image: Image, opacity: float32) {.simd.} = valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec) valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7) valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7) - mm_storeu_si128( + mm_store_si128( cast[pointer](p), mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8)) ) @@ -367,8 +347,8 @@ proc ceilSse2*(image: Image) {.simd.} = values1 = mm_cmpeq_epi8(values1, vecZero) values0 = mm_andnot_si128(values0, vec255) values1 = mm_andnot_si128(values1, vec255) - mm_storeu_si128(cast[pointer](p), values0) - mm_storeu_si128(cast[pointer](p + 16), values1) + mm_store_si128(cast[pointer](p), values0) + mm_store_si128(cast[pointer](p + 16), values1) p += 32 i += 8 * iterations @@ -527,11 +507,91 @@ proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} = result.width * 4 ) -proc blitLineNormalSse2*( +template applyCoverage*(rgbxVec, coverage: M128i): M128i = + ## Unpack the first 4 coverage bytes. + var unpacked = mm_unpacklo_epi8(mm_setzero_si128(), coverage) + unpacked = mm_unpacklo_epi8(mm_setzero_si128(), unpacked) + unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16)) + + var + rgbxEven = mm_slli_epi16(rgbxVec, 8) + rgbxOdd = mm_and_si128(rgbxVec, oddMask) + rgbxEven = mm_mulhi_epu16(rgbxEven, unpacked) + rgbxOdd = mm_mulhi_epu16(rgbxOdd, unpacked) + rgbxEven = mm_srli_epi16(mm_mulhi_epu16(rgbxEven, div255), 7) + rgbxOdd = mm_srli_epi16(mm_mulhi_epu16(rgbxOdd, div255), 7) + + mm_or_si128(rgbxEven, mm_slli_epi16(rgbxOdd, 8)) + +proc blendLineCoverageOverwriteSse2*( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int + ) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 15) != 0: + let coverage = coverages[i] + if coverage != 0: + line[i] = rgbx * coverage + inc i + + let + rgbxVec = mm_set1_epi32(cast[uint32](rgbx)) + vecZero = mm_setzero_si128() + vec255 = mm_set1_epi8(255) + oddMask = mm_set1_epi16(0xff00) + div255 = mm_set1_epi16(0x8081) + while i < len - 16: + let + coverage = mm_loadu_si128(coverages[i].addr) + eqZero = mm_cmpeq_epi8(coverage, vecZero) + eq255 = mm_cmpeq_epi8(coverage, vec255) + if mm_movemask_epi8(eqZero) == 0xffff: + i += 16 + elif mm_movemask_epi8(eq255) == 0xffff: + for _ in 0 ..< 4: + mm_store_si128(line[i].addr, rgbxVec) + i += 4 + else: + var coverage = coverage + for _ in 0 ..< 4: + mm_store_si128(line[i].addr, rgbxVec.applyCoverage(coverage)) + coverage = mm_srli_si128(coverage, 4) + i += 4 + + for i in i ..< len: + let coverage = coverages[i] + if coverage != 0: + line[i] = rgbx * coverage + +proc blendLineNormalSse2*( + line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int +) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 15) != 0: + line[i] = blendNormal(line[i], rgbx) + inc i + + let + source = mm_set1_epi32(cast[uint32](rgbx)) + alphaMask = mm_set1_epi32(cast[int32](0xff000000)) + oddMask = mm_set1_epi16(cast[int16](0xff00)) + div255 = mm_set1_epi16(cast[int16](0x8081)) + vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])) + while i < len - 4: + let backdrop = mm_load_si128(line[i].addr) + mm_store_si128(line[i].addr, blendNormalSimd(backdrop, source)) + i += 4 + + for i in i ..< len: + line[i] = blendNormal(line[i], rgbx) + +proc blendLineNormalSse2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int - while (cast[uint](a[i].addr) and 15) != 0: + while i < len and (cast[uint](a[i].addr) and 15) != 0: a[i] = blendNormal(a[i], b[i]) inc i @@ -546,41 +606,92 @@ proc blitLineNormalSse2*( source = mm_loadu_si128(b[i].addr) eq255 = mm_cmpeq_epi8(source, vec255) if (mm_movemask_epi8(eq255) and 0x00008888) == 0x00008888: # Opaque source - mm_storeu_si128(a[i].addr, source) + mm_store_si128(a[i].addr, source) else: let backdrop = mm_load_si128(a[i].addr) - - var - sourceAlpha = mm_and_si128(source, alphaMask) - backdropEven = mm_slli_epi16(backdrop, 8) - backdropOdd = mm_and_si128(backdrop, oddMask) - - sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16)) - - let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha) - - backdropEven = mm_mulhi_epu16(backdropEven, multiplier) - backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier) - backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) - backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) - - let added = mm_add_epi8( - source, - mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) - ) - - mm_store_si128(a[i].addr, added) - + mm_store_si128(a[i].addr, blendNormalSimd(backdrop, source)) i += 4 for i in i ..< len: a[i] = blendNormal(a[i], b[i]) -proc blitLineMaskSse2*( +proc blendLineCoverageNormalSse2*( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int +) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 15) != 0: + let coverage = coverages[i] + if coverage == 0: + discard + else: + line[i] = blendNormal(line[i], rgbx * coverage) + inc i + + let + rgbxVec = mm_set1_epi32(cast[uint32](rgbx)) + vecZero = mm_setzero_si128() + vec255 = mm_set1_epi8(255) + alphaMask = mm_set1_epi32(cast[int32](0xff000000)) + oddMask = mm_set1_epi16(cast[int16](0xff00)) + div255 = mm_set1_epi16(cast[int16](0x8081)) + vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])) + while i < len - 16: + let + coverage = mm_loadu_si128(coverages[i].addr) + eqZero = mm_cmpeq_epi8(coverage, vecZero) + eq255 = mm_cmpeq_epi8(coverage, vec255) + if mm_movemask_epi8(eqZero) == 0xffff: + i += 16 + elif mm_movemask_epi8(eq255) == 0xffff and rgbx.a == 255: + for _ in 0 ..< 4: + mm_store_si128(line[i].addr, rgbxVec) + i += 4 + else: + var coverage = coverage + for _ in 0 ..< 4: + let + backdrop = mm_loadu_si128(line[i].addr) + source = rgbxVec.applyCoverage(coverage) + mm_store_si128(line[i].addr, blendNormalSimd(backdrop, source)) + coverage = mm_srli_si128(coverage, 4) + i += 4 + + for i in i ..< len: + let coverage = coverages[i] + if coverage == 0: + discard + else: + line[i] = blendNormal(line[i], rgbx * coverage) + +proc blendLineMaskSse2*( + line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int +) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 15) != 0: + line[i] = blendMask(line[i], rgbx) + inc i + + let + source = mm_set1_epi32(cast[uint32](rgbx)) + alphaMask = mm_set1_epi32(cast[int32](0xff000000)) + oddMask = mm_set1_epi16(cast[int16](0xff00)) + div255 = mm_set1_epi16(cast[int16](0x8081)) + while i < len - 4: + let backdrop = mm_load_si128(line[i].addr) + mm_store_si128(line[i].addr, blendMaskSimd(backdrop, source)) + i += 4 + + for i in i ..< len: + line[i] = blendMask(line[i], rgbx) + +proc blendLineMaskSse2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = var i: int - while (cast[uint](a[i].addr) and 15) != 0: + while i < len and (cast[uint](a[i].addr) and 15) != 0: a[i] = blendMask(a[i], b[i]) inc i @@ -597,28 +708,65 @@ proc blitLineMaskSse2*( discard else: let backdrop = mm_load_si128(a[i].addr) - - var - sourceAlpha = mm_and_si128(source, alphaMask) - backdropEven = mm_slli_epi16(backdrop, 8) - backdropOdd = mm_and_si128(backdrop, oddMask) - - sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16)) - - backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha) - backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha) - backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) - backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) - - mm_store_si128( - a[i].addr, - mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) - ) - + mm_store_si128(a[i].addr, blendMaskSimd(backdrop, source)) i += 4 for i in i ..< len: a[i] = blendMask(a[i], b[i]) +proc blendLineCoverageMaskSse2*( + line: ptr UncheckedArray[ColorRGBX], + coverages: ptr UncheckedArray[uint8], + rgbx: ColorRGBX, + len: int +) {.simd.} = + var i: int + while i < len and (cast[uint](line[i].addr) and 15) != 0: + let coverage = coverages[i] + if coverage == 0: + line[i] = rgbx(0, 0, 0, 0) + elif coverage == 255: + discard + else: + line[i] = blendMask(line[i], rgbx * coverage) + inc i + + let + rgbxVec = mm_set1_epi32(cast[uint32](rgbx)) + vecZero = mm_setzero_si128() + vec255 = mm_set1_epi8(255) + alphaMask = mm_set1_epi32(cast[int32](0xff000000)) + oddMask = mm_set1_epi16(cast[int16](0xff00)) + div255 = mm_set1_epi16(cast[int16](0x8081)) + while i < len - 16: + let + coverage = mm_loadu_si128(coverages[i].addr) + eqZero = mm_cmpeq_epi8(coverage, vecZero) + eq255 = mm_cmpeq_epi8(coverage, vec255) + if mm_movemask_epi8(eqZero) == 0xffff: + for _ in 0 ..< 4: + mm_store_si128(line[i].addr, vecZero) + i += 4 + elif mm_movemask_epi8(eq255) == 0xffff and rgbx.a == 255: + i += 16 + else: + var coverage = coverage + for _ in 0 ..< 4: + let + backdrop = mm_loadu_si128(line[i].addr) + source = rgbxVec.applyCoverage(coverage) + mm_store_si128(line[i].addr, blendMaskSimd(backdrop, source)) + coverage = mm_srli_si128(coverage, 4) + i += 4 + + for i in i ..< len: + let coverage = coverages[i] + if coverage == 0: + line[i] = rgbx(0, 0, 0, 0) + elif coverage == 255: + discard + else: + line[i] = blendMask(line[i], rgbx * coverage) + when defined(release): {.pop.} diff --git a/tests/bench_fonts.nim b/tests/bench_fonts.nim index 32bbc01..544ca12 100644 --- a/tests/bench_fonts.nim +++ b/tests/bench_fonts.nim @@ -5,8 +5,7 @@ const text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis in q var font = readFont("tests/fonts/Roboto-Regular_1.ttf") font.size = 16 -let - image = newImage(500, 300) +let image = newImage(500, 300) timeIt "typeset": discard font.typeset(text, bounds = vec2(image.width.float32, 0)) diff --git a/tests/fuzz_image_draw.nim b/tests/fuzz_image_draw.nim index d8d43b8..1271156 100644 --- a/tests/fuzz_image_draw.nim +++ b/tests/fuzz_image_draw.nim @@ -25,3 +25,31 @@ for i in 0 ..< 250: a.draw(b, translate(vec2(translation.x.trunc, translation.y.trunc))) a.draw(b, translate(translation)) + +for i in 0 ..< 25: + let a = newImage(rand(1 .. 20), rand(1 .. 20)) + for j in 0 ..< 25: + let b = newImage(rand(1 .. 20), rand(1 .. 20)) + + let + translation = vec2(rand(25.0), rand(25.0)) - vec2(5, 5) + rotation = rand(2 * PI).float32 + + echo a, " ", b, " ", translation, " ", rotation + + a.draw(b, translate(vec2(translation.x, translation.y))) + a.draw(b, translate(translation) * rotate(rotation)) + +for i in 0 ..< 25: + let a = newImage(rand(1 .. 2000), rand(1 .. 2000)) + for j in 0 ..< 25: + let b = newImage(rand(1 .. 1000), rand(1 .. 1000)) + + let + translation = vec2(rand(2500.0), rand(2500.0)) - vec2(500, 500) + rotation = rand(2 * PI).float32 + + echo a, " ", b, " ", translation, " ", rotation + + a.draw(b, translate(vec2(translation.x, translation.y))) + a.draw(b, translate(translation) * rotate(rotation)) diff --git a/tests/fuzz_image_draw_smooth.nim b/tests/fuzz_image_draw_smooth.nim deleted file mode 100644 index 0a80cd8..0000000 --- a/tests/fuzz_image_draw_smooth.nim +++ /dev/null @@ -1,31 +0,0 @@ -import pixie, random - -randomize() - -for i in 0 ..< 25: - let a = newImage(rand(1 .. 20), rand(1 .. 20)) - for j in 0 ..< 25: - let b = newImage(rand(1 .. 20), rand(1 .. 20)) - - let - translation = vec2(rand(25.0), rand(25.0)) - vec2(5, 5) - rotation = rand(2 * PI).float32 - - echo a, " ", b, " ", translation, " ", rotation - - a.draw(b, translate(vec2(translation.x.trunc, translation.y.trunc))) - a.draw(b, translate(translation) * rotate(rotation)) - -for i in 0 ..< 25: - let a = newImage(rand(1 .. 2000), rand(1 .. 2000)) - for j in 0 ..< 25: - let b = newImage(rand(1 .. 1000), rand(1 .. 1000)) - - let - translation = vec2(rand(2500.0), rand(2500.0)) - vec2(500, 500) - rotation = rand(2 * PI).float32 - - echo a, " ", b, " ", translation, " ", rotation - - a.draw(b, translate(vec2(translation.x.trunc, translation.y.trunc))) - a.draw(b, translate(translation) * rotate(rotation))