diff --git a/src/pixie/images.nim b/src/pixie/images.nim index ebc2c2d..cd838b8 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -1,7 +1,10 @@ -import blends, bumpy, chroma, common, masks, pixie/internal, vmath +import blends, bumpy, chroma, common, internal, masks, vmath -when defined(amd64) and allowSimd: - import nimsimd/sse2, runtimechecked/avx2 +when allowSimd: + import simd + + when defined(amd64): + import nimsimd/sse2 const h = 0.5.float32 @@ -28,21 +31,18 @@ proc newImage*(width, height: int): Image {.raises: [PixieError].} = proc newImage*(mask: Mask): Image {.raises: [PixieError].} = result = newImage(mask.width, mask.height) - var i: int - when defined(amd64) and allowSimd: - for _ in 0 ..< mask.data.len div 16: - var alphas = mm_loadu_si128(mask.data[i].addr) - for j in 0 ..< 4: - var unpacked = unpackAlphaValues(alphas) - unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 8)) - unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16)) - mm_storeu_si128(result.data[i + j * 4].addr, unpacked) - alphas = mm_srli_si128(alphas, 4) - i += 16 - for j in i ..< mask.data.len: - let v = mask.data[j] - result.data[j] = rgbx(v, v, v, v) + when allowSimd and compiles(newImageFromMaskSimd): + newImageFromMaskSimd( + cast[ptr UncheckedArray[ColorRGBX]](result.data[0].addr), + cast[ptr UncheckedArray[uint8]](mask.data[0].addr), + mask.data.len + ) + return + + for i in 0 ..< mask.data.len: + let v = mask.data[i] + result.data[i] = rgbx(v, v, v, v) proc copy*(image: Image): Image {.raises: [PixieError].} = ## Copies the image data into a new image. @@ -101,83 +101,30 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} = proc isOneColor*(image: Image): bool {.raises: [].} = ## Checks if the entire image is the same color. 
- when defined(amd64) and allowSimd: - if cpuHasAvx2: - return isOneColorAvx2(image.data, 0, image.data.len) + when allowSimd and compiles(isOneColorSimd): + return isOneColorSimd( + cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr), + image.data.len + ) result = true - let color = image.data[0] - - var i: int - when defined(amd64) and allowSimd: - # Align to 16 bytes - var p = cast[uint](image.data[i].addr) - while i < image.data.len and (p and 15) != 0: - if image.data[i] != color: - return false - inc i - p += 4 - - let - colorVec = mm_set1_epi32(cast[int32](color)) - iterations = (image.data.len - i) div 16 - for _ in 0 ..< iterations: - let - values0 = mm_load_si128(cast[pointer](p)) - values1 = mm_load_si128(cast[pointer](p + 16)) - values2 = mm_load_si128(cast[pointer](p + 32)) - values3 = mm_load_si128(cast[pointer](p + 48)) - eq0 = mm_cmpeq_epi8(values0, colorVec) - eq1 = mm_cmpeq_epi8(values1, colorVec) - eq2 = mm_cmpeq_epi8(values2, colorVec) - eq3 = mm_cmpeq_epi8(values3, colorVec) - eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3)) - if mm_movemask_epi8(eq0123) != 0xffff: - return false - p += 64 - i += 16 * iterations - - for i in i ..< image.data.len: - if image.data[i] != color: + let color = cast[uint32](image.data[0]) + for i in 0 ..< image.data.len: + if cast[uint32](image.data[i]) != color: return false proc isTransparent*(image: Image): bool {.raises: [].} = ## Checks if this image is fully transparent or not. 
- when defined(amd64) and allowSimd: - if cpuHasAvx2: - return isTransparentAvx2(image.data, 0, image.data.len) + when allowSimd and compiles(isTransparentSimd): + return isTransparentSimd( + cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr), + image.data.len + ) result = true - var i: int - when defined(amd64) and allowSimd: - # Align to 16 bytes - var p = cast[uint](image.data[i].addr) - while i < image.data.len and (p and 15) != 0: - if image.data[i].a != 0: - return false - inc i - p += 4 - - let - vecZero = mm_setzero_si128() - iterations = (image.data.len - i) div 16 - for _ in 0 ..< iterations: - let - values0 = mm_load_si128(cast[pointer](p)) - values1 = mm_load_si128(cast[pointer](p + 16)) - values2 = mm_load_si128(cast[pointer](p + 32)) - values3 = mm_load_si128(cast[pointer](p + 48)) - values01 = mm_or_si128(values0, values1) - values23 = mm_or_si128(values2, values3) - values0123 = mm_or_si128(values01, values23) - if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff: - return false - p += 64 - i += 16 * iterations - - for i in i ..< image.data.len: + for i in 0 ..< image.data.len: if image.data[i].a != 0: return false @@ -410,89 +357,48 @@ proc magnifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} = result.width * 4 ) -proc applyOpacity*(target: Image | Mask, opacity: float32) {.raises: [].} = +proc applyOpacity*(image: Image, opacity: float32) {.raises: [].} = ## Multiplies alpha of the image by opacity. 
let opacity = round(255 * opacity).uint16 if opacity == 255: return if opacity == 0: - when type(target) is Image: - target.fill(rgbx(0, 0, 0, 0)) - else: - target.fill(0) + image.fill(rgbx(0, 0, 0, 0)) return - var i: int - when defined(amd64) and allowSimd: - when type(target) is Image: - let byteLen = target.data.len * 4 - else: - let byteLen = target.data.len + when allowSimd and compiles(applyOpacitySimd): + applyOpacitySimd( + cast[ptr UncheckedArray[uint8]](image.data[0].addr), + image.data.len * 4, + opacity + ) + return - let - oddMask = mm_set1_epi16(cast[int16](0xff00)) - div255 = mm_set1_epi16(cast[int16](0x8081)) - zeroVec = mm_setzero_si128() - opacityVec = mm_slli_epi16(mm_set1_epi16(cast[int16](opacity)), 8) - for _ in 0 ..< byteLen div 16: - when type(target) is Image: - let index = i div 4 - else: - let index = i - - let values = mm_loadu_si128(target.data[index].addr) - if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff: - var - valuesEven = mm_slli_epi16(values, 8) - valuesOdd = mm_and_si128(values, oddMask) - valuesEven = mm_mulhi_epu16(valuesEven, opacityVec) - valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec) - valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7) - valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7) - mm_storeu_si128( - target.data[index].addr, - mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8)) - ) - - i += 16 - - when type(target) is Image: - for j in i div 4 ..< target.data.len: - var rgbx = target.data[j] - rgbx.r = ((rgbx.r * opacity) div 255).uint8 - rgbx.g = ((rgbx.g * opacity) div 255).uint8 - rgbx.b = ((rgbx.b * opacity) div 255).uint8 - rgbx.a = ((rgbx.a * opacity) div 255).uint8 - target.data[j] = rgbx - else: - for j in i ..< target.data.len: - target.data[j] = ((target.data[j] * opacity) div 255).uint8 + for i in 0 ..< image.data.len: + var rgbx = image.data[i] + rgbx.r = ((rgbx.r * opacity) div 255).uint8 + rgbx.g = ((rgbx.g * opacity) div 255).uint8 + rgbx.b = ((rgbx.b * 
opacity) div 255).uint8 + rgbx.a = ((rgbx.a * opacity) div 255).uint8 + image.data[i] = rgbx proc invert*(image: Image) {.raises: [].} = ## Inverts all of the colors and alpha. - var i: int - when defined(amd64) and allowSimd: - let vec255 = mm_set1_epi8(cast[int8](255)) - for _ in 0 ..< image.data.len div 16: - let - a = mm_loadu_si128(image.data[i + 0].addr) - b = mm_loadu_si128(image.data[i + 4].addr) - c = mm_loadu_si128(image.data[i + 8].addr) - d = mm_loadu_si128(image.data[i + 12].addr) - mm_storeu_si128(image.data[i + 0].addr, mm_sub_epi8(vec255, a)) - mm_storeu_si128(image.data[i + 4].addr, mm_sub_epi8(vec255, b)) - mm_storeu_si128(image.data[i + 8].addr, mm_sub_epi8(vec255, c)) - mm_storeu_si128(image.data[i + 12].addr, mm_sub_epi8(vec255, d)) - i += 16 + when allowSimd and compiles(invertImageSimd): + invertImageSimd( + cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr), + image.data.len + ) + return - for j in i ..< image.data.len: - var rgbx = image.data[j] + for i in 0 ..< image.data.len: + var rgbx = image.data[i] rgbx.r = 255 - rgbx.r rgbx.g = 255 - rgbx.g rgbx.b = 255 - rgbx.b rgbx.a = 255 - rgbx.a - image.data[j] = rgbx + image.data[i] = rgbx # Inverting rgbx(50, 100, 150, 200) becomes rgbx(205, 155, 105, 55). This # is not a valid premultiplied alpha color. @@ -564,22 +470,16 @@ proc newMask*(image: Image): Mask {.raises: [PixieError].} = ## Returns a new mask using the alpha values of the image. 
result = newMask(image.width, image.height) - var i: int - when defined(amd64) and allowSimd: - for _ in 0 ..< image.data.len div 16: - let - a = mm_loadu_si128(image.data[i + 0].addr) - b = mm_loadu_si128(image.data[i + 4].addr) - c = mm_loadu_si128(image.data[i + 8].addr) - d = mm_loadu_si128(image.data[i + 12].addr) - mm_storeu_si128( - result.data[i].addr, - pack4xAlphaValues(a, b, c, d) - ) - i += 16 + when allowSimd and compiles(newMaskFromImageSimd): + newMaskFromImageSimd( + cast[ptr UncheckedArray[uint8]](result.data[0].addr), + cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr), + image.data.len + ) + return - for j in i ..< image.data.len: - result.data[j] = image.data[j].a + for i in 0 ..< image.data.len: + result.data[i] = image.data[i].a proc getRgbaSmooth*( image: Image, x, y: float32, wrapped = false diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index cbeb522..2524847 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -2,11 +2,11 @@ import bumpy, chroma, common, system/memory, vmath const allowSimd* = not defined(pixieNoSimd) and not defined(tcc) -when defined(amd64) and allowSimd: - import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx, runtimechecked/avx2 - let - cpuHasAvx* = checkInstructionSets({AVX}) - cpuHasAvx2* = checkInstructionSets({AVX, AVX2}) +when allowSimd: + import simd + + when defined(amd64): + import nimsimd/sse2 template currentExceptionAsPixieError*(): untyped = ## Gets the current exception and returns it as a PixieError with stack trace. @@ -81,45 +81,20 @@ proc fillUnsafe*( ## continuing for len indices. 
let rgbx = color.asRgbx() - # If we can use AVX, do so - when defined(amd64) and allowSimd: - if cpuHasAvx and len >= 64: - fillUnsafeAvx(data, rgbx, start, len) - return + when allowSimd and compiles(fillUnsafeSimd): + fillUnsafeSimd( + cast[ptr UncheckedArray[ColorRGBX]](data[start].addr), + len, + rgbx + ) + return # Use memset when every byte has the same value if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a: nimSetMem(data[start].addr, rgbx.r.cint, len * 4) else: - var i = start - when defined(amd64) and allowSimd: - # Align to 16 bytes - var p = cast[uint](data[i].addr) - while i < (start + len) and (p and 15) != 0: - data[i] = rgbx - inc i - p += 4 - # When supported, SIMD fill until we run out of room - let - colorVec = mm_set1_epi32(cast[int32](rgbx)) - iterations = (start + len - i) div 8 - for _ in 0 ..< iterations: - mm_store_si128(cast[pointer](p), colorVec) - mm_store_si128(cast[pointer](p + 16), colorVec) - p += 32 - i += iterations * 8 - else: - when sizeof(int) == 8: - # Fill 8 bytes at a time when possible - var - u32 = cast[uint32](rgbx) - u64 = cast[uint64]([u32, u32]) - for _ in 0 ..< len div 2: - copyMem(data[i].addr, u64.addr, 8) - i += 2 - # Fill whatever is left the slow way - for i in i ..< start + len: - data[i] = rgbx + for i in start ..< start + len: # Only touch the requested range, like the memset and SIMD paths + data[i] = rgbx const straightAlphaTable = block: var table: array[256, array[256, uint8]] @@ -141,39 +116,14 @@ proc toStraightAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} = proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} = ## Converts an image to premultiplied alpha from straight alpha. 
- var i: int - when defined(amd64) and allowSimd: - if cpuHasAvx2: - i = toPremultipliedAlphaAvx2(data) - else: - let - alphaMask = mm_set1_epi32(cast[int32](0xff000000)) - oddMask = mm_set1_epi16(cast[int16](0xff00)) - div255 = mm_set1_epi16(cast[int16](0x8081)) - for _ in 0 ..< data.len div 4: - let - values = mm_loadu_si128(data[i].addr) - alpha = mm_and_si128(values, alphaMask) - eq = mm_cmpeq_epi8(values, alphaMask) - if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: - let - evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16)) - oddMultiplier = mm_or_si128(evenMultiplier, alphaMask) - var - colorsEven = mm_slli_epi16(values, 8) - colorsOdd = mm_and_si128(values, oddMask) - colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier) - colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier) - colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7) - colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7) - mm_storeu_si128( - data[i].addr, - mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8)) - ) - i += 4 + when allowSimd and compiles(toPremultipliedAlphaSimd): + toPremultipliedAlphaSimd( + cast[ptr UncheckedArray[uint32]](data[0].addr), + data.len + ) + return - # Convert whatever is left - for i in i ..< data.len: + for i in 0 ..< data.len: var c = data[i] if c.a != 255: c.r = ((c.r.uint32 * c.a) div 255).uint8 @@ -182,41 +132,15 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} data[i] = c proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool = - when defined(amd64) and allowSimd: - if cpuHasAvx2 and len >= 64: - return isOpaqueAvx2(data, start, len) + when allowSimd and compiles(isOpaqueSimd): + return isOpaqueSimd( + cast[ptr UncheckedArray[ColorRGBX]](data[start].addr), + len + ) result = true - var i = start - when defined(amd64) and allowSimd: - # Align to 16 bytes - var p = cast[uint](data[i].addr) - while i < (start + len) and (p and 15) != 0: - if data[i].a != 255: - return false - 
inc i - p += 4 - - let - vec255 = mm_set1_epi8(255) - iterations = (start + len - i) div 16 - for _ in 0 ..< iterations: - let - values0 = mm_load_si128(cast[pointer](p)) - values1 = mm_load_si128(cast[pointer](p + 16)) - values2 = mm_load_si128(cast[pointer](p + 32)) - values3 = mm_load_si128(cast[pointer](p + 48)) - values01 = mm_and_si128(values0, values1) - values23 = mm_and_si128(values2, values3) - values0123 = mm_and_si128(values01, values23) - eq = mm_cmpeq_epi8(values0123, vec255) - if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: - return false - p += 64 - i += 16 * iterations - - for i in i ..< start + len: + for i in start ..< start + len: if data[i].a != 255: return false @@ -228,24 +152,7 @@ when defined(amd64) and allowSimd: finalColor = mm_packus_epi16(finalColor, mm_setzero_si128()) cast[ColorRGBX](mm_cvtsi128_si32(finalColor)) - proc packAlphaValues(v: M128i): M128i {.inline, raises: [].} = - ## Shuffle the alpha values for these 4 colors to the first 4 bytes - result = mm_srli_epi32(v, 24) - result = mm_packus_epi16(result, mm_setzero_si128()) - result = mm_packus_epi16(result, mm_setzero_si128()) - - proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline, raises: [].} = - let - i = packAlphaValues(i) - j = mm_slli_si128(packAlphaValues(j), 4) - k = mm_slli_si128(packAlphaValues(k), 8) - l = mm_slli_si128(packAlphaValues(l), 12) - mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l)) - - proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} = - ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value) - result = mm_unpacklo_epi8(mm_setzero_si128(), v) - result = mm_unpacklo_epi8(mm_setzero_si128(), result) + export pack4xAlphaValues, unpackAlphaValues when defined(release): {.pop.} diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim index 7644bba..40b2ea4 100644 --- a/src/pixie/masks.nim +++ b/src/pixie/masks.nim @@ -75,6 +75,10 @@ proc setValue*(mask: Mask, x, y: int, value: uint8) {.inline, raises: [].} = ## Sets a value at 
(x, y) or does nothing if outside of bounds. mask[x, y] = value +proc fill*(mask: Mask, value: uint8) {.inline, raises: [].} = + ## Fills the mask with the value. + fillUnsafe(mask.data, value, 0, mask.data.len) + proc minifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} = ## Scales the mask down by an integer scale. if power < 0: @@ -179,9 +183,26 @@ proc magnifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} = result.width * 4 ) -proc fill*(mask: Mask, value: uint8) {.inline, raises: [].} = - ## Fills the mask with the value. - fillUnsafe(mask.data, value, 0, mask.data.len) +proc applyOpacity*(mask: Mask, opacity: float32) {.raises: [].} = + ## Multiplies alpha of the image by opacity. + let opacity = round(255 * opacity).uint16 + if opacity == 255: + return + + if opacity == 0: + mask.fill(0) + return + + when allowSimd and compiles(applyOpacitySimd): + applyOpacitySimd( + cast[ptr UncheckedArray[uint8]](mask.data[0].addr), + mask.data.len, + opacity + ) + return + + for i in 0 ..< mask.data.len: + mask.data[i] = ((mask.data[i] * opacity) div 255).uint8 proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} = ## Gets a interpolated value with float point coordinates. @@ -213,17 +234,15 @@ proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} = proc invert*(mask: Mask) {.raises: [].} = ## Inverts all of the values - creates a negative of the mask. 
- var i: int - when defined(amd64) and allowSimd: - let vec255 = mm_set1_epi8(255) - for _ in 0 ..< mask.data.len div 16: - var values = mm_loadu_si128(mask.data[i].addr) - values = mm_sub_epi8(vec255, values) - mm_storeu_si128(mask.data[i].addr, values) - i += 16 + when allowSimd and compiles(invertMaskSimd): # Guard on the proc that is actually called + invertMaskSimd( + cast[ptr UncheckedArray[uint8]](mask.data[0].addr), + mask.data.len + ) + return - for j in i ..< mask.data.len: - mask.data[j] = 255 - mask.data[j] + for i in 0 ..< mask.data.len: + mask.data[i] = 255 - mask.data[i] proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} = ## Grows the mask by spread. @@ -288,21 +307,16 @@ proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} = proc ceil*(mask: Mask) {.raises: [].} = ## A value of 0 stays 0. Anything else turns into 255. - var i: int - when defined(amd64) and allowSimd: - let - zeroVec = mm_setzero_si128() - vec255 = mm_set1_epi8(255) - for _ in 0 ..< mask.data.len div 16: - var values = mm_loadu_si128(mask.data[i].addr) - values = mm_cmpeq_epi8(values, zeroVec) - values = mm_andnot_si128(values, vec255) - mm_storeu_si128(mask.data[i].addr, values) - i += 16 + when allowSimd and compiles(ceilMaskSimd): # Guard on the proc that is actually called + ceilMaskSimd( + cast[ptr UncheckedArray[uint8]](mask.data[0].addr), + mask.data.len + ) + return - for j in i ..< mask.data.len: - if mask.data[j] != 0: - mask.data[j] = 255 + for i in 0 ..< mask.data.len: + if mask.data[i] != 0: + mask.data[i] = 255 proc blur*(mask: Mask, radius: float32, outOfBounds: uint8 = 0) {.raises: [PixieError].} = ## Applies Gaussian blur to the image given a radius. 
diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index ef0d6e2..4cb2fa5 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1823,7 +1823,7 @@ proc fillHits( proc fillShapes( image: Image, - shapes: seq[Polygon], + shapes: var seq[Polygon], color: SomeColor, windingRule: WindingRule, blendMode: BlendMode @@ -1852,8 +1852,10 @@ proc fillShapes( var partitions = partitionSegments(segments, startY, pathHeight - startY) partitionIndex: int + entryIndices = newSeq[int](partitions.maxEntryCount) + numEntryIndices: int coverages = newSeq[uint8](pathWidth) - hits = newSeq[(Fixed32, int16)](partitions.maxEntryCount) + hits = newSeq[(Fixed32, int16)](entryIndices.len) numHits: int aa: bool @@ -1895,13 +1897,13 @@ proc fillShapes( y += partitionHeight continue - var - allEntriesInScanlineSpanIt = true - tmp: int - entryIndices: array[2, int] + var allEntriesInScanlineSpanIt = true + numEntryIndices = 0 + if partitions[partitionIndex].twoNonintersectingSpanningSegments: - tmp = 2 - entryIndices = [0, 1] + numEntryIndices = 2 + entryIndices[0] = 0 + entryIndices[1] = 1 else: for i in 0 ..< partitions[partitionIndex].entries.len: if partitions[partitionIndex].entries[i].segment.to.y < y.float32 or @@ -1911,14 +1913,10 @@ proc fillShapes( partitions[partitionIndex].entries[i].segment.to.y < (y + 1).float32: allEntriesInScanlineSpanIt = false break - if tmp < 2: - entryIndices[tmp] = i - inc tmp - else: - tmp = 0 - break + entryIndices[numEntryIndices] = i + inc numEntryIndices - if allEntriesInScanlineSpanIt and tmp == 2: + if allEntriesInScanlineSpanIt and numEntryIndices == 2: var left = partitions[partitionIndex].entries[entryIndices[0]] right = partitions[partitionIndex].entries[entryIndices[1]] diff --git a/src/pixie/runtimechecked/avx.nim b/src/pixie/runtimechecked/avx.nim index 2a3b9d2..cb6d8e0 100644 --- a/src/pixie/runtimechecked/avx.nim +++ b/src/pixie/runtimechecked/avx.nim @@ -7,28 +7,23 @@ when defined(release): {.push checks: off.} proc 
fillUnsafeAvx*( - data: var seq[ColorRGBX], - rgbx: ColorRGBX, - start, len: int + data: ptr UncheckedArray[ColorRGBX], + len: int, + rgbx: ColorRGBX ) = - var - i = start - p = cast[uint](data[i].addr) - # Align to 32 bytes - while i < (start + len) and (p and 31) != 0: + var i: int + while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes data[i] = rgbx inc i - p += 4 - # When supported, SIMD fill until we run out of room + let - iterations = (start + len - i) div 8 + iterations = (len - i) div 8 colorVec = mm256_set1_epi32(cast[int32](rgbx)) for _ in 0 ..< iterations: - mm256_store_si256(cast[pointer](p), colorVec) - p += 32 - i += iterations * 8 + mm256_store_si256(data[i].addr, colorVec) + i += 8 # Fill whatever is left the slow way - for i in i ..< start + len: + for i in i ..< len: data[i] = rgbx when defined(release): diff --git a/src/pixie/runtimechecked/avx2.nim b/src/pixie/runtimechecked/avx2.nim index 3f4a86d..da34dd4 100644 --- a/src/pixie/runtimechecked/avx2.nim +++ b/src/pixie/runtimechecked/avx2.nim @@ -6,108 +6,96 @@ when defined(gcc) or defined(clang): when defined(release): {.push checks: off.} -proc isOneColorAvx2*(data: var seq[ColorRGBX], start, len: int): bool = +proc isOneColorAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = result = true let color = data[0] - var - i = start - p = cast[uint](data[i].addr) - # Align to 32 bytes - while i < (start + len) and (p and 31) != 0: + var i: int + while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes if data[i] != color: return false inc i - p += 4 let colorVec = mm256_set1_epi32(cast[int32](color)) - iterations = (start + len - i) div 16 + iterations = (len - i) div 16 for _ in 0 ..< iterations: let - values0 = mm256_load_si256(cast[pointer](p)) - values1 = mm256_load_si256(cast[pointer](p + 32)) + values0 = mm256_load_si256(data[i].addr) + values1 = mm256_load_si256(data[i + 8].addr) eq0 = mm256_cmpeq_epi8(values0, colorVec) eq1 = 
mm256_cmpeq_epi8(values1, colorVec) eq01 = mm256_and_si256(eq0, eq1) if mm256_movemask_epi8(eq01) != cast[int32](0xffffffff): return false - p += 64 - i += 16 * iterations + i += 16 - for i in i ..< start + len: + for i in i ..< len: if data[i] != color: return false -proc isTransparentAvx2*(data: var seq[ColorRGBX], start, len: int): bool = +proc isTransparentAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = result = true - var - i = start - p = cast[uint](data[i].addr) - # Align to 32 bytes - while i < (start + len) and (p and 31) != 0: + var i: int + while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes if data[i].a != 0: return false inc i - p += 4 let vecZero = mm256_setzero_si256() - iterations = (start + len - i) div 16 + iterations = (len - i) div 16 for _ in 0 ..< iterations: let - values0 = mm256_load_si256(cast[pointer](p)) - values1 = mm256_load_si256(cast[pointer](p + 32)) + values0 = mm256_load_si256(data[i].addr) + values1 = mm256_load_si256(data[i + 8].addr) values01 = mm256_or_si256(values0, values1) eq = mm256_cmpeq_epi8(values01, vecZero) if mm256_movemask_epi8(eq) != cast[int32](0xffffffff): return false - p += 64 - i += 16 * iterations + i += 16 - for i in i ..< start + len: + for i in i ..< len: if data[i].a != 0: return false -proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool = +proc isOpaqueAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = result = true - var - i = start - p = cast[uint](data[i].addr) - # Align to 32 bytes - while i < (start + len) and (p and 31) != 0: + var i: int + while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes if data[i].a != 255: return false inc i - p += 4 let vec255 = mm256_set1_epi8(255) - iterations = (start + len - i) div 16 + iterations = (len - i) div 16 for _ in 0 ..< iterations: let - values0 = mm256_load_si256(cast[pointer](p)) - values1 = mm256_load_si256(cast[pointer](p + 32)) + values0 = 
mm256_load_si256(data[i].addr) + values1 = mm256_load_si256(data[i + 8].addr) values01 = mm256_and_si256(values0, values1) eq = mm256_cmpeq_epi8(values01, vec255) if (mm256_movemask_epi8(eq) and 0x88888888) != 0x88888888: return false - p += 64 - i += 16 * iterations + i += 16 - for i in i ..< start + len: + for i in i ..< len: if data[i].a != 255: return false -proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]): int = +proc toPremultipliedAlphaAvx2*( + data: ptr UncheckedArray[uint32], + len: int +): int = let alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) oddMask = mm256_set1_epi16(cast[int16](0xff00)) div255 = mm256_set1_epi16(cast[int16](0x8081)) - for _ in 0 ..< data.len div 8: + for _ in 0 ..< len div 8: let values = mm256_loadu_si256(data[result].addr) alpha = mm256_and_si256(values, alphaMask) diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim new file mode 100644 index 0000000..d789bdf --- /dev/null +++ b/src/pixie/simd.nim @@ -0,0 +1,322 @@ +import chroma, vmath + +when defined(release): + {.push checks: off.} + +when defined(amd64): + import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx, + runtimechecked/avx2 + + let + cpuHasAvx* = checkInstructionSets({AVX}) + cpuHasAvx2* = checkInstructionSets({AVX, AVX2}) + + proc packAlphaValues(v: M128i): M128i {.inline.} = + ## Shuffle the alpha values for these 4 colors to the first 4 bytes. + result = mm_srli_epi32(v, 24) + result = mm_packus_epi16(result, mm_setzero_si128()) + result = mm_packus_epi16(result, mm_setzero_si128()) + + proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline.} = + let + i = packAlphaValues(i) + j = mm_slli_si128(packAlphaValues(j), 4) + k = mm_slli_si128(packAlphaValues(k), 8) + l = mm_slli_si128(packAlphaValues(l), 12) + mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l)) + + proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} = + ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value). 
+ result = mm_unpacklo_epi8(mm_setzero_si128(), v) + result = mm_unpacklo_epi8(mm_setzero_si128(), result) + + proc fillUnsafeSimd*( + data: ptr UncheckedArray[ColorRGBX], + len: int, + rgbx: ColorRGBX + ) = + if cpuHasAvx and len >= 64: + fillUnsafeAvx(data, len, rgbx) + else: + var i: int + while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes + data[i] = rgbx + inc i + + let + colorVec = mm_set1_epi32(cast[int32](rgbx)) + iterations = (len - i) div 8 + for _ in 0 ..< iterations: + mm_store_si128(data[i].addr, colorVec) + mm_store_si128(data[i + 4].addr, colorVec) + i += 8 + + for i in i ..< len: + data[i] = rgbx + + proc isOneColorSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = + if cpuHasAvx2: + return isOneColorAvx2(data, len) + + result = true + + let color = data[0] + + var i: int + while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes + if data[i] != color: + return false + inc i + + let + colorVec = mm_set1_epi32(cast[int32](color)) + iterations = (len - i) div 16 + for _ in 0 ..< iterations: + let + values0 = mm_load_si128(data[i].addr) + values1 = mm_load_si128(data[i + 4].addr) + values2 = mm_load_si128(data[i + 8].addr) + values3 = mm_load_si128(data[i + 12].addr) + eq0 = mm_cmpeq_epi8(values0, colorVec) + eq1 = mm_cmpeq_epi8(values1, colorVec) + eq2 = mm_cmpeq_epi8(values2, colorVec) + eq3 = mm_cmpeq_epi8(values3, colorVec) + eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3)) + if mm_movemask_epi8(eq0123) != 0xffff: + return false + i += 16 + + for i in i ..< len: + if data[i] != color: + return false + + proc isTransparentSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = + if cpuHasAvx2: + return isTransparentAvx2(data, len) + + var i: int + while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes + if data[i].a != 0: + return false + inc i + + result = true + + let + vecZero = mm_setzero_si128() + iterations = (len - i) div 16 + for _ in 0 
..< iterations: + let + values0 = mm_load_si128(data[i].addr) + values1 = mm_load_si128(data[i + 4].addr) + values2 = mm_load_si128(data[i + 8].addr) + values3 = mm_load_si128(data[i + 12].addr) + values01 = mm_or_si128(values0, values1) + values23 = mm_or_si128(values2, values3) + values0123 = mm_or_si128(values01, values23) + if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff: + return false + i += 16 + + for i in i ..< len: + if data[i].a != 0: + return false + + proc isOpaqueSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = + if cpuHasAvx2: + return isOpaqueAvx2(data, len) + + result = true + + var i: int + while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes + if data[i].a != 255: + return false + inc i + + let + vec255 = mm_set1_epi8(255) + iterations = (len - i) div 16 + for _ in 0 ..< iterations: + let + values0 = mm_load_si128(data[i].addr) + values1 = mm_load_si128(data[i + 4].addr) + values2 = mm_load_si128(data[i + 8].addr) + values3 = mm_load_si128(data[i + 12].addr) + values01 = mm_and_si128(values0, values1) + values23 = mm_and_si128(values2, values3) + values0123 = mm_and_si128(values01, values23) + eq = mm_cmpeq_epi8(values0123, vec255) + if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: + return false + i += 16 + + for i in i ..< len: + if data[i].a != 255: + return false + + proc toPremultipliedAlphaSimd*(data: ptr UncheckedArray[uint32], len: int) = + var i: int + if cpuHasAvx2: + i = toPremultipliedAlphaAvx2(data, len) + else: + let + alphaMask = mm_set1_epi32(cast[int32](0xff000000)) + oddMask = mm_set1_epi16(cast[int16](0xff00)) + div255 = mm_set1_epi16(cast[int16](0x8081)) + for _ in 0 ..< len div 4: + let + values = mm_loadu_si128(data[i].addr) + alpha = mm_and_si128(values, alphaMask) + eq = mm_cmpeq_epi8(values, alphaMask) + if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: + let + evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16)) + oddMultiplier = 
mm_or_si128(evenMultiplier, alphaMask) + var + colorsEven = mm_slli_epi16(values, 8) + colorsOdd = mm_and_si128(values, oddMask) + colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier) + colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier) + colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7) + colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7) + mm_storeu_si128( + data[i].addr, + mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8)) + ) + i += 4 + + for i in i ..< len: + var c: ColorRGBX + copyMem(c.addr, data[i].addr, 4) + c.r = ((c.r.uint32 * c.a) div 255).uint8 + c.g = ((c.g.uint32 * c.a) div 255).uint8 + c.b = ((c.b.uint32 * c.a) div 255).uint8 + copyMem(data[i].addr, c.addr, 4) + + proc newImageFromMaskSimd*( + dst: ptr UncheckedArray[ColorRGBX], + src: ptr UncheckedArray[uint8], + len: int + ) = + var i: int + for _ in 0 ..< len div 16: + var alphas = mm_loadu_si128(src[i].addr) + for j in 0 ..< 4: + var unpacked = unpackAlphaValues(alphas) + unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 8)) + unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16)) + mm_storeu_si128(dst[i + j * 4].addr, unpacked) + alphas = mm_srli_si128(alphas, 4) + i += 16 + + for i in i ..< len: + let v = src[i] + dst[i] = rgbx(v, v, v, v) + + proc newMaskFromImageSimd*( + dst: ptr UncheckedArray[uint8], + src: ptr UncheckedArray[ColorRGBX], + len: int + ) = + var i: int + for _ in 0 ..< len div 16: + let + a = mm_loadu_si128(src[i + 0].addr) + b = mm_loadu_si128(src[i + 4].addr) + c = mm_loadu_si128(src[i + 8].addr) + d = mm_loadu_si128(src[i + 12].addr) + mm_storeu_si128( + dst[i].addr, + pack4xAlphaValues(a, b, c, d) + ) + i += 16 + + for i in i ..< len: + dst[i] = src[i].a + + proc invertImageSimd*(data: ptr UncheckedArray[ColorRGBX], len: int) = + var i: int + let vec255 = mm_set1_epi8(cast[int8](255)) + for _ in 0 ..< len div 16: + let + a = mm_loadu_si128(data[i + 0].addr) + b = mm_loadu_si128(data[i + 4].addr) + c = 
mm_loadu_si128(data[i + 8].addr) + d = mm_loadu_si128(data[i + 12].addr) + mm_storeu_si128(data[i + 0].addr, mm_sub_epi8(vec255, a)) + mm_storeu_si128(data[i + 4].addr, mm_sub_epi8(vec255, b)) + mm_storeu_si128(data[i + 8].addr, mm_sub_epi8(vec255, c)) + mm_storeu_si128(data[i + 12].addr, mm_sub_epi8(vec255, d)) + i += 16 + + for i in i ..< len: + var rgbx = data[i] + rgbx.r = 255 - rgbx.r + rgbx.g = 255 - rgbx.g + rgbx.b = 255 - rgbx.b + rgbx.a = 255 - rgbx.a + data[i] = rgbx + + toPremultipliedAlphaSimd(cast[ptr UncheckedArray[uint32]](data), len) + + proc invertMaskSimd*(data: ptr UncheckedArray[uint8], len: int) = + var i: int + let vec255 = mm_set1_epi8(255) + for _ in 0 ..< len div 16: + var values = mm_loadu_si128(data[i].addr) + values = mm_sub_epi8(vec255, values) + mm_storeu_si128(data[i].addr, values) + i += 16 + + for j in i ..< len: + data[j] = 255 - data[j] + + proc ceilMaskSimd*(data: ptr UncheckedArray[uint8], len: int) = + var i: int + let + zeroVec = mm_setzero_si128() + vec255 = mm_set1_epi8(255) + for _ in 0 ..< len div 16: + var values = mm_loadu_si128(data[i].addr) + values = mm_cmpeq_epi8(values, zeroVec) + values = mm_andnot_si128(values, vec255) + mm_storeu_si128(data[i].addr, values) + i += 16 + + for i in i ..< len: + if data[i] != 0: + data[i] = 255 + + proc applyOpacitySimd*( + data: ptr UncheckedArray[uint8], + len: int, + opacity: uint16 + ) = + var i: int + let + oddMask = mm_set1_epi16(cast[int16](0xff00)) + div255 = mm_set1_epi16(cast[int16](0x8081)) + zeroVec = mm_setzero_si128() + opacityVec = mm_slli_epi16(mm_set1_epi16(cast[int16](opacity)), 8) + for _ in 0 ..< len div 16: + let values = mm_loadu_si128(data[i].addr) + if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff: + var + valuesEven = mm_slli_epi16(values, 8) + valuesOdd = mm_and_si128(values, oddMask) + valuesEven = mm_mulhi_epu16(valuesEven, opacityVec) + valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec) + valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, 
div255), 7) + valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7) + mm_storeu_si128( + data[i].addr, + mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8)) + ) + i += 16 + + for i in i ..< len: + data[i] = ((data[i] * opacity) div 255).uint8 + +when defined(release): + {.pop.}