diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index ebc2c2d..3f9eff1 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -1,7 +1,10 @@
-import blends, bumpy, chroma, common, masks, pixie/internal, vmath
+import blends, bumpy, chroma, common, masks, internal, vmath
 
-when defined(amd64) and allowSimd:
-  import nimsimd/sse2, runtimechecked/avx2
+when allowSimd:
+  import simd
+
+  when defined(amd64):
+    import nimsimd/sse2
 
 const h = 0.5.float32
 
@@ -101,83 +104,30 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} =
 
 proc isOneColor*(image: Image): bool {.raises: [].} =
   ## Checks if the entire image is the same color.
-  when defined(amd64) and allowSimd:
-    if cpuHasAvx2:
-      return isOneColorAvx2(image.data, 0, image.data.len)
+  when allowSimd and compiles(isOneColorSimd):
+    return isOneColorSimd(
+      cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
+      image.data.len
+    )
 
   result = true
 
-  let color = image.data[0]
-
-  var i: int
-  when defined(amd64) and allowSimd:
-    # Align to 16 bytes
-    var p = cast[uint](image.data[i].addr)
-    while i < image.data.len and (p and 15) != 0:
-      if image.data[i] != color:
-        return false
-      inc i
-      p += 4
-
-    let
-      colorVec = mm_set1_epi32(cast[int32](color))
-      iterations = (image.data.len - i) div 16
-    for _ in 0 ..< iterations:
-      let
-        values0 = mm_load_si128(cast[pointer](p))
-        values1 = mm_load_si128(cast[pointer](p + 16))
-        values2 = mm_load_si128(cast[pointer](p + 32))
-        values3 = mm_load_si128(cast[pointer](p + 48))
-        eq0 = mm_cmpeq_epi8(values0, colorVec)
-        eq1 = mm_cmpeq_epi8(values1, colorVec)
-        eq2 = mm_cmpeq_epi8(values2, colorVec)
-        eq3 = mm_cmpeq_epi8(values3, colorVec)
-        eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
-      if mm_movemask_epi8(eq0123) != 0xffff:
-        return false
-      p += 64
-    i += 16 * iterations
-
-  for i in i ..< image.data.len:
-    if image.data[i] != color:
+  let color = cast[uint32](image.data[0])
+  for i in 0 ..< image.data.len:
+    if cast[uint32](image.data[i]) != color:
       return false
 
 proc isTransparent*(image: Image): bool {.raises: [].} =
   ## Checks if this image is fully transparent or not.
-  when defined(amd64) and allowSimd:
-    if cpuHasAvx2:
-      return isTransparentAvx2(image.data, 0, image.data.len)
+  when allowSimd and compiles(isTransparentSimd):
+    return isTransparentSimd(
+      cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
+      image.data.len
+    )
 
   result = true
 
-  var i: int
-  when defined(amd64) and allowSimd:
-    # Align to 16 bytes
-    var p = cast[uint](image.data[i].addr)
-    while i < image.data.len and (p and 15) != 0:
-      if image.data[i].a != 0:
-        return false
-      inc i
-      p += 4
-
-    let
-      vecZero = mm_setzero_si128()
-      iterations = (image.data.len - i) div 16
-    for _ in 0 ..< iterations:
-      let
-        values0 = mm_load_si128(cast[pointer](p))
-        values1 = mm_load_si128(cast[pointer](p + 16))
-        values2 = mm_load_si128(cast[pointer](p + 32))
-        values3 = mm_load_si128(cast[pointer](p + 48))
-        values01 = mm_or_si128(values0, values1)
-        values23 = mm_or_si128(values2, values3)
-        values0123 = mm_or_si128(values01, values23)
-      if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff:
-        return false
-      p += 64
-    i += 16 * iterations
-
-  for i in i ..< image.data.len:
+  for i in 0 ..< image.data.len:
     if image.data[i].a != 0:
       return false
 
diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim
index cbeb522..28aac9c 100644
--- a/src/pixie/internal.nim
+++ b/src/pixie/internal.nim
@@ -2,11 +2,11 @@ import bumpy, chroma, common, system/memory, vmath
 
 const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
 
-when defined(amd64) and allowSimd:
-  import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx, runtimechecked/avx2
-  let
-    cpuHasAvx* = checkInstructionSets({AVX})
-    cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
+when allowSimd:
+  import simd
+
+  when defined(amd64):
+    import nimsimd/sse2
 
 template currentExceptionAsPixieError*(): untyped =
   ## Gets the current exception and returns it as a PixieError with stack trace.
@@ -81,45 +81,20 @@ proc fillUnsafe*(
   ## continuing for len indices.
   let rgbx = color.asRgbx()
 
-  # If we can use AVX, do so
-  when defined(amd64) and allowSimd:
-    if cpuHasAvx and len >= 64:
-      fillUnsafeAvx(data, rgbx, start, len)
-      return
+  when allowSimd and compiles(fillUnsafeSimd):
+    fillUnsafeSimd(
+      cast[ptr UncheckedArray[ColorRGBX]](data[start].addr),
+      len,
+      rgbx
+    )
+    return
 
   # Use memset when every byte has the same value
   if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
     nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
   else:
-    var i = start
-    when defined(amd64) and allowSimd:
-      # Align to 16 bytes
-      var p = cast[uint](data[i].addr)
-      while i < (start + len) and (p and 15) != 0:
-        data[i] = rgbx
-        inc i
-        p += 4
-      # When supported, SIMD fill until we run out of room
-      let
-        colorVec = mm_set1_epi32(cast[int32](rgbx))
-        iterations = (start + len - i) div 8
-      for _ in 0 ..< iterations:
-        mm_store_si128(cast[pointer](p), colorVec)
-        mm_store_si128(cast[pointer](p + 16), colorVec)
-        p += 32
-      i += iterations * 8
-    else:
-      when sizeof(int) == 8:
-        # Fill 8 bytes at a time when possible
-        var
-          u32 = cast[uint32](rgbx)
-          u64 = cast[uint64]([u32, u32])
-        for _ in 0 ..< len div 2:
-          copyMem(data[i].addr, u64.addr, 8)
-          i += 2
-    # Fill whatever is left the slow way
-    for i in i ..< start + len:
-      data[i] = rgbx
+    for i in start ..< start + len:
+      data[i] = rgbx
 
 const straightAlphaTable = block:
   var table: array[256, array[256, uint8]]
@@ -141,39 +116,14 @@ proc toStraightAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
 
 proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
   ## Converts an image to premultiplied alpha from straight alpha.
-  var i: int
-  when defined(amd64) and allowSimd:
-    if cpuHasAvx2:
-      i = toPremultipliedAlphaAvx2(data)
-    else:
-      let
-        alphaMask = mm_set1_epi32(cast[int32](0xff000000))
-        oddMask = mm_set1_epi16(cast[int16](0xff00))
-        div255 = mm_set1_epi16(cast[int16](0x8081))
-      for _ in 0 ..< data.len div 4:
-        let
-          values = mm_loadu_si128(data[i].addr)
-          alpha = mm_and_si128(values, alphaMask)
-          eq = mm_cmpeq_epi8(values, alphaMask)
-        if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
-          let
-            evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
-            oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
-          var
-            colorsEven = mm_slli_epi16(values, 8)
-            colorsOdd = mm_and_si128(values, oddMask)
-          colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
-          colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
-          colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
-          colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
-          mm_storeu_si128(
-            data[i].addr,
-            mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
-          )
-        i += 4
+  when allowSimd and compiles(toPremultipliedAlphaSimd):
+    toPremultipliedAlphaSimd(
+      cast[ptr UncheckedArray[uint32]](data[0].addr),
+      data.len
+    )
+    return
 
-  # Convert whatever is left
-  for i in i ..< data.len:
+  for i in 0 ..< data.len:
     var c = data[i]
     if c.a != 255:
       c.r = ((c.r.uint32 * c.a) div 255).uint8
@@ -182,41 +132,15 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].}
     data[i] = c
 
 proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool =
-  when defined(amd64) and allowSimd:
-    if cpuHasAvx2 and len >= 64:
-      return isOpaqueAvx2(data, start, len)
+  when allowSimd and compiles(isOpaqueSimd):
+    return isOpaqueSimd(
+      cast[ptr UncheckedArray[ColorRGBX]](data[start].addr),
+      len
+    )
 
   result = true
 
-  var i = start
-  when defined(amd64) and allowSimd:
-    # Align to 16 bytes
-    var p = cast[uint](data[i].addr)
-    while i < (start + len) and (p and 15) != 0:
-      if data[i].a != 255:
-        return false
-      inc i
-      p += 4
-
-    let
-      vec255 = mm_set1_epi8(255)
-      iterations = (start + len - i) div 16
-    for _ in 0 ..< iterations:
-      let
-        values0 = mm_load_si128(cast[pointer](p))
-        values1 = mm_load_si128(cast[pointer](p + 16))
-        values2 = mm_load_si128(cast[pointer](p + 32))
-        values3 = mm_load_si128(cast[pointer](p + 48))
-        values01 = mm_and_si128(values0, values1)
-        values23 = mm_and_si128(values2, values3)
-        values0123 = mm_and_si128(values01, values23)
-        eq = mm_cmpeq_epi8(values0123, vec255)
-      if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
-        return false
-      p += 64
-    i += 16 * iterations
-
-  for i in i ..< start + len:
+  for i in start ..< start + len:
     if data[i].a != 255:
       return false
 
diff --git a/src/pixie/runtimechecked/avx.nim b/src/pixie/runtimechecked/avx.nim
index 2a3b9d2..cb6d8e0 100644
--- a/src/pixie/runtimechecked/avx.nim
+++ b/src/pixie/runtimechecked/avx.nim
@@ -7,28 +7,23 @@ when defined(release):
   {.push checks: off.}
 
 proc fillUnsafeAvx*(
-  data: var seq[ColorRGBX],
-  rgbx: ColorRGBX,
-  start, len: int
+  data: ptr UncheckedArray[ColorRGBX],
+  len: int,
+  rgbx: ColorRGBX
 ) =
-  var
-    i = start
-    p = cast[uint](data[i].addr)
-  # Align to 32 bytes
-  while i < (start + len) and (p and 31) != 0:
+  var i: int
+  while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
     data[i] = rgbx
     inc i
-    p += 4
-  # When supported, SIMD fill until we run out of room
+
   let
-    iterations = (start + len - i) div 8
+    iterations = (len - i) div 8
     colorVec = mm256_set1_epi32(cast[int32](rgbx))
   for _ in 0 ..< iterations:
-    mm256_store_si256(cast[pointer](p), colorVec)
-    p += 32
-  i += iterations * 8
+    mm256_store_si256(data[i].addr, colorVec)
+    i += 8
   # Fill whatever is left the slow way
-  for i in i ..< start + len:
+  for i in i ..< len:
     data[i] = rgbx
 
 when defined(release):
diff --git a/src/pixie/runtimechecked/avx2.nim b/src/pixie/runtimechecked/avx2.nim
index 3f4a86d..bfa31a9 100644
--- a/src/pixie/runtimechecked/avx2.nim
+++ b/src/pixie/runtimechecked/avx2.nim
@@ -6,108 +6,93 @@ when defined(gcc) or defined(clang):
 when defined(release):
   {.push checks: off.}
 
-proc isOneColorAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
+proc isOneColorAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
   result = true
 
   let color = data[0]
 
-  var
-    i = start
-    p = cast[uint](data[i].addr)
-  # Align to 32 bytes
-  while i < (start + len) and (p and 31) != 0:
+  var i: int
+  while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
     if data[i] != color:
       return false
     inc i
-    p += 4
 
   let
     colorVec = mm256_set1_epi32(cast[int32](color))
-    iterations = (start + len - i) div 16
+    iterations = (len - i) div 16
   for _ in 0 ..< iterations:
     let
-      values0 = mm256_load_si256(cast[pointer](p))
-      values1 = mm256_load_si256(cast[pointer](p + 32))
+      values0 = mm256_load_si256(data[i].addr)
+      values1 = mm256_load_si256(data[i + 8].addr)
       eq0 = mm256_cmpeq_epi8(values0, colorVec)
       eq1 = mm256_cmpeq_epi8(values1, colorVec)
       eq01 = mm256_and_si256(eq0, eq1)
     if mm256_movemask_epi8(eq01) != cast[int32](0xffffffff):
       return false
-    p += 64
-  i += 16 * iterations
+    i += 16
 
-  for i in i ..< start + len:
+  for i in i ..< len:
     if data[i] != color:
       return false
 
-proc isTransparentAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
+proc isTransparentAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
   result = true
 
-  var
-    i = start
-    p = cast[uint](data[i].addr)
-  # Align to 32 bytes
-  while i < (start + len) and (p and 31) != 0:
+  var i: int
+  while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
     if data[i].a != 0:
       return false
     inc i
-    p += 4
 
   let
     vecZero = mm256_setzero_si256()
-    iterations = (start + len - i) div 16
+    iterations = (len - i) div 16
   for _ in 0 ..< iterations:
     let
-      values0 = mm256_load_si256(cast[pointer](p))
-      values1 = mm256_load_si256(cast[pointer](p + 32))
+      values0 = mm256_load_si256(data[i].addr)
+      values1 = mm256_load_si256(data[i + 8].addr)
       values01 = mm256_or_si256(values0, values1)
       eq = mm256_cmpeq_epi8(values01, vecZero)
     if mm256_movemask_epi8(eq) != cast[int32](0xffffffff):
       return false
-    p += 64
-  i += 16 * iterations
+    i += 16
 
-  for i in i ..< start + len:
+  for i in i ..< len:
     if data[i].a != 0:
       return false
 
-proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
+proc isOpaqueAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
   result = true
 
-  var
-    i = start
-    p = cast[uint](data[i].addr)
-  # Align to 32 bytes
-  while i < (start + len) and (p and 31) != 0:
+  var i: int
+  while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
     if data[i].a != 255:
       return false
     inc i
-    p += 4
 
   let
     vec255 = mm256_set1_epi8(255)
-    iterations = (start + len - i) div 16
+    iterations = (len - i) div 16
   for _ in 0 ..< iterations:
     let
-      values0 = mm256_load_si256(cast[pointer](p))
-      values1 = mm256_load_si256(cast[pointer](p + 32))
+      values0 = mm256_load_si256(data[i].addr)
+      values1 = mm256_load_si256(data[i + 8].addr)
       values01 = mm256_and_si256(values0, values1)
       eq = mm256_cmpeq_epi8(values01, vec255)
     if (mm256_movemask_epi8(eq) and 0x88888888) != 0x88888888:
       return false
-    p += 64
-  i += 16 * iterations
+    i += 16
 
-  for i in i ..< start + len:
+  for i in i ..< len:
     if data[i].a != 255:
       return false
 
-proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]): int =
+proc toPremultipliedAlphaAvx2*(data: ptr UncheckedArray[uint32], len: int): int =
   let
     alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
     oddMask = mm256_set1_epi16(cast[int16](0xff00))
     div255 = mm256_set1_epi16(cast[int16](0x8081))
-  for _ in 0 ..< data.len div 8:
+  for _ in 0 ..< len div 8:
     let
       values = mm256_loadu_si256(data[result].addr)
       alpha = mm256_and_si256(values, alphaMask)
diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim
new file mode 100644
index 0000000..0f9fae8
--- /dev/null
+++ b/src/pixie/simd.nim
@@ -0,0 +1,180 @@
+import chroma
+
+when defined(amd64):
+  import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx, runtimechecked/avx2
+
+  let
+    cpuHasAvx* = checkInstructionSets({AVX})
+    cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
+
+  proc fillUnsafeSimd*(
+    data: ptr UncheckedArray[ColorRGBX],
+    len: int,
+    rgbx: ColorRGBX
+  ) =
+    ## Fills len pixels starting at data with rgbx, using AVX stores when the
+    ## CPU has AVX and len >= 64, and aligned SSE2 stores otherwise.
+    if cpuHasAvx and len >= 64:
+      fillUnsafeAvx(data, len, rgbx)
+    else:
+      var i: int
+      while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes
+        data[i] = rgbx
+        inc i
+
+      let
+        colorVec = mm_set1_epi32(cast[int32](rgbx))
+        iterations = (len - i) div 8
+      for _ in 0 ..< iterations:
+        mm_store_si128(data[i].addr, colorVec)
+        mm_store_si128(data[i + 4].addr, colorVec)
+        i += 8
+
+      for i in i ..< len:
+        data[i] = rgbx
+
+  proc isOneColorSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
+    ## Returns true when all len pixels equal data[0]; uses AVX2 if available.
+    if cpuHasAvx2:
+      return isOneColorAvx2(data, len)
+
+    result = true
+
+    let color = data[0]
+
+    var i: int
+    while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes
+      if data[i] != color:
+        return false
+      inc i
+
+    let
+      colorVec = mm_set1_epi32(cast[int32](color))
+      iterations = (len - i) div 16
+    for _ in 0 ..< iterations:
+      let
+        values0 = mm_load_si128(data[i].addr)
+        values1 = mm_load_si128(data[i + 4].addr)
+        values2 = mm_load_si128(data[i + 8].addr)
+        values3 = mm_load_si128(data[i + 12].addr)
+        eq0 = mm_cmpeq_epi8(values0, colorVec)
+        eq1 = mm_cmpeq_epi8(values1, colorVec)
+        eq2 = mm_cmpeq_epi8(values2, colorVec)
+        eq3 = mm_cmpeq_epi8(values3, colorVec)
+        eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
+      if mm_movemask_epi8(eq0123) != 0xffff:
+        return false
+      i += 16
+
+    for i in i ..< len:
+      if data[i] != color:
+        return false
+
+  proc isTransparentSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
+    ## Checks that all len pixels are transparent. The SSE2 loop compares whole
+    ## pixels against zero; the scalar loops look at the alpha channel only.
+    if cpuHasAvx2:
+      return isTransparentAvx2(data, len)
+
+    result = true
+
+    var i: int
+    while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes
+      if data[i].a != 0:
+        return false
+      inc i
+
+    let
+      vecZero = mm_setzero_si128()
+      iterations = (len - i) div 16
+    for _ in 0 ..< iterations:
+      let
+        values0 = mm_load_si128(data[i].addr)
+        values1 = mm_load_si128(data[i + 4].addr)
+        values2 = mm_load_si128(data[i + 8].addr)
+        values3 = mm_load_si128(data[i + 12].addr)
+        values01 = mm_or_si128(values0, values1)
+        values23 = mm_or_si128(values2, values3)
+        values0123 = mm_or_si128(values01, values23)
+      if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff:
+        return false
+      i += 16
+
+    for i in i ..< len:
+      if data[i].a != 0:
+        return false
+
+  proc isOpaqueSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
+    ## Returns true when every one of the len pixels has an alpha of 255.
+    if cpuHasAvx2:
+      return isOpaqueAvx2(data, len)
+
+    result = true
+
+    var i: int
+    while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes
+      if data[i].a != 255:
+        return false
+      inc i
+
+    let
+      vec255 = mm_set1_epi8(255)
+      iterations = (len - i) div 16
+    for _ in 0 ..< iterations:
+      let
+        values0 = mm_load_si128(data[i].addr)
+        values1 = mm_load_si128(data[i + 4].addr)
+        values2 = mm_load_si128(data[i + 8].addr)
+        values3 = mm_load_si128(data[i + 12].addr)
+        values01 = mm_and_si128(values0, values1)
+        values23 = mm_and_si128(values2, values3)
+        values0123 = mm_and_si128(values01, values23)
+        eq = mm_cmpeq_epi8(values0123, vec255)
+      if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
+        return false
+      i += 16
+
+    for i in i ..< len:
+      if data[i].a != 255:
+        return false
+
+  proc toPremultipliedAlphaSimd*(data: ptr UncheckedArray[uint32], len: int) =
+    ## Premultiplies len pixels in place, vectorized with AVX2 or SSE2 first
+    ## and then one pixel at a time for whatever is left.
+    var i: int
+    if cpuHasAvx2:
+      i = toPremultipliedAlphaAvx2(data, len)
+    else:
+      let
+        alphaMask = mm_set1_epi32(cast[int32](0xff000000))
+        oddMask = mm_set1_epi16(cast[int16](0xff00))
+        div255 = mm_set1_epi16(cast[int16](0x8081))
+      for _ in 0 ..< len div 4:
+        let
+          values = mm_loadu_si128(data[i].addr)
+          alpha = mm_and_si128(values, alphaMask)
+          eq = mm_cmpeq_epi8(values, alphaMask)
+        if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
+          let
+            evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
+            oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
+          var
+            colorsEven = mm_slli_epi16(values, 8)
+            colorsOdd = mm_and_si128(values, oddMask)
+          colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
+          colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
+          colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
+          colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
+          mm_storeu_si128(
+            data[i].addr,
+            mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
+          )
+        i += 4
+
+    for i in i ..< len:
+      var c: ColorRGBX
+      copyMem(c.addr, data[i].addr, 4)
+      c.r = ((c.r.uint32 * c.a) div 255).uint8
+      c.g = ((c.g.uint32 * c.a) div 255).uint8
+      c.b = ((c.b.uint32 * c.a) div 255).uint8
+      copyMem(data[i].addr, c.addr, 4)
+