diff --git a/pixie.nimble b/pixie.nimble
index c79cb13..ef57b10 100644
--- a/pixie.nimble
+++ b/pixie.nimble
@@ -10,7 +10,7 @@ requires "vmath >= 1.1.4"
 requires "chroma >= 0.2.5"
 requires "zippy >= 0.10.2"
 requires "flatty >= 0.3.4"
-requires "nimsimd >= 1.1.1"
+requires "nimsimd >= 1.1.5"
 requires "bumpy >= 1.1.1"
 
 task bindings, "Generate bindings":
diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index 5f23e00..ebc2c2d 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -1,7 +1,7 @@
 import blends, bumpy, chroma, common, masks, pixie/internal, vmath
 
 when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+  import nimsimd/sse2, runtimechecked/avx2
 
 const h = 0.5.float32
 
@@ -101,54 +101,90 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} =
 
 proc isOneColor*(image: Image): bool {.raises: [].} =
   ## Checks if the entire image is the same color.
+  if image.data.len == 0:
+    return true # Nothing to compare, data[0] does not exist
+
+  when defined(amd64) and allowSimd:
+    if cpuHasAvx2:
+      return isOneColorAvx2(image.data, 0, image.data.len)
+
   result = true
 
   let color = image.data[0]
 
   var i: int
   when defined(amd64) and allowSimd:
-    let colorVec = mm_set1_epi32(cast[int32](color))
-    for _ in 0 ..< image.data.len div 16:
+    # Align to 16 bytes
+    var p = cast[uint](image.data[i].addr)
+    while i < image.data.len and (p and 15) != 0:
+      if image.data[i] != color:
+        return false
+      inc i
+      p += 4
+
+    let
+      colorVec = mm_set1_epi32(cast[int32](color))
+      iterations = (image.data.len - i) div 16
+    for _ in 0 ..< iterations:
       let
-        values0 = mm_loadu_si128(image.data[i + 0].addr)
-        values1 = mm_loadu_si128(image.data[i + 4].addr)
-        values2 = mm_loadu_si128(image.data[i + 8].addr)
-        values3 = mm_loadu_si128(image.data[i + 12].addr)
+        values0 = mm_load_si128(cast[pointer](p))
+        values1 = mm_load_si128(cast[pointer](p + 16))
+        values2 = mm_load_si128(cast[pointer](p + 32))
+        values3 = mm_load_si128(cast[pointer](p + 48))
         eq0 = mm_cmpeq_epi8(values0, colorVec)
         eq1 = mm_cmpeq_epi8(values1, colorVec)
         eq2 = mm_cmpeq_epi8(values2, colorVec)
         eq3 = mm_cmpeq_epi8(values3, colorVec)
-        eq = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
-      if mm_movemask_epi8(eq) != 0xffff:
+        eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
+      if mm_movemask_epi8(eq0123) != 0xffff:
         return false
-      i += 16
+      p += 64
+    i += 16 * iterations
 
-  for j in i ..< image.data.len:
-    if image.data[j] != color:
+  for i in i ..< image.data.len:
+    if image.data[i] != color:
       return false
 
 proc isTransparent*(image: Image): bool {.raises: [].} =
   ## Checks if this image is fully transparent or not.
+  if image.data.len == 0:
+    return true # Nothing to check, data[0] does not exist
+
+  when defined(amd64) and allowSimd:
+    if cpuHasAvx2:
+      return isTransparentAvx2(image.data, 0, image.data.len)
+
   result = true
 
   var i: int
   when defined(amd64) and allowSimd:
-    let vecZero = mm_setzero_si128()
-    for _ in 0 ..< image.data.len div 16:
+    # Align to 16 bytes
+    var p = cast[uint](image.data[i].addr)
+    while i < image.data.len and (p and 15) != 0:
+      if image.data[i].a != 0:
+        return false
+      inc i
+      p += 4
+
+    let
+      vecZero = mm_setzero_si128()
+      iterations = (image.data.len - i) div 16
+    for _ in 0 ..< iterations:
       let
-        values0 = mm_loadu_si128(image.data[i + 0].addr)
-        values1 = mm_loadu_si128(image.data[i + 4].addr)
-        values2 = mm_loadu_si128(image.data[i + 8].addr)
-        values3 = mm_loadu_si128(image.data[i + 12].addr)
+        values0 = mm_load_si128(cast[pointer](p))
+        values1 = mm_load_si128(cast[pointer](p + 16))
+        values2 = mm_load_si128(cast[pointer](p + 32))
+        values3 = mm_load_si128(cast[pointer](p + 48))
         values01 = mm_or_si128(values0, values1)
         values23 = mm_or_si128(values2, values3)
-        values = mm_or_si128(values01, values23)
-      if mm_movemask_epi8(mm_cmpeq_epi8(values, vecZero)) != 0xffff:
+        values0123 = mm_or_si128(values01, values23)
+      if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff:
         return false
-      i += 16
+      p += 64
+    i += 16 * iterations
 
-  for j in i ..< image.data.len:
-    if image.data[j].a != 0:
+  for i in i ..< image.data.len:
+    if image.data[i].a != 0:
       return false
 
 proc isOpaque*(image: Image): bool {.raises: [].} =
diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim
index 1a5a752..b7211f4 100644
--- a/src/pixie/internal.nim
+++ b/src/pixie/internal.nim
@@ -3,8 +3,10 @@ import bumpy, chroma, common, system/memory, vmath
 const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
 
 when defined(amd64) and allowSimd:
-  import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx
-  let cpuHasAvx* = checkInstructionSets({AVX})
+  import nimsimd/runtimecheck, nimsimd/sse2, runtimechecked/avx, runtimechecked/avx2
+  let
+    cpuHasAvx* = checkInstructionSets({AVX})
+    cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
 
 template currentExceptionAsPixieError*(): untyped =
   ## Gets the current exception and returns it as a PixieError with stack trace.
@@ -178,27 +180,45 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
       data[i] = c
 
 proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool =
+  if len <= 0:
+    return true # Nothing to check, data[start] may not exist
+
+  when defined(amd64) and allowSimd:
+    if cpuHasAvx2 and len >= 64:
+      return isOpaqueAvx2(data, start, len)
+
   result = true
 
   var i = start
   when defined(amd64) and allowSimd:
-    let vec255 = mm_set1_epi32(cast[int32](uint32.high))
-    for _ in start ..< (start + len) div 16:
+    # Align to 16 bytes
+    var p = cast[uint](data[i].addr)
+    while i < (start + len) and (p and 15) != 0:
+      if data[i].a != 255:
+        return false
+      inc i
+      p += 4
+
+    let
+      vec255 = mm_set1_epi8(255)
+      iterations = (start + len - i) div 16
+    for _ in 0 ..< iterations:
       let
-        values0 = mm_loadu_si128(data[i + 0].addr)
-        values1 = mm_loadu_si128(data[i + 4].addr)
-        values2 = mm_loadu_si128(data[i + 8].addr)
-        values3 = mm_loadu_si128(data[i + 12].addr)
+        values0 = mm_load_si128(cast[pointer](p))
+        values1 = mm_load_si128(cast[pointer](p + 16))
+        values2 = mm_load_si128(cast[pointer](p + 32))
+        values3 = mm_load_si128(cast[pointer](p + 48))
         values01 = mm_and_si128(values0, values1)
         values23 = mm_and_si128(values2, values3)
-        values = mm_and_si128(values01, values23)
-        eq = mm_cmpeq_epi8(values, vec255)
+        values0123 = mm_and_si128(values01, values23)
+        eq = mm_cmpeq_epi8(values0123, vec255)
       if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
         return false
-      i += 16
+      p += 64
+    i += 16 * iterations
 
-  for j in i ..< start + len:
-    if data[j].a != 255:
+  for i in i ..< start + len:
+    if data[i].a != 255:
       return false
 
 when defined(amd64) and allowSimd:
diff --git a/src/pixie/runtimechecked/avx2.nim b/src/pixie/runtimechecked/avx2.nim
new file mode 100644
index 0000000..afd68ca
--- /dev/null
+++ b/src/pixie/runtimechecked/avx2.nim
@@ -0,0 +1,132 @@
+import chroma, nimsimd/avx2
+
+when defined(gcc) or defined(clang):
+  {.localPassc: "-mavx2".}
+
+when defined(release):
+  {.push checks: off.}
+
+proc isOneColorAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
+  ## Checks if every pixel in data[start ..< start + len] is the same color.
+  ## An empty range counts as one color. Requires a CPU with AVX2.
+  result = true
+
+  if len <= 0:
+    # Nothing to compare, and data[start] may not exist.
+    return
+
+  # Compare against the first pixel of the range, not of the whole seq.
+  let color = data[start]
+
+  var
+    i = start
+    p = cast[uint](data[i].addr)
+  # Align to 32 bytes
+  while i < (start + len) and (p and 31) != 0:
+    if data[i] != color:
+      return false
+    inc i
+    p += 4
+
+  let
+    colorVec = mm256_set1_epi32(cast[int32](color))
+    iterations = (start + len - i) div 16
+  # Each iteration checks 16 pixels (2 aligned 32 byte loads)
+  for _ in 0 ..< iterations:
+    let
+      values0 = mm256_load_si256(cast[pointer](p))
+      values1 = mm256_load_si256(cast[pointer](p + 32))
+      eq0 = mm256_cmpeq_epi8(values0, colorVec)
+      eq1 = mm256_cmpeq_epi8(values1, colorVec)
+      eq01 = mm256_and_si256(eq0, eq1)
+    if mm256_movemask_epi8(eq01) != cast[int32](0xffffffff):
+      return false
+    p += 64
+  i += 16 * iterations
+
+  # Check whatever is left
+  for i in i ..< start + len:
+    if data[i] != color:
+      return false
+
+proc isTransparentAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
+  ## Checks if every pixel in data[start ..< start + len] is transparent.
+  ## An empty range counts as transparent. Requires a CPU with AVX2.
+  result = true
+
+  if len <= 0:
+    # Nothing to check, and data[start] may not exist.
+    return
+
+  var
+    i = start
+    p = cast[uint](data[i].addr)
+  # Align to 32 bytes
+  while i < (start + len) and (p and 31) != 0:
+    if data[i].a != 0:
+      return false
+    inc i
+    p += 4
+
+  let
+    vecZero = mm256_setzero_si256()
+    iterations = (start + len - i) div 16
+  # Each iteration checks 16 pixels (2 aligned 32 byte loads)
+  for _ in 0 ..< iterations:
+    let
+      values0 = mm256_load_si256(cast[pointer](p))
+      values1 = mm256_load_si256(cast[pointer](p + 32))
+      values01 = mm256_or_si256(values0, values1)
+      eq = mm256_cmpeq_epi8(values01, vecZero)
+    if mm256_movemask_epi8(eq) != cast[int32](0xffffffff):
+      return false
+    p += 64
+  i += 16 * iterations
+
+  # Check whatever is left
+  for i in i ..< start + len:
+    if data[i].a != 0:
+      return false
+
+proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
+  ## Checks if every pixel in data[start ..< start + len] is opaque.
+  ## An empty range counts as opaque. Requires a CPU with AVX2.
+  result = true
+
+  if len <= 0:
+    # Nothing to check, and data[start] may not exist.
+    return
+
+  var
+    i = start
+    p = cast[uint](data[i].addr)
+  # Align to 32 bytes
+  while i < (start + len) and (p and 31) != 0:
+    if data[i].a != 255:
+      return false
+    inc i
+    p += 4
+
+  let
+    vec255 = mm256_set1_epi8(255)
+    iterations = (start + len - i) div 16
+  # Each iteration checks 16 pixels (2 aligned 32 byte loads)
+  for _ in 0 ..< iterations:
+    let
+      values0 = mm256_load_si256(cast[pointer](p))
+      values1 = mm256_load_si256(cast[pointer](p + 32))
+      values01 = mm256_and_si256(values0, values1)
+      eq = mm256_cmpeq_epi8(values01, vec255)
+    # 0x88888888 keeps the mask bit of the last byte of each 4 byte pixel
+    if (mm256_movemask_epi8(eq) and 0x88888888) != 0x88888888:
+      return false
+    p += 64
+  i += 16 * iterations
+
+  # Check whatever is left
+  for i in i ..< start + len:
+    if data[i].a != 255:
+      return false
+
+when defined(release):
+  {.pop.}