diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index 343c630..d571b5c 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -113,10 +113,7 @@ proc toStraightAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} = proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} = ## Converts an image to premultiplied alpha from straight alpha. when allowSimd and compiles(toPremultipliedAlphaSimd): - toPremultipliedAlphaSimd( - cast[ptr UncheckedArray[uint32]](data[0].addr), - data.len - ) + toPremultipliedAlphaSimd(data) return for i in 0 ..< data.len: diff --git a/src/pixie/runtimechecked/avx2.nim b/src/pixie/runtimechecked/avx2.nim index da34dd4..cedd227 100644 --- a/src/pixie/runtimechecked/avx2.nim +++ b/src/pixie/runtimechecked/avx2.nim @@ -87,17 +87,17 @@ proc isOpaqueAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = if data[i].a != 255: return false -proc toPremultipliedAlphaAvx2*( - data: ptr UncheckedArray[uint32], - len: int -): int = +proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) = + var i: int + let alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) - oddMask = mm256_set1_epi16(cast[int16](0xff00)) - div255 = mm256_set1_epi16(cast[int16](0x8081)) - for _ in 0 ..< len div 8: + oddMask = mm256_set1_epi16(0xff00) + div255 = mm256_set1_epi16(0x8081) + iterations = data.len div 8 + for _ in 0 ..< iterations: let - values = mm256_loadu_si256(data[result].addr) + values = mm256_loadu_si256(data[i].addr) alpha = mm256_and_si256(values, alphaMask) eq = mm256_cmpeq_epi8(values, alphaMask) if (mm256_movemask_epi8(eq) and 0x88888888) != 0x88888888: @@ -112,10 +112,18 @@ proc toPremultipliedAlphaAvx2*( colorsEven = mm256_srli_epi16(mm256_mulhi_epu16(colorsEven, div255), 7) colorsOdd = mm256_srli_epi16(mm256_mulhi_epu16(colorsOdd, div255), 7) mm256_storeu_si256( - data[result].addr, + data[i].addr, mm256_or_si256(colorsEven, mm256_slli_epi16(colorsOdd, 8)) ) - result += 8 + i += 8 + + for i in i ..< data.len: + var c = data[i] + if c.a != 255: + c.r = ((c.r.uint32 * c.a) div 255).uint8 + c.g = ((c.g.uint32 * c.a) div 255).uint8 + c.b = ((c.b.uint32 * c.a) div 255).uint8 + data[i] = c when defined(release): {.pop.} diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index c0f5533..890fc69 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -163,44 +163,47 @@ when defined(amd64): if data[i].a != 255: return false - proc toPremultipliedAlphaSimd*(data: ptr UncheckedArray[uint32], len: int) = - var i: int + proc toPremultipliedAlphaSimd*(data: var seq[ColorRGBA | ColorRGBX]) = if cpuHasAvx2: - i = toPremultipliedAlphaAvx2(data, len) - else: - let - alphaMask = mm_set1_epi32(cast[int32](0xff000000)) - oddMask = mm_set1_epi16(cast[int16](0xff00)) - div255 = mm_set1_epi16(cast[int16](0x8081)) - for _ in 0 ..< len div 4: - let - values = mm_loadu_si128(data[i].addr) - alpha = mm_and_si128(values, alphaMask) - eq = mm_cmpeq_epi8(values, alphaMask) - if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: - let - evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16)) - oddMultiplier = mm_or_si128(evenMultiplier, alphaMask) - var - colorsEven = mm_slli_epi16(values, 8) - colorsOdd = mm_and_si128(values, oddMask) - colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier) - colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier) - colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7) - colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7) - mm_storeu_si128( - data[i].addr, - mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8)) - ) - i += 4 + toPremultipliedAlphaAvx2(data) + return - for i in i ..< len: - var c: ColorRGBX - copyMem(c.addr, data[i].addr, 4) - c.r = ((c.r.uint32 * c.a) div 255).uint8 - c.g = ((c.g.uint32 * c.a) div 255).uint8 - c.b = ((c.b.uint32 * c.a) div 255).uint8 - copyMem(data[i].addr, c.addr, 4) + var i: int + + let + alphaMask = mm_set1_epi32(cast[int32](0xff000000)) + oddMask = mm_set1_epi16(0xff00) + div255 = mm_set1_epi16(0x8081) + iterations = data.len div 4 + for _ in 0 ..< iterations: + let + values = mm_loadu_si128(data[i].addr) + alpha = mm_and_si128(values, alphaMask) + eq = mm_cmpeq_epi8(values, alphaMask) + if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: + let + evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16)) + oddMultiplier = mm_or_si128(evenMultiplier, alphaMask) + var + colorsEven = mm_slli_epi16(values, 8) + colorsOdd = mm_and_si128(values, oddMask) + colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier) + colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier) + colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7) + colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7) + mm_storeu_si128( + data[i].addr, + mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8)) + ) + i += 4 + + for i in i ..< data.len: + var c = data[i] + if c.a != 255: + c.r = ((c.r.uint32 * c.a) div 255).uint8 + c.g = ((c.g.uint32 * c.a) div 255).uint8 + c.b = ((c.b.uint32 * c.a) div 255).uint8 + data[i] = c proc newImageFromMaskSimd*( dst: ptr UncheckedArray[ColorRGBX], @@ -282,7 +285,7 @@ when defined(amd64): rgbx.a = 255 - rgbx.a data[i] = rgbx - toPremultipliedAlphaSimd(cast[ptr UncheckedArray[uint32]](data[0].addr), data.len) + toPremultipliedAlphaSimd(data) proc invertMaskSimd*(data: var seq[uint8]) = var