diff --git a/pixie.nimble b/pixie.nimble
index 4b819c3..b899c8d 100644
--- a/pixie.nimble
+++ b/pixie.nimble
@@ -10,7 +10,7 @@ requires "vmath >= 1.1.4"
 requires "chroma >= 0.2.6"
 requires "zippy >= 0.10.3"
 requires "flatty >= 0.3.4"
-requires "nimsimd >= 1.1.7"
+requires "nimsimd >= 1.1.8"
 requires "bumpy >= 1.1.1"
 
 task bindings, "Generate bindings":
diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index 11f5c15..ec9f2cb 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -320,38 +320,38 @@ proc magnifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} =
         result.width * 4
       )
 
-proc applyOpacity*(target: Image, opacity: float32) {.hasSimd, raises: [].} =
+proc applyOpacity*(image: Image, opacity: float32) {.hasSimd, raises: [].} =
   ## Multiplies alpha of the image by opacity.
   let opacity = round(255 * opacity).uint16
   if opacity == 255:
     return
 
   if opacity == 0:
-    target.fill(rgbx(0, 0, 0, 0))
+    image.fill(rgbx(0, 0, 0, 0))
     return
 
-  for i in 0 ..< target.data.len:
-    var rgbx = target.data[i]
+  for i in 0 ..< image.data.len:
+    var rgbx = image.data[i]
     rgbx.r = ((rgbx.r * opacity) div 255).uint8
     rgbx.g = ((rgbx.g * opacity) div 255).uint8
     rgbx.b = ((rgbx.b * opacity) div 255).uint8
     rgbx.a = ((rgbx.a * opacity) div 255).uint8
-    target.data[i] = rgbx
+    image.data[i] = rgbx
 
-proc invert*(target: Image) {.hasSimd, raises: [].} =
+proc invert*(image: Image) {.hasSimd, raises: [].} =
   ## Inverts all of the colors and alpha.
-  for i in 0 ..< target.data.len:
-    var rgbx = target.data[i]
+  for i in 0 ..< image.data.len:
+    var rgbx = image.data[i]
     rgbx.r = 255 - rgbx.r
     rgbx.g = 255 - rgbx.g
     rgbx.b = 255 - rgbx.b
     rgbx.a = 255 - rgbx.a
-    target.data[i] = rgbx
+    image.data[i] = rgbx
 
   # Inverting rgbx(50, 100, 150, 200) becomes rgbx(205, 155, 105, 55). This
   # is not a valid premultiplied alpha color.
   # We need to convert back to premultiplied alpha after inverting.
-  target.data.toPremultipliedAlpha()
+  image.data.toPremultipliedAlpha()
 
 proc blur*(
   image: Image, radius: float32, outOfBounds: SomeColor = color(0, 0, 0, 0)
diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim
index 365bc2c..ea52af1 100644
--- a/src/pixie/masks.nim
+++ b/src/pixie/masks.nim
@@ -165,18 +165,18 @@ proc magnifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} =
         result.width
       )
 
-proc applyOpacity*(target: Mask, opacity: float32) {.hasSimd, raises: [].} =
+proc applyOpacity*(mask: Mask, opacity: float32) {.hasSimd, raises: [].} =
   ## Multiplies alpha of the image by opacity.
   let opacity = round(255 * opacity).uint16
   if opacity == 255:
     return
 
   if opacity == 0:
-    target.fill(0)
+    mask.fill(0)
     return
 
-  for i in 0 ..< target.data.len:
-    target.data[i] = ((target.data[i] * opacity) div 255).uint8
+  for i in 0 ..< mask.data.len:
+    mask.data[i] = ((mask.data[i] * opacity) div 255).uint8
 
 proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
   ## Gets a interpolated value with float point coordinates.
@@ -206,10 +206,10 @@ proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
   else:
     topMix
 
-proc invert*(target: Mask) {.hasSimd, raises: [].} =
+proc invert*(mask: Mask) {.hasSimd, raises: [].} =
   ## Inverts all of the values - creates a negative of the mask.
-  for i in 0 ..< target.data.len:
-    target.data[i] = 255 - target.data[i]
+  for i in 0 ..< mask.data.len:
+    mask.data[i] = 255 - mask.data[i]
 
 proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
   ## Grows the mask by spread.
diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim
index ecf74da..f039b1a 100644
--- a/src/pixie/simd.nim
+++ b/src/pixie/simd.nim
@@ -1,6 +1,6 @@
-import simd/internal
+import simd/internal, system/memory
 
-export internal
+export internal, memory
 
 const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)
 
@@ -20,6 +20,7 @@ when allowSimd:
 
   elif defined(arm64):
     import simd/neon
+    export neon
 
     import nimsimd/neon as nimsimdneon
     export nimsimdneon
diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim
index be900bd..4e89ea5 100644
--- a/src/pixie/simd/avx2.nim
+++ b/src/pixie/simd/avx2.nim
@@ -1,4 +1,4 @@
-import chroma, internal, nimsimd/avx2, pixie/common
+import avx, chroma, internal, nimsimd/avx2, pixie/common, vmath
 
 when defined(gcc) or defined(clang):
   {.localPassc: "-mavx2".}
@@ -133,5 +133,89 @@ proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
     c.b = ((c.b.uint32 * c.a + 127) div 255).uint8
     data[i] = c
 
+proc invertAvx2*(image: Image) {.simd.} =
+  var
+    i: int
+    p = cast[uint](image.data[0].addr)
+  # Align to 32 bytes
+  while i < image.data.len and (p and 31) != 0:
+    var rgbx = image.data[i]
+    rgbx.r = 255 - rgbx.r
+    rgbx.g = 255 - rgbx.g
+    rgbx.b = 255 - rgbx.b
+    rgbx.a = 255 - rgbx.a
+    image.data[i] = rgbx
+    inc i
+    p += 4
+
+  let
+    vec255 = mm256_set1_epi8(255)
+    # Exclude pixels already handled while aligning to avoid overrunning data.
+    iterations = (image.data.len - i) div 16
+  for _ in 0 ..< iterations:
+    let
+      a = mm256_load_si256(cast[pointer](p))
+      b = mm256_load_si256(cast[pointer](p + 32))
+    mm256_store_si256(cast[pointer](p), mm256_sub_epi8(vec255, a))
+    mm256_store_si256(cast[pointer](p + 32), mm256_sub_epi8(vec255, b))
+    p += 64
+  i += 16 * iterations
+
+  for i in i ..< image.data.len:
+    var rgbx = image.data[i]
+    rgbx.r = 255 - rgbx.r
+    rgbx.g = 255 - rgbx.g
+    rgbx.b = 255 - rgbx.b
+    rgbx.a = 255 - rgbx.a
+    image.data[i] = rgbx
+
+  toPremultipliedAlphaAvx2(image.data)
+
+proc applyOpacityAvx2*(image: Image, opacity: float32) {.simd.} =
+  let opacity = round(255 * opacity).uint16
+  if opacity == 255:
+    return
+
+  if opacity == 0:
+    fillUnsafeAvx(image.data, rgbx(0, 0, 0, 0), 0, image.data.len)
+    return
+
+  var
+    i: int
+    p = cast[uint](image.data[0].addr)
+
+  let
+    oddMask = mm256_set1_epi16(0xff00)
+    div255 = mm256_set1_epi16(0x8081)
+    zeroVec = mm256_setzero_si256()
+    opacityVec = mm256_slli_epi16(mm256_set1_epi16(opacity), 8)
+    iterations = image.data.len div 8
+  for _ in 0 ..< iterations:
+    let
+      values = mm256_loadu_si256(cast[pointer](p))
+      eqZero = mm256_cmpeq_epi16(values, zeroVec)
+    if mm256_movemask_epi8(eqZero) != cast[int32](0xffffffff):
+      var
+        valuesEven = mm256_slli_epi16(values, 8)
+        valuesOdd = mm256_and_si256(values, oddMask)
+      valuesEven = mm256_mulhi_epu16(valuesEven, opacityVec)
+      valuesOdd = mm256_mulhi_epu16(valuesOdd, opacityVec)
+      valuesEven = mm256_srli_epi16(mm256_mulhi_epu16(valuesEven, div255), 7)
+      valuesOdd = mm256_srli_epi16(mm256_mulhi_epu16(valuesOdd, div255), 7)
+      mm256_storeu_si256(
+        cast[pointer](p),
+        mm256_or_si256(valuesEven, mm256_slli_epi16(valuesOdd, 8))
+      )
+    p += 32
+  i += 8 * iterations
+
+  for i in i ..< image.data.len:
+    var rgbx = image.data[i]
+    rgbx.r = ((rgbx.r * opacity) div 255).uint8
+    rgbx.g = ((rgbx.g * opacity) div 255).uint8
+    rgbx.b = ((rgbx.b * opacity) div 255).uint8
+    rgbx.a = ((rgbx.a * opacity) div 255).uint8
+    image.data[i] = rgbx
+
 when defined(release):
   {.pop.}
diff --git a/src/pixie/simd/internal.nim b/src/pixie/simd/internal.nim
index 18870ed..1808ac8 100644
--- a/src/pixie/simd/internal.nim
+++ b/src/pixie/simd/internal.nim
@@ -3,7 +3,7 @@ import std/macros, std/tables
 var simdProcs* {.compiletime.}: Table[string, NimNode]
 
 proc procName(procedure: NimNode): string =
-  ## Given a procedure signature returns only name string.
+  ## Given a procedure this returns the name as a string.
   let nameNode = procedure[0]
   if nameNode.kind == nnkPostfix:
     nameNode[1].strVal
@@ -11,16 +11,34 @@ proc procName(procedure: NimNode): string =
     nameNode.strVal
 
 proc procArguments(procedure: NimNode): seq[NimNode] =
-  ## Given a procedure signature gets the arguments as a list.
+  ## Given a procedure this gets the arguments as a list.
   for i, arg in procedure[3]:
     if i > 0:
       for j in 0 ..< arg.len - 2:
         result.add(arg[j])
 
 proc procReturnType(procedure: NimNode): NimNode =
-  ## Given a procedure signature gets the return type.
+  ## Given a procedure this gets the return type.
   procedure[3][0]
 
+proc procSignature(procedure: NimNode): string =
+  ## Given a procedure this returns the signature as a string.
+  result = "("
+
+  for i, arg in procedure[3]:
+    if i > 0:
+      for j in 0 ..< arg.len - 2:
+        result &= arg[^2].repr & ", "
+
+  if procedure[3].len > 1:
+    result = result[0 ..^ 3]
+
+  result &= ")"
+
+  let ret = procedure.procReturnType()
+  if ret.kind != nnkEmpty:
+    result &= ": " & ret.repr
+
 proc callAndReturn(name: NimNode, procedure: NimNode): NimNode =
   ## Produces a procedure call with arguments.
   let
@@ -38,8 +56,8 @@ proc callAndReturn(name: NimNode, procedure: NimNode): NimNode =
       return `call`
 
 macro simd*(procedure: untyped) =
-  let name = procedure.procName()
-  simdProcs[name] = procedure.copy()
+  let signature = procedure.procName() & procSignature(procedure)
+  simdProcs[signature] = procedure.copy()
   return procedure
 
 macro hasSimd*(procedure: untyped) =
@@ -53,25 +71,31 @@ macro hasSimd*(procedure: untyped) =
     callAvx = callAndReturn(ident(nameAvx), procedure)
     callAvx2 = callAndReturn(ident(nameAvx2), procedure)
 
-  var body = newStmtList()
+  var
+    foundSimd: bool
+    body = newStmtList()
 
   when defined(amd64) and not defined(pixieNoAvx):
-    if nameAvx2 in simdProcs:
+    if nameAvx2 & procSignature(procedure) in simdProcs:
+      foundSimd = true
       body.add quote do:
         if cpuHasAvx2:
           `callAvx2`
 
-    if nameAvx in simdProcs:
+    if nameAvx & procSignature(procedure) in simdProcs:
+      foundSimd = true
       body.add quote do:
         if cpuHasAvx2:
           `callAvx`
 
-  if nameSse2 in simdProcs:
-    let bodySse2 = simdProcs[nameSse2][6]
+  if nameSse2 & procSignature(procedure) in simdProcs:
+    foundSimd = true
+    let bodySse2 = simdProcs[nameSse2 & procSignature(procedure)][6]
     body.add quote do:
       `bodySse2`
-  elif nameNeon in simdProcs:
-    let bodyNeon = simdProcs[nameNeon][6]
+  elif nameNeon & procSignature(procedure) in simdProcs:
+    foundSimd = true
+    let bodyNeon = simdProcs[nameNeon & procSignature(procedure)][6]
     body.add quote do:
       `bodyNeon`
   else:
@@ -80,4 +104,8 @@ macro hasSimd*(procedure: untyped) =
 
   procedure[6] = body
 
+  when not defined(pixieNoSimd):
+    if not foundSimd:
+      echo "No SIMD found for " & name & procSignature(procedure)
+
   return procedure
diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim
index 739a224..19fa0a3 100644
--- a/src/pixie/simd/neon.nim
+++ b/src/pixie/simd/neon.nim
@@ -1,4 +1,4 @@
-import chroma, internal, nimsimd/neon, pixie/common
+import chroma, internal, nimsimd/neon, pixie/common, system/memory, vmath
 
 when defined(release):
   {.push checks: off.}
@@ -150,7 +150,7 @@ proc toPremultipliedAlphaNeon*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
     channels.val[2] = premultiply(channels.val[2], channels.val[3])
     vst4_u8(cast[pointer](p), channels)
     p += 32
-    i += 8
+  i += 8 * iterations
 
   for i in i ..< data.len:
     var c = data[i]
@@ -194,5 +194,156 @@ proc newMaskNeon*(image: Image): Mask {.simd.} =
   for i in i ..< image.data.len:
     result.data[i] = image.data[i].a
 
+proc invertNeon*(image: Image) {.simd.} =
+  var
+    i: int
+    p = cast[uint](image.data[0].addr)
+  # Align to 16 bytes
+  while i < image.data.len and (p and 15) != 0:
+    var rgbx = image.data[i]
+    rgbx.r = 255 - rgbx.r
+    rgbx.g = 255 - rgbx.g
+    rgbx.b = 255 - rgbx.b
+    rgbx.a = 255 - rgbx.a
+    image.data[i] = rgbx
+    inc i
+    p += 4
+
+  let
+    vec255 = vmovq_n_u8(255)
+    # Exclude pixels already handled while aligning to avoid overrunning data.
+    iterations = (image.data.len - i) div 16
+  for _ in 0 ..< iterations:
+    var channels = vld4q_u8(cast[pointer](p))
+    channels.val[0] = vsubq_u8(vec255, channels.val[0])
+    channels.val[1] = vsubq_u8(vec255, channels.val[1])
+    channels.val[2] = vsubq_u8(vec255, channels.val[2])
+    channels.val[3] = vsubq_u8(vec255, channels.val[3])
+    vst4q_u8(cast[pointer](p), channels)
+    p += 64
+  i += 16 * iterations
+
+  for i in i ..< image.data.len:
+    var rgbx = image.data[i]
+    rgbx.r = 255 - rgbx.r
+    rgbx.g = 255 - rgbx.g
+    rgbx.b = 255 - rgbx.b
+    rgbx.a = 255 - rgbx.a
+    image.data[i] = rgbx
+
+  toPremultipliedAlphaNeon(image.data)
+
+proc invertNeon*(mask: Mask) {.simd.} =
+  var
+    i: int
+    p = cast[uint](mask.data[0].addr)
+  # Align to 16 bytes
+  while i < mask.data.len and (p and 15) != 0:
+    mask.data[i] = 255 - mask.data[i]
+    inc i
+    inc p
+
+  let
+    vec255 = vmovq_n_u8(255)
+    # Exclude bytes already handled while aligning to avoid overrunning data.
+    iterations = (mask.data.len - i) div 16
+  for _ in 0 ..< iterations:
+    let values = vld1q_u8(cast[pointer](p))
+    vst1q_u8(cast[pointer](p), vsubq_u8(vec255, values))
+    p += 16
+  i += 16 * iterations
+
+  for i in i ..< mask.data.len:
+    mask.data[i] = 255 - mask.data[i]
+
+proc ceilNeon*(mask: Mask) {.simd.} =
+  var
+    i: int
+    p = cast[uint](mask.data[0].addr)
+
+  let
+    zeroVec = vmovq_n_u8(0)
+    vec255 = vmovq_n_u8(255)
+    iterations = mask.data.len div 16
+  for _ in 0 ..< iterations:
+    var values = vld1q_u8(cast[pointer](p))
+    values = vceqq_u8(values, zeroVec)
+    values = vbicq_u8(vec255, values)
+    vst1q_u8(cast[pointer](p), values)
+    p += 16
+  i += 16 * iterations
+
+  for i in i ..< mask.data.len:
+    if mask.data[i] != 0:
+      mask.data[i] = 255
+
+proc applyOpacityNeon*(image: Image, opacity: float32) {.simd.} =
+  let opacity = round(255 * opacity).uint8
+  if opacity == 255:
+    return
+
+  if opacity == 0:
+    fillUnsafeNeon(image.data, rgbx(0, 0, 0, 0), 0, image.data.len)
+    return
+
+  var
+    i: int
+    p = cast[uint](image.data[0].addr)
+
+  proc apply(c, o: uint8x8): uint8x8 {.inline.} =
+    let co = vmull_u8(c, o)
+    vraddhn_u16(co, vrshrq_n_u16(co, 8))
+
+  let
+    opacityVec = vmov_n_u8(opacity)
+    iterations = image.data.len div 8
+  for _ in 0 ..< iterations:
+    var channels = vld4_u8(cast[pointer](p))
+    channels.val[0] = apply(channels.val[0], opacityVec)
+    channels.val[1] = apply(channels.val[1], opacityVec)
+    channels.val[2] = apply(channels.val[2], opacityVec)
+    channels.val[3] = apply(channels.val[3], opacityVec)
+    vst4_u8(cast[pointer](p), channels)
+    p += 32
+  i += 8 * iterations
+
+  # Widen to uint16 first; uint8 * uint8 would wrap around before the division.
+  for i in i ..< image.data.len:
+    var rgbx = image.data[i]
+    rgbx.r = ((rgbx.r.uint16 * opacity) div 255).uint8
+    rgbx.g = ((rgbx.g.uint16 * opacity) div 255).uint8
+    rgbx.b = ((rgbx.b.uint16 * opacity) div 255).uint8
+    rgbx.a = ((rgbx.a.uint16 * opacity) div 255).uint8
+    image.data[i] = rgbx
+
+proc applyOpacityNeon*(mask: Mask, opacity: float32) {.simd.} =
+  let opacity = round(255 * opacity).uint8
+  if opacity == 255:
+    return
+
+  if opacity == 0:
+    nimSetMem(mask.data[0].addr, 0.cint, mask.data.len)
+    return
+
+  var
+    i: int
+    p = cast[uint](mask.data[0].addr)
+
+  let
+    opacityVec = vmov_n_u8(opacity)
+    iterations = mask.data.len div 8
+  for _ in 0 ..< iterations:
+    let
+      values = vld1_u8(cast[pointer](p))
+      multiplied = vmull_u8(values, opacityVec)
+      rounded = vraddhn_u16(multiplied, vrshrq_n_u16(multiplied, 8))
+    vst1_u8(cast[pointer](p), rounded)
+    p += 8
+  i += 8 * iterations
+
+  # Widen to uint16 first; uint8 * uint8 would wrap around before the division.
+  for i in i ..< mask.data.len:
+    mask.data[i] = ((mask.data[i].uint16 * opacity) div 255).uint8
+
 when defined(release):
   {.pop.}
diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
index b5021fc..0f9c347 100644
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -1,4 +1,4 @@
-import chroma, internal, nimsimd/sse2, pixie/common, vmath
+import chroma, internal, nimsimd/sse2, pixie/common, system/memory, vmath
 
 when defined(release):
   {.push checks: off.}
@@ -244,33 +244,25 @@ proc newMaskSse2*(image: Image): Mask {.simd.} =
   for i in i ..< image.data.len:
     result.data[i] = image.data[i].a
 
-proc invertSse2*(target: Image | Mask) {.simd.} =
+proc invertSse2*(image: Image) {.simd.} =
   var
     i: int
-    p = cast[uint](target.data[0].addr)
+    p = cast[uint](image.data[0].addr)
   # Align to 16 bytes
-  while i < target.data.len and (p and 15) != 0:
-    when target is Image:
-      var rgbx = target.data[i]
-      rgbx.r = 255 - rgbx.r
-      rgbx.g = 255 - rgbx.g
-      rgbx.b = 255 - rgbx.b
-      rgbx.a = 255 - rgbx.a
-      target.data[i] = rgbx
-      inc i
-      p += 4
-    else:
-      target.data[i] = 255 - target.data[i]
-      inc i
-      inc p
-
-  let vec255 = mm_set1_epi8(255)
-
-  when target is Image:
-    let iterations = target.data.len div 16
-  else:
-    let iterations = target.data.len div 64
+  while i < image.data.len and (p and 15) != 0:
+    var rgbx = image.data[i]
+    rgbx.r = 255 - rgbx.r
+    rgbx.g = 255 - rgbx.g
+    rgbx.b = 255 - rgbx.b
+    rgbx.a = 255 - rgbx.a
+    image.data[i] = rgbx
+    inc i
+    p += 4
 
+  let
+    vec255 = mm_set1_epi8(255)
+    # Exclude pixels already handled while aligning to avoid overrunning data.
+    iterations = (image.data.len - i) div 16
   for _ in 0 ..< iterations:
     let
       a = mm_load_si128(cast[pointer](p))
@@ -282,24 +274,47 @@ proc invertSse2*(target: Image | Mask) {.simd.} =
     mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
     mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
     p += 64
+  i += 16 * iterations
 
-  when target is Image:
-    i += 16 * iterations
+  for i in i ..< image.data.len:
+    var rgbx = image.data[i]
+    rgbx.r = 255 - rgbx.r
+    rgbx.g = 255 - rgbx.g
+    rgbx.b = 255 - rgbx.b
+    rgbx.a = 255 - rgbx.a
+    image.data[i] = rgbx
 
-    for i in i ..< target.data.len:
-      var rgbx = target.data[i]
-      rgbx.r = 255 - rgbx.r
-      rgbx.g = 255 - rgbx.g
-      rgbx.b = 255 - rgbx.b
-      rgbx.a = 255 - rgbx.a
-      target.data[i] = rgbx
+  toPremultipliedAlphaSse2(image.data)
 
-    toPremultipliedAlphaSse2(target.data)
-  else:
-    i += 64 * iterations
+proc invertSse2*(mask: Mask) {.simd.} =
+  var
+    i: int
+    p = cast[uint](mask.data[0].addr)
+  # Align to 16 bytes
+  while i < mask.data.len and (p and 15) != 0:
+    mask.data[i] = 255 - mask.data[i]
+    inc i
+    inc p
 
-    for i in i ..< target.data.len:
-      target.data[i] = 255 - target.data[i]
+  let
+    vec255 = mm_set1_epi8(255)
+    # Exclude bytes already handled while aligning to avoid overrunning data.
+    iterations = (mask.data.len - i) div 64
+  for _ in 0 ..< iterations:
+    let
+      a = mm_load_si128(cast[pointer](p))
+      b = mm_load_si128(cast[pointer](p + 16))
+      c = mm_load_si128(cast[pointer](p + 32))
+      d = mm_load_si128(cast[pointer](p + 48))
+    mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
+    mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
+    mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
+    mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
+    p += 64
+  i += 64 * iterations
+
+  for i in i ..< mask.data.len:
+    mask.data[i] = 255 - mask.data[i]
 
 proc ceilSse2*(mask: Mask) {.simd.} =
   var
@@ -322,34 +337,70 @@ proc ceilSse2*(mask: Mask) {.simd.} =
     if mask.data[i] != 0:
       mask.data[i] = 255
 
-proc applyOpacitySse2*(target: Image | Mask, opacity: float32) {.simd.} =
+proc applyOpacitySse2*(image: Image, opacity: float32) {.simd.} =
   let opacity = round(255 * opacity).uint16
   if opacity == 255:
     return
 
   if opacity == 0:
-    when target is Image:
-      target.fill(rgbx(0, 0, 0, 0))
-    else:
-      target.fill(0)
+    fillUnsafeSse2(image.data, rgbx(0, 0, 0, 0), 0, image.data.len)
     return
 
   var
     i: int
-    p = cast[uint](target.data[0].addr)
-    len =
-      when target is Image:
-        target.data.len * 4
-      else:
-        target.data.len
+    p = cast[uint](image.data[0].addr)
 
   let
     oddMask = mm_set1_epi16(0xff00)
     div255 = mm_set1_epi16(0x8081)
     zeroVec = mm_setzero_si128()
     opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
-    iterations = len div 16
-  for _ in 0 ..< len div 16:
+    iterations = image.data.len div 4
+  for _ in 0 ..< iterations:
+    let values = mm_loadu_si128(cast[pointer](p))
+    if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
+      var
+        valuesEven = mm_slli_epi16(values, 8)
+        valuesOdd = mm_and_si128(values, oddMask)
+      valuesEven = mm_mulhi_epu16(valuesEven, opacityVec)
+      valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec)
+      valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7)
+      valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7)
+      mm_storeu_si128(
+        cast[pointer](p),
+        mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8))
+      )
+    p += 16
+  i += 4 * iterations
+
+  for i in i ..< image.data.len:
+    var rgbx = image.data[i]
+    rgbx.r = ((rgbx.r * opacity) div 255).uint8
+    rgbx.g = ((rgbx.g * opacity) div 255).uint8
+    rgbx.b = ((rgbx.b * opacity) div 255).uint8
+    rgbx.a = ((rgbx.a * opacity) div 255).uint8
+    image.data[i] = rgbx
+
+proc applyOpacitySse2*(mask: Mask, opacity: float32) {.simd.} =
+  let opacity = round(255 * opacity).uint16
+  if opacity == 255:
+    return
+
+  if opacity == 0:
+    nimSetMem(mask.data[0].addr, 0.cint, mask.data.len)
+    return
+
+  var
+    i: int
+    p = cast[uint](mask.data[0].addr)
+
+  let
+    oddMask = mm_set1_epi16(0xff00)
+    div255 = mm_set1_epi16(0x8081)
+    zeroVec = mm_setzero_si128()
+    opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
+    iterations = mask.data.len div 16
+  for _ in 0 ..< iterations:
     let values = mm_loadu_si128(cast[pointer](p))
     if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
       var
@@ -366,17 +417,8 @@ proc applyOpacitySse2*(target: Image | Mask, opacity: float32) {.simd.} =
     p += 16
   i += 16 * iterations
 
-  when target is Image:
-    for i in i div 4 ..< target.data.len:
-      var rgbx = target.data[i]
-      rgbx.r = ((rgbx.r * opacity) div 255).uint8
-      rgbx.g = ((rgbx.g * opacity) div 255).uint8
-      rgbx.b = ((rgbx.b * opacity) div 255).uint8
-      rgbx.a = ((rgbx.a * opacity) div 255).uint8
-      target.data[i] = rgbx
-  else:
-    for i in i ..< target.data.len:
-      target.data[i] = ((target.data[i] * opacity) div 255).uint8
+  for i in i ..< mask.data.len:
+    mask.data[i] = ((mask.data[i] * opacity) div 255).uint8
 
 when defined(release):
   {.pop.}
diff --git a/tests/bench_images.nim b/tests/bench_images.nim
index 60cd261..e210a5d 100644
--- a/tests/bench_images.nim
+++ b/tests/bench_images.nim
@@ -74,6 +74,7 @@ timeIt "invert":
 reset()
 
 timeIt "applyOpacity":
+  reset()
   image.applyOpacity(0.5)
 
 reset()
diff --git a/tests/bench_masks.nim b/tests/bench_masks.nim
index cd7a527..abf15de 100644
--- a/tests/bench_masks.nim
+++ b/tests/bench_masks.nim
@@ -1,4 +1,4 @@
-import benchy, chroma, pixie
+import benchy, pixie
 
 let mask = newMask(2560, 1440)
 
@@ -25,6 +25,7 @@ timeIt "invert":
 reset()
 
 timeIt "applyOpacity":
+  reset()
   mask.applyOpacity(0.5)
 
 reset()