From 2ace8e5e9ff244a49ca0827ac476374bc59a4a76 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Fri, 22 Jul 2022 22:27:18 -0500 Subject: [PATCH 1/6] simd macro works on signature not just name, split applyOpacity + invert --- src/pixie/images.nim | 20 ++--- src/pixie/masks.nim | 16 ++-- src/pixie/simd/internal.nim | 47 ++++++++--- src/pixie/simd/sse2.nim | 163 ++++++++++++++++++++++-------------- 4 files changed, 155 insertions(+), 91 deletions(-) diff --git a/src/pixie/images.nim b/src/pixie/images.nim index 11f5c15..ec9f2cb 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -320,38 +320,38 @@ proc magnifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} = result.width * 4 ) -proc applyOpacity*(target: Image, opacity: float32) {.hasSimd, raises: [].} = +proc applyOpacity*(image: Image, opacity: float32) {.hasSimd, raises: [].} = ## Multiplies alpha of the image by opacity. let opacity = round(255 * opacity).uint16 if opacity == 255: return if opacity == 0: - target.fill(rgbx(0, 0, 0, 0)) + image.fill(rgbx(0, 0, 0, 0)) return - for i in 0 ..< target.data.len: - var rgbx = target.data[i] + for i in 0 ..< image.data.len: + var rgbx = image.data[i] rgbx.r = ((rgbx.r * opacity) div 255).uint8 rgbx.g = ((rgbx.g * opacity) div 255).uint8 rgbx.b = ((rgbx.b * opacity) div 255).uint8 rgbx.a = ((rgbx.a * opacity) div 255).uint8 - target.data[i] = rgbx + image.data[i] = rgbx -proc invert*(target: Image) {.hasSimd, raises: [].} = +proc invert*(image: Image) {.hasSimd, raises: [].} = ## Inverts all of the colors and alpha. - for i in 0 ..< target.data.len: - var rgbx = target.data[i] + for i in 0 ..< image.data.len: + var rgbx = image.data[i] rgbx.r = 255 - rgbx.r rgbx.g = 255 - rgbx.g rgbx.b = 255 - rgbx.b rgbx.a = 255 - rgbx.a - target.data[i] = rgbx + image.data[i] = rgbx # Inverting rgbx(50, 100, 150, 200) becomes rgbx(205, 155, 105, 55). This # is not a valid premultiplied alpha color. # We need to convert back to premultiplied alpha after inverting. - target.data.toPremultipliedAlpha() + image.data.toPremultipliedAlpha() proc blur*( image: Image, radius: float32, outOfBounds: SomeColor = color(0, 0, 0, 0) diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim index 365bc2c..9927088 100644 --- a/src/pixie/masks.nim +++ b/src/pixie/masks.nim @@ -1,4 +1,4 @@ -import common, internal, simd, vmath +import common, internal, simd, system/memory, vmath export Mask, newMask @@ -165,18 +165,18 @@ proc magnifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} = result.width ) -proc applyOpacity*(target: Mask, opacity: float32) {.hasSimd, raises: [].} = +proc applyOpacity*(mask: Mask, opacity: float32) {.hasSimd, raises: [].} = ## Multiplies alpha of the image by opacity. let opacity = round(255 * opacity).uint16 if opacity == 255: return if opacity == 0: - target.fill(0) + mask.fill(0) return - for i in 0 ..< target.data.len: - target.data[i] = ((target.data[i] * opacity) div 255).uint8 + for i in 0 ..< mask.data.len: + mask.data[i] = ((mask.data[i] * opacity) div 255).uint8 proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} = ## Gets a interpolated value with float point coordinates. @@ -206,10 +206,10 @@ proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} = else: topMix -proc invert*(target: Mask) {.hasSimd, raises: [].} = +proc invert*(mask: Mask) {.hasSimd, raises: [].} = ## Inverts all of the values - creates a negative of the mask. - for i in 0 ..< target.data.len: - target.data[i] = 255 - target.data[i] + for i in 0 ..< mask.data.len: + mask.data[i] = 255 - mask.data[i] proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} = ## Grows the mask by spread. diff --git a/src/pixie/simd/internal.nim b/src/pixie/simd/internal.nim index 18870ed..9137d13 100644 --- a/src/pixie/simd/internal.nim +++ b/src/pixie/simd/internal.nim @@ -3,7 +3,7 @@ import std/macros, std/tables var simdProcs* {.compiletime.}: Table[string, NimNode] proc procName(procedure: NimNode): string = - ## Given a procedure signature returns only name string. + ## Given a procedure this returns the name as a string. let nameNode = procedure[0] if nameNode.kind == nnkPostfix: nameNode[1].strVal @@ -11,16 +11,30 @@ proc procName(procedure: NimNode): string = nameNode.strVal proc procArguments(procedure: NimNode): seq[NimNode] = - ## Given a procedure signature gets the arguments as a list. + ## Given a procedure this gets the arguments as a list. for i, arg in procedure[3]: if i > 0: for j in 0 ..< arg.len - 2: result.add(arg[j]) proc procReturnType(procedure: NimNode): NimNode = - ## Given a procedure signature gets the return type. + ## Given a procedure this gets the return type. procedure[3][0] +proc procSignature(procName: string, procedure: NimNode): string = + ## Given a procedure this returns the signature as a string. + result = procName & "(" + + for i, arg in procedure[3]: + if i > 0: + for j in 0 ..< arg.len - 2: + result &= arg[^2].repr & ", " + + if procedure[3].len > 1: + result = result[0 ..^ 3] + + result &= ")" + proc callAndReturn(name: NimNode, procedure: NimNode): NimNode = ## Produces a procedure call with arguments. let @@ -38,8 +52,8 @@ proc callAndReturn(name: NimNode, procedure: NimNode): NimNode = return `call` macro simd*(procedure: untyped) = - let name = procedure.procName() - simdProcs[name] = procedure.copy() + let signature = procSignature(procedure.procName(), procedure) + simdProcs[signature] = procedure.copy() return procedure macro hasSimd*(procedure: untyped) = @@ -53,25 +67,31 @@ macro hasSimd*(procedure: untyped) = callAvx = callAndReturn(ident(nameAvx), procedure) callAvx2 = callAndReturn(ident(nameAvx2), procedure) - var body = newStmtList() + var + foundSimd: bool + body = newStmtList() when defined(amd64) and not defined(pixieNoAvx): - if nameAvx2 in simdProcs: + if procSignature(nameAvx2, procedure) in simdProcs: + foundSimd = true body.add quote do: if cpuHasAvx2: `callAvx2` - if nameAvx in simdProcs: + if procSignature(nameAvx, procedure) in simdProcs: + foundSimd = true body.add quote do: if cpuHasAvx2: `callAvx` - if nameSse2 in simdProcs: - let bodySse2 = simdProcs[nameSse2][6] + if procSignature(nameSse2, procedure) in simdProcs: + foundSimd = true + let bodySse2 = simdProcs[procSignature(nameSse2, procedure)][6] body.add quote do: `bodySse2` - elif nameNeon in simdProcs: - let bodyNeon = simdProcs[nameNeon][6] + elif procSignature(nameNeon, procedure) in simdProcs: + foundSimd = true + let bodyNeon = simdProcs[procSignature(nameNeon, procedure)][6] body.add quote do: `bodyNeon` else: @@ -80,4 +100,7 @@ macro hasSimd*(procedure: untyped) = procedure[6] = body + if not foundSimd: + echo "No SIMD found for " & procSignature(name, procedure) + return procedure diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index b5021fc..0a34b17 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -1,4 +1,4 @@ -import chroma, internal, nimsimd/sse2, pixie/common, vmath +import chroma, internal, nimsimd/sse2, pixie/common, system/memory, vmath when defined(release): {.push checks: off.} @@ -244,32 +244,24 @@ proc newMaskSse2*(image: Image): Mask {.simd.} = for i in i ..< image.data.len: result.data[i] = image.data[i].a -proc invertSse2*(target: Image | Mask) {.simd.} = +proc invertSse2*(image: Image) {.simd.} = var i: int - p = cast[uint](target.data[0].addr) + p = cast[uint](image.data[0].addr) # Align to 16 bytes - while i < target.data.len and (p and 15) != 0: - when target is Image: - var rgbx = target.data[i] - rgbx.r = 255 - rgbx.r - rgbx.g = 255 - rgbx.g - rgbx.b = 255 - rgbx.b - rgbx.a = 255 - rgbx.a - target.data[i] = rgbx - inc i - p += 4 - else: - target.data[i] = 255 - target.data[i] - inc i - inc p + while i < image.data.len and (p and 15) != 0: + var rgbx = image.data[i] + rgbx.r = 255 - rgbx.r + rgbx.g = 255 - rgbx.g + rgbx.b = 255 - rgbx.b + rgbx.a = 255 - rgbx.a + image.data[i] = rgbx + inc i + p += 4 - let vec255 = mm_set1_epi8(255) - - when target is Image: - let iterations = target.data.len div 16 - else: - let iterations = target.data.len div 64 + let + vec255 = mm_set1_epi8(255) + iterations = image.data.len div 16 for _ in 0 ..< iterations: let @@ -282,24 +274,47 @@ proc invertSse2*(target: Image | Mask) {.simd.} = mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c)) mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d)) p += 64 + i += 16 * iterations - when target is Image: - i += 16 * iterations + for i in i ..< image.data.len: + var rgbx = image.data[i] + rgbx.r = 255 - rgbx.r + rgbx.g = 255 - rgbx.g + rgbx.b = 255 - rgbx.b + rgbx.a = 255 - rgbx.a + image.data[i] = rgbx - for i in i ..< target.data.len: - var rgbx = target.data[i] - rgbx.r = 255 - rgbx.r - rgbx.g = 255 - rgbx.g - rgbx.b = 255 - rgbx.b - rgbx.a = 255 - rgbx.a - target.data[i] = rgbx + toPremultipliedAlphaSse2(image.data) - toPremultipliedAlphaSse2(target.data) - else: - i += 64 * iterations +proc invertSse2*(mask: Mask) {.simd.} = + var + i: int + p = cast[uint](mask.data[0].addr) + # Align to 16 bytes + while i < mask.data.len and (p and 15) != 0: + mask.data[i] = 255 - mask.data[i] + inc i + inc p - for i in i ..< target.data.len: - target.data[i] = 255 - target.data[i] + let + vec255 = mm_set1_epi8(255) + iterations = mask.data.len div 64 + + for _ in 0 ..< iterations: + let + a = mm_load_si128(cast[pointer](p)) + b = mm_load_si128(cast[pointer](p + 16)) + c = mm_load_si128(cast[pointer](p + 32)) + d = mm_load_si128(cast[pointer](p + 48)) + mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a)) + mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b)) + mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c)) + mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d)) + p += 64 + i += 64 * iterations + + for i in i ..< mask.data.len: + mask.data[i] = 255 - mask.data[i] proc ceilSse2*(mask: Mask) {.simd.} = var @@ -322,34 +337,69 @@ proc ceilSse2*(mask: Mask) {.simd.} = if mask.data[i] != 0: mask.data[i] = 255 -proc applyOpacitySse2*(target: Image | Mask, opacity: float32) {.simd.} = +proc applyOpacitySse2*(image: Image, opacity: float32) {.simd.} = let opacity = round(255 * opacity).uint16 if opacity == 255: return if opacity == 0: - when target is Image: - target.fill(rgbx(0, 0, 0, 0)) - else: - target.fill(0) + fillUnsafeSse2(image.data, rgbx(0, 0, 0, 0), 0, image.data.len) return var i: int - p = cast[uint](target.data[0].addr) - len = - when target is Image: - target.data.len * 4 - else: - target.data.len + p = cast[uint](image.data[0].addr) let oddMask = mm_set1_epi16(0xff00) div255 = mm_set1_epi16(0x8081) zeroVec = mm_setzero_si128() opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8) - iterations = len div 16 - for _ in 0 ..< len div 16: + iterations = image.data.len div 4 + for _ in 0 ..< iterations: + let values = mm_loadu_si128(cast[pointer](p)) + if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff: + var + valuesEven = mm_slli_epi16(values, 8) + valuesOdd = mm_and_si128(values, oddMask) + valuesEven = mm_mulhi_epu16(valuesEven, opacityVec) + valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec) + valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7) + valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7) + mm_storeu_si128( + cast[pointer](p), + mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8)) + ) + p += 16 + i += 4 * iterations + + for i in i ..< image.data.len: + var rgbx = image.data[i] + rgbx.r = ((rgbx.r * opacity) div 255).uint8 + rgbx.g = ((rgbx.g * opacity) div 255).uint8 + rgbx.b = ((rgbx.b * opacity) div 255).uint8 + rgbx.a = ((rgbx.a * opacity) div 255).uint8 + image.data[i] = rgbx + +proc applyOpacitySse2*(mask: Mask, opacity: float32) {.simd.} = + let opacity = round(255 * opacity).uint16 + if opacity == 255: + return + + if opacity == 0: + nimSetMem(mask.data[0].addr, 0.cint, mask.data.len) + + var + i: int + p = cast[uint](mask.data[0].addr) + + let + oddMask = mm_set1_epi16(0xff00) + div255 = mm_set1_epi16(0x8081) + zeroVec = mm_setzero_si128() + opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8) + iterations = mask.data.len div 16 + for _ in 0 ..< iterations: let values = mm_loadu_si128(cast[pointer](p)) if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff: var @@ -366,17 +416,8 @@ proc applyOpacitySse2*(target: Image | Mask, opacity: float32) {.simd.} = p += 16 i += 16 * iterations - when target is Image: - for i in i div 4 ..< target.data.len: - var rgbx = target.data[i] - rgbx.r = ((rgbx.r * opacity) div 255).uint8 - rgbx.g = ((rgbx.g * opacity) div 255).uint8 - rgbx.b = ((rgbx.b * opacity) div 255).uint8 - rgbx.a = ((rgbx.a * opacity) div 255).uint8 - target.data[i] = rgbx - else: - for i in i ..< target.data.len: - target.data[i] = ((target.data[i] * opacity) div 255).uint8 + for i in i ..< mask.data.len: + mask.data[i] = ((mask.data[i] * opacity) div 255).uint8 when defined(release): {.pop.} From 3b478317102c9087c841084589b0f5e0ba0992ae Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Fri, 22 Jul 2022 22:34:55 -0500 Subject: [PATCH 2/6] f --- src/pixie/simd/internal.nim | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pixie/simd/internal.nim b/src/pixie/simd/internal.nim index 9137d13..ce804a7 100644 --- a/src/pixie/simd/internal.nim +++ b/src/pixie/simd/internal.nim @@ -100,7 +100,8 @@ macro hasSimd*(procedure: untyped) = procedure[6] = body - if not foundSimd: - echo "No SIMD found for " & procSignature(name, procedure) + when not defined(pixieNoSimd): + if not foundSimd: + echo "No SIMD found for " & procSignature(name, procedure) return procedure From d9cdb0ef95e89701850bca097d44064d18e6014b Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Fri, 22 Jul 2022 22:40:39 -0500 Subject: [PATCH 3/6] image invertAvx2 --- src/pixie/simd/avx2.nim | 37 +++++++++++++++++++++++++++++++++++++ src/pixie/simd/sse2.nim | 2 -- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index be900bd..1d8db9e 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -133,5 +133,42 @@ proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = c.b = ((c.b.uint32 * c.a + 127) div 255).uint8 data[i] = c +proc invertAvx2*(image: Image) {.simd.} = + var + i: int + p = cast[uint](image.data[0].addr) + # Align to 32 bytes + while i < image.data.len and (p and 31) != 0: + var rgbx = image.data[i] + rgbx.r = 255 - rgbx.r + rgbx.g = 255 - rgbx.g + rgbx.b = 255 - rgbx.b + rgbx.a = 255 - rgbx.a + image.data[i] = rgbx + inc i + p += 4 + + let + vec255 = mm256_set1_epi8(255) + iterations = image.data.len div 16 + for _ in 0 ..< iterations: + let + a = mm256_load_si256(cast[pointer](p)) + b = mm256_load_si256(cast[pointer](p + 32)) + mm256_store_si256(cast[pointer](p), mm256_sub_epi8(vec255, a)) + mm256_store_si256(cast[pointer](p + 32), mm256_sub_epi8(vec255, b)) + p += 64 + i += 16 * iterations + + for i in i ..< image.data.len: + var rgbx = image.data[i] + rgbx.r = 255 - rgbx.r + rgbx.g = 255 - rgbx.g + rgbx.b = 255 - rgbx.b + rgbx.a = 255 - rgbx.a + image.data[i] = rgbx + + toPremultipliedAlphaAvx2(image.data) + when defined(release): {.pop.} diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index 0a34b17..0f9c347 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -262,7 +262,6 @@ proc invertSse2*(image: Image) {.simd.} = let vec255 = mm_set1_epi8(255) iterations = image.data.len div 16 - for _ in 0 ..< iterations: let a = mm_load_si128(cast[pointer](p)) @@ -299,7 +298,6 @@ proc invertSse2*(mask: Mask) {.simd.} = let vec255 = mm_set1_epi8(255) iterations = mask.data.len div 64 - for _ in 0 ..< iterations: let a = mm_load_si128(cast[pointer](p)) From 0f93769ef1d52a20b4f9247e8dcee276a9e5705e Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Fri, 22 Jul 2022 22:49:51 -0500 Subject: [PATCH 4/6] avx2 image applyOpacity --- src/pixie/simd/avx2.nim | 48 ++++++++++++++++++++++++++++++++++++++++- tests/bench_images.nim | 1 + 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index 1d8db9e..4e89ea5 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -1,4 +1,4 @@ -import chroma, internal, nimsimd/avx2, pixie/common +import avx, chroma, internal, nimsimd/avx2, pixie/common, vmath when defined(gcc) or defined(clang): {.localPassc: "-mavx2".} @@ -170,5 +170,51 @@ proc invertAvx2*(image: Image) {.simd.} = toPremultipliedAlphaAvx2(image.data) +proc applyOpacityAvx2*(image: Image, opacity: float32) {.simd.} = + let opacity = round(255 * opacity).uint16 + if opacity == 255: + return + + if opacity == 0: + fillUnsafeAvx(image.data, rgbx(0, 0, 0, 0), 0, image.data.len) + return + + var + i: int + p = cast[uint](image.data[0].addr) + + let + oddMask = mm256_set1_epi16(0xff00) + div255 = mm256_set1_epi16(0x8081) + zeroVec = mm256_setzero_si256() + opacityVec = mm256_slli_epi16(mm256_set1_epi16(opacity), 8) + iterations = image.data.len div 8 + for _ in 0 ..< iterations: + let + values = mm256_loadu_si256(cast[pointer](p)) + eqZero = mm256_cmpeq_epi16(values, zeroVec) + if mm256_movemask_epi8(eqZero) != cast[int32](0xffffffff): + var + valuesEven = mm256_slli_epi16(values, 8) + valuesOdd = mm256_and_si256(values, oddMask) + valuesEven = mm256_mulhi_epu16(valuesEven, opacityVec) + valuesOdd = mm256_mulhi_epu16(valuesOdd, opacityVec) + valuesEven = mm256_srli_epi16(mm256_mulhi_epu16(valuesEven, div255), 7) + valuesOdd = mm256_srli_epi16(mm256_mulhi_epu16(valuesOdd, div255), 7) + mm256_storeu_si256( + cast[pointer](p), + mm256_or_si256(valuesEven, mm256_slli_epi16(valuesOdd, 8)) + ) + p += 32 + i += 8 * iterations + + for i in i ..< image.data.len: + var rgbx = image.data[i] + rgbx.r = ((rgbx.r * opacity) div 255).uint8 + rgbx.g = ((rgbx.g * opacity) div 255).uint8 + rgbx.b = ((rgbx.b * opacity) div 255).uint8 + rgbx.a = ((rgbx.a * opacity) div 255).uint8 + image.data[i] = rgbx + when defined(release): {.pop.} diff --git a/tests/bench_images.nim b/tests/bench_images.nim index 60cd261..e210a5d 100644 --- a/tests/bench_images.nim +++ b/tests/bench_images.nim @@ -74,6 +74,7 @@ timeIt "invert": reset() timeIt "applyOpacity": + reset() image.applyOpacity(0.5) reset() From 8670e0edec6feb4c6ad94b8c21f0af2dae649f8a Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sat, 23 Jul 2022 00:13:19 -0500 Subject: [PATCH 5/6] neon procs --- pixie.nimble | 2 +- src/pixie/simd.nim | 1 + src/pixie/simd/neon.nim | 150 +++++++++++++++++++++++++++++++++++++++- tests/bench_masks.nim | 3 +- 4 files changed, 152 insertions(+), 4 deletions(-) diff --git a/pixie.nimble b/pixie.nimble index 4b819c3..b899c8d 100644 --- a/pixie.nimble +++ b/pixie.nimble @@ -10,7 +10,7 @@ requires "vmath >= 1.1.4" requires "chroma >= 0.2.6" requires "zippy >= 0.10.3" requires "flatty >= 0.3.4" -requires "nimsimd >= 1.1.7" +requires "nimsimd >= 1.1.8" requires "bumpy >= 1.1.1" task bindings, "Generate bindings": diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index ecf74da..57b0b8d 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -20,6 +20,7 @@ when allowSimd: elif defined(arm64): import simd/neon + export neon import nimsimd/neon as nimsimdneon export nimsimdneon diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim index 739a224..19fa0a3 100644 --- a/src/pixie/simd/neon.nim +++ b/src/pixie/simd/neon.nim @@ -1,4 +1,4 @@ -import chroma, internal, nimsimd/neon, pixie/common +import chroma, internal, nimsimd/neon, pixie/common, system/memory, vmath when defined(release): {.push checks: off.} @@ -150,7 +150,7 @@ proc toPremultipliedAlphaNeon*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = channels.val[2] = premultiply(channels.val[2], channels.val[3]) vst4_u8(cast[pointer](p), channels) p += 32 - i += 8 + i += 8 * iterations for i in i ..< data.len: var c = data[i] @@ -194,5 +194,151 @@ proc newMaskNeon*(image: Image): Mask {.simd.} = for i in i ..< image.data.len: result.data[i] = image.data[i].a +proc invertNeon*(image: Image) {.simd.} = + var + i: int + p = cast[uint](image.data[0].addr) + # Align to 16 bytes + while i < image.data.len and (p and 15) != 0: + var rgbx = image.data[i] + rgbx.r = 255 - rgbx.r + rgbx.g = 255 - rgbx.g + rgbx.b = 255 - rgbx.b + rgbx.a = 255 - rgbx.a + image.data[i] = rgbx + inc i + p += 4 + + let + vec255 = vmovq_n_u8(255) + iterations = image.data.len div 16 + for _ in 0 ..< iterations: + var channels = vld4q_u8(cast[pointer](p)) + channels.val[0] = vsubq_u8(vec255, channels.val[0]) + channels.val[1] = vsubq_u8(vec255, channels.val[1]) + channels.val[2] = vsubq_u8(vec255, channels.val[2]) + channels.val[3] = vsubq_u8(vec255, channels.val[3]) + vst4q_u8(cast[pointer](p), channels) + p += 64 + i += 16 * iterations + + for i in i ..< image.data.len: + var rgbx = image.data[i] + rgbx.r = 255 - rgbx.r + rgbx.g = 255 - rgbx.g + rgbx.b = 255 - rgbx.b + rgbx.a = 255 - rgbx.a + image.data[i] = rgbx + + toPremultipliedAlphaNeon(image.data) + +proc invertNeon*(mask: Mask) {.simd.} = + var + i: int + p = cast[uint](mask.data[0].addr) + # Align to 16 bytes + while i < mask.data.len and (p and 15) != 0: + mask.data[i] = 255 - mask.data[i] + inc i + inc p + + let + vec255 = vmovq_n_u8(255) + iterations = mask.data.len div 16 + for _ in 0 ..< iterations: + let values = vld1q_u8(cast[pointer](p)) + vst1q_u8(cast[pointer](p), vsubq_u8(vec255, values)) + p += 16 + i += 16 * iterations + + for i in i ..< mask.data.len: + mask.data[i] = 255 - mask.data[i] + +proc ceilNeon*(mask: Mask) {.simd.} = + var + i: int + p = cast[uint](mask.data[0].addr) + + let + zeroVec = vmovq_n_u8(0) + vec255 = vmovq_n_u8(255) + iterations = mask.data.len div 16 + for _ in 0 ..< iterations: + var values = vld1q_u8(cast[pointer](p)) + values = vceqq_u8(values, zeroVec) + values = vbicq_u8(vec255, values) + vst1q_u8(cast[pointer](p), values) + p += 16 + i += 16 * iterations + + for i in i ..< mask.data.len: + if mask.data[i] != 0: + mask.data[i] = 255 + +proc applyOpacityNeon*(image: Image, opacity: float32) {.simd.} = + let opacity = round(255 * opacity).uint8 + if opacity == 255: + return + + if opacity == 0: + fillUnsafeNeon(image.data, rgbx(0, 0, 0, 0), 0, image.data.len) + return + + var + i: int + p = cast[uint](image.data[0].addr) + + proc apply(c, o: uint8x8): uint8x8 {.inline.} = + let co = vmull_u8(c, o) + vraddhn_u16(co, vrshrq_n_u16(co, 8)) + + let + opacityVec = vmov_n_u8(opacity) + iterations = image.data.len div 8 + for _ in 0 ..< iterations: + var channels = vld4_u8(cast[pointer](p)) + channels.val[0] = apply(channels.val[0], opacityVec) + channels.val[1] = apply(channels.val[1], opacityVec) + channels.val[2] = apply(channels.val[2], opacityVec) + channels.val[3] = apply(channels.val[3], opacityVec) + vst4_u8(cast[pointer](p), channels) + p += 32 + i += 8 * iterations + + for i in i ..< image.data.len: + var rgbx = image.data[i] + rgbx.r = ((rgbx.r * opacity) div 255).uint8 + rgbx.g = ((rgbx.g * opacity) div 255).uint8 + rgbx.b = ((rgbx.b * opacity) div 255).uint8 + rgbx.a = ((rgbx.a * opacity) div 255).uint8 + image.data[i] = rgbx + +proc applyOpacityNeon*(mask: Mask, opacity: float32) {.simd.} = + let opacity = round(255 * opacity).uint8 + if opacity == 255: + return + + if opacity == 0: + nimSetMem(mask.data[0].addr, 0.cint, mask.data.len) + + var + i: int + p = cast[uint](mask.data[0].addr) + + let + opacityVec = vmov_n_u8(opacity) + iterations = mask.data.len div 8 + for _ in 0 ..< iterations: + let + values = vld1_u8(cast[pointer](p)) + multiplied = vmull_u8(values, opacityVec) + rounded = vraddhn_u16(multiplied, vrshrq_n_u16(multiplied, 8)) + vst1_u8(cast[pointer](p), rounded) + p += 8 + i += 8 * iterations + + for i in i ..< mask.data.len: + mask.data[i] = ((mask.data[i] * opacity) div 255).uint8 + when defined(release): {.pop.} diff --git a/tests/bench_masks.nim b/tests/bench_masks.nim index cd7a527..abf15de 100644 --- a/tests/bench_masks.nim +++ b/tests/bench_masks.nim @@ -1,4 +1,4 @@ -import benchy, chroma, pixie +import benchy, pixie let mask = newMask(2560, 1440) @@ -25,6 +25,7 @@ timeIt "invert": reset() timeIt "applyOpacity": + reset() mask.applyOpacity(0.5) reset() From 7cf81a6d9a52f1eb37ef59ad99920cf37cf23a2e Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sat, 23 Jul 2022 12:57:23 -0500 Subject: [PATCH 6/6] update macro procSignature --- src/pixie/masks.nim | 2 +- src/pixie/simd.nim | 4 ++-- src/pixie/simd/internal.nim | 24 ++++++++++++++---------- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim index 9927088..ea52af1 100644 --- a/src/pixie/masks.nim +++ b/src/pixie/masks.nim @@ -1,4 +1,4 @@ -import common, internal, simd, system/memory, vmath +import common, internal, simd, vmath export Mask, newMask diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index 57b0b8d..f039b1a 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -1,6 +1,6 @@ -import simd/internal +import simd/internal, system/memory -export internal +export internal, memory const allowSimd* = not defined(pixieNoSimd) and not defined(tcc) diff --git a/src/pixie/simd/internal.nim b/src/pixie/simd/internal.nim index ce804a7..1808ac8 100644 --- a/src/pixie/simd/internal.nim +++ b/src/pixie/simd/internal.nim @@ -21,9 +21,9 @@ proc procReturnType(procedure: NimNode): NimNode = ## Given a procedure this gets the return type. procedure[3][0] -proc procSignature(procName: string, procedure: NimNode): string = +proc procSignature(procedure: NimNode): string = ## Given a procedure this returns the signature as a string. - result = procName & "(" + result = "(" for i, arg in procedure[3]: if i > 0: @@ -35,6 +35,10 @@ proc procSignature(procName: string, procedure: NimNode): string = result &= ")" + let ret = procedure.procReturnType() + if ret.kind != nnkEmpty: + result &= ": " & ret.repr + proc callAndReturn(name: NimNode, procedure: NimNode): NimNode = ## Produces a procedure call with arguments. let @@ -52,7 +56,7 @@ proc callAndReturn(name: NimNode, procedure: NimNode): NimNode = return `call` macro simd*(procedure: untyped) = - let signature = procSignature(procedure.procName(), procedure) + let signature = procedure.procName() & procSignature(procedure) simdProcs[signature] = procedure.copy() return procedure @@ -72,26 +76,26 @@ macro hasSimd*(procedure: untyped) = body = newStmtList() when defined(amd64) and not defined(pixieNoAvx): - if procSignature(nameAvx2, procedure) in simdProcs: + if nameAvx2 & procSignature(procedure) in simdProcs: foundSimd = true body.add quote do: if cpuHasAvx2: `callAvx2` - if procSignature(nameAvx, procedure) in simdProcs: + if nameAvx & procSignature(procedure) in simdProcs: foundSimd = true body.add quote do: if cpuHasAvx2: `callAvx` - if procSignature(nameSse2, procedure) in simdProcs: + if nameSse2 & procSignature(procedure) in simdProcs: foundSimd = true - let bodySse2 = simdProcs[procSignature(nameSse2, procedure)][6] + let bodySse2 = simdProcs[nameSse2 & procSignature(procedure)][6] body.add quote do: `bodySse2` - elif procSignature(nameNeon, procedure) in simdProcs: + elif nameNeon & procSignature(procedure) in simdProcs: foundSimd = true - let bodyNeon = simdProcs[procSignature(nameNeon, procedure)][6] + let bodyNeon = simdProcs[nameNeon & procSignature(procedure)][6] body.add quote do: `bodyNeon` else: @@ -102,6 +106,6 @@ macro hasSimd*(procedure: untyped) = when not defined(pixieNoSimd): if not foundSimd: - echo "No SIMD found for " & procSignature(name, procedure) + echo "No SIMD found for " & name & procSignature(procedure) return procedure