From 2ace8e5e9ff244a49ca0827ac476374bc59a4a76 Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg
Date: Fri, 22 Jul 2022 22:27:18 -0500
Subject: [PATCH] simd macro works on signature not just name, split applyOpacity + invert

---
Review notes (not part of the commit message):
- invertSse2 (Image and Mask): the vector iteration count now subtracts the
  elements already handled by the scalar alignment loop, so the 64-byte
  load/store loop can no longer run past the end of `data` when the buffer
  does not start 16-byte aligned.
- applyOpacitySse2 (Mask): return right after zeroing when opacity == 0,
  matching the Image overload instead of falling through to the SIMD loop.
- NOTE(review): hasSimd gates the AVX call on cpuHasAvx2 (unchanged context
  line) - confirm this is intended.
- This file had lost its line breaks; the indentation of unchanged context
  lines was reconstructed and should be checked against the tree before
  running `git am`.

 src/pixie/images.nim        |  20 ++---
 src/pixie/masks.nim         |  16 ++--
 src/pixie/simd/internal.nim |  47 ++++++++---
 src/pixie/simd/sse2.nim     | 164 ++++++++++++++++++++++--------------
 4 files changed, 156 insertions(+), 91 deletions(-)

diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index 11f5c15..ec9f2cb 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -320,38 +320,38 @@ proc magnifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} =
         result.width * 4
       )
 
-proc applyOpacity*(target: Image, opacity: float32) {.hasSimd, raises: [].} =
+proc applyOpacity*(image: Image, opacity: float32) {.hasSimd, raises: [].} =
   ## Multiplies alpha of the image by opacity.
   let opacity = round(255 * opacity).uint16
   if opacity == 255:
     return
 
   if opacity == 0:
-    target.fill(rgbx(0, 0, 0, 0))
+    image.fill(rgbx(0, 0, 0, 0))
     return
 
-  for i in 0 ..< target.data.len:
-    var rgbx = target.data[i]
+  for i in 0 ..< image.data.len:
+    var rgbx = image.data[i]
     rgbx.r = ((rgbx.r * opacity) div 255).uint8
     rgbx.g = ((rgbx.g * opacity) div 255).uint8
     rgbx.b = ((rgbx.b * opacity) div 255).uint8
     rgbx.a = ((rgbx.a * opacity) div 255).uint8
-    target.data[i] = rgbx
+    image.data[i] = rgbx
 
-proc invert*(target: Image) {.hasSimd, raises: [].} =
+proc invert*(image: Image) {.hasSimd, raises: [].} =
   ## Inverts all of the colors and alpha.
-  for i in 0 ..< target.data.len:
-    var rgbx = target.data[i]
+  for i in 0 ..< image.data.len:
+    var rgbx = image.data[i]
     rgbx.r = 255 - rgbx.r
     rgbx.g = 255 - rgbx.g
     rgbx.b = 255 - rgbx.b
     rgbx.a = 255 - rgbx.a
-    target.data[i] = rgbx
+    image.data[i] = rgbx
 
   # Inverting rgbx(50, 100, 150, 200) becomes rgbx(205, 155, 105, 55). This
   # is not a valid premultiplied alpha color.
   # We need to convert back to premultiplied alpha after inverting.
-  target.data.toPremultipliedAlpha()
+  image.data.toPremultipliedAlpha()
 
 proc blur*(
   image: Image, radius: float32, outOfBounds: SomeColor = color(0, 0, 0, 0)
diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim
index 365bc2c..9927088 100644
--- a/src/pixie/masks.nim
+++ b/src/pixie/masks.nim
@@ -1,4 +1,4 @@
-import common, internal, simd, vmath
+import common, internal, simd, system/memory, vmath
 
 export Mask, newMask
 
@@ -165,18 +165,18 @@ proc magnifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} =
         result.width
       )
 
-proc applyOpacity*(target: Mask, opacity: float32) {.hasSimd, raises: [].} =
+proc applyOpacity*(mask: Mask, opacity: float32) {.hasSimd, raises: [].} =
   ## Multiplies alpha of the image by opacity.
   let opacity = round(255 * opacity).uint16
   if opacity == 255:
     return
 
   if opacity == 0:
-    target.fill(0)
+    mask.fill(0)
     return
 
-  for i in 0 ..< target.data.len:
-    target.data[i] = ((target.data[i] * opacity) div 255).uint8
+  for i in 0 ..< mask.data.len:
+    mask.data[i] = ((mask.data[i] * opacity) div 255).uint8
 
 proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
   ## Gets a interpolated value with float point coordinates.
@@ -206,10 +206,10 @@ proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
   else:
     topMix
 
-proc invert*(target: Mask) {.hasSimd, raises: [].} =
+proc invert*(mask: Mask) {.hasSimd, raises: [].} =
   ## Inverts all of the values - creates a negative of the mask.
-  for i in 0 ..< target.data.len:
-    target.data[i] = 255 - target.data[i]
+  for i in 0 ..< mask.data.len:
+    mask.data[i] = 255 - mask.data[i]
 
 proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
   ## Grows the mask by spread.
diff --git a/src/pixie/simd/internal.nim b/src/pixie/simd/internal.nim
index 18870ed..9137d13 100644
--- a/src/pixie/simd/internal.nim
+++ b/src/pixie/simd/internal.nim
@@ -3,7 +3,7 @@ import std/macros, std/tables
 var simdProcs* {.compiletime.}: Table[string, NimNode]
 
 proc procName(procedure: NimNode): string =
-  ## Given a procedure signature returns only name string.
+  ## Given a procedure this returns the name as a string.
   let nameNode = procedure[0]
   if nameNode.kind == nnkPostfix:
     nameNode[1].strVal
@@ -11,16 +11,30 @@ proc procName(procedure: NimNode): string =
     nameNode.strVal
 
 proc procArguments(procedure: NimNode): seq[NimNode] =
-  ## Given a procedure signature gets the arguments as a list.
+  ## Given a procedure this gets the arguments as a list.
   for i, arg in procedure[3]:
     if i > 0:
       for j in 0 ..< arg.len - 2:
         result.add(arg[j])
 
 proc procReturnType(procedure: NimNode): NimNode =
-  ## Given a procedure signature gets the return type.
+  ## Given a procedure this gets the return type.
   procedure[3][0]
 
+proc procSignature(procName: string, procedure: NimNode): string =
+  ## Given a procedure this returns the signature as a string.
+  result = procName & "("
+
+  for i, arg in procedure[3]:
+    if i > 0:
+      for j in 0 ..< arg.len - 2:
+        result &= arg[^2].repr & ", "
+
+  if procedure[3].len > 1:
+    result = result[0 ..^ 3]
+
+  result &= ")"
+
 proc callAndReturn(name: NimNode, procedure: NimNode): NimNode =
   ## Produces a procedure call with arguments.
   let
@@ -38,8 +52,8 @@ proc callAndReturn(name: NimNode, procedure: NimNode): NimNode =
       return `call`
 
 macro simd*(procedure: untyped) =
-  let name = procedure.procName()
-  simdProcs[name] = procedure.copy()
+  let signature = procSignature(procedure.procName(), procedure)
+  simdProcs[signature] = procedure.copy()
   return procedure
 
 macro hasSimd*(procedure: untyped) =
@@ -53,25 +67,31 @@ macro hasSimd*(procedure: untyped) =
     callAvx = callAndReturn(ident(nameAvx), procedure)
     callAvx2 = callAndReturn(ident(nameAvx2), procedure)
 
-  var body = newStmtList()
+  var
+    foundSimd: bool
+    body = newStmtList()
 
   when defined(amd64) and not defined(pixieNoAvx):
-    if nameAvx2 in simdProcs:
+    if procSignature(nameAvx2, procedure) in simdProcs:
+      foundSimd = true
       body.add quote do:
         if cpuHasAvx2:
           `callAvx2`
 
-    if nameAvx in simdProcs:
+    if procSignature(nameAvx, procedure) in simdProcs:
+      foundSimd = true
       body.add quote do:
         if cpuHasAvx2:
           `callAvx`
 
-  if nameSse2 in simdProcs:
-    let bodySse2 = simdProcs[nameSse2][6]
+  if procSignature(nameSse2, procedure) in simdProcs:
+    foundSimd = true
+    let bodySse2 = simdProcs[procSignature(nameSse2, procedure)][6]
     body.add quote do:
       `bodySse2`
-  elif nameNeon in simdProcs:
-    let bodyNeon = simdProcs[nameNeon][6]
+  elif procSignature(nameNeon, procedure) in simdProcs:
+    foundSimd = true
+    let bodyNeon = simdProcs[procSignature(nameNeon, procedure)][6]
     body.add quote do:
       `bodyNeon`
   else:
@@ -80,4 +100,7 @@ macro hasSimd*(procedure: untyped) =
 
   procedure[6] = body
 
+  if not foundSimd:
+    echo "No SIMD found for " & procSignature(name, procedure)
+
   return procedure
diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim
index b5021fc..0a34b17 100644
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -1,4 +1,4 @@
-import chroma, internal, nimsimd/sse2, pixie/common, vmath
+import chroma, internal, nimsimd/sse2, pixie/common, system/memory, vmath
 
 when defined(release):
   {.push checks: off.}
@@ -244,32 +244,24 @@ proc newMaskSse2*(image: Image): Mask {.simd.} =
   for i in i ..< image.data.len:
     result.data[i] = image.data[i].a
 
-proc invertSse2*(target: Image | Mask) {.simd.} =
+proc invertSse2*(image: Image) {.simd.} =
   var
     i: int
-    p = cast[uint](target.data[0].addr)
+    p = cast[uint](image.data[0].addr)
   # Align to 16 bytes
-  while i < target.data.len and (p and 15) != 0:
-    when target is Image:
-      var rgbx = target.data[i]
-      rgbx.r = 255 - rgbx.r
-      rgbx.g = 255 - rgbx.g
-      rgbx.b = 255 - rgbx.b
-      rgbx.a = 255 - rgbx.a
-      target.data[i] = rgbx
-      inc i
-      p += 4
-    else:
-      target.data[i] = 255 - target.data[i]
-      inc i
-      inc p
+  while i < image.data.len and (p and 15) != 0:
+    var rgbx = image.data[i]
+    rgbx.r = 255 - rgbx.r
+    rgbx.g = 255 - rgbx.g
+    rgbx.b = 255 - rgbx.b
+    rgbx.a = 255 - rgbx.a
+    image.data[i] = rgbx
+    inc i
+    p += 4
 
-  let vec255 = mm_set1_epi8(255)
-
-  when target is Image:
-    let iterations = target.data.len div 16
-  else:
-    let iterations = target.data.len div 64
+  let
+    vec255 = mm_set1_epi8(255)
+    iterations = (image.data.len - i) div 16 # minus pixels done while aligning
 
   for _ in 0 ..< iterations:
     let
@@ -282,24 +274,47 @@ proc invertSse2*(target: Image | Mask) {.simd.} =
     mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
     mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
     p += 64
+  i += 16 * iterations
 
-  when target is Image:
-    i += 16 * iterations
+  for i in i ..< image.data.len:
+    var rgbx = image.data[i]
+    rgbx.r = 255 - rgbx.r
+    rgbx.g = 255 - rgbx.g
+    rgbx.b = 255 - rgbx.b
+    rgbx.a = 255 - rgbx.a
+    image.data[i] = rgbx
 
-    for i in i ..< target.data.len:
-      var rgbx = target.data[i]
-      rgbx.r = 255 - rgbx.r
-      rgbx.g = 255 - rgbx.g
-      rgbx.b = 255 - rgbx.b
-      rgbx.a = 255 - rgbx.a
-      target.data[i] = rgbx
+  toPremultipliedAlphaSse2(image.data)
 
-    toPremultipliedAlphaSse2(target.data)
-  else:
-    i += 64 * iterations
+proc invertSse2*(mask: Mask) {.simd.} =
+  var
+    i: int
+    p = cast[uint](mask.data[0].addr)
+  # Align to 16 bytes
+  while i < mask.data.len and (p and 15) != 0:
+    mask.data[i] = 255 - mask.data[i]
+    inc i
+    inc p
 
-    for i in i ..< target.data.len:
-      target.data[i] = 255 - target.data[i]
+  let
+    vec255 = mm_set1_epi8(255)
+    iterations = (mask.data.len - i) div 64 # minus bytes done while aligning
+
+  for _ in 0 ..< iterations:
+    let
+      a = mm_load_si128(cast[pointer](p))
+      b = mm_load_si128(cast[pointer](p + 16))
+      c = mm_load_si128(cast[pointer](p + 32))
+      d = mm_load_si128(cast[pointer](p + 48))
+    mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
+    mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
+    mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
+    mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
+    p += 64
+  i += 64 * iterations
+
+  for i in i ..< mask.data.len:
+    mask.data[i] = 255 - mask.data[i]
 
 proc ceilSse2*(mask: Mask) {.simd.} =
   var
@@ -322,34 +337,70 @@ proc ceilSse2*(mask: Mask) {.simd.} =
     if mask.data[i] != 0:
       mask.data[i] = 255
 
-proc applyOpacitySse2*(target: Image | Mask, opacity: float32) {.simd.} =
+proc applyOpacitySse2*(image: Image, opacity: float32) {.simd.} =
   let opacity = round(255 * opacity).uint16
   if opacity == 255:
     return
 
   if opacity == 0:
-    when target is Image:
-      target.fill(rgbx(0, 0, 0, 0))
-    else:
-      target.fill(0)
+    fillUnsafeSse2(image.data, rgbx(0, 0, 0, 0), 0, image.data.len)
     return
 
   var
     i: int
-    p = cast[uint](target.data[0].addr)
-    len =
-      when target is Image:
-        target.data.len * 4
-      else:
-        target.data.len
+    p = cast[uint](image.data[0].addr)
 
   let
     oddMask = mm_set1_epi16(0xff00)
     div255 = mm_set1_epi16(0x8081)
     zeroVec = mm_setzero_si128()
     opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
-    iterations = len div 16
-  for _ in 0 ..< len div 16:
+    iterations = image.data.len div 4
+  for _ in 0 ..< iterations:
+    let values = mm_loadu_si128(cast[pointer](p))
+    if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
+      var
+        valuesEven = mm_slli_epi16(values, 8)
+        valuesOdd = mm_and_si128(values, oddMask)
+      valuesEven = mm_mulhi_epu16(valuesEven, opacityVec)
+      valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec)
+      valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7)
+      valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7)
+      mm_storeu_si128(
+        cast[pointer](p),
+        mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8))
+      )
+    p += 16
+  i += 4 * iterations
+
+  for i in i ..< image.data.len:
+    var rgbx = image.data[i]
+    rgbx.r = ((rgbx.r * opacity) div 255).uint8
+    rgbx.g = ((rgbx.g * opacity) div 255).uint8
+    rgbx.b = ((rgbx.b * opacity) div 255).uint8
+    rgbx.a = ((rgbx.a * opacity) div 255).uint8
+    image.data[i] = rgbx
+
+proc applyOpacitySse2*(mask: Mask, opacity: float32) {.simd.} =
+  let opacity = round(255 * opacity).uint16
+  if opacity == 255:
+    return
+
+  if opacity == 0:
+    nimSetMem(mask.data[0].addr, 0.cint, mask.data.len)
+    return
+
+  var
+    i: int
+    p = cast[uint](mask.data[0].addr)
+
+  let
+    oddMask = mm_set1_epi16(0xff00)
+    div255 = mm_set1_epi16(0x8081)
+    zeroVec = mm_setzero_si128()
+    opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
+    iterations = mask.data.len div 16
+  for _ in 0 ..< iterations:
     let values = mm_loadu_si128(cast[pointer](p))
     if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
       var
@@ -366,17 +417,8 @@ proc applyOpacitySse2*(target: Image | Mask, opacity: float32) {.simd.} =
     p += 16
   i += 16 * iterations
 
-  when target is Image:
-    for i in i div 4 ..< target.data.len:
-      var rgbx = target.data[i]
-      rgbx.r = ((rgbx.r * opacity) div 255).uint8
-      rgbx.g = ((rgbx.g * opacity) div 255).uint8
-      rgbx.b = ((rgbx.b * opacity) div 255).uint8
-      rgbx.a = ((rgbx.a * opacity) div 255).uint8
-      target.data[i] = rgbx
-  else:
-    for i in i ..< target.data.len:
-      target.data[i] = ((target.data[i] * opacity) div 255).uint8
+  for i in i ..< mask.data.len:
+    mask.data[i] = ((mask.data[i] * opacity) div 255).uint8
 
 when defined(release):
   {.pop.}