From e56ad9e4031cc534a1b91da225721d21227b581c Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Wed, 29 Jun 2022 00:29:26 -0500 Subject: [PATCH] move newImage(mask) and newMask(image) simd out --- src/pixie/images.nim | 51 +++++++++++++++-------------------- src/pixie/internal.nim | 19 +------------ src/pixie/simd.nim | 60 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 48 deletions(-) diff --git a/src/pixie/images.nim b/src/pixie/images.nim index bfcb409..9951365 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -31,21 +31,18 @@ proc newImage*(width, height: int): Image {.raises: [PixieError].} = proc newImage*(mask: Mask): Image {.raises: [PixieError].} = result = newImage(mask.width, mask.height) - var i: int - when defined(amd64) and allowSimd: - for _ in 0 ..< mask.data.len div 16: - var alphas = mm_loadu_si128(mask.data[i].addr) - for j in 0 ..< 4: - var unpacked = unpackAlphaValues(alphas) - unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 8)) - unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16)) - mm_storeu_si128(result.data[i + j * 4].addr, unpacked) - alphas = mm_srli_si128(alphas, 4) - i += 16 - for j in i ..< mask.data.len: - let v = mask.data[j] - result.data[j] = rgbx(v, v, v, v) + when allowSimd and compiles(newImageFromMaskSimd): + newImageFromMaskSimd( + cast[ptr UncheckedArray[ColorRGBX]](result.data[0].addr), + cast[ptr UncheckedArray[uint8]](mask.data[0].addr), + mask.data.len + ) + return + + for i in 0 ..< mask.data.len: + let v = mask.data[i] + result.data[i] = rgbx(v, v, v, v) proc copy*(image: Image): Image {.raises: [PixieError].} = ## Copies the image data into a new image. @@ -421,7 +418,7 @@ proc applyOpacity*(target: Image | Mask, opacity: float32) {.raises: [].} = proc invert*(image: Image) {.raises: [].} = ## Inverts all of the colors and alpha. - if allowSimd and compiles(invertSimd): + when allowSimd and compiles(invertSimd): invertSimd( cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr), image.data.len @@ -506,22 +503,16 @@ proc newMask*(image: Image): Mask {.raises: [PixieError].} = ## Returns a new mask using the alpha values of the image. result = newMask(image.width, image.height) - var i: int - when defined(amd64) and allowSimd: - for _ in 0 ..< image.data.len div 16: - let - a = mm_loadu_si128(image.data[i + 0].addr) - b = mm_loadu_si128(image.data[i + 4].addr) - c = mm_loadu_si128(image.data[i + 8].addr) - d = mm_loadu_si128(image.data[i + 12].addr) - mm_storeu_si128( - result.data[i].addr, - pack4xAlphaValues(a, b, c, d) - ) - i += 16 + when allowSimd and compiles(newMaskFromImageSimd): + newMaskFromImageSimd( + cast[ptr UncheckedArray[uint8]](result.data[0].addr), + cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr), + image.data.len + ) + return - for j in i ..< image.data.len: - result.data[j] = image.data[j].a + for i in 0 ..< image.data.len: + result.data[i] = image.data[i].a proc getRgbaSmooth*( image: Image, x, y: float32, wrapped = false diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index 28aac9c..2524847 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -152,24 +152,7 @@ when defined(amd64) and allowSimd: finalColor = mm_packus_epi16(finalColor, mm_setzero_si128()) cast[ColorRGBX](mm_cvtsi128_si32(finalColor)) - proc packAlphaValues(v: M128i): M128i {.inline, raises: [].} = - ## Shuffle the alpha values for these 4 colors to the first 4 bytes - result = mm_srli_epi32(v, 24) - result = mm_packus_epi16(result, mm_setzero_si128()) - result = mm_packus_epi16(result, mm_setzero_si128()) - - proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline, raises: [].} = - let - i = packAlphaValues(i) - j = mm_slli_si128(packAlphaValues(j), 4) - k = mm_slli_si128(packAlphaValues(k), 8) - l = mm_slli_si128(packAlphaValues(l), 12) - mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l)) - - proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} = - ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value) - result = mm_unpacklo_epi8(mm_setzero_si128(), v) - result = mm_unpacklo_epi8(mm_setzero_si128(), result) + export pack4xAlphaValues, unpackAlphaValues when defined(release): {.pop.} diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index 54a2d3b..0deb2aa 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -7,6 +7,25 @@ when defined(amd64): cpuHasAvx* = checkInstructionSets({AVX}) cpuHasAvx2* = checkInstructionSets({AVX, AVX2}) + proc packAlphaValues(v: M128i): M128i {.inline.} = + ## Shuffle the alpha values for these 4 colors to the first 4 bytes. + result = mm_srli_epi32(v, 24) + result = mm_packus_epi16(result, mm_setzero_si128()) + result = mm_packus_epi16(result, mm_setzero_si128()) + + proc pack4xAlphaValues*(i, j, k, l: M128i): M128i {.inline.} = + let + i = packAlphaValues(i) + j = mm_slli_si128(packAlphaValues(j), 4) + k = mm_slli_si128(packAlphaValues(k), 8) + l = mm_slli_si128(packAlphaValues(l), 12) + mm_or_si128(mm_or_si128(i, j), mm_or_si128(k, l)) + + proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} = + ## Unpack the first 32 bits into 4 rgba(0, 0, 0, value). + result = mm_unpacklo_epi8(mm_setzero_si128(), v) + result = mm_unpacklo_epi8(mm_setzero_si128(), result) + proc fillUnsafeSimd*( data: ptr UncheckedArray[ColorRGBX], len: int, @@ -171,6 +190,47 @@ when defined(amd64): c.b = ((c.b.uint32 * c.a) div 255).uint8 copyMem(data[i].addr, c.addr, 4) + proc newImageFromMaskSimd*( + dst: ptr UncheckedArray[ColorRGBX], + src: ptr UncheckedArray[uint8], + len: int + ) = + var i: int + for _ in 0 ..< len div 16: + var alphas = mm_loadu_si128(src[i].addr) + for j in 0 ..< 4: + var unpacked = unpackAlphaValues(alphas) + unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 8)) + unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16)) + mm_storeu_si128(dst[i + j * 4].addr, unpacked) + alphas = mm_srli_si128(alphas, 4) + i += 16 + + for i in i ..< len: + let v = src[i] + dst[i] = rgbx(v, v, v, v) + + proc newMaskFromImageSimd*( + dst: ptr UncheckedArray[uint8], + src: ptr UncheckedArray[ColorRGBX], + len: int + ) = + var i: int + for _ in 0 ..< len div 16: + let + a = mm_loadu_si128(src[i + 0].addr) + b = mm_loadu_si128(src[i + 4].addr) + c = mm_loadu_si128(src[i + 8].addr) + d = mm_loadu_si128(src[i + 12].addr) + mm_storeu_si128( + dst[i].addr, + pack4xAlphaValues(a, b, c, d) + ) + i += 16 + + for i in i ..< len: + dst[i] = src[i].a + proc invertSimd*(data: ptr UncheckedArray[ColorRGBX], len: int) = var i: int let vec255 = mm_set1_epi8(cast[int8](255))