From 39e37c357acaf636f352351bd34be00c7c1ded0c Mon Sep 17 00:00:00 2001
From: Ryan Oldenburg
Date: Mon, 13 Dec 2021 14:56:48 -0600
Subject: [PATCH] mask magnifyBy2 simd + copyMem

Mask.magnifyBy2 now writes each source row once (SSE2 fast path when
scale == 2, scalar loop for the remaining columns) and then copyMem's
that row into the other (scale - 1) result rows, mirroring the
structure of Image.magnifyBy2.

Review fixes folded into this patch (masks.nim only):
- hi was computed with mm_unpacklo_epi8, so values 8..15 of every
  16-value block were replaced by a second copy of values 0..7. It now
  uses mm_unpackhi_epi8.
- The scalar loop still treated y as a result row (source row
  y div scale, destination row y). y now iterates mask rows, so the
  loop reads mask row y and writes result row y * scale.
- The row copy length was result.width * 4 bytes, but mask values are
  one byte each, so the copy ran past the row (and past the buffer on
  the last rows). It is now result.width.
---
Note: the blob hashes on the index lines are carried over from the
original patch and do not reflect the review fixes above.

 src/pixie/images.nim      | 22 +++++++++++-----------
 src/pixie/masks.nim       | 37 +++++++++++++++++++++++++++++++++----
 tests/benchmark_masks.nim |  6 ++++++
 3 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index af92375..2cb808e 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -394,26 +394,26 @@ proc magnifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} =
     var x: int
     when defined(amd64) and not defined(pixieNoSimd):
       if scale == 2:
-        let mask = cast[M128i]([uint32.high, 0, 0, 0])
-        for _ in countup(0, image.width - 4, 2):
+        while x <= image.width - 4:
           let
             values = mm_loadu_si128(image.data[image.dataIndex(x, y)].addr)
-            first = mm_and_si128(values, mask)
-            second = mm_and_si128(mm_srli_si128(values, 4), mask)
-            combined = mm_or_si128(first, mm_slli_si128(second, 8))
-            doubled = mm_or_si128(combined, mm_slli_si128(combined, 4))
+            lo = mm_unpacklo_epi32(values, mm_setzero_si128())
+            hi = mm_unpackhi_epi32(values, mm_setzero_si128())
           mm_storeu_si128(
-            result.data[result.dataIndex(x * scale, y * scale)].addr,
-            doubled
+            result.data[result.dataIndex(x * scale + 0, y * scale)].addr,
+            mm_or_si128(lo, mm_slli_si128(lo, 4))
           )
-          x += 2
-    for _ in x ..< image.width:
+          mm_storeu_si128(
+            result.data[result.dataIndex(x * scale + 4, y * scale)].addr,
+            mm_or_si128(hi, mm_slli_si128(hi, 4))
+          )
+          x += 4
+    for x in x ..< image.width:
       let
         rgbx = image.unsafe[x, y]
         resultIdx = result.dataIndex(x * scale, y * scale)
       for i in 0 ..< scale:
         result.data[resultIdx + i] = rgbx
-      inc x
     # Copy that row of pixels into (scale - 1) more rows
     let rowStart = result.dataIndex(0, y * scale)
     for i in 1 ..< scale:
diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim
index affa6e3..7643a21 100644
--- a/src/pixie/masks.nim
+++ b/src/pixie/masks.nim
@@ -166,14 +166,43 @@ proc magnifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} =
 
   let scale = 2 ^ power
   result = newMask(mask.width * scale, mask.height * scale)
-  for y in 0 ..< result.height:
-    for x in 0 ..< mask.width:
+
+  for y in 0 ..< mask.height:
+    # Write one row of values duplicated by scale
+    var x: int
+    when defined(amd64) and not defined(pixieNoSimd):
+      if scale == 2:
+        while x <= mask.width - 16:
+          # lo/hi interleave the low/high 8 values with zero bytes; OR-ing
+          # with a 1-byte left shift then duplicates each value in place
+          let
+            values = mm_loadu_si128(mask.data[mask.dataIndex(x, y)].addr)
+            lo = mm_unpacklo_epi8(values, mm_setzero_si128())
+            hi = mm_unpackhi_epi8(values, mm_setzero_si128())
+          mm_storeu_si128(
+            result.data[result.dataIndex(x * scale + 0, y * scale)].addr,
+            mm_or_si128(lo, mm_slli_si128(lo, 1))
+          )
+          mm_storeu_si128(
+            result.data[result.dataIndex(x * scale + 16, y * scale)].addr,
+            mm_or_si128(hi, mm_slli_si128(hi, 1))
+          )
+          x += 16
+    for x in x ..< mask.width:
       let
-        value = mask.unsafe[x, y div scale]
+        value = mask.unsafe[x, y]
         scaledX = x * scale
-        idx = result.dataIndex(scaledX, y)
+        idx = result.dataIndex(scaledX, y * scale)
       for i in 0 ..< scale:
         result.data[idx + i] = value
+    # Copy that row of values into (scale - 1) more rows
+    let rowStart = result.dataIndex(0, y * scale)
+    for i in 1 ..< scale:
+      copyMem(
+        result.data[rowStart + result.width * i].addr,
+        result.data[rowStart].addr,
+        result.width # mask values are 1 byte each
+      )
 
 proc fillUnsafe*(
   data: var seq[uint8], value: uint8, start, len: int
diff --git a/tests/benchmark_masks.nim b/tests/benchmark_masks.nim
index 2effded..cd7a527 100644
--- a/tests/benchmark_masks.nim
+++ b/tests/benchmark_masks.nim
@@ -13,6 +13,12 @@ timeIt "minifyBy2":
 
 reset()
 
+timeIt "magnifyBy2":
+  let magnified = mask.magnifyBy2()
+  doAssert magnified[0, 0] == 63
+
+reset()
+
 timeIt "invert":
   mask.invert()
 