From 82c7c8b8643078b5b48416e665ce7d7b04631724 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Wed, 22 Jun 2022 01:03:32 -0500 Subject: [PATCH] faster --- src/pixie/internal.nim | 16 ++++++++-------- src/pixie/paths.nim | 42 ++++++++++++++++++++---------------------- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index d850a36..f9b255d 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -52,7 +52,7 @@ proc `*`*(color: ColorRGBX, opacity: float32): ColorRGBX {.raises: [].} = proc fillUnsafe*( data: var seq[uint8], value: uint8, start, len: int -) {.raises: [].} = +) {.inline, raises: [].} = ## Fills the mask data with the value starting at index start and ## continuing for len indices. nimSetMem(data[start].addr, value.cint, len) @@ -62,9 +62,7 @@ proc fillUnsafe*( ) {.raises: [].} = ## Fills the image data with the color starting at index start and ## continuing for len indices. - let rgbx = color.asRgbx() - # Use memset when every byte has the same value if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a: nimSetMem(data[start].addr, rgbx.r.cint, len * 4) @@ -78,11 +76,13 @@ proc fillUnsafe*( # When supported, SIMD fill until we run out of room let colorVec = mm_set1_epi32(cast[int32](rgbx)) - remaining = start + len - i - for _ in 0 ..< remaining div 8: - mm_store_si128(data[i + 0].addr, colorVec) - mm_store_si128(data[i + 4].addr, colorVec) - i += 8 + iterations = (start + len - i) div 8 + var p = cast[uint](data[i].addr) + for _ in 0 ..< iterations: + mm_store_si128(cast[pointer](p), colorVec) + mm_store_si128(cast[pointer](p + 16), colorVec) + p += 32 + i += iterations * 8 else: when sizeof(int) == 8: # Fill 8 bytes at a time when possible diff --git a/src/pixie/paths.nim b/src/pixie/paths.nim index 1a2c90e..22afc69 100644 --- a/src/pixie/paths.nim +++ b/src/pixie/paths.nim @@ -1133,9 +1133,9 @@ proc partitionSegments( result.partitionHeight = height.uint32 div numPartitions for (segment, winding) in segments: - let entry = initPartitionEntry(segment, winding) + var entry = initPartitionEntry(segment, winding) if result.partitionHeight == 0: - result.partitions[0].entries.add(entry) + result.partitions[0].entries.add(move entry) else: var atPartition = max(0, segment.at.y - result.startY.float32).uint32 @@ -1619,16 +1619,15 @@ proc fillHits( template simdBlob(image: Image, x: var int, len: int, blendProc: untyped) = when allowSimd: when defined(amd64): - let colorVec = mm_set1_epi32(cast[int32](rgbx)) - var dataIndex = image.dataIndex(x, y) - for _ in 0 ..< len div 4: - let backdrop = mm_loadu_si128(image.data[dataIndex].addr) - mm_storeu_si128( - image.data[dataIndex].addr, - blendProc(backdrop, colorVec) - ) - x += 4 - dataIndex += 4 + var p = cast[uint](image.data[image.dataIndex(x, y)].addr) + let + iterations = len div 4 + colorVec = mm_set1_epi32(cast[int32](rgbx)) + for _ in 0 ..< iterations: + let backdrop = mm_loadu_si128(cast[pointer](p)) + mm_storeu_si128(cast[pointer](p), blendProc(backdrop, colorVec)) + p += 16 + x += iterations * 4 case blendMode: of OverwriteBlend: @@ -1714,16 +1713,15 @@ proc fillHits( template simdBlob(mask: Mask, x: var int, len: int, blendProc: untyped) = when allowSimd: when defined(amd64): - let vec255 = mm_set1_epi8(255) - var dataIndex = mask.dataIndex(x, y) - for _ in 0 ..< len div 16: - let backdrop = mm_loadu_si128(mask.data[dataIndex].addr) - mm_storeu_si128( - mask.data[dataIndex].addr, - blendProc(backdrop, vec255) - ) - x += 16 - dataIndex += 16 + var p = cast[uint](mask.data[mask.dataIndex(x, y)].addr) + let + iterations = len div 16 + vec255 = mm_set1_epi8(255) + for _ in 0 ..< iterations: + let backdrop = mm_loadu_si128(cast[pointer](p)) + mm_storeu_si128(cast[pointer](p), blendProc(backdrop, vec255)) + p += 16 + x += iterations * 16 case blendMode: of NormalBlend, OverwriteBlend: