faster
This commit is contained in:
parent
34158f3f28
commit
82c7c8b864
2 changed files with 28 additions and 30 deletions
|
@ -52,7 +52,7 @@ proc `*`*(color: ColorRGBX, opacity: float32): ColorRGBX {.raises: [].} =
|
||||||
|
|
||||||
proc fillUnsafe*(
|
proc fillUnsafe*(
|
||||||
data: var seq[uint8], value: uint8, start, len: int
|
data: var seq[uint8], value: uint8, start, len: int
|
||||||
) {.raises: [].} =
|
) {.inline, raises: [].} =
|
||||||
## Fills the mask data with the value starting at index start and
|
## Fills the mask data with the value starting at index start and
|
||||||
## continuing for len indices.
|
## continuing for len indices.
|
||||||
nimSetMem(data[start].addr, value.cint, len)
|
nimSetMem(data[start].addr, value.cint, len)
|
||||||
|
@ -62,9 +62,7 @@ proc fillUnsafe*(
|
||||||
) {.raises: [].} =
|
) {.raises: [].} =
|
||||||
## Fills the image data with the color starting at index start and
|
## Fills the image data with the color starting at index start and
|
||||||
## continuing for len indices.
|
## continuing for len indices.
|
||||||
|
|
||||||
let rgbx = color.asRgbx()
|
let rgbx = color.asRgbx()
|
||||||
|
|
||||||
# Use memset when every byte has the same value
|
# Use memset when every byte has the same value
|
||||||
if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
|
if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
|
||||||
nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
|
nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
|
||||||
|
@ -78,11 +76,13 @@ proc fillUnsafe*(
|
||||||
# When supported, SIMD fill until we run out of room
|
# When supported, SIMD fill until we run out of room
|
||||||
let
|
let
|
||||||
colorVec = mm_set1_epi32(cast[int32](rgbx))
|
colorVec = mm_set1_epi32(cast[int32](rgbx))
|
||||||
remaining = start + len - i
|
iterations = (start + len - i) div 8
|
||||||
for _ in 0 ..< remaining div 8:
|
var p = cast[uint](data[i].addr)
|
||||||
mm_store_si128(data[i + 0].addr, colorVec)
|
for _ in 0 ..< iterations:
|
||||||
mm_store_si128(data[i + 4].addr, colorVec)
|
mm_store_si128(cast[pointer](p), colorVec)
|
||||||
i += 8
|
mm_store_si128(cast[pointer](p + 16), colorVec)
|
||||||
|
p += 32
|
||||||
|
i += iterations * 8
|
||||||
else:
|
else:
|
||||||
when sizeof(int) == 8:
|
when sizeof(int) == 8:
|
||||||
# Fill 8 bytes at a time when possible
|
# Fill 8 bytes at a time when possible
|
||||||
|
|
|
@ -1133,9 +1133,9 @@ proc partitionSegments(
|
||||||
result.partitionHeight = height.uint32 div numPartitions
|
result.partitionHeight = height.uint32 div numPartitions
|
||||||
|
|
||||||
for (segment, winding) in segments:
|
for (segment, winding) in segments:
|
||||||
let entry = initPartitionEntry(segment, winding)
|
var entry = initPartitionEntry(segment, winding)
|
||||||
if result.partitionHeight == 0:
|
if result.partitionHeight == 0:
|
||||||
result.partitions[0].entries.add(entry)
|
result.partitions[0].entries.add(move entry)
|
||||||
else:
|
else:
|
||||||
var
|
var
|
||||||
atPartition = max(0, segment.at.y - result.startY.float32).uint32
|
atPartition = max(0, segment.at.y - result.startY.float32).uint32
|
||||||
|
@ -1619,16 +1619,15 @@ proc fillHits(
|
||||||
template simdBlob(image: Image, x: var int, len: int, blendProc: untyped) =
|
template simdBlob(image: Image, x: var int, len: int, blendProc: untyped) =
|
||||||
when allowSimd:
|
when allowSimd:
|
||||||
when defined(amd64):
|
when defined(amd64):
|
||||||
let colorVec = mm_set1_epi32(cast[int32](rgbx))
|
var p = cast[uint](image.data[image.dataIndex(x, y)].addr)
|
||||||
var dataIndex = image.dataIndex(x, y)
|
let
|
||||||
for _ in 0 ..< len div 4:
|
iterations = len div 4
|
||||||
let backdrop = mm_loadu_si128(image.data[dataIndex].addr)
|
colorVec = mm_set1_epi32(cast[int32](rgbx))
|
||||||
mm_storeu_si128(
|
for _ in 0 ..< iterations:
|
||||||
image.data[dataIndex].addr,
|
let backdrop = mm_loadu_si128(cast[pointer](p))
|
||||||
blendProc(backdrop, colorVec)
|
mm_storeu_si128(cast[pointer](p), blendProc(backdrop, colorVec))
|
||||||
)
|
p += 16
|
||||||
x += 4
|
x += iterations * 4
|
||||||
dataIndex += 4
|
|
||||||
|
|
||||||
case blendMode:
|
case blendMode:
|
||||||
of OverwriteBlend:
|
of OverwriteBlend:
|
||||||
|
@ -1714,16 +1713,15 @@ proc fillHits(
|
||||||
template simdBlob(mask: Mask, x: var int, len: int, blendProc: untyped) =
|
template simdBlob(mask: Mask, x: var int, len: int, blendProc: untyped) =
|
||||||
when allowSimd:
|
when allowSimd:
|
||||||
when defined(amd64):
|
when defined(amd64):
|
||||||
let vec255 = mm_set1_epi8(255)
|
var p = cast[uint](mask.data[mask.dataIndex(x, y)].addr)
|
||||||
var dataIndex = mask.dataIndex(x, y)
|
let
|
||||||
for _ in 0 ..< len div 16:
|
iterations = len div 16
|
||||||
let backdrop = mm_loadu_si128(mask.data[dataIndex].addr)
|
vec255 = mm_set1_epi8(255)
|
||||||
mm_storeu_si128(
|
for _ in 0 ..< iterations:
|
||||||
mask.data[dataIndex].addr,
|
let backdrop = mm_loadu_si128(cast[pointer](p))
|
||||||
blendProc(backdrop, vec255)
|
mm_storeu_si128(cast[pointer](p), blendProc(backdrop, vec255))
|
||||||
)
|
p += 16
|
||||||
x += 16
|
x += iterations * 16
|
||||||
dataIndex += 16
|
|
||||||
|
|
||||||
case blendMode:
|
case blendMode:
|
||||||
of NormalBlend, OverwriteBlend:
|
of NormalBlend, OverwriteBlend:
|
||||||
|
|
Loading…
Reference in a new issue