Merge pull request #487 from treeform/guzba

paths.nim simd re-organize
This commit is contained in:
Andre von Houck 2022-08-01 10:43:46 -07:00 committed by GitHub
commit f41d53b66c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 448 additions and 371 deletions

View file

@ -76,6 +76,19 @@ proc `*`*(color: ColorRGBX, opacity: float32): ColorRGBX {.raises: [].} =
a = ((color.a * x + 127) div 255).uint8
rgbx(r, g, b, a)
proc `*`*(rgbx: ColorRGBX, opacity: uint8): ColorRGBX {.inline.} =
  ## Scales all four channels of `rgbx` by `opacity / 255`.
  ## 127 is added before the integer division so the quotient is rounded
  ## instead of truncated.
  case opacity
  of 0:
    discard # `result` keeps its zero-initialized value
  of 255:
    result = rgbx # nothing to scale
  else:
    template scaled(channel: uint8): uint8 =
      ((channel.uint32 * opacity + 127) div 255).uint8
    result = rgbx(
      scaled(rgbx.r),
      scaled(rgbx.g),
      scaled(rgbx.b),
      scaled(rgbx.a)
    )
proc snapToPixels*(rect: Rect): Rect {.raises: [].} =
let
xMin = rect.x

View file

@ -1,6 +1,6 @@
import blends, bumpy, chroma, common, internal, simd, vmath
export Image, newImage, copy, dataIndex
export Image, copy, dataIndex, newImage
const h = 0.5.float32
@ -436,27 +436,26 @@ proc drawCorrect(
blended = blender(backdrop, sample)
a.unsafe[x, y] = blended
template getUncheckedArray(
image: Image, x, y: int
): ptr UncheckedArray[ColorRGBX] =
cast[ptr UncheckedArray[ColorRGBX]](image.data[image.dataIndex(x, y)].addr)
proc blitLine(a, b: ptr UncheckedArray[ColorRGBX], len: int, blender: Blender) {.inline.} =
proc blendLine(
a, b: ptr UncheckedArray[ColorRGBX], len: int, blender: Blender
) {.inline.} =
for i in 0 ..< len:
a[i] = blender(a[i], b[i])
proc blitLineOverwrite(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.inline.} =
proc blendLineOverwrite(
a, b: ptr UncheckedArray[ColorRGBX], len: int
) {.inline.} =
copyMem(a[0].addr, b[0].addr, len * 4)
proc blitLineNormal(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} =
proc blendLineNormal(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} =
for i in 0 ..< len:
a[i] = blendNormal(a[i], b[i])
proc blitLineMask(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} =
proc blendLineMask(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} =
for i in 0 ..< len:
a[i] = blendMask(a[i], b[i])
proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) =
proc blendRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) =
let
px = pos.x.int
py = pos.y.int
@ -475,14 +474,14 @@ proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) =
case blendMode:
of NormalBlend:
for y in yStart ..< yEnd:
blitLineNormal(
blendLineNormal(
a.getUncheckedArray(xStart + px, y + py),
b.getUncheckedArray(xStart, y),
xEnd - xStart
)
of OverwriteBlend:
for y in yStart ..< yEnd:
blitLineOverwrite(
blendLineOverwrite(
a.getUncheckedArray(xStart + px, y + py),
b.getUncheckedArray(xStart, y),
xEnd - xStart
@ -494,7 +493,7 @@ proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) =
for y in yStart ..< yEnd:
if xStart + px > 0:
zeroMem(a.data[a.dataIndex(0, y + py)].addr, (xStart + px) * 4)
blitLineMask(
blendLineMask(
a.getUncheckedArray(xStart + px, y + py),
b.getUncheckedArray(xStart, y),
xEnd - xStart
@ -512,7 +511,7 @@ proc blitRect(a, b: Image, pos: Ivec2, blendMode: BlendMode) =
else:
let blender = blendMode.blender()
for y in yStart ..< yEnd:
blitLine(
blendLine(
a.getUncheckedArray(xStart + px, y + py),
b.getUncheckedArray(xStart, y),
xEnd - xStart,
@ -560,7 +559,7 @@ proc draw*(
if hasRotationOrScaling or smooth:
a.drawCorrect(b, inverseTransform.inverse(), blendMode, false)
else:
a.blitRect(b, ivec2(transform[2, 0].int32, transform[2, 1].int32), blendMode)
a.blendRect(b, ivec2(transform[2, 0].int32, transform[2, 1].int32), blendMode)
proc drawTiled*(
dst, src: Image, mat: Mat3, blendMode = NormalBlend

View file

@ -47,6 +47,11 @@ proc intersectsInside*(a, b: Segment, at: var Vec2): bool {.inline.} =
at = a.at + (t * s1)
return true
template getUncheckedArray*(
image: Image, x, y: int
): ptr UncheckedArray[ColorRGBX] =
## Yields a `ptr UncheckedArray[ColorRGBX]` that starts at the element of
## `image.data` addressed by `image.dataIndex(x, y)`.
## The element lookup `image.data[...]` itself is an ordinary indexing
## expression; accesses made through the returned pointer are unchecked.
cast[ptr UncheckedArray[ColorRGBX]](image.data[image.dataIndex(x, y)].addr)
proc fillUnsafe*(
data: var seq[ColorRGBX], color: SomeColor, start, len: int
) {.hasSimd, raises: [].} =

View file

@ -1429,6 +1429,43 @@ proc clearUnsafe(image: Image, startX, startY, toX, toY: int) =
len = image.dataIndex(toX, toY) - start
fillUnsafe(image.data, rgbx(0, 0, 0, 0), start, len)
proc blendLineCoverageOverwrite(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.hasSimd.} =
  ## Walks `len` pixels of `line`. Every pixel whose coverage byte is non-zero
  ## is replaced by `rgbx * coverage`; pixels with coverage 0 are not written.
  var px = 0
  while px < len:
    if coverages[px] != 0:
      line[px] = rgbx * coverages[px]
    inc px
proc blendLineCoverageNormal(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.hasSimd.} =
  ## Walks `len` pixels of `line`. Every pixel whose coverage byte is non-zero
  ## becomes `blendNormal(pixel, rgbx * coverage)`; pixels with coverage 0 are
  ## not written.
  for px in 0 ..< len:
    let amount = coverages[px]
    if amount != 0:
      line[px] = blendNormal(line[px], rgbx * amount)
proc blendLineCoverageMask(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.hasSimd.} =
  ## Walks `len` pixels of `line` and replaces each one with
  ## `blendMask(pixel, rgbx * coverage)`.
  ##
  ## A pixel is skipped only when the mask is a guaranteed no-op, i.e. the
  ## coverage is 255 AND `rgbx` is fully opaque. Skipping on coverage alone
  ## would ignore a translucent `rgbx` on fully covered pixels; the vector path
  ## of `blendLineCoverageMaskSse2` applies the same `rgbx.a == 255` condition
  ## before skipping.
  for i in 0 ..< len:
    let coverage = coverages[i]
    if coverage == 255 and rgbx.a == 255:
      discard
    else:
      line[i] = blendMask(line[i], rgbx * coverage)
proc fillCoverage(
image: Image,
rgbx: ColorRGBX,
@ -1440,181 +1477,56 @@ proc fillCoverage(
x = startX
dataIndex = image.dataIndex(x, y)
when allowSimd:
when defined(amd64):
iterator simd(
coverages: seq[uint8], x: var int, startX: int
): (M128i, bool, bool) =
for _ in 0 ..< coverages.len div 16:
let
coverageVec = mm_loadu_si128(coverages[x - startX].unsafeAddr)
eqZero = mm_cmpeq_epi8(coverageVec, mm_setzero_si128())
eq255 = mm_cmpeq_epi8(coverageVec, mm_set1_epi8(255))
allZeroes = mm_movemask_epi8(eqZero) == 0xffff
all255 = mm_movemask_epi8(eq255) == 0xffff
yield (coverageVec, allZeroes, all255)
x += 16
proc source(colorVec, coverageVec: M128i): M128i {.inline.} =
let
oddMask = mm_set1_epi16(0xff00)
div255 = mm_set1_epi16(0x8081)
var unpacked = unpackAlphaValues(coverageVec)
unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
var
sourceEven = mm_slli_epi16(colorVec, 8)
sourceOdd = mm_and_si128(colorVec, oddMask)
sourceEven = mm_mulhi_epu16(sourceEven, unpacked)
sourceOdd = mm_mulhi_epu16(sourceOdd, unpacked)
sourceEven = mm_srli_epi16(mm_mulhi_epu16(sourceEven, div255), 7)
sourceOdd = mm_srli_epi16(mm_mulhi_epu16(sourceOdd, div255), 7)
result = mm_or_si128(sourceEven, mm_slli_epi16(sourceOdd, 8))
let colorVec = mm_set1_epi32(cast[int32](rgbx))
proc source(rgbx: ColorRGBX, coverage: uint8): ColorRGBX {.inline.} =
if coverage == 0:
discard
elif coverage == 255:
result = rgbx
else:
result = rgbx(
((rgbx.r.uint32 * coverage) div 255).uint8,
((rgbx.g.uint32 * coverage) div 255).uint8,
((rgbx.b.uint32 * coverage) div 255).uint8,
((rgbx.a.uint32 * coverage) div 255).uint8
)
case blendMode:
of OverwriteBlend:
when allowSimd:
when defined(amd64):
for (coverageVec, allZeroes, all255) in simd(coverages, x, startX):
if allZeroes:
dataIndex += 16
else:
if all255:
for i in 0 ..< 4:
mm_storeu_si128(image.data[dataIndex].addr, colorVec)
dataIndex += 4
else:
var coverageVec = coverageVec
for i in 0 ..< 4:
let source = source(colorVec, coverageVec)
mm_storeu_si128(image.data[dataIndex].addr, source)
coverageVec = mm_srli_si128(coverageVec, 4)
dataIndex += 4
for x in x ..< startX + coverages.len:
let coverage = coverages[x - startX]
if coverage != 0:
image.data[dataIndex] = source(rgbx, coverage)
inc dataIndex
blendLineCoverageOverwrite(
image.getUncheckedArray(startX, y),
cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr),
rgbx,
coverages.len
)
of NormalBlend:
when allowSimd:
when defined(amd64):
for (coverageVec, allZeroes, all255) in simd(coverages, x, startX):
if allZeroes:
dataIndex += 16
else:
if all255 and rgbx.a == 255:
for i in 0 ..< 4:
mm_storeu_si128(image.data[dataIndex].addr, colorVec)
dataIndex += 4
else:
var coverageVec = coverageVec
for i in 0 ..< 4:
let
backdrop = mm_loadu_si128(image.data[dataIndex].addr)
source = source(colorVec, coverageVec)
mm_storeu_si128(
image.data[dataIndex].addr,
blendNormalSimd(backdrop, source)
)
coverageVec = mm_srli_si128(coverageVec, 4)
dataIndex += 4
for x in x ..< startX + coverages.len:
let coverage = coverages[x - startX]
if coverage == 255 and rgbx.a == 255:
image.data[dataIndex] = rgbx
elif coverage == 0:
discard
else:
let backdrop = image.data[dataIndex]
image.data[dataIndex] = blendNormal(backdrop, source(rgbx, coverage))
inc dataIndex
blendLineCoverageNormal(
image.getUncheckedArray(startX, y),
cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr),
rgbx,
coverages.len
)
of MaskBlend:
{.linearScanEnd.}
when allowSimd:
when defined(amd64):
for (coverageVec, allZeroes, all255) in simd(coverages, x, startX):
if not allZeroes:
if all255:
dataIndex += 16
else:
var coverageVec = coverageVec
for i in 0 ..< 4:
let
backdrop = mm_loadu_si128(image.data[dataIndex].addr)
source = source(colorVec, coverageVec)
mm_storeu_si128(
image.data[dataIndex].addr,
blendMaskSimd(backdrop, source)
)
coverageVec = mm_srli_si128(coverageVec, 4)
dataIndex += 4
else:
for i in 0 ..< 4:
mm_storeu_si128(image.data[dataIndex].addr, mm_setzero_si128())
dataIndex += 4
for x in x ..< startX + coverages.len:
let coverage = coverages[x - startX]
if coverage == 0:
image.data[dataIndex] = rgbx(0, 0, 0, 0)
elif coverage == 255:
discard
else:
let backdrop = image.data[dataIndex]
image.data[dataIndex] = blendMask(backdrop, source(rgbx, coverage))
inc dataIndex
blendLineCoverageMask(
image.getUncheckedArray(startX, y),
cast[ptr UncheckedArray[uint8]](coverages[0].unsafeAddr),
rgbx,
coverages.len
)
image.clearUnsafe(0, y, startX, y)
image.clearUnsafe(startX + coverages.len, y, image.width, y)
of SubtractMaskBlend:
for x in x ..< startX + coverages.len:
let coverage = coverages[x - startX]
if coverage == 255 and rgbx.a == 255:
image.data[dataIndex] = rgbx(0, 0, 0, 0)
elif coverage != 0:
let backdrop = image.data[dataIndex]
image.data[dataIndex] = blendSubtractMask(backdrop, source(rgbx, coverage))
inc dataIndex
of ExcludeMaskBlend:
for x in x ..< startX + coverages.len:
let
coverage = coverages[x - startX]
backdrop = image.data[dataIndex]
image.data[dataIndex] = blendExcludeMask(backdrop, source(rgbx, coverage))
inc dataIndex
else:
let blender = blendMode.blender()
for x in x ..< startX + coverages.len:
let coverage = coverages[x - startX]
if coverage != 0:
let backdrop = image.data[dataIndex]
image.data[dataIndex] = blender(backdrop, source(rgbx, coverage))
image.data[dataIndex] = blender(backdrop, rgbx * coverage)
inc dataIndex
proc blendLineNormal(
  line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
) {.hasSimd.} =
  ## Replaces each of the first `len` pixels of `line` with
  ## `blendNormal(pixel, rgbx)`.
  var px = 0
  while px < len:
    line[px] = blendNormal(line[px], rgbx)
    inc px
proc blendLineMask(
  line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
) {.hasSimd.} =
  ## Replaces each of the first `len` pixels of `line` with
  ## `blendMask(pixel, rgbx)`.
  var px = 0
  while px < len:
    line[px] = blendMask(line[px], rgbx)
    inc px
proc fillHits(
image: Image,
rgbx: ColorRGBX,
@ -1625,19 +1537,6 @@ proc fillHits(
blendMode: BlendMode,
maskClears = true
) =
template simdBlob(image: Image, x: var int, len: int, blendProc: untyped) =
when allowSimd:
when defined(amd64):
var p = cast[uint](image.data[image.dataIndex(x, y)].addr)
let
iterations = len div 4
colorVec = mm_set1_epi32(cast[int32](rgbx))
for _ in 0 ..< iterations:
let backdrop = mm_loadu_si128(cast[pointer](p))
mm_storeu_si128(cast[pointer](p), blendProc(backdrop, colorVec))
p += 16
x += iterations * 4
case blendMode:
of OverwriteBlend:
for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width):
@ -1648,17 +1547,10 @@ proc fillHits(
if rgbx.a == 255:
fillUnsafe(image.data, rgbx, image.dataIndex(start, y), len)
else:
var x = start
simdBlob(image, x, len, blendNormalSimd)
var dataIndex = image.dataIndex(x, y)
for _ in x ..< start + len:
let backdrop = image.data[dataIndex]
image.data[dataIndex] = blendNormal(backdrop, rgbx)
inc dataIndex
blendLineNormal(image.getUncheckedArray(start, y), rgbx, len)
of MaskBlend:
{.linearScanEnd.}
var filledTo = startX
for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width):
if maskClears: # Clear any gap between this fill and the previous fill
@ -1672,37 +1564,13 @@ proc fillHits(
)
block: # Handle this fill
if rgbx.a != 255:
var x = start
simdBlob(image, x, len, blendMaskSimd)
var dataIndex = image.dataIndex(x, y)
for _ in x ..< start + len:
let backdrop = image.data[dataIndex]
image.data[dataIndex] = blendMask(backdrop, rgbx)
blendLineMask(image.getUncheckedArray(start, y), rgbx, len)
filledTo = start + len
if maskClears:
image.clearUnsafe(0, y, startX, y)
image.clearUnsafe(filledTo, y, image.width, y)
of SubtractMaskBlend:
for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width):
var dataIndex = image.dataIndex(start, y)
for _ in 0 ..< len:
if rgbx.a == 255:
image.data[dataIndex] = rgbx(0, 0, 0, 0)
else:
let backdrop = image.data[dataIndex]
image.data[dataIndex] = blendSubtractMask(backdrop, rgbx)
inc dataIndex
of ExcludeMaskBlend:
for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width):
var dataIndex = image.dataIndex(start, y)
for _ in 0 ..< len:
let backdrop = image.data[dataIndex]
image.data[dataIndex] = blendExcludeMask(backdrop, rgbx)
inc dataIndex
else:
let blender = blendMode.blender()
for (start, len) in hits.walkInteger(numHits, windingRule, y, image.width):

View file

@ -6,6 +6,41 @@ when defined(gcc) or defined(clang):
when defined(release):
{.push checks: off.}
template blendNormalSimd(backdrop, source: M256i): M256i =
## 256-bit blend step: evaluates to `source` plus a scaled copy of `backdrop`,
## where the scale factor is `vecAlpha255 - sourceAlpha` per lane.
##
## The template does not declare `alphaMask`, `oddMask`, `shuffleControl`,
## `vecAlpha255` or `div255`; they must be in scope at the expansion site
## (the AVX2 line kernels in this file define them right before their loops).
var
# Isolate the bytes selected by `alphaMask`.
sourceAlpha = mm256_and_si256(source, alphaMask)
# Work on the two bytes of every 16-bit lane separately, each one placed in
# the high byte so `mulhi` below keeps the useful half of the product.
backdropEven = mm256_slli_epi16(backdrop, 8)
backdropOdd = mm256_and_si256(backdrop, oddMask)
# Redistribute the isolated bytes according to `shuffleControl`.
sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha)
backdropEven = mm256_mulhi_epu16(backdropEven, multiplier)
backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier)
# NOTE(review): mulhi by `div255` followed by `>> 7` looks like the usual
# divide-by-255 approximation (callers pass 0x8081) - confirm.
backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
# Re-interleave the two halves and add them to the source, byte-wise.
mm256_add_epi8(
source,
mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
)
template blendMaskSimd(backdrop, source: M256i): M256i =
## 256-bit mask step: evaluates to `backdrop` scaled per lane by the bytes of
## `source` selected through `alphaMask`; `source`'s other bytes are unused.
##
## The template does not declare `alphaMask`, `oddMask`, `shuffleControl` or
## `div255`; they must be in scope at the expansion site.
var
# Isolate the bytes selected by `alphaMask`.
sourceAlpha = mm256_and_si256(source, alphaMask)
# Split every 16-bit lane of the backdrop into its two bytes, each held in
# the high byte of a 16-bit lane.
backdropEven = mm256_slli_epi16(backdrop, 8)
backdropOdd = mm256_and_si256(backdrop, oddMask)
# Redistribute the isolated bytes according to `shuffleControl`.
sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha)
backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha)
# NOTE(review): mulhi by `div255` followed by `>> 7` looks like the usual
# divide-by-255 approximation (callers pass 0x8081) - confirm.
backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
# Re-interleave the two halves into the final bytes.
mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
proc isOneColorAvx2*(image: Image): bool {.simd.} =
result = true
@ -380,11 +415,37 @@ proc minifyBy2Avx2*(image: Image, power = 1): Image {.simd.} =
# Set src as this result for if we do another power
src = result
proc blitLineNormalAvx2*(
proc blendLineNormalAvx2*(
  line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
) {.simd.} =
  ## AVX2 kernel that applies `blendNormal` with the single color `rgbx` to
  ## `len` pixels of `line`, 8 pixels per vector iteration.
  var pos: int

  # Scalar prologue: step forward until `line[pos]` is 32-byte aligned, which
  # the aligned load/store in the vector loop needs.
  while pos < len and (cast[uint](line[pos].addr) and 31) != 0:
    line[pos] = blendNormal(line[pos], rgbx)
    inc pos

  # Apart from `fill`, these names are read by `blendNormalSimd` at its
  # expansion site, so they must keep exactly these identifiers.
  let
    alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
    oddMask = mm256_set1_epi16(cast[int16](0xff00))
    div255 = mm256_set1_epi16(cast[int16](0x8081))
    vecAlpha255 = mm256_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))
    shuffleControl = mm256_set_epi8(
      15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1,
      15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1
    )
    fill = mm256_set1_epi32(cast[uint32](rgbx))

  while pos < len - 8:
    let dst = mm256_load_si256(line[pos].addr)
    mm256_store_si256(line[pos].addr, blendNormalSimd(dst, fill))
    pos += 8

  # Scalar epilogue for the pixels the vector loop did not reach.
  while pos < len:
    line[pos] = blendNormal(line[pos], rgbx)
    inc pos
proc blendLineNormalAvx2*(
a, b: ptr UncheckedArray[ColorRGBX], len: int
) {.simd.} =
var i: int
while (cast[uint](a[i].addr) and 31) != 0:
while i < len and (cast[uint](a[i].addr) and 31) != 0:
a[i] = blendNormal(a[i], b[i])
inc i
@ -403,41 +464,45 @@ proc blitLineNormalAvx2*(
source = mm256_loadu_si256(b[i].addr)
eq255 = mm256_cmpeq_epi8(source, vec255)
if (mm256_movemask_epi8(eq255) and 0x88888888) == 0x88888888: # Opaque source
mm256_storeu_si256(a[i].addr, source)
mm256_store_si256(a[i].addr, source)
else:
let backdrop = mm256_load_si256(a[i].addr)
var
sourceAlpha = mm256_and_si256(source, alphaMask)
backdropEven = mm256_slli_epi16(backdrop, 8)
backdropOdd = mm256_and_si256(backdrop, oddMask)
sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
let multiplier = mm256_sub_epi32(vecAlpha255, sourceAlpha)
backdropEven = mm256_mulhi_epu16(backdropEven, multiplier)
backdropOdd = mm256_mulhi_epu16(backdropOdd, multiplier)
backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
let added = mm256_add_epi8(
source,
mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
)
mm256_store_si256(a[i].addr, added)
mm256_store_si256(a[i].addr, blendNormalSimd(backdrop, source))
i += 8
for i in i ..< len:
a[i] = blendNormal(a[i], b[i])
proc blitLineMaskAvx2*(
proc blendLineMaskAvx2*(
  line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
) {.simd.} =
  ## AVX2 kernel that applies `blendMask` with the single color `rgbx` to
  ## `len` pixels of `line`, 8 pixels per vector iteration.
  var pos: int

  # Scalar prologue: step forward until `line[pos]` is 32-byte aligned, which
  # the aligned load/store in the vector loop needs.
  while pos < len and (cast[uint](line[pos].addr) and 31) != 0:
    line[pos] = blendMask(line[pos], rgbx)
    inc pos

  # Apart from `fill`, these names are read by `blendMaskSimd` at its
  # expansion site, so they must keep exactly these identifiers.
  let
    alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
    oddMask = mm256_set1_epi16(cast[int16](0xff00))
    div255 = mm256_set1_epi16(cast[int16](0x8081))
    shuffleControl = mm256_set_epi8(
      15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1,
      15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1
    )
    fill = mm256_set1_epi32(cast[uint32](rgbx))

  while pos < len - 8:
    let dst = mm256_load_si256(line[pos].addr)
    mm256_store_si256(line[pos].addr, blendMaskSimd(dst, fill))
    pos += 8

  # Scalar epilogue for the pixels the vector loop did not reach.
  while pos < len:
    line[pos] = blendMask(line[pos], rgbx)
    inc pos
proc blendLineMaskAvx2*(
a, b: ptr UncheckedArray[ColorRGBX], len: int
) {.simd.} =
var i: int
while (cast[uint](a[i].addr) and 31) != 0:
while i < len and (cast[uint](a[i].addr) and 31) != 0:
a[i] = blendMask(a[i], b[i])
inc i
@ -458,24 +523,7 @@ proc blitLineMaskAvx2*(
discard
else:
let backdrop = mm256_load_si256(a[i].addr)
var
sourceAlpha = mm256_and_si256(source, alphaMask)
backdropEven = mm256_slli_epi16(backdrop, 8)
backdropOdd = mm256_and_si256(backdrop, oddMask)
sourceAlpha = mm256_shuffle_epi8(sourceAlpha, shuffleControl)
backdropEven = mm256_mulhi_epu16(backdropEven, sourceAlpha)
backdropOdd = mm256_mulhi_epu16(backdropOdd, sourceAlpha)
backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
mm256_store_si256(
a[i].addr,
mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
)
mm256_store_si256(a[i].addr, blendMaskSimd(backdrop, source))
i += 8
for i in i ..< len:

View file

@ -414,7 +414,7 @@ proc magnifyBy2Neon*(image: Image, power = 1): Image {.simd.} =
result.width * 4
)
proc blitLineNormalNeon*(
proc blendLineNormalNeon*(
a, b: ptr UncheckedArray[ColorRGBX], len: int
) {.simd.} =
var i: int
@ -463,7 +463,7 @@ proc blitLineNormalNeon*(
for i in i ..< len:
a[i] = blendNormal(a[i], b[i])
proc blitLineMaskNeon*(
proc blendLineMaskNeon*(
a, b: ptr UncheckedArray[ColorRGBX], len: int
) {.simd.} =
var i: int

View file

@ -10,17 +10,7 @@ proc applyOpacity*(color: M128, opacity: float32): ColorRGBX {.inline.} =
finalColor = mm_packus_epi16(finalColor, mm_setzero_si128())
cast[ColorRGBX](mm_cvtsi128_si32(finalColor))
proc unpackAlphaValues*(v: M128i): M128i {.inline, raises: [].} =
## Unpack the first 32 bits into 4 rgba(0, 0, 0, value).
result = mm_unpacklo_epi8(mm_setzero_si128(), v)
result = mm_unpacklo_epi8(mm_setzero_si128(), result)
proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} =
let
alphaMask = mm_set1_epi32(cast[int32](0xff000000))
oddMask = mm_set1_epi16(cast[int16](0xff00))
div255 = mm_set1_epi16(cast[int16](0x8081))
template blendNormalSimd*(backdrop, source: M128i): M128i =
var
sourceAlpha = mm_and_si128(source, alphaMask)
backdropEven = mm_slli_epi16(backdrop, 8)
@ -28,14 +18,10 @@ proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} =
sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
let k = mm_sub_epi32(
mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])),
sourceAlpha
)
backdropEven = mm_mulhi_epu16(backdropEven, k)
backdropOdd = mm_mulhi_epu16(backdropOdd, k)
let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha)
backdropEven = mm_mulhi_epu16(backdropEven, multiplier)
backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier)
backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
@ -44,12 +30,7 @@ proc blendNormalSimd*(backdrop, source: M128i): M128i {.inline.} =
mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
)
proc blendMaskSimd*(backdrop, source: M128i): M128i {.inline.} =
let
alphaMask = mm_set1_epi32(cast[int32](0xff000000))
oddMask = mm_set1_epi16(cast[int16](0xff00))
div255 = mm_set1_epi16(cast[int16](0x8081))
template blendMaskSimd*(backdrop, source: M128i): M128i =
var
sourceAlpha = mm_and_si128(source, alphaMask)
backdropEven = mm_slli_epi16(backdrop, 8)
@ -59,7 +40,6 @@ proc blendMaskSimd*(backdrop, source: M128i): M128i {.inline.} =
backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha)
backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha)
backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
@ -325,7 +305,7 @@ proc applyOpacitySse2*(image: Image, opacity: float32) {.simd.} =
valuesOdd = mm_mulhi_epu16(valuesOdd, opacityVec)
valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7)
valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7)
mm_storeu_si128(
mm_store_si128(
cast[pointer](p),
mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8))
)
@ -367,8 +347,8 @@ proc ceilSse2*(image: Image) {.simd.} =
values1 = mm_cmpeq_epi8(values1, vecZero)
values0 = mm_andnot_si128(values0, vec255)
values1 = mm_andnot_si128(values1, vec255)
mm_storeu_si128(cast[pointer](p), values0)
mm_storeu_si128(cast[pointer](p + 16), values1)
mm_store_si128(cast[pointer](p), values0)
mm_store_si128(cast[pointer](p + 16), values1)
p += 32
i += 8 * iterations
@ -527,11 +507,91 @@ proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
result.width * 4
)
proc blitLineNormalSse2*(
template applyCoverage*(rgbxVec, coverage: M128i): M128i =
## Scales the bytes of `rgbxVec` by the first 4 coverage bytes of `coverage`
## (one coverage byte per 32-bit lane) and evaluates to the scaled vector.
##
## The template does not declare `oddMask` or `div255`; both must be in scope
## at the expansion site.
# Unpack the first 4 coverage bytes: two rounds of interleaving with zero move
# each byte into the top byte of its own 32-bit lane ...
var unpacked = mm_unpacklo_epi8(mm_setzero_si128(), coverage)
unpacked = mm_unpacklo_epi8(mm_setzero_si128(), unpacked)
# ... and this copies it into the high byte of the lower 16-bit half as well.
unpacked = mm_or_si128(unpacked, mm_srli_epi32(unpacked, 16))
var
# The two bytes of every 16-bit lane, each held in the high byte.
rgbxEven = mm_slli_epi16(rgbxVec, 8)
rgbxOdd = mm_and_si128(rgbxVec, oddMask)
rgbxEven = mm_mulhi_epu16(rgbxEven, unpacked)
rgbxOdd = mm_mulhi_epu16(rgbxOdd, unpacked)
# NOTE(review): mulhi by `div255` followed by `>> 7` looks like the usual
# divide-by-255 approximation (callers pass 0x8081) - confirm.
rgbxEven = mm_srli_epi16(mm_mulhi_epu16(rgbxEven, div255), 7)
rgbxOdd = mm_srli_epi16(mm_mulhi_epu16(rgbxOdd, div255), 7)
# Re-interleave the two halves into the final bytes.
mm_or_si128(rgbxEven, mm_slli_epi16(rgbxOdd, 8))
proc blendLineCoverageOverwriteSse2*(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.simd.} =
  ## SSE2 kernel that overwrites pixels of `line` with `rgbx` scaled by the
  ## matching coverage byte, handling 16 coverage bytes per vector iteration.
  ##
  ## NOTE(review): the scalar prologue/epilogue leave a pixel untouched when
  ## its coverage is 0, but a 16-byte chunk that mixes zero and non-zero
  ## coverage stores `rgbx * 0` for the zero entries. Confirm this difference
  ## is intended.
  var pos: int

  # Scalar prologue until `line[pos]` is 16-byte aligned for `mm_store_si128`.
  while pos < len and (cast[uint](line[pos].addr) and 15) != 0:
    if coverages[pos] != 0:
      line[pos] = rgbx * coverages[pos]
    inc pos

  # `oddMask` and `div255` are read by `applyCoverage` at its expansion site.
  let
    vecZero = mm_setzero_si128()
    vec255 = mm_set1_epi8(255)
    oddMask = mm_set1_epi16(0xff00)
    div255 = mm_set1_epi16(0x8081)
    rgbxVec = mm_set1_epi32(cast[uint32](rgbx))

  while pos < len - 16:
    let
      chunk = mm_loadu_si128(coverages[pos].addr)
      zeroBits = mm_movemask_epi8(mm_cmpeq_epi8(chunk, vecZero))
      fullBits = mm_movemask_epi8(mm_cmpeq_epi8(chunk, vec255))
    if zeroBits == 0xffff:
      # All 16 coverages are 0: skip the pixels.
      pos += 16
    elif fullBits == 0xffff:
      # All 16 coverages are 255: plain fill, 4 pixels per store.
      for _ in 0 ..< 4:
        mm_store_si128(line[pos].addr, rgbxVec)
        pos += 4
    else:
      # Mixed chunk: consume 4 coverage bytes per store, then shift the next
      # 4 into place.
      var remaining = chunk
      for _ in 0 ..< 4:
        mm_store_si128(line[pos].addr, rgbxVec.applyCoverage(remaining))
        remaining = mm_srli_si128(remaining, 4)
        pos += 4

  # Scalar epilogue for the pixels the vector loop did not reach.
  while pos < len:
    if coverages[pos] != 0:
      line[pos] = rgbx * coverages[pos]
    inc pos
proc blendLineNormalSse2*(
  line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
) {.simd.} =
  ## SSE2 kernel that applies `blendNormal` with the single color `rgbx` to
  ## `len` pixels of `line`, 4 pixels per vector iteration.
  var pos: int

  # Scalar prologue until `line[pos]` is 16-byte aligned for the aligned
  # load/store below.
  while pos < len and (cast[uint](line[pos].addr) and 15) != 0:
    line[pos] = blendNormal(line[pos], rgbx)
    inc pos

  # Apart from `fill`, these names are read by `blendNormalSimd` at its
  # expansion site, so they must keep exactly these identifiers.
  let
    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
    oddMask = mm_set1_epi16(cast[int16](0xff00))
    div255 = mm_set1_epi16(cast[int16](0x8081))
    vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))
    fill = mm_set1_epi32(cast[uint32](rgbx))

  while pos < len - 4:
    let dst = mm_load_si128(line[pos].addr)
    mm_store_si128(line[pos].addr, blendNormalSimd(dst, fill))
    pos += 4

  # Scalar epilogue for the pixels the vector loop did not reach.
  while pos < len:
    line[pos] = blendNormal(line[pos], rgbx)
    inc pos
proc blendLineNormalSse2*(
a, b: ptr UncheckedArray[ColorRGBX], len: int
) {.simd.} =
var i: int
while (cast[uint](a[i].addr) and 15) != 0:
while i < len and (cast[uint](a[i].addr) and 15) != 0:
a[i] = blendNormal(a[i], b[i])
inc i
@ -546,41 +606,92 @@ proc blitLineNormalSse2*(
source = mm_loadu_si128(b[i].addr)
eq255 = mm_cmpeq_epi8(source, vec255)
if (mm_movemask_epi8(eq255) and 0x00008888) == 0x00008888: # Opaque source
mm_storeu_si128(a[i].addr, source)
mm_store_si128(a[i].addr, source)
else:
let backdrop = mm_load_si128(a[i].addr)
var
sourceAlpha = mm_and_si128(source, alphaMask)
backdropEven = mm_slli_epi16(backdrop, 8)
backdropOdd = mm_and_si128(backdrop, oddMask)
sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
let multiplier = mm_sub_epi32(vecAlpha255, sourceAlpha)
backdropEven = mm_mulhi_epu16(backdropEven, multiplier)
backdropOdd = mm_mulhi_epu16(backdropOdd, multiplier)
backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
let added = mm_add_epi8(
source,
mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
)
mm_store_si128(a[i].addr, added)
mm_store_si128(a[i].addr, blendNormalSimd(backdrop, source))
i += 4
for i in i ..< len:
a[i] = blendNormal(a[i], b[i])
proc blitLineMaskSse2*(
proc blendLineCoverageNormalSse2*(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.simd.} =
  ## SSE2 kernel that applies `blendNormal` with `rgbx` scaled by the matching
  ## coverage byte, handling 16 coverage bytes per vector iteration.
  var pos: int

  # Scalar prologue until `line[pos]` is 16-byte aligned for `mm_store_si128`.
  while pos < len and (cast[uint](line[pos].addr) and 15) != 0:
    let amount = coverages[pos]
    if amount != 0:
      line[pos] = blendNormal(line[pos], rgbx * amount)
    inc pos

  # `alphaMask`, `oddMask`, `div255` and `vecAlpha255` are read by the
  # `blendNormalSimd` / `applyCoverage` templates at their expansion sites.
  let
    vecZero = mm_setzero_si128()
    vec255 = mm_set1_epi8(255)
    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
    oddMask = mm_set1_epi16(cast[int16](0xff00))
    div255 = mm_set1_epi16(cast[int16](0x8081))
    vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))
    rgbxVec = mm_set1_epi32(cast[uint32](rgbx))

  while pos < len - 16:
    let
      chunk = mm_loadu_si128(coverages[pos].addr)
      zeroBits = mm_movemask_epi8(mm_cmpeq_epi8(chunk, vecZero))
      fullBits = mm_movemask_epi8(mm_cmpeq_epi8(chunk, vec255))
    if zeroBits == 0xffff:
      # All 16 coverages are 0: skip the pixels.
      pos += 16
    elif fullBits == 0xffff and rgbx.a == 255:
      # Full coverage with an opaque color: plain fill, 4 pixels per store.
      for _ in 0 ..< 4:
        mm_store_si128(line[pos].addr, rgbxVec)
        pos += 4
    else:
      # General case: consume 4 coverage bytes per blend, then shift the
      # next 4 into place.
      var remaining = chunk
      for _ in 0 ..< 4:
        let
          dst = mm_loadu_si128(line[pos].addr)
          src = rgbxVec.applyCoverage(remaining)
        mm_store_si128(line[pos].addr, blendNormalSimd(dst, src))
        remaining = mm_srli_si128(remaining, 4)
        pos += 4

  # Scalar epilogue for the pixels the vector loop did not reach.
  while pos < len:
    let amount = coverages[pos]
    if amount != 0:
      line[pos] = blendNormal(line[pos], rgbx * amount)
    inc pos
proc blendLineMaskSse2*(
  line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
) {.simd.} =
  ## SSE2 kernel that applies `blendMask` with the single color `rgbx` to
  ## `len` pixels of `line`, 4 pixels per vector iteration.
  var pos: int

  # Scalar prologue until `line[pos]` is 16-byte aligned for the aligned
  # load/store below.
  while pos < len and (cast[uint](line[pos].addr) and 15) != 0:
    line[pos] = blendMask(line[pos], rgbx)
    inc pos

  # Apart from `fill`, these names are read by `blendMaskSimd` at its
  # expansion site, so they must keep exactly these identifiers.
  let
    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
    oddMask = mm_set1_epi16(cast[int16](0xff00))
    div255 = mm_set1_epi16(cast[int16](0x8081))
    fill = mm_set1_epi32(cast[uint32](rgbx))

  while pos < len - 4:
    let dst = mm_load_si128(line[pos].addr)
    mm_store_si128(line[pos].addr, blendMaskSimd(dst, fill))
    pos += 4

  # Scalar epilogue for the pixels the vector loop did not reach.
  while pos < len:
    line[pos] = blendMask(line[pos], rgbx)
    inc pos
proc blendLineMaskSse2*(
a, b: ptr UncheckedArray[ColorRGBX], len: int
) {.simd.} =
var i: int
while (cast[uint](a[i].addr) and 15) != 0:
while i < len and (cast[uint](a[i].addr) and 15) != 0:
a[i] = blendMask(a[i], b[i])
inc i
@ -597,28 +708,65 @@ proc blitLineMaskSse2*(
discard
else:
let backdrop = mm_load_si128(a[i].addr)
var
sourceAlpha = mm_and_si128(source, alphaMask)
backdropEven = mm_slli_epi16(backdrop, 8)
backdropOdd = mm_and_si128(backdrop, oddMask)
sourceAlpha = mm_or_si128(sourceAlpha, mm_srli_epi32(sourceAlpha, 16))
backdropEven = mm_mulhi_epu16(backdropEven, sourceAlpha)
backdropOdd = mm_mulhi_epu16(backdropOdd, sourceAlpha)
backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
mm_store_si128(
a[i].addr,
mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
)
mm_store_si128(a[i].addr, blendMaskSimd(backdrop, source))
i += 4
for i in i ..< len:
a[i] = blendMask(a[i], b[i])
proc blendLineCoverageMaskSse2*(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.simd.} =
  ## SSE2 kernel that masks `len` pixels of `line` with `rgbx` scaled by the
  ## matching coverage byte, handling 16 coverage bytes per vector iteration.
  ##
  ## Per pixel:
  ## * coverage 0 clears the pixel,
  ## * coverage 255 with an opaque `rgbx` leaves the pixel as it is,
  ## * anything else becomes `blendMask(pixel, rgbx * coverage)`.
  ##
  ## The scalar prologue/epilogue use the same `rgbx.a == 255` condition as
  ## the vector loop before skipping a fully covered pixel. Without it, a
  ## translucent `rgbx` would be applied or ignored depending on whether the
  ## pixel happened to fall in a scalar or a vector section.
  var i: int

  # Scalar prologue until `line[i]` is 16-byte aligned for `mm_store_si128`.
  while i < len and (cast[uint](line[i].addr) and 15) != 0:
    let coverage = coverages[i]
    if coverage == 0:
      line[i] = rgbx(0, 0, 0, 0)
    elif coverage == 255 and rgbx.a == 255:
      discard
    else:
      line[i] = blendMask(line[i], rgbx * coverage)
    inc i

  # `alphaMask`, `oddMask` and `div255` are read by the `blendMaskSimd` /
  # `applyCoverage` templates at their expansion sites.
  let
    rgbxVec = mm_set1_epi32(cast[uint32](rgbx))
    vecZero = mm_setzero_si128()
    vec255 = mm_set1_epi8(255)
    alphaMask = mm_set1_epi32(cast[int32](0xff000000))
    oddMask = mm_set1_epi16(cast[int16](0xff00))
    div255 = mm_set1_epi16(cast[int16](0x8081))

  while i < len - 16:
    let
      coverage = mm_loadu_si128(coverages[i].addr)
      eqZero = mm_cmpeq_epi8(coverage, vecZero)
      eq255 = mm_cmpeq_epi8(coverage, vec255)
    if mm_movemask_epi8(eqZero) == 0xffff:
      # All 16 coverages are 0: clear the pixels, 4 per store.
      for _ in 0 ..< 4:
        mm_store_si128(line[i].addr, vecZero)
        i += 4
    elif mm_movemask_epi8(eq255) == 0xffff and rgbx.a == 255:
      # Full coverage with an opaque color: the mask changes nothing.
      i += 16
    else:
      # General case: consume 4 coverage bytes per blend, then shift the
      # next 4 into place.
      var coverage = coverage
      for _ in 0 ..< 4:
        let
          backdrop = mm_loadu_si128(line[i].addr)
          source = rgbxVec.applyCoverage(coverage)
        mm_store_si128(line[i].addr, blendMaskSimd(backdrop, source))
        coverage = mm_srli_si128(coverage, 4)
        i += 4

  # Scalar epilogue for the pixels the vector loop did not reach.
  for i in i ..< len:
    let coverage = coverages[i]
    if coverage == 0:
      line[i] = rgbx(0, 0, 0, 0)
    elif coverage == 255 and rgbx.a == 255:
      discard
    else:
      line[i] = blendMask(line[i], rgbx * coverage)
when defined(release):
{.pop.}

View file

@ -5,8 +5,7 @@ const text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis in q
var font = readFont("tests/fonts/Roboto-Regular_1.ttf")
font.size = 16
let
image = newImage(500, 300)
let image = newImage(500, 300)
timeIt "typeset":
discard font.typeset(text, bounds = vec2(image.width.float32, 0))

View file

@ -25,3 +25,31 @@ for i in 0 ..< 250:
a.draw(b, translate(vec2(translation.x.trunc, translation.y.trunc)))
a.draw(b, translate(translation))
for i in 0 ..< 25:
let a = newImage(rand(1 .. 20), rand(1 .. 20))
for j in 0 ..< 25:
let b = newImage(rand(1 .. 20), rand(1 .. 20))
let
translation = vec2(rand(25.0), rand(25.0)) - vec2(5, 5)
rotation = rand(2 * PI).float32
echo a, " ", b, " ", translation, " ", rotation
a.draw(b, translate(vec2(translation.x, translation.y)))
a.draw(b, translate(translation) * rotate(rotation))
for i in 0 ..< 25:
let a = newImage(rand(1 .. 2000), rand(1 .. 2000))
for j in 0 ..< 25:
let b = newImage(rand(1 .. 1000), rand(1 .. 1000))
let
translation = vec2(rand(2500.0), rand(2500.0)) - vec2(500, 500)
rotation = rand(2 * PI).float32
echo a, " ", b, " ", translation, " ", rotation
a.draw(b, translate(vec2(translation.x, translation.y)))
a.draw(b, translate(translation) * rotate(rotation))

View file

@ -1,31 +0,0 @@
import pixie, random
randomize()
for i in 0 ..< 25:
let a = newImage(rand(1 .. 20), rand(1 .. 20))
for j in 0 ..< 25:
let b = newImage(rand(1 .. 20), rand(1 .. 20))
let
translation = vec2(rand(25.0), rand(25.0)) - vec2(5, 5)
rotation = rand(2 * PI).float32
echo a, " ", b, " ", translation, " ", rotation
a.draw(b, translate(vec2(translation.x.trunc, translation.y.trunc)))
a.draw(b, translate(translation) * rotate(rotation))
for i in 0 ..< 25:
let a = newImage(rand(1 .. 2000), rand(1 .. 2000))
for j in 0 ..< 25:
let b = newImage(rand(1 .. 1000), rand(1 .. 1000))
let
translation = vec2(rand(2500.0), rand(2500.0)) - vec2(500, 500)
rotation = rand(2 * PI).float32
echo a, " ", b, " ", translation, " ", rotation
a.draw(b, translate(vec2(translation.x.trunc, translation.y.trunc)))
a.draw(b, translate(translation) * rotate(rotation))