commit 915d1d5246
4 changed files with 160 additions and 43 deletions
@@ -10,7 +10,7 @@ requires "vmath >= 1.1.4"
 requires "chroma >= 0.2.6"
 requires "zippy >= 0.10.3"
 requires "flatty >= 0.3.4"
-requires "nimsimd >= 1.1.9"
+requires "nimsimd >= 1.1.10"
 requires "bumpy >= 1.1.1"
 
 task bindings, "Generate bindings":
@@ -383,6 +383,11 @@ proc minifyBy2Avx2*(image: Image, power = 1): Image {.simd.} =
 proc blitLineNormalAvx2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
+  var i: int
+  while (cast[uint](a[i].addr) and 31) != 0:
+    a[i] = blendNormal(a[i], b[i])
+    inc i
+
   let
     alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
     oddMask = mm256_set1_epi16(cast[int16](0xff00))
@@ -393,8 +398,6 @@ proc blitLineNormalAvx2*(
       15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1,
       15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1
     )
-
-  var i: int
   while i < len - 8:
     let
       source = mm256_loadu_si256(b[i].addr)
@@ -402,7 +405,7 @@ proc blitLineNormalAvx2*(
     if (mm256_movemask_epi8(eq255) and 0x88888888) == 0x88888888: # Opaque source
       mm256_storeu_si256(a[i].addr, source)
     else:
-      let backdrop = mm256_loadu_si256(a[i].addr)
+      let backdrop = mm256_load_si256(a[i].addr)
 
       var
         sourceAlpha = mm256_and_si256(source, alphaMask)
@@ -423,7 +426,7 @@ proc blitLineNormalAvx2*(
         mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
       )
 
-      mm256_storeu_si256(a[i].addr, added)
+      mm256_store_si256(a[i].addr, added)
 
     i += 8
 
@@ -433,6 +436,11 @@ proc blitLineNormalAvx2*(
 proc blitLineMaskAvx2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
+  var i: int
+  while (cast[uint](a[i].addr) and 31) != 0:
+    a[i] = blendMask(a[i], b[i])
+    inc i
+
   let
     alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
     oddMask = mm256_set1_epi16(cast[int16](0xff00))
@@ -442,8 +450,6 @@ proc blitLineMaskAvx2*(
       15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1,
       15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1
     )
-
-  var i: int
   while i < len - 8:
     let
       source = mm256_loadu_si256(b[i].addr)
@@ -451,7 +457,7 @@ proc blitLineMaskAvx2*(
     if (mm256_movemask_epi8(eq255) and 0x88888888) == 0x88888888: # Opaque source
       discard
     else:
-      let backdrop = mm256_loadu_si256(a[i].addr)
+      let backdrop = mm256_load_si256(a[i].addr)
 
       var
         sourceAlpha = mm256_and_si256(source, alphaMask)
@@ -465,7 +471,7 @@ proc blitLineMaskAvx2*(
       backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
      backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)
 
-      mm256_storeu_si256(
+      mm256_store_si256(
         a[i].addr,
         mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
       )
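Annotation (not part of the commit): the new scalar prologue in both procs advances through `a` one pixel at a time until the address is 32-byte aligned, which is what makes the switch from unaligned `mm256_loadu_si256`/`mm256_storeu_si256` to aligned `mm256_load_si256`/`mm256_store_si256` on the destination valid. The opaque fast path tests all eight alphas at once: `mm256_movemask_epi8` gathers the top bit of each of the 32 bytes, and the alpha bytes of eight RGBX pixels sit at offsets 3, 7, 11, ..., giving the 0x88888888 pattern. A minimal scalar model in Nim (the movemask helper is hypothetical, for illustration only):

proc movemask(bytes: array[32, uint8]): uint32 =
  # Collect the top bit of each of the 32 bytes, like mm256_movemask_epi8.
  for j in 0 ..< 32:
    if (bytes[j] and 0x80) != 0:
      result = result or (1'u32 shl j)

var eq255: array[32, uint8]
for j in 0 ..< 32:
  if j mod 4 == 3:
    eq255[j] = 0xff # alpha byte of each RGBX pixel compared equal to 255
# All eight pixels are opaque iff every alpha bit of the mask is set.
assert (movemask(eq255) and 0x88888888'u32) == 0x88888888'u32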
@@ -1,4 +1,4 @@
-import chroma, internal, nimsimd/neon, pixie/common, vmath
+import chroma, internal, nimsimd/neon, pixie/blends, pixie/common, vmath
 
 when defined(release):
   {.push checks: off.}
@@ -58,9 +58,9 @@ proc isOneColorNeon*(image: Image): bool {.simd.} =
       rgEq = vandq_u8(rEq, gEq)
       baEq = vandq_u8(bEq, aEq)
       rgbaEq = vandq_u8(rgEq, baEq)
-      mask =
-        cast[uint64](vget_low_u64(cast[uint64x2](rgbaEq))) and
-        cast[uint64](vget_high_u64(cast[uint64x2](rgbaEq)))
+      mask = vget_lane_u64(cast[uint64x1](
+        vand_u8(vget_low_u8(rgbaEq), vget_high_u8(rgbaEq)
+      )), 0)
     if mask != uint64.high:
       return false
     i += 16
@@ -82,12 +82,16 @@ proc isTransparentNeon*(image: Image): bool {.simd.} =
 
   result = true
 
-  let iterations = (image.data.len - i) div 16
+  let
+    vecZero = vmovq_n_u8(0)
+    iterations = (image.data.len - i) div 16
   for _ in 0 ..< iterations:
     let
       alphas = vld4q_u8(image.data[i].addr).val[3]
-      eq = vceqq_u64(cast[uint64x2](alphas), vmovq_n_u64(0))
-      mask = cast[uint64](vget_low_u64(eq)) and cast[uint64](vget_high_u64(eq))
+      eq = vceqq_u8(alphas, vecZero)
+      mask = vget_lane_u64(cast[uint64x1](
+        vand_u8(vget_low_u8(eq), vget_high_u8(eq)
+      )), 0)
     if mask != uint64.high:
       return false
     i += 16
@@ -109,12 +113,16 @@ proc isOpaqueNeon*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} =
     inc i
     p += 4
 
-  let iterations = (start + len - i) div 16
+  let
+    vec255 = vmovq_n_u8(255)
+    iterations = (start + len - i) div 16
   for _ in 0 ..< iterations:
     let
       alphas = vld4q_u8(data[i].addr).val[3]
-      eq = vceqq_u64(cast[uint64x2](alphas), vmovq_n_u64(uint64.high))
-      mask = cast[uint64](vget_low_u64(eq)) and cast[uint64](vget_high_u64(eq))
+      eq = vceqq_u8(alphas, vec255)
+      mask = vget_lane_u64(cast[uint64x1](
+        vand_u8(vget_low_u8(eq), vget_high_u8(eq)
+      )), 0)
     if mask != uint64.high:
       return false
     i += 16
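Annotation (not part of the commit): the mask rewrite in the three procs above still reduces a 16-byte compare result to one 64-bit value that is all ones only when every byte matched, but now ANDs the low and high halves in a NEON register (`vand_u8`) and reads a single lane, instead of casting out two 64-bit scalars. A scalar sketch of the same reduction in Nim, assuming a 16-byte compare result:

proc allMatched(cmp: array[16, uint8]): bool =
  # AND the two 8-byte halves, then test the combined 64-bit value:
  # it equals uint64.high iff every compared byte was 0xff.
  var halves: array[2, uint64]
  copyMem(halves[0].addr, cmp[0].unsafeAddr, 16)
  (halves[0] and halves[1]) == uint64.high

var cmp: array[16, uint8]
for j in 0 ..< 16: cmp[j] = 0xff
assert allMatched(cmp)
cmp[5] = 0 # a single mismatched byte breaks the reduction
assert not allMatched(cmp)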
@@ -138,19 +146,25 @@ proc toPremultipliedAlphaNeon*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
     inc i
     p += 4
 
-  proc premultiply(c, a: uint8x8): uint8x8 {.inline.} =
+  template multiply(c, a: uint8x8): uint8x8 =
     let ca = vmull_u8(c, a)
     vraddhn_u16(ca, vrshrq_n_u16(ca, 8))
 
-  let iterations = (data.len - i) div 8
+  template multiply(c, a: uint8x16): uint8x16 =
+    vcombine_u8(
+      multiply(vget_low_u8(c), vget_low_u8(a)),
+      multiply(vget_high_u8(c), vget_high_u8(a))
+    )
+
+  let iterations = (data.len - i) div 16
   for _ in 0 ..< iterations:
-    var channels = vld4_u8(cast[pointer](p))
-    channels.val[0] = premultiply(channels.val[0], channels.val[3])
-    channels.val[1] = premultiply(channels.val[1], channels.val[3])
-    channels.val[2] = premultiply(channels.val[2], channels.val[3])
-    vst4_u8(cast[pointer](p), channels)
-    p += 32
-  i += 8 * iterations
+    var channels = vld4q_u8(cast[pointer](p))
+    channels.val[0] = multiply(channels.val[0], channels.val[3])
+    channels.val[1] = multiply(channels.val[1], channels.val[3])
+    channels.val[2] = multiply(channels.val[2], channels.val[3])
+    vst4q_u8(cast[pointer](p), channels)
+    p += 64
+  i += 16 * iterations
 
   for i in i ..< data.len:
     var c = data[i]
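Annotation (not part of the commit): the `multiply` template above divides the 16-bit product by 255 using two NEON instructions: `vrshrq_n_u16(ca, 8)` is a rounding shift right, and `vraddhn_u16` adds it back to `ca` and narrows to the rounded high byte. A scalar model in Nim showing this computes round-to-nearest of ca / 255 exactly:

proc div255(ca: uint32): uint32 =
  # (ca + ((ca + 128) shr 8) + 128) shr 8, the scalar equivalent of
  # vraddhn_u16(ca, vrshrq_n_u16(ca, 8)).
  (ca + ((ca + 128) shr 8) + 128) shr 8

for c in 0'u32 .. 255:
  for a in 0'u32 .. 255:
    let ca = c * a
    assert div255(ca) == (2 * ca + 255) div 510 # round-to-nearest ca / 255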
@@ -211,19 +225,19 @@ proc applyOpacityNeon*(image: Image, opacity: float32) {.simd.} =
     i: int
     p = cast[uint](image.data[0].addr)
 
-  proc apply(c, o: uint8x8): uint8x8 {.inline.} =
-    let co = vmull_u8(c, o)
-    vraddhn_u16(co, vrshrq_n_u16(co, 8))
+  template multiply(c, a: uint8x8): uint8x8 =
+    let ca = vmull_u8(c, a)
+    vraddhn_u16(ca, vrshrq_n_u16(ca, 8))
 
   let
     opacityVec = vmov_n_u8(opacity)
     iterations = image.data.len div 8
   for _ in 0 ..< iterations:
     var channels = vld4_u8(cast[pointer](p))
-    channels.val[0] = apply(channels.val[0], opacityVec)
-    channels.val[1] = apply(channels.val[1], opacityVec)
-    channels.val[2] = apply(channels.val[2], opacityVec)
-    channels.val[3] = apply(channels.val[3], opacityVec)
+    channels.val[0] = multiply(channels.val[0], opacityVec)
+    channels.val[1] = multiply(channels.val[1], opacityVec)
+    channels.val[2] = multiply(channels.val[2], opacityVec)
+    channels.val[3] = multiply(channels.val[3], opacityVec)
     vst4_u8(cast[pointer](p), channels)
     p += 32
   i += 8 * iterations
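A side note (not from the commit itself): here and in toPremultipliedAlphaNeon above, the helper changed from a proc with {.inline.} to a template. A Nim template is expanded at the call site at compile time, so the per-channel multiply never pays call overhead, while {.inline.} on a proc is only a hint to the optimizer. A trivial illustration:

template timesTwo(x: int): int = x * 2 # substituted inline at each use
assert timesTwo(21) == 42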
@@ -400,5 +414,96 @@ proc magnifyBy2Neon*(image: Image, power = 1): Image {.simd.} =
       result.width * 4
     )
 
+proc blitLineNormalNeon*(
+  a, b: ptr UncheckedArray[ColorRGBX], len: int
+) {.simd.} =
+  var i: int
+  while (cast[uint](a[i].addr) and 15) != 0:
+    a[i] = blendNormal(a[i], b[i])
+    inc i
+
+  let vec255 = vmovq_n_u8(255)
+  while i < len - 16:
+    let
+      source = vld4q_u8(b[i].addr)
+      eq255 = vceqq_u8(source.val[3], vec255)
+      mask = vget_lane_u64(cast[uint64x1](
+        vand_u8(vget_low_u8(eq255), vget_high_u8(eq255)
+      )), 0)
+    if mask == uint64.high:
+      vst4q_u8(a[i].addr, source)
+    else:
+      template multiply(c, a: uint8x8): uint8x8 =
+        let ca = vmull_u8(c, a)
+        vraddhn_u16(ca, vrshrq_n_u16(ca, 8))
+
+      template multiply(c, a: uint8x16): uint8x16 =
+        vcombine_u8(
+          multiply(vget_low_u8(c), vget_low_u8(a)),
+          multiply(vget_high_u8(c), vget_high_u8(a))
+        )
+
+      let
+        backdrop = vld4q_u8(a[i].addr)
+        multiplier = vsubq_u8(vec255, source.val[3])
+
+      var blended: uint8x16x4
+      blended.val[0] = multiply(backdrop.val[0], multiplier)
+      blended.val[1] = multiply(backdrop.val[1], multiplier)
+      blended.val[2] = multiply(backdrop.val[2], multiplier)
+      blended.val[3] = multiply(backdrop.val[3], multiplier)
+      blended.val[0] = vaddq_u8(blended.val[0], source.val[0])
+      blended.val[1] = vaddq_u8(blended.val[1], source.val[1])
+      blended.val[2] = vaddq_u8(blended.val[2], source.val[2])
+      blended.val[3] = vaddq_u8(blended.val[3], source.val[3])
+      vst4q_u8(a[i].addr, blended)
+
+    i += 16
+
+  for i in i ..< len:
+    a[i] = blendNormal(a[i], b[i])
+
+proc blitLineMaskNeon*(
+  a, b: ptr UncheckedArray[ColorRGBX], len: int
+) {.simd.} =
+  var i: int
+  while (cast[uint](a[i].addr) and 15) != 0:
+    a[i] = blendMask(a[i], b[i])
+    inc i
+
+  let vec255 = vmovq_n_u8(255)
+  while i < len - 16:
+    let
+      source = vld4q_u8(b[i].addr)
+      eq255 = vceqq_u8(source.val[3], vec255)
+      mask = vget_lane_u64(cast[uint64x1](
+        vand_u8(vget_low_u8(eq255), vget_high_u8(eq255)
+      )), 0)
+    if mask == uint64.high:
+      discard
+    else:
+      template multiply(c, a: uint8x8): uint8x8 =
+        let ca = vmull_u8(c, a)
+        vraddhn_u16(ca, vrshrq_n_u16(ca, 8))
+
+      template multiply(c, a: uint8x16): uint8x16 =
+        vcombine_u8(
+          multiply(vget_low_u8(c), vget_low_u8(a)),
+          multiply(vget_high_u8(c), vget_high_u8(a))
+        )
+
+      let backdrop = vld4q_u8(a[i].addr)
+      var blended: uint8x16x4
+      blended.val[0] = multiply(backdrop.val[0], source.val[3])
+      blended.val[1] = multiply(backdrop.val[1], source.val[3])
+      blended.val[2] = multiply(backdrop.val[2], source.val[3])
+      blended.val[3] = multiply(backdrop.val[3], source.val[3])
+      vst4q_u8(a[i].addr, blended)
+
+    i += 16
+
+  for i in i ..< len:
+    a[i] = blendMask(a[i], b[i])
+
 when defined(release):
   {.pop.}
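Annotation (not part of the commit): the new NEON blit procs process 16 pixels per iteration and vectorize pixie's premultiplied-alpha blends: blendNormal computes source + backdrop * (255 - source.a) / 255 per channel, and blendMask keeps backdrop * source.a / 255. A one-channel scalar sketch of the normal blend (illustrative; div255 models the rounding NEON multiply shown earlier):

proc div255(ca: uint32): uint32 =
  (ca + ((ca + 128) shr 8) + 128) shr 8

proc blendChannel(backdrop, source, sourceAlpha: uint32): uint32 =
  # Backdrop scaled by the source's remaining transparency, plus the source.
  source + div255(backdrop * (255 - sourceAlpha))

assert blendChannel(200, 0, 0) == 200 # fully transparent source keeps backdrop
assert blendChannel(200, 90, 255) == 90 # opaque source replaces backdrop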
@@ -530,14 +530,17 @@ proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
 proc blitLineNormalSse2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
+  var i: int
+  while (cast[uint](a[i].addr) and 15) != 0:
+    a[i] = blendNormal(a[i], b[i])
+    inc i
+
   let
     alphaMask = mm_set1_epi32(cast[int32](0xff000000))
     oddMask = mm_set1_epi16(cast[int16](0xff00))
     div255 = mm_set1_epi16(cast[int16](0x8081))
     vec255 = mm_set1_epi8(255)
     vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))
-
-  var i: int
   while i < len - 4:
     let
       source = mm_loadu_si128(b[i].addr)
@@ -545,7 +548,7 @@ proc blitLineNormalSse2*(
     if (mm_movemask_epi8(eq255) and 0x00008888) == 0x00008888: # Opaque source
       mm_storeu_si128(a[i].addr, source)
     else:
-      let backdrop = mm_loadu_si128(a[i].addr)
+      let backdrop = mm_load_si128(a[i].addr)
 
       var
         sourceAlpha = mm_and_si128(source, alphaMask)
@@ -566,7 +569,7 @@ proc blitLineNormalSse2*(
         mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
       )
 
-      mm_storeu_si128(a[i].addr, added)
+      mm_store_si128(a[i].addr, added)
 
     i += 4
 
@@ -576,13 +579,16 @@
 proc blitLineMaskSse2*(
   a, b: ptr UncheckedArray[ColorRGBX], len: int
 ) {.simd.} =
+  var i: int
+  while (cast[uint](a[i].addr) and 15) != 0:
+    a[i] = blendMask(a[i], b[i])
+    inc i
+
   let
     alphaMask = mm_set1_epi32(cast[int32](0xff000000))
     oddMask = mm_set1_epi16(cast[int16](0xff00))
     div255 = mm_set1_epi16(cast[int16](0x8081))
     vec255 = mm_set1_epi8(255)
-
-  var i: int
   while i < len - 4:
     let
       source = mm_loadu_si128(b[i].addr)
@@ -590,7 +596,7 @@ proc blitLineMaskSse2*(
     if (mm_movemask_epi8(eq255) and 0x00008888) == 0x00008888: # Opaque source
       discard
     else:
-      let backdrop = mm_loadu_si128(a[i].addr)
+      let backdrop = mm_load_si128(a[i].addr)
 
       var
         sourceAlpha = mm_and_si128(source, alphaMask)
@@ -604,7 +610,7 @@ proc blitLineMaskSse2*(
       backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
       backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)
 
-      mm_storeu_si128(
+      mm_store_si128(
         a[i].addr,
         mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
       )
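Annotation (not part of the commit): the `div255 = mm_set1_epi16(cast[int16](0x8081))` constant used in both SSE2 procs (and their AVX2 counterparts) is a reciprocal trick: `mm_mulhi_epu16` yields (x * 0x8081) shr 16, and the following shift by 7 makes the total shift 23, which divides by 255 exactly for every product of two bytes. A quick scalar check in Nim:

# mulhi + srli replaces an integer division in the blend math.
for x in 0 .. 255 * 255:
  assert (x * 0x8081) shr 23 == x div 255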