Merge pull request #482 from treeform/guzba
faster flip vertical & horizontal
This commit is contained in:
commit
46e61a5a85
|
@ -77,23 +77,25 @@ proc isOpaque*(image: Image): bool {.raises: [].} =
|
|||
|
||||
proc flipHorizontal*(image: Image) {.raises: [].} =
  ## Flips the image around the Y axis (mirrors left <-> right) in place.
  ##
  ## Works a row at a time: for each row, walk two indices inward from the
  ## row's first and last pixel, swapping as they go. Using flat data indices
  ## (`left`/`right`) avoids recomputing `dataIndex` per pixel, which is the
  ## point of this implementation over the naive per-(x, y) version.
  let halfWidth = image.width div 2
  for y in 0 ..< image.height:
    var
      # Flat index of the first pixel in this row.
      left = image.dataIndex(0, y)
      # Flat index of the last pixel in this row (rows are contiguous).
      right = left + image.width - 1
    for x in 0 ..< halfWidth:
      swap(image.data[left], image.data[right])
      inc left
      dec right
    # Odd widths leave the middle column untouched, which is correct.
|
||||
|
||||
proc flipVertical*(image: Image) {.raises: [].} =
  ## Flips the image around the X axis (mirrors top <-> bottom) in place.
  ##
  ## For each of the top `height div 2` rows, swap it element-wise with its
  ## mirror row. Computing the two row start indices once per row (instead of
  ## calling `dataIndex` for every pixel) keeps the inner loop to plain
  ## index arithmetic.
  let halfHeight = image.height div 2
  for y in 0 ..< halfHeight:
    let
      # Flat index of the first pixel of the top row.
      topStart = image.dataIndex(0, y)
      # Flat index of the first pixel of the mirrored bottom row.
      bottomStart = image.dataIndex(0, image.height - y - 1)
    for x in 0 ..< image.width:
      swap(image.data[topStart + x], image.data[bottomStart + x])
    # Odd heights leave the middle row untouched, which is correct.
|
||||
|
||||
proc rotate90*(image: Image) {.raises: [PixieError].} =
|
||||
## Rotates the image 90 degrees clockwise.
|
||||
|
|
|
@ -110,7 +110,7 @@ proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
|
|||
oddMask = mm256_set1_epi16(0xff00)
|
||||
vec128 = mm256_set1_epi16(128)
|
||||
hiMask = mm256_set1_epi16(255 shl 8)
|
||||
iterations = data.len div 8
|
||||
iterations = (data.len - i) div 8
|
||||
for _ in 0 ..< iterations:
|
||||
let
|
||||
values = mm256_load_si256(cast[pointer](p))
|
||||
|
@ -163,7 +163,7 @@ proc invertAvx2*(image: Image) {.simd.} =
|
|||
|
||||
let
|
||||
vec255 = mm256_set1_epi8(255)
|
||||
iterations = image.data.len div 16
|
||||
iterations = (image.data.len - i) div 16
|
||||
for _ in 0 ..< iterations:
|
||||
let
|
||||
a = mm256_load_si256(cast[pointer](p))
|
||||
|
@ -211,7 +211,7 @@ proc applyOpacityAvx2*(image: Image, opacity: float32) {.simd.} =
|
|||
div255 = mm256_set1_epi16(0x8081)
|
||||
zeroVec = mm256_setzero_si256()
|
||||
opacityVec = mm256_slli_epi16(mm256_set1_epi16(opacity), 8)
|
||||
iterations = image.data.len div 8
|
||||
iterations = (image.data.len - i) div 8
|
||||
for _ in 0 ..< iterations:
|
||||
let
|
||||
values = mm256_load_si256(cast[pointer](p))
|
||||
|
@ -257,7 +257,7 @@ proc ceilAvx2*(image: Image) {.simd.} =
|
|||
let
|
||||
vecZero = mm256_setzero_si256()
|
||||
vec255 = mm256_set1_epi8(255)
|
||||
iterations = image.data.len div 8
|
||||
iterations = (image.data.len - i) div 8
|
||||
for _ in 0 ..< iterations:
|
||||
var values = mm256_load_si256(cast[pointer](p))
|
||||
values = mm256_cmpeq_epi8(values, vecZero)
|
||||
|
@ -330,9 +330,8 @@ proc minifyBy2Avx2*(image: Image, power = 1): Image {.simd.} =
|
|||
addedOddDiv4 = mm256_srli_epi16(addedOdd, 2)
|
||||
merged = mm256_or_si256(addedEvenDiv4, mm256_slli_epi16(addedOddDiv4, 8))
|
||||
# Merged has the correct values for the next two pixels at
|
||||
# index 0, 2, 4, 6 so mask the others out and permute into position
|
||||
masked = mm256_and_si256(merged, mergedMask)
|
||||
permuted = mm_256_permutevar8x32_epi32(masked, permuteControl)
|
||||
# index 0, 2, 4, 6 so permute into position and store
|
||||
permuted = mm_256_permutevar8x32_epi32(merged, permuteControl)
|
||||
mm_storeu_si128(
|
||||
result.data[result.dataIndex(x, y)].addr,
|
||||
mm256_castsi256_si128(permuted)
|
||||
|
|
|
@ -236,5 +236,30 @@ proc applyOpacityNeon*(image: Image, opacity: float32) {.simd.} =
|
|||
rgbx.a = ((rgbx.a * opacity) div 255).uint8
|
||||
image.data[i] = rgbx
|
||||
|
||||
proc ceilNeon*(image: Image) {.simd.} =
  ## NEON-accelerated "ceil": every nonzero channel byte becomes 255 and
  ## zero bytes stay 0, for all of r, g, b and a.
  ##
  ## Processes 4 RGBX pixels (16 bytes) per iteration with NEON, then
  ## finishes any remaining pixels with a scalar loop.
  var
    i: int                                  # count of pixels handled by SIMD
    p = cast[uint](image.data[0].addr)      # raw byte pointer into pixel data
  let
    zeroVec = vmovq_n_u8(0)
    vec255 = vmovq_n_u8(255)
    iterations = image.data.len div 4
  for _ in 0 ..< iterations:
    var values = vld1q_u8(cast[pointer](p))
    # Lanes equal to 0 become 0xff, others 0x00.
    values = vceqq_u8(values, zeroVec)
    # 255 AND NOT mask: zero lanes stay 0, nonzero lanes become 255.
    values = vbicq_u8(vec255, values)
    vst1q_u8(cast[pointer](p), values)
    p += 16
  i += 4 * iterations

  # Scalar tail for the trailing (len mod 4) pixels.
  for i in i ..< image.data.len:
    var rgbx = image.data[i]
    rgbx.r = if rgbx.r == 0: 0 else: 255
    rgbx.g = if rgbx.g == 0: 0 else: 255
    rgbx.b = if rgbx.b == 0: 0 else: 255
    rgbx.a = if rgbx.a == 0: 0 else: 255
    image.data[i] = rgbx
|
||||
|
||||
when defined(release):
|
||||
{.pop.}
|
||||
|
|
|
@ -212,7 +212,7 @@ proc invertSse2*(image: Image) {.simd.} =
|
|||
|
||||
let
|
||||
vec255 = mm_set1_epi8(255)
|
||||
iterations = image.data.len div 16
|
||||
iterations = (image.data.len - i) div 16
|
||||
for _ in 0 ..< iterations:
|
||||
let
|
||||
a = mm_load_si128(cast[pointer](p))
|
||||
|
@ -264,7 +264,7 @@ proc applyOpacitySse2*(image: Image, opacity: float32) {.simd.} =
|
|||
div255 = mm_set1_epi16(0x8081)
|
||||
zeroVec = mm_setzero_si128()
|
||||
opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
|
||||
iterations = image.data.len div 4
|
||||
iterations = (image.data.len - i) div 4
|
||||
for _ in 0 ..< iterations:
|
||||
let values = mm_loadu_si128(cast[pointer](p))
|
||||
if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
|
||||
|
@ -308,7 +308,7 @@ proc ceilSse2*(image: Image) {.simd.} =
|
|||
let
|
||||
vecZero = mm_setzero_si128()
|
||||
vec255 = mm_set1_epi8(255)
|
||||
iterations = image.data.len div 8
|
||||
iterations = (image.data.len - i) div 8
|
||||
for _ in 0 ..< iterations:
|
||||
var
|
||||
values0 = mm_loadu_si128(cast[pointer](p))
|
||||
|
@ -383,13 +383,10 @@ proc minifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
|
|||
addedOddDiv4 = mm_srli_epi16(addedOdd, 2)
|
||||
merged = mm_or_si128(addedEvenDiv4, mm_slli_epi16(addedOddDiv4, 8))
|
||||
# Merged has the correct values for the next two pixels at
|
||||
# index 0 and 2 so mask the others out and shift 0 and 2 into
|
||||
# position and store
|
||||
masked = mm_and_si128(merged, mergedMask)
|
||||
mm_storeu_si128(
|
||||
result.data[result.dataIndex(x, y)].addr,
|
||||
mm_shuffle_epi32(masked, MM_SHUFFLE(3, 3, 2, 0))
|
||||
)
|
||||
# index 0 and 2 so shift 0 and 2 into position and store
|
||||
shuffled = mm_shuffle_epi32(merged, MM_SHUFFLE(3, 3, 2, 0))
|
||||
lower = mm_cvtsi128_si64(shuffled)
|
||||
copyMem(result.data[result.dataIndex(x, y)].addr, lower.unsafeAddr, 8)
|
||||
x += 2
|
||||
|
||||
for x in x ..< resultEvenWidth:
|
||||
|
|
Loading…
Reference in a new issue