faster

2022-07-27 00:25:23 -05:00 · 2022-07-27 00:25:23 -05:00 · 2d39091c44
commit 2d39091c44
parent bf15e44b4f
2 changed files with 4 additions and 7 deletions
--- a/src/pixie/simd/avx2.nim
+++ b/src/pixie/simd/avx2.nim
@@ -330,9 +330,8 @@ proc minifyBy2Avx2*(image: Image, power = 1): Image {.simd.} =
          addedOddDiv4 = mm256_srli_epi16(addedOdd, 2)
          merged = mm256_or_si256(addedEvenDiv4, mm256_slli_epi16(addedOddDiv4, 8))
          # Merged has the correct values for the next two pixels at
-          # index 0, 2, 4, 6 so mask the others out and permute into position
-          masked = mm256_and_si256(merged, mergedMask)
-          permuted = mm_256_permutevar8x32_epi32(masked, permuteControl)
+          # index 0, 2, 4, 6 so permute into position and store
+          permuted = mm_256_permutevar8x32_epi32(merged, permuteControl)
        mm_storeu_si128(
          result.data[result.dataIndex(x, y)].addr,
          mm256_castsi256_si128(permuted)
--- a/src/pixie/simd/sse2.nim
+++ b/src/pixie/simd/sse2.nim
@@ -383,10 +383,8 @@ proc minifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
          addedOddDiv4 = mm_srli_epi16(addedOdd, 2)
          merged = mm_or_si128(addedEvenDiv4, mm_slli_epi16(addedOddDiv4, 8))
          # Merged has the correct values for the next two pixels at
-          # index 0 and 2 so mask the others out and shift 0 and 2 into
-          # position and store
-          masked = mm_and_si128(merged, mergedMask)
-          shuffled = mm_shuffle_epi32(masked, MM_SHUFFLE(3, 3, 2, 0))
+          # index 0 and 2 so shift 0 and 2 into position and store
+          shuffled = mm_shuffle_epi32(merged, MM_SHUFFLE(3, 3, 2, 0))
          lower = mm_cvtsi128_si64(shuffled)
        copyMem(result.data[result.dataIndex(x, y)].addr, lower.unsafeAddr, 8)
        x += 2