diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index af92375..2cb808e 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -394,26 +394,26 @@ proc magnifyBy2*(image: Image, power = 1): Image {.raises: [PixieError].} =
     var x: int
     when defined(amd64) and not defined(pixieNoSimd):
       if scale == 2:
-        let mask = cast[M128i]([uint32.high, 0, 0, 0])
-        for _ in countup(0, image.width - 4, 2):
+        while x <= image.width - 4: # 4 source pixels -> 8 output pixels
           let
             values = mm_loadu_si128(image.data[image.dataIndex(x, y)].addr)
-            first = mm_and_si128(values, mask)
-            second = mm_and_si128(mm_srli_si128(values, 4), mask)
-            combined = mm_or_si128(first, mm_slli_si128(second, 8))
-            doubled = mm_or_si128(combined, mm_slli_si128(combined, 4))
+            lo = mm_unpacklo_epi32(values, mm_setzero_si128()) # [p0, 0, p1, 0]
+            hi = mm_unpackhi_epi32(values, mm_setzero_si128()) # [p2, 0, p3, 0]
           mm_storeu_si128(
-            result.data[result.dataIndex(x * scale, y * scale)].addr,
-            doubled
+            result.data[result.dataIndex(x * scale + 0, y * scale)].addr,
+            mm_or_si128(lo, mm_slli_si128(lo, 4))
           )
-          x += 2
-    for _ in x ..< image.width:
+          mm_storeu_si128(
+            result.data[result.dataIndex(x * scale + 4, y * scale)].addr,
+            mm_or_si128(hi, mm_slli_si128(hi, 4))
+          )
+          x += 4
+    for x in x ..< image.width: # scalar path for pixels the SIMD loop left
       let
         rgbx = image.unsafe[x, y]
         resultIdx = result.dataIndex(x * scale, y * scale)
       for i in 0 ..< scale:
         result.data[resultIdx + i] = rgbx
-      inc x
     # Copy that row of pixels into (scale - 1) more rows
     let rowStart = result.dataIndex(0, y * scale)
     for i in 1 ..< scale:
diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim
index affa6e3..7643a21 100644
--- a/src/pixie/masks.nim
+++ b/src/pixie/masks.nim
@@ -166,14 +166,41 @@ proc magnifyBy2*(mask: Mask, power = 1): Mask {.raises: [PixieError].} =
 
   let scale = 2 ^ power
   result = newMask(mask.width * scale, mask.height * scale)
-  for y in 0 ..< result.height:
-    for x in 0 ..< mask.width:
+
+  for y in 0 ..< mask.height:
+    # Write source row y, each value repeated scale times, to row y * scale
+    var x: int
+    when defined(amd64) and not defined(pixieNoSimd):
+      if scale == 2:
+        while x <= mask.width - 16: # 16 source bytes -> 32 output bytes
+          let
+            values = mm_loadu_si128(mask.data[mask.dataIndex(x, y)].addr)
+            lo = mm_unpacklo_epi8(values, mm_setzero_si128()) # bytes 0..7
+            hi = mm_unpackhi_epi8(values, mm_setzero_si128()) # bytes 8..15
+          mm_storeu_si128(
+            result.data[result.dataIndex(x * scale + 0, y * scale)].addr,
+            mm_or_si128(lo, mm_slli_si128(lo, 1))
+          )
+          mm_storeu_si128(
+            result.data[result.dataIndex(x * scale + 16, y * scale)].addr,
+            mm_or_si128(hi, mm_slli_si128(hi, 1))
+          )
+          x += 16
+    for x in x ..< mask.width:
       let
-        value = mask.unsafe[x, y div scale]
+        value = mask.unsafe[x, y] # y is a source row index in this loop
         scaledX = x * scale
-        idx = result.dataIndex(scaledX, y)
+        idx = result.dataIndex(scaledX, y * scale)
       for i in 0 ..< scale:
         result.data[idx + i] = value
+    # Copy that row (1 byte per value) into (scale - 1) more rows
+    let rowStart = result.dataIndex(0, y * scale)
+    for i in 1 ..< scale:
+      copyMem(
+        result.data[rowStart + result.width * i].addr,
+        result.data[rowStart].addr,
+        result.width
+      )
 
 proc fillUnsafe*(
   data: var seq[uint8], value: uint8, start, len: int
diff --git a/tests/benchmark_masks.nim b/tests/benchmark_masks.nim
index 2effded..cd7a527 100644
--- a/tests/benchmark_masks.nim
+++ b/tests/benchmark_masks.nim
@@ -13,6 +13,12 @@ timeIt "minifyBy2":
 
 reset()
 
+timeIt "magnifyBy2":
+  let magnified = mask.magnifyBy2()
+  doAssert magnified[0, 0] == 63 # spot check of the top-left value only
+
+reset()
+
 timeIt "invert":
   mask.invert()