Merge pull request #79 from guzba/master

tiger 2x faster, uint8 coverage (less mem), small things
2021-01-26 15:21:55 -08:00 · 2021-01-26 15:21:55 -08:00 · 7bcb138c6f
commit 7bcb138c6f
parent a252dad7b3 f49564b167
7 changed files with 102 additions and 85 deletions
--- a/experiments/benchmark_cairo.nim
+++ b/experiments/benchmark_cairo.nim
@ -1,4 +1,4 @@
-import cairo, math, benchy, pixie, chroma
+import cairo, math, benchy, pixie, pixie/paths, chroma
 var
  surface = imageSurfaceCreate(FORMAT_ARGB32, 1000, 1000)
--- a/pixie.nimble
+++ b/pixie.nimble
@ -10,5 +10,5 @@ requires "vmath >= 0.4.0"
 requires "chroma >= 0.2.1"
 requires "zippy >= 0.3.5"
 requires "flatty >= 0.1.3"
-requires "nimsimd >= 0.4.6"
+requires "nimsimd >= 0.4.8"
 requires "bumpy >= 1.0.1"
--- a/src/pixie/blends.nim
+++ b/src/pixie/blends.nim
@ -299,16 +299,16 @@ when defined(amd64) and not defined(pixieNoSimd):
 else:
  proc alphaFix(backdrop, source, mixed: ColorRGBA): ColorRGBA {.inline.} =
    let
-      sa = source.a.int32
+      sa = source.a.uint32
-      ba = backdrop.a.int32
+      ba = backdrop.a.uint32
      t0 = sa * (255 - ba)
      t1 = sa * ba
      t2 = (255 - sa) * ba
    let
-      r = t0 * source.r.int32 + t1 * mixed.r.int32 + t2 * backdrop.r.int32
+      r = t0 * source.r.uint32 + t1 * mixed.r.uint32 + t2 * backdrop.r.uint32
-      g = t0 * source.g.int32 + t1 * mixed.g.int32 + t2 * backdrop.g.int32
+      g = t0 * source.g.uint32 + t1 * mixed.g.uint32 + t2 * backdrop.g.uint32
-      b = t0 * source.b.int32 + t1 * mixed.b.int32 + t2 * backdrop.b.int32
+      b = t0 * source.b.uint32 + t1 * mixed.b.uint32 + t2 * backdrop.b.uint32
      a = sa + ba * (255 - sa) div 255
    if a == 0:
--- a/src/pixie/common.nim
+++ b/src/pixie/common.nim
@ -47,8 +47,9 @@ proc toPremultipliedAlpha*(c: Color): Color {.inline.} =
 proc toStraightAlpha*(c: Color): Color {.inline.} =
  ## Converts a color from premultiplied alpha to straight alpha.
-  if c.a == 0:
+  if c.a != 0 and c.a != 1:
-    return
+    result = c
  else:
    result.r = c.r / c.a
    result.g = c.g / c.a
    result.b = c.b / c.a
--- a/src/pixie/fileformats/png.nim
+++ b/src/pixie/fileformats/png.nim
@ -437,10 +437,9 @@ proc encodePng*(
      raise newException(PixieError, "Invalid PNG number of channels")
  let data = cast[ptr UncheckedArray[uint8]](data)
  const signature = [137.uint8, 80, 78, 71, 13, 10, 26, 10]
  # Add the PNG file signature
-  result.add(signature)
+  result.add(pngSignature)
  # Add IHDR
  result.addUint32(13.uint32.swap())
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@ -66,7 +66,7 @@ proc `[]=`*(image: Image, x, y: int, rgba: ColorRGBA) {.inline.} =
    image.setRgbaUnsafe(x, y, rgba)
 proc fillUnsafe(data: var seq[ColorRGBA], rgba: ColorRGBA, start, len: int) =
-  ## Fills the image data with a solid color starting at index start and
+  ## Fills the image data with the parameter color starting at index start and
  ## continuing for len indices.
  # Use memset when every byte has the same value
@ -95,7 +95,7 @@ proc fillUnsafe(data: var seq[ColorRGBA], rgba: ColorRGBA, start, len: int) =
      data[j] = rgba
 proc fill*(image: Image, rgba: ColorRgba) {.inline.} =
-  ## Fills the image with a solid color.
+  ## Fills the image with the parameter color.
  fillUnsafe(image.data, rgba, 0, image.data.len)
 proc flipHorizontal*(image: Image) =
@ -234,7 +234,7 @@ proc invert*(image: Image) =
  ## Inverts all of the colors and alpha.
  var i: int
  when defined(amd64) and not defined(pixieNoSimd):
-    let vec255 = mm_set1_epi8(255)
+    let vec255 = mm_set1_epi8(cast[int8](255))
    while i < image.data.len - 4:
      var m = mm_loadu_si128(image.data[i].addr)
      m = mm_sub_epi8(vec255, m)
@ -251,18 +251,18 @@ proc invert*(image: Image) =
 proc getRgbaSmooth*(image: Image, x, y: float32): ColorRGBA {.inline.} =
  let
    minX = x.floor.int
-    difX = x - x.floor
+    diffX = x - x.floor
    minY = y.floor.int
-    difY = y - y.floor
+    diffY = y - y.floor
-    vX0Y0 = image[minX, minY].toPremultipliedAlpha()
+    x0y0 = image[minX, minY].toPremultipliedAlpha()
-    vX1Y0 = image[minX + 1, minY].toPremultipliedAlpha()
+    x1y0 = image[minX + 1, minY].toPremultipliedAlpha()
-    vX0Y1 = image[minX, minY + 1].toPremultipliedAlpha()
+    x0y1 = image[minX, minY + 1].toPremultipliedAlpha()
-    vX1Y1 = image[minX + 1, minY + 1].toPremultipliedAlpha()
+    x1y1 = image[minX + 1, minY + 1].toPremultipliedAlpha()
-    bottomMix = lerp(vX0Y0, vX1Y0, difX)
+    bottomMix = lerp(x0y0, x1y0, diffX)
-    topMix = lerp(vX0Y1, vX1Y1, difX)
+    topMix = lerp(x0y1, x1y1, diffX)
-    finalMix = lerp(bottomMix, topMix, difY)
+    finalMix = lerp(bottomMix, topMix, diffY)
  finalMix.toStraightAlpha()
@ -376,6 +376,7 @@ proc blurAlpha*(image: Image, radius: float32) =
 proc shift*(image: Image, offset: Vec2) =
  ## Shifts the image by offset.
  if offset != vec2(0, 0):
    let copy = image.copy() # Copy to read from.
    image.fill(rgba(0, 0, 0, 0)) # Reset this for being drawn to.
    image.draw(copy, offset) # Draw copy into image.
@ -465,7 +466,7 @@ proc drawCorrect*(a, b: Image, mat: Mat3, blendMode: BlendMode) =
 proc drawUber(
  a, b: Image,
  p, dx, dy: Vec2,
-  lines: array[0..3, Segment],
+  segments: array[0..3, Segment],
  blendMode: BlendMode,
  smooth: bool
 ) =
@ -475,13 +476,13 @@ proc drawUber(
      xMin = a.width
      xMax = 0
    for yOffset in [0.float32, 1]:
-      var scanLine = segment(
+      var scanLine = Line(
-        vec2(-100000, y.float32 + yOffset),
+        a: vec2(-1000, y.float32 + yOffset),
-        vec2(10000, y.float32 + yOffset)
+        b: vec2(1000, y.float32 + yOffset)
      )
-      for l in lines:
+      for segment in segments:
        var at: Vec2
-        if intersects(l, scanLine, at) and l.to != at:
+        if scanline.intersects(segment, at) and segment.to != at:
          xMin = min(xMin, at.x.floor.int)
          xMax = max(xMax, at.x.ceil.int)
@ -519,7 +520,7 @@ proc draw*(a, b: Image, mat: Mat3, blendMode: BlendMode) =
      mat * vec2(b.width.float32, b.height.float32),
      mat * vec2(0, b.height.float32)
    ]
-    lines = [
+    segments = [
      segment(corners[0], corners[1]),
      segment(corners[1], corners[2]),
      segment(corners[2], corners[3]),
@ -543,10 +544,14 @@ proc draw*(a, b: Image, mat: Mat3, blendMode: BlendMode) =
    minFilterBy2 /= 2
    matInv = matInv * scale(vec2(0.5, 0.5))
-  let smooth = not(dx.length == 1.0 and dy.length == 1.0 and
+  let smooth = not(
-    mat[2, 0].fractional == 0.0 and mat[2, 1].fractional == 0.0)
+    dx.length == 1.0 and
    dy.length == 1.0 and
    mat[2, 0].fractional == 0.0 and
    mat[2, 1].fractional == 0.0
  )
-  a.drawUber(b, p, dx, dy, lines, blendMode, smooth)
+  a.drawUber(b, p, dx, dy, segments, blendMode, smooth)
 proc draw*(a, b: Image, pos = vec2(0, 0), blendMode = bmNormal) {.inline.} =
  a.draw(b, translate(pos), blendMode)
--- a/src/pixie/paths.nim
+++ b/src/pixie/paths.nim
@ -744,13 +744,14 @@ proc quickSort(a: var seq[(float32, bool)], inl, inr: int) =
  quickSort(a, inl, r)
  quickSort(a, l, inr)
-proc computeBounds(shape: seq[Vec2]): Rect =
+proc computeBounds(shapes: seq[seq[(Segment, bool)]]): Rect =
  var
    xMin = float32.high
    xMax = float32.low
    yMin = float32.high
    yMax = float32.low
-  for segment in shape.segments:
+  for shape in shapes:
    for (segment, _) in shape:
      xMin = min(xMin, min(segment.at.x, segment.to.x))
      xMax = max(xMax, max(segment.at.x, segment.to.x))
      yMin = min(yMin, min(segment.at.y, segment.to.y))
@ -775,36 +776,23 @@ proc fillShapes(
  var sortedShapes = newSeq[seq[(Segment, bool)]](shapes.len)
  for i, sorted in sortedShapes.mpairs:
    for segment in shapes[i].segments:
-      if segment.at.y == segment.to.y:
+      if segment.at.y == segment.to.y: # Skip horizontal
        # Skip horizontal and zero-length
        continue
-      var
+      let winding = segment.at.y > segment.to.y
        segment = segment
        winding = segment.at.y > segment.to.y
      if winding:
        var segment = segment
        swap(segment.at, segment.to)
        sorted.add((segment, winding))
      else:
        sorted.add((segment, winding))
-  # Compute the bounds of each shape
+  # Figure out the total bounds of all the shapes,
-  var bounds = newSeq[Rect](shapes.len)
+  # rasterize only within the total bounds
  for i, shape in shapes:
    bounds[i] = computeBounds(shape)
  # Figure out the total bounds of all the shapes
  var
    minX = float32.high
    minY = float32.high
    maxY = float32.low
  for bounds in bounds:
    minX = min(minX, bounds.x)
    minY = min(minY, bounds.y)
    maxY = max(maxY, bounds.y + bounds.h)
  # Rasterize only within the total bounds
  let
-    startX = max(0, minX.int)
+    bounds = computeBounds(sortedShapes)
-    startY = max(0, miny.int)
+    startX = max(0, bounds.x.int)
-    stopY = min(image.height, maxY.int)
+    startY = max(0, bounds.y.int)
    stopY = min(image.height, (bounds.y + bounds.h).int)
  const
    quality = 5 # Must divide 255 cleanly
@ -815,12 +803,12 @@ proc fillShapes(
  var
    hits = newSeq[(float32, bool)](4)
-    coverages = newSeq[uint32](image.width)
+    coverages = newSeq[uint8](image.width)
    numHits: int
  for y in startY ..< stopY:
    # Reset buffer for this row
-    zeroMem(coverages[0].addr, coverages.len * 4)
+    zeroMem(coverages[0].addr, coverages.len)
    # Do scanlines for this row
    for m in 0 ..< quality:
@ -829,10 +817,9 @@ proc fillShapes(
        scanline = Line(a: vec2(0, yLine), b: vec2(1000, yLine))
      numHits = 0
      for i, shape in sortedShapes:
        let bounds = bounds[i]
        if bounds.y > y.float32 or bounds.y + bounds.h < y.float32:
          continue
        for (segment, winding) in shape:
          if segment.at.y > yLine or segment.to.y < y.float32:
            continue
          var at: Vec2
          if scanline.intersects(segment, at):# and segment.to != at:
            if numHits == hits.len:
@ -872,11 +859,14 @@ proc fillShapes(
        if fillLen > 0 and shouldFill(windingRule, count):
          var i = fillStart
          when defined(amd64) and not defined(pixieNoSimd):
-            let m = mm_set1_epi32(sampleCoverage.int32)
+            let vSampleCoverage = mm_set1_epi8(cast[int8](sampleCoverage))
-            for j in countup(i, fillStart + fillLen - 4, 4):
+            for j in countup(i, fillStart + fillLen - 16, 16):
              let current = mm_loadu_si128(coverages[j].addr)
-              mm_storeu_si128(coverages[j].addr, mm_add_epi32(m, current))
+              mm_storeu_si128(
-              i += 4
+                coverages[j].addr,
                mm_add_epi8(current, vSampleCoverage)
              )
              i += 16
          for j in i ..< fillStart + fillLen:
            coverages[j] += sampleCoverage
@ -889,17 +879,39 @@ proc fillShapes(
      # When supported, SIMD blend as much as possible
      let
        coverageMask1 = cast[M128i]([0xffffffff, 0, 0, 0]) # First 32 bits
        coverageMask3 = mm_set1_epi32(cast[int32](0x000000ff)) # Only `r`
        oddMask = mm_set1_epi16(cast[int16](0xff00))
        div255 = mm_set1_epi16(cast[int16](0x8081))
        zero = mm_set1_epi32(0)
        v255 = mm_set1_epi32(255)
        vColor = mm_set1_epi32(cast[int32](color))
-      for _ in countup(x, coverages.len - 4, 4):
+      for _ in countup(x, coverages.len - 16, 16):
        var coverage = mm_loadu_si128(coverages[x].addr)
        coverage = mm_and_si128(coverage, coverageMask1)
-        if mm_movemask_epi8(mm_cmpeq_epi32(coverage, zero)) != 0xffff:
+        if mm_movemask_epi8(mm_cmpeq_epi16(coverage, zero)) != 0xffff:
          # If the coverages are not all zero
-          var source = mm_set1_epi32(cast[int32](color))
+          var source = vColor
          coverage = mm_slli_si128(coverage, 2)
          coverage = mm_shuffle_epi32(coverage, MM_SHUFFLE(1, 1, 0, 0))
          var
            a = mm_and_si128(coverage, coverageMask1)
            b = mm_and_si128(coverage, mm_slli_si128(coverageMask1, 4))
            c = mm_and_si128(coverage, mm_slli_si128(coverageMask1, 8))
            d = mm_and_si128(coverage, mm_slli_si128(coverageMask1, 12))
          # Shift the coverages to `r`
          a = mm_srli_si128(a, 2)
          b = mm_srli_si128(b, 3)
          d = mm_srli_si128(d, 1)
          coverage = mm_and_si128(
            mm_or_si128(mm_or_si128(a, b), mm_or_si128(c, d)),
            coverageMask3
          )
          if mm_movemask_epi8(mm_cmpeq_epi32(coverage, v255)) != 0xffff:
            # If the coverages are not all 255
@ -932,10 +944,10 @@ proc fillShapes(
        x += 4
    while x < image.width:
-      if x + 2 <= coverages.len:
+      if x + 8 <= coverages.len:
        let peeked = cast[ptr uint64](coverages[x].addr)[]
        if peeked == 0:
-          x += 2
+          x += 8
          continue
      let coverage = coverages[x]