From 0358330d3b3f867f390a773e2705d243a4629ba3 Mon Sep 17 00:00:00 2001
From: Seebs <seebs@seebs.net>
Date: Sun, 4 Jun 2017 12:55:43 -0500
Subject: [PATCH 1/6] The initializer is surprisingly expensive.

Removing the call to Alpha(1) and replacing it with an inline definition
produces measurable improvements. Replacing each instance of ZV with
Vec{} further improves things. We keep an inline RGBA because there
are circumstances (mostly when using pictures) where we don't want to
have to set colors to get default behavior.

For a fairly triangle-heavy thing, this reduces time spent in SetLen
from something over 10% of execution time to around 2.5% of execution
time.
---
 data.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data.go b/data.go
index 4e6c528..c941241 100644
--- a/data.go
+++ b/data.go
@@ -45,7 +45,7 @@ func (td *TrianglesData) SetLen(len int) {
 				Color     RGBA
 				Picture   Vec
 				Intensity float64
-			}{ZV, Alpha(1), ZV, 0})
+			}{Color: RGBA{1, 1, 1, 1}})
 		}
 	}
 	if len < td.Len() {

From 34cdd8729b0915aa422c5a7c84c29e10ec2c572d Mon Sep 17 00:00:00 2001
From: Seebs <seebs@seebs.net>
Date: Mon, 5 Jun 2017 18:54:53 -0500
Subject: [PATCH 2/6] Simplify Matrix math, use 6-value affine matrices.

It turns out that affine matrices are much simpler than the 3x3 matrices
they imply, and we can use this to dramatically streamline some code.
For a test program, this was about a 50% gain in frame rate just from
the cost of the applyMatrixAndMask calls in imdraw, which were calling
matrix.Project() many times. Simplifying matrix.Project, alone, got a
nearly 50% frame rate boost!

Also modify pixelgl's SetMatrix to copy the six values of a 3x2
Affine into the corresponding locations of a 3x3 matrix.
---
 geometry.go       | 75 ++++++++++++++++++++++++++---------------------
 pixelgl/canvas.go | 11 +++++--
 2 files changed, 51 insertions(+), 35 deletions(-)

diff --git a/geometry.go b/geometry.go
index f934839..928c9c5 100644
--- a/geometry.go
+++ b/geometry.go
@@ -3,8 +3,6 @@ package pixel
 import (
 	"fmt"
 	"math"
-
-	"github.com/go-gl/mathgl/mgl64"
 )
 
 // Vec is a 2D vector type with X and Y coordinates.
@@ -251,7 +249,7 @@ func (r Rect) Union(s Rect) Rect {
 	)
 }
 
-// Matrix is a 3x3 transformation matrix that can be used for all kinds of spacial transforms, such
+// Matrix is a 3x2 affine matrix that can be used for all kinds of spatial transforms, such
 // as movement, scaling and rotations.
 //
 // Matrix has a handful of useful methods, each of which adds a transformation to the matrix. For
@@ -261,38 +259,41 @@ func (r Rect) Union(s Rect) Rect {
 //
 // This code creates a Matrix that first moves everything by 100 units horizontally and 200 units
 // vertically and then rotates everything by 90 degrees around the origin.
-type Matrix [9]float64
+//
+// Layout is:
+// [0] [2] [4]
+// [1] [3] [5]
+//  0   0   1  [implicit row]
+type Matrix [6]float64
 
 // IM stands for identity matrix. Does nothing, no transformation.
-var IM = Matrix(mgl64.Ident3())
+var IM = Matrix{1, 0, 0, 1, 0, 0}
 
 // String returns a string representation of the Matrix.
 //
 //   m := pixel.IM
-//   fmt.Println(m) // Matrix(1 0 0 | 0 1 0 | 0 0 1)
+//   fmt.Println(m) // Matrix(1 0 0 | 0 1 0)
 func (m Matrix) String() string {
 	return fmt.Sprintf(
-		"Matrix(%v %v %v | %v %v %v | %v %v %v)",
-		m[0], m[3], m[6],
-		m[1], m[4], m[7],
-		m[2], m[5], m[8],
+		"Matrix(%v %v %v | %v %v %v)",
+		m[0], m[2], m[4],
+		m[1], m[3], m[5],
 	)
 }
 
 // Moved moves everything by the delta vector.
 func (m Matrix) Moved(delta Vec) Matrix {
-	m3 := mgl64.Mat3(m)
-	m3 = mgl64.Translate2D(delta.XY()).Mul3(m3)
-	return Matrix(m3)
+	m[4], m[5] = m[4]+delta.X, m[5]+delta.Y
+	return m
 }
 
 // ScaledXY scales everything around a given point by the scale factor in each axis respectively.
 func (m Matrix) ScaledXY(around Vec, scale Vec) Matrix {
-	m3 := mgl64.Mat3(m)
-	m3 = mgl64.Translate2D(around.Scaled(-1).XY()).Mul3(m3)
-	m3 = mgl64.Scale2D(scale.XY()).Mul3(m3)
-	m3 = mgl64.Translate2D(around.XY()).Mul3(m3)
-	return Matrix(m3)
+	m[4], m[5] = m[4]-around.X, m[5]-around.Y
+	m[0], m[2], m[4] = m[0]*scale.X, m[2]*scale.X, m[4]*scale.X
+	m[1], m[3], m[5] = m[1]*scale.Y, m[3]*scale.Y, m[5]*scale.Y
+	m[4], m[5] = m[4]+around.X, m[5]+around.Y
+	return m
 }
 
 // Scaled scales everything around a given point by the scale factor.
@@ -302,36 +303,44 @@ func (m Matrix) Scaled(around Vec, scale float64) Matrix {
 
 // Rotated rotates everything around a given point by the given angle in radians.
 func (m Matrix) Rotated(around Vec, angle float64) Matrix {
-	m3 := mgl64.Mat3(m)
-	m3 = mgl64.Translate2D(around.Scaled(-1).XY()).Mul3(m3)
-	m3 = mgl64.Rotate3DZ(angle).Mul3(m3)
-	m3 = mgl64.Translate2D(around.XY()).Mul3(m3)
-	return Matrix(m3)
+	sint, cost := math.Sincos(angle)
+	m[4], m[5] = m[4]-around.X, m[5]-around.Y
+	m = m.Chained(Matrix{cost, sint, -sint, cost, 0, 0})
+	m[4], m[5] = m[4]+around.X, m[5]+around.Y
+	return m
 }
 
 // Chained adds another Matrix to this one. All tranformations by the next Matrix will be applied
 // after the transformations of this Matrix.
 func (m Matrix) Chained(next Matrix) Matrix {
-	m3 := mgl64.Mat3(m)
-	m3 = mgl64.Mat3(next).Mul3(m3)
-	return Matrix(m3)
+	return Matrix{
+		next[0]*m[0] + next[2]*m[1],
+		next[1]*m[0] + next[3]*m[1],
+		next[0]*m[2] + next[2]*m[3],
+		next[1]*m[2] + next[3]*m[3],
+		next[0]*m[4] + next[2]*m[5] + next[4],
+		next[1]*m[4] + next[3]*m[5] + next[5],
+	}
 }
 
 // Project applies all transformations added to the Matrix to a vector u and returns the result.
 //
 // Time complexity is O(1).
 func (m Matrix) Project(u Vec) Vec {
-	m3 := mgl64.Mat3(m)
-	proj := m3.Mul3x1(mgl64.Vec3{u.X, u.Y, 1})
-	return V(proj.X(), proj.Y())
+	return Vec{X: m[0]*u.X + m[2]*u.Y + m[4], Y: m[1]*u.X + m[3]*u.Y + m[5]}
 }
 
 // Unproject does the inverse operation to Project.
 //
+// Multiplying a vector by the inverse matrix of m can be accomplished
+// by subtracting the translate part of the matrix and multiplying by
+// the inverse of the top-left 2x2 matrix. That inverse is
+// 1/d * [m[3] -m[2]; -m[1] m[0]] with d the determinant, which is
+// simple enough to just be inlined in the computation.
+//
 // Time complexity is O(1).
 func (m Matrix) Unproject(u Vec) Vec {
-	m3 := mgl64.Mat3(m)
-	inv := m3.Inv()
-	unproj := inv.Mul3x1(mgl64.Vec3{u.X, u.Y, 1})
-	return V(unproj.X(), unproj.Y())
+	d := (m[0] * m[3]) - (m[1] * m[2])
+	u.X, u.Y = (u.X-m[4])/d, (u.Y-m[5])/d
+	return Vec{X: u.X*m[3] - u.Y*m[2], Y: u.Y*m[0] - u.X*m[1]}
 }
diff --git a/pixelgl/canvas.go b/pixelgl/canvas.go
index a088350..b7b5cfc 100644
--- a/pixelgl/canvas.go
+++ b/pixelgl/canvas.go
@@ -90,9 +90,16 @@ func (c *Canvas) MakePicture(p pixel.Picture) pixel.TargetPicture {
 }
 
 // SetMatrix sets a Matrix that every point will be projected by.
+// pixel.Matrix is 3x2 with an implicit 0, 0, 1 row after it, so its six values map onto c.mat as
+//
+//	[0] [2] [4]    [0] [3] [6]
+//	[1] [3] [5] => [1] [4] [7]
+//	 0   0   1      0   0   1
+//
+// All matrix ops are affine, so the last row never changes and does not need to be copied.
 func (c *Canvas) SetMatrix(m pixel.Matrix) {
-	for i := range m {
-		c.mat[i] = float32(m[i])
+	for i, j := range [6]int{0, 1, 3, 4, 6, 7} {
+		c.mat[j] = float32(m[i])
 	}
 }
 

From 9a7ab1c6b0d1aa611d73ced78ef10dbb44d6bf80 Mon Sep 17 00:00:00 2001
From: Seebs <seebs@seebs.net>
Date: Mon, 5 Jun 2017 19:37:53 -0500
Subject: [PATCH 3/6] use point pool

For internal operations (anything using getAndClearPoints), there's a
pretty good chance that the operation will repeatedly invoke something
like fillPolygon(), meaning that it needs to push "a few" points
and then invoke something that uses those points.

So, we add a slice for containing spare slices of points, and on the
way out of each such function, shove the current imd.points (as used
inside that function) onto a stack, and set imd.points to [0:0] of
the thing it was called with.

Performance goes from 11-13fps to 17-18fps on my test case.
---
 imdraw/imdraw.go | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/imdraw/imdraw.go b/imdraw/imdraw.go
index 626cb2f..2fe1fad 100644
--- a/imdraw/imdraw.go
+++ b/imdraw/imdraw.go
@@ -52,6 +52,7 @@ type IMDraw struct {
 	EndShape  EndShape
 
 	points []point
+	pool   [][]point
 	matrix pixel.Matrix
 	mask   pixel.RGBA
 
@@ -109,7 +110,7 @@ func (imd *IMDraw) Clear() {
 //
 // This does not affect matrix and color mask set by SetMatrix and SetColorMask.
 func (imd *IMDraw) Reset() {
-	imd.points = nil
+	imd.points = imd.points[:0]
 	imd.Color = pixel.Alpha(1)
 	imd.Picture = pixel.ZV
 	imd.Intensity = 0
@@ -256,10 +257,22 @@ func (imd *IMDraw) EllipseArc(radius pixel.Vec, low, high, thickness float64) {
 
 func (imd *IMDraw) getAndClearPoints() []point {
 	points := imd.points
-	imd.points = nil
+	// reuse a pooled slice (truncated, so stale points never leak) to avoid reallocating
+	if len(imd.pool) > 0 {
+		pos := len(imd.pool) - 1
+		imd.points = imd.pool[pos][:0]
+		imd.pool = imd.pool[:pos]
+	} else {
+		imd.points = nil
+	}
 	return points
 }
 
+func (imd *IMDraw) restorePoints(points []point) {
+	imd.pool = append(imd.pool, imd.points)
+	imd.points = points[:0]
+}
+
 func (imd *IMDraw) applyMatrixAndMask(off int) {
 	for i := range (*imd.tri)[off:] {
 		(*imd.tri)[off+i].Position = imd.matrix.Project((*imd.tri)[off+i].Position)
@@ -271,6 +284,7 @@ func (imd *IMDraw) fillRectangle() {
 	points := imd.getAndClearPoints()
 
 	if len(points) < 2 {
+		imd.restorePoints(points)
 		return
 	}
 
@@ -302,12 +316,14 @@ func (imd *IMDraw) fillRectangle() {
 
 	imd.applyMatrixAndMask(off)
 	imd.batch.Dirty()
+	imd.restorePoints(points)
 }
 
 func (imd *IMDraw) outlineRectangle(thickness float64) {
 	points := imd.getAndClearPoints()
 
 	if len(points) < 2 {
+		imd.restorePoints(points)
 		return
 	}
 
@@ -323,12 +339,14 @@ func (imd *IMDraw) outlineRectangle(thickness float64) {
 		imd.pushPt(pixel.V(b.pos.X, a.pos.Y), mid)
 		imd.polyline(thickness, true)
 	}
+	imd.restorePoints(points)
 }
 
 func (imd *IMDraw) fillPolygon() {
 	points := imd.getAndClearPoints()
 
 	if len(points) < 3 {
+		imd.restorePoints(points)
 		return
 	}
 
@@ -346,6 +364,7 @@ func (imd *IMDraw) fillPolygon() {
 
 	imd.applyMatrixAndMask(off)
 	imd.batch.Dirty()
+	imd.restorePoints(points)
 }
 
 func (imd *IMDraw) fillEllipseArc(radius pixel.Vec, low, high float64) {
@@ -387,6 +406,7 @@ func (imd *IMDraw) fillEllipseArc(radius pixel.Vec, low, high float64) {
 		imd.applyMatrixAndMask(off)
 		imd.batch.Dirty()
 	}
+	imd.restorePoints(points)
 }
 
 func (imd *IMDraw) outlineEllipseArc(radius pixel.Vec, low, high, thickness float64, doEndShape bool) {
@@ -485,12 +505,14 @@ func (imd *IMDraw) outlineEllipseArc(radius pixel.Vec, low, high, thickness floa
 			}
 		}
 	}
+	imd.restorePoints(points)
 }
 
 func (imd *IMDraw) polyline(thickness float64, closed bool) {
 	points := imd.getAndClearPoints()
 
 	if len(points) == 0 {
+		imd.restorePoints(points)
 		return
 	}
 	if len(points) == 1 {
@@ -591,4 +613,5 @@ func (imd *IMDraw) polyline(thickness float64, closed bool) {
 			imd.fillEllipseArc(pixel.V(thickness/2, thickness/2), normal.Angle(), normal.Angle()-math.Pi)
 		}
 	}
+	imd.restorePoints(points)
 }

From 918031892a487e02b46755913bdc75fb0c5a1776 Mon Sep 17 00:00:00 2001
From: Seebs <seebs@seebs.net>
Date: Mon, 5 Jun 2017 19:46:16 -0500
Subject: [PATCH 4/6] smaller imdraw optimizations

For polyline, don't compute each normal twice; when we're going through a line,
the "next" normal for segment N is always the "previous" normal for segment
N+1, and we can compute fewer of them.
---
 imdraw/imdraw.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/imdraw/imdraw.go b/imdraw/imdraw.go
index 2fe1fad..0d40452 100644
--- a/imdraw/imdraw.go
+++ b/imdraw/imdraw.go
@@ -543,6 +543,8 @@ func (imd *IMDraw) polyline(thickness float64, closed bool) {
 	imd.pushPt(points[j].pos.Sub(normal), points[j])
 
 	// middle points
+	// compute "previous" normal:
+	ijNormal := points[1].pos.Sub(points[0].pos).Rotated(math.Pi / 2).Unit().Scaled(thickness / 2)
 	for i := 0; i < len(points); i++ {
 		j, k := i+1, i+2
 
@@ -558,7 +560,6 @@ func (imd *IMDraw) polyline(thickness float64, closed bool) {
 			k %= len(points)
 		}
 
-		ijNormal := points[j].pos.Sub(points[i].pos).Rotated(math.Pi / 2).Unit().Scaled(thickness / 2)
 		jkNormal := points[k].pos.Sub(points[j].pos).Rotated(math.Pi / 2).Unit().Scaled(thickness / 2)
 
 		orientation := 1.0
@@ -589,6 +590,8 @@ func (imd *IMDraw) polyline(thickness float64, closed bool) {
 			imd.pushPt(points[j].pos.Add(jkNormal), points[j])
 			imd.pushPt(points[j].pos.Sub(jkNormal), points[j])
 		}
+		// "next" normal becomes previous normal
+		ijNormal = jkNormal
 	}
 
 	// last point

From fc858bff4d66c35184fe63d179d39927329fa144 Mon Sep 17 00:00:00 2001
From: Seebs <seebs@seebs.net>
Date: Mon, 5 Jun 2017 20:12:35 -0500
Subject: [PATCH 5/6] Reduce copying in fillPolygon

A slice of points means copying every point into the slice, then
copying every point's data from the slice to TrianglesData. An
array of indices lets the compiler make better choices.
---
 imdraw/imdraw.go | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/imdraw/imdraw.go b/imdraw/imdraw.go
index 0d40452..e5a8c95 100644
--- a/imdraw/imdraw.go
+++ b/imdraw/imdraw.go
@@ -354,11 +354,12 @@ func (imd *IMDraw) fillPolygon() {
 	imd.tri.SetLen(imd.tri.Len() + 3*(len(points)-2))
 
 	for i, j := 1, off; i+1 < len(points); i, j = i+1, j+3 {
-		for k, p := range []point{points[0], points[i], points[i+1]} {
-			(*imd.tri)[j+k].Position = p.pos
-			(*imd.tri)[j+k].Color = p.col
-			(*imd.tri)[j+k].Picture = p.pic
-			(*imd.tri)[j+k].Intensity = p.in
+		for k, p := range []int{0, i, i + 1} {
+			tri := &(*imd.tri)[j+k]
+			tri.Position = points[p].pos
+			tri.Color = points[p].col
+			tri.Picture = points[p].pic
+			tri.Intensity = points[p].in
 		}
 	}
 

From 7215265523612fdda6a2429b7409f9da76a83f76 Mon Sep 17 00:00:00 2001
From: Seebs <seebs@seebs.net>
Date: Fri, 9 Jun 2017 00:07:08 -0500
Subject: [PATCH 6/6] Don't duplicate computations in gltriangles.go

The computation including a call to Stride() can't be optimized away
safely because the compiler can't tell that Stride() is effectively
constant, but we know it won't change so we can make a slice pointing
at that part of the array.

CPU time for updateData goes from 26.35% to 18.65% in my test case.
---
 pixelgl/gltriangles.go | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/pixelgl/gltriangles.go b/pixelgl/gltriangles.go
index 64b7355..bf7a895 100644
--- a/pixelgl/gltriangles.go
+++ b/pixelgl/gltriangles.go
@@ -103,15 +103,17 @@ func (gt *GLTriangles) updateData(t pixel.Triangles) {
 				tx, ty = (*t)[i].Picture.XY()
 				in     = (*t)[i].Intensity
 			)
-			gt.data[i*gt.vs.Stride()+0] = float32(px)
-			gt.data[i*gt.vs.Stride()+1] = float32(py)
-			gt.data[i*gt.vs.Stride()+2] = float32(col.R)
-			gt.data[i*gt.vs.Stride()+3] = float32(col.G)
-			gt.data[i*gt.vs.Stride()+4] = float32(col.B)
-			gt.data[i*gt.vs.Stride()+5] = float32(col.A)
-			gt.data[i*gt.vs.Stride()+6] = float32(tx)
-			gt.data[i*gt.vs.Stride()+7] = float32(ty)
-			gt.data[i*gt.vs.Stride()+8] = float32(in)
+			s := gt.vs.Stride()
+			d := gt.data[i*s : i*s+9]
+			d[0] = float32(px)
+			d[1] = float32(py)
+			d[2] = float32(col.R)
+			d[3] = float32(col.G)
+			d[4] = float32(col.B)
+			d[5] = float32(col.A)
+			d[6] = float32(tx)
+			d[7] = float32(ty)
+			d[8] = float32(in)
 		}
 		return
 	}