diff --git a/examples/tiger/main.c b/examples/tiger/main.c
index 44d1d62..55e8164 100644
--- a/examples/tiger/main.c
+++ b/examples/tiger/main.c
@@ -65,7 +65,7 @@ int main()
 
 	//NOTE: create surface
 	mg_surface surface = mg_surface_create_for_window(window, MG_BACKEND_DEFAULT);
-	mg_surface_swap_interval(surface, 1);
+	mg_surface_swap_interval(surface, 0);
 
 	//TODO: create canvas
 	mg_canvas canvas = mg_canvas_create(surface);
diff --git a/src/mtl_canvas.m b/src/mtl_canvas.m
index 6da7310..0516f2f 100644
--- a/src/mtl_canvas.m
+++ b/src/mtl_canvas.m
@@ -244,6 +244,7 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
 		[shapeEncoder setBuffer: backend->tilesBuffer offset:0 atIndex: 2];
 		[shapeEncoder setBuffer: backend->tilesOffset offset:0 atIndex: 3];
 		[shapeEncoder setBytes: &scale length: sizeof(float) atIndex: 4];
+		[shapeEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 5];
 
 		MTLSize shapeGroupSize = MTLSizeMake(backend->shapePipeline.maxTotalThreadsPerThreadgroup, 1, 1);
 		MTLSize shapeGridSize = MTLSizeMake(shapeCount, 1, 1);
diff --git a/src/mtl_shader.h b/src/mtl_shader.h
index 604488a..7317800 100644
--- a/src/mtl_shader.h
+++ b/src/mtl_shader.h
@@ -65,12 +65,13 @@ typedef struct mg_triangle_data
 using namespace metal;
 #endif
 
-#define MG_TILE_CMD_MASK (1<<31)
+#define MG_TILE_CMD_MASK (3<<30)
 
 typedef enum mg_tile_cmd_kind
 {
 	mg_cmd_triangle = 0,
-	mg_cmd_color = 1<<31,
+	mg_cmd_color    = 1<<30,
+	mg_cmd_flip     = 2<<30
 } mg_tile_cmd_kind;
 
 typedef int mg_tile_cmd;
diff --git a/src/mtl_shader.metal b/src/mtl_shader.metal
index 1ee6241..7df34d3 100644
--- a/src/mtl_shader.metal
+++ b/src/mtl_shader.metal
@@ -61,9 +61,10 @@ kernel void ShapeSetup(constant mg_shape* shapeBuffer [[buffer(0)]],
                        device mg_tile* tilesBuffer [[buffer(2)]],
                        device volatile atomic_uint* tilesOffset [[buffer(3)]],
                        constant float* scaling [[buffer(4)]],
+                       constant int2* viewport [[buffer(5)]],
                        uint gid [[thread_position_in_grid]])
 {
-
+	int2 tilesMatrixDim = (*viewport - 1) / RENDERER_TILE_SIZE + 1;
 	float4 box = shapeBuffer[gid].clip * scaling[0];
 
 	int2 firstTile = int2(box.xy)/RENDERER_TILE_SIZE;
@@ -72,8 +73,11 @@ kernel void ShapeSetup(constant mg_shape* shapeBuffer [[buffer(0)]],
 	//      any tile queue, the tileQueues pointer for that shape would alias the tileQueues pointer of another
 	//      shape, and we would have to detect that in the tiling and drawing kernels. Instead, just accept some
 	//      waste and keep the other kernels more uniforms for now...
-	int nTilesX = int(box.z)/RENDERER_TILE_SIZE - firstTile.x + 1;
-	int nTilesY = int(box.w)/RENDERER_TILE_SIZE - firstTile.y + 1;
+	//TODO limit to screen
+	int2 lastTile = max(firstTile, min(int2(box.zw)/RENDERER_TILE_SIZE, tilesMatrixDim));
+
+	int nTilesX = lastTile.x - firstTile.x + 1;
+	int nTilesY = lastTile.y - firstTile.y + 1;
 
 	int tileCount = nTilesX * nTilesY;
 
@@ -172,8 +176,8 @@ kernel void TriangleKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
 
 	int xMin = max(0, coarseBox.x - shapeQueue->area.x);
 	int yMin = max(0, coarseBox.y - shapeQueue->area.y);
-	int xMax = min(coarseBox.z - shapeQueue->area.x, shapeQueue->area.z);
-	int yMax = min(coarseBox.w - shapeQueue->area.y, shapeQueue->area.w);
+	int xMax = min(coarseBox.z - shapeQueue->area.x, shapeQueue->area.z-1);
+	int yMax = min(coarseBox.w - shapeQueue->area.y, shapeQueue->area.w-1);
 
 	//NOTE(martin): it's important to do the computation with signed int, so that we can have negative xMax/yMax
 	//              otherwise all triangles on the left or below the x/y axis are attributed to tiles on row/column 0.
@@ -233,6 +237,7 @@ kernel void TriangleKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
 
 				device mg_tile_elt* elt = &eltBuffer[eltIndex];
 				elt->triangleIndex = gid;
+
 				elt->next = atomic_exchange_explicit(&tile->firstElt, eltIndex, memory_order_relaxed);
 
 				atomic_fetch_add_explicit(&tile->eltCount, 1, memory_order_relaxed);
@@ -254,6 +259,7 @@ kernel void TriangleKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
 
 				if(allLeftFromEdge0 && allLeftFromEdge1 && allLeftFromEdge2 && triangleFull)
 				{
+					elt->triangleIndex |= mg_cmd_flip;
 					atomic_fetch_add_explicit(&tile->flipCount, 1, memory_order_relaxed);
 				}
 				else
@@ -310,7 +316,7 @@ kernel void GatherKernel(const device mg_shape_queue* shapeQueueBuffer [[buffer(
 						const device mg_tile_elt* elt = &eltBuffer[firstEltIndex];
 
 						count = 0;
-						tileArray[count] = mg_cmd_color | elt->triangleIndex;
+						tileArray[count] = mg_cmd_color | (elt->triangleIndex & ~MG_TILE_CMD_MASK);
 						count++;
 						continue;
 					}
@@ -458,63 +464,75 @@ kernel void RenderKernel(const device uint* tileCounters [[buffer(0)]],
 			currentShapeIndex = triangle->shapeIndex;
 		}
 
-		if(cmdKind == mg_cmd_color)
+		switch(cmdKind)
 		{
-			for(int sampleIndex=0; sampleIndex<sampleCount; sampleIndex++)
+			case mg_cmd_color:
 			{
-				float4 nextColor = triangle->color;
-				nextColor.rgb *= nextColor.a;
-				sampleColor[sampleIndex] = nextColor;
-				flipCount[sampleIndex] = 0;
-			}
-		}
-		else
-		{
-			int2 p0 = triangle->p0;
-			int2 p1 = triangle->p1;
-			int2 p2 = triangle->p2;
-
-			int cw = triangle->cw;
-
-			int bias0 = triangle->bias0;
-			int bias1 = triangle->bias1;
-			int bias2 = triangle->bias2;
-
-			float4 cubic0 = triangle->cubic0;
-			float4 cubic1 = triangle->cubic1;
-			float4 cubic2 = triangle->cubic2;
-
-			bool fullTriangle = triangle->full;
-
-			int4 clip = triangle->box;
-
-			for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
-			{
-				int2 samplePoint = samplePoints[sampleIndex];
-
-				if(  samplePoint.x < clip.x
-			  	|| samplePoint.x > clip.z
-			  	|| samplePoint.y < clip.y
-			  	|| samplePoint.y > clip.w)
+				for(int sampleIndex=0; sampleIndex<sampleCount; sampleIndex++)
 				{
-					continue;
+					float4 nextColor = triangle->color;
+					nextColor.rgb *= nextColor.a;
+					sampleColor[sampleIndex] = nextColor;
+					flipCount[sampleIndex] = 0;
 				}
+			} break;
 
-				int w0 = cw*orient2d(p1, p2, samplePoint);
-				int w1 = cw*orient2d(p2, p0, samplePoint);
-				int w2 = cw*orient2d(p0, p1, samplePoint);
-
-				if((w0+bias0) >= 0 && (w1+bias1) >= 0 && (w2+bias2) >= 0)
+			case mg_cmd_flip:
+			{
+				for(int sampleIndex=0; sampleIndex<sampleCount; sampleIndex++)
 				{
-					float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2);
+					flipCount[sampleIndex]++;
+				}
+			} break;
 
-					if( fullTriangle
-					  ||(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= 0))
+			case mg_cmd_triangle:
+			{
+				int2 p0 = triangle->p0;
+				int2 p1 = triangle->p1;
+				int2 p2 = triangle->p2;
+
+				int cw = triangle->cw;
+
+				int bias0 = triangle->bias0;
+				int bias1 = triangle->bias1;
+				int bias2 = triangle->bias2;
+
+				float4 cubic0 = triangle->cubic0;
+				float4 cubic1 = triangle->cubic1;
+				float4 cubic2 = triangle->cubic2;
+
+				bool fullTriangle = triangle->full;
+
+				int4 clip = triangle->box;
+
+				for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
+				{
+					int2 samplePoint = samplePoints[sampleIndex];
+
+					if(  samplePoint.x < clip.x
+			  		|| samplePoint.x > clip.z
+			  		|| samplePoint.y < clip.y
+			  		|| samplePoint.y > clip.w)
 					{
-						flipCount[sampleIndex]++;
+						continue;
+					}
+
+					int w0 = cw*orient2d(p1, p2, samplePoint);
+					int w1 = cw*orient2d(p2, p0, samplePoint);
+					int w2 = cw*orient2d(p0, p1, samplePoint);
+
+					if((w0+bias0) >= 0 && (w1+bias1) >= 0 && (w2+bias2) >= 0)
+					{
+						float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2);
+
+						if( fullTriangle
+					  	||(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= 0))
+						{
+							flipCount[sampleIndex]++;
+						}
 					}
 				}
-			}
+			} break;
 		}
     }