[mtl canvas] Tiling per rectangles into per-shape tile queues (linked lists of triangle indices) to avoid sorting

2023-03-20 16:46:12 +01:00 · 2023-03-20 16:46:12 +01:00 · a4ef58f2d8
parent 11113f597c
commit a4ef58f2d8
6 changed files with 304 additions and 262 deletions
--- a/examples/canvas/main.c
+++ b/examples/canvas/main.c
@ -162,6 +162,7 @@ int main()

 			// head
 			mg_set_color_rgba(1, 1, 0, 1);
+
 			mg_circle_fill(x, y, 200);

 			// smile
--- a/examples/tiger/main.c
+++ b/examples/tiger/main.c
@ -82,6 +82,11 @@ int main()
 	mp_window_bring_to_front(window);
 	mp_window_focus(window);

+	bool tracked = false;
+	vec2 trackPoint = {0};
+	f32 zoom = 1;
+	f32 startX = 0, startY = 0;
+
 	f64 frameTime = 0;

 	while(!mp_should_quit())
@ -99,8 +104,35 @@ int main()
 					mp_request_quit();
 				} break;

-				case MP_EVENT_KEYBOARD_KEY:
+				case MP_EVENT_MOUSE_BUTTON:
 				{
+					if(event.key.code == MP_MOUSE_LEFT)
+					{
+						if(event.key.action == MP_KEY_PRESS)
+						{
+							tracked = true;
+							vec2 mousePos = mp_mouse_position();
+							trackPoint.x = mousePos.x/zoom - startX;
+							trackPoint.y = mousePos.y/zoom - startY;
+						}
+						else
+						{
+							tracked = false;
+						}
+					}
+				} break;
+
+				case MP_EVENT_MOUSE_WHEEL:
+				{
+					vec2 mousePos = mp_mouse_position();
+					f32 trackX = mousePos.x/zoom - startX;
+					f32 trackY = mousePos.y/zoom - startY;
+
+					zoom *= 1 + event.move.deltaY * 0.01;
+					zoom = Clamp(zoom, 0.2, 10);
+
+					startX = mousePos.x/zoom - trackX;
+					startY = mousePos.y/zoom - trackY;
 				} break;

 				default:
@ -108,13 +140,20 @@ int main()
 			}
 		}

+		if(tracked)
+		{
+			vec2 mousePos = mp_mouse_position();
+			startX = mousePos.x/zoom - trackPoint.x;
+			startY = mousePos.y/zoom - trackPoint.y;
+		}
+
 		mg_surface_prepare(surface);

 		mg_set_color_rgba(1, 0, 1, 1);
 		mg_clear();

-		mg_matrix_push((mg_mat2x3){1, 0, 300,
-		                           0, 1, 200});
+		mg_matrix_push((mg_mat2x3){zoom, 0, 300+startX*zoom,
+		                           0, zoom, 200+startY*zoom});

 		draw_tiger();

--- a/src/graphics.c
+++ b/src/graphics.c
@ -208,6 +208,7 @@ typedef struct mg_canvas_data
 	mp_rect srcRegion;

 	vec4 shapeExtents;
+	vec4 shapeScreenExtents;
 	u32 nextShapeIndex;
 	u32 vertexCount;
 	u32 indexCount;
@ -789,6 +790,13 @@ void mg_finalize_shape(mg_canvas_data* canvas)
 		mg_vertex_layout* layout = &canvas->backend->vertexLayout;
 		*(mg_mat2x3*)(layout->uvTransformBuffer + index*layout->uvTransformStride) = uvTransform;

+		//TODO: transform extents before clipping
+		mp_rect clip = {maximum(canvas->clip.x, canvas->shapeScreenExtents.x),
+		                maximum(canvas->clip.y, canvas->shapeScreenExtents.y),
+		                minimum(canvas->clip.x + canvas->clip.w, canvas->shapeScreenExtents.z),
+		                minimum(canvas->clip.y + canvas->clip.h, canvas->shapeScreenExtents.w)};
+
+		*(mp_rect*)(((char*)layout->clipBuffer) + index*layout->clipStride) = clip;
 	}
 }

@ -799,17 +807,12 @@ u32 mg_next_shape(mg_canvas_data* canvas, mg_attributes* attributes)
 	canvas->transform = attributes->transform;
 	canvas->srcRegion = attributes->srcRegion;
 	canvas->shapeExtents = (vec4){FLT_MAX, FLT_MAX, -FLT_MAX, -FLT_MAX};
+	canvas->shapeScreenExtents = (vec4){FLT_MAX, FLT_MAX, -FLT_MAX, -FLT_MAX};

 	mg_vertex_layout* layout = &canvas->backend->vertexLayout;
 	int index = canvas->nextShapeIndex;
 	canvas->nextShapeIndex++;

-	mp_rect clip = {canvas->clip.x,
-	                canvas->clip.y,
-	                canvas->clip.x + canvas->clip.w,
-	                canvas->clip.y + canvas->clip.h};
-
-	*(mp_rect*)(((char*)layout->clipBuffer) + index*layout->clipStride) = clip;
 	*(mg_color*)(((char*)layout->colorBuffer) + index*layout->colorStride) = attributes->color;

 	return(index);
@ -842,6 +845,11 @@ void mg_push_vertex_cubic(mg_canvas_data* canvas, vec2 pos, vec4 cubic)

 	vec2 screenPos = mg_mat2x3_mul(canvas->transform, pos);

+	canvas->shapeScreenExtents.x = minimum(canvas->shapeScreenExtents.x, screenPos.x);
+	canvas->shapeScreenExtents.y = minimum(canvas->shapeScreenExtents.y, screenPos.y);
+	canvas->shapeScreenExtents.z = maximum(canvas->shapeScreenExtents.z, screenPos.x);
+	canvas->shapeScreenExtents.w = maximum(canvas->shapeScreenExtents.w, screenPos.y);
+
 	mg_vertex_layout* layout = &canvas->backend->vertexLayout;
 	ASSERT(canvas->vertexCount < layout->maxVertexCount);
 	ASSERT(canvas->nextShapeIndex > 0);
--- a/src/mtl_canvas.m
+++ b/src/mtl_canvas.m
@ -34,9 +34,8 @@ typedef struct mg_mtl_canvas_backend
 	mg_color clearColor;

 	// permanent metal resources
+	id<MTLComputePipelineState> shapePipeline;
 	id<MTLComputePipelineState> trianglePipeline;
-	id<MTLComputePipelineState> tilingPipeline;
-	id<MTLComputePipelineState> sortingPipeline;
 	id<MTLComputePipelineState> computePipeline;
 	id<MTLRenderPipelineState> renderPipeline;

@ -53,9 +52,10 @@ typedef struct mg_mtl_canvas_backend
 	id<MTLBuffer> shapeBuffer[MG_MTL_MAX_BUFFER_AVAILABLE];
 	id<MTLBuffer> vertexBuffer[MG_MTL_MAX_BUFFER_AVAILABLE];
 	id<MTLBuffer> indexBuffer[MG_MTL_MAX_BUFFER_AVAILABLE];
-	id<MTLBuffer> tileCounters;
-	id<MTLBuffer> tileArrayBuffer;
+	id<MTLBuffer> shapeQueueBuffer;
 	id<MTLBuffer> triangleArray;
+	id<MTLBuffer> arenaBuffer;
+	id<MTLBuffer> arenaOffset;

 } mg_mtl_canvas_backend;

@ -212,15 +212,33 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
 		int triangleCount = indexCount/3;

 		//-----------------------------------------------------------
-		//NOTE(martin): encode the clear counter
+		//NOTE(martin): encode the clear arena offset
 		//-----------------------------------------------------------
 		id<MTLBlitCommandEncoder> blitEncoder = [surface->commandBuffer blitCommandEncoder];
-		blitEncoder.label = @"clear counters";
-		[blitEncoder fillBuffer: backend->tileCounters range: NSMakeRange(0, RENDERER_MAX_TILES*sizeof(uint)) value: 0];
+		blitEncoder.label = @"clear arena";
+		[blitEncoder fillBuffer: backend->arenaOffset range: NSMakeRange(0, sizeof(int)) value: 0];
 		[blitEncoder endEncoding];

 		//-----------------------------------------------------------
-		//NOTE(martin): encode the triangle prepass
+		//NOTE(martin): encode the shape setup pass
+		//-----------------------------------------------------------
+		id<MTLComputeCommandEncoder> shapeEncoder = [surface->commandBuffer computeCommandEncoder];
+		shapeEncoder.label = @"shape pass";
+		[shapeEncoder setComputePipelineState: backend->shapePipeline];
+		[shapeEncoder setBuffer: backend->shapeBuffer[backend->bufferIndex] offset:backend->shapeBufferOffset atIndex: 0];
+		[shapeEncoder setBuffer: backend->shapeQueueBuffer offset:0 atIndex: 1];
+		[shapeEncoder setBuffer: backend->arenaBuffer offset:0 atIndex: 2];
+		[shapeEncoder setBuffer: backend->arenaOffset offset:0 atIndex: 3];
+		[shapeEncoder setBytes: &scale length: sizeof(float) atIndex: 4];
+
+		MTLSize shapeGroupSize = MTLSizeMake(backend->shapePipeline.maxTotalThreadsPerThreadgroup, 1, 1);
+		MTLSize shapeGridSize = MTLSizeMake(shapeCount, 1, 1);
+
+		[shapeEncoder dispatchThreads: shapeGridSize threadsPerThreadgroup: shapeGroupSize];
+		[shapeEncoder endEncoding];
+
+		//-----------------------------------------------------------
+		//NOTE(martin): encode the triangle setup and binning
 		//-----------------------------------------------------------
 		id<MTLComputeCommandEncoder> triangleEncoder = [surface->commandBuffer computeCommandEncoder];
 		triangleEncoder.label = @"triangle pass";
@ -229,8 +247,11 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
 		[triangleEncoder setBuffer: backend->indexBuffer[backend->bufferIndex] offset:backend->indexBufferOffset atIndex: 1];
 		[triangleEncoder setBuffer: backend->shapeBuffer[backend->bufferIndex] offset:backend->shapeBufferOffset atIndex: 2];
 		[triangleEncoder setBuffer: backend->triangleArray offset:0 atIndex: 3];
+		[triangleEncoder setBuffer: backend->shapeQueueBuffer offset:0 atIndex: 4];
+		[triangleEncoder setBuffer: backend->arenaBuffer offset:0 atIndex: 5];
+		[triangleEncoder setBuffer: backend->arenaOffset offset:0 atIndex: 6];

-		[triangleEncoder setBytes: &scale length: sizeof(float) atIndex: 4];
+		[triangleEncoder setBytes: &scale length: sizeof(float) atIndex: 7];

 		MTLSize triangleGroupSize = MTLSizeMake(backend->trianglePipeline.maxTotalThreadsPerThreadgroup, 1, 1);
 		MTLSize triangleGridSize = MTLSizeMake(triangleCount, 1, 1);
@ -238,56 +259,15 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
 		[triangleEncoder dispatchThreads: triangleGridSize threadsPerThreadgroup: triangleGroupSize];
 		[triangleEncoder endEncoding];

-		//-----------------------------------------------------------
-		//NOTE(martin): encode the tiling pass
-		//-----------------------------------------------------------
-
-		id<MTLComputeCommandEncoder> tileEncoder = [surface->commandBuffer computeCommandEncoder];
-		tileEncoder.label = @"tiling pass";
-		[tileEncoder setComputePipelineState: backend->tilingPipeline];
-		[tileEncoder setBuffer: backend->triangleArray offset:0 atIndex: 0];
-		[tileEncoder setBuffer: backend->tileCounters offset:0 atIndex: 1];
-		[tileEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 2];
-
-		[tileEncoder setBytes: &triangleCount length:sizeof(int) atIndex: 3];
-		[tileEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 4];
-		[tileEncoder setBytes: &scale length: sizeof(float) atIndex: 5];
-
-		MTLSize tileGroupSize = MTLSizeMake(1, 1, 16);
-		MTLSize tileGridSize = MTLSizeMake(nTilesX, nTilesY, 16);
-
-		[tileEncoder dispatchThreads: tileGridSize threadsPerThreadgroup: tileGroupSize];
-		[tileEncoder endEncoding];
-
-		//-----------------------------------------------------------
-		//NOTE(martin): encode the sorting pass
-		//-----------------------------------------------------------
-/*
-		id<MTLComputeCommandEncoder> sortEncoder = [surface->commandBuffer computeCommandEncoder];
-		sortEncoder.label = @"sorting pass";
-		[sortEncoder setComputePipelineState: backend->sortingPipeline];
-		[sortEncoder setBuffer: backend->triangleArray offset:0 atIndex: 0];
-		[sortEncoder setBuffer: backend->tileCounters offset:0 atIndex: 1];
-		[sortEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 2];
-
-		u32     nTilesX = (viewportSize.x + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE;
-		u32     nTilesY = (viewportSize.y + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE;
-
-		MTLSize sortGroupSize = MTLSizeMake(backend->sortingPipeline.maxTotalThreadsPerThreadgroup, 1, 1);
-		MTLSize sortGridSize = MTLSizeMake(nTilesX*nTilesY, 1, 1);
-
-		[sortEncoder dispatchThreads: sortGridSize threadsPerThreadgroup: sortGroupSize];
-		[sortEncoder endEncoding];
-*/
 		//-----------------------------------------------------------
 		//NOTE(martin): encode drawing pass
 		//-----------------------------------------------------------
 		id<MTLComputeCommandEncoder> drawEncoder = [surface->commandBuffer computeCommandEncoder];
 		drawEncoder.label = @"drawing pass";
 		[drawEncoder setComputePipelineState:backend->computePipeline];
-		[drawEncoder setBuffer: backend->tileCounters offset:0 atIndex: 0];
-		[drawEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 1];
-		[drawEncoder setBuffer: backend->triangleArray offset:0 atIndex: 2];
+		[drawEncoder setBuffer: backend->shapeQueueBuffer offset:0 atIndex: 0];
+		[drawEncoder setBuffer: backend->triangleArray offset:0 atIndex: 1];
+		[drawEncoder setBuffer: backend->arenaBuffer offset:0 atIndex: 2];

 		[drawEncoder setTexture: backend->outTexture atIndex: 0];
 		int useTexture = 0;
@ -298,8 +278,9 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
 			useTexture = 1;
 		}

-		[drawEncoder setBytes: &useTexture length:sizeof(int) atIndex: 3];
-		[drawEncoder setBytes: &scale length: sizeof(float) atIndex: 4];
+		[drawEncoder setBytes: &shapeCount length:sizeof(int) atIndex: 3];
+		[drawEncoder setBytes: &useTexture length:sizeof(int) atIndex: 4];
+		[drawEncoder setBytes: &scale length: sizeof(float) atIndex: 5];

 		//TODO: check that we don't exceed maxTotalThreadsPerThreadgroup
 		DEBUG_ASSERT(RENDERER_TILE_SIZE*RENDERER_TILE_SIZE <= backend->computePipeline.maxTotalThreadsPerThreadgroup);
@ -391,8 +372,13 @@ void mg_mtl_canvas_destroy(mg_canvas_backend* interface)
 		}
 		//NOTE: semaphore does not have a destructor?

-		[backend->tileArrayBuffer release];
+		[backend->shapeQueueBuffer release];
 		[backend->triangleArray release];
+		[backend->arenaBuffer release];
+		[backend->arenaOffset release];
+
+		//////////////////////////////////////////
+		//TODO release all pipelines
 		[backend->computePipeline release];
 	}
 }
@ -524,18 +510,18 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
 		                                        		options: bufferOptions];
 			}

-			backend->tileArrayBuffer = [metalSurface->device newBufferWithLength: RENDERER_TILE_BUFFER_SIZE*sizeof(int)*RENDERER_MAX_TILES
-								options: MTLResourceStorageModePrivate];
-
 			backend->triangleArray = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_triangle_data)
 								options: MTLResourceStorageModePrivate];

-			//TODO(martin): retain ?
-			//-----------------------------------------------------------
-			//NOTE(martin): create and initialize tile counters
-			//-----------------------------------------------------------
-			backend->tileCounters = [metalSurface->device newBufferWithLength: RENDERER_MAX_TILES*sizeof(uint)
-		                                         	options: MTLResourceStorageModePrivate];
+			backend->shapeQueueBuffer = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_shape_queue)
+								options: MTLResourceStorageModePrivate];
+
+			backend->arenaBuffer = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_queue_elt)
+								options: MTLResourceStorageModePrivate];
+
+			backend->arenaOffset = [metalSurface->device newBufferWithLength: sizeof(int)
+								options: MTLResourceStorageModePrivate];
+
 			//-----------------------------------------------------------
 			//NOTE(martin): load the library
 			//-----------------------------------------------------------
@ -551,9 +537,8 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
 				LOG_ERROR("error : %s\n", errStr);
 				return(0);
 			}
+			id<MTLFunction> shapeFunction = [library newFunctionWithName:@"ShapeSetup"];
 			id<MTLFunction> triangleFunction = [library newFunctionWithName:@"TriangleKernel"];
-			id<MTLFunction> tilingFunction = [library newFunctionWithName:@"TileKernel"];
-			id<MTLFunction> sortingFunction = [library newFunctionWithName:@"SortKernel"];
 			id<MTLFunction> computeFunction = [library newFunctionWithName:@"RenderKernel"];
 			id<MTLFunction> vertexFunction = [library newFunctionWithName:@"VertexShader"];
 			id<MTLFunction> fragmentFunction = [library newFunctionWithName:@"FragmentShader"];
@ -566,6 +551,14 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
 		                                                                           	error:&error];
 			ASSERT(backend->computePipeline);

+			MTLComputePipelineDescriptor* shapePipelineDesc = [[MTLComputePipelineDescriptor alloc] init];
+			shapePipelineDesc.computeFunction = shapeFunction;
+
+			backend->shapePipeline = [metalSurface->device newComputePipelineStateWithDescriptor: shapePipelineDesc
+		                                           	options: MTLPipelineOptionNone
+		                                           	reflection: nil
+		                                           	error: &error];
+
 			MTLComputePipelineDescriptor* trianglePipelineDesc = [[MTLComputePipelineDescriptor alloc] init];
 			trianglePipelineDesc.computeFunction = triangleFunction;

@ -574,22 +567,6 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
 		                                           	reflection: nil
 		                                           	error: &error];

-			MTLComputePipelineDescriptor* tilingPipelineDesc = [[MTLComputePipelineDescriptor alloc] init];
-			tilingPipelineDesc.computeFunction = tilingFunction;
-
-			backend->tilingPipeline = [metalSurface->device newComputePipelineStateWithDescriptor: tilingPipelineDesc
-		                                           	options: MTLPipelineOptionNone
-		                                           	reflection: nil
-		                                           	error: &error];
-
-			MTLComputePipelineDescriptor* sortingPipelineDesc = [[MTLComputePipelineDescriptor alloc] init];
-			sortingPipelineDesc.computeFunction = sortingFunction;
-
-			backend->sortingPipeline = [metalSurface->device newComputePipelineStateWithDescriptor: sortingPipelineDesc
-		                                           	options: MTLPipelineOptionNone
-		                                           	reflection: nil
-		                                           	error: &error];
-
 			//-----------------------------------------------------------
 			//NOTE(martin): setup our render pipeline state
 			//-----------------------------------------------------------
--- a/src/mtl_shader.h
+++ b/src/mtl_shader.h
@ -59,4 +59,32 @@ typedef struct mg_triangle_data

 } mg_triangle_data;

+#ifndef __METAL_VERSION__
+#define device
+#else
+using namespace metal;
+#endif
+
+typedef struct mg_tile_elt
+{
+	int triangleIndex;
+	int next;
+} mg_queue_elt;
+
+typedef struct mg_tile_queue
+{
+	atomic_int first;
+} mg_tile_queue;
+
+typedef struct mg_shape_queue
+{
+	vector_int4 area;
+	device mg_tile_queue* tileQueues;
+} mg_shape_queue;
+
+#ifndef __METAL_VERSION__
+#undef device
+#endif
+
+
 #endif //__MTL_RENDERER_H_
--- a/src/mtl_shader.metal
+++ b/src/mtl_shader.metal
@ -46,13 +46,60 @@ int orient2d(int2 a, int2 b, int2 c)
 	return((b.x-a.x)*(c.y-a.y) - (b.y-a.y)*(c.x-a.x));
 }

+device uchar* arena_allocate(device uchar* arenaBuffer,
+                             device volatile atomic_uint* arenaOffset,
+                             uint size)
+{
+	uint index = atomic_fetch_add_explicit(arenaOffset, size, memory_order_relaxed);
+	return(&arenaBuffer[index]);
+}
+
+//NOTE: shape setup allocates tile queues for each shape
+
+kernel void ShapeSetup(constant mg_shape* shapeBuffer [[buffer(0)]],
+                       device mg_shape_queue* shapeQueueBuffer [[buffer(1)]],
+                       device uchar* arenaBuffer [[buffer(2)]],
+                       device volatile atomic_uint* arenaOffset [[buffer(3)]],
+                       constant float* scaling [[buffer(4)]],
+                       uint gid [[thread_position_in_grid]])
+{
+
+	float4 box = shapeBuffer[gid].clip * scaling[0];
+
+	int2 firstTile = int2(box.xy)/RENDERER_TILE_SIZE;
+
+	//WARN: the following can result in a 1x1 tile allocated even for empty boxes. But if we didn't allocate
+	//      any tile queue, the tileQueues pointer for that shape would alias the tileQueues pointer of another
+	//      shape, and we would have to detect that in the tiling and drawing kernels. Instead, just accept some
+	//      waste and keep the other kernels more uniforms for now...
+	int nTilesX = int(box.z)/RENDERER_TILE_SIZE - firstTile.x + 1;
+	int nTilesY = int(box.w)/RENDERER_TILE_SIZE - firstTile.y + 1;
+
+	int tileCount = nTilesX * nTilesY;
+	int tileArraySize =  tileCount * sizeof(mg_tile_queue);
+
+	shapeQueueBuffer[gid].area = int4(firstTile.x, firstTile.y, nTilesX, nTilesY);
+	shapeQueueBuffer[gid].tileQueues = (device mg_tile_queue*)arena_allocate(arenaBuffer, arenaOffset, tileArraySize);
+
+	for(int i=0; i<tileCount; i++)
+	{
+		atomic_store_explicit(&shapeQueueBuffer[gid].tileQueues[i].first, -1, memory_order_relaxed);
+	}
+}
+
+//NOTE: setup triangle data and bucket triangle into tile queues
+
 kernel void TriangleKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
 		                   constant uint* indexBuffer [[buffer(1)]],
 		                   constant mg_shape* shapeBuffer [[buffer(2)]],
                           device mg_triangle_data* triangleArray [[buffer(3)]],
-                           constant float* scaling [[buffer(4)]],
+                           device mg_shape_queue* shapeQueueBuffer [[buffer(4)]],
+                           device uchar* arenaBuffer [[buffer(5)]],
+                           device volatile atomic_uint* arenaOffset [[buffer(6)]],
+                           constant float* scaling [[buffer(7)]],
                           uint gid [[thread_position_in_grid]])
 {
+	//NOTE: triangle setup
 	uint triangleIndex = gid * 3;

 	uint i0 = indexBuffer[triangleIndex];
@ -95,7 +142,6 @@ kernel void TriangleKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
 	triangleArray[gid].p1 = ip1;
 	triangleArray[gid].p2 = ip2;

-	//NOTE(martin): compute triangle orientation and bias for each edge
 	int cw = orient2d(ip0, ip1, ip2) > 0 ? 1 : -1;

 	triangleArray[gid].cw = cw;
@ -103,92 +149,45 @@ kernel void TriangleKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
 	triangleArray[gid].bias1 = is_top_left(p2, p0) ? -(1-cw)/2 : -(1+cw)/2;
 	triangleArray[gid].bias2 = is_top_left(p0, p1) ? -(1-cw)/2 : -(1+cw)/2;

-	triangleArray[gid].tileBox = int4(fbox)/RENDERER_TILE_SIZE;
-}
+	int4 tileBox = int4(fbox)/RENDERER_TILE_SIZE;
+	triangleArray[gid].tileBox = tileBox;

-kernel void TileKernel(const device mg_triangle_data* triangleArray [[buffer(0)]],
-                       device uint* tileCounters [[buffer(1)]],
-                       device uint* tileArrayBuffer [[buffer(2)]],
-                       constant int* triangleCount [[buffer(3)]],
-                       constant uint2* viewport [[buffer(4)]],
-                       constant float* scaling [[buffer(5)]],
-                       uint3 gid [[thread_position_in_grid]])
-{
-	uint2 tilesMatrixDim = (*viewport - 1) / RENDERER_TILE_SIZE + 1;
-	int nTilesX = tilesMatrixDim.x;

-	int tileX = gid.x;
-	int tileY = gid.y;
-	int tileIndex = tileY * nTilesX + tileX;
-	int groupIndex = gid.z;
+	//NOTE: bucket triangle into tiles
+	device mg_shape_queue* shapeQueue = &shapeQueueBuffer[shapeIndex];

-	const int groupSize = 16;
-	int count = 0;
-	int mask = 0xffff>>(16-groupIndex);
+	int xMin = max(0, tileBox.x - shapeQueue->area.x);
+	int yMin = max(0, tileBox.y - shapeQueue->area.y);
+	int xMax = min(tileBox.z - shapeQueue->area.x, shapeQueue->area.z-1);
+	int yMax = min(tileBox.w - shapeQueue->area.y, shapeQueue->area.w-1);

-	for(int triangleBatchIndex=0; triangleBatchIndex<triangleCount[0]; triangleBatchIndex += groupSize)
+	//NOTE(martin): it's important to do the computation with signed int, so that we can have negative xMax/yMax
+	//              otherwise all triangles on the left or below the x/y axis are attributed to tiles on row/column 0.
+
+	for(int y = yMin; y <= yMax; y++)
 	{
-		int triangleIndex = triangleBatchIndex + groupIndex;
-		bool active = false;
-//		if(triangleIndex + groupIndex < triangleCount[0])
+		for(int x = xMin ; x <= xMax; x++)
 		{
-			int4 box = triangleArray[triangleIndex].tileBox;
-/*
-			if(  tileX >= box.x && tileX <= box.z
-		  	&& tileY >= box.y && tileY <= box.w)
-			{
-				active = true;
-			}
-			*/
-		}
+			int tileIndex = y*shapeQueue->area.z + x;

-		int vote = uint64_t(simd_ballot(active));
-		if(active)
-		{
-			int batchOffset = popcount(vote & mask);
-			tileArrayBuffer[tileIndex*RENDERER_TILE_BUFFER_SIZE + count + batchOffset] = triangleIndex;
-		}
-		count += popcount(vote);
-	}
-	if(groupIndex == 0)
-	{
-		tileCounters[tileIndex] = count;
-	}
-}
+			device mg_tile_queue* tileQueue = &shapeQueue->tileQueues[tileIndex];
+			device mg_queue_elt* elt = (device mg_queue_elt*)arena_allocate(arenaBuffer, arenaOffset, sizeof(mg_queue_elt));
+			int eltIndex = (device uchar*)elt - arenaBuffer;

-kernel void SortKernel(constant mg_triangle_data* triangleArray [[buffer(0)]],
-                       const device uint* tileCounters [[buffer(1)]],
-                       device uint* tileArrayBuffer [[buffer(2)]],
-                       uint gid [[thread_position_in_grid]])
-{
-	uint tileIndex = gid;
-	uint tileArrayOffset = tileIndex * RENDERER_TILE_BUFFER_SIZE;
-	uint tileArrayCount = min(tileCounters[tileIndex], (uint)RENDERER_TILE_BUFFER_SIZE);
+			elt->next = atomic_exchange_explicit(&tileQueue->first, eltIndex, memory_order_relaxed);

-	for(uint tileArrayIndex=1; tileArrayIndex < tileArrayCount; tileArrayIndex++)
-	{
-		for(uint sortIndex = tileArrayIndex; sortIndex > 0; sortIndex--)
-		{
-			int shapeIndex = triangleArray[tileArrayBuffer[tileArrayOffset + sortIndex]].shapeIndex;
-			int prevShapeIndex = triangleArray[tileArrayBuffer[tileArrayOffset + sortIndex - 1]].shapeIndex;
-
-			if(shapeIndex >= prevShapeIndex)
-			{
-				break;
-			}
-			uint tmp = tileArrayBuffer[tileArrayOffset + sortIndex];
-			tileArrayBuffer[tileArrayOffset + sortIndex] = tileArrayBuffer[tileArrayOffset + sortIndex - 1];
-			tileArrayBuffer[tileArrayOffset + sortIndex - 1] = tmp;
+			elt->triangleIndex = gid;
 		}
 	}
 }

-kernel void RenderKernel(const device uint* tileCounters [[buffer(0)]],
-                         const device uint* tileArrayBuffer [[buffer(1)]],
-                         const device mg_triangle_data* triangleArray [[buffer(2)]],
+kernel void RenderKernel(const device mg_shape_queue* shapeQueueBuffer [[buffer(0)]],
+                         const device mg_triangle_data* triangleArray [[buffer(1)]],
+                         const device uchar* arenaBuffer [[buffer(2)]],

-                         constant int* useTexture [[buffer(3)]],
-                         constant float* scaling [[buffer(4)]],
+                         constant int* shapeCount [[buffer(3)]],
+                         constant int* useTexture [[buffer(4)]],
+                         constant float* scaling [[buffer(5)]],

                         texture2d<float, access::write> outTexture [[texture(0)]],
                         texture2d<float> texAtlas [[texture(1)]],
@ -200,37 +199,7 @@ kernel void RenderKernel(const device uint* tileCounters [[buffer(0)]],
 {
 	//TODO: guard against thread group size not equal to tile size?
 	const int2 pixelCoord = int2(gid);
-	const uint2 tileCoord = uint2(pixelCoord)/ RENDERER_TILE_SIZE;
-	const uint2 tilesMatrixDim = (gridSize - 1) / RENDERER_TILE_SIZE + 1;
-	const uint tileIndex = tileCoord.y * tilesMatrixDim.x + tileCoord.x;
-	const uint tileCounter = min(tileCounters[tileIndex], (uint)RENDERER_TILE_BUFFER_SIZE);
-
-#ifdef RENDERER_DEBUG_TILES
-	//NOTE(martin): color code debug values and show the tile grid
-	{
-		float4 fragColor = float4(0);
-
-		if( pixelCoord.x % 16 == 0
-	  	  ||pixelCoord.y % 16 == 0)
-		{
-			fragColor = float4(0, 0, 0, 1);
-		}
-		else if(tileCounters[tileIndex] == 0xffffu)
-		{
-			fragColor = float4(1, 0, 1, 1);
-		}
-		else if(tileCounter != 0u)
-		{
-			fragColor = float4(0, 1, 0, 1);
-		}
-		else
-		{
-			fragColor = float4(1, 0, 0, 1);
-		}
-		outTexture.write(fragColor, gid);
-		return;
-	}
-#endif
+	const int2 tileCoord = pixelCoord/ RENDERER_TILE_SIZE;

 	const int subPixelFactor = 16;
 	const int2 centerPoint = int2((float2(pixelCoord) + float2(0.5, 0.5)) * subPixelFactor);
@ -258,89 +227,108 @@ kernel void RenderKernel(const device uint* tileCounters [[buffer(0)]],
 		currentColor[i] = float4(0, 0, 0, 0);
    }

-    for(uint tileArrayIndex=0; tileArrayIndex < tileCounter; tileArrayIndex++)
+    for(int shapeIndex = 0; shapeIndex < shapeCount[0]; shapeIndex++)
    {
-    	int triangleIndex = tileArrayBuffer[RENDERER_TILE_BUFFER_SIZE * tileIndex + tileArrayIndex];
-		const device mg_triangle_data* triangle = &triangleArray[triangleIndex];
+		const device mg_shape_queue* shapeQueue = &shapeQueueBuffer[shapeIndex];

-		int2 p0 = triangle->p0;
-		int2 p1 = triangle->p1;
-		int2 p2 = triangle->p2;
-
-		int cw = triangle->cw;
-
-		int bias0 = triangle->bias0;
-		int bias1 = triangle->bias1;
-		int bias2 = triangle->bias2;
-
-		float4 cubic0 = triangle->cubic0;
-		float4 cubic1 = triangle->cubic1;
-		float4 cubic2 = triangle->cubic2;
-
-		int shapeIndex = triangle->shapeIndex;
-		float4 color = triangle->color;
-		color.rgb *= color.a;
-
-		int4 clip = triangle->box;
-
-		matrix_float3x3 uvTransform = triangle->uvTransform;
-
-		for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
+		// get the tile queue that corresponds to our tile in the shape area
+		int2 tileQueueCoord = tileCoord - shapeQueue->area.xy;
+		if(  tileQueueCoord.x >= 0
+		  && tileQueueCoord.y >= 0
+		  && tileQueueCoord.x < shapeQueue->area.z
+		  && tileQueueCoord.y < shapeQueue->area.w)
 		{
-			int2 samplePoint = samplePoints[sampleIndex];
+			int tileQueueIndex = tileQueueCoord.y * shapeQueue->area.z + tileQueueCoord.x;
+			device mg_tile_queue* tileQueue = &shapeQueue->tileQueues[tileQueueIndex];

-			if(  samplePoint.x < clip.x
-			  || samplePoint.x > clip.z
-			  || samplePoint.y < clip.y
-			  || samplePoint.y > clip.w)
+			int firstEltIndex = atomic_load_explicit(&tileQueue->first, memory_order_relaxed);
+			device mg_queue_elt* elt = 0;
+
+			for(int eltIndex = firstEltIndex; eltIndex >= 0; eltIndex = elt->next)
 			{
-				continue;
-			}
+				elt = (device mg_queue_elt*)(arenaBuffer + eltIndex);
+				const device mg_triangle_data* triangle = &triangleArray[elt->triangleIndex];

-			int w0 = cw*orient2d(p1, p2, samplePoint);
-			int w1 = cw*orient2d(p2, p0, samplePoint);
-			int w2 = cw*orient2d(p0, p1, samplePoint);
+				int2 p0 = triangle->p0;
+				int2 p1 = triangle->p1;
+				int2 p2 = triangle->p2;

-			if((w0+bias0) >= 0 && (w1+bias1) >= 0 && (w2+bias2) >= 0)
-			{
-				float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2);
+				int cw = triangle->cw;

-				float eps = 0.0001;
-				if(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= eps)
+				int bias0 = triangle->bias0;
+				int bias1 = triangle->bias1;
+				int bias2 = triangle->bias2;
+
+				float4 cubic0 = triangle->cubic0;
+				float4 cubic1 = triangle->cubic1;
+				float4 cubic2 = triangle->cubic2;
+
+				int shapeIndex = triangle->shapeIndex;
+				float4 color = triangle->color;
+				color.rgb *= color.a;
+
+				int4 clip = triangle->box;
+
+				matrix_float3x3 uvTransform = triangle->uvTransform;
+
+				for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
 				{
-					if(shapeIndex == currentShapeIndex[sampleIndex])
+					int2 samplePoint = samplePoints[sampleIndex];
+
+					if(  samplePoint.x < clip.x
+			  		|| samplePoint.x > clip.z
+			  		|| samplePoint.y < clip.y
+			  		|| samplePoint.y > clip.w)
 					{
-						flipCount[sampleIndex]++;
+						continue;
 					}
-					else
+
+					int w0 = cw*orient2d(p1, p2, samplePoint);
+					int w1 = cw*orient2d(p2, p0, samplePoint);
+					int w2 = cw*orient2d(p0, p1, samplePoint);
+
+					if((w0+bias0) >= 0 && (w1+bias1) >= 0 && (w2+bias2) >= 0)
 					{
-						if(flipCount[sampleIndex] & 0x01)
+						float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2);
+
+						float eps = 0.0001;
+						if(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= eps)
 						{
-							sampleColor[sampleIndex] = currentColor[sampleIndex];
+							if(shapeIndex == currentShapeIndex[sampleIndex])
+							{
+								flipCount[sampleIndex]++;
+							}
+							else
+							{
+								if(flipCount[sampleIndex] & 0x01)
+								{
+									sampleColor[sampleIndex] = currentColor[sampleIndex];
+								}
+
+								float4 nextColor = color;
+
+								if(useTexture[0])
+								{
+									float3 sampleFP = float3(float2(samplePoint).xy/(subPixelFactor*2.), 1);
+									float2 uv = (uvTransform * sampleFP).xy;
+
+									constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear);
+									float4 texColor = texAtlas.sample(smp, uv);
+
+									texColor.rgb *= texColor.a;
+									nextColor *= texColor;
+								}
+
+								currentColor[sampleIndex] = sampleColor[sampleIndex]*(1.-nextColor.a) + nextColor;
+								currentShapeIndex[sampleIndex] = shapeIndex;
+								flipCount[sampleIndex] = 1;
+							}
 						}
-
-						float4 nextColor = color;
-
-						if(useTexture[0])
-						{
-							float3 sampleFP = float3(float2(samplePoint).xy/(subPixelFactor*2.), 1);
-							float2 uv = (uvTransform * sampleFP).xy;
-
-							constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear);
-							float4 texColor = texAtlas.sample(smp, uv);
-
-							texColor.rgb *= texColor.a;
-							nextColor *= texColor;
-						}
-
-						currentColor[sampleIndex] = sampleColor[sampleIndex]*(1.-nextColor.a) + nextColor;
-						currentShapeIndex[sampleIndex] = shapeIndex;
-						flipCount[sampleIndex] = 1;
 					}
 				}
 			}
 		}
-    }
+	}

    float4 pixelColor = float4(0);
    for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
@ -353,4 +341,5 @@ kernel void RenderKernel(const device uint* tileCounters [[buffer(0)]],
 	}

 	outTexture.write(pixelColor/float(sampleCount), gid);
+
 }