[mtl canvas] Tiling per rectangles into per-shape tile queues (linked lists of triangle indices) to avoid sorting

2023-03-20 16:46:12 +01:00 · 2023-03-20 16:46:12 +01:00 · a4ef58f2d8
parent 11113f597c
commit a4ef58f2d8
6 changed files with 304 additions and 262 deletions
--- a/examples/canvas/main.c
+++ b/examples/canvas/main.c
@ -162,6 +162,7 @@ int main()
 			// head
 			mg_set_color_rgba(1, 1, 0, 1);
 			mg_circle_fill(x, y, 200);
 			// smile
--- a/examples/tiger/main.c
+++ b/examples/tiger/main.c
@ -82,6 +82,11 @@ int main()
 	mp_window_bring_to_front(window);
 	mp_window_focus(window);
 	bool tracked = false;
 	vec2 trackPoint = {0};
 	f32 zoom = 1;
 	f32 startX = 0, startY = 0;
 	f64 frameTime = 0;
 	while(!mp_should_quit())
@ -99,8 +104,35 @@ int main()
 					mp_request_quit();
 				} break;
-				case MP_EVENT_KEYBOARD_KEY:
+				case MP_EVENT_MOUSE_BUTTON:
 				{
 					if(event.key.code == MP_MOUSE_LEFT)
 					{
 						if(event.key.action == MP_KEY_PRESS)
 						{
 							tracked = true;
 							vec2 mousePos = mp_mouse_position();
 							trackPoint.x = mousePos.x/zoom - startX;
 							trackPoint.y = mousePos.y/zoom - startY;
 						}
 						else
 						{
 							tracked = false;
 						}
 					}
 				} break;
 				case MP_EVENT_MOUSE_WHEEL:
 				{
 					vec2 mousePos = mp_mouse_position();
 					f32 trackX = mousePos.x/zoom - startX;
 					f32 trackY = mousePos.y/zoom - startY;
 					zoom *= 1 + event.move.deltaY * 0.01;
 					zoom = Clamp(zoom, 0.2, 10);
 					startX = mousePos.x/zoom - trackX;
 					startY = mousePos.y/zoom - trackY;
 				} break;
 				default:
@ -108,13 +140,20 @@ int main()
 			}
 		}
 		if(tracked)
 		{
 			vec2 mousePos = mp_mouse_position();
 			startX = mousePos.x/zoom - trackPoint.x;
 			startY = mousePos.y/zoom - trackPoint.y;
 		}
 		mg_surface_prepare(surface);
 		mg_set_color_rgba(1, 0, 1, 1);
 		mg_clear();
-		mg_matrix_push((mg_mat2x3){1, 0, 300,
+		mg_matrix_push((mg_mat2x3){zoom, 0, 300+startX*zoom,
-		                           0, 1, 200});
+		                           0, zoom, 200+startY*zoom});
 		draw_tiger();
--- a/src/graphics.c
+++ b/src/graphics.c
@ -208,6 +208,7 @@ typedef struct mg_canvas_data
 	mp_rect srcRegion;
 	vec4 shapeExtents;
 	vec4 shapeScreenExtents;
 	u32 nextShapeIndex;
 	u32 vertexCount;
 	u32 indexCount;
@ -789,6 +790,13 @@ void mg_finalize_shape(mg_canvas_data* canvas)
 		mg_vertex_layout* layout = &canvas->backend->vertexLayout;
 		*(mg_mat2x3*)(layout->uvTransformBuffer + index*layout->uvTransformStride) = uvTransform;
 		//TODO: transform extents before clipping
 		mp_rect clip = {maximum(canvas->clip.x, canvas->shapeScreenExtents.x),
 		                maximum(canvas->clip.y, canvas->shapeScreenExtents.y),
 		                minimum(canvas->clip.x + canvas->clip.w, canvas->shapeScreenExtents.z),
 		                minimum(canvas->clip.y + canvas->clip.h, canvas->shapeScreenExtents.w)};
 		*(mp_rect*)(((char*)layout->clipBuffer) + index*layout->clipStride) = clip;
 	}
 }
@ -799,17 +807,12 @@ u32 mg_next_shape(mg_canvas_data* canvas, mg_attributes* attributes)
 	canvas->transform = attributes->transform;
 	canvas->srcRegion = attributes->srcRegion;
 	canvas->shapeExtents = (vec4){FLT_MAX, FLT_MAX, -FLT_MAX, -FLT_MAX};
 	canvas->shapeScreenExtents = (vec4){FLT_MAX, FLT_MAX, -FLT_MAX, -FLT_MAX};
 	mg_vertex_layout* layout = &canvas->backend->vertexLayout;
 	int index = canvas->nextShapeIndex;
 	canvas->nextShapeIndex++;
 	mp_rect clip = {canvas->clip.x,
 	                canvas->clip.y,
 	                canvas->clip.x + canvas->clip.w,
 	                canvas->clip.y + canvas->clip.h};
 	*(mp_rect*)(((char*)layout->clipBuffer) + index*layout->clipStride) = clip;
 	*(mg_color*)(((char*)layout->colorBuffer) + index*layout->colorStride) = attributes->color;
 	return(index);
@ -842,6 +845,11 @@ void mg_push_vertex_cubic(mg_canvas_data* canvas, vec2 pos, vec4 cubic)
 	vec2 screenPos = mg_mat2x3_mul(canvas->transform, pos);
 	canvas->shapeScreenExtents.x = minimum(canvas->shapeScreenExtents.x, screenPos.x);
 	canvas->shapeScreenExtents.y = minimum(canvas->shapeScreenExtents.y, screenPos.y);
 	canvas->shapeScreenExtents.z = maximum(canvas->shapeScreenExtents.z, screenPos.x);
 	canvas->shapeScreenExtents.w = maximum(canvas->shapeScreenExtents.w, screenPos.y);
 	mg_vertex_layout* layout = &canvas->backend->vertexLayout;
 	ASSERT(canvas->vertexCount < layout->maxVertexCount);
 	ASSERT(canvas->nextShapeIndex > 0);
--- a/src/mtl_canvas.m
+++ b/src/mtl_canvas.m
@ -34,9 +34,8 @@ typedef struct mg_mtl_canvas_backend
 	mg_color clearColor;
 	// permanent metal resources
 	id<MTLComputePipelineState> shapePipeline;
 	id<MTLComputePipelineState> trianglePipeline;
 	id<MTLComputePipelineState> tilingPipeline;
 	id<MTLComputePipelineState> sortingPipeline;
 	id<MTLComputePipelineState> computePipeline;
 	id<MTLRenderPipelineState> renderPipeline;
@ -53,9 +52,10 @@ typedef struct mg_mtl_canvas_backend
 	id<MTLBuffer> shapeBuffer[MG_MTL_MAX_BUFFER_AVAILABLE];
 	id<MTLBuffer> vertexBuffer[MG_MTL_MAX_BUFFER_AVAILABLE];
 	id<MTLBuffer> indexBuffer[MG_MTL_MAX_BUFFER_AVAILABLE];
-	id<MTLBuffer> tileCounters;
+	id<MTLBuffer> shapeQueueBuffer;
 	id<MTLBuffer> tileArrayBuffer;
 	id<MTLBuffer> triangleArray;
 	id<MTLBuffer> arenaBuffer;
 	id<MTLBuffer> arenaOffset;
 } mg_mtl_canvas_backend;
@ -212,15 +212,33 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
 		int triangleCount = indexCount/3;
 		//-----------------------------------------------------------
-		//NOTE(martin): encode the clear counter
+		//NOTE(martin): encode the clear arena offset
 		//-----------------------------------------------------------
 		id<MTLBlitCommandEncoder> blitEncoder = [surface->commandBuffer blitCommandEncoder];
-		blitEncoder.label = @"clear counters";
+		blitEncoder.label = @"clear arena";
-		[blitEncoder fillBuffer: backend->tileCounters range: NSMakeRange(0, RENDERER_MAX_TILES*sizeof(uint)) value: 0];
+		[blitEncoder fillBuffer: backend->arenaOffset range: NSMakeRange(0, sizeof(int)) value: 0];
 		[blitEncoder endEncoding];
 		//-----------------------------------------------------------
-		//NOTE(martin): encode the triangle prepass
+		//NOTE(martin): encode the shape setup pass
 		//-----------------------------------------------------------
 		id<MTLComputeCommandEncoder> shapeEncoder = [surface->commandBuffer computeCommandEncoder];
 		shapeEncoder.label = @"shape pass";
 		[shapeEncoder setComputePipelineState: backend->shapePipeline];
 		[shapeEncoder setBuffer: backend->shapeBuffer[backend->bufferIndex] offset:backend->shapeBufferOffset atIndex: 0];
 		[shapeEncoder setBuffer: backend->shapeQueueBuffer offset:0 atIndex: 1];
 		[shapeEncoder setBuffer: backend->arenaBuffer offset:0 atIndex: 2];
 		[shapeEncoder setBuffer: backend->arenaOffset offset:0 atIndex: 3];
 		[shapeEncoder setBytes: &scale length: sizeof(float) atIndex: 4];
 		MTLSize shapeGroupSize = MTLSizeMake(backend->shapePipeline.maxTotalThreadsPerThreadgroup, 1, 1);
 		MTLSize shapeGridSize = MTLSizeMake(shapeCount, 1, 1);
 		[shapeEncoder dispatchThreads: shapeGridSize threadsPerThreadgroup: shapeGroupSize];
 		[shapeEncoder endEncoding];
 		//-----------------------------------------------------------
 		//NOTE(martin): encode the triangle setup and binning
 		//-----------------------------------------------------------
 		id<MTLComputeCommandEncoder> triangleEncoder = [surface->commandBuffer computeCommandEncoder];
 		triangleEncoder.label = @"triangle pass";
@ -229,8 +247,11 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
 		[triangleEncoder setBuffer: backend->indexBuffer[backend->bufferIndex] offset:backend->indexBufferOffset atIndex: 1];
 		[triangleEncoder setBuffer: backend->shapeBuffer[backend->bufferIndex] offset:backend->shapeBufferOffset atIndex: 2];
 		[triangleEncoder setBuffer: backend->triangleArray offset:0 atIndex: 3];
 		[triangleEncoder setBuffer: backend->shapeQueueBuffer offset:0 atIndex: 4];
 		[triangleEncoder setBuffer: backend->arenaBuffer offset:0 atIndex: 5];
 		[triangleEncoder setBuffer: backend->arenaOffset offset:0 atIndex: 6];
-		[triangleEncoder setBytes: &scale length: sizeof(float) atIndex: 4];
+		[triangleEncoder setBytes: &scale length: sizeof(float) atIndex: 7];
 		MTLSize triangleGroupSize = MTLSizeMake(backend->trianglePipeline.maxTotalThreadsPerThreadgroup, 1, 1);
 		MTLSize triangleGridSize = MTLSizeMake(triangleCount, 1, 1);
@ -238,56 +259,15 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
 		[triangleEncoder dispatchThreads: triangleGridSize threadsPerThreadgroup: triangleGroupSize];
 		[triangleEncoder endEncoding];
 		//-----------------------------------------------------------
 		//NOTE(martin): encode the tiling pass
 		//-----------------------------------------------------------
 		id<MTLComputeCommandEncoder> tileEncoder = [surface->commandBuffer computeCommandEncoder];
 		tileEncoder.label = @"tiling pass";
 		[tileEncoder setComputePipelineState: backend->tilingPipeline];
 		[tileEncoder setBuffer: backend->triangleArray offset:0 atIndex: 0];
 		[tileEncoder setBuffer: backend->tileCounters offset:0 atIndex: 1];
 		[tileEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 2];
 		[tileEncoder setBytes: &triangleCount length:sizeof(int) atIndex: 3];
 		[tileEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 4];
 		[tileEncoder setBytes: &scale length: sizeof(float) atIndex: 5];
 		MTLSize tileGroupSize = MTLSizeMake(1, 1, 16);
 		MTLSize tileGridSize = MTLSizeMake(nTilesX, nTilesY, 16);
 		[tileEncoder dispatchThreads: tileGridSize threadsPerThreadgroup: tileGroupSize];
 		[tileEncoder endEncoding];
 		//-----------------------------------------------------------
 		//NOTE(martin): encode the sorting pass
 		//-----------------------------------------------------------
 /*
 		id<MTLComputeCommandEncoder> sortEncoder = [surface->commandBuffer computeCommandEncoder];
 		sortEncoder.label = @"sorting pass";
 		[sortEncoder setComputePipelineState: backend->sortingPipeline];
 		[sortEncoder setBuffer: backend->triangleArray offset:0 atIndex: 0];
 		[sortEncoder setBuffer: backend->tileCounters offset:0 atIndex: 1];
 		[sortEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 2];
 		u32     nTilesX = (viewportSize.x + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE;
 		u32     nTilesY = (viewportSize.y + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE;
 		MTLSize sortGroupSize = MTLSizeMake(backend->sortingPipeline.maxTotalThreadsPerThreadgroup, 1, 1);
 		MTLSize sortGridSize = MTLSizeMake(nTilesX*nTilesY, 1, 1);
 		[sortEncoder dispatchThreads: sortGridSize threadsPerThreadgroup: sortGroupSize];
 		[sortEncoder endEncoding];
 */
 		//-----------------------------------------------------------
 		//NOTE(martin): encode drawing pass
 		//-----------------------------------------------------------
 		id<MTLComputeCommandEncoder> drawEncoder = [surface->commandBuffer computeCommandEncoder];
 		drawEncoder.label = @"drawing pass";
 		[drawEncoder setComputePipelineState:backend->computePipeline];
-		[drawEncoder setBuffer: backend->tileCounters offset:0 atIndex: 0];
+		[drawEncoder setBuffer: backend->shapeQueueBuffer offset:0 atIndex: 0];
-		[drawEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 1];
+		[drawEncoder setBuffer: backend->triangleArray offset:0 atIndex: 1];
-		[drawEncoder setBuffer: backend->triangleArray offset:0 atIndex: 2];
+		[drawEncoder setBuffer: backend->arenaBuffer offset:0 atIndex: 2];
 		[drawEncoder setTexture: backend->outTexture atIndex: 0];
 		int useTexture = 0;
@ -298,8 +278,9 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
 			useTexture = 1;
 		}
-		[drawEncoder setBytes: &useTexture length:sizeof(int) atIndex: 3];
+		[drawEncoder setBytes: &shapeCount length:sizeof(int) atIndex: 3];
-		[drawEncoder setBytes: &scale length: sizeof(float) atIndex: 4];
+		[drawEncoder setBytes: &useTexture length:sizeof(int) atIndex: 4];
 		[drawEncoder setBytes: &scale length: sizeof(float) atIndex: 5];
 		//TODO: check that we don't exceed maxTotalThreadsPerThreadgroup
 		DEBUG_ASSERT(RENDERER_TILE_SIZE*RENDERER_TILE_SIZE <= backend->computePipeline.maxTotalThreadsPerThreadgroup);
@ -391,8 +372,13 @@ void mg_mtl_canvas_destroy(mg_canvas_backend* interface)
 		}
 		//NOTE: semaphore does not have a destructor?
-		[backend->tileArrayBuffer release];
+		[backend->shapeQueueBuffer release];
 		[backend->triangleArray release];
 		[backend->arenaBuffer release];
 		[backend->arenaOffset release];
 		//////////////////////////////////////////
 		//TODO release all pipelines
 		[backend->computePipeline release];
 	}
 }
@ -524,18 +510,18 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
 		                                        		options: bufferOptions];
 			}
 			backend->tileArrayBuffer = [metalSurface->device newBufferWithLength: RENDERER_TILE_BUFFER_SIZE*sizeof(int)*RENDERER_MAX_TILES
 								options: MTLResourceStorageModePrivate];
 			backend->triangleArray = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_triangle_data)
 								options: MTLResourceStorageModePrivate];
-			//TODO(martin): retain ?
+			backend->shapeQueueBuffer = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_shape_queue)
-			//-----------------------------------------------------------
+								options: MTLResourceStorageModePrivate];
-			//NOTE(martin): create and initialize tile counters
+
-			//-----------------------------------------------------------
+			backend->arenaBuffer = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_queue_elt)
-			backend->tileCounters = [metalSurface->device newBufferWithLength: RENDERER_MAX_TILES*sizeof(uint)
+								options: MTLResourceStorageModePrivate];
-		                                         	options: MTLResourceStorageModePrivate];
+
 			backend->arenaOffset = [metalSurface->device newBufferWithLength: sizeof(int)
 								options: MTLResourceStorageModePrivate];
 			//-----------------------------------------------------------
 			//NOTE(martin): load the library
 			//-----------------------------------------------------------
@ -551,9 +537,8 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
 				LOG_ERROR("error : %s\n", errStr);
 				return(0);
 			}
 			id<MTLFunction> shapeFunction = [library newFunctionWithName:@"ShapeSetup"];
 			id<MTLFunction> triangleFunction = [library newFunctionWithName:@"TriangleKernel"];
 			id<MTLFunction> tilingFunction = [library newFunctionWithName:@"TileKernel"];
 			id<MTLFunction> sortingFunction = [library newFunctionWithName:@"SortKernel"];
 			id<MTLFunction> computeFunction = [library newFunctionWithName:@"RenderKernel"];
 			id<MTLFunction> vertexFunction = [library newFunctionWithName:@"VertexShader"];
 			id<MTLFunction> fragmentFunction = [library newFunctionWithName:@"FragmentShader"];
@ -566,6 +551,14 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
 		                                                                           	error:&error];
 			ASSERT(backend->computePipeline);
 			MTLComputePipelineDescriptor* shapePipelineDesc = [[MTLComputePipelineDescriptor alloc] init];
 			shapePipelineDesc.computeFunction = shapeFunction;
 			backend->shapePipeline = [metalSurface->device newComputePipelineStateWithDescriptor: shapePipelineDesc
 		                                           	options: MTLPipelineOptionNone
 		                                           	reflection: nil
 		                                           	error: &error];
 			MTLComputePipelineDescriptor* trianglePipelineDesc = [[MTLComputePipelineDescriptor alloc] init];
 			trianglePipelineDesc.computeFunction = triangleFunction;
@ -574,22 +567,6 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
 		                                           	reflection: nil
 		                                           	error: &error];
 			MTLComputePipelineDescriptor* tilingPipelineDesc = [[MTLComputePipelineDescriptor alloc] init];
 			tilingPipelineDesc.computeFunction = tilingFunction;
 			backend->tilingPipeline = [metalSurface->device newComputePipelineStateWithDescriptor: tilingPipelineDesc
 		                                           	options: MTLPipelineOptionNone
 		                                           	reflection: nil
 		                                           	error: &error];
 			MTLComputePipelineDescriptor* sortingPipelineDesc = [[MTLComputePipelineDescriptor alloc] init];
 			sortingPipelineDesc.computeFunction = sortingFunction;
 			backend->sortingPipeline = [metalSurface->device newComputePipelineStateWithDescriptor: sortingPipelineDesc
 		                                           	options: MTLPipelineOptionNone
 		                                           	reflection: nil
 		                                           	error: &error];
 			//-----------------------------------------------------------
 			//NOTE(martin): setup our render pipeline state
 			//-----------------------------------------------------------
--- a/src/mtl_shader.h
+++ b/src/mtl_shader.h
@ -59,4 +59,32 @@ typedef struct mg_triangle_data
 } mg_triangle_data;
 #ifndef __METAL_VERSION__
 #define device
 #else
 using namespace metal;
 #endif
 typedef struct mg_tile_elt
 {
 	int triangleIndex;
 	int next;
 } mg_queue_elt;
 typedef struct mg_tile_queue
 {
 	atomic_int first;
 } mg_tile_queue;
 typedef struct mg_shape_queue
 {
 	vector_int4 area;
 	device mg_tile_queue* tileQueues;
 } mg_shape_queue;
 #ifndef __METAL_VERSION__
 #undef device
 #endif
 #endif //__MTL_RENDERER_H_
--- a/src/mtl_shader.metal
+++ b/src/mtl_shader.metal
@ -46,13 +46,60 @@ int orient2d(int2 a, int2 b, int2 c)
 	return((b.x-a.x)*(c.y-a.y) - (b.y-a.y)*(c.x-a.x));
 }
 device uchar* arena_allocate(device uchar* arenaBuffer,
                             device volatile atomic_uint* arenaOffset,
                             uint size)
 {
 	uint index = atomic_fetch_add_explicit(arenaOffset, size, memory_order_relaxed);
 	return(&arenaBuffer[index]);
 }
 //NOTE: shape setup allocates tile queues for each shape
 kernel void ShapeSetup(constant mg_shape* shapeBuffer [[buffer(0)]],
                       device mg_shape_queue* shapeQueueBuffer [[buffer(1)]],
                       device uchar* arenaBuffer [[buffer(2)]],
                       device volatile atomic_uint* arenaOffset [[buffer(3)]],
                       constant float* scaling [[buffer(4)]],
                       uint gid [[thread_position_in_grid]])
 {
 	float4 box = shapeBuffer[gid].clip * scaling[0];
 	int2 firstTile = int2(box.xy)/RENDERER_TILE_SIZE;
 	//WARN: the following can result in a 1x1 tile allocated even for empty boxes. But if we didn't allocate
 	//      any tile queue, the tileQueues pointer for that shape would alias the tileQueues pointer of another
 	//      shape, and we would have to detect that in the tiling and drawing kernels. Instead, just accept some
 	//      waste and keep the other kernels more uniforms for now...
 	int nTilesX = int(box.z)/RENDERER_TILE_SIZE - firstTile.x + 1;
 	int nTilesY = int(box.w)/RENDERER_TILE_SIZE - firstTile.y + 1;
 	int tileCount = nTilesX * nTilesY;
 	int tileArraySize =  tileCount * sizeof(mg_tile_queue);
 	shapeQueueBuffer[gid].area = int4(firstTile.x, firstTile.y, nTilesX, nTilesY);
 	shapeQueueBuffer[gid].tileQueues = (device mg_tile_queue*)arena_allocate(arenaBuffer, arenaOffset, tileArraySize);
 	for(int i=0; i<tileCount; i++)
 	{
 		atomic_store_explicit(&shapeQueueBuffer[gid].tileQueues[i].first, -1, memory_order_relaxed);
 	}
 }
 //NOTE: setup triangle data and bucket triangle into tile queues
 kernel void TriangleKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
 		                   constant uint* indexBuffer [[buffer(1)]],
 		                   constant mg_shape* shapeBuffer [[buffer(2)]],
                           device mg_triangle_data* triangleArray [[buffer(3)]],
-                           constant float* scaling [[buffer(4)]],
+                           device mg_shape_queue* shapeQueueBuffer [[buffer(4)]],
                           device uchar* arenaBuffer [[buffer(5)]],
                           device volatile atomic_uint* arenaOffset [[buffer(6)]],
                           constant float* scaling [[buffer(7)]],
                           uint gid [[thread_position_in_grid]])
 {
 	//NOTE: triangle setup
 	uint triangleIndex = gid * 3;
 	uint i0 = indexBuffer[triangleIndex];
@ -95,7 +142,6 @@ kernel void TriangleKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
 	triangleArray[gid].p1 = ip1;
 	triangleArray[gid].p2 = ip2;
 	//NOTE(martin): compute triangle orientation and bias for each edge
 	int cw = orient2d(ip0, ip1, ip2) > 0 ? 1 : -1;
 	triangleArray[gid].cw = cw;
@ -103,92 +149,45 @@ kernel void TriangleKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
 	triangleArray[gid].bias1 = is_top_left(p2, p0) ? -(1-cw)/2 : -(1+cw)/2;
 	triangleArray[gid].bias2 = is_top_left(p0, p1) ? -(1-cw)/2 : -(1+cw)/2;
-	triangleArray[gid].tileBox = int4(fbox)/RENDERER_TILE_SIZE;
+	int4 tileBox = int4(fbox)/RENDERER_TILE_SIZE;
-}
+	triangleArray[gid].tileBox = tileBox;
 kernel void TileKernel(const device mg_triangle_data* triangleArray [[buffer(0)]],
                       device uint* tileCounters [[buffer(1)]],
                       device uint* tileArrayBuffer [[buffer(2)]],
                       constant int* triangleCount [[buffer(3)]],
                       constant uint2* viewport [[buffer(4)]],
                       constant float* scaling [[buffer(5)]],
                       uint3 gid [[thread_position_in_grid]])
 {
 	uint2 tilesMatrixDim = (*viewport - 1) / RENDERER_TILE_SIZE + 1;
 	int nTilesX = tilesMatrixDim.x;
-	int tileX = gid.x;
+	//NOTE: bucket triangle into tiles
-	int tileY = gid.y;
+	device mg_shape_queue* shapeQueue = &shapeQueueBuffer[shapeIndex];
 	int tileIndex = tileY * nTilesX + tileX;
 	int groupIndex = gid.z;
-	const int groupSize = 16;
+	int xMin = max(0, tileBox.x - shapeQueue->area.x);
-	int count = 0;
+	int yMin = max(0, tileBox.y - shapeQueue->area.y);
-	int mask = 0xffff>>(16-groupIndex);
+	int xMax = min(tileBox.z - shapeQueue->area.x, shapeQueue->area.z-1);
 	int yMax = min(tileBox.w - shapeQueue->area.y, shapeQueue->area.w-1);
-	for(int triangleBatchIndex=0; triangleBatchIndex<triangleCount[0]; triangleBatchIndex += groupSize)
+	//NOTE(martin): it's important to do the computation with signed int, so that we can have negative xMax/yMax
 	//              otherwise all triangles on the left or below the x/y axis are attributed to tiles on row/column 0.
 	for(int y = yMin; y <= yMax; y++)
 	{
-		int triangleIndex = triangleBatchIndex + groupIndex;
+		for(int x = xMin ; x <= xMax; x++)
 		bool active = false;
 //		if(triangleIndex + groupIndex < triangleCount[0])
 		{
-			int4 box = triangleArray[triangleIndex].tileBox;
+			int tileIndex = y*shapeQueue->area.z + x;
 /*
 			if(  tileX >= box.x && tileX <= box.z
 		  	&& tileY >= box.y && tileY <= box.w)
 			{
 				active = true;
 			}
 			*/
 		}
-		int vote = uint64_t(simd_ballot(active));
+			device mg_tile_queue* tileQueue = &shapeQueue->tileQueues[tileIndex];
-		if(active)
+			device mg_queue_elt* elt = (device mg_queue_elt*)arena_allocate(arenaBuffer, arenaOffset, sizeof(mg_queue_elt));
-		{
+			int eltIndex = (device uchar*)elt - arenaBuffer;
 			int batchOffset = popcount(vote & mask);
 			tileArrayBuffer[tileIndex*RENDERER_TILE_BUFFER_SIZE + count + batchOffset] = triangleIndex;
 		}
 		count += popcount(vote);
 	}
 	if(groupIndex == 0)
 	{
 		tileCounters[tileIndex] = count;
 	}
 }
-kernel void SortKernel(constant mg_triangle_data* triangleArray [[buffer(0)]],
+			elt->next = atomic_exchange_explicit(&tileQueue->first, eltIndex, memory_order_relaxed);
                       const device uint* tileCounters [[buffer(1)]],
                       device uint* tileArrayBuffer [[buffer(2)]],
                       uint gid [[thread_position_in_grid]])
 {
 	uint tileIndex = gid;
 	uint tileArrayOffset = tileIndex * RENDERER_TILE_BUFFER_SIZE;
 	uint tileArrayCount = min(tileCounters[tileIndex], (uint)RENDERER_TILE_BUFFER_SIZE);
-	for(uint tileArrayIndex=1; tileArrayIndex < tileArrayCount; tileArrayIndex++)
+			elt->triangleIndex = gid;
 	{
 		for(uint sortIndex = tileArrayIndex; sortIndex > 0; sortIndex--)
 		{
 			int shapeIndex = triangleArray[tileArrayBuffer[tileArrayOffset + sortIndex]].shapeIndex;
 			int prevShapeIndex = triangleArray[tileArrayBuffer[tileArrayOffset + sortIndex - 1]].shapeIndex;
 			if(shapeIndex >= prevShapeIndex)
 			{
 				break;
 			}
 			uint tmp = tileArrayBuffer[tileArrayOffset + sortIndex];
 			tileArrayBuffer[tileArrayOffset + sortIndex] = tileArrayBuffer[tileArrayOffset + sortIndex - 1];
 			tileArrayBuffer[tileArrayOffset + sortIndex - 1] = tmp;
 		}
 	}
 }
-kernel void RenderKernel(const device uint* tileCounters [[buffer(0)]],
+kernel void RenderKernel(const device mg_shape_queue* shapeQueueBuffer [[buffer(0)]],
-                         const device uint* tileArrayBuffer [[buffer(1)]],
+                         const device mg_triangle_data* triangleArray [[buffer(1)]],
-                         const device mg_triangle_data* triangleArray [[buffer(2)]],
+                         const device uchar* arenaBuffer [[buffer(2)]],
-                         constant int* useTexture [[buffer(3)]],
+                         constant int* shapeCount [[buffer(3)]],
-                         constant float* scaling [[buffer(4)]],
+                         constant int* useTexture [[buffer(4)]],
                         constant float* scaling [[buffer(5)]],
                         texture2d<float, access::write> outTexture [[texture(0)]],
                         texture2d<float> texAtlas [[texture(1)]],
@ -200,37 +199,7 @@ kernel void RenderKernel(const device uint* tileCounters [[buffer(0)]],
 {
 	//TODO: guard against thread group size not equal to tile size?
 	const int2 pixelCoord = int2(gid);
-	const uint2 tileCoord = uint2(pixelCoord)/ RENDERER_TILE_SIZE;
+	const int2 tileCoord = pixelCoord/ RENDERER_TILE_SIZE;
 	const uint2 tilesMatrixDim = (gridSize - 1) / RENDERER_TILE_SIZE + 1;
 	const uint tileIndex = tileCoord.y * tilesMatrixDim.x + tileCoord.x;
 	const uint tileCounter = min(tileCounters[tileIndex], (uint)RENDERER_TILE_BUFFER_SIZE);
 #ifdef RENDERER_DEBUG_TILES
 	//NOTE(martin): color code debug values and show the tile grid
 	{
 		float4 fragColor = float4(0);
 		if( pixelCoord.x % 16 == 0
 	  	  ||pixelCoord.y % 16 == 0)
 		{
 			fragColor = float4(0, 0, 0, 1);
 		}
 		else if(tileCounters[tileIndex] == 0xffffu)
 		{
 			fragColor = float4(1, 0, 1, 1);
 		}
 		else if(tileCounter != 0u)
 		{
 			fragColor = float4(0, 1, 0, 1);
 		}
 		else
 		{
 			fragColor = float4(1, 0, 0, 1);
 		}
 		outTexture.write(fragColor, gid);
 		return;
 	}
 #endif
 	const int subPixelFactor = 16;
 	const int2 centerPoint = int2((float2(pixelCoord) + float2(0.5, 0.5)) * subPixelFactor);
@ -258,89 +227,108 @@ kernel void RenderKernel(const device uint* tileCounters [[buffer(0)]],
 		currentColor[i] = float4(0, 0, 0, 0);
    }
-    for(uint tileArrayIndex=0; tileArrayIndex < tileCounter; tileArrayIndex++)
+    for(int shapeIndex = 0; shapeIndex < shapeCount[0]; shapeIndex++)
    {
-    	int triangleIndex = tileArrayBuffer[RENDERER_TILE_BUFFER_SIZE * tileIndex + tileArrayIndex];
+		const device mg_shape_queue* shapeQueue = &shapeQueueBuffer[shapeIndex];
 		const device mg_triangle_data* triangle = &triangleArray[triangleIndex];
-		int2 p0 = triangle->p0;
+		// get the tile queue that corresponds to our tile in the shape area
-		int2 p1 = triangle->p1;
+		int2 tileQueueCoord = tileCoord - shapeQueue->area.xy;
-		int2 p2 = triangle->p2;
+		if(  tileQueueCoord.x >= 0
-
+		  && tileQueueCoord.y >= 0
-		int cw = triangle->cw;
+		  && tileQueueCoord.x < shapeQueue->area.z
-
+		  && tileQueueCoord.y < shapeQueue->area.w)
 		int bias0 = triangle->bias0;
 		int bias1 = triangle->bias1;
 		int bias2 = triangle->bias2;
 		float4 cubic0 = triangle->cubic0;
 		float4 cubic1 = triangle->cubic1;
 		float4 cubic2 = triangle->cubic2;
 		int shapeIndex = triangle->shapeIndex;
 		float4 color = triangle->color;
 		color.rgb *= color.a;
 		int4 clip = triangle->box;
 		matrix_float3x3 uvTransform = triangle->uvTransform;
 		for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
 		{
-			int2 samplePoint = samplePoints[sampleIndex];
+			int tileQueueIndex = tileQueueCoord.y * shapeQueue->area.z + tileQueueCoord.x;
 			device mg_tile_queue* tileQueue = &shapeQueue->tileQueues[tileQueueIndex];
-			if(  samplePoint.x < clip.x
+			int firstEltIndex = atomic_load_explicit(&tileQueue->first, memory_order_relaxed);
-			  || samplePoint.x > clip.z
+			device mg_queue_elt* elt = 0;
-			  || samplePoint.y < clip.y
+
-			  || samplePoint.y > clip.w)
+			for(int eltIndex = firstEltIndex; eltIndex >= 0; eltIndex = elt->next)
 			{
-				continue;
+				elt = (device mg_queue_elt*)(arenaBuffer + eltIndex);
-			}
+				const device mg_triangle_data* triangle = &triangleArray[elt->triangleIndex];
-			int w0 = cw*orient2d(p1, p2, samplePoint);
+				int2 p0 = triangle->p0;
-			int w1 = cw*orient2d(p2, p0, samplePoint);
+				int2 p1 = triangle->p1;
-			int w2 = cw*orient2d(p0, p1, samplePoint);
+				int2 p2 = triangle->p2;
-			if((w0+bias0) >= 0 && (w1+bias1) >= 0 && (w2+bias2) >= 0)
+				int cw = triangle->cw;
 			{
 				float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2);
-				float eps = 0.0001;
+				int bias0 = triangle->bias0;
-				if(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= eps)
+				int bias1 = triangle->bias1;
 				int bias2 = triangle->bias2;
 				float4 cubic0 = triangle->cubic0;
 				float4 cubic1 = triangle->cubic1;
 				float4 cubic2 = triangle->cubic2;
 				int shapeIndex = triangle->shapeIndex;
 				float4 color = triangle->color;
 				color.rgb *= color.a;
 				int4 clip = triangle->box;
 				matrix_float3x3 uvTransform = triangle->uvTransform;
 				for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
 				{
-					if(shapeIndex == currentShapeIndex[sampleIndex])
+					int2 samplePoint = samplePoints[sampleIndex];
 					if(  samplePoint.x < clip.x
 			  		|| samplePoint.x > clip.z
 			  		|| samplePoint.y < clip.y
 			  		|| samplePoint.y > clip.w)
 					{
-						flipCount[sampleIndex]++;
+						continue;
 					}
-					else
+
 					int w0 = cw*orient2d(p1, p2, samplePoint);
 					int w1 = cw*orient2d(p2, p0, samplePoint);
 					int w2 = cw*orient2d(p0, p1, samplePoint);
 					if((w0+bias0) >= 0 && (w1+bias1) >= 0 && (w2+bias2) >= 0)
 					{
-						if(flipCount[sampleIndex] & 0x01)
+						float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2);
 						float eps = 0.0001;
 						if(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= eps)
 						{
-							sampleColor[sampleIndex] = currentColor[sampleIndex];
+							if(shapeIndex == currentShapeIndex[sampleIndex])
 							{
 								flipCount[sampleIndex]++;
 							}
 							else
 							{
 								if(flipCount[sampleIndex] & 0x01)
 								{
 									sampleColor[sampleIndex] = currentColor[sampleIndex];
 								}
 								float4 nextColor = color;
 								if(useTexture[0])
 								{
 									float3 sampleFP = float3(float2(samplePoint).xy/(subPixelFactor*2.), 1);
 									float2 uv = (uvTransform * sampleFP).xy;
 									constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear);
 									float4 texColor = texAtlas.sample(smp, uv);
 									texColor.rgb *= texColor.a;
 									nextColor *= texColor;
 								}
 								currentColor[sampleIndex] = sampleColor[sampleIndex]*(1.-nextColor.a) + nextColor;
 								currentShapeIndex[sampleIndex] = shapeIndex;
 								flipCount[sampleIndex] = 1;
 							}
 						}
 						float4 nextColor = color;
 						if(useTexture[0])
 						{
 							float3 sampleFP = float3(float2(samplePoint).xy/(subPixelFactor*2.), 1);
 							float2 uv = (uvTransform * sampleFP).xy;
 							constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear);
 							float4 texColor = texAtlas.sample(smp, uv);
 							texColor.rgb *= texColor.a;
 							nextColor *= texColor;
 						}
 						currentColor[sampleIndex] = sampleColor[sampleIndex]*(1.-nextColor.a) + nextColor;
 						currentShapeIndex[sampleIndex] = shapeIndex;
 						flipCount[sampleIndex] = 1;
 					}
 				}
 			}
 		}
-    }
+	}
    float4 pixelColor = float4(0);
    for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
@ -353,4 +341,5 @@ kernel void RenderKernel(const device uint* tileCounters [[buffer(0)]],
 	}
 	outTexture.write(pixelColor/float(sampleCount), gid);
 }