[wip] trying to simplify metal shader

2023-03-13 18:38:30 +01:00 · 2023-03-13 18:38:30 +01:00 · fd5a4d4cd8
parent cee294d8ad
commit fd5a4d4cd8
2 changed files with 225 additions and 158 deletions
--- a/src/mtl_canvas.m
+++ b/src/mtl_canvas.m
@ -45,7 +45,7 @@ typedef struct mg_mtl_canvas_backend
 	id<MTLBuffer> vertexBuffer;
 	id<MTLBuffer> indexBuffer;
 	id<MTLBuffer> tileCounters;
-	id<MTLBuffer> tilesArray;
+	id<MTLBuffer> tileArrayBuffer;
 	id<MTLBuffer> triangleArray;
 	id<MTLBuffer> boxArray;

@ -163,6 +163,7 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
 		[blitEncoder fillBuffer: backend->tileCounters range: NSMakeRange(0, RENDERER_MAX_TILES*sizeof(uint)) value: 0];
 		[blitEncoder endEncoding];

+		/*
 		//-----------------------------------------------------------
 		//NOTE(martin): encode the boxing pass
 		//-----------------------------------------------------------
@ -183,6 +184,7 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image

 		[boxEncoder dispatchThreads: boxGridSize threadsPerThreadgroup: boxGroupSize];
 		[boxEncoder endEncoding];
+		*/

 		//-----------------------------------------------------------
 		//NOTE(martin): encode the tiling pass
@ -191,12 +193,19 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
 		id<MTLComputeCommandEncoder> tileEncoder = [surface->commandBuffer computeCommandEncoder];
 		tileEncoder.label = @"tiling pass";
 		[tileEncoder setComputePipelineState: backend->tilingPipeline];
-		[tileEncoder setBuffer: backend->boxArray offset:0 atIndex: 0];
-		[tileEncoder setBuffer: backend->tileCounters offset:0 atIndex: 1];
-		[tileEncoder setBuffer: backend->tilesArray offset:0 atIndex: 2];
-		[tileEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 3];
+		[tileEncoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0];
+		[tileEncoder setBuffer: backend->indexBuffer offset:backend->indexBufferOffset atIndex: 1];
+		[tileEncoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 2];
+		[tileEncoder setBuffer: backend->tileCounters offset:0 atIndex: 3];
+		[tileEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 4];

-		[tileEncoder dispatchThreads: boxGridSize threadsPerThreadgroup: boxGroupSize];
+		[tileEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 5];
+		[tileEncoder setBytes: &scale length: sizeof(float) atIndex: 6];
+
+		MTLSize tileGroupSize = MTLSizeMake(backend->tilingPipeline.maxTotalThreadsPerThreadgroup, 1, 1);
+		MTLSize tileGridSize = MTLSizeMake(indexCount/3, 1, 1);
+
+		[tileEncoder dispatchThreads: tileGridSize threadsPerThreadgroup: tileGroupSize];
 		[tileEncoder endEncoding];

 		//-----------------------------------------------------------
@ -206,15 +215,16 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
 		id<MTLComputeCommandEncoder> sortEncoder = [surface->commandBuffer computeCommandEncoder];
 		sortEncoder.label = @"sorting pass";
 		[sortEncoder setComputePipelineState: backend->sortingPipeline];
-		[sortEncoder setBuffer: backend->tileCounters offset:0 atIndex: 0];
-		[sortEncoder setBuffer: backend->triangleArray offset:0 atIndex: 1];
-		[sortEncoder setBuffer: backend->tilesArray offset:0 atIndex: 2];
-		[sortEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 3];
+		[sortEncoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0];
+		[sortEncoder setBuffer: backend->indexBuffer offset:backend->indexBufferOffset atIndex: 1];
+		[sortEncoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 2];
+		[sortEncoder setBuffer: backend->tileCounters offset:0 atIndex: 3];
+		[sortEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 4];

 		u32     nTilesX = (viewportSize.x + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE;
 		u32     nTilesY = (viewportSize.y + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE;

-		MTLSize sortGroupSize = MTLSizeMake(backend->boxingPipeline.maxTotalThreadsPerThreadgroup, 1, 1);
+		MTLSize sortGroupSize = MTLSizeMake(backend->sortingPipeline.maxTotalThreadsPerThreadgroup, 1, 1);
 		MTLSize sortGridSize = MTLSizeMake(nTilesX*nTilesY, 1, 1);

 		[sortEncoder dispatchThreads: sortGridSize threadsPerThreadgroup: sortGroupSize];
@ -226,35 +236,35 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
 		//TODO: remove that
 		vector_float4 clearColorVec4 = {backend->clearColor.r, backend->clearColor.g, backend->clearColor.b, backend->clearColor.a};

-		id<MTLComputeCommandEncoder> encoder = [surface->commandBuffer computeCommandEncoder];
-		encoder.label = @"drawing pass";
-		[encoder setComputePipelineState:backend->computePipeline];
-		[encoder setTexture: backend->outTexture atIndex: 0];
+		id<MTLComputeCommandEncoder> drawEncoder = [surface->commandBuffer computeCommandEncoder];
+		drawEncoder.label = @"drawing pass";
+		[drawEncoder setComputePipelineState:backend->computePipeline];
+		[drawEncoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0];
+		[drawEncoder setBuffer: backend->indexBuffer offset:backend->indexBufferOffset atIndex: 1];
+		[drawEncoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 2];
+		[drawEncoder setBuffer: backend->tileCounters offset:0 atIndex: 3];
+		[drawEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 4];
+
+		[drawEncoder setTexture: backend->outTexture atIndex: 0];
 		int useTexture = 0;
 		if(image)
 		{
 			mg_mtl_image_data* mtlImage = (mg_mtl_image_data*)image;
-			[encoder setTexture: mtlImage->texture atIndex: 1];
+			[drawEncoder setTexture: mtlImage->texture atIndex: 1];
 			useTexture = 1;
 		}

-		[encoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0];
-		[encoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 1];
-		[encoder setBuffer: backend->tileCounters offset:0 atIndex: 2];
-		[encoder setBuffer: backend->tilesArray offset:0 atIndex: 3];
-		[encoder setBuffer: backend->triangleArray offset:0 atIndex: 4];
-		[encoder setBuffer: backend->boxArray offset:0 atIndex: 5];
-		[encoder setBytes: &clearColorVec4 length: sizeof(vector_float4) atIndex: 6];
-		[encoder setBytes: &useTexture length:sizeof(int) atIndex:7];
-		[encoder setBytes: &scale length: sizeof(float) atIndex: 8];
+		[drawEncoder setBytes: &clearColorVec4 length: sizeof(vector_float4) atIndex: 5];
+		[drawEncoder setBytes: &useTexture length:sizeof(int) atIndex:6];
+		[drawEncoder setBytes: &scale length: sizeof(float) atIndex: 7];

 		//TODO: check that we don't exceed maxTotalThreadsPerThreadgroup
 		DEBUG_ASSERT(RENDERER_TILE_SIZE*RENDERER_TILE_SIZE <= backend->computePipeline.maxTotalThreadsPerThreadgroup);
 		MTLSize threadGridSize = MTLSizeMake(viewportSize.x, viewportSize.y, 1);
 		MTLSize threadGroupSize = MTLSizeMake(RENDERER_TILE_SIZE, RENDERER_TILE_SIZE, 1);

-		[encoder dispatchThreads: threadGridSize threadsPerThreadgroup:threadGroupSize];
-		[encoder endEncoding];
+		[drawEncoder dispatchThreads: threadGridSize threadsPerThreadgroup:threadGroupSize];
+		[drawEncoder endEncoding];

 		//-----------------------------------------------------------
 		//NOTE(martin): blit texture to framebuffer
@ -331,7 +341,7 @@ void mg_mtl_canvas_destroy(mg_canvas_backend* interface)
 		[backend->outTexture release];
 		[backend->vertexBuffer release];
 		[backend->indexBuffer release];
-		[backend->tilesArray release];
+		[backend->tileArrayBuffer release];
 		[backend->triangleArray release];
 		[backend->boxArray release];
 		[backend->computePipeline release];
@ -459,7 +469,7 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
 		    backend->shapeBuffer = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_shape)
 		                                        	options: bufferOptions];

-			backend->tilesArray = [metalSurface->device newBufferWithLength: RENDERER_TILE_BUFFER_SIZE*sizeof(int)*RENDERER_MAX_TILES
+			backend->tileArrayBuffer = [metalSurface->device newBufferWithLength: RENDERER_TILE_BUFFER_SIZE*sizeof(int)*RENDERER_MAX_TILES
 								options: MTLResourceStorageModePrivate];

 			backend->triangleArray = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_triangle_data)
@ -522,6 +532,7 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
 		                                           	reflection: nil
 		                                           	error: &error];

+/*
 			MTLComputePipelineDescriptor* boxingPipelineDesc = [[MTLComputePipelineDescriptor alloc] init];
 			boxingPipelineDesc.computeFunction = boxingFunction;
 	//		boxingPipelineDesc.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;
@ -530,6 +541,7 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
 		                                           	options: MTLPipelineOptionNone
 		                                           	reflection: nil
 		                                           	error: &error];
+*/
 			//-----------------------------------------------------------
 			//NOTE(martin): setup our render pipeline state
 			//-----------------------------------------------------------
--- a/src/mtl_shader.metal
+++ b/src/mtl_shader.metal
@ -31,7 +31,7 @@ bool is_top_left(float2 a, float2 b)
 	return( (a.y == b.y && b.x < a.x)
 	      ||(b.y < a.y));
 }
-
+/*
 kernel void BoundingBoxKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
 		              constant uint* indexBuffer [[buffer(1)]],
 		              constant mg_shape* shapeBuffer [[buffer(2)]],
@ -94,19 +94,43 @@ kernel void BoundingBoxKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
 	triangleArray[triangleIndex].bias1 = bias1;
 	triangleArray[triangleIndex].bias2 = bias2;
 }
+*/

-kernel void TileKernel(const device float4* boxArray [[buffer(0)]],
-                       device volatile atomic_uint* tileCounters [[buffer(1)]],
-                       device uint* tilesArray [[buffer(2)]],
-                       constant vector_uint2* viewport [[buffer(3)]],
+kernel void TileKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
+		               constant uint* indexBuffer [[buffer(1)]],
+		               constant mg_shape* shapeBuffer [[buffer(2)]],
+		               device volatile atomic_uint* tileCounters [[buffer(3)]],
+                       device uint* tileArrayBuffer [[buffer(4)]],
+                       constant uint2* viewport [[buffer(5)]],
+                       constant float* scaling [[buffer(6)]],
                       uint gid [[thread_position_in_grid]])
 {
 	uint2 tilesMatrixDim = (*viewport - 1) / RENDERER_TILE_SIZE + 1;
 	int nTilesX = tilesMatrixDim.x;
 	int nTilesY = tilesMatrixDim.y;

-	uint triangleIndex = gid;
-	int4 box = int4(floor(boxArray[triangleIndex]))/RENDERER_TILE_SIZE;
+	uint triangleIndex = gid * 3;
+
+	uint i0 = indexBuffer[triangleIndex];
+	uint i1 = indexBuffer[triangleIndex+1u];
+	uint i2 = indexBuffer[triangleIndex+2u];
+
+	float2 p0 = vertexBuffer[i0].pos * scaling[0];
+	float2 p1 = vertexBuffer[i1].pos * scaling[0];
+	float2 p2 = vertexBuffer[i2].pos * scaling[0];
+
+	int shapeIndex = vertexBuffer[i0].shapeIndex;
+	float4 clip = shapeBuffer[shapeIndex].clip * scaling[0];
+
+	float4 fbox = float4(max(min(min(p0.x, p1.x), p2.x), clip.x),
+		             max(min(min(p0.y, p1.y), p2.y), clip.y),
+		             min(max(max(p0.x, p1.x), p2.x), clip.z),
+		             min(max(max(p0.y, p1.y), p2.y), clip.w));
+
+	int4 box = int4(floor(fbox))/int(RENDERER_TILE_SIZE);
+
+	//NOTE(martin): it's importat to do the computation with signed int, so that we can have negative xMax/yMax
+	//              otherwise all triangles on the left or below the x/y axis are attributed to tiles on row/column 0.
 	int xMin = max(0, box.x);
 	int yMin = max(0, box.y);
 	int xMax = min(box.z, nTilesX-1);
@ -120,105 +144,125 @@ kernel void TileKernel(const device float4* boxArray [[buffer(0)]],
 			uint counter = atomic_fetch_add_explicit(&(tileCounters[tileIndex]), 1, memory_order_relaxed);
 			if(counter < RENDERER_TILE_BUFFER_SIZE)
 			{
-				tilesArray[tileIndex*RENDERER_TILE_BUFFER_SIZE + counter] = triangleIndex;
+				tileArrayBuffer[tileIndex*RENDERER_TILE_BUFFER_SIZE + counter] = triangleIndex;
 			}
 		}
 	}
 }

-kernel void SortKernel(const device uint* tileCounters [[buffer(0)]],
-                       const device mg_triangle_data* triangleArray [[buffer(1)]],
-                       device uint* tilesArray [[buffer(2)]],
-                       constant vector_uint2* viewport [[buffer(3)]],
+kernel void SortKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
+                       constant uint* indexBuffer [[buffer(1)]],
+                       constant mg_shape* shapeBuffer [[buffer(2)]],
+                       const device uint* tileCounters [[buffer(3)]],
+                       device uint* tileArrayBuffer [[buffer(4)]],
                       uint gid [[thread_position_in_grid]])
 {
 	uint tileIndex = gid;
-	device uint* tileBuffer = tilesArray + tileIndex*RENDERER_TILE_BUFFER_SIZE;
-	uint tileBufferSize = tileCounters[tileIndex];
+	uint tileArrayOffset = tileIndex * RENDERER_TILE_BUFFER_SIZE;
+	uint tileArrayCount = min(tileCounters[tileIndex], (uint)RENDERER_TILE_BUFFER_SIZE);

-	for(int eltIndex=0; eltIndex < (int)tileBufferSize; eltIndex++)
+	for(uint tileArrayIndex=1; tileArrayIndex < tileArrayCount; tileArrayIndex++)
 	{
-		uint elt = tileBuffer[eltIndex];
-		uint eltZIndex = triangleArray[elt].shapeIndex;
-
-		int backIndex = eltIndex-1;
-		for(; backIndex >= 0; backIndex--)
+		for(uint sortIndex = tileArrayIndex; sortIndex > 0u; sortIndex--)
 		{
-			uint backElt = tileBuffer[backIndex];
-			uint backEltZIndex = triangleArray[backElt].shapeIndex;
-			if(eltZIndex >= backEltZIndex)
+			uint triangleIndex = indexBuffer[tileArrayBuffer[tileArrayOffset + sortIndex]];
+			uint prevTriangleIndex = indexBuffer[tileArrayBuffer[tileArrayOffset + sortIndex - 1]];
+
+			int shapeIndex = vertexBuffer[triangleIndex].shapeIndex;
+			int prevShapeIndex = vertexBuffer[prevTriangleIndex].shapeIndex;
+
+			if(shapeIndex >= prevShapeIndex)
 			{
 				break;
 			}
-			else
-			{
-				tileBuffer[backIndex+1] = backElt;
-			}
+			uint tmp = tileArrayBuffer[tileArrayOffset + sortIndex];
+			tileArrayBuffer[tileArrayOffset + sortIndex] = tileArrayBuffer[tileArrayOffset + sortIndex - 1];
+			tileArrayBuffer[tileArrayOffset + sortIndex - 1] = tmp;
 		}
-		tileBuffer[backIndex+1] = elt;
 	}
 }

+
+bool is_top_left(int2 a, int2 b)
+{
+	return( (a.y == b.y && b.x < a.x)
+	      ||(b.y < a.y));
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//TODO: we should do these computations on 64bits, because otherwise
+//      we might overflow for values > 2048.
+//		Unfortunately this is costly.
+//	    Another way is to precompute triangle edges (b - a) in full precision
+//      once to avoid doing it all the time...
+//////////////////////////////////////////////////////////////////////////////
+
+//TODO: coalesce
 int orient2d(int2 a, int2 b, int2 c)
 {
 	return((b.x-a.x)*(c.y-a.y) - (b.y-a.y)*(c.x-a.x));
 }

-kernel void RenderKernel(texture2d<float, access::write> outTexture [[texture(0)]],
+int is_clockwise(int2 p0, int2 p1, int2 p2)
+{
+	return((p1 - p0).x*(p2 - p0).y - (p1 - p0).y*(p2 - p0).x);
+}
+
+
+kernel void RenderKernel(const device mg_vertex* vertexBuffer [[buffer(0)]],
+                         const device uint* indexBuffer [[buffer(1)]],
+                         const device mg_shape* shapeBuffer [[buffer(2)]],
+                         const device uint* tileCounters [[buffer(3)]],
+                         const device uint* tileArrayBuffer [[buffer(4)]],
+
+                         constant float4* clearColor [[buffer(5)]],
+                         constant int* useTexture [[buffer(6)]],
+                         constant float* scaling [[buffer(7)]],
+
+                         texture2d<float, access::write> outTexture [[texture(0)]],
                         texture2d<float> texAtlas [[texture(1)]],
-                         const device mg_vertex* vertexBuffer [[buffer(0)]],
-                         const device mg_shape* shapeBuffer [[buffer(1)]],
-                         device uint* tileCounters [[buffer(2)]],
-                         const device uint* tilesArray [[buffer(3)]],
-                         const device mg_triangle_data* triangleArray [[buffer(4)]],
-                         const device float4* boxArray [[buffer(5)]],
-                         constant vector_float4* clearColor [[buffer(6)]],
-                         constant int* useTexture [[buffer(7)]],
-                         constant float* contentsScaling [[buffer(8)]],
+
                         uint2 gid [[thread_position_in_grid]],
                         uint2 tgid [[threadgroup_position_in_grid]],
                         uint2 threadsPerThreadgroup [[threads_per_threadgroup]],
                         uint2 gridSize [[threads_per_grid]])
 {
 	//TODO: guard against thread group size not equal to tile size?
+	const int2 pixelCoord = int2(gid);
+	const uint2 tileCoord = uint2(pixelCoord)/ RENDERER_TILE_SIZE;
 	const uint2 tilesMatrixDim = (gridSize - 1) / RENDERER_TILE_SIZE + 1;
-	const uint2 tilePos = gid/RENDERER_TILE_SIZE;
-	const uint tileIndex = tilePos.y * tilesMatrixDim.x + tilePos.x;
-	const device uint* tileBuffer = tilesArray + tileIndex * RENDERER_TILE_BUFFER_SIZE;
-
-	const uint tileBufferSize = tileCounters[tileIndex];
+	const uint tileIndex = tileCoord.y * tilesMatrixDim.x + tileCoord.x;
+	const uint tileCounter = min(tileCounters[tileIndex], (uint)RENDERER_TILE_BUFFER_SIZE);

 #ifdef RENDERER_DEBUG_TILES
 	//NOTE(martin): color code debug values and show the tile grid
-	uint nTileX = tilesMatrixDim.x;
-	uint nTileY = tilesMatrixDim.y;
+	{
+		float4 fragColor = float4(0);

-	if(tilePos.x > nTileX || tilePos.y > nTileY)
-	{
-		outTexture.write(float4(0, 1, 1, 1), gid);
-		return;
-	}
-
-	if((gid.x % RENDERER_TILE_SIZE == 0) || (gid.y % RENDERER_TILE_SIZE == 0))
-	{
-		outTexture.write(float4(0, 0, 0, 1), gid);
-		return;
-	}
-	if(tileBufferSize <= 0)
-	{
-		outTexture.write(float4(0, 1, 0, 1), gid);
-		return;
-	}
-	else
-	{
-		outTexture.write(float4(1, 0, 0, 1), gid);
+		if( pixelCoord.x % 16 == 0
+	  	  ||pixelCoord.y % 16 == 0)
+		{
+			fragColor = float4(0, 0, 0, 1);
+		}
+		else if(tileCounters[tileIndex] == 0xffffu)
+		{
+			fragColor = float4(1, 0, 1, 1);
+		}
+		else if(tileCounter != 0u)
+		{
+			fragColor = float4(0, 1, 0, 1);
+		}
+		else
+		{
+			fragColor = float4(1, 0, 0, 1);
+		}
+		outTexture.write(fragColor, gid);
 		return;
 	}
 #endif

-	int subPixelFactor = 16;
-	int2 pixelCoord = int2(gid);
-	int2 centerPoint = int2((float2(pixelCoord) + float2(0.5, 0.5)) * subPixelFactor);
+	const int subPixelFactor = 16;
+	const int2 centerPoint = int2((float2(pixelCoord) + float2(0.5, 0.5)) * subPixelFactor);

 	const int sampleCount = 8;
 	int2 samplePoints[sampleCount] = {centerPoint + int2(1, 3),
@ -229,59 +273,75 @@ kernel void RenderKernel(texture2d<float, access::write> outTexture [[texture(0)
 	                                  centerPoint + int2(-7, 1),
 	                                  centerPoint + int2(3, -7),
 	                                  centerPoint + int2(7, 7)};
-	int zIndices[sampleCount];
-	uint flipCounts[sampleCount];
-	float4 pixelColors[sampleCount];
-	float4 nextColors[sampleCount];
-	for(int i=0; i<sampleCount; i++)
-	{
-		zIndices[i] = -1;
-		flipCounts[i] = 0;
-		pixelColors[i] = float4(0, 0, 0, 0);
-		nextColors[i] = float4(0, 0, 0, 0);
-	}

-	for(uint tileBufferIndex=0; tileBufferIndex < tileBufferSize; tileBufferIndex++)
-	{
-		const device mg_triangle_data* triangle = &triangleArray[tileBuffer[tileBufferIndex]];
+	float4 sampleColor[sampleCount];
+	float4 currentColor[sampleCount];
+    int currentShapeIndex[sampleCount];
+    int flipCount[sampleCount];

-		int2 p0 = int2(triangle->p0 * subPixelFactor);
-		int2 p1 = int2(triangle->p1 * subPixelFactor);
-		int2 p2 = int2(triangle->p2 * subPixelFactor);
+    for(int i=0; i<sampleCount; i++)
+    {
+		currentShapeIndex[i] = -1;
+		flipCount[i] = 0;
+		sampleColor[i] = float4(0, 0, 0, 0);
+		currentColor[i] = float4(0, 0, 0, 0);
+    }

-		int bias0 = triangle->bias0;
-		int bias1 = triangle->bias1;
-		int bias2 = triangle->bias2;
+    for(uint tileArrayIndex=0; tileArrayIndex < tileCounter; tileArrayIndex++)
+    {
+    	int triangleIndex = tileArrayBuffer[RENDERER_TILE_BUFFER_SIZE * tileIndex + tileArrayIndex];

-		const device mg_vertex* v0 = &(vertexBuffer[triangle->i0]);
-		const device mg_vertex* v1 = &(vertexBuffer[triangle->i1]);
-		const device mg_vertex* v2 = &(vertexBuffer[triangle->i2]);
+		uint i0 = indexBuffer[triangleIndex];
+		uint i1 = indexBuffer[triangleIndex+1];
+		uint i2 = indexBuffer[triangleIndex+2];

-		float4 cubic0 = v0->cubic;
-		float4 cubic1 = v1->cubic;
-		float4 cubic2 = v2->cubic;
+		int2 p0 = int2((vertexBuffer[i0].pos * scaling[0]) * subPixelFactor);
+		int2 p1 = int2((vertexBuffer[i1].pos * scaling[0]) * subPixelFactor);
+		int2 p2 = int2((vertexBuffer[i2].pos * scaling[0]) * subPixelFactor);

-		int shapeIndex = v0->shapeIndex;
+		int shapeIndex = vertexBuffer[i0].shapeIndex;
 		float4 color = shapeBuffer[shapeIndex].color;
 		color.rgb *= color.a;

+		int4 clip = int4(round((shapeBuffer[shapeIndex].clip * scaling[0] + float4(0.5, 0.5, 0.5, 0.5)) * subPixelFactor));
+
 		const device float* uvTransform2x3 = shapeBuffer[shapeIndex].uvTransform;
 		matrix_float3x3 uvTransform = {{uvTransform2x3[0], uvTransform2x3[3], 0},
 		                               {uvTransform2x3[1], uvTransform2x3[4], 0},
 		                               {uvTransform2x3[2], uvTransform2x3[5], 1}};

-		for(int sampleIndex=0; sampleIndex<sampleCount; sampleIndex++)
+		//NOTE(martin): reorder triangle counter-clockwise and compute bias for each edge
+		int cw = is_clockwise(p0, p1, p2);
+		if(cw < 0)
+		{
+			uint tmpIndex = i1;
+			i1 = i2;
+			i2 = tmpIndex;
+
+			int2 tmpPoint = p1;
+			p1 = p2;
+			p2 = tmpPoint;
+		}
+
+		float4 cubic0 = vertexBuffer[i0].cubic;
+		float4 cubic1 = vertexBuffer[i1].cubic;
+		float4 cubic2 = vertexBuffer[i2].cubic;
+
+		int bias0 = is_top_left(p1, p2) ? 0 : -1;
+		int bias1 = is_top_left(p2, p0) ? 0 : -1;
+		int bias2 = is_top_left(p0, p1) ? 0 : -1;
+
+		for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
 		{
 			int2 samplePoint = samplePoints[sampleIndex];

-			//NOTE(martin): cull if pixel is outside box
-			/*
-			// if we use this, make sure box is in fixed points coords
-			if(samplePoint.x < box.x || samplePoint.x > box.z || samplePoint.y < box.y || samplePoint.y > box.w)
+			if(  samplePoint.x < clip.x
+			  || samplePoint.x > clip.z
+			  || samplePoint.y < clip.y
+			  || samplePoint.y > clip.w)
 			{
 				continue;
 			}
-			*/

 			int w0 = orient2d(p1, p2, samplePoint);
 			int w1 = orient2d(p2, p0, samplePoint);
@ -291,57 +351,52 @@ kernel void RenderKernel(texture2d<float, access::write> outTexture [[texture(0)
 			{
 				float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2);

-				//TODO(martin): this is a quick and dirty fix for solid polygons where we use
-				//              cubic = (1, 1, 1, 1) on all vertices, which can cause small errors to
-				//              flip the sign.
-				//              We should really use another value that always lead to <= 0, but we must
-				//              make sure we never share these vertices with bezier shapes.
-				//              Alternatively, an ugly (but maybe less than this one) solution would be
-				//              to check if uvs are equal on all vertices of the triangle and always render
-				//              those triangles.
 				float eps = 0.0001;
 				if(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= eps)
 				{
-					if(shapeIndex == zIndices[sampleIndex])
+					if(shapeIndex == currentShapeIndex[sampleIndex])
 					{
-						flipCounts[sampleIndex]++;
+						flipCount[sampleIndex]++;
 					}
 					else
 					{
-						if(flipCounts[sampleIndex] & 0x01)
+						if(flipCount[sampleIndex] & 0x01)
 						{
-							pixelColors[sampleIndex] = nextColors[sampleIndex];
+							sampleColor[sampleIndex] = currentColor[sampleIndex];
 						}

 						float4 nextColor = color;
-						if(*useTexture)
+
+						if(useTexture[0])
 						{
-							float2 sampleFP = float2(samplePoint)/subPixelFactor;
-							float2 uv = (uvTransform*(float3(sampleFP/contentsScaling[0], 1))).xy;
+							float3 sampleFP = float3(float2(samplePoint).xy/(subPixelFactor*2.), 1);
+							float2 uv = (uvTransform * sampleFP).xy;

 							constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear);
 							float4 texColor = texAtlas.sample(smp, uv);
+
 							texColor.rgb *= texColor.a;
 							nextColor *= texColor;
 						}

-						nextColors[sampleIndex] = pixelColors[sampleIndex]*(1-nextColor.a) + nextColor;
-						zIndices[sampleIndex] = shapeIndex;
-						flipCounts[sampleIndex] = 1;
+						currentColor[sampleIndex] = sampleColor[sampleIndex]*(1.-nextColor.a) + nextColor;
+						currentShapeIndex[sampleIndex] = shapeIndex;
+						flipCount[sampleIndex] = 1;
 					}
 				}
 			}
 		}
+    }
+
+    float4 pixelColor = float4(0);
+    for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
+    {
+    	if(flipCount[sampleIndex] & 0x01)
+    	{
+			sampleColor[sampleIndex] = currentColor[sampleIndex];
+    	}
+    	pixelColor += sampleColor[sampleIndex];
 	}
-	float4 out = float4(0, 0, 0, 0);
-	for(int i=0; i<sampleCount; i++)
-	{
-		if(flipCounts[i] & 0x01)
-		{
-			pixelColors[i] = nextColors[i];
-		}
-		out += pixelColors[i];
-	}
-	out = out/sampleCount;
-	outTexture.write(out, gid);
+
+	outTexture.write(pixelColor/float(sampleCount), gid);
 }