From 11113f597c9f8fc3d3dc465cfb8c53a11f26133b Mon Sep 17 00:00:00 2001 From: Martin Fouilleul Date: Mon, 20 Mar 2023 10:08:17 +0100 Subject: [PATCH] [mtl canvas, exp] try tiling per tile and avoiding sorting pass (worse on simple shapes and text, but degrades slower with complex scenes (eg tiger). Not great though) --- src/mtl_canvas.m | 56 ++++++++++++++++++++++++-------- src/mtl_shader.h | 1 + src/mtl_shader.metal | 77 ++++++++++++++++++++++++++++---------------- 3 files changed, 93 insertions(+), 41 deletions(-) diff --git a/src/mtl_canvas.m b/src/mtl_canvas.m index f0bee4b..fa5ceca 100644 --- a/src/mtl_canvas.m +++ b/src/mtl_canvas.m @@ -34,6 +34,7 @@ typedef struct mg_mtl_canvas_backend mg_color clearColor; // permanent metal resources + id trianglePipeline; id tilingPipeline; id sortingPipeline; id computePipeline; @@ -206,6 +207,9 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image { f32 scale = surface->mtlLayer.contentsScale; vector_uint2 viewportSize = {backend->viewPort.w * scale, backend->viewPort.h * scale}; + u32 nTilesX = (viewportSize.x + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE; + u32 nTilesY = (viewportSize.y + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE; + int triangleCount = indexCount/3; //----------------------------------------------------------- //NOTE(martin): encode the clear counter @@ -215,6 +219,25 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image [blitEncoder fillBuffer: backend->tileCounters range: NSMakeRange(0, RENDERER_MAX_TILES*sizeof(uint)) value: 0]; [blitEncoder endEncoding]; + //----------------------------------------------------------- + //NOTE(martin): encode the triangle prepass + //----------------------------------------------------------- + id triangleEncoder = [surface->commandBuffer computeCommandEncoder]; + triangleEncoder.label = @"triangle pass"; + [triangleEncoder setComputePipelineState: backend->trianglePipeline]; + [triangleEncoder setBuffer: backend->vertexBuffer[backend->bufferIndex] offset:backend->vertexBufferOffset atIndex: 0]; + [triangleEncoder setBuffer: backend->indexBuffer[backend->bufferIndex] offset:backend->indexBufferOffset atIndex: 1]; + [triangleEncoder setBuffer: backend->shapeBuffer[backend->bufferIndex] offset:backend->shapeBufferOffset atIndex: 2]; + [triangleEncoder setBuffer: backend->triangleArray offset:0 atIndex: 3]; + + [triangleEncoder setBytes: &scale length: sizeof(float) atIndex: 4]; + + MTLSize triangleGroupSize = MTLSizeMake(backend->trianglePipeline.maxTotalThreadsPerThreadgroup, 1, 1); + MTLSize triangleGridSize = MTLSizeMake(triangleCount, 1, 1); + + [triangleEncoder dispatchThreads: triangleGridSize threadsPerThreadgroup: triangleGroupSize]; + [triangleEncoder endEncoding]; + //----------------------------------------------------------- //NOTE(martin): encode the tiling pass //----------------------------------------------------------- @@ -222,18 +245,16 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image id tileEncoder = [surface->commandBuffer computeCommandEncoder]; tileEncoder.label = @"tiling pass"; [tileEncoder setComputePipelineState: backend->tilingPipeline]; - [tileEncoder setBuffer: backend->vertexBuffer[backend->bufferIndex] offset:backend->vertexBufferOffset atIndex: 0]; - [tileEncoder setBuffer: backend->indexBuffer[backend->bufferIndex] offset:backend->indexBufferOffset atIndex: 1]; - [tileEncoder setBuffer: backend->shapeBuffer[backend->bufferIndex] offset:backend->shapeBufferOffset atIndex: 2]; - [tileEncoder setBuffer: backend->tileCounters offset:0 atIndex: 3]; - [tileEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 4]; - [tileEncoder setBuffer: backend->triangleArray offset:0 atIndex: 5]; + [tileEncoder setBuffer: backend->triangleArray offset:0 atIndex: 0]; + [tileEncoder setBuffer: backend->tileCounters offset:0 atIndex: 1]; + [tileEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 2]; - [tileEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 6]; - [tileEncoder setBytes: &scale length: sizeof(float) atIndex: 7]; + [tileEncoder setBytes: &triangleCount length:sizeof(int) atIndex: 3]; + [tileEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 4]; + [tileEncoder setBytes: &scale length: sizeof(float) atIndex: 5]; - MTLSize tileGroupSize = MTLSizeMake(backend->tilingPipeline.maxTotalThreadsPerThreadgroup, 1, 1); - MTLSize tileGridSize = MTLSizeMake(indexCount/3, 1, 1); + MTLSize tileGroupSize = MTLSizeMake(1, 1, 16); + MTLSize tileGridSize = MTLSizeMake(nTilesX, nTilesY, 16); [tileEncoder dispatchThreads: tileGridSize threadsPerThreadgroup: tileGroupSize]; [tileEncoder endEncoding]; @@ -241,7 +262,7 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image //----------------------------------------------------------- //NOTE(martin): encode the sorting pass //----------------------------------------------------------- - +/* id sortEncoder = [surface->commandBuffer computeCommandEncoder]; sortEncoder.label = @"sorting pass"; [sortEncoder setComputePipelineState: backend->sortingPipeline]; @@ -257,7 +278,7 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image [sortEncoder dispatchThreads: sortGridSize threadsPerThreadgroup: sortGroupSize]; [sortEncoder endEncoding]; - +*/ //----------------------------------------------------------- //NOTE(martin): encode drawing pass //----------------------------------------------------------- @@ -530,6 +551,7 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface) LOG_ERROR("error : %s\n", errStr); return(0); } + id triangleFunction = [library newFunctionWithName:@"TriangleKernel"]; id tilingFunction = [library newFunctionWithName:@"TileKernel"]; id sortingFunction = [library newFunctionWithName:@"SortKernel"]; id computeFunction = [library newFunctionWithName:@"RenderKernel"]; @@ -544,9 +566,16 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface) error:&error]; ASSERT(backend->computePipeline); + MTLComputePipelineDescriptor* trianglePipelineDesc = [[MTLComputePipelineDescriptor alloc] init]; + trianglePipelineDesc.computeFunction = triangleFunction; + + backend->trianglePipeline = [metalSurface->device newComputePipelineStateWithDescriptor: trianglePipelineDesc + options: MTLPipelineOptionNone + reflection: nil + error: &error]; + MTLComputePipelineDescriptor* tilingPipelineDesc = [[MTLComputePipelineDescriptor alloc] init]; tilingPipelineDesc.computeFunction = tilingFunction; - // tilingPipelineDesc.threadGroupSizeIsMultipleOfThreadExecutionWidth = true; backend->tilingPipeline = [metalSurface->device newComputePipelineStateWithDescriptor: tilingPipelineDesc options: MTLPipelineOptionNone @@ -555,7 +584,6 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface) MTLComputePipelineDescriptor* sortingPipelineDesc = [[MTLComputePipelineDescriptor alloc] init]; sortingPipelineDesc.computeFunction = sortingFunction; - // sortingPipelineDesc.threadGroupSizeIsMultipleOfThreadExecutionWidth = true; backend->sortingPipeline = [metalSurface->device newComputePipelineStateWithDescriptor: sortingPipelineDesc options: MTLPipelineOptionNone diff --git a/src/mtl_shader.h b/src/mtl_shader.h index d50cf4a..e1ff8c5 100644 --- a/src/mtl_shader.h +++ b/src/mtl_shader.h @@ -43,6 +43,7 @@ typedef struct mg_triangle_data vector_float4 cubic2; vector_int4 box; + vector_int4 tileBox; vector_int2 p0; vector_int2 p1; diff --git a/src/mtl_shader.metal b/src/mtl_shader.metal index f2c110f..1a69deb 100644 --- a/src/mtl_shader.metal +++ b/src/mtl_shader.metal @@ -1,6 +1,7 @@ #include #include +#include #include"mtl_shader.h" @@ -45,21 +46,13 @@ int orient2d(int2 a, int2 b, int2 c) return((b.x-a.x)*(c.y-a.y) - (b.y-a.y)*(c.x-a.x)); } - -kernel void TileKernel(constant mg_vertex* vertexBuffer [[buffer(0)]], - constant uint* indexBuffer [[buffer(1)]], - constant mg_shape* shapeBuffer [[buffer(2)]], - device volatile atomic_uint* tileCounters [[buffer(3)]], - device uint* tileArrayBuffer [[buffer(4)]], - device mg_triangle_data* triangleArray [[buffer(5)]], - constant uint2* viewport [[buffer(6)]], - constant float* scaling [[buffer(7)]], - uint gid [[thread_position_in_grid]]) +kernel void TriangleKernel(constant mg_vertex* vertexBuffer [[buffer(0)]], + constant uint* indexBuffer [[buffer(1)]], + constant mg_shape* shapeBuffer [[buffer(2)]], + device mg_triangle_data* triangleArray [[buffer(3)]], + constant float* scaling [[buffer(4)]], + uint gid [[thread_position_in_grid]]) { - uint2 tilesMatrixDim = (*viewport - 1) / RENDERER_TILE_SIZE + 1; - int nTilesX = tilesMatrixDim.x; - int nTilesY = tilesMatrixDim.y; - uint triangleIndex = gid * 3; uint i0 = indexBuffer[triangleIndex]; @@ -110,26 +103,56 @@ kernel void TileKernel(constant mg_vertex* vertexBuffer [[buffer(0)]], triangleArray[gid].bias1 = is_top_left(p2, p0) ? -(1-cw)/2 : -(1+cw)/2; triangleArray[gid].bias2 = is_top_left(p0, p1) ? -(1-cw)/2 : -(1+cw)/2; - //NOTE(martin): it's important to do the computation with signed int, so that we can have negative xMax/yMax - // otherwise all triangles on the left or below the x/y axis are attributed to tiles on row/column 0. - int4 tileBox = int4(fbox)/RENDERER_TILE_SIZE; + triangleArray[gid].tileBox = int4(fbox)/RENDERER_TILE_SIZE; +} - int xMin = max(0, tileBox.x); - int yMin = max(0, tileBox.y); - int xMax = min(tileBox.z, nTilesX-1); - int yMax = min(tileBox.w, nTilesY-1); +kernel void TileKernel(const device mg_triangle_data* triangleArray [[buffer(0)]], + device uint* tileCounters [[buffer(1)]], + device uint* tileArrayBuffer [[buffer(2)]], + constant int* triangleCount [[buffer(3)]], + constant uint2* viewport [[buffer(4)]], + constant float* scaling [[buffer(5)]], + uint3 gid [[thread_position_in_grid]]) +{ + uint2 tilesMatrixDim = (*viewport - 1) / RENDERER_TILE_SIZE + 1; + int nTilesX = tilesMatrixDim.x; - for(int y = yMin; y <= yMax; y++) + int tileX = gid.x; + int tileY = gid.y; + int tileIndex = tileY * nTilesX + tileX; + int groupIndex = gid.z; + + const int groupSize = 16; + int count = 0; + int mask = 0xffff>>(16-groupIndex); + + for(int triangleBatchIndex=0; triangleBatchIndex= box.x && tileX <= box.z + && tileY >= box.y && tileY <= box.w) { - tileArrayBuffer[tileIndex*RENDERER_TILE_BUFFER_SIZE + counter] = gid; + active = true; } + */ } + + int vote = uint64_t(simd_ballot(active)); + if(active) + { + int batchOffset = popcount(vote & mask); + tileArrayBuffer[tileIndex*RENDERER_TILE_BUFFER_SIZE + count + batchOffset] = triangleIndex; + } + count += popcount(vote); + } + if(groupIndex == 0) + { + tileCounters[tileIndex] = count; } }