From 9e8a0f5f6981f27611bafe2fbc4f2b686bf5b8a8 Mon Sep 17 00:00:00 2001 From: Martin Fouilleul Date: Mon, 10 Jul 2023 20:20:18 +0200 Subject: [PATCH] [osx, canvas] Only dispatch raster shader for tiles that are overlapped by at least one path. --- src/mtl_renderer.h | 7 ++++++ src/mtl_renderer.m | 37 +++++++++++++++++++++++------- src/mtl_renderer.metal | 52 +++++++++++++++++++++++++----------------- 3 files changed, 67 insertions(+), 29 deletions(-) diff --git a/src/mtl_renderer.h b/src/mtl_renderer.h index 3fe8a90..812d6e0 100644 --- a/src/mtl_renderer.h +++ b/src/mtl_renderer.h @@ -97,4 +97,11 @@ typedef struct mg_mtl_tile_queue } mg_mtl_tile_queue; +typedef struct mg_mtl_screen_tile +{ + vector_uint2 tileCoord; + int first; + +} mg_mtl_screen_tile; + #endif //__MTL_RENDERER_H_ diff --git a/src/mtl_renderer.m b/src/mtl_renderer.m index 460d7dd..5103c00 100644 --- a/src/mtl_renderer.m +++ b/src/mtl_renderer.m @@ -52,6 +52,7 @@ typedef struct mg_mtl_canvas_backend id tileOpBuffer; id tileOpCountBuffer; id screenTilesBuffer; + id rasterDispatchBuffer; int msaaCount; vec2 frameSize; @@ -815,12 +816,24 @@ void mg_mtl_render_batch(mg_mtl_canvas_backend* backend, //NOTE: encode GPU commands @autoreleasepool { + //NOTE: create output texture + MTLRenderPassDescriptor* clearDescriptor = [MTLRenderPassDescriptor renderPassDescriptor]; + clearDescriptor.colorAttachments[0].texture = backend->outTexture; + clearDescriptor.colorAttachments[0].loadAction = MTLLoadActionClear; + clearDescriptor.colorAttachments[0].clearColor = MTLClearColorMake(0, 0, 0, 0); + clearDescriptor.colorAttachments[0].storeAction = MTLStoreActionStore; + + id clearEncoder = [surface->commandBuffer renderCommandEncoderWithDescriptor:clearDescriptor]; + clearEncoder.label = @"clear out texture pass"; + [clearEncoder endEncoding]; + //NOTE: clear counters id blitEncoder = [surface->commandBuffer blitCommandEncoder]; blitEncoder.label = @"clear counters"; [blitEncoder fillBuffer: backend->segmentCountBuffer range: NSMakeRange(0, sizeof(int)) value: 0]; [blitEncoder fillBuffer: backend->tileQueueCountBuffer range: NSMakeRange(0, sizeof(int)) value: 0]; [blitEncoder fillBuffer: backend->tileOpCountBuffer range: NSMakeRange(0, sizeof(int)) value: 0]; + [blitEncoder fillBuffer: backend->rasterDispatchBuffer range: NSMakeRange(0, sizeof(MTLDispatchThreadgroupsIndirectArguments)) value: 0]; [blitEncoder endEncoding]; //NOTE: path setup pass @@ -893,11 +906,12 @@ void mg_mtl_render_batch(mg_mtl_canvas_backend* backend, [mergeEncoder setBuffer:backend->tileQueueBuffer offset:0 atIndex:3]; [mergeEncoder setBuffer:backend->tileOpBuffer offset:0 atIndex:4]; [mergeEncoder setBuffer:backend->tileOpCountBuffer offset:0 atIndex:5]; - [mergeEncoder setBuffer:backend->screenTilesBuffer offset:0 atIndex:6]; - [mergeEncoder setBytes:&tileSize length:sizeof(int) atIndex:7]; - [mergeEncoder setBytes:&scale length:sizeof(float) atIndex:8]; - [mergeEncoder setBuffer:backend->logBuffer[backend->bufferIndex] offset:0 atIndex:9]; - [mergeEncoder setBuffer:backend->logOffsetBuffer[backend->bufferIndex] offset:0 atIndex:10]; + [mergeEncoder setBuffer:backend->rasterDispatchBuffer offset:0 atIndex:6]; + [mergeEncoder setBuffer:backend->screenTilesBuffer offset:0 atIndex:7]; + [mergeEncoder setBytes:&tileSize length:sizeof(int) atIndex:8]; + [mergeEncoder setBytes:&scale length:sizeof(float) atIndex:9]; + [mergeEncoder setBuffer:backend->logBuffer[backend->bufferIndex] offset:0 atIndex:10]; + [mergeEncoder setBuffer:backend->logOffsetBuffer[backend->bufferIndex] offset:0 atIndex:11]; MTLSize mergeGridSize = MTLSizeMake(nTilesX, nTilesY, 1); MTLSize mergeGroupSize = MTLSizeMake(16, 16, 1); @@ -933,7 +947,11 @@ void mg_mtl_render_batch(mg_mtl_canvas_backend* backend, MTLSize rasterGridSize = MTLSizeMake(viewportSize.x, viewportSize.y, 1); MTLSize rasterGroupSize = MTLSizeMake(16, 16, 1); - [rasterEncoder dispatchThreads: rasterGridSize threadsPerThreadgroup: rasterGroupSize]; +// [rasterEncoder dispatchThreads: rasterGridSize threadsPerThreadgroup: rasterGroupSize]; + + [rasterEncoder dispatchThreadgroupsWithIndirectBuffer: backend->rasterDispatchBuffer + indirectBufferOffset: 0 + threadsPerThreadgroup: rasterGroupSize]; [rasterEncoder endEncoding]; @@ -970,7 +988,7 @@ void mg_mtl_canvas_resize(mg_mtl_canvas_backend* backend, vec2 size) int nTilesX = (int)(size.x + tileSize - 1)/tileSize; int nTilesY = (int)(size.y + tileSize - 1)/tileSize; MTLResourceOptions bufferOptions = MTLResourceStorageModePrivate; - backend->screenTilesBuffer = [backend->surface->device newBufferWithLength: nTilesX*nTilesY*sizeof(int) + backend->screenTilesBuffer = [backend->surface->device newBufferWithLength: nTilesX*nTilesY*sizeof(mg_mtl_screen_tile) options: bufferOptions]; if(backend->outTexture) @@ -1435,10 +1453,13 @@ mg_canvas_backend* mtl_canvas_backend_create(mg_mtl_surface* surface) backend->tileOpCountBuffer = [surface->device newBufferWithLength: sizeof(int) options: bufferOptions]; + backend->rasterDispatchBuffer = [surface->device newBufferWithLength: sizeof(MTLDispatchThreadgroupsIndirectArguments) + options: bufferOptions]; + int tileSize = MG_MTL_TILE_SIZE; int nTilesX = (int)(frame.w * scale + tileSize - 1)/tileSize; int nTilesY = (int)(frame.h * scale + tileSize - 1)/tileSize; - backend->screenTilesBuffer = [surface->device newBufferWithLength: nTilesX*nTilesY*sizeof(int) + backend->screenTilesBuffer = [surface->device newBufferWithLength: nTilesX*nTilesY*sizeof(mg_mtl_screen_tile) options: bufferOptions]; bufferOptions = MTLResourceStorageModeShared; diff --git a/src/mtl_renderer.metal b/src/mtl_renderer.metal index cbfddae..f0dad17 100644 --- a/src/mtl_renderer.metal +++ b/src/mtl_renderer.metal @@ -1325,24 +1325,27 @@ kernel void mtl_merge(constant int* pathCount [[buffer(0)]], const device mg_mtl_tile_queue* tileQueueBuffer [[buffer(3)]], device mg_mtl_tile_op* tileOpBuffer [[buffer(4)]], device atomic_int* tileOpCount [[buffer(5)]], - device int* screenTilesBuffer [[buffer(6)]], - constant int* tileSize [[buffer(7)]], - constant float* scale [[buffer(8)]], - device char* logBuffer [[buffer(9)]], - device atomic_int* logOffsetBuffer [[buffer(10)]], + device MTLDispatchThreadgroupsIndirectArguments* dispatchBuffer [[buffer(6)]], + device mg_mtl_screen_tile* screenTilesBuffer [[buffer(7)]], + constant int* tileSize [[buffer(8)]], + constant float* scale [[buffer(9)]], + device char* logBuffer [[buffer(10)]], + device atomic_int* logOffsetBuffer [[buffer(11)]], uint2 threadCoord [[thread_position_in_grid]], uint2 gridSize [[threads_per_grid]]) { int2 tileCoord = int2(threadCoord); - int tileIndex = tileCoord.y * gridSize.x + tileCoord.x; - device int* nextLink = &screenTilesBuffer[tileIndex]; - *nextLink = -1; + int tileIndex = -1; + device int* nextLink = 0; /* mtl_log_context log = {.buffer = logBuffer, .offset = logOffsetBuffer, .enabled = true}; */ + dispatchBuffer[0].threadgroupsPerGrid[1] = 1; + dispatchBuffer[0].threadgroupsPerGrid[2] = 1; + for(int pathIndex = 0; pathIndex < pathCount[0]; pathIndex++) { const device mg_mtl_path_queue* pathQueue = &pathQueueBuffer[pathIndex]; @@ -1350,14 +1353,22 @@ kernel void mtl_merge(constant int* pathCount [[buffer(0)]], const device mg_mtl_path* path = &pathBuffer[pathIndex]; float xMax = min(path->box.z, path->clip.z); - int tileMax = xMax * scale[0] / tileSize[0]; - int pathTileMax = tileMax - pathQueue->area.x; + int tileMaxX = xMax * scale[0] / tileSize[0]; + int pathTileMaxX = tileMaxX - pathQueue->area.x; if( pathTileCoord.x >= 0 - && pathTileCoord.x <= pathTileMax + && pathTileCoord.x <= pathTileMaxX && pathTileCoord.y >= 0 && pathTileCoord.y < pathQueue->area.w) { + if(tileIndex < 0) + { + tileIndex = atomic_fetch_add_explicit((device atomic_uint*)&dispatchBuffer[0].threadgroupsPerGrid[0], 1, memory_order_relaxed); + screenTilesBuffer[tileIndex].tileCoord = uint2(tileCoord); + nextLink = &screenTilesBuffer[tileIndex].first; + *nextLink = -1; + } + int pathTileIndex = pathTileCoord.y * pathQueue->area.z + pathTileCoord.x; const device mg_mtl_tile_queue* tileQueue = &tileQueueBuffer[pathQueue->tileQueues + pathTileIndex]; @@ -1399,7 +1410,7 @@ kernel void mtl_merge(constant int* pathCount [[buffer(0)]], if(pathBuffer[pathIndex].color.a == 1) { - screenTilesBuffer[tileIndex] = pathOpIndex; + screenTilesBuffer[tileIndex].first = pathOpIndex; } } nextLink = &pathOp->next; @@ -1441,7 +1452,7 @@ kernel void mtl_merge(constant int* pathCount [[buffer(0)]], } } -kernel void mtl_raster(const device int* screenTilesBuffer [[buffer(0)]], +kernel void mtl_raster(const device mg_mtl_screen_tile* screenTilesBuffer [[buffer(0)]], const device mg_mtl_tile_op* tileOpBuffer [[buffer(1)]], const device mg_mtl_path* pathBuffer [[buffer(2)]], const device mg_mtl_segment* segmentBuffer [[buffer(3)]], @@ -1453,18 +1464,19 @@ kernel void mtl_raster(const device int* screenTilesBuffer [[buffer(0)]], constant int* useTexture [[buffer(9)]], texture2d outTexture [[texture(0)]], texture2d srcTexture [[texture(1)]], - uint2 threadCoord [[thread_position_in_grid]], - uint2 gridSize [[threads_per_grid]]) + uint2 threadGroupCoord [[threadgroup_position_in_grid]], + uint2 localCoord [[thread_position_in_threadgroup]]) { /* mtl_log_context log = {.buffer = logBuffer, .offset = logOffsetBuffer, .enabled = true}; */ - uint2 pixelCoord = threadCoord; - int2 tileCoord = int2(pixelCoord) / tileSize[0]; - int nTilesX = (int(gridSize.x) + tileSize[0] - 1)/tileSize[0]; - int tileIndex = tileCoord.y * nTilesX + tileCoord.x; + int tileIndex = int(threadGroupCoord.x); + uint2 tileCoord = screenTilesBuffer[tileIndex].tileCoord; + uint2 pixelCoord = tileCoord*tileSize[0] + localCoord.xy; + + int opIndex = screenTilesBuffer[tileIndex].first; const int MG_MTL_MAX_SAMPLE_COUNT = 8; float2 sampleCoords[MG_MTL_MAX_SAMPLE_COUNT]; @@ -1500,7 +1512,6 @@ kernel void mtl_raster(const device int* screenTilesBuffer [[buffer(0)]], float4 color = {0}; int winding[MG_MTL_MAX_SAMPLE_COUNT] = {0}; - int opIndex = screenTilesBuffer[tileIndex]; while(opIndex != -1) { @@ -1600,7 +1611,6 @@ kernel void mtl_raster(const device int* screenTilesBuffer [[buffer(0)]], } opIndex = op->next; } - outTexture.write(color, pixelCoord); }