From 0d8bc824a28cbc7b0f074109fb313d8eee78966e Mon Sep 17 00:00:00 2001 From: Martin Fouilleul Date: Tue, 21 Mar 2023 10:22:37 +0100 Subject: [PATCH] [mtl canvas] bucket triangles into per-shape tile queues, then gather into tile arrays. This somewhat improves perf for perf_text test and avoids an awful degradation due to sorting in tiger test (tiger now runs at ~20fps, which is still much slower than it could be, but the triangle coverage method we're using just incurs too much cross product computations...) --- build.sh | 2 +- examples/tiger/Ghostscript_Tiger.svg | 725 +++++++++++++++++++++++++++ examples/tiger/main.c | 3 +- src/graphics.c | 14 + src/mtl_canvas.m | 92 +++- src/mtl_shader.h | 12 +- src/mtl_shader.metal | 420 ++++++++++++---- 7 files changed, 1137 insertions(+), 131 deletions(-) create mode 100644 examples/tiger/Ghostscript_Tiger.svg diff --git a/build.sh b/build.sh index c8f12c9..d658aba 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -DEBUG_FLAGS="-g -DDEBUG -DLOG_COMPILE_DEBUG" +DEBUG_FLAGS="-g -O2 -DDEBUG -DLOG_COMPILE_DEBUG" #DEBUG_FLAGS="-O3" #-------------------------------------------------------------- diff --git a/examples/tiger/Ghostscript_Tiger.svg b/examples/tiger/Ghostscript_Tiger.svg new file mode 100644 index 0000000..679edec --- /dev/null +++ b/examples/tiger/Ghostscript_Tiger.svg @@ -0,0 +1,725 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/examples/tiger/main.c b/examples/tiger/main.c index 830dee4..2d2df6a 100644 --- a/examples/tiger/main.c +++ b/examples/tiger/main.c @@ -158,6 +158,7 @@ int main() draw_tiger(); mg_matrix_pop(); +/* // text mg_set_color_rgba(0, 0, 1, 1); mg_set_font(font); @@ -170,7 +171,7 @@ int main() 1./frameTime); mg_text_outlines(text); mg_fill(); - +*/ printf("Milepost vector graphics test program (frame time = %fs, fps = %f)...\n", frameTime, 1./frameTime); diff --git a/src/graphics.c b/src/graphics.c index 66e09f2..f24fbcd 100644 --- a/src/graphics.c +++ b/src/graphics.c @@ -216,6 +216,8 @@ typedef struct mg_canvas_data mg_surface surface; mg_canvas_backend* backend; + + int splitCount; } mg_canvas_data; static mg_data __mgData = {0}; @@ -894,6 +896,9 @@ void mg_render_fill_quadratic(mg_canvas_data* canvas, vec2 p[3]) void mg_split_and_fill_cubic(mg_canvas_data* canvas, vec2 p[4], f32 tSplit) { + //DEBUG + __mgCurrentCanvas->splitCount++; + int subVertexCount = 0; int subIndexCount = 0; @@ -1639,6 +1644,9 @@ vec2 mg_quadratic_get_point(vec2 p[3], f32 t) void mg_quadratic_split(vec2 p[3], f32 t, vec2 outLeft[3], vec2 outRight[3]) { + //DEBUG + __mgCurrentCanvas->splitCount++; + //NOTE(martin): split bezier curve p at parameter t, using De Casteljau's algorithm // the q_n are the points along the hull's segments at parameter t // s is the split point. @@ -3055,6 +3063,9 @@ void mg_flush_commands(int primitiveCount, mg_primitive* primitives, mg_path_elt canvas->backend->begin(canvas->backend, canvas->clearColor); + //DEBUG + canvas->splitCount = 0; + for(int i=0; i= primitiveCount) @@ -3149,6 +3160,9 @@ void mg_flush_commands(int primitiveCount, mg_primitive* primitives, mg_path_elt } exit_command_loop: ; + printf("path elements: %i, splitCount = %i\n", canvas->path.startIndex + canvas->path.count, canvas->splitCount); + + mg_image_data* imageData = mg_image_data_from_handle(canvas->image); mg_draw_batch(canvas, imageData); diff --git a/src/mtl_canvas.m b/src/mtl_canvas.m index 5911829..154ebd5 100644 --- a/src/mtl_canvas.m +++ b/src/mtl_canvas.m @@ -36,6 +36,7 @@ typedef struct mg_mtl_canvas_backend // permanent metal resources id shapePipeline; id trianglePipeline; + id gatherPipeline; id computePipeline; id renderPipeline; @@ -54,8 +55,13 @@ typedef struct mg_mtl_canvas_backend id indexBuffer[MG_MTL_MAX_BUFFER_AVAILABLE]; id shapeQueueBuffer; id triangleArray; - id arenaBuffer; - id arenaOffset; + id tilesBuffer; + id tilesOffset; + id eltBuffer; + id eltOffset; + + id tileArrayBuffer; + id tileCounters; } mg_mtl_canvas_backend; @@ -211,12 +217,16 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image u32 nTilesY = (viewportSize.y + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE; int triangleCount = indexCount/3; + printf("triangle count: %i, shape count: %i\n", triangleCount, shapeCount); + //----------------------------------------------------------- //NOTE(martin): encode the clear arena offset //----------------------------------------------------------- id blitEncoder = [surface->commandBuffer blitCommandEncoder]; blitEncoder.label = @"clear arena"; - [blitEncoder fillBuffer: backend->arenaOffset range: NSMakeRange(0, sizeof(int)) value: 0]; + [blitEncoder fillBuffer: backend->tilesOffset range: NSMakeRange(0, sizeof(int)) value: 0]; + [blitEncoder fillBuffer: backend->eltOffset range: NSMakeRange(0, sizeof(int)) value: 0]; + [blitEncoder fillBuffer: backend->tileCounters range: NSMakeRange(0, RENDERER_MAX_TILES*sizeof(uint)) value: 0]; [blitEncoder endEncoding]; //----------------------------------------------------------- @@ -227,8 +237,8 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image [shapeEncoder setComputePipelineState: backend->shapePipeline]; [shapeEncoder setBuffer: backend->shapeBuffer[backend->bufferIndex] offset:backend->shapeBufferOffset atIndex: 0]; [shapeEncoder setBuffer: backend->shapeQueueBuffer offset:0 atIndex: 1]; - [shapeEncoder setBuffer: backend->arenaBuffer offset:0 atIndex: 2]; - [shapeEncoder setBuffer: backend->arenaOffset offset:0 atIndex: 3]; + [shapeEncoder setBuffer: backend->tilesBuffer offset:0 atIndex: 2]; + [shapeEncoder setBuffer: backend->tilesOffset offset:0 atIndex: 3]; [shapeEncoder setBytes: &scale length: sizeof(float) atIndex: 4]; MTLSize shapeGroupSize = MTLSizeMake(backend->shapePipeline.maxTotalThreadsPerThreadgroup, 1, 1); @@ -248,10 +258,11 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image [triangleEncoder setBuffer: backend->shapeBuffer[backend->bufferIndex] offset:backend->shapeBufferOffset atIndex: 2]; [triangleEncoder setBuffer: backend->triangleArray offset:0 atIndex: 3]; [triangleEncoder setBuffer: backend->shapeQueueBuffer offset:0 atIndex: 4]; - [triangleEncoder setBuffer: backend->arenaBuffer offset:0 atIndex: 5]; - [triangleEncoder setBuffer: backend->arenaOffset offset:0 atIndex: 6]; + [triangleEncoder setBuffer: backend->tilesBuffer offset:0 atIndex: 5]; + [triangleEncoder setBuffer: backend->eltBuffer offset:0 atIndex: 6]; + [triangleEncoder setBuffer: backend->eltOffset offset:0 atIndex: 7]; - [triangleEncoder setBytes: &scale length: sizeof(float) atIndex: 7]; + [triangleEncoder setBytes: &scale length: sizeof(float) atIndex: 8]; MTLSize triangleGroupSize = MTLSizeMake(backend->trianglePipeline.maxTotalThreadsPerThreadgroup, 1, 1); MTLSize triangleGridSize = MTLSizeMake(triangleCount, 1, 1); @@ -259,15 +270,36 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image [triangleEncoder dispatchThreads: triangleGridSize threadsPerThreadgroup: triangleGroupSize]; [triangleEncoder endEncoding]; + //----------------------------------------------------------- + //NOTE(martin): encode gathering pass + //----------------------------------------------------------- + id gatherEncoder = [surface->commandBuffer computeCommandEncoder]; + gatherEncoder.label = @"gather pass"; + [gatherEncoder setComputePipelineState: backend->gatherPipeline]; + [gatherEncoder setBuffer: backend->shapeQueueBuffer offset:0 atIndex: 0]; + [gatherEncoder setBuffer: backend->tilesBuffer offset:0 atIndex: 1]; + [gatherEncoder setBuffer: backend->eltBuffer offset:0 atIndex: 2]; + [gatherEncoder setBuffer: backend->tileCounters offset:0 atIndex: 3]; + [gatherEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 4]; + + [gatherEncoder setBytes: &shapeCount length: sizeof(int) atIndex: 5]; + [gatherEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 6]; + + MTLSize gatherGroupSize = MTLSizeMake(16, 16, 1); + MTLSize gatherGridSize = MTLSizeMake(nTilesX, nTilesY, 1); + + [gatherEncoder dispatchThreads: gatherGridSize threadsPerThreadgroup: gatherGroupSize]; + [gatherEncoder endEncoding]; + //----------------------------------------------------------- //NOTE(martin): encode drawing pass //----------------------------------------------------------- id drawEncoder = [surface->commandBuffer computeCommandEncoder]; drawEncoder.label = @"drawing pass"; [drawEncoder setComputePipelineState:backend->computePipeline]; - [drawEncoder setBuffer: backend->shapeQueueBuffer offset:0 atIndex: 0]; - [drawEncoder setBuffer: backend->triangleArray offset:0 atIndex: 1]; - [drawEncoder setBuffer: backend->arenaBuffer offset:0 atIndex: 2]; + [drawEncoder setBuffer: backend->tileCounters offset:0 atIndex: 0]; + [drawEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 1]; + [drawEncoder setBuffer: backend->triangleArray offset:0 atIndex: 2]; [drawEncoder setTexture: backend->outTexture atIndex: 0]; int useTexture = 0; @@ -278,9 +310,8 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image useTexture = 1; } - [drawEncoder setBytes: &shapeCount length:sizeof(int) atIndex: 3]; - [drawEncoder setBytes: &useTexture length:sizeof(int) atIndex: 4]; - [drawEncoder setBytes: &scale length: sizeof(float) atIndex: 5]; + [drawEncoder setBytes: &useTexture length:sizeof(int) atIndex: 3]; + [drawEncoder setBytes: &scale length: sizeof(float) atIndex: 4]; //TODO: check that we don't exceed maxTotalThreadsPerThreadgroup DEBUG_ASSERT(RENDERER_TILE_SIZE*RENDERER_TILE_SIZE <= backend->computePipeline.maxTotalThreadsPerThreadgroup); @@ -374,8 +405,10 @@ void mg_mtl_canvas_destroy(mg_canvas_backend* interface) [backend->shapeQueueBuffer release]; [backend->triangleArray release]; - [backend->arenaBuffer release]; - [backend->arenaOffset release]; + [backend->tilesBuffer release]; + [backend->tilesOffset release]; + [backend->eltBuffer release]; + [backend->eltOffset release]; ////////////////////////////////////////// //TODO release all pipelines @@ -485,6 +518,8 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface) texDesc.height = drawableSize.height; backend->outTexture = [metalSurface->device newTextureWithDescriptor:texDesc]; + + texDesc.usage = MTLTextureUsageRenderTarget | MTLTextureUsageShaderRead; backend->backbuffer = [metalSurface->device newTextureWithDescriptor:texDesc]; //TODO(martin): retain ? @@ -516,12 +551,24 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface) backend->shapeQueueBuffer = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_shape_queue) options: MTLResourceStorageModePrivate]; - backend->arenaBuffer = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_queue_elt) + backend->tilesBuffer = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_tile) options: MTLResourceStorageModePrivate]; - backend->arenaOffset = [metalSurface->device newBufferWithLength: sizeof(int) + backend->tilesOffset = [metalSurface->device newBufferWithLength: sizeof(int) options: MTLResourceStorageModePrivate]; + backend->eltBuffer = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_tile_elt) + options: MTLResourceStorageModePrivate]; + + backend->eltOffset = [metalSurface->device newBufferWithLength: sizeof(int) + options: MTLResourceStorageModePrivate]; + + backend->tileArrayBuffer = [metalSurface->device newBufferWithLength: RENDERER_TILE_BUFFER_SIZE*sizeof(int)*RENDERER_MAX_TILES + options: MTLResourceStorageModePrivate]; + + backend->tileCounters = [metalSurface->device newBufferWithLength: RENDERER_MAX_TILES*sizeof(uint) + options: MTLResourceStorageModePrivate]; + //----------------------------------------------------------- //NOTE(martin): load the library //----------------------------------------------------------- @@ -539,6 +586,7 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface) } id shapeFunction = [library newFunctionWithName:@"ShapeSetup"]; id triangleFunction = [library newFunctionWithName:@"TriangleKernel"]; + id gatherFunction = [library newFunctionWithName:@"GatherKernel"]; id computeFunction = [library newFunctionWithName:@"RenderKernel"]; id vertexFunction = [library newFunctionWithName:@"VertexShader"]; id fragmentFunction = [library newFunctionWithName:@"FragmentShader"]; @@ -567,6 +615,14 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface) reflection: nil error: &error]; + MTLComputePipelineDescriptor* gatherPipelineDesc = [[MTLComputePipelineDescriptor alloc] init]; + gatherPipelineDesc.computeFunction = gatherFunction; + + backend->gatherPipeline = [metalSurface->device newComputePipelineStateWithDescriptor: gatherPipelineDesc + options: MTLPipelineOptionNone + reflection: nil + error: &error]; + //----------------------------------------------------------- //NOTE(martin): setup our render pipeline state //----------------------------------------------------------- diff --git a/src/mtl_shader.h b/src/mtl_shader.h index abfa981..f44b0d8 100644 --- a/src/mtl_shader.h +++ b/src/mtl_shader.h @@ -43,7 +43,6 @@ typedef struct mg_triangle_data vector_float4 cubic2; vector_int4 box; - vector_int4 tileBox; vector_int2 p0; vector_int2 p1; @@ -69,17 +68,18 @@ typedef struct mg_tile_elt { int triangleIndex; int next; -} mg_queue_elt; +} mg_tile_elt; -typedef struct mg_tile_queue +typedef struct mg_tile { - atomic_int first; -} mg_tile_queue; + atomic_int eltCount; + atomic_int firstElt; +} mg_tile; typedef struct mg_shape_queue { vector_int4 area; - device mg_tile_queue* tileQueues; + int tiles; } mg_shape_queue; #ifndef __METAL_VERSION__ diff --git a/src/mtl_shader.metal b/src/mtl_shader.metal index 81399d3..7f2c724 100644 --- a/src/mtl_shader.metal +++ b/src/mtl_shader.metal @@ -58,8 +58,8 @@ device uchar* arena_allocate(device uchar* arenaBuffer, kernel void ShapeSetup(constant mg_shape* shapeBuffer [[buffer(0)]], device mg_shape_queue* shapeQueueBuffer [[buffer(1)]], - device uchar* arenaBuffer [[buffer(2)]], - device volatile atomic_uint* arenaOffset [[buffer(3)]], + device mg_tile* tilesBuffer [[buffer(2)]], + device volatile atomic_uint* tilesOffset [[buffer(3)]], constant float* scaling [[buffer(4)]], uint gid [[thread_position_in_grid]]) { @@ -76,14 +76,18 @@ kernel void ShapeSetup(constant mg_shape* shapeBuffer [[buffer(0)]], int nTilesY = int(box.w)/RENDERER_TILE_SIZE - firstTile.y + 1; int tileCount = nTilesX * nTilesY; - int tileArraySize = tileCount * sizeof(mg_tile_queue); + + int tilesIndex = atomic_fetch_add_explicit(tilesOffset, tileCount, memory_order_relaxed); shapeQueueBuffer[gid].area = int4(firstTile.x, firstTile.y, nTilesX, nTilesY); - shapeQueueBuffer[gid].tileQueues = (device mg_tile_queue*)arena_allocate(arenaBuffer, arenaOffset, tileArraySize); + shapeQueueBuffer[gid].tiles = tilesIndex; + + device mg_tile* tiles = &tilesBuffer[tilesIndex]; for(int i=0; itiles]; int xMin = max(0, tileBox.x - shapeQueue->area.x); int yMin = max(0, tileBox.y - shapeQueue->area.y); @@ -170,24 +174,72 @@ kernel void TriangleKernel(constant mg_vertex* vertexBuffer [[buffer(0)]], { int tileIndex = y*shapeQueue->area.z + x; - device mg_tile_queue* tileQueue = &shapeQueue->tileQueues[tileIndex]; - device mg_queue_elt* elt = (device mg_queue_elt*)arena_allocate(arenaBuffer, arenaOffset, sizeof(mg_queue_elt)); - int eltIndex = (device uchar*)elt - arenaBuffer; - - elt->next = atomic_exchange_explicit(&tileQueue->first, eltIndex, memory_order_relaxed); + int eltIndex = atomic_fetch_add_explicit(eltOffset, 1, memory_order_relaxed); + device mg_tile_elt* elt = &eltBuffer[eltIndex]; elt->triangleIndex = gid; + elt->next = atomic_exchange_explicit(&tiles[tileIndex].firstElt, eltIndex, memory_order_relaxed); + atomic_fetch_add_explicit(&tiles[tileIndex].eltCount, 1, memory_order_relaxed); } } } -kernel void RenderKernel(const device mg_shape_queue* shapeQueueBuffer [[buffer(0)]], - const device mg_triangle_data* triangleArray [[buffer(1)]], - const device uchar* arenaBuffer [[buffer(2)]], +kernel void GatherKernel(const device mg_shape_queue* shapeQueueBuffer [[buffer(0)]], + const device mg_tile* tilesBuffer [[buffer(1)]], + const device mg_tile_elt* eltBuffer [[buffer(2)]], + device int* tileCounters [[buffer(3)]], + device int* tileArrayBuffer [[buffer(4)]], + constant int* shapeCount [[buffer(5)]], + constant uint2* viewport [[buffer(6)]], + uint2 gid [[thread_position_in_grid]]) +{ + uint2 tilesMatrixDim = (*viewport - 1) / RENDERER_TILE_SIZE + 1; + int nTilesX = tilesMatrixDim.x; - constant int* shapeCount [[buffer(3)]], - constant int* useTexture [[buffer(4)]], - constant float* scaling [[buffer(5)]], + int2 tileCoord = int2(gid); + int tileIndex = tileCoord.y * nTilesX + tileCoord.x; + + device int* tileArray = &tileArrayBuffer[tileIndex * RENDERER_TILE_BUFFER_SIZE]; + + int count = 0; + for(int shapeIndex = 0; shapeIndex < shapeCount[0]; shapeIndex++) + { + const device mg_shape_queue* shapeQueue = &shapeQueueBuffer[shapeIndex]; + const device mg_tile* tiles = &tilesBuffer[shapeQueue->tiles]; + + // get the tile queue that corresponds to our tile in the shape area + int2 tileQueueCoord = tileCoord - shapeQueue->area.xy; + + if( tileQueueCoord.x >= 0 + && tileQueueCoord.y >= 0 + && tileQueueCoord.x < shapeQueue->area.z + && tileQueueCoord.y < shapeQueue->area.w) + { + int localIndex = tileQueueCoord.y * shapeQueue->area.z + tileQueueCoord.x; + const device mg_tile* tile = &tiles[localIndex]; + + int firstEltIndex = *(device int*)&tile->firstElt; + const device mg_tile_elt* elt = 0; + + for(int eltIndex = firstEltIndex; eltIndex >= 0; eltIndex = elt->next) + { + elt = &eltBuffer[eltIndex]; + eltIndex = elt->next; + + tileArray[count] = elt->triangleIndex; + count++; + } + } + } + tileCounters[tileIndex] = count; +} + +kernel void RenderKernel(const device uint* tileCounters [[buffer(0)]], + const device uint* tileArrayBuffer [[buffer(1)]], + const device mg_triangle_data* triangleArray [[buffer(2)]], + + constant int* useTexture [[buffer(3)]], + constant float* scaling [[buffer(4)]], texture2d outTexture [[texture(0)]], texture2d texAtlas [[texture(1)]], @@ -199,7 +251,37 @@ kernel void RenderKernel(const device mg_shape_queue* shapeQueueBuffer [[buffer( { //TODO: guard against thread group size not equal to tile size? const int2 pixelCoord = int2(gid); - const int2 tileCoord = pixelCoord/ RENDERER_TILE_SIZE; + const uint2 tileCoord = uint2(pixelCoord)/ RENDERER_TILE_SIZE; + const uint2 tilesMatrixDim = (gridSize - 1) / RENDERER_TILE_SIZE + 1; + const uint tileIndex = tileCoord.y * tilesMatrixDim.x + tileCoord.x; + const uint tileCounter = min(tileCounters[tileIndex], (uint)RENDERER_TILE_BUFFER_SIZE); + +#ifdef RENDERER_DEBUG_TILES + //NOTE(martin): color code debug values and show the tile grid + { + float4 fragColor = float4(0); + + if( pixelCoord.x % 16 == 0 + ||pixelCoord.y % 16 == 0) + { + fragColor = float4(0, 0, 0, 1); + } + else if(tileCounters[tileIndex] == 0xffffu) + { + fragColor = float4(1, 0, 1, 1); + } + else if(tileCounter != 0u) + { + fragColor = float4(0, 1, 0, 1); + } + else + { + fragColor = float4(1, 0, 0, 1); + } + outTexture.write(fragColor, gid); + return; + } +#endif const int subPixelFactor = 16; const int2 centerPoint = int2((float2(pixelCoord) + float2(0.5, 0.5)) * subPixelFactor); @@ -227,103 +309,230 @@ kernel void RenderKernel(const device mg_shape_queue* shapeQueueBuffer [[buffer( currentColor[i] = float4(0, 0, 0, 0); } - for(int shapeIndex = 0; shapeIndex < shapeCount[0]; shapeIndex++) + for(uint tileArrayIndex=0; tileArrayIndex < tileCounter; tileArrayIndex++) { - const device mg_shape_queue* shapeQueue = &shapeQueueBuffer[shapeIndex]; + int triangleIndex = tileArrayBuffer[RENDERER_TILE_BUFFER_SIZE * tileIndex + tileArrayIndex]; + const device mg_triangle_data* triangle = &triangleArray[triangleIndex]; - // get the tile queue that corresponds to our tile in the shape area - int2 tileQueueCoord = tileCoord - shapeQueue->area.xy; - if( tileQueueCoord.x >= 0 - && tileQueueCoord.y >= 0 - && tileQueueCoord.x < shapeQueue->area.z - && tileQueueCoord.y < shapeQueue->area.w) + int2 p0 = triangle->p0; + int2 p1 = triangle->p1; + int2 p2 = triangle->p2; + + int cw = triangle->cw; + + int bias0 = triangle->bias0; + int bias1 = triangle->bias1; + int bias2 = triangle->bias2; + + float4 cubic0 = triangle->cubic0; + float4 cubic1 = triangle->cubic1; + float4 cubic2 = triangle->cubic2; + + int shapeIndex = triangle->shapeIndex; + float4 color = triangle->color; + color.rgb *= color.a; + + int4 clip = triangle->box; + + matrix_float3x3 uvTransform = triangle->uvTransform; + + for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++) { - int tileQueueIndex = tileQueueCoord.y * shapeQueue->area.z + tileQueueCoord.x; - device mg_tile_queue* tileQueue = &shapeQueue->tileQueues[tileQueueIndex]; + int2 samplePoint = samplePoints[sampleIndex]; - int firstEltIndex = atomic_load_explicit(&tileQueue->first, memory_order_relaxed); - device mg_queue_elt* elt = 0; - - for(int eltIndex = firstEltIndex; eltIndex >= 0; eltIndex = elt->next) + if( samplePoint.x < clip.x + || samplePoint.x > clip.z + || samplePoint.y < clip.y + || samplePoint.y > clip.w) { - elt = (device mg_queue_elt*)(arenaBuffer + eltIndex); - const device mg_triangle_data* triangle = &triangleArray[elt->triangleIndex]; + continue; + } - int2 p0 = triangle->p0; - int2 p1 = triangle->p1; - int2 p2 = triangle->p2; + int w0 = cw*orient2d(p1, p2, samplePoint); + int w1 = cw*orient2d(p2, p0, samplePoint); + int w2 = cw*orient2d(p0, p1, samplePoint); - int cw = triangle->cw; + if((w0+bias0) >= 0 && (w1+bias1) >= 0 && (w2+bias2) >= 0) + { + float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2); - int bias0 = triangle->bias0; - int bias1 = triangle->bias1; - int bias2 = triangle->bias2; - - float4 cubic0 = triangle->cubic0; - float4 cubic1 = triangle->cubic1; - float4 cubic2 = triangle->cubic2; - - int shapeIndex = triangle->shapeIndex; - float4 color = triangle->color; - color.rgb *= color.a; - - int4 clip = triangle->box; - - matrix_float3x3 uvTransform = triangle->uvTransform; - - for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++) + float eps = 0.0001; + if(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= eps) { - int2 samplePoint = samplePoints[sampleIndex]; - - if( samplePoint.x < clip.x - || samplePoint.x > clip.z - || samplePoint.y < clip.y - || samplePoint.y > clip.w) + if(shapeIndex == currentShapeIndex[sampleIndex]) { - continue; + flipCount[sampleIndex]++; } - - int w0 = cw*orient2d(p1, p2, samplePoint); - int w1 = cw*orient2d(p2, p0, samplePoint); - int w2 = cw*orient2d(p0, p1, samplePoint); - - if((w0+bias0) >= 0 && (w1+bias1) >= 0 && (w2+bias2) >= 0) + else { - float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2); - - float eps = 0.0001; - if(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= eps) + if(flipCount[sampleIndex] & 0x01) { - if(shapeIndex == currentShapeIndex[sampleIndex]) - { - flipCount[sampleIndex]++; - } - else - { - if(flipCount[sampleIndex] & 0x01) - { - sampleColor[sampleIndex] = currentColor[sampleIndex]; - } - - float4 nextColor = color; - - if(useTexture[0]) - { - float3 sampleFP = float3(float2(samplePoint).xy/(subPixelFactor*2.), 1); - float2 uv = (uvTransform * sampleFP).xy; - - constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear); - float4 texColor = texAtlas.sample(smp, uv); - - texColor.rgb *= texColor.a; - nextColor *= texColor; - } - - currentColor[sampleIndex] = sampleColor[sampleIndex]*(1.-nextColor.a) + nextColor; - currentShapeIndex[sampleIndex] = shapeIndex; - flipCount[sampleIndex] = 1; - } + sampleColor[sampleIndex] = currentColor[sampleIndex]; } + + float4 nextColor = color; + + if(useTexture[0]) + { + float3 sampleFP = float3(float2(samplePoint).xy/(subPixelFactor*2.), 1); + float2 uv = (uvTransform * sampleFP).xy; + + constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear); + float4 texColor = texAtlas.sample(smp, uv); + + texColor.rgb *= texColor.a; + nextColor *= texColor; + } + + currentColor[sampleIndex] = sampleColor[sampleIndex]*(1.-nextColor.a) + nextColor; + currentShapeIndex[sampleIndex] = shapeIndex; + flipCount[sampleIndex] = 1; + } + } + } + } + } + + float4 pixelColor = float4(0); + for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++) + { + if(flipCount[sampleIndex] & 0x01) + { + sampleColor[sampleIndex] = currentColor[sampleIndex]; + } + pixelColor += sampleColor[sampleIndex]; + } + + outTexture.write(pixelColor/float(sampleCount), gid); +} + + +/* +kernel void RenderKernel(const device uint* tileCounters [[buffer(0)]], + const device uint* tileArrayBuffer [[buffer(1)]], + const device mg_triangle_data* triangleArray [[buffer(2)]], + + constant int* useTexture [[buffer(3)]], + constant float* scaling [[buffer(4)]], + + texture2d outTexture [[texture(0)]], + texture2d texAtlas [[texture(1)]], + + uint2 gid [[thread_position_in_grid]], + uint2 tgid [[threadgroup_position_in_grid]], + uint2 threadsPerThreadgroup [[threads_per_threadgroup]], + uint2 gridSize [[threads_per_grid]]) +{ + const int2 pixelCoord = int2(gid); + const uint2 tileCoord = uint2(pixelCoord)/ RENDERER_TILE_SIZE; + const uint2 tilesMatrixDim = (gridSize - 1) / RENDERER_TILE_SIZE + 1; + const uint tileIndex = tileCoord.y * tilesMatrixDim.x + tileCoord.x; + const uint tileCounter = min(tileCounters[tileIndex], (uint)RENDERER_TILE_BUFFER_SIZE); + + const int subPixelFactor = 16; + const int2 centerPoint = int2((float2(pixelCoord) + float2(0.5, 0.5)) * subPixelFactor); + + const int sampleCount = 8; + int2 samplePoints[sampleCount] = {centerPoint + int2(1, 3), + centerPoint + int2(-1, -3), + centerPoint + int2(5, -1), + centerPoint + int2(-3, 5), + centerPoint + int2(-5, -5), + centerPoint + int2(-7, 1), + centerPoint + int2(3, -7), + centerPoint + int2(7, 7)}; + + float4 sampleColor[sampleCount]; + float4 currentColor[sampleCount]; + int currentShapeIndex[sampleCount]; + int flipCount[sampleCount]; + + for(int i=0; ip0; + int2 p1 = triangle->p1; + int2 p2 = triangle->p2; + + int cw = triangle->cw; + + int bias0 = triangle->bias0; + int bias1 = triangle->bias1; + int bias2 = triangle->bias2; + + float4 cubic0 = triangle->cubic0; + float4 cubic1 = triangle->cubic1; + float4 cubic2 = triangle->cubic2; + + int shapeIndex = triangle->shapeIndex; + float4 color = triangle->color; + color.rgb *= color.a; + + int4 clip = triangle->box; + + matrix_float3x3 uvTransform = triangle->uvTransform; + + for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++) + { + int2 samplePoint = samplePoints[sampleIndex]; + + if( samplePoint.x < clip.x + || samplePoint.x > clip.z + || samplePoint.y < clip.y + || samplePoint.y > clip.w) + { + continue; + } + + int w0 = cw*orient2d(p1, p2, samplePoint); + int w1 = cw*orient2d(p2, p0, samplePoint); + int w2 = cw*orient2d(p0, p1, samplePoint); + + if((w0+bias0) >= 0 && (w1+bias1) >= 0 && (w2+bias2) >= 0) + { + float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2); + + float eps = 0.0001; + if(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= eps) + { + if(shapeIndex == currentShapeIndex[sampleIndex]) + { + flipCount[sampleIndex]++; + } + else + { + if(flipCount[sampleIndex] & 0x01) + { + sampleColor[sampleIndex] = currentColor[sampleIndex]; + } + + float4 nextColor = color; + + if(useTexture[0]) + { + float3 sampleFP = float3(float2(samplePoint).xy/(subPixelFactor*2.), 1); + float2 uv = (uvTransform * sampleFP).xy; + + constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear); + float4 texColor = texAtlas.sample(smp, uv); + + texColor.rgb *= texColor.a; + nextColor *= texColor; + } + + currentColor[sampleIndex] = sampleColor[sampleIndex]*(1.-nextColor.a) + nextColor; + currentShapeIndex[sampleIndex] = shapeIndex; + flipCount[sampleIndex] = 1; } } } @@ -343,3 +552,4 @@ kernel void RenderKernel(const device mg_shape_queue* shapeQueueBuffer [[buffer( outTexture.write(pixelColor/float(sampleCount), gid); } +*/