From b300cc4d7d178ecfd7504e7998627bcfd55e9527 Mon Sep 17 00:00:00 2001 From: martinfouilleul Date: Thu, 27 Jul 2023 15:24:20 +0200 Subject: [PATCH] [gl canvas] balance dispatch of raster shader along 2 dimensions to avoid hitting the max workgroup count per dimension --- build.bat | 2 +- src/gl_canvas.c | 26 +++++++++++++++++++++-- src/glsl_shaders/balance_workgroups.glsl | 27 ++++++++++++++++++++++++ src/glsl_shaders/merge.glsl | 11 ++++------ src/glsl_shaders/raster.glsl | 15 ++++++++++++- 5 files changed, 70 insertions(+), 11 deletions(-) create mode 100644 src/glsl_shaders/balance_workgroups.glsl diff --git a/build.bat b/build.bat index 798dbe1..35fb163 100644 --- a/build.bat +++ b/build.bat @@ -4,7 +4,7 @@ setlocal EnableDelayedExpansion if not exist bin mkdir bin -set glsl_shaders=src\glsl_shaders\common.glsl src\glsl_shaders\blit_vertex.glsl src\glsl_shaders\blit_fragment.glsl src\glsl_shaders\path_setup.glsl src\glsl_shaders\segment_setup.glsl src\glsl_shaders\backprop.glsl src\glsl_shaders\merge.glsl src\glsl_shaders\raster.glsl +set glsl_shaders=src\glsl_shaders\common.glsl src\glsl_shaders\blit_vertex.glsl src\glsl_shaders\blit_fragment.glsl src\glsl_shaders\path_setup.glsl src\glsl_shaders\segment_setup.glsl src\glsl_shaders\backprop.glsl src\glsl_shaders\merge.glsl src\glsl_shaders\raster.glsl src\glsl_shaders\balance_workgroups.glsl call python3 scripts\embed_text.py %glsl_shaders% --prefix=glsl_ --output src\glsl_shaders.h diff --git a/src/gl_canvas.c b/src/gl_canvas.c index 8b5039f..bd82050 100644 --- a/src/gl_canvas.c +++ b/src/gl_canvas.c @@ -149,6 +149,7 @@ typedef struct mg_gl_canvas_backend GLuint segmentSetup; GLuint backprop; GLuint merge; + GLuint balanceWorkgroups; GLuint raster; GLuint blit; @@ -167,6 +168,7 @@ typedef struct mg_gl_canvas_backend GLuint tileOpBuffer; GLuint tileOpCountBuffer; GLuint screenTilesBuffer; + GLuint screenTilesCountBuffer; GLuint rasterDispatchBuffer; GLuint dummyVertexBuffer; @@ -1095,6 +1097,9 @@ void mg_gl_render_batch(mg_gl_canvas_backend* backend, glBindBuffer(GL_SHADER_STORAGE_BUFFER, backend->rasterDispatchBuffer); glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(mg_gl_dispatch_indirect_command), &zero, GL_DYNAMIC_COPY); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, backend->screenTilesCountBuffer); + glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(int), &zero, GL_DYNAMIC_COPY); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); int err = glGetError(); @@ -1210,7 +1215,7 @@ void mg_gl_render_batch(mg_gl_canvas_backend* backend, glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, backend->tileOpCountBuffer); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, backend->tileOpBuffer); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, backend->screenTilesBuffer); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, backend->rasterDispatchBuffer); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, backend->screenTilesCountBuffer); glUniform1i(0, tileSize); glUniform1f(1, scale); @@ -1239,6 +1244,17 @@ void mg_gl_render_batch(mg_gl_canvas_backend* backend, log_error("gl error %i\n", err); } } + + //NOTE: balance work groups + glUseProgram(backend->balanceWorkgroups); + + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, backend->screenTilesCountBuffer); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, backend->rasterDispatchBuffer); + glUniform1ui(0, maxWorkGroupCount); + + glDispatchCompute(1, 1, 1); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + //NOTE: raster pass glUseProgram(backend->raster); @@ -1246,6 +1262,7 @@ void mg_gl_render_batch(mg_gl_canvas_backend* backend, glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, backend->segmentBuffer); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, backend->tileOpBuffer); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, backend->screenTilesBuffer); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, backend->screenTilesCountBuffer); glUniform1f(0, scale); glUniform1i(1, backend->msaaCount); @@ -1265,6 +1282,7 @@ void mg_gl_render_batch(mg_gl_canvas_backend* backend, } glUniform1i(3, backend->pathBatchStart); + glUniform1ui(4, maxWorkGroupCount); glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, backend->rasterDispatchBuffer); glDispatchComputeIndirect(0); @@ -1659,6 +1677,7 @@ mg_canvas_backend* gl_canvas_backend_create(mg_wgl_surface* surface) err |= mg_gl_canvas_compile_compute_program(glsl_segment_setup, &backend->segmentSetup); err |= mg_gl_canvas_compile_compute_program(glsl_backprop, &backend->backprop); err |= mg_gl_canvas_compile_compute_program(glsl_merge, &backend->merge); + err |= mg_gl_canvas_compile_compute_program(glsl_balance_workgroups, &backend->balanceWorkgroups); err |= mg_gl_canvas_compile_compute_program(glsl_raster, &backend->raster); err |= mg_gl_canvas_compile_render_program("blit", glsl_blit_vertex, glsl_blit_fragment, &backend->blit); @@ -1744,11 +1763,14 @@ mg_canvas_backend* gl_canvas_backend_create(mg_wgl_surface* surface) glBindBuffer(GL_SHADER_STORAGE_BUFFER, backend->screenTilesBuffer); glBufferData(GL_SHADER_STORAGE_BUFFER, nTilesX*nTilesY*MG_GL_SCREEN_TILE_SIZE, 0, GL_DYNAMIC_COPY); + glGenBuffers(1, &backend->screenTilesCountBuffer); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, backend->screenTilesCountBuffer); + glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(int), 0, GL_DYNAMIC_COPY); + glGenBuffers(1, &backend->rasterDispatchBuffer); glBindBuffer(GL_SHADER_STORAGE_BUFFER, backend->rasterDispatchBuffer); glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(mg_gl_dispatch_indirect_command), 0, GL_DYNAMIC_COPY); - if(err) { mg_gl_canvas_destroy((mg_canvas_backend*)backend); diff --git a/src/glsl_shaders/balance_workgroups.glsl b/src/glsl_shaders/balance_workgroups.glsl new file mode 100644 index 0000000..668c634 --- /dev/null +++ b/src/glsl_shaders/balance_workgroups.glsl @@ -0,0 +1,27 @@ + +layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; + +precision mediump float; +layout(std430) buffer; + +layout(binding = 0) coherent restrict readonly buffer screenTilesCountBufferSSBO +{ + int elements[]; +} screenTilesCountBuffer; + +layout(binding = 1) coherent restrict writeonly buffer dispatchBufferSSBO +{ + mg_gl_dispatch_indirect_command elements[]; +} dispatchBuffer; + + +layout(location = 0) uniform uint maxWorkGroupCount; + +void main() +{ + uint totalWorkGroupCount = screenTilesCountBuffer.elements[0]; + + dispatchBuffer.elements[0].num_groups_x = totalWorkGroupCount > maxWorkGroupCount ? maxWorkGroupCount : totalWorkGroupCount; + dispatchBuffer.elements[0].num_groups_y = (totalWorkGroupCount + maxWorkGroupCount - 1) / maxWorkGroupCount; + dispatchBuffer.elements[0].num_groups_z = 1; +} diff --git a/src/glsl_shaders/merge.glsl b/src/glsl_shaders/merge.glsl index a119305..3a8ffa9 100644 --- a/src/glsl_shaders/merge.glsl +++ b/src/glsl_shaders/merge.glsl @@ -34,10 +34,10 @@ layout(binding = 5) restrict writeonly buffer screenTilesBufferSSBO mg_gl_screen_tile elements[]; } screenTilesBuffer; -layout(binding = 6) coherent restrict buffer dispatchBufferSSBO +layout(binding = 6) coherent restrict buffer screenTilesCountBufferSSBO { - mg_gl_dispatch_indirect_command elements[]; -} dispatchBuffer; + int elements[]; +} screenTilesCountBuffer; layout(location = 0) uniform int tileSize; @@ -53,9 +53,6 @@ void main() int lastOpIndex = -1; - dispatchBuffer.elements[0].num_groups_y = 1; - dispatchBuffer.elements[0].num_groups_z = 1; - for(int pathIndex = 0; pathIndex < pathCount; pathIndex++) { mg_gl_path_queue pathQueue = pathQueueBuffer.elements[pathIndex]; @@ -75,7 +72,7 @@ void main() { if(tileIndex < 0) { - tileIndex = int(atomicAdd(dispatchBuffer.elements[0].num_groups_x, 1)); + tileIndex = int(atomicAdd(screenTilesCountBuffer.elements[0], 1)); screenTilesBuffer.elements[tileIndex].tileCoord = uvec2(tileCoord); screenTilesBuffer.elements[tileIndex].first = -1; } diff --git a/src/glsl_shaders/raster.glsl b/src/glsl_shaders/raster.glsl index 526102e..e7f6188 100644 --- a/src/glsl_shaders/raster.glsl +++ b/src/glsl_shaders/raster.glsl @@ -24,17 +24,30 @@ layout(binding = 3) restrict readonly buffer screenTilesBufferSSBO mg_gl_screen_tile elements[]; } screenTilesBuffer; +layout(binding = 4) restrict readonly buffer screenTilesCountBufferSSBO +{ + int elements[]; +} screenTilesCountBuffer; + + layout(location = 0) uniform float scale; layout(location = 1) uniform int msaaSampleCount; layout(location = 2) uniform uint useTexture; layout(location = 3) uniform int pathBufferStart; +layout(location = 4) uniform uint maxWorkGroupCount; layout(rgba8, binding = 0) uniform restrict writeonly image2D outTexture; layout(binding = 1) uniform sampler2D srcTexture; void main() { - uint tileIndex = gl_WorkGroupID.x; + uint tileIndex = gl_WorkGroupID.y * maxWorkGroupCount + gl_WorkGroupID.x; + + if(tileIndex >= screenTilesCountBuffer.elements[0]) + { + return; + } + uvec2 tileCoord = screenTilesBuffer.elements[tileIndex].tileCoord; ivec2 pixelCoord = ivec2(tileCoord * gl_WorkGroupSize.x + gl_LocalInvocationID.xy);