[gl canvas] balance dispatch of raster shader along 2 dimensions to avoid hitting the max workgroup count per dimension

This commit is contained in:
martinfouilleul 2023-07-27 15:24:20 +02:00
parent 3c103eeb65
commit b300cc4d7d
5 changed files with 70 additions and 11 deletions

View File

@ -4,7 +4,7 @@ setlocal EnableDelayedExpansion
if not exist bin mkdir bin
set glsl_shaders=src\glsl_shaders\common.glsl src\glsl_shaders\blit_vertex.glsl src\glsl_shaders\blit_fragment.glsl src\glsl_shaders\path_setup.glsl src\glsl_shaders\segment_setup.glsl src\glsl_shaders\backprop.glsl src\glsl_shaders\merge.glsl src\glsl_shaders\raster.glsl
set glsl_shaders=src\glsl_shaders\common.glsl src\glsl_shaders\blit_vertex.glsl src\glsl_shaders\blit_fragment.glsl src\glsl_shaders\path_setup.glsl src\glsl_shaders\segment_setup.glsl src\glsl_shaders\backprop.glsl src\glsl_shaders\merge.glsl src\glsl_shaders\raster.glsl src\glsl_shaders\balance_workgroups.glsl
call python3 scripts\embed_text.py %glsl_shaders% --prefix=glsl_ --output src\glsl_shaders.h

View File

@ -149,6 +149,7 @@ typedef struct mg_gl_canvas_backend
GLuint segmentSetup;
GLuint backprop;
GLuint merge;
GLuint balanceWorkgroups;
GLuint raster;
GLuint blit;
@ -167,6 +168,7 @@ typedef struct mg_gl_canvas_backend
GLuint tileOpBuffer;
GLuint tileOpCountBuffer;
GLuint screenTilesBuffer;
GLuint screenTilesCountBuffer;
GLuint rasterDispatchBuffer;
GLuint dummyVertexBuffer;
@ -1095,6 +1097,9 @@ void mg_gl_render_batch(mg_gl_canvas_backend* backend,
glBindBuffer(GL_SHADER_STORAGE_BUFFER, backend->rasterDispatchBuffer);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(mg_gl_dispatch_indirect_command), &zero, GL_DYNAMIC_COPY);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, backend->screenTilesCountBuffer);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(int), &zero, GL_DYNAMIC_COPY);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
int err = glGetError();
@ -1210,7 +1215,7 @@ void mg_gl_render_batch(mg_gl_canvas_backend* backend,
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, backend->tileOpCountBuffer);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, backend->tileOpBuffer);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, backend->screenTilesBuffer);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, backend->rasterDispatchBuffer);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, backend->screenTilesCountBuffer);
glUniform1i(0, tileSize);
glUniform1f(1, scale);
@ -1239,6 +1244,17 @@ void mg_gl_render_batch(mg_gl_canvas_backend* backend,
log_error("gl error %i\n", err);
}
}
//NOTE: balance work groups
glUseProgram(backend->balanceWorkgroups);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, backend->screenTilesCountBuffer);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, backend->rasterDispatchBuffer);
glUniform1ui(0, maxWorkGroupCount);
glDispatchCompute(1, 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
//NOTE: raster pass
glUseProgram(backend->raster);
@ -1246,6 +1262,7 @@ void mg_gl_render_batch(mg_gl_canvas_backend* backend,
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, backend->segmentBuffer);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, backend->tileOpBuffer);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, backend->screenTilesBuffer);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, backend->screenTilesCountBuffer);
glUniform1f(0, scale);
glUniform1i(1, backend->msaaCount);
@ -1265,6 +1282,7 @@ void mg_gl_render_batch(mg_gl_canvas_backend* backend,
}
glUniform1i(3, backend->pathBatchStart);
glUniform1ui(4, maxWorkGroupCount);
glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, backend->rasterDispatchBuffer);
glDispatchComputeIndirect(0);
@ -1659,6 +1677,7 @@ mg_canvas_backend* gl_canvas_backend_create(mg_wgl_surface* surface)
err |= mg_gl_canvas_compile_compute_program(glsl_segment_setup, &backend->segmentSetup);
err |= mg_gl_canvas_compile_compute_program(glsl_backprop, &backend->backprop);
err |= mg_gl_canvas_compile_compute_program(glsl_merge, &backend->merge);
err |= mg_gl_canvas_compile_compute_program(glsl_balance_workgroups, &backend->balanceWorkgroups);
err |= mg_gl_canvas_compile_compute_program(glsl_raster, &backend->raster);
err |= mg_gl_canvas_compile_render_program("blit", glsl_blit_vertex, glsl_blit_fragment, &backend->blit);
@ -1744,11 +1763,14 @@ mg_canvas_backend* gl_canvas_backend_create(mg_wgl_surface* surface)
glBindBuffer(GL_SHADER_STORAGE_BUFFER, backend->screenTilesBuffer);
glBufferData(GL_SHADER_STORAGE_BUFFER, nTilesX*nTilesY*MG_GL_SCREEN_TILE_SIZE, 0, GL_DYNAMIC_COPY);
glGenBuffers(1, &backend->screenTilesCountBuffer);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, backend->screenTilesCountBuffer);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(int), 0, GL_DYNAMIC_COPY);
glGenBuffers(1, &backend->rasterDispatchBuffer);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, backend->rasterDispatchBuffer);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(mg_gl_dispatch_indirect_command), 0, GL_DYNAMIC_COPY);
if(err)
{
mg_gl_canvas_destroy((mg_canvas_backend*)backend);

View File

@ -0,0 +1,27 @@
// Builds the indirect dispatch command from the count stored in screenTilesCountBuffer,
// spreading the work groups over the x and y dimensions so that neither dimension
// exceeds maxWorkGroupCount. The shader runs as a single invocation.
// mg_gl_dispatch_indirect_command is not declared in this file.
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

precision mediump float;
layout(std430) buffer;

// Input: elements[0] holds the total number of work groups requested.
layout(binding = 0) coherent restrict readonly buffer screenTilesCountBufferSSBO
{
	int elements[];
} screenTilesCountBuffer;

// Output: elements[0] receives the group counts for the indirect dispatch.
layout(binding = 1) coherent restrict writeonly buffer dispatchBufferSSBO
{
	mg_gl_dispatch_indirect_command elements[];
} dispatchBuffer;

// Upper bound applied to the x dimension; also the divisor used to derive the y dimension.
layout(location = 0) uniform uint maxWorkGroupCount;

void main()
{
	uint requestedGroups = uint(screenTilesCountBuffer.elements[0]);

	// Number of rows of at most maxWorkGroupCount groups needed to cover the request (ceiling division).
	uint rowCount = (requestedGroups + maxWorkGroupCount - 1u) / maxWorkGroupCount;

	// x is clamped to the limit, so x * y can exceed the requested count when it does not
	// divide evenly: the dispatched shader has to bounds-check its linear group index.
	dispatchBuffer.elements[0].num_groups_x = min(requestedGroups, maxWorkGroupCount);
	dispatchBuffer.elements[0].num_groups_y = rowCount;
	dispatchBuffer.elements[0].num_groups_z = 1;
}

View File

@ -34,10 +34,10 @@ layout(binding = 5) restrict writeonly buffer screenTilesBufferSSBO
mg_gl_screen_tile elements[];
} screenTilesBuffer;
layout(binding = 6) coherent restrict buffer dispatchBufferSSBO
layout(binding = 6) coherent restrict buffer screenTilesCountBufferSSBO
{
mg_gl_dispatch_indirect_command elements[];
} dispatchBuffer;
int elements[];
} screenTilesCountBuffer;
layout(location = 0) uniform int tileSize;
@ -53,9 +53,6 @@ void main()
int lastOpIndex = -1;
dispatchBuffer.elements[0].num_groups_y = 1;
dispatchBuffer.elements[0].num_groups_z = 1;
for(int pathIndex = 0; pathIndex < pathCount; pathIndex++)
{
mg_gl_path_queue pathQueue = pathQueueBuffer.elements[pathIndex];
@ -75,7 +72,7 @@ void main()
{
if(tileIndex < 0)
{
tileIndex = int(atomicAdd(dispatchBuffer.elements[0].num_groups_x, 1));
tileIndex = int(atomicAdd(screenTilesCountBuffer.elements[0], 1));
screenTilesBuffer.elements[tileIndex].tileCoord = uvec2(tileCoord);
screenTilesBuffer.elements[tileIndex].first = -1;
}

View File

@ -24,17 +24,30 @@ layout(binding = 3) restrict readonly buffer screenTilesBufferSSBO
mg_gl_screen_tile elements[];
} screenTilesBuffer;
layout(binding = 4) restrict readonly buffer screenTilesCountBufferSSBO
{
int elements[];
} screenTilesCountBuffer;
layout(location = 0) uniform float scale;
layout(location = 1) uniform int msaaSampleCount;
layout(location = 2) uniform uint useTexture;
layout(location = 3) uniform int pathBufferStart;
layout(location = 4) uniform uint maxWorkGroupCount;
layout(rgba8, binding = 0) uniform restrict writeonly image2D outTexture;
layout(binding = 1) uniform sampler2D srcTexture;
void main()
{
uint tileIndex = gl_WorkGroupID.x;
uint tileIndex = gl_WorkGroupID.y * maxWorkGroupCount + gl_WorkGroupID.x;
if(tileIndex >= screenTilesCountBuffer.elements[0])
{
return;
}
uvec2 tileCoord = screenTilesBuffer.elements[tileIndex].tileCoord;
ivec2 pixelCoord = ivec2(tileCoord * gl_WorkGroupSize.x + gl_LocalInvocationID.xy);