[wip] trying to simplify metal shader

This commit is contained in:
Martin Fouilleul 2023-03-13 18:38:30 +01:00
parent cee294d8ad
commit fd5a4d4cd8
2 changed files with 225 additions and 158 deletions

View File

@ -45,7 +45,7 @@ typedef struct mg_mtl_canvas_backend
id<MTLBuffer> vertexBuffer; id<MTLBuffer> vertexBuffer;
id<MTLBuffer> indexBuffer; id<MTLBuffer> indexBuffer;
id<MTLBuffer> tileCounters; id<MTLBuffer> tileCounters;
id<MTLBuffer> tilesArray; id<MTLBuffer> tileArrayBuffer;
id<MTLBuffer> triangleArray; id<MTLBuffer> triangleArray;
id<MTLBuffer> boxArray; id<MTLBuffer> boxArray;
@ -163,6 +163,7 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
[blitEncoder fillBuffer: backend->tileCounters range: NSMakeRange(0, RENDERER_MAX_TILES*sizeof(uint)) value: 0]; [blitEncoder fillBuffer: backend->tileCounters range: NSMakeRange(0, RENDERER_MAX_TILES*sizeof(uint)) value: 0];
[blitEncoder endEncoding]; [blitEncoder endEncoding];
/*
//----------------------------------------------------------- //-----------------------------------------------------------
//NOTE(martin): encode the boxing pass //NOTE(martin): encode the boxing pass
//----------------------------------------------------------- //-----------------------------------------------------------
@ -183,6 +184,7 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
[boxEncoder dispatchThreads: boxGridSize threadsPerThreadgroup: boxGroupSize]; [boxEncoder dispatchThreads: boxGridSize threadsPerThreadgroup: boxGroupSize];
[boxEncoder endEncoding]; [boxEncoder endEncoding];
*/
//----------------------------------------------------------- //-----------------------------------------------------------
//NOTE(martin): encode the tiling pass //NOTE(martin): encode the tiling pass
@ -191,12 +193,19 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
id<MTLComputeCommandEncoder> tileEncoder = [surface->commandBuffer computeCommandEncoder]; id<MTLComputeCommandEncoder> tileEncoder = [surface->commandBuffer computeCommandEncoder];
tileEncoder.label = @"tiling pass"; tileEncoder.label = @"tiling pass";
[tileEncoder setComputePipelineState: backend->tilingPipeline]; [tileEncoder setComputePipelineState: backend->tilingPipeline];
[tileEncoder setBuffer: backend->boxArray offset:0 atIndex: 0]; [tileEncoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0];
[tileEncoder setBuffer: backend->tileCounters offset:0 atIndex: 1]; [tileEncoder setBuffer: backend->indexBuffer offset:backend->indexBufferOffset atIndex: 1];
[tileEncoder setBuffer: backend->tilesArray offset:0 atIndex: 2]; [tileEncoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 2];
[tileEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 3]; [tileEncoder setBuffer: backend->tileCounters offset:0 atIndex: 3];
[tileEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 4];
[tileEncoder dispatchThreads: boxGridSize threadsPerThreadgroup: boxGroupSize]; [tileEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 5];
[tileEncoder setBytes: &scale length: sizeof(float) atIndex: 6];
MTLSize tileGroupSize = MTLSizeMake(backend->tilingPipeline.maxTotalThreadsPerThreadgroup, 1, 1);
MTLSize tileGridSize = MTLSizeMake(indexCount/3, 1, 1);
[tileEncoder dispatchThreads: tileGridSize threadsPerThreadgroup: tileGroupSize];
[tileEncoder endEncoding]; [tileEncoder endEncoding];
//----------------------------------------------------------- //-----------------------------------------------------------
@ -206,15 +215,16 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
id<MTLComputeCommandEncoder> sortEncoder = [surface->commandBuffer computeCommandEncoder]; id<MTLComputeCommandEncoder> sortEncoder = [surface->commandBuffer computeCommandEncoder];
sortEncoder.label = @"sorting pass"; sortEncoder.label = @"sorting pass";
[sortEncoder setComputePipelineState: backend->sortingPipeline]; [sortEncoder setComputePipelineState: backend->sortingPipeline];
[sortEncoder setBuffer: backend->tileCounters offset:0 atIndex: 0]; [sortEncoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0];
[sortEncoder setBuffer: backend->triangleArray offset:0 atIndex: 1]; [sortEncoder setBuffer: backend->indexBuffer offset:backend->indexBufferOffset atIndex: 1];
[sortEncoder setBuffer: backend->tilesArray offset:0 atIndex: 2]; [sortEncoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 2];
[sortEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 3]; [sortEncoder setBuffer: backend->tileCounters offset:0 atIndex: 3];
[sortEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 4];
u32 nTilesX = (viewportSize.x + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE; u32 nTilesX = (viewportSize.x + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE;
u32 nTilesY = (viewportSize.y + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE; u32 nTilesY = (viewportSize.y + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE;
MTLSize sortGroupSize = MTLSizeMake(backend->boxingPipeline.maxTotalThreadsPerThreadgroup, 1, 1); MTLSize sortGroupSize = MTLSizeMake(backend->sortingPipeline.maxTotalThreadsPerThreadgroup, 1, 1);
MTLSize sortGridSize = MTLSizeMake(nTilesX*nTilesY, 1, 1); MTLSize sortGridSize = MTLSizeMake(nTilesX*nTilesY, 1, 1);
[sortEncoder dispatchThreads: sortGridSize threadsPerThreadgroup: sortGroupSize]; [sortEncoder dispatchThreads: sortGridSize threadsPerThreadgroup: sortGroupSize];
@ -226,35 +236,35 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
//TODO: remove that //TODO: remove that
vector_float4 clearColorVec4 = {backend->clearColor.r, backend->clearColor.g, backend->clearColor.b, backend->clearColor.a}; vector_float4 clearColorVec4 = {backend->clearColor.r, backend->clearColor.g, backend->clearColor.b, backend->clearColor.a};
id<MTLComputeCommandEncoder> encoder = [surface->commandBuffer computeCommandEncoder]; id<MTLComputeCommandEncoder> drawEncoder = [surface->commandBuffer computeCommandEncoder];
encoder.label = @"drawing pass"; drawEncoder.label = @"drawing pass";
[encoder setComputePipelineState:backend->computePipeline]; [drawEncoder setComputePipelineState:backend->computePipeline];
[encoder setTexture: backend->outTexture atIndex: 0]; [drawEncoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0];
[drawEncoder setBuffer: backend->indexBuffer offset:backend->indexBufferOffset atIndex: 1];
[drawEncoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 2];
[drawEncoder setBuffer: backend->tileCounters offset:0 atIndex: 3];
[drawEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 4];
[drawEncoder setTexture: backend->outTexture atIndex: 0];
int useTexture = 0; int useTexture = 0;
if(image) if(image)
{ {
mg_mtl_image_data* mtlImage = (mg_mtl_image_data*)image; mg_mtl_image_data* mtlImage = (mg_mtl_image_data*)image;
[encoder setTexture: mtlImage->texture atIndex: 1]; [drawEncoder setTexture: mtlImage->texture atIndex: 1];
useTexture = 1; useTexture = 1;
} }
[encoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0]; [drawEncoder setBytes: &clearColorVec4 length: sizeof(vector_float4) atIndex: 5];
[encoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 1]; [drawEncoder setBytes: &useTexture length:sizeof(int) atIndex:6];
[encoder setBuffer: backend->tileCounters offset:0 atIndex: 2]; [drawEncoder setBytes: &scale length: sizeof(float) atIndex: 7];
[encoder setBuffer: backend->tilesArray offset:0 atIndex: 3];
[encoder setBuffer: backend->triangleArray offset:0 atIndex: 4];
[encoder setBuffer: backend->boxArray offset:0 atIndex: 5];
[encoder setBytes: &clearColorVec4 length: sizeof(vector_float4) atIndex: 6];
[encoder setBytes: &useTexture length:sizeof(int) atIndex:7];
[encoder setBytes: &scale length: sizeof(float) atIndex: 8];
//TODO: check that we don't exceed maxTotalThreadsPerThreadgroup //TODO: check that we don't exceed maxTotalThreadsPerThreadgroup
DEBUG_ASSERT(RENDERER_TILE_SIZE*RENDERER_TILE_SIZE <= backend->computePipeline.maxTotalThreadsPerThreadgroup); DEBUG_ASSERT(RENDERER_TILE_SIZE*RENDERER_TILE_SIZE <= backend->computePipeline.maxTotalThreadsPerThreadgroup);
MTLSize threadGridSize = MTLSizeMake(viewportSize.x, viewportSize.y, 1); MTLSize threadGridSize = MTLSizeMake(viewportSize.x, viewportSize.y, 1);
MTLSize threadGroupSize = MTLSizeMake(RENDERER_TILE_SIZE, RENDERER_TILE_SIZE, 1); MTLSize threadGroupSize = MTLSizeMake(RENDERER_TILE_SIZE, RENDERER_TILE_SIZE, 1);
[encoder dispatchThreads: threadGridSize threadsPerThreadgroup:threadGroupSize]; [drawEncoder dispatchThreads: threadGridSize threadsPerThreadgroup:threadGroupSize];
[encoder endEncoding]; [drawEncoder endEncoding];
//----------------------------------------------------------- //-----------------------------------------------------------
//NOTE(martin): blit texture to framebuffer //NOTE(martin): blit texture to framebuffer
@ -331,7 +341,7 @@ void mg_mtl_canvas_destroy(mg_canvas_backend* interface)
[backend->outTexture release]; [backend->outTexture release];
[backend->vertexBuffer release]; [backend->vertexBuffer release];
[backend->indexBuffer release]; [backend->indexBuffer release];
[backend->tilesArray release]; [backend->tileArrayBuffer release];
[backend->triangleArray release]; [backend->triangleArray release];
[backend->boxArray release]; [backend->boxArray release];
[backend->computePipeline release]; [backend->computePipeline release];
@ -459,7 +469,7 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
backend->shapeBuffer = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_shape) backend->shapeBuffer = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_shape)
options: bufferOptions]; options: bufferOptions];
backend->tilesArray = [metalSurface->device newBufferWithLength: RENDERER_TILE_BUFFER_SIZE*sizeof(int)*RENDERER_MAX_TILES backend->tileArrayBuffer = [metalSurface->device newBufferWithLength: RENDERER_TILE_BUFFER_SIZE*sizeof(int)*RENDERER_MAX_TILES
options: MTLResourceStorageModePrivate]; options: MTLResourceStorageModePrivate];
backend->triangleArray = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_triangle_data) backend->triangleArray = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_triangle_data)
@ -522,6 +532,7 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
reflection: nil reflection: nil
error: &error]; error: &error];
/*
MTLComputePipelineDescriptor* boxingPipelineDesc = [[MTLComputePipelineDescriptor alloc] init]; MTLComputePipelineDescriptor* boxingPipelineDesc = [[MTLComputePipelineDescriptor alloc] init];
boxingPipelineDesc.computeFunction = boxingFunction; boxingPipelineDesc.computeFunction = boxingFunction;
// boxingPipelineDesc.threadGroupSizeIsMultipleOfThreadExecutionWidth = true; // boxingPipelineDesc.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;
@ -530,6 +541,7 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
options: MTLPipelineOptionNone options: MTLPipelineOptionNone
reflection: nil reflection: nil
error: &error]; error: &error];
*/
//----------------------------------------------------------- //-----------------------------------------------------------
//NOTE(martin): setup our render pipeline state //NOTE(martin): setup our render pipeline state
//----------------------------------------------------------- //-----------------------------------------------------------

View File

@ -31,7 +31,7 @@ bool is_top_left(float2 a, float2 b)
return( (a.y == b.y && b.x < a.x) return( (a.y == b.y && b.x < a.x)
||(b.y < a.y)); ||(b.y < a.y));
} }
/*
kernel void BoundingBoxKernel(constant mg_vertex* vertexBuffer [[buffer(0)]], kernel void BoundingBoxKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
constant uint* indexBuffer [[buffer(1)]], constant uint* indexBuffer [[buffer(1)]],
constant mg_shape* shapeBuffer [[buffer(2)]], constant mg_shape* shapeBuffer [[buffer(2)]],
@ -94,19 +94,43 @@ kernel void BoundingBoxKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
triangleArray[triangleIndex].bias1 = bias1; triangleArray[triangleIndex].bias1 = bias1;
triangleArray[triangleIndex].bias2 = bias2; triangleArray[triangleIndex].bias2 = bias2;
} }
*/
kernel void TileKernel(const device float4* boxArray [[buffer(0)]], kernel void TileKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
device volatile atomic_uint* tileCounters [[buffer(1)]], constant uint* indexBuffer [[buffer(1)]],
device uint* tilesArray [[buffer(2)]], constant mg_shape* shapeBuffer [[buffer(2)]],
constant vector_uint2* viewport [[buffer(3)]], device volatile atomic_uint* tileCounters [[buffer(3)]],
device uint* tileArrayBuffer [[buffer(4)]],
constant uint2* viewport [[buffer(5)]],
constant float* scaling [[buffer(6)]],
uint gid [[thread_position_in_grid]]) uint gid [[thread_position_in_grid]])
{ {
uint2 tilesMatrixDim = (*viewport - 1) / RENDERER_TILE_SIZE + 1; uint2 tilesMatrixDim = (*viewport - 1) / RENDERER_TILE_SIZE + 1;
int nTilesX = tilesMatrixDim.x; int nTilesX = tilesMatrixDim.x;
int nTilesY = tilesMatrixDim.y; int nTilesY = tilesMatrixDim.y;
uint triangleIndex = gid; uint triangleIndex = gid * 3;
int4 box = int4(floor(boxArray[triangleIndex]))/RENDERER_TILE_SIZE;
uint i0 = indexBuffer[triangleIndex];
uint i1 = indexBuffer[triangleIndex+1u];
uint i2 = indexBuffer[triangleIndex+2u];
float2 p0 = vertexBuffer[i0].pos * scaling[0];
float2 p1 = vertexBuffer[i1].pos * scaling[0];
float2 p2 = vertexBuffer[i2].pos * scaling[0];
int shapeIndex = vertexBuffer[i0].shapeIndex;
float4 clip = shapeBuffer[shapeIndex].clip * scaling[0];
float4 fbox = float4(max(min(min(p0.x, p1.x), p2.x), clip.x),
max(min(min(p0.y, p1.y), p2.y), clip.y),
min(max(max(p0.x, p1.x), p2.x), clip.z),
min(max(max(p0.y, p1.y), p2.y), clip.w));
int4 box = int4(floor(fbox))/int(RENDERER_TILE_SIZE);
//NOTE(martin): it's importat to do the computation with signed int, so that we can have negative xMax/yMax
// otherwise all triangles on the left or below the x/y axis are attributed to tiles on row/column 0.
int xMin = max(0, box.x); int xMin = max(0, box.x);
int yMin = max(0, box.y); int yMin = max(0, box.y);
int xMax = min(box.z, nTilesX-1); int xMax = min(box.z, nTilesX-1);
@ -120,105 +144,125 @@ kernel void TileKernel(const device float4* boxArray [[buffer(0)]],
uint counter = atomic_fetch_add_explicit(&(tileCounters[tileIndex]), 1, memory_order_relaxed); uint counter = atomic_fetch_add_explicit(&(tileCounters[tileIndex]), 1, memory_order_relaxed);
if(counter < RENDERER_TILE_BUFFER_SIZE) if(counter < RENDERER_TILE_BUFFER_SIZE)
{ {
tilesArray[tileIndex*RENDERER_TILE_BUFFER_SIZE + counter] = triangleIndex; tileArrayBuffer[tileIndex*RENDERER_TILE_BUFFER_SIZE + counter] = triangleIndex;
} }
} }
} }
} }
kernel void SortKernel(const device uint* tileCounters [[buffer(0)]], kernel void SortKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
const device mg_triangle_data* triangleArray [[buffer(1)]], constant uint* indexBuffer [[buffer(1)]],
device uint* tilesArray [[buffer(2)]], constant mg_shape* shapeBuffer [[buffer(2)]],
constant vector_uint2* viewport [[buffer(3)]], const device uint* tileCounters [[buffer(3)]],
device uint* tileArrayBuffer [[buffer(4)]],
uint gid [[thread_position_in_grid]]) uint gid [[thread_position_in_grid]])
{ {
uint tileIndex = gid; uint tileIndex = gid;
device uint* tileBuffer = tilesArray + tileIndex*RENDERER_TILE_BUFFER_SIZE; uint tileArrayOffset = tileIndex * RENDERER_TILE_BUFFER_SIZE;
uint tileBufferSize = tileCounters[tileIndex]; uint tileArrayCount = min(tileCounters[tileIndex], (uint)RENDERER_TILE_BUFFER_SIZE);
for(int eltIndex=0; eltIndex < (int)tileBufferSize; eltIndex++) for(uint tileArrayIndex=1; tileArrayIndex < tileArrayCount; tileArrayIndex++)
{ {
uint elt = tileBuffer[eltIndex]; for(uint sortIndex = tileArrayIndex; sortIndex > 0u; sortIndex--)
uint eltZIndex = triangleArray[elt].shapeIndex; {
uint triangleIndex = indexBuffer[tileArrayBuffer[tileArrayOffset + sortIndex]];
uint prevTriangleIndex = indexBuffer[tileArrayBuffer[tileArrayOffset + sortIndex - 1]];
int backIndex = eltIndex-1; int shapeIndex = vertexBuffer[triangleIndex].shapeIndex;
for(; backIndex >= 0; backIndex--) int prevShapeIndex = vertexBuffer[prevTriangleIndex].shapeIndex;
{
uint backElt = tileBuffer[backIndex]; if(shapeIndex >= prevShapeIndex)
uint backEltZIndex = triangleArray[backElt].shapeIndex;
if(eltZIndex >= backEltZIndex)
{ {
break; break;
} }
else uint tmp = tileArrayBuffer[tileArrayOffset + sortIndex];
{ tileArrayBuffer[tileArrayOffset + sortIndex] = tileArrayBuffer[tileArrayOffset + sortIndex - 1];
tileBuffer[backIndex+1] = backElt; tileArrayBuffer[tileArrayOffset + sortIndex - 1] = tmp;
} }
} }
tileBuffer[backIndex+1] = elt;
}
} }
bool is_top_left(int2 a, int2 b)
{
return( (a.y == b.y && b.x < a.x)
||(b.y < a.y));
}
//////////////////////////////////////////////////////////////////////////////
//TODO: we should do these computations on 64bits, because otherwise
// we might overflow for values > 2048.
// Unfortunately this is costly.
// Another way is to precompute triangle edges (b - a) in full precision
// once to avoid doing it all the time...
//////////////////////////////////////////////////////////////////////////////
//TODO: coalesce
int orient2d(int2 a, int2 b, int2 c) int orient2d(int2 a, int2 b, int2 c)
{ {
return((b.x-a.x)*(c.y-a.y) - (b.y-a.y)*(c.x-a.x)); return((b.x-a.x)*(c.y-a.y) - (b.y-a.y)*(c.x-a.x));
} }
kernel void RenderKernel(texture2d<float, access::write> outTexture [[texture(0)]], int is_clockwise(int2 p0, int2 p1, int2 p2)
{
return((p1 - p0).x*(p2 - p0).y - (p1 - p0).y*(p2 - p0).x);
}
kernel void RenderKernel(const device mg_vertex* vertexBuffer [[buffer(0)]],
const device uint* indexBuffer [[buffer(1)]],
const device mg_shape* shapeBuffer [[buffer(2)]],
const device uint* tileCounters [[buffer(3)]],
const device uint* tileArrayBuffer [[buffer(4)]],
constant float4* clearColor [[buffer(5)]],
constant int* useTexture [[buffer(6)]],
constant float* scaling [[buffer(7)]],
texture2d<float, access::write> outTexture [[texture(0)]],
texture2d<float> texAtlas [[texture(1)]], texture2d<float> texAtlas [[texture(1)]],
const device mg_vertex* vertexBuffer [[buffer(0)]],
const device mg_shape* shapeBuffer [[buffer(1)]],
device uint* tileCounters [[buffer(2)]],
const device uint* tilesArray [[buffer(3)]],
const device mg_triangle_data* triangleArray [[buffer(4)]],
const device float4* boxArray [[buffer(5)]],
constant vector_float4* clearColor [[buffer(6)]],
constant int* useTexture [[buffer(7)]],
constant float* contentsScaling [[buffer(8)]],
uint2 gid [[thread_position_in_grid]], uint2 gid [[thread_position_in_grid]],
uint2 tgid [[threadgroup_position_in_grid]], uint2 tgid [[threadgroup_position_in_grid]],
uint2 threadsPerThreadgroup [[threads_per_threadgroup]], uint2 threadsPerThreadgroup [[threads_per_threadgroup]],
uint2 gridSize [[threads_per_grid]]) uint2 gridSize [[threads_per_grid]])
{ {
//TODO: guard against thread group size not equal to tile size? //TODO: guard against thread group size not equal to tile size?
const int2 pixelCoord = int2(gid);
const uint2 tileCoord = uint2(pixelCoord)/ RENDERER_TILE_SIZE;
const uint2 tilesMatrixDim = (gridSize - 1) / RENDERER_TILE_SIZE + 1; const uint2 tilesMatrixDim = (gridSize - 1) / RENDERER_TILE_SIZE + 1;
const uint2 tilePos = gid/RENDERER_TILE_SIZE; const uint tileIndex = tileCoord.y * tilesMatrixDim.x + tileCoord.x;
const uint tileIndex = tilePos.y * tilesMatrixDim.x + tilePos.x; const uint tileCounter = min(tileCounters[tileIndex], (uint)RENDERER_TILE_BUFFER_SIZE);
const device uint* tileBuffer = tilesArray + tileIndex * RENDERER_TILE_BUFFER_SIZE;
const uint tileBufferSize = tileCounters[tileIndex];
#ifdef RENDERER_DEBUG_TILES #ifdef RENDERER_DEBUG_TILES
//NOTE(martin): color code debug values and show the tile grid //NOTE(martin): color code debug values and show the tile grid
uint nTileX = tilesMatrixDim.x; {
uint nTileY = tilesMatrixDim.y; float4 fragColor = float4(0);
if(tilePos.x > nTileX || tilePos.y > nTileY) if( pixelCoord.x % 16 == 0
||pixelCoord.y % 16 == 0)
{ {
outTexture.write(float4(0, 1, 1, 1), gid); fragColor = float4(0, 0, 0, 1);
return;
} }
else if(tileCounters[tileIndex] == 0xffffu)
if((gid.x % RENDERER_TILE_SIZE == 0) || (gid.y % RENDERER_TILE_SIZE == 0))
{ {
outTexture.write(float4(0, 0, 0, 1), gid); fragColor = float4(1, 0, 1, 1);
return;
} }
if(tileBufferSize <= 0) else if(tileCounter != 0u)
{ {
outTexture.write(float4(0, 1, 0, 1), gid); fragColor = float4(0, 1, 0, 1);
return;
} }
else else
{ {
outTexture.write(float4(1, 0, 0, 1), gid); fragColor = float4(1, 0, 0, 1);
}
outTexture.write(fragColor, gid);
return; return;
} }
#endif #endif
int subPixelFactor = 16; const int subPixelFactor = 16;
int2 pixelCoord = int2(gid); const int2 centerPoint = int2((float2(pixelCoord) + float2(0.5, 0.5)) * subPixelFactor);
int2 centerPoint = int2((float2(pixelCoord) + float2(0.5, 0.5)) * subPixelFactor);
const int sampleCount = 8; const int sampleCount = 8;
int2 samplePoints[sampleCount] = {centerPoint + int2(1, 3), int2 samplePoints[sampleCount] = {centerPoint + int2(1, 3),
@ -229,59 +273,75 @@ kernel void RenderKernel(texture2d<float, access::write> outTexture [[texture(0)
centerPoint + int2(-7, 1), centerPoint + int2(-7, 1),
centerPoint + int2(3, -7), centerPoint + int2(3, -7),
centerPoint + int2(7, 7)}; centerPoint + int2(7, 7)};
int zIndices[sampleCount];
uint flipCounts[sampleCount]; float4 sampleColor[sampleCount];
float4 pixelColors[sampleCount]; float4 currentColor[sampleCount];
float4 nextColors[sampleCount]; int currentShapeIndex[sampleCount];
int flipCount[sampleCount];
for(int i=0; i<sampleCount; i++) for(int i=0; i<sampleCount; i++)
{ {
zIndices[i] = -1; currentShapeIndex[i] = -1;
flipCounts[i] = 0; flipCount[i] = 0;
pixelColors[i] = float4(0, 0, 0, 0); sampleColor[i] = float4(0, 0, 0, 0);
nextColors[i] = float4(0, 0, 0, 0); currentColor[i] = float4(0, 0, 0, 0);
} }
for(uint tileBufferIndex=0; tileBufferIndex < tileBufferSize; tileBufferIndex++) for(uint tileArrayIndex=0; tileArrayIndex < tileCounter; tileArrayIndex++)
{ {
const device mg_triangle_data* triangle = &triangleArray[tileBuffer[tileBufferIndex]]; int triangleIndex = tileArrayBuffer[RENDERER_TILE_BUFFER_SIZE * tileIndex + tileArrayIndex];
int2 p0 = int2(triangle->p0 * subPixelFactor); uint i0 = indexBuffer[triangleIndex];
int2 p1 = int2(triangle->p1 * subPixelFactor); uint i1 = indexBuffer[triangleIndex+1];
int2 p2 = int2(triangle->p2 * subPixelFactor); uint i2 = indexBuffer[triangleIndex+2];
int bias0 = triangle->bias0; int2 p0 = int2((vertexBuffer[i0].pos * scaling[0]) * subPixelFactor);
int bias1 = triangle->bias1; int2 p1 = int2((vertexBuffer[i1].pos * scaling[0]) * subPixelFactor);
int bias2 = triangle->bias2; int2 p2 = int2((vertexBuffer[i2].pos * scaling[0]) * subPixelFactor);
const device mg_vertex* v0 = &(vertexBuffer[triangle->i0]); int shapeIndex = vertexBuffer[i0].shapeIndex;
const device mg_vertex* v1 = &(vertexBuffer[triangle->i1]);
const device mg_vertex* v2 = &(vertexBuffer[triangle->i2]);
float4 cubic0 = v0->cubic;
float4 cubic1 = v1->cubic;
float4 cubic2 = v2->cubic;
int shapeIndex = v0->shapeIndex;
float4 color = shapeBuffer[shapeIndex].color; float4 color = shapeBuffer[shapeIndex].color;
color.rgb *= color.a; color.rgb *= color.a;
int4 clip = int4(round((shapeBuffer[shapeIndex].clip * scaling[0] + float4(0.5, 0.5, 0.5, 0.5)) * subPixelFactor));
const device float* uvTransform2x3 = shapeBuffer[shapeIndex].uvTransform; const device float* uvTransform2x3 = shapeBuffer[shapeIndex].uvTransform;
matrix_float3x3 uvTransform = {{uvTransform2x3[0], uvTransform2x3[3], 0}, matrix_float3x3 uvTransform = {{uvTransform2x3[0], uvTransform2x3[3], 0},
{uvTransform2x3[1], uvTransform2x3[4], 0}, {uvTransform2x3[1], uvTransform2x3[4], 0},
{uvTransform2x3[2], uvTransform2x3[5], 1}}; {uvTransform2x3[2], uvTransform2x3[5], 1}};
//NOTE(martin): reorder triangle counter-clockwise and compute bias for each edge
int cw = is_clockwise(p0, p1, p2);
if(cw < 0)
{
uint tmpIndex = i1;
i1 = i2;
i2 = tmpIndex;
int2 tmpPoint = p1;
p1 = p2;
p2 = tmpPoint;
}
float4 cubic0 = vertexBuffer[i0].cubic;
float4 cubic1 = vertexBuffer[i1].cubic;
float4 cubic2 = vertexBuffer[i2].cubic;
int bias0 = is_top_left(p1, p2) ? 0 : -1;
int bias1 = is_top_left(p2, p0) ? 0 : -1;
int bias2 = is_top_left(p0, p1) ? 0 : -1;
for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++) for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
{ {
int2 samplePoint = samplePoints[sampleIndex]; int2 samplePoint = samplePoints[sampleIndex];
//NOTE(martin): cull if pixel is outside box if( samplePoint.x < clip.x
/* || samplePoint.x > clip.z
// if we use this, make sure box is in fixed points coords || samplePoint.y < clip.y
if(samplePoint.x < box.x || samplePoint.x > box.z || samplePoint.y < box.y || samplePoint.y > box.w) || samplePoint.y > clip.w)
{ {
continue; continue;
} }
*/
int w0 = orient2d(p1, p2, samplePoint); int w0 = orient2d(p1, p2, samplePoint);
int w1 = orient2d(p2, p0, samplePoint); int w1 = orient2d(p2, p0, samplePoint);
@ -291,57 +351,52 @@ kernel void RenderKernel(texture2d<float, access::write> outTexture [[texture(0)
{ {
float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2); float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2);
//TODO(martin): this is a quick and dirty fix for solid polygons where we use
// cubic = (1, 1, 1, 1) on all vertices, which can cause small errors to
// flip the sign.
// We should really use another value that always lead to <= 0, but we must
// make sure we never share these vertices with bezier shapes.
// Alternatively, an ugly (but maybe less than this one) solution would be
// to check if uvs are equal on all vertices of the triangle and always render
// those triangles.
float eps = 0.0001; float eps = 0.0001;
if(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= eps) if(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= eps)
{ {
if(shapeIndex == zIndices[sampleIndex]) if(shapeIndex == currentShapeIndex[sampleIndex])
{ {
flipCounts[sampleIndex]++; flipCount[sampleIndex]++;
} }
else else
{ {
if(flipCounts[sampleIndex] & 0x01) if(flipCount[sampleIndex] & 0x01)
{ {
pixelColors[sampleIndex] = nextColors[sampleIndex]; sampleColor[sampleIndex] = currentColor[sampleIndex];
} }
float4 nextColor = color; float4 nextColor = color;
if(*useTexture)
if(useTexture[0])
{ {
float2 sampleFP = float2(samplePoint)/subPixelFactor; float3 sampleFP = float3(float2(samplePoint).xy/(subPixelFactor*2.), 1);
float2 uv = (uvTransform*(float3(sampleFP/contentsScaling[0], 1))).xy; float2 uv = (uvTransform * sampleFP).xy;
constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear); constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear);
float4 texColor = texAtlas.sample(smp, uv); float4 texColor = texAtlas.sample(smp, uv);
texColor.rgb *= texColor.a; texColor.rgb *= texColor.a;
nextColor *= texColor; nextColor *= texColor;
} }
nextColors[sampleIndex] = pixelColors[sampleIndex]*(1-nextColor.a) + nextColor; currentColor[sampleIndex] = sampleColor[sampleIndex]*(1.-nextColor.a) + nextColor;
zIndices[sampleIndex] = shapeIndex; currentShapeIndex[sampleIndex] = shapeIndex;
flipCounts[sampleIndex] = 1; flipCount[sampleIndex] = 1;
} }
} }
} }
} }
} }
float4 out = float4(0, 0, 0, 0);
for(int i=0; i<sampleCount; i++) float4 pixelColor = float4(0);
for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
{ {
if(flipCounts[i] & 0x01) if(flipCount[sampleIndex] & 0x01)
{ {
pixelColors[i] = nextColors[i]; sampleColor[sampleIndex] = currentColor[sampleIndex];
} }
out += pixelColors[i]; pixelColor += sampleColor[sampleIndex];
} }
out = out/sampleCount;
outTexture.write(out, gid); outTexture.write(pixelColor/float(sampleCount), gid);
} }