[wip] trying to simplify metal shader

This commit is contained in:
Martin Fouilleul 2023-03-13 18:38:30 +01:00
parent cee294d8ad
commit fd5a4d4cd8
2 changed files with 225 additions and 158 deletions

View File

@ -45,7 +45,7 @@ typedef struct mg_mtl_canvas_backend
id<MTLBuffer> vertexBuffer;
id<MTLBuffer> indexBuffer;
id<MTLBuffer> tileCounters;
id<MTLBuffer> tilesArray;
id<MTLBuffer> tileArrayBuffer;
id<MTLBuffer> triangleArray;
id<MTLBuffer> boxArray;
@ -163,6 +163,7 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
[blitEncoder fillBuffer: backend->tileCounters range: NSMakeRange(0, RENDERER_MAX_TILES*sizeof(uint)) value: 0];
[blitEncoder endEncoding];
/*
//-----------------------------------------------------------
//NOTE(martin): encode the boxing pass
//-----------------------------------------------------------
@ -183,6 +184,7 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
[boxEncoder dispatchThreads: boxGridSize threadsPerThreadgroup: boxGroupSize];
[boxEncoder endEncoding];
*/
//-----------------------------------------------------------
//NOTE(martin): encode the tiling pass
@ -191,12 +193,19 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
id<MTLComputeCommandEncoder> tileEncoder = [surface->commandBuffer computeCommandEncoder];
tileEncoder.label = @"tiling pass";
[tileEncoder setComputePipelineState: backend->tilingPipeline];
[tileEncoder setBuffer: backend->boxArray offset:0 atIndex: 0];
[tileEncoder setBuffer: backend->tileCounters offset:0 atIndex: 1];
[tileEncoder setBuffer: backend->tilesArray offset:0 atIndex: 2];
[tileEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 3];
[tileEncoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0];
[tileEncoder setBuffer: backend->indexBuffer offset:backend->indexBufferOffset atIndex: 1];
[tileEncoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 2];
[tileEncoder setBuffer: backend->tileCounters offset:0 atIndex: 3];
[tileEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 4];
[tileEncoder dispatchThreads: boxGridSize threadsPerThreadgroup: boxGroupSize];
[tileEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 5];
[tileEncoder setBytes: &scale length: sizeof(float) atIndex: 6];
MTLSize tileGroupSize = MTLSizeMake(backend->tilingPipeline.maxTotalThreadsPerThreadgroup, 1, 1);
MTLSize tileGridSize = MTLSizeMake(indexCount/3, 1, 1);
[tileEncoder dispatchThreads: tileGridSize threadsPerThreadgroup: tileGroupSize];
[tileEncoder endEncoding];
//-----------------------------------------------------------
@ -206,15 +215,16 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
id<MTLComputeCommandEncoder> sortEncoder = [surface->commandBuffer computeCommandEncoder];
sortEncoder.label = @"sorting pass";
[sortEncoder setComputePipelineState: backend->sortingPipeline];
[sortEncoder setBuffer: backend->tileCounters offset:0 atIndex: 0];
[sortEncoder setBuffer: backend->triangleArray offset:0 atIndex: 1];
[sortEncoder setBuffer: backend->tilesArray offset:0 atIndex: 2];
[sortEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 3];
[sortEncoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0];
[sortEncoder setBuffer: backend->indexBuffer offset:backend->indexBufferOffset atIndex: 1];
[sortEncoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 2];
[sortEncoder setBuffer: backend->tileCounters offset:0 atIndex: 3];
[sortEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 4];
u32 nTilesX = (viewportSize.x + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE;
u32 nTilesY = (viewportSize.y + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE;
MTLSize sortGroupSize = MTLSizeMake(backend->boxingPipeline.maxTotalThreadsPerThreadgroup, 1, 1);
MTLSize sortGroupSize = MTLSizeMake(backend->sortingPipeline.maxTotalThreadsPerThreadgroup, 1, 1);
MTLSize sortGridSize = MTLSizeMake(nTilesX*nTilesY, 1, 1);
[sortEncoder dispatchThreads: sortGridSize threadsPerThreadgroup: sortGroupSize];
@ -226,35 +236,35 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image
//TODO: remove that
vector_float4 clearColorVec4 = {backend->clearColor.r, backend->clearColor.g, backend->clearColor.b, backend->clearColor.a};
id<MTLComputeCommandEncoder> encoder = [surface->commandBuffer computeCommandEncoder];
encoder.label = @"drawing pass";
[encoder setComputePipelineState:backend->computePipeline];
[encoder setTexture: backend->outTexture atIndex: 0];
id<MTLComputeCommandEncoder> drawEncoder = [surface->commandBuffer computeCommandEncoder];
drawEncoder.label = @"drawing pass";
[drawEncoder setComputePipelineState:backend->computePipeline];
[drawEncoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0];
[drawEncoder setBuffer: backend->indexBuffer offset:backend->indexBufferOffset atIndex: 1];
[drawEncoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 2];
[drawEncoder setBuffer: backend->tileCounters offset:0 atIndex: 3];
[drawEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 4];
[drawEncoder setTexture: backend->outTexture atIndex: 0];
int useTexture = 0;
if(image)
{
mg_mtl_image_data* mtlImage = (mg_mtl_image_data*)image;
[encoder setTexture: mtlImage->texture atIndex: 1];
[drawEncoder setTexture: mtlImage->texture atIndex: 1];
useTexture = 1;
}
[encoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0];
[encoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 1];
[encoder setBuffer: backend->tileCounters offset:0 atIndex: 2];
[encoder setBuffer: backend->tilesArray offset:0 atIndex: 3];
[encoder setBuffer: backend->triangleArray offset:0 atIndex: 4];
[encoder setBuffer: backend->boxArray offset:0 atIndex: 5];
[encoder setBytes: &clearColorVec4 length: sizeof(vector_float4) atIndex: 6];
[encoder setBytes: &useTexture length:sizeof(int) atIndex:7];
[encoder setBytes: &scale length: sizeof(float) atIndex: 8];
[drawEncoder setBytes: &clearColorVec4 length: sizeof(vector_float4) atIndex: 5];
[drawEncoder setBytes: &useTexture length:sizeof(int) atIndex:6];
[drawEncoder setBytes: &scale length: sizeof(float) atIndex: 7];
//TODO: check that we don't exceed maxTotalThreadsPerThreadgroup
DEBUG_ASSERT(RENDERER_TILE_SIZE*RENDERER_TILE_SIZE <= backend->computePipeline.maxTotalThreadsPerThreadgroup);
MTLSize threadGridSize = MTLSizeMake(viewportSize.x, viewportSize.y, 1);
MTLSize threadGroupSize = MTLSizeMake(RENDERER_TILE_SIZE, RENDERER_TILE_SIZE, 1);
[encoder dispatchThreads: threadGridSize threadsPerThreadgroup:threadGroupSize];
[encoder endEncoding];
[drawEncoder dispatchThreads: threadGridSize threadsPerThreadgroup:threadGroupSize];
[drawEncoder endEncoding];
//-----------------------------------------------------------
//NOTE(martin): blit texture to framebuffer
@ -331,7 +341,7 @@ void mg_mtl_canvas_destroy(mg_canvas_backend* interface)
[backend->outTexture release];
[backend->vertexBuffer release];
[backend->indexBuffer release];
[backend->tilesArray release];
[backend->tileArrayBuffer release];
[backend->triangleArray release];
[backend->boxArray release];
[backend->computePipeline release];
@ -459,7 +469,7 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
backend->shapeBuffer = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_shape)
options: bufferOptions];
backend->tilesArray = [metalSurface->device newBufferWithLength: RENDERER_TILE_BUFFER_SIZE*sizeof(int)*RENDERER_MAX_TILES
backend->tileArrayBuffer = [metalSurface->device newBufferWithLength: RENDERER_TILE_BUFFER_SIZE*sizeof(int)*RENDERER_MAX_TILES
options: MTLResourceStorageModePrivate];
backend->triangleArray = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_triangle_data)
@ -522,6 +532,7 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
reflection: nil
error: &error];
/*
MTLComputePipelineDescriptor* boxingPipelineDesc = [[MTLComputePipelineDescriptor alloc] init];
boxingPipelineDesc.computeFunction = boxingFunction;
// boxingPipelineDesc.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;
@ -530,6 +541,7 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface)
options: MTLPipelineOptionNone
reflection: nil
error: &error];
*/
//-----------------------------------------------------------
//NOTE(martin): setup our render pipeline state
//-----------------------------------------------------------

View File

@ -31,7 +31,7 @@ bool is_top_left(float2 a, float2 b)
return( (a.y == b.y && b.x < a.x)
||(b.y < a.y));
}
/*
kernel void BoundingBoxKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
constant uint* indexBuffer [[buffer(1)]],
constant mg_shape* shapeBuffer [[buffer(2)]],
@ -94,19 +94,43 @@ kernel void BoundingBoxKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
triangleArray[triangleIndex].bias1 = bias1;
triangleArray[triangleIndex].bias2 = bias2;
}
*/
kernel void TileKernel(const device float4* boxArray [[buffer(0)]],
device volatile atomic_uint* tileCounters [[buffer(1)]],
device uint* tilesArray [[buffer(2)]],
constant vector_uint2* viewport [[buffer(3)]],
kernel void TileKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
constant uint* indexBuffer [[buffer(1)]],
constant mg_shape* shapeBuffer [[buffer(2)]],
device volatile atomic_uint* tileCounters [[buffer(3)]],
device uint* tileArrayBuffer [[buffer(4)]],
constant uint2* viewport [[buffer(5)]],
constant float* scaling [[buffer(6)]],
uint gid [[thread_position_in_grid]])
{
uint2 tilesMatrixDim = (*viewport - 1) / RENDERER_TILE_SIZE + 1;
int nTilesX = tilesMatrixDim.x;
int nTilesY = tilesMatrixDim.y;
uint triangleIndex = gid;
int4 box = int4(floor(boxArray[triangleIndex]))/RENDERER_TILE_SIZE;
uint triangleIndex = gid * 3;
uint i0 = indexBuffer[triangleIndex];
uint i1 = indexBuffer[triangleIndex+1u];
uint i2 = indexBuffer[triangleIndex+2u];
float2 p0 = vertexBuffer[i0].pos * scaling[0];
float2 p1 = vertexBuffer[i1].pos * scaling[0];
float2 p2 = vertexBuffer[i2].pos * scaling[0];
int shapeIndex = vertexBuffer[i0].shapeIndex;
float4 clip = shapeBuffer[shapeIndex].clip * scaling[0];
float4 fbox = float4(max(min(min(p0.x, p1.x), p2.x), clip.x),
max(min(min(p0.y, p1.y), p2.y), clip.y),
min(max(max(p0.x, p1.x), p2.x), clip.z),
min(max(max(p0.y, p1.y), p2.y), clip.w));
int4 box = int4(floor(fbox))/int(RENDERER_TILE_SIZE);
//NOTE(martin): it's importat to do the computation with signed int, so that we can have negative xMax/yMax
// otherwise all triangles on the left or below the x/y axis are attributed to tiles on row/column 0.
int xMin = max(0, box.x);
int yMin = max(0, box.y);
int xMax = min(box.z, nTilesX-1);
@ -120,105 +144,125 @@ kernel void TileKernel(const device float4* boxArray [[buffer(0)]],
uint counter = atomic_fetch_add_explicit(&(tileCounters[tileIndex]), 1, memory_order_relaxed);
if(counter < RENDERER_TILE_BUFFER_SIZE)
{
tilesArray[tileIndex*RENDERER_TILE_BUFFER_SIZE + counter] = triangleIndex;
tileArrayBuffer[tileIndex*RENDERER_TILE_BUFFER_SIZE + counter] = triangleIndex;
}
}
}
}
kernel void SortKernel(const device uint* tileCounters [[buffer(0)]],
const device mg_triangle_data* triangleArray [[buffer(1)]],
device uint* tilesArray [[buffer(2)]],
constant vector_uint2* viewport [[buffer(3)]],
kernel void SortKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
constant uint* indexBuffer [[buffer(1)]],
constant mg_shape* shapeBuffer [[buffer(2)]],
const device uint* tileCounters [[buffer(3)]],
device uint* tileArrayBuffer [[buffer(4)]],
uint gid [[thread_position_in_grid]])
{
uint tileIndex = gid;
device uint* tileBuffer = tilesArray + tileIndex*RENDERER_TILE_BUFFER_SIZE;
uint tileBufferSize = tileCounters[tileIndex];
uint tileArrayOffset = tileIndex * RENDERER_TILE_BUFFER_SIZE;
uint tileArrayCount = min(tileCounters[tileIndex], (uint)RENDERER_TILE_BUFFER_SIZE);
for(int eltIndex=0; eltIndex < (int)tileBufferSize; eltIndex++)
for(uint tileArrayIndex=1; tileArrayIndex < tileArrayCount; tileArrayIndex++)
{
uint elt = tileBuffer[eltIndex];
uint eltZIndex = triangleArray[elt].shapeIndex;
for(uint sortIndex = tileArrayIndex; sortIndex > 0u; sortIndex--)
{
uint triangleIndex = indexBuffer[tileArrayBuffer[tileArrayOffset + sortIndex]];
uint prevTriangleIndex = indexBuffer[tileArrayBuffer[tileArrayOffset + sortIndex - 1]];
int backIndex = eltIndex-1;
for(; backIndex >= 0; backIndex--)
{
uint backElt = tileBuffer[backIndex];
uint backEltZIndex = triangleArray[backElt].shapeIndex;
if(eltZIndex >= backEltZIndex)
int shapeIndex = vertexBuffer[triangleIndex].shapeIndex;
int prevShapeIndex = vertexBuffer[prevTriangleIndex].shapeIndex;
if(shapeIndex >= prevShapeIndex)
{
break;
}
else
{
tileBuffer[backIndex+1] = backElt;
uint tmp = tileArrayBuffer[tileArrayOffset + sortIndex];
tileArrayBuffer[tileArrayOffset + sortIndex] = tileArrayBuffer[tileArrayOffset + sortIndex - 1];
tileArrayBuffer[tileArrayOffset + sortIndex - 1] = tmp;
}
}
tileBuffer[backIndex+1] = elt;
}
}
bool is_top_left(int2 a, int2 b)
{
return( (a.y == b.y && b.x < a.x)
||(b.y < a.y));
}
//////////////////////////////////////////////////////////////////////////////
//TODO: we should do these computations on 64bits, because otherwise
// we might overflow for values > 2048.
// Unfortunately this is costly.
// Another way is to precompute triangle edges (b - a) in full precision
// once to avoid doing it all the time...
//////////////////////////////////////////////////////////////////////////////
//TODO: coalesce
int orient2d(int2 a, int2 b, int2 c)
{
return((b.x-a.x)*(c.y-a.y) - (b.y-a.y)*(c.x-a.x));
}
kernel void RenderKernel(texture2d<float, access::write> outTexture [[texture(0)]],
int is_clockwise(int2 p0, int2 p1, int2 p2)
{
return((p1 - p0).x*(p2 - p0).y - (p1 - p0).y*(p2 - p0).x);
}
kernel void RenderKernel(const device mg_vertex* vertexBuffer [[buffer(0)]],
const device uint* indexBuffer [[buffer(1)]],
const device mg_shape* shapeBuffer [[buffer(2)]],
const device uint* tileCounters [[buffer(3)]],
const device uint* tileArrayBuffer [[buffer(4)]],
constant float4* clearColor [[buffer(5)]],
constant int* useTexture [[buffer(6)]],
constant float* scaling [[buffer(7)]],
texture2d<float, access::write> outTexture [[texture(0)]],
texture2d<float> texAtlas [[texture(1)]],
const device mg_vertex* vertexBuffer [[buffer(0)]],
const device mg_shape* shapeBuffer [[buffer(1)]],
device uint* tileCounters [[buffer(2)]],
const device uint* tilesArray [[buffer(3)]],
const device mg_triangle_data* triangleArray [[buffer(4)]],
const device float4* boxArray [[buffer(5)]],
constant vector_float4* clearColor [[buffer(6)]],
constant int* useTexture [[buffer(7)]],
constant float* contentsScaling [[buffer(8)]],
uint2 gid [[thread_position_in_grid]],
uint2 tgid [[threadgroup_position_in_grid]],
uint2 threadsPerThreadgroup [[threads_per_threadgroup]],
uint2 gridSize [[threads_per_grid]])
{
//TODO: guard against thread group size not equal to tile size?
const int2 pixelCoord = int2(gid);
const uint2 tileCoord = uint2(pixelCoord)/ RENDERER_TILE_SIZE;
const uint2 tilesMatrixDim = (gridSize - 1) / RENDERER_TILE_SIZE + 1;
const uint2 tilePos = gid/RENDERER_TILE_SIZE;
const uint tileIndex = tilePos.y * tilesMatrixDim.x + tilePos.x;
const device uint* tileBuffer = tilesArray + tileIndex * RENDERER_TILE_BUFFER_SIZE;
const uint tileBufferSize = tileCounters[tileIndex];
const uint tileIndex = tileCoord.y * tilesMatrixDim.x + tileCoord.x;
const uint tileCounter = min(tileCounters[tileIndex], (uint)RENDERER_TILE_BUFFER_SIZE);
#ifdef RENDERER_DEBUG_TILES
//NOTE(martin): color code debug values and show the tile grid
uint nTileX = tilesMatrixDim.x;
uint nTileY = tilesMatrixDim.y;
{
float4 fragColor = float4(0);
if(tilePos.x > nTileX || tilePos.y > nTileY)
if( pixelCoord.x % 16 == 0
||pixelCoord.y % 16 == 0)
{
outTexture.write(float4(0, 1, 1, 1), gid);
return;
fragColor = float4(0, 0, 0, 1);
}
if((gid.x % RENDERER_TILE_SIZE == 0) || (gid.y % RENDERER_TILE_SIZE == 0))
else if(tileCounters[tileIndex] == 0xffffu)
{
outTexture.write(float4(0, 0, 0, 1), gid);
return;
fragColor = float4(1, 0, 1, 1);
}
if(tileBufferSize <= 0)
else if(tileCounter != 0u)
{
outTexture.write(float4(0, 1, 0, 1), gid);
return;
fragColor = float4(0, 1, 0, 1);
}
else
{
outTexture.write(float4(1, 0, 0, 1), gid);
fragColor = float4(1, 0, 0, 1);
}
outTexture.write(fragColor, gid);
return;
}
#endif
int subPixelFactor = 16;
int2 pixelCoord = int2(gid);
int2 centerPoint = int2((float2(pixelCoord) + float2(0.5, 0.5)) * subPixelFactor);
const int subPixelFactor = 16;
const int2 centerPoint = int2((float2(pixelCoord) + float2(0.5, 0.5)) * subPixelFactor);
const int sampleCount = 8;
int2 samplePoints[sampleCount] = {centerPoint + int2(1, 3),
@ -229,59 +273,75 @@ kernel void RenderKernel(texture2d<float, access::write> outTexture [[texture(0)
centerPoint + int2(-7, 1),
centerPoint + int2(3, -7),
centerPoint + int2(7, 7)};
int zIndices[sampleCount];
uint flipCounts[sampleCount];
float4 pixelColors[sampleCount];
float4 nextColors[sampleCount];
float4 sampleColor[sampleCount];
float4 currentColor[sampleCount];
int currentShapeIndex[sampleCount];
int flipCount[sampleCount];
for(int i=0; i<sampleCount; i++)
{
zIndices[i] = -1;
flipCounts[i] = 0;
pixelColors[i] = float4(0, 0, 0, 0);
nextColors[i] = float4(0, 0, 0, 0);
currentShapeIndex[i] = -1;
flipCount[i] = 0;
sampleColor[i] = float4(0, 0, 0, 0);
currentColor[i] = float4(0, 0, 0, 0);
}
for(uint tileBufferIndex=0; tileBufferIndex < tileBufferSize; tileBufferIndex++)
for(uint tileArrayIndex=0; tileArrayIndex < tileCounter; tileArrayIndex++)
{
const device mg_triangle_data* triangle = &triangleArray[tileBuffer[tileBufferIndex]];
int triangleIndex = tileArrayBuffer[RENDERER_TILE_BUFFER_SIZE * tileIndex + tileArrayIndex];
int2 p0 = int2(triangle->p0 * subPixelFactor);
int2 p1 = int2(triangle->p1 * subPixelFactor);
int2 p2 = int2(triangle->p2 * subPixelFactor);
uint i0 = indexBuffer[triangleIndex];
uint i1 = indexBuffer[triangleIndex+1];
uint i2 = indexBuffer[triangleIndex+2];
int bias0 = triangle->bias0;
int bias1 = triangle->bias1;
int bias2 = triangle->bias2;
int2 p0 = int2((vertexBuffer[i0].pos * scaling[0]) * subPixelFactor);
int2 p1 = int2((vertexBuffer[i1].pos * scaling[0]) * subPixelFactor);
int2 p2 = int2((vertexBuffer[i2].pos * scaling[0]) * subPixelFactor);
const device mg_vertex* v0 = &(vertexBuffer[triangle->i0]);
const device mg_vertex* v1 = &(vertexBuffer[triangle->i1]);
const device mg_vertex* v2 = &(vertexBuffer[triangle->i2]);
float4 cubic0 = v0->cubic;
float4 cubic1 = v1->cubic;
float4 cubic2 = v2->cubic;
int shapeIndex = v0->shapeIndex;
int shapeIndex = vertexBuffer[i0].shapeIndex;
float4 color = shapeBuffer[shapeIndex].color;
color.rgb *= color.a;
int4 clip = int4(round((shapeBuffer[shapeIndex].clip * scaling[0] + float4(0.5, 0.5, 0.5, 0.5)) * subPixelFactor));
const device float* uvTransform2x3 = shapeBuffer[shapeIndex].uvTransform;
matrix_float3x3 uvTransform = {{uvTransform2x3[0], uvTransform2x3[3], 0},
{uvTransform2x3[1], uvTransform2x3[4], 0},
{uvTransform2x3[2], uvTransform2x3[5], 1}};
//NOTE(martin): reorder triangle counter-clockwise and compute bias for each edge
int cw = is_clockwise(p0, p1, p2);
if(cw < 0)
{
uint tmpIndex = i1;
i1 = i2;
i2 = tmpIndex;
int2 tmpPoint = p1;
p1 = p2;
p2 = tmpPoint;
}
float4 cubic0 = vertexBuffer[i0].cubic;
float4 cubic1 = vertexBuffer[i1].cubic;
float4 cubic2 = vertexBuffer[i2].cubic;
int bias0 = is_top_left(p1, p2) ? 0 : -1;
int bias1 = is_top_left(p2, p0) ? 0 : -1;
int bias2 = is_top_left(p0, p1) ? 0 : -1;
for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
{
int2 samplePoint = samplePoints[sampleIndex];
//NOTE(martin): cull if pixel is outside box
/*
// if we use this, make sure box is in fixed points coords
if(samplePoint.x < box.x || samplePoint.x > box.z || samplePoint.y < box.y || samplePoint.y > box.w)
if( samplePoint.x < clip.x
|| samplePoint.x > clip.z
|| samplePoint.y < clip.y
|| samplePoint.y > clip.w)
{
continue;
}
*/
int w0 = orient2d(p1, p2, samplePoint);
int w1 = orient2d(p2, p0, samplePoint);
@ -291,57 +351,52 @@ kernel void RenderKernel(texture2d<float, access::write> outTexture [[texture(0)
{
float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2);
//TODO(martin): this is a quick and dirty fix for solid polygons where we use
// cubic = (1, 1, 1, 1) on all vertices, which can cause small errors to
// flip the sign.
// We should really use another value that always lead to <= 0, but we must
// make sure we never share these vertices with bezier shapes.
// Alternatively, an ugly (but maybe less than this one) solution would be
// to check if uvs are equal on all vertices of the triangle and always render
// those triangles.
float eps = 0.0001;
if(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= eps)
{
if(shapeIndex == zIndices[sampleIndex])
if(shapeIndex == currentShapeIndex[sampleIndex])
{
flipCounts[sampleIndex]++;
flipCount[sampleIndex]++;
}
else
{
if(flipCounts[sampleIndex] & 0x01)
if(flipCount[sampleIndex] & 0x01)
{
pixelColors[sampleIndex] = nextColors[sampleIndex];
sampleColor[sampleIndex] = currentColor[sampleIndex];
}
float4 nextColor = color;
if(*useTexture)
if(useTexture[0])
{
float2 sampleFP = float2(samplePoint)/subPixelFactor;
float2 uv = (uvTransform*(float3(sampleFP/contentsScaling[0], 1))).xy;
float3 sampleFP = float3(float2(samplePoint).xy/(subPixelFactor*2.), 1);
float2 uv = (uvTransform * sampleFP).xy;
constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear);
float4 texColor = texAtlas.sample(smp, uv);
texColor.rgb *= texColor.a;
nextColor *= texColor;
}
nextColors[sampleIndex] = pixelColors[sampleIndex]*(1-nextColor.a) + nextColor;
zIndices[sampleIndex] = shapeIndex;
flipCounts[sampleIndex] = 1;
currentColor[sampleIndex] = sampleColor[sampleIndex]*(1.-nextColor.a) + nextColor;
currentShapeIndex[sampleIndex] = shapeIndex;
flipCount[sampleIndex] = 1;
}
}
}
}
}
float4 out = float4(0, 0, 0, 0);
for(int i=0; i<sampleCount; i++)
float4 pixelColor = float4(0);
for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
{
if(flipCounts[i] & 0x01)
if(flipCount[sampleIndex] & 0x01)
{
pixelColors[i] = nextColors[i];
sampleColor[sampleIndex] = currentColor[sampleIndex];
}
out += pixelColors[i];
pixelColor += sampleColor[sampleIndex];
}
out = out/sampleCount;
outTexture.write(out, gid);
outTexture.write(pixelColor/float(sampleCount), gid);
}