diff --git a/src/mtl_canvas.m b/src/mtl_canvas.m index 196bece..fada813 100644 --- a/src/mtl_canvas.m +++ b/src/mtl_canvas.m @@ -45,7 +45,7 @@ typedef struct mg_mtl_canvas_backend id vertexBuffer; id indexBuffer; id tileCounters; - id tilesArray; + id tileArrayBuffer; id triangleArray; id boxArray; @@ -163,6 +163,7 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image [blitEncoder fillBuffer: backend->tileCounters range: NSMakeRange(0, RENDERER_MAX_TILES*sizeof(uint)) value: 0]; [blitEncoder endEncoding]; + /* //----------------------------------------------------------- //NOTE(martin): encode the boxing pass //----------------------------------------------------------- @@ -183,6 +184,7 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image [boxEncoder dispatchThreads: boxGridSize threadsPerThreadgroup: boxGroupSize]; [boxEncoder endEncoding]; + */ //----------------------------------------------------------- //NOTE(martin): encode the tiling pass @@ -191,12 +193,19 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image id tileEncoder = [surface->commandBuffer computeCommandEncoder]; tileEncoder.label = @"tiling pass"; [tileEncoder setComputePipelineState: backend->tilingPipeline]; - [tileEncoder setBuffer: backend->boxArray offset:0 atIndex: 0]; - [tileEncoder setBuffer: backend->tileCounters offset:0 atIndex: 1]; - [tileEncoder setBuffer: backend->tilesArray offset:0 atIndex: 2]; - [tileEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 3]; + [tileEncoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0]; + [tileEncoder setBuffer: backend->indexBuffer offset:backend->indexBufferOffset atIndex: 1]; + [tileEncoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 2]; + [tileEncoder setBuffer: backend->tileCounters offset:0 atIndex: 3]; + [tileEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 
4]; - [tileEncoder dispatchThreads: boxGridSize threadsPerThreadgroup: boxGroupSize]; + [tileEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 5]; + [tileEncoder setBytes: &scale length: sizeof(float) atIndex: 6]; + + MTLSize tileGroupSize = MTLSizeMake(backend->tilingPipeline.maxTotalThreadsPerThreadgroup, 1, 1); + MTLSize tileGridSize = MTLSizeMake(indexCount/3, 1, 1); + + [tileEncoder dispatchThreads: tileGridSize threadsPerThreadgroup: tileGroupSize]; [tileEncoder endEncoding]; //----------------------------------------------------------- @@ -206,15 +215,16 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image id sortEncoder = [surface->commandBuffer computeCommandEncoder]; sortEncoder.label = @"sorting pass"; [sortEncoder setComputePipelineState: backend->sortingPipeline]; - [sortEncoder setBuffer: backend->tileCounters offset:0 atIndex: 0]; - [sortEncoder setBuffer: backend->triangleArray offset:0 atIndex: 1]; - [sortEncoder setBuffer: backend->tilesArray offset:0 atIndex: 2]; - [sortEncoder setBytes: &viewportSize length: sizeof(vector_uint2) atIndex: 3]; + [sortEncoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0]; + [sortEncoder setBuffer: backend->indexBuffer offset:backend->indexBufferOffset atIndex: 1]; + [sortEncoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 2]; + [sortEncoder setBuffer: backend->tileCounters offset:0 atIndex: 3]; + [sortEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 4]; u32 nTilesX = (viewportSize.x + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE; u32 nTilesY = (viewportSize.y + RENDERER_TILE_SIZE - 1)/RENDERER_TILE_SIZE; - MTLSize sortGroupSize = MTLSizeMake(backend->boxingPipeline.maxTotalThreadsPerThreadgroup, 1, 1); + MTLSize sortGroupSize = MTLSizeMake(backend->sortingPipeline.maxTotalThreadsPerThreadgroup, 1, 1); MTLSize sortGridSize = MTLSizeMake(nTilesX*nTilesY, 1, 1); [sortEncoder dispatchThreads: 
sortGridSize threadsPerThreadgroup: sortGroupSize]; @@ -226,35 +236,35 @@ void mg_mtl_canvas_draw_batch(mg_canvas_backend* interface, mg_image_data* image //TODO: remove that vector_float4 clearColorVec4 = {backend->clearColor.r, backend->clearColor.g, backend->clearColor.b, backend->clearColor.a}; - id encoder = [surface->commandBuffer computeCommandEncoder]; - encoder.label = @"drawing pass"; - [encoder setComputePipelineState:backend->computePipeline]; - [encoder setTexture: backend->outTexture atIndex: 0]; + id drawEncoder = [surface->commandBuffer computeCommandEncoder]; + drawEncoder.label = @"drawing pass"; + [drawEncoder setComputePipelineState:backend->computePipeline]; + [drawEncoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0]; + [drawEncoder setBuffer: backend->indexBuffer offset:backend->indexBufferOffset atIndex: 1]; + [drawEncoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 2]; + [drawEncoder setBuffer: backend->tileCounters offset:0 atIndex: 3]; + [drawEncoder setBuffer: backend->tileArrayBuffer offset:0 atIndex: 4]; + + [drawEncoder setTexture: backend->outTexture atIndex: 0]; int useTexture = 0; if(image) { mg_mtl_image_data* mtlImage = (mg_mtl_image_data*)image; - [encoder setTexture: mtlImage->texture atIndex: 1]; + [drawEncoder setTexture: mtlImage->texture atIndex: 1]; useTexture = 1; } - [encoder setBuffer: backend->vertexBuffer offset:backend->vertexBufferOffset atIndex: 0]; - [encoder setBuffer: backend->shapeBuffer offset:backend->shapeBufferOffset atIndex: 1]; - [encoder setBuffer: backend->tileCounters offset:0 atIndex: 2]; - [encoder setBuffer: backend->tilesArray offset:0 atIndex: 3]; - [encoder setBuffer: backend->triangleArray offset:0 atIndex: 4]; - [encoder setBuffer: backend->boxArray offset:0 atIndex: 5]; - [encoder setBytes: &clearColorVec4 length: sizeof(vector_float4) atIndex: 6]; - [encoder setBytes: &useTexture length:sizeof(int) atIndex:7]; - [encoder 
setBytes: &scale length: sizeof(float) atIndex: 8]; + [drawEncoder setBytes: &clearColorVec4 length: sizeof(vector_float4) atIndex: 5]; + [drawEncoder setBytes: &useTexture length:sizeof(int) atIndex:6]; + [drawEncoder setBytes: &scale length: sizeof(float) atIndex: 7]; //TODO: check that we don't exceed maxTotalThreadsPerThreadgroup DEBUG_ASSERT(RENDERER_TILE_SIZE*RENDERER_TILE_SIZE <= backend->computePipeline.maxTotalThreadsPerThreadgroup); MTLSize threadGridSize = MTLSizeMake(viewportSize.x, viewportSize.y, 1); MTLSize threadGroupSize = MTLSizeMake(RENDERER_TILE_SIZE, RENDERER_TILE_SIZE, 1); - [encoder dispatchThreads: threadGridSize threadsPerThreadgroup:threadGroupSize]; - [encoder endEncoding]; + [drawEncoder dispatchThreads: threadGridSize threadsPerThreadgroup:threadGroupSize]; + [drawEncoder endEncoding]; //----------------------------------------------------------- //NOTE(martin): blit texture to framebuffer @@ -331,7 +341,7 @@ void mg_mtl_canvas_destroy(mg_canvas_backend* interface) [backend->outTexture release]; [backend->vertexBuffer release]; [backend->indexBuffer release]; - [backend->tilesArray release]; + [backend->tileArrayBuffer release]; [backend->triangleArray release]; [backend->boxArray release]; [backend->computePipeline release]; @@ -459,7 +469,7 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface) backend->shapeBuffer = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_shape) options: bufferOptions]; - backend->tilesArray = [metalSurface->device newBufferWithLength: RENDERER_TILE_BUFFER_SIZE*sizeof(int)*RENDERER_MAX_TILES + backend->tileArrayBuffer = [metalSurface->device newBufferWithLength: RENDERER_TILE_BUFFER_SIZE*sizeof(int)*RENDERER_MAX_TILES options: MTLResourceStorageModePrivate]; backend->triangleArray = [metalSurface->device newBufferWithLength: MG_MTL_CANVAS_DEFAULT_BUFFER_LENGTH*sizeof(mg_triangle_data) @@ -522,6 +532,7 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface 
surface) reflection: nil error: &error]; +/* MTLComputePipelineDescriptor* boxingPipelineDesc = [[MTLComputePipelineDescriptor alloc] init]; boxingPipelineDesc.computeFunction = boxingFunction; // boxingPipelineDesc.threadGroupSizeIsMultipleOfThreadExecutionWidth = true; @@ -530,6 +541,7 @@ mg_canvas_backend* mg_mtl_canvas_create(mg_surface surface) options: MTLPipelineOptionNone reflection: nil error: &error]; +*/ //----------------------------------------------------------- //NOTE(martin): setup our render pipeline state //----------------------------------------------------------- diff --git a/src/mtl_shader.metal b/src/mtl_shader.metal index 14082fe..04676ff 100644 --- a/src/mtl_shader.metal +++ b/src/mtl_shader.metal @@ -31,7 +31,7 @@ bool is_top_left(float2 a, float2 b) return( (a.y == b.y && b.x < a.x) ||(b.y < a.y)); } - +/* kernel void BoundingBoxKernel(constant mg_vertex* vertexBuffer [[buffer(0)]], constant uint* indexBuffer [[buffer(1)]], constant mg_shape* shapeBuffer [[buffer(2)]], @@ -94,19 +94,43 @@ kernel void BoundingBoxKernel(constant mg_vertex* vertexBuffer [[buffer(0)]], triangleArray[triangleIndex].bias1 = bias1; triangleArray[triangleIndex].bias2 = bias2; } +*/ -kernel void TileKernel(const device float4* boxArray [[buffer(0)]], - device volatile atomic_uint* tileCounters [[buffer(1)]], - device uint* tilesArray [[buffer(2)]], - constant vector_uint2* viewport [[buffer(3)]], +kernel void TileKernel(constant mg_vertex* vertexBuffer [[buffer(0)]], + constant uint* indexBuffer [[buffer(1)]], + constant mg_shape* shapeBuffer [[buffer(2)]], + device volatile atomic_uint* tileCounters [[buffer(3)]], + device uint* tileArrayBuffer [[buffer(4)]], + constant uint2* viewport [[buffer(5)]], + constant float* scaling [[buffer(6)]], uint gid [[thread_position_in_grid]]) { uint2 tilesMatrixDim = (*viewport - 1) / RENDERER_TILE_SIZE + 1; int nTilesX = tilesMatrixDim.x; int nTilesY = tilesMatrixDim.y; - uint triangleIndex = gid; - int4 box = 
int4(floor(boxArray[triangleIndex]))/RENDERER_TILE_SIZE; + uint triangleIndex = gid * 3; + + uint i0 = indexBuffer[triangleIndex]; + uint i1 = indexBuffer[triangleIndex+1u]; + uint i2 = indexBuffer[triangleIndex+2u]; + + float2 p0 = vertexBuffer[i0].pos * scaling[0]; + float2 p1 = vertexBuffer[i1].pos * scaling[0]; + float2 p2 = vertexBuffer[i2].pos * scaling[0]; + + int shapeIndex = vertexBuffer[i0].shapeIndex; + float4 clip = shapeBuffer[shapeIndex].clip * scaling[0]; + + float4 fbox = float4(max(min(min(p0.x, p1.x), p2.x), clip.x), + max(min(min(p0.y, p1.y), p2.y), clip.y), + min(max(max(p0.x, p1.x), p2.x), clip.z), + min(max(max(p0.y, p1.y), p2.y), clip.w)); + + int4 box = int4(floor(fbox))/int(RENDERER_TILE_SIZE); + + //NOTE(martin): it's important to do the computation with signed int, so that we can have negative xMax/yMax + // otherwise all triangles on the left or below the x/y axis are attributed to tiles on row/column 0. int xMin = max(0, box.x); int yMin = max(0, box.y); int xMax = min(box.z, nTilesX-1); @@ -120,105 +144,125 @@ kernel void TileKernel(const device float4* boxArray [[buffer(0)]], uint counter = atomic_fetch_add_explicit(&(tileCounters[tileIndex]), 1, memory_order_relaxed); if(counter < RENDERER_TILE_BUFFER_SIZE) { - tilesArray[tileIndex*RENDERER_TILE_BUFFER_SIZE + counter] = triangleIndex; + tileArrayBuffer[tileIndex*RENDERER_TILE_BUFFER_SIZE + counter] = triangleIndex; } } } } -kernel void SortKernel(const device uint* tileCounters [[buffer(0)]], - const device mg_triangle_data* triangleArray [[buffer(1)]], - device uint* tilesArray [[buffer(2)]], - constant vector_uint2* viewport [[buffer(3)]], +kernel void SortKernel(constant mg_vertex* vertexBuffer [[buffer(0)]], + constant uint* indexBuffer [[buffer(1)]], + constant mg_shape* shapeBuffer [[buffer(2)]], + const device uint* tileCounters [[buffer(3)]], + device uint* tileArrayBuffer [[buffer(4)]], uint gid [[thread_position_in_grid]]) { uint tileIndex = gid; - device uint* tileBuffer = 
tilesArray + tileIndex*RENDERER_TILE_BUFFER_SIZE; - uint tileBufferSize = tileCounters[tileIndex]; + uint tileArrayOffset = tileIndex * RENDERER_TILE_BUFFER_SIZE; + uint tileArrayCount = min(tileCounters[tileIndex], (uint)RENDERER_TILE_BUFFER_SIZE); - for(int eltIndex=0; eltIndex < (int)tileBufferSize; eltIndex++) + for(uint tileArrayIndex=1; tileArrayIndex < tileArrayCount; tileArrayIndex++) { - uint elt = tileBuffer[eltIndex]; - uint eltZIndex = triangleArray[elt].shapeIndex; - - int backIndex = eltIndex-1; - for(; backIndex >= 0; backIndex--) + for(uint sortIndex = tileArrayIndex; sortIndex > 0u; sortIndex--) { - uint backElt = tileBuffer[backIndex]; - uint backEltZIndex = triangleArray[backElt].shapeIndex; - if(eltZIndex >= backEltZIndex) + uint triangleIndex = indexBuffer[tileArrayBuffer[tileArrayOffset + sortIndex]]; + uint prevTriangleIndex = indexBuffer[tileArrayBuffer[tileArrayOffset + sortIndex - 1]]; + + int shapeIndex = vertexBuffer[triangleIndex].shapeIndex; + int prevShapeIndex = vertexBuffer[prevTriangleIndex].shapeIndex; + + if(shapeIndex >= prevShapeIndex) { break; } - else - { - tileBuffer[backIndex+1] = backElt; - } + uint tmp = tileArrayBuffer[tileArrayOffset + sortIndex]; + tileArrayBuffer[tileArrayOffset + sortIndex] = tileArrayBuffer[tileArrayOffset + sortIndex - 1]; + tileArrayBuffer[tileArrayOffset + sortIndex - 1] = tmp; } - tileBuffer[backIndex+1] = elt; } } + +bool is_top_left(int2 a, int2 b) +{ + return( (a.y == b.y && b.x < a.x) + ||(b.y < a.y)); +} + +////////////////////////////////////////////////////////////////////////////// +//TODO: we should do these computations on 64bits, because otherwise +// we might overflow for values > 2048. +// Unfortunately this is costly. +// Another way is to precompute triangle edges (b - a) in full precision +// once to avoid doing it all the time... 
+////////////////////////////////////////////////////////////////////////////// + +//TODO: coalesce int orient2d(int2 a, int2 b, int2 c) { return((b.x-a.x)*(c.y-a.y) - (b.y-a.y)*(c.x-a.x)); } -kernel void RenderKernel(texture2d outTexture [[texture(0)]], +int is_clockwise(int2 p0, int2 p1, int2 p2) +{ + return((p1 - p0).x*(p2 - p0).y - (p1 - p0).y*(p2 - p0).x); +} + + +kernel void RenderKernel(const device mg_vertex* vertexBuffer [[buffer(0)]], + const device uint* indexBuffer [[buffer(1)]], + const device mg_shape* shapeBuffer [[buffer(2)]], + const device uint* tileCounters [[buffer(3)]], + const device uint* tileArrayBuffer [[buffer(4)]], + + constant float4* clearColor [[buffer(5)]], + constant int* useTexture [[buffer(6)]], + constant float* scaling [[buffer(7)]], + + texture2d outTexture [[texture(0)]], texture2d texAtlas [[texture(1)]], - const device mg_vertex* vertexBuffer [[buffer(0)]], - const device mg_shape* shapeBuffer [[buffer(1)]], - device uint* tileCounters [[buffer(2)]], - const device uint* tilesArray [[buffer(3)]], - const device mg_triangle_data* triangleArray [[buffer(4)]], - const device float4* boxArray [[buffer(5)]], - constant vector_float4* clearColor [[buffer(6)]], - constant int* useTexture [[buffer(7)]], - constant float* contentsScaling [[buffer(8)]], + uint2 gid [[thread_position_in_grid]], uint2 tgid [[threadgroup_position_in_grid]], uint2 threadsPerThreadgroup [[threads_per_threadgroup]], uint2 gridSize [[threads_per_grid]]) { //TODO: guard against thread group size not equal to tile size? 
+ const int2 pixelCoord = int2(gid); + const uint2 tileCoord = uint2(pixelCoord)/ RENDERER_TILE_SIZE; const uint2 tilesMatrixDim = (gridSize - 1) / RENDERER_TILE_SIZE + 1; - const uint2 tilePos = gid/RENDERER_TILE_SIZE; - const uint tileIndex = tilePos.y * tilesMatrixDim.x + tilePos.x; - const device uint* tileBuffer = tilesArray + tileIndex * RENDERER_TILE_BUFFER_SIZE; - - const uint tileBufferSize = tileCounters[tileIndex]; + const uint tileIndex = tileCoord.y * tilesMatrixDim.x + tileCoord.x; + const uint tileCounter = min(tileCounters[tileIndex], (uint)RENDERER_TILE_BUFFER_SIZE); #ifdef RENDERER_DEBUG_TILES //NOTE(martin): color code debug values and show the tile grid - uint nTileX = tilesMatrixDim.x; - uint nTileY = tilesMatrixDim.y; + { + float4 fragColor = float4(0); - if(tilePos.x > nTileX || tilePos.y > nTileY) - { - outTexture.write(float4(0, 1, 1, 1), gid); - return; - } - - if((gid.x % RENDERER_TILE_SIZE == 0) || (gid.y % RENDERER_TILE_SIZE == 0)) - { - outTexture.write(float4(0, 0, 0, 1), gid); - return; - } - if(tileBufferSize <= 0) - { - outTexture.write(float4(0, 1, 0, 1), gid); - return; - } - else - { - outTexture.write(float4(1, 0, 0, 1), gid); + if( pixelCoord.x % 16 == 0 + ||pixelCoord.y % 16 == 0) + { + fragColor = float4(0, 0, 0, 1); + } + else if(tileCounters[tileIndex] == 0xffffu) + { + fragColor = float4(1, 0, 1, 1); + } + else if(tileCounter != 0u) + { + fragColor = float4(0, 1, 0, 1); + } + else + { + fragColor = float4(1, 0, 0, 1); + } + outTexture.write(fragColor, gid); return; } #endif - int subPixelFactor = 16; - int2 pixelCoord = int2(gid); - int2 centerPoint = int2((float2(pixelCoord) + float2(0.5, 0.5)) * subPixelFactor); + const int subPixelFactor = 16; + const int2 centerPoint = int2((float2(pixelCoord) + float2(0.5, 0.5)) * subPixelFactor); const int sampleCount = 8; int2 samplePoints[sampleCount] = {centerPoint + int2(1, 3), @@ -229,59 +273,75 @@ kernel void RenderKernel(texture2d outTexture [[texture(0) centerPoint + 
int2(-7, 1), centerPoint + int2(3, -7), centerPoint + int2(7, 7)}; - int zIndices[sampleCount]; - uint flipCounts[sampleCount]; - float4 pixelColors[sampleCount]; - float4 nextColors[sampleCount]; - for(int i=0; ip0 * subPixelFactor); - int2 p1 = int2(triangle->p1 * subPixelFactor); - int2 p2 = int2(triangle->p2 * subPixelFactor); + for(int i=0; ibias0; - int bias1 = triangle->bias1; - int bias2 = triangle->bias2; + for(uint tileArrayIndex=0; tileArrayIndex < tileCounter; tileArrayIndex++) + { + int triangleIndex = tileArrayBuffer[RENDERER_TILE_BUFFER_SIZE * tileIndex + tileArrayIndex]; - const device mg_vertex* v0 = &(vertexBuffer[triangle->i0]); - const device mg_vertex* v1 = &(vertexBuffer[triangle->i1]); - const device mg_vertex* v2 = &(vertexBuffer[triangle->i2]); + uint i0 = indexBuffer[triangleIndex]; + uint i1 = indexBuffer[triangleIndex+1]; + uint i2 = indexBuffer[triangleIndex+2]; - float4 cubic0 = v0->cubic; - float4 cubic1 = v1->cubic; - float4 cubic2 = v2->cubic; + int2 p0 = int2((vertexBuffer[i0].pos * scaling[0]) * subPixelFactor); + int2 p1 = int2((vertexBuffer[i1].pos * scaling[0]) * subPixelFactor); + int2 p2 = int2((vertexBuffer[i2].pos * scaling[0]) * subPixelFactor); - int shapeIndex = v0->shapeIndex; + int shapeIndex = vertexBuffer[i0].shapeIndex; float4 color = shapeBuffer[shapeIndex].color; color.rgb *= color.a; + int4 clip = int4(round((shapeBuffer[shapeIndex].clip * scaling[0] + float4(0.5, 0.5, 0.5, 0.5)) * subPixelFactor)); + const device float* uvTransform2x3 = shapeBuffer[shapeIndex].uvTransform; matrix_float3x3 uvTransform = {{uvTransform2x3[0], uvTransform2x3[3], 0}, {uvTransform2x3[1], uvTransform2x3[4], 0}, {uvTransform2x3[2], uvTransform2x3[5], 1}}; - for(int sampleIndex=0; sampleIndex box.z || samplePoint.y < box.y || samplePoint.y > box.w) + if( samplePoint.x < clip.x + || samplePoint.x > clip.z + || samplePoint.y < clip.y + || samplePoint.y > clip.w) { continue; } - */ int w0 = orient2d(p1, p2, samplePoint); int w1 = 
orient2d(p2, p0, samplePoint); @@ -291,57 +351,52 @@ kernel void RenderKernel(texture2d outTexture [[texture(0) { float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2); - //TODO(martin): this is a quick and dirty fix for solid polygons where we use - // cubic = (1, 1, 1, 1) on all vertices, which can cause small errors to - // flip the sign. - // We should really use another value that always lead to <= 0, but we must - // make sure we never share these vertices with bezier shapes. - // Alternatively, an ugly (but maybe less than this one) solution would be - // to check if uvs are equal on all vertices of the triangle and always render - // those triangles. float eps = 0.0001; if(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= eps) { - if(shapeIndex == zIndices[sampleIndex]) + if(shapeIndex == currentShapeIndex[sampleIndex]) { - flipCounts[sampleIndex]++; + flipCount[sampleIndex]++; } else { - if(flipCounts[sampleIndex] & 0x01) + if(flipCount[sampleIndex] & 0x01) { - pixelColors[sampleIndex] = nextColors[sampleIndex]; + sampleColor[sampleIndex] = currentColor[sampleIndex]; } float4 nextColor = color; - if(*useTexture) + + if(useTexture[0]) { - float2 sampleFP = float2(samplePoint)/subPixelFactor; - float2 uv = (uvTransform*(float3(sampleFP/contentsScaling[0], 1))).xy; + float3 sampleFP = float3(float2(samplePoint).xy/(subPixelFactor*2.), 1); + float2 uv = (uvTransform * sampleFP).xy; constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear); float4 texColor = texAtlas.sample(smp, uv); + texColor.rgb *= texColor.a; nextColor *= texColor; } - nextColors[sampleIndex] = pixelColors[sampleIndex]*(1-nextColor.a) + nextColor; - zIndices[sampleIndex] = shapeIndex; - flipCounts[sampleIndex] = 1; + currentColor[sampleIndex] = sampleColor[sampleIndex]*(1.-nextColor.a) + nextColor; + currentShapeIndex[sampleIndex] = shapeIndex; + flipCount[sampleIndex] = 1; } } } } + } + + float4 pixelColor = float4(0); + for(int sampleIndex = 0; 
sampleIndex < sampleCount; sampleIndex++) + { + if(flipCount[sampleIndex] & 0x01) + { + sampleColor[sampleIndex] = currentColor[sampleIndex]; + } + pixelColor += sampleColor[sampleIndex]; } - float4 out = float4(0, 0, 0, 0); - for(int i=0; i