orca/src/mtl_shader.metal


#include<metal_stdlib>
#include<simd/simd.h>

#include"mtl_shader.h"

using namespace metal;

struct vs_out
{
    float4 pos [[position]];
    float2 uv;
};

vertex vs_out VertexShader(ushort vid [[vertex_id]])
{
	vs_out out;
	out.uv = float2((vid << 1) & 2, vid & 2);
	out.pos = float4(out.uv * float2(2, -2) + float2(-1, 1), 0, 1);
	return(out);
}

fragment float4 FragmentShader(vs_out i [[stage_in]], texture2d<float> tex [[texture(0)]])
{
	constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear);
	return(tex.sample(smp, i.uv));
}


bool is_top_left(float2 a, float2 b)
{
	return( (a.y == b.y && b.x < a.x)
	      ||(b.y < a.y));
}

kernel void BoundingBoxKernel(constant mg_vertex* vertexBuffer [[buffer(0)]],
		              constant uint* indexBuffer [[buffer(1)]],
		              constant mg_shape* shapeBuffer [[buffer(2)]],
		              device mg_triangle_data* triangleArray [[buffer(3)]],
		              device float4* boxArray [[buffer(4)]],
		              constant float* contentsScaling [[buffer(5)]],
		              uint gid [[thread_position_in_grid]])
{
	uint triangleIndex = gid;
	uint vertexIndex = triangleIndex*3;

	uint i0 = indexBuffer[vertexIndex];
	uint i1 = indexBuffer[vertexIndex+1];
	uint i2 = indexBuffer[vertexIndex+2];

	float2 p0 = vertexBuffer[i0].pos * contentsScaling[0];
	float2 p1 = vertexBuffer[i1].pos * contentsScaling[0];
	float2 p2 = vertexBuffer[i2].pos * contentsScaling[0];

	//NOTE(martin): compute triangle bounding box
	float2 boxMin = min(min(p0, p1), p2);
	float2 boxMax = max(max(p0, p1), p2);

	//NOTE(martin): clip bounding box against clip rect
	int shapeIndex = vertexBuffer[i0].shapeIndex;

	vector_float4 clip = contentsScaling[0]*shapeBuffer[shapeIndex].clip;

	//NOTE(martin): intersect with current clip
	boxMin = max(boxMin, clip.xy);
	boxMax = min(boxMax, clip.zw);

	//NOTE(martin): reorder triangle counter-clockwise and compute bias for each edge
	float cw = (p1 - p0).x*(p2 - p0).y - (p1 - p0).y*(p2 - p0).x;
	if(cw < 0)
	{
		uint tmpIndex = i1;
		i1 = i2;
		i2 = tmpIndex;

		float2 tmpPoint = p1;
		p1 = p2;
		p2 = tmpPoint;
	}
	int bias0 = is_top_left(p1, p2) ? 0 : -1;
	int bias1 = is_top_left(p2, p0) ? 0 : -1;
	int bias2 = is_top_left(p0, p1) ? 0 : -1;

	//NOTE(martin): fill triangle data
	boxArray[triangleIndex] = float4(boxMin.x, boxMin.y, boxMax.x, boxMax.y);

	triangleArray[triangleIndex].shapeIndex = shapeIndex;
	triangleArray[triangleIndex].i0 = i0;
	triangleArray[triangleIndex].i1 = i1;
	triangleArray[triangleIndex].i2 = i2;
	triangleArray[triangleIndex].p0 = p0;
	triangleArray[triangleIndex].p1 = p1;
	triangleArray[triangleIndex].p2 = p2;
	triangleArray[triangleIndex].bias0 = bias0;
	triangleArray[triangleIndex].bias1 = bias1;
	triangleArray[triangleIndex].bias2 = bias2;
}

kernel void TileKernel(const device float4* boxArray [[buffer(0)]],
                       device volatile atomic_uint* tileCounters [[buffer(1)]],
                       device uint* tilesArray [[buffer(2)]],
                       constant vector_uint2* viewport [[buffer(3)]],
                       uint gid [[thread_position_in_grid]])
{
	uint2 tilesMatrixDim = (*viewport - 1) / RENDERER_TILE_SIZE + 1;
	uint nTilesX = tilesMatrixDim.x;
	uint nTilesY = tilesMatrixDim.y;

	uint triangleIndex = gid;
	uint4 box = uint4(floor(boxArray[triangleIndex]))/RENDERER_TILE_SIZE;
	uint xMin = max((uint)0, box.x);
	uint yMin = max((uint)0, box.y);
	uint xMax = min(box.z, nTilesX-1);
	uint yMax = min(box.w, nTilesY-1);

	for(uint y = yMin; y <= yMax; y++)
	{
		for(uint x = xMin ; x <= xMax; x++)
		{
			uint tileIndex = y*nTilesX + x;
			device uint* tileBuffer = tilesArray + tileIndex*RENDERER_TILE_BUFFER_SIZE;
			uint counter = atomic_fetch_add_explicit(&(tileCounters[tileIndex]), 1, memory_order_relaxed);
			tileBuffer[counter] = triangleIndex;
		}
	}
}

kernel void SortKernel(const device uint* tileCounters [[buffer(0)]],
                       const device mg_triangle_data* triangleArray [[buffer(1)]],
                       device uint* tilesArray [[buffer(2)]],
                       constant vector_uint2* viewport [[buffer(3)]],
                       uint gid [[thread_position_in_grid]])
{
	uint tileIndex = gid;
	device uint* tileBuffer = tilesArray + tileIndex*RENDERER_TILE_BUFFER_SIZE;
	uint tileBufferSize = tileCounters[tileIndex];

	for(int eltIndex=0; eltIndex < (int)tileBufferSize; eltIndex++)
	{
		uint elt = tileBuffer[eltIndex];
		uint eltZIndex = triangleArray[elt].shapeIndex;

		int backIndex = eltIndex-1;
		for(; backIndex >= 0; backIndex--)
		{
			uint backElt = tileBuffer[backIndex];
			uint backEltZIndex = triangleArray[backElt].shapeIndex;
			if(eltZIndex >= backEltZIndex)
			{
				break;
			}
			else
			{
				tileBuffer[backIndex+1] = backElt;
			}
		}
		tileBuffer[backIndex+1] = elt;
	}
}

float orient2d(float2 a, float2 b, float2 c)
{
	//////////////////////////////////////////////////////////////////////////////////////////
	//TODO(martin): FIX this. This is a **horrible** quick hack to fix the precision issues
	//              arising when a, b, and c are close. But it degrades when a, c, and c
	//              are big. The proper solution is to change the expression to avoid
	//              precision loss but I'm too busy/lazy to do it now.
	//////////////////////////////////////////////////////////////////////////////////////////
	a *= 10;
	b *= 10;
	c *= 10;
	return((b.x-a.x)*(c.y-a.y) - (b.y-a.y)*(c.x-a.x));
}

kernel void RenderKernel(texture2d<float, access::write> outTexture [[texture(0)]],
                         texture2d<float> texAtlas [[texture(1)]],
                         const device mg_vertex* vertexBuffer [[buffer(0)]],
                         const device mg_shape* shapeBuffer [[buffer(1)]],
                         device uint* tileCounters [[buffer(2)]],
                         const device uint* tilesArray [[buffer(3)]],
                         const device mg_triangle_data* triangleArray [[buffer(4)]],
                         const device float4* boxArray [[buffer(5)]],
                         constant vector_float4* clearColor [[buffer(6)]],
                         constant int* useTexture [[buffer(7)]],
                         constant float* contentsScaling [[buffer(8)]],
                         uint2 gid [[thread_position_in_grid]],
                         uint2 tgid [[threadgroup_position_in_grid]],
                         uint2 threadsPerThreadgroup [[threads_per_threadgroup]],
                         uint2 gridSize [[threads_per_grid]])
{
	//TODO: guard against thread group size not equal to tile size?
	const uint2 tilesMatrixDim = (gridSize - 1) / RENDERER_TILE_SIZE + 1;
//	const uint2 tilePos = tgid * threadsPerThreadgroup / RENDERER_TILE_SIZE;
	const uint2 tilePos = gid/RENDERER_TILE_SIZE;
	const uint tileIndex = tilePos.y * tilesMatrixDim.x + tilePos.x;
	const device uint* tileBuffer = tilesArray + tileIndex * RENDERER_TILE_BUFFER_SIZE;

	const uint tileBufferSize = tileCounters[tileIndex];

#ifdef RENDERER_DEBUG_TILES
	//NOTE(martin): color code debug values and show the tile grid
	uint nTileX = tilesMatrixDim.x;
	uint nTileY = tilesMatrixDim.y;

	if(tilePos.x > nTileX || tilePos.y > nTileY)
	{
		outTexture.write(float4(0, 1, 1, 1), gid);
		return;
	}

	if((gid.x % RENDERER_TILE_SIZE == 0) || (gid.y % RENDERER_TILE_SIZE == 0))
	{
		outTexture.write(float4(0, 0, 0, 1), gid);
		return;
	}
	if(tileBufferSize <= 0)
	{
		outTexture.write(float4(0, 1, 0, 1), gid);
		return;
	}
	else
	{
		outTexture.write(float4(1, 0, 0, 1), gid);
		return;
	}
#endif
	const float2 sampleOffsets[6] = { float2(5./12, 5./12),
	                                  float2(-3./12, 3./12),
	                                  float2(1./12, 1./12),
	                                  float2(3./12, -1./12),
	                                  float2(-5./12, -3./12),
	                                  float2(-1./12, -5./12)};

	int zIndices[6];
	uint flipCounts[6];
	float4 pixelColors[6];
	float4 nextColors[6];
	for(int i=0; i<6; i++)
	{
		zIndices[i] = -1;
		flipCounts[i] = 0;
		pixelColors[i] = float4(0, 0, 0, 0);
		nextColors[i] = float4(0, 0, 0, 0);
	}

	for(uint tileBufferIndex=0; tileBufferIndex < tileBufferSize; tileBufferIndex++)
	{
		float4 box = boxArray[tileBuffer[tileBufferIndex]];
		const device mg_triangle_data* triangle = &triangleArray[tileBuffer[tileBufferIndex]];

		float2 p0 = triangle->p0;
		float2 p1 = triangle->p1;
		float2 p2 = triangle->p2;

		int bias0 = triangle->bias0;
		int bias1 = triangle->bias1;
		int bias2 = triangle->bias2;

		const device mg_vertex* v0 = &(vertexBuffer[triangle->i0]);
		const device mg_vertex* v1 = &(vertexBuffer[triangle->i1]);
		const device mg_vertex* v2 = &(vertexBuffer[triangle->i2]);

		float4 cubic0 = v0->cubic;
		float4 cubic1 = v1->cubic;
		float4 cubic2 = v2->cubic;

		int shapeIndex = v0->shapeIndex;
		float4 color = shapeBuffer[shapeIndex].color;
		color.rgb *= color.a;

		const device float* uvTransform2x3 = shapeBuffer[shapeIndex].uvTransform;
		matrix_float3x3 uvTransform = {{uvTransform2x3[0], uvTransform2x3[3], 0},
		                               {uvTransform2x3[1], uvTransform2x3[4], 0},
		                               {uvTransform2x3[2], uvTransform2x3[5], 1}};

		for(int i=0; i<6; i++)
		{
			float2 samplePoint = (float2)gid + sampleOffsets[i];

			//NOTE(martin): cull if pixel is outside box
			if(samplePoint.x < box.x || samplePoint.x > box.z || samplePoint.y < box.y || samplePoint.y > box.w)
			{
				continue;
			}

			float w0 = orient2d(p1, p2, samplePoint);
			float w1 = orient2d(p2, p0, samplePoint);
			float w2 = orient2d(p0, p1, samplePoint);

			if(((int)w0+bias0) >= 0 && ((int)w1+bias1) >= 0 && ((int)w2+bias2) >= 0)
			{
				float4 cubic = (cubic0*w0 + cubic1*w1 + cubic2*w2)/(w0+w1+w2);

				//float2 uv = (uv0*w0 + uv1*w1 + uv2*w2)/(w0+w1+w2);
				float2 uv = (uvTransform*(float3(samplePoint.xy/contentsScaling[0], 1))).xy;

				float4 texColor = float4(1, 1, 1, 1);
				if(*useTexture)
				{
					constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear);
					texColor = texAtlas.sample(smp, uv);
					texColor.rgb *= texColor.a;
				}
				//TODO(martin): this is a quick and dirty fix for solid polygons where we use
				//              cubic = (1, 1, 1, 1) on all vertices, which can cause small errors to
				//              flip the sign.
				//              We should really use another value that always lead to <= 0, but we must
				//              make sure we never share these vertices with bezier shapes.
				//              Alternatively, an ugly (but maybe less than this one) solution would be
				//              to check if uvs are equal on all vertices of the triangle and always render
				//              those triangles.
				float eps = 0.0001;
				if(cubic.w*(cubic.x*cubic.x*cubic.x - cubic.y*cubic.z) <= eps)
				{
					if(shapeIndex == zIndices[i])
					{
						flipCounts[i]++;
					}
					else
					{
						if(flipCounts[i] & 0x01)
						{
							pixelColors[i] = nextColors[i];
						}

						float4 nextCol = color*texColor;
						nextColors[i] = pixelColors[i]*(1-nextCol.a) +nextCol.a*nextCol;

						zIndices[i] = shapeIndex;
						flipCounts[i] = 1;
					}
				}
			}
		}
	}
	float4 out = float4(0, 0, 0, 0);
	for(int i=0; i<6; i++)
	{
		if(flipCounts[i] & 0x01)
		{
			pixelColors[i] = nextColors[i];
		}
		out += pixelColors[i];
	}
	out = out/6.;
	outTexture.write(out, gid);
}