1710 lines
55 KiB
Metal
1710 lines
55 KiB
Metal
|
|
#include <metal_simdgroup>
|
|
#include <metal_stdlib>
|
|
#include <simd/simd.h>
|
|
|
|
#include "mtl_renderer.h"
|
|
|
|
using namespace metal;
|
|
|
|
typedef struct mtl_log_context
|
|
{
|
|
device char* buffer;
|
|
device volatile atomic_int* offset;
|
|
bool enabled;
|
|
} mtl_log_context;
|
|
|
|
int strlen(const constant char* msg)
|
|
{
|
|
int count = 0;
|
|
while(msg[count] != '\0')
|
|
{
|
|
count++;
|
|
}
|
|
return (count);
|
|
}
|
|
|
|
int strlen(const thread char* msg)
|
|
{
|
|
int count = 0;
|
|
while(msg[count] != '\0')
|
|
{
|
|
count++;
|
|
}
|
|
return (count);
|
|
}
|
|
|
|
void mtl_log(mtl_log_context context, const constant char* msg)
|
|
{
|
|
if(context.enabled)
|
|
{
|
|
int len = strlen(msg);
|
|
int offset = atomic_fetch_add_explicit(context.offset, len + 1, memory_order_relaxed);
|
|
|
|
for(int i = 0; i < len; i++)
|
|
{
|
|
context.buffer[offset + i] = msg[i];
|
|
}
|
|
context.buffer[offset + len] = '\0';
|
|
}
|
|
}
|
|
|
|
void mtl_log(mtl_log_context context, const thread char* msg)
|
|
{
|
|
if(context.enabled)
|
|
{
|
|
int len = strlen(msg);
|
|
int offset = atomic_fetch_add_explicit(context.offset, len + 1, memory_order_relaxed);
|
|
|
|
for(int i = 0; i < len; i++)
|
|
{
|
|
context.buffer[offset + i] = msg[i];
|
|
}
|
|
context.buffer[offset + len] = '\0';
|
|
}
|
|
}
|
|
|
|
int mtl_itoa_right_aligned(int bufSize, thread char* buffer, int64_t value, bool zeroPad)
|
|
{
|
|
// convert value to a null-terminated string at end of buffer and returns the size
|
|
// (excluding the final null).
|
|
bool minus = false;
|
|
if(value < 0)
|
|
{
|
|
minus = true;
|
|
value *= -1;
|
|
}
|
|
buffer[bufSize - 1] = '\0';
|
|
int index = bufSize - 2;
|
|
int stop = minus ? 1 : 0;
|
|
|
|
do
|
|
{
|
|
buffer[index] = '0' + (value % 10);
|
|
index--;
|
|
value /= 10;
|
|
}
|
|
while(value != 0 && index >= stop);
|
|
|
|
if(zeroPad)
|
|
{
|
|
while(index >= stop)
|
|
{
|
|
buffer[index] = '0';
|
|
index--;
|
|
}
|
|
}
|
|
|
|
if(minus)
|
|
{
|
|
buffer[index] = '-';
|
|
index--;
|
|
}
|
|
|
|
int count = bufSize - (index + 1);
|
|
return (count - 1);
|
|
}
|
|
|
|
int mtl_itoa(int bufSize, thread char* buffer, int64_t value)
|
|
{
|
|
int count = mtl_itoa_right_aligned(bufSize, buffer, value, false);
|
|
int start = bufSize - (count + 1);
|
|
|
|
for(int i = 0; i < count + 1; i++)
|
|
{
|
|
buffer[i] = buffer[start + i];
|
|
}
|
|
return (count);
|
|
}
|
|
|
|
void mtl_log_i32(mtl_log_context context, int value)
|
|
{
|
|
char buffer[12];
|
|
mtl_itoa(12, buffer, value);
|
|
mtl_log(context, buffer);
|
|
}
|
|
|
|
void mtl_log_f32(mtl_log_context context, float value)
|
|
{
|
|
bool minus = false;
|
|
if(value < 0)
|
|
{
|
|
minus = true;
|
|
value *= -1;
|
|
}
|
|
|
|
int64_t integral = (int64_t)value;
|
|
int64_t decimal = (int64_t)((value - (float)integral) * 1e6);
|
|
|
|
const int bufSize = 64;
|
|
char buffer[bufSize];
|
|
int index = 0;
|
|
if(minus)
|
|
{
|
|
buffer[index] = '-';
|
|
index++;
|
|
}
|
|
index += mtl_itoa(bufSize - index, buffer + index, integral);
|
|
|
|
if(index < bufSize && decimal)
|
|
{
|
|
buffer[index] = '.';
|
|
index++;
|
|
|
|
int width = 6;
|
|
while(decimal % 10 == 0 && width > 0)
|
|
{
|
|
decimal /= 10;
|
|
width--;
|
|
}
|
|
|
|
int decSize = min(bufSize - index, width + 1);
|
|
mtl_itoa_right_aligned(decSize, buffer + index, decimal, true);
|
|
}
|
|
buffer[bufSize - 1] = '\0';
|
|
mtl_log(context, buffer);
|
|
}
|
|
|
|
void mtl_log_point(mtl_log_context context, float2 p)
|
|
{
|
|
mtl_log(context, "(");
|
|
mtl_log_f32(context, p.x);
|
|
mtl_log(context, ", ");
|
|
mtl_log_f32(context, p.y);
|
|
mtl_log(context, ")");
|
|
}
|
|
|
|
void log_line(thread float2* p, mtl_log_context logCtx)
|
|
{
|
|
mtl_log(logCtx, "(");
|
|
mtl_log_f32(logCtx, p[0].x);
|
|
mtl_log(logCtx, ", ");
|
|
mtl_log_f32(logCtx, p[0].y);
|
|
mtl_log(logCtx, ") (");
|
|
mtl_log_f32(logCtx, p[1].x);
|
|
mtl_log(logCtx, ", ");
|
|
mtl_log_f32(logCtx, p[1].y);
|
|
mtl_log(logCtx, ")\n");
|
|
}
|
|
|
|
void log_quadratic_bezier(thread float2* p, mtl_log_context logCtx)
|
|
{
|
|
mtl_log(logCtx, "(");
|
|
mtl_log_f32(logCtx, p[0].x);
|
|
mtl_log(logCtx, ", ");
|
|
mtl_log_f32(logCtx, p[0].y);
|
|
mtl_log(logCtx, ") (");
|
|
mtl_log_f32(logCtx, p[1].x);
|
|
mtl_log(logCtx, ", ");
|
|
mtl_log_f32(logCtx, p[1].y);
|
|
mtl_log(logCtx, ") (");
|
|
mtl_log_f32(logCtx, p[2].x);
|
|
mtl_log(logCtx, ", ");
|
|
mtl_log_f32(logCtx, p[2].y);
|
|
mtl_log(logCtx, ")\n");
|
|
}
|
|
|
|
void log_cubic_bezier(thread float2* p, mtl_log_context logCtx)
|
|
{
|
|
mtl_log(logCtx, "(");
|
|
mtl_log_f32(logCtx, p[0].x);
|
|
mtl_log(logCtx, ", ");
|
|
mtl_log_f32(logCtx, p[0].y);
|
|
mtl_log(logCtx, ") (");
|
|
mtl_log_f32(logCtx, p[1].x);
|
|
mtl_log(logCtx, ", ");
|
|
mtl_log_f32(logCtx, p[1].y);
|
|
mtl_log(logCtx, ") (");
|
|
mtl_log_f32(logCtx, p[2].x);
|
|
mtl_log(logCtx, ", ");
|
|
mtl_log_f32(logCtx, p[2].y);
|
|
mtl_log(logCtx, ") (");
|
|
mtl_log_f32(logCtx, p[3].x);
|
|
mtl_log(logCtx, ", ");
|
|
mtl_log_f32(logCtx, p[3].y);
|
|
mtl_log(logCtx, ")\n");
|
|
}
|
|
|
|
kernel void mtl_path_setup(constant int* pathCount [[buffer(0)]],
|
|
const device oc_mtl_path* pathBuffer [[buffer(1)]],
|
|
device oc_mtl_path_queue* pathQueueBuffer [[buffer(2)]],
|
|
device oc_mtl_tile_queue* tileQueueBuffer [[buffer(3)]],
|
|
device atomic_int* tileQueueCount [[buffer(4)]],
|
|
constant int* tileQueueMax [[buffer(5)]],
|
|
constant int* tileSize [[buffer(6)]],
|
|
constant float* scale [[buffer(7)]],
|
|
uint pathIndex [[thread_position_in_grid]])
|
|
{
|
|
const device oc_mtl_path* path = &pathBuffer[pathIndex];
|
|
|
|
//NOTE: we don't clip on the right, since we need those tiles to accurately compute
|
|
// the prefix sum of winding increments in the backprop pass.
|
|
float4 clippedBox = { max(path->box.x, path->clip.x),
|
|
max(path->box.y, path->clip.y),
|
|
path->box.z,
|
|
min(path->box.w, path->clip.w) };
|
|
|
|
int2 firstTile = int2(clippedBox.xy * scale[0]) / tileSize[0];
|
|
int2 lastTile = int2(clippedBox.zw * scale[0]) / tileSize[0];
|
|
|
|
int nTilesX = max(0, lastTile.x - firstTile.x + 1);
|
|
int nTilesY = max(0, lastTile.y - firstTile.y + 1);
|
|
int tileCount = nTilesX * nTilesY;
|
|
|
|
int tileQueuesIndex = atomic_fetch_add_explicit(tileQueueCount, tileCount, memory_order_relaxed);
|
|
|
|
if(tileQueuesIndex + tileCount >= tileQueueMax[0])
|
|
{
|
|
pathQueueBuffer[pathIndex].area = int4(0);
|
|
pathQueueBuffer[pathIndex].tileQueues = 0;
|
|
}
|
|
else
|
|
{
|
|
pathQueueBuffer[pathIndex].area = int4(firstTile.x, firstTile.y, nTilesX, nTilesY);
|
|
pathQueueBuffer[pathIndex].tileQueues = tileQueuesIndex;
|
|
|
|
device oc_mtl_tile_queue* tileQueues = &tileQueueBuffer[tileQueuesIndex];
|
|
|
|
for(int i = 0; i < tileCount; i++)
|
|
{
|
|
atomic_store_explicit(&tileQueues[i].first, -1, memory_order_relaxed);
|
|
tileQueues[i].last = -1;
|
|
atomic_store_explicit(&tileQueues[i].windingOffset, 0, memory_order_relaxed);
|
|
}
|
|
}
|
|
}
|
|
|
|
float ccw(float2 a, float2 b, float2 c)
|
|
{
|
|
return ((b.x - a.x) * (c.y - a.y) - (b.y - a.y) * (c.x - a.x));
|
|
}
|
|
|
|
int mtl_side_of_segment(float2 p, const device oc_mtl_segment* seg, mtl_log_context log = { .enabled = false })
|
|
{
|
|
int side = 0;
|
|
if(p.y > seg->box.w || p.y <= seg->box.y)
|
|
{
|
|
if(p.x > seg->box.x && p.x <= seg->box.z)
|
|
{
|
|
if(p.y > seg->box.w)
|
|
{
|
|
side = (seg->config == OC_MTL_TL || seg->config == OC_MTL_BR) ? -1 : 1;
|
|
}
|
|
else
|
|
{
|
|
side = (seg->config == OC_MTL_TL || seg->config == OC_MTL_BR) ? 1 : -1;
|
|
}
|
|
}
|
|
}
|
|
else if(p.x > seg->box.z)
|
|
{
|
|
side = 1;
|
|
}
|
|
else if(p.x <= seg->box.x)
|
|
{
|
|
side = -1;
|
|
}
|
|
else
|
|
{
|
|
float2 a, b, c;
|
|
switch(seg->config)
|
|
{
|
|
case OC_MTL_TL:
|
|
a = seg->box.xy;
|
|
b = seg->box.zw;
|
|
break;
|
|
|
|
case OC_MTL_BR:
|
|
a = seg->box.zw;
|
|
b = seg->box.xy;
|
|
break;
|
|
|
|
case OC_MTL_TR:
|
|
a = seg->box.xw;
|
|
b = seg->box.zy;
|
|
break;
|
|
|
|
case OC_MTL_BL:
|
|
a = seg->box.zy;
|
|
b = seg->box.xw;
|
|
break;
|
|
}
|
|
c = seg->hullVertex;
|
|
|
|
if(ccw(a, b, p) < 0)
|
|
{
|
|
// other side of the diagonal
|
|
side = (seg->config == OC_MTL_BR || seg->config == OC_MTL_TR) ? -1 : 1;
|
|
}
|
|
else if(ccw(b, c, p) < 0 || ccw(c, a, p) < 0)
|
|
{
|
|
// same side of the diagonal, but outside curve hull
|
|
side = (seg->config == OC_MTL_BL || seg->config == OC_MTL_TL) ? -1 : 1;
|
|
}
|
|
else
|
|
{
|
|
// inside curve hull
|
|
switch(seg->kind)
|
|
{
|
|
case OC_MTL_LINE:
|
|
side = 1;
|
|
break;
|
|
|
|
case OC_MTL_QUADRATIC:
|
|
{
|
|
float3 ph = { p.x, p.y, 1 };
|
|
float3 klm = seg->implicitMatrix * ph;
|
|
side = ((klm.x * klm.x - klm.y) * klm.z < 0) ? -1 : 1;
|
|
}
|
|
break;
|
|
|
|
case OC_MTL_CUBIC:
|
|
{
|
|
float3 ph = { p.x, p.y, 1 };
|
|
float3 klm = seg->implicitMatrix * ph;
|
|
side = (seg->sign * (klm.x * klm.x * klm.x - klm.y * klm.z) < 0) ? -1 : 1;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return (side);
|
|
}
|
|
|
|
typedef struct mtl_segment_setup_context
|
|
{
|
|
device atomic_int* segmentCount;
|
|
device oc_mtl_segment* segmentBuffer;
|
|
const device oc_mtl_path_queue* pathQueue;
|
|
device oc_mtl_tile_queue* tileQueues;
|
|
device oc_mtl_tile_op* tileOpBuffer;
|
|
device atomic_int* tileOpCount;
|
|
int tileSize;
|
|
mtl_log_context log;
|
|
|
|
int pathIndex;
|
|
|
|
int tileOpMax;
|
|
int segmentMax;
|
|
|
|
} mtl_segment_setup_context;
|
|
|
|
void mtl_segment_bin_to_tiles(thread mtl_segment_setup_context* context, device oc_mtl_segment* seg)
|
|
{
|
|
//NOTE: add segment index to the queues of tiles it overlaps with
|
|
int segIndex = seg - context->segmentBuffer;
|
|
|
|
int tileSize = context->tileSize;
|
|
int4 pathArea = context->pathQueue->area;
|
|
int4 coveredTiles = int4(seg->box) / tileSize;
|
|
int xMin = max(0, coveredTiles.x - pathArea.x);
|
|
int yMin = max(0, coveredTiles.y - pathArea.y);
|
|
int xMax = min(coveredTiles.z - pathArea.x, pathArea.z - 1);
|
|
int yMax = min(coveredTiles.w - pathArea.y, pathArea.w - 1);
|
|
|
|
for(int y = yMin; y <= yMax; y++)
|
|
{
|
|
for(int x = xMin; x <= xMax; x++)
|
|
{
|
|
float4 tileBox = (float4){ float(x + pathArea.x),
|
|
float(y + pathArea.y),
|
|
float(x + pathArea.x + 1),
|
|
float(y + pathArea.y + 1) }
|
|
* float(tileSize);
|
|
|
|
float2 bl = { tileBox.x, tileBox.y };
|
|
float2 br = { tileBox.z, tileBox.y };
|
|
float2 tr = { tileBox.z, tileBox.w };
|
|
float2 tl = { tileBox.x, tileBox.w };
|
|
|
|
int sbl = mtl_side_of_segment(bl, seg, context->log);
|
|
int sbr = mtl_side_of_segment(br, seg, context->log);
|
|
int str = mtl_side_of_segment(tr, seg, context->log);
|
|
int stl = mtl_side_of_segment(tl, seg, context->log);
|
|
|
|
bool crossL = (stl * sbl < 0);
|
|
bool crossR = (str * sbr < 0);
|
|
bool crossT = (stl * str < 0);
|
|
bool crossB = (sbl * sbr < 0);
|
|
|
|
float2 s0, s1;
|
|
if(seg->config == OC_MTL_TL || seg->config == OC_MTL_BR)
|
|
{
|
|
s0 = seg->box.xy;
|
|
s1 = seg->box.zw;
|
|
}
|
|
else
|
|
{
|
|
s0 = seg->box.xw;
|
|
s1 = seg->box.zy;
|
|
}
|
|
bool s0Inside = s0.x >= tileBox.x
|
|
&& s0.x < tileBox.z
|
|
&& s0.y >= tileBox.y
|
|
&& s0.y < tileBox.w;
|
|
|
|
bool s1Inside = s1.x >= tileBox.x
|
|
&& s1.x < tileBox.z
|
|
&& s1.y >= tileBox.y
|
|
&& s1.y < tileBox.w;
|
|
|
|
if(crossL || crossR || crossT || crossB || s0Inside || s1Inside)
|
|
{
|
|
int tileOpIndex = atomic_fetch_add_explicit(context->tileOpCount, 1, memory_order_relaxed);
|
|
|
|
if(tileOpIndex < context->tileOpMax)
|
|
{
|
|
device oc_mtl_tile_op* op = &context->tileOpBuffer[tileOpIndex];
|
|
|
|
op->kind = OC_MTL_OP_SEGMENT;
|
|
op->index = segIndex;
|
|
op->crossRight = false;
|
|
op->next = -1;
|
|
|
|
int tileIndex = y * pathArea.z + x;
|
|
device oc_mtl_tile_queue* tile = &context->tileQueues[tileIndex];
|
|
op->next = atomic_exchange_explicit(&tile->first, tileOpIndex, memory_order_relaxed);
|
|
if(op->next == -1)
|
|
{
|
|
tile->last = tileOpIndex;
|
|
}
|
|
|
|
//NOTE: if the segment crosses the tile's bottom boundary, update the tile's winding offset
|
|
if(crossB)
|
|
{
|
|
mtl_log(context->log, "cross bottom boundary, increment ");
|
|
mtl_log_f32(context->log, seg->windingIncrement);
|
|
mtl_log(context->log, "\n");
|
|
atomic_fetch_add_explicit(&tile->windingOffset, seg->windingIncrement, memory_order_relaxed);
|
|
}
|
|
|
|
//NOTE: if the segment crosses the right boundary, mark it. We reuse one of the previous tests
|
|
if(crossR)
|
|
{
|
|
op->crossRight = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
device oc_mtl_segment* mtl_segment_push(thread mtl_segment_setup_context* context, float2 p[4], oc_mtl_seg_kind kind)
|
|
{
|
|
float2 s, e, c;
|
|
|
|
switch(kind)
|
|
{
|
|
case OC_MTL_LINE:
|
|
s = p[0];
|
|
c = p[0];
|
|
e = p[1];
|
|
break;
|
|
|
|
case OC_MTL_QUADRATIC:
|
|
s = p[0];
|
|
c = p[1];
|
|
e = p[2];
|
|
break;
|
|
|
|
case OC_MTL_CUBIC:
|
|
{
|
|
s = p[0];
|
|
float sqrNorm0 = length_squared(p[1] - p[0]);
|
|
float sqrNorm1 = length_squared(p[3] - p[2]);
|
|
if(sqrNorm0 < sqrNorm1)
|
|
{
|
|
c = p[2];
|
|
}
|
|
else
|
|
{
|
|
c = p[1];
|
|
}
|
|
e = p[3];
|
|
}
|
|
break;
|
|
}
|
|
|
|
device oc_mtl_segment* seg = 0;
|
|
|
|
int segIndex = atomic_fetch_add_explicit(context->segmentCount, 1, memory_order_relaxed);
|
|
|
|
if(segIndex < context->segmentMax)
|
|
{
|
|
seg = &context->segmentBuffer[segIndex];
|
|
|
|
bool goingUp = e.y >= s.y;
|
|
bool goingRight = e.x >= s.x;
|
|
|
|
seg->kind = kind;
|
|
seg->pathIndex = context->pathIndex;
|
|
seg->windingIncrement = goingUp ? 1 : -1;
|
|
|
|
seg->box = (vector_float4){ min(s.x, e.x),
|
|
min(s.y, e.y),
|
|
max(s.x, e.x),
|
|
max(s.y, e.y) };
|
|
|
|
float dx = c.x - seg->box.x;
|
|
float dy = c.y - seg->box.y;
|
|
float alpha = (seg->box.w - seg->box.y) / (seg->box.z - seg->box.x);
|
|
float ofs = seg->box.w - seg->box.y;
|
|
|
|
if(goingUp == goingRight)
|
|
{
|
|
if(seg->kind == OC_MTL_LINE)
|
|
{
|
|
seg->config = OC_MTL_BR;
|
|
}
|
|
else if(dy > alpha * dx)
|
|
{
|
|
seg->config = OC_MTL_TL;
|
|
}
|
|
else
|
|
{
|
|
seg->config = OC_MTL_BR;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if(seg->kind == OC_MTL_LINE)
|
|
{
|
|
seg->config = OC_MTL_TR;
|
|
}
|
|
else if(dy < ofs - alpha * dx)
|
|
{
|
|
seg->config = OC_MTL_BL;
|
|
}
|
|
else
|
|
{
|
|
seg->config = OC_MTL_TR;
|
|
}
|
|
}
|
|
}
|
|
return (seg);
|
|
}
|
|
|
|
#define square(x) ((x) * (x))
|
|
#define cube(x) ((x) * (x) * (x))
|
|
|
|
void mtl_line_setup(thread mtl_segment_setup_context* context, float2 p[2])
|
|
{
|
|
device oc_mtl_segment* seg = mtl_segment_push(context, p, OC_MTL_LINE);
|
|
if(seg)
|
|
{
|
|
seg->hullVertex = p[0];
|
|
mtl_segment_bin_to_tiles(context, seg);
|
|
}
|
|
}
|
|
|
|
float2 mtl_quadratic_blossom(float2 p[3], float u, float v)
|
|
{
|
|
float2 b10 = u * p[1] + (1 - u) * p[0];
|
|
float2 b11 = u * p[2] + (1 - u) * p[1];
|
|
float2 b20 = v * b11 + (1 - v) * b10;
|
|
return (b20);
|
|
}
|
|
|
|
void mtl_quadratic_slice(float2 p[3], float s0, float s1, float2 sp[3])
|
|
{
|
|
/*NOTE: using blossoms to compute sub-curve control points ensure that the fourth point
|
|
of sub-curve (s0, s1) and the first point of sub-curve (s1, s3) match.
|
|
However, due to numerical errors, the evaluation of B(s=0) might not be equal to
|
|
p[0] (and likewise, B(s=1) might not equal p[3]).
|
|
We handle that case explicitly to ensure that we don't create gaps in the paths.
|
|
*/
|
|
sp[0] = (s0 == 0) ? p[0] : mtl_quadratic_blossom(p, s0, s0);
|
|
sp[1] = mtl_quadratic_blossom(p, s0, s1);
|
|
sp[2] = (s1 == 1) ? p[2] : mtl_quadratic_blossom(p, s1, s1);
|
|
}
|
|
|
|
int mtl_quadratic_monotonize(float2 p[3], float splits[4])
|
|
{
|
|
//NOTE: compute split points
|
|
int count = 0;
|
|
splits[0] = 0;
|
|
count++;
|
|
|
|
float2 r = (p[0] - p[1]) / (p[2] - 2 * p[1] + p[0]);
|
|
if(r.x > r.y)
|
|
{
|
|
float tmp = r.x;
|
|
r.x = r.y;
|
|
r.y = tmp;
|
|
}
|
|
if(r.x > 0 && r.x < 1)
|
|
{
|
|
splits[count] = r.x;
|
|
count++;
|
|
}
|
|
if(r.y > 0 && r.y < 1)
|
|
{
|
|
splits[count] = r.y;
|
|
count++;
|
|
}
|
|
splits[count] = 1;
|
|
count++;
|
|
return (count);
|
|
}
|
|
|
|
matrix_float3x3 mtl_barycentric_matrix(float2 v0, float2 v1, float2 v2)
|
|
{
|
|
float det = v0.x * (v1.y - v2.y) + v1.x * (v2.y - v0.y) + v2.x * (v0.y - v1.y);
|
|
matrix_float3x3 B = { { v1.y - v2.y, v2.y - v0.y, v0.y - v1.y },
|
|
{ v2.x - v1.x, v0.x - v2.x, v1.x - v0.x },
|
|
{ v1.x * v2.y - v2.x * v1.y, v2.x * v0.y - v0.x * v2.y, v0.x * v1.y - v1.x * v0.y } };
|
|
B *= (1 / det);
|
|
return (B);
|
|
}
|
|
|
|
void mtl_quadratic_emit(thread mtl_segment_setup_context* context,
|
|
thread float2* p)
|
|
{
|
|
device oc_mtl_segment* seg = mtl_segment_push(context, p, OC_MTL_QUADRATIC);
|
|
|
|
if(seg)
|
|
{
|
|
//NOTE: compute implicit equation matrix
|
|
float det = p[0].x * (p[1].y - p[2].y) + p[1].x * (p[2].y - p[0].y) + p[2].x * (p[0].y - p[1].y);
|
|
|
|
float a = p[0].y - p[1].y + 0.5 * (p[2].y - p[0].y);
|
|
float b = p[1].x - p[0].x + 0.5 * (p[0].x - p[2].x);
|
|
float c = p[0].x * p[1].y - p[1].x * p[0].y + 0.5 * (p[2].x * p[0].y - p[0].x * p[2].y);
|
|
float d = p[0].y - p[1].y;
|
|
float e = p[1].x - p[0].x;
|
|
float f = p[0].x * p[1].y - p[1].x * p[0].y;
|
|
|
|
float flip = (seg->config == OC_MTL_TL || seg->config == OC_MTL_BL) ? -1 : 1;
|
|
float g = flip * (p[2].x * (p[0].y - p[1].y) + p[0].x * (p[1].y - p[2].y) + p[1].x * (p[2].y - p[0].y));
|
|
|
|
seg->implicitMatrix = (1 / det) * matrix_float3x3({ a, d, 0. }, { b, e, 0. }, { c, f, g });
|
|
|
|
seg->hullVertex = p[1];
|
|
|
|
mtl_segment_bin_to_tiles(context, seg);
|
|
}
|
|
}
|
|
|
|
void mtl_quadratic_setup(thread mtl_segment_setup_context* context, thread float2* p)
|
|
{
|
|
float splits[4];
|
|
int splitCount = mtl_quadratic_monotonize(p, splits);
|
|
|
|
//NOTE: produce bézier curve for each consecutive pair of roots
|
|
for(int sliceIndex = 0; sliceIndex < splitCount - 1; sliceIndex++)
|
|
{
|
|
float2 sp[3];
|
|
mtl_quadratic_slice(p, splits[sliceIndex], splits[sliceIndex + 1], sp);
|
|
mtl_quadratic_emit(context, sp);
|
|
}
|
|
}
|
|
|
|
/*
|
|
diff_of_products() computes a*b-c*d with a maximum error <= 1.5 ulp
|
|
|
|
Claude-Pierre Jeannerod, Nicolas Louvet, and Jean-Michel Muller,
|
|
"Further Analysis of Kahan's Algorithm for the Accurate Computation
|
|
of 2x2 Determinants". Mathematics of Computation, Vol. 82, No. 284,
|
|
Oct. 2013, pp. 2245-2264
|
|
*/
|
|
float diff_of_products(float a, float b, float c, float d)
|
|
{
|
|
float w = d * c;
|
|
float e = fma(-d, c, w);
|
|
float f = fma(a, b, -w);
|
|
return (f + e);
|
|
}
|
|
|
|
int mtl_quadratic_roots_with_det(float a, float b, float c, float det, thread float* r, mtl_log_context log = { .enabled = false })
|
|
{
|
|
int count = 0;
|
|
|
|
if(a == 0)
|
|
{
|
|
if(b)
|
|
{
|
|
count = 1;
|
|
r[0] = -c / b;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
b /= 2.0;
|
|
|
|
if(det >= 0)
|
|
{
|
|
count = (det == 0) ? 1 : 2;
|
|
|
|
if(b > 0)
|
|
{
|
|
float q = b + sqrt(det);
|
|
r[0] = -c / q;
|
|
r[1] = -q / a;
|
|
}
|
|
else if(b < 0)
|
|
{
|
|
float q = -b + sqrt(det);
|
|
r[0] = q / a;
|
|
r[1] = c / q;
|
|
}
|
|
else
|
|
{
|
|
float q = sqrt(-a * c);
|
|
if(fabs(a) >= fabs(c))
|
|
{
|
|
r[0] = q / a;
|
|
r[1] = -q / a;
|
|
}
|
|
else
|
|
{
|
|
r[0] = -c / q;
|
|
r[1] = c / q;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if(count > 1 && r[0] > r[1])
|
|
{
|
|
float tmp = r[0];
|
|
r[0] = r[1];
|
|
r[1] = tmp;
|
|
}
|
|
return (count);
|
|
}
|
|
|
|
int mtl_quadratic_roots(float a, float b, float c, thread float* r, mtl_log_context log = { .enabled = false })
|
|
{
|
|
//float det = diff_of_products(b, b, a, c);
|
|
float det = square(b) / 4. - a * c;
|
|
return (mtl_quadratic_roots_with_det(a, b, c, det, r, log));
|
|
}
|
|
|
|
float2 mtl_cubic_blossom(float2 p[4], float u, float v, float w)
|
|
{
|
|
float2 b10 = u * p[1] + (1 - u) * p[0];
|
|
float2 b11 = u * p[2] + (1 - u) * p[1];
|
|
float2 b12 = u * p[3] + (1 - u) * p[2];
|
|
float2 b20 = v * b11 + (1 - v) * b10;
|
|
float2 b21 = v * b12 + (1 - v) * b11;
|
|
float2 b30 = w * b21 + (1 - w) * b20;
|
|
return (b30);
|
|
}
|
|
|
|
void mtl_cubic_slice(float2 p[4], float s0, float s1, float2 sp[4])
|
|
{
|
|
/*NOTE: using blossoms to compute sub-curve control points ensure that the fourth point
|
|
of sub-curve (s0, s1) and the first point of sub-curve (s1, s3) match.
|
|
However, due to numerical errors, the evaluation of B(s=0) might not be equal to
|
|
p[0] (and likewise, B(s=1) might not equal p[3]).
|
|
We handle that case explicitly to ensure that we don't create gaps in the paths.
|
|
*/
|
|
sp[0] = (s0 == 0) ? p[0] : mtl_cubic_blossom(p, s0, s0, s0);
|
|
sp[1] = mtl_cubic_blossom(p, s0, s0, s1);
|
|
sp[2] = mtl_cubic_blossom(p, s0, s1, s1);
|
|
sp[3] = (s1 == 1) ? p[3] : mtl_cubic_blossom(p, s1, s1, s1);
|
|
}
|
|
|
|
typedef enum
|
|
{
|
|
MTL_CUBIC_ERROR,
|
|
MTL_CUBIC_SERPENTINE,
|
|
MTL_CUBIC_CUSP,
|
|
MTL_CUBIC_CUSP_INFINITY,
|
|
MTL_CUBIC_LOOP,
|
|
MTL_CUBIC_DEGENERATE_QUADRATIC,
|
|
MTL_CUBIC_DEGENERATE_LINE,
|
|
|
|
} mtl_cubic_kind;
|
|
|
|
typedef struct mtl_cubic_info
|
|
{
|
|
mtl_cubic_kind kind;
|
|
matrix_float4x4 K;
|
|
float2 ts[2];
|
|
float d1;
|
|
float d2;
|
|
float d3;
|
|
|
|
} mtl_cubic_info;
|
|
|
|
mtl_cubic_info mtl_cubic_classify(thread float2* c, mtl_log_context log = { .enabled = false })
|
|
{
|
|
mtl_cubic_info result = { MTL_CUBIC_ERROR };
|
|
matrix_float4x4 F;
|
|
|
|
/*NOTE(martin):
|
|
now, compute determinants d0, d1, d2, d3, which gives the coefficients of the
|
|
inflection points polynomial:
|
|
|
|
I(t, s) = d0*t^3 - 3*d1*t^2*s + 3*d2*t*s^2 - d3*s^3
|
|
|
|
The roots of this polynomial are the inflection points of the parametric curve, in homogeneous
|
|
coordinates (ie we can have an inflection point at inifinity with s=0).
|
|
|
|
|x3 y3 w3| |x3 y3 w3| |x3 y3 w3| |x2 y2 w2|
|
|
d0 = det |x2 y2 w2| d1 = -det |x2 y2 w2| d2 = det |x1 y1 w1| d3 = -det |x1 y1 w1|
|
|
|x1 y1 w1| |x0 y0 w0| |x0 y0 w0| |x0 y0 w0|
|
|
|
|
In our case, the pi.w equal 1 (no point at infinity), so _in_the_power_basis_, w1 = w2 = w3 = 0 and w0 = 1
|
|
(which also means d0 = 0)
|
|
|
|
//WARN: there seems to be a mismatch between the signs of the d_i and the orientation test in the Loop-Blinn paper?
|
|
// flipping the sign of the d_i doesn't change the roots (and the implicit matrix), but it does change the orientation.
|
|
// Keeping the signs of the paper puts the interior on the left of parametric travel, unlike what's stated in the paper.
|
|
// this may very well be an error on my part that's cancelled by flipping the signs of the d_i though!
|
|
*/
|
|
|
|
float d1 = -(c[3].y * c[2].x - c[3].x * c[2].y);
|
|
float d2 = -(c[3].x * c[1].y - c[3].y * c[1].x);
|
|
float d3 = -(c[2].y * c[1].x - c[2].x * c[1].y);
|
|
|
|
result.d1 = d1;
|
|
result.d2 = d2;
|
|
result.d3 = d3;
|
|
|
|
// mtl_log(log, "d1 = ");
|
|
/* mtl_log_f32(log, d1);
|
|
mtl_log(log, ", d2 = ");
|
|
mtl_log_f32(log, d2);
|
|
mtl_log(log, ", d3 = ");
|
|
mtl_log_f32(log, d3);
|
|
mtl_log(log, "\n");
|
|
*/
|
|
//NOTE(martin): compute the second factor of the discriminant discr(I) = d1^2*(3*d2^2 - 4*d3*d1)
|
|
float discrFactor2 = 3.0 * square(d2) - 4.0 * d3 * d1;
|
|
|
|
//NOTE(martin): each following case gives the number of roots, hence the category of the parametric curve
|
|
if(fabs(d1) <= 1e-6 && fabs(d2) <= 1e-6 && fabs(d3) > 1e-6)
|
|
{
|
|
//NOTE(martin): quadratic degenerate case
|
|
//NOTE(martin): compute quadratic curve control point, which is at p0 + 1.5*(p1-p0) = 1.5*p1 - 0.5*p0
|
|
result.kind = MTL_CUBIC_DEGENERATE_QUADRATIC;
|
|
}
|
|
else if((discrFactor2 > 0 && fabs(d1) > 1e-6)
|
|
|| (discrFactor2 == 0 && fabs(d1) > 1e-6))
|
|
{
|
|
//mtl_log(log, "cusp or serpentine\n");
|
|
|
|
//NOTE(martin): serpentine curve or cusp with inflection at infinity
|
|
// (these two cases are handled the same way).
|
|
//NOTE(martin): compute the solutions (tl, sl), (tm, sm), and (tn, sn) of the inflection point equation
|
|
float tmtl[2];
|
|
mtl_quadratic_roots_with_det(1, -2 * d2, (4. / 3. * d1 * d3), (1. / 3.) * discrFactor2, tmtl);
|
|
|
|
float tm = tmtl[0];
|
|
float sm = 2 * d1;
|
|
float tl = tmtl[1];
|
|
float sl = 2 * d1;
|
|
|
|
float invNorm = 1 / sqrt(square(tm) + square(sm));
|
|
tm *= invNorm;
|
|
sm *= invNorm;
|
|
|
|
invNorm = 1 / sqrt(square(tl) + square(sl));
|
|
tl *= invNorm;
|
|
sl *= invNorm;
|
|
|
|
/*NOTE(martin):
|
|
the power basis coefficients of points k,l,m,n are collected into the rows of the 4x4 matrix F:
|
|
|
|
| tl*tm tl^3 tm^3 1 |
|
|
| -sm*tl - sl*tm -3sl*tl^2 -3*sm*tm^2 0 |
|
|
| sl*sm 3*sl^2*tl 3*sm^2*tm 0 |
|
|
| 0 -sl^3 -sm^3 0 |
|
|
*/
|
|
result.kind = (discrFactor2 > 0 && d1 != 0) ? MTL_CUBIC_SERPENTINE : MTL_CUBIC_CUSP;
|
|
|
|
F = (matrix_float4x4){ { tl * tm, -sm * tl - sl * tm, sl * sm, 0 },
|
|
{ cube(tl), -3 * sl * square(tl), 3 * square(sl) * tl, -cube(sl) },
|
|
{ cube(tm), -3 * sm * square(tm), 3 * square(sm) * tm, -cube(sm) },
|
|
{ 1, 0, 0, 0 } };
|
|
|
|
result.ts[0] = (float2){ tm, sm };
|
|
result.ts[1] = (float2){ tl, sl };
|
|
}
|
|
else if(discrFactor2 < 0 && fabs(d1) > 1e-6)
|
|
{
|
|
// mtl_log(log, "loop\n");
|
|
|
|
//NOTE(martin): loop curve
|
|
result.kind = MTL_CUBIC_LOOP;
|
|
|
|
float tetd[2];
|
|
mtl_quadratic_roots_with_det(1, -2 * d2, 4 * (square(d2) - d1 * d3), -discrFactor2, tetd, log);
|
|
|
|
float td = tetd[1];
|
|
float sd = 2 * d1;
|
|
float te = tetd[0];
|
|
float se = 2 * d1;
|
|
|
|
float invNorm = 1 / sqrt(square(td) + square(sd));
|
|
td *= invNorm;
|
|
sd *= invNorm;
|
|
|
|
invNorm = 1 / sqrt(square(te) + square(se));
|
|
te *= invNorm;
|
|
se *= invNorm;
|
|
|
|
//NOTE(martin): if one of the parameters (td/sd) or (te/se) is in the interval [0,1], the double point
|
|
// is inside the control points convex hull and would cause a shading anomaly. If this is
|
|
// the case, subdivide the curve at that point
|
|
/*
|
|
mtl_log(log, "td = ");
|
|
mtl_log_f32(log, td);
|
|
mtl_log(log, ", sd = ");
|
|
mtl_log_f32(log, sd);
|
|
mtl_log(log, ", te = ");
|
|
mtl_log_f32(log, te);
|
|
mtl_log(log, ", se = ");
|
|
mtl_log_f32(log, td);
|
|
mtl_log(log, ", td/sd = ");
|
|
mtl_log_f32(log, td/sd);
|
|
mtl_log(log, ", te/se = ");
|
|
mtl_log_f32(log, te/se);
|
|
mtl_log(log, "\n");
|
|
//*/
|
|
/*NOTE(martin):
|
|
the power basis coefficients of points k,l,m,n are collected into the rows of the 4x4 matrix F:
|
|
|
|
| td*te td^2*te td*te^2 1 |
|
|
| -se*td - sd*te -se*td^2 - 2sd*te*td -sd*te^2 - 2*se*td*te 0 |
|
|
| sd*se te*sd^2 + 2*se*td*sd td*se^2 + 2*sd*te*se 0 |
|
|
| 0 -sd^2*se -sd*se^2 0 |
|
|
*/
|
|
F = (matrix_float4x4){ { td * te, -se * td - sd * te, sd * se, 0 },
|
|
{ square(td) * te, -se * square(td) - 2 * sd * te * td, te * square(sd) + 2 * se * td * sd, -square(sd) * se },
|
|
{ td * square(te), -sd * square(te) - 2 * se * td * te, td * square(se) + 2 * sd * te * se, -sd * square(se) },
|
|
{ 1, 0, 0, 0 } };
|
|
|
|
result.ts[0] = (float2){ td, sd };
|
|
result.ts[1] = (float2){ te, se };
|
|
}
|
|
else if(d2 != 0)
|
|
{
|
|
//NOTE(martin): cusp with cusp at infinity
|
|
|
|
// mtl_log(log, "cusp at infinity\n");
|
|
|
|
float tl = d3;
|
|
float sl = 3 * d2;
|
|
|
|
float invNorm = 1 / sqrt(square(tl) + square(sl));
|
|
tl *= invNorm;
|
|
sl *= invNorm;
|
|
|
|
/*NOTE(martin):
|
|
the power basis coefficients of points k,l,m,n are collected into the rows of the 4x4 matrix F:
|
|
|
|
| tl tl^3 1 1 |
|
|
| -sl -3sl*tl^2 0 0 |
|
|
| 0 3*sl^2*tl 0 0 |
|
|
| 0 -sl^3 0 0 |
|
|
*/
|
|
result.kind = MTL_CUBIC_CUSP_INFINITY;
|
|
|
|
F = (matrix_float4x4){ { tl, -sl, 0, 0 },
|
|
{ cube(tl), -3 * sl * square(tl), 3 * square(sl) * tl, -cube(sl) },
|
|
{ 1, 0, 0, 0 },
|
|
{ 1, 0, 0, 0 } };
|
|
|
|
result.ts[0] = (float2){ tl, sl };
|
|
result.ts[1] = (float2){ 0, 0 };
|
|
}
|
|
else
|
|
{
|
|
//NOTE(martin): line or point degenerate case
|
|
result.kind = MTL_CUBIC_DEGENERATE_LINE;
|
|
}
|
|
|
|
/*
|
|
F is then multiplied by M3^(-1) on the left which yelds the bezier coefficients k, l, m, n
|
|
at the control points.
|
|
|
|
| 1 0 0 0 |
|
|
M3^(-1) = | 1 1/3 0 0 |
|
|
| 1 2/3 1/3 0 |
|
|
| 1 1 1 1 |
|
|
*/
|
|
matrix_float4x4 invM3 = { { 1, 1, 1, 1 },
|
|
{ 0, 1. / 3., 2. / 3., 1 },
|
|
{ 0, 0, 1. / 3., 1 },
|
|
{ 0, 0, 0, 1 } };
|
|
|
|
result.K = transpose(invM3 * F);
|
|
|
|
return (result);
|
|
}
|
|
|
|
float2 mtl_select_hull_vertex(float2 p0, float2 p1, float2 p2, float2 p3, mtl_log_context log)
|
|
{
|
|
/*NOTE: check intersection of lines (p1-p0) and (p3-p2)
|
|
P = p0 + u(p1-p0)
|
|
P = p2 + w(p3-p2)
|
|
|
|
control points are inside a right triangle so we should always find an intersection
|
|
*/
|
|
float2 pm;
|
|
|
|
float det = (p1.x - p0.x) * (p3.y - p2.y) - (p1.y - p0.y) * (p3.x - p2.x);
|
|
float sqrNorm0 = length_squared(p1 - p0);
|
|
float sqrNorm1 = length_squared(p2 - p3);
|
|
|
|
if(fabs(det) < 1e-3 || sqrNorm0 < 0.1 || sqrNorm1 < 0.1)
|
|
{
|
|
if(sqrNorm0 < sqrNorm1)
|
|
{
|
|
pm = p2;
|
|
}
|
|
else
|
|
{
|
|
pm = p1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
float u = ((p0.x - p2.x) * (p2.y - p3.y) - (p0.y - p2.y) * (p2.x - p3.x)) / det;
|
|
pm = p0 + u * (p1 - p0);
|
|
}
|
|
return (pm);
|
|
}
|
|
|
|
void mtl_cubic_emit(thread mtl_segment_setup_context* context, mtl_cubic_info curve, float2 p[4], float s0, float s1, float2 sp[4])
|
|
{
|
|
device oc_mtl_segment* seg = mtl_segment_push(context, sp, OC_MTL_CUBIC);
|
|
|
|
if(seg)
|
|
{
|
|
float2 v0 = p[0];
|
|
float2 v1 = p[3];
|
|
float2 v2;
|
|
matrix_float3x3 K;
|
|
|
|
float sqrNorm0 = length_squared(p[1] - p[0]);
|
|
float sqrNorm1 = length_squared(p[2] - p[3]);
|
|
|
|
//TODO: should not be the local sub-curve, but the global curve!!!
|
|
if(length_squared(p[0] - p[3]) > 1e-5)
|
|
{
|
|
if(sqrNorm0 >= sqrNorm1)
|
|
{
|
|
v2 = p[1];
|
|
K = { curve.K[0].xyz, curve.K[3].xyz, curve.K[1].xyz };
|
|
}
|
|
else
|
|
{
|
|
v2 = p[2];
|
|
K = { curve.K[0].xyz, curve.K[3].xyz, curve.K[2].xyz };
|
|
}
|
|
}
|
|
else
|
|
{
|
|
v1 = p[1];
|
|
v2 = p[2];
|
|
K = { curve.K[0].xyz, curve.K[1].xyz, curve.K[2].xyz };
|
|
}
|
|
//NOTE: set matrices
|
|
|
|
//TODO: should we compute matrix relative to a base point to avoid loss of precision
|
|
// when computing barycentric matrix?
|
|
|
|
matrix_float3x3 B = mtl_barycentric_matrix(v0, v1, v2);
|
|
seg->implicitMatrix = K * B;
|
|
seg->hullVertex = mtl_select_hull_vertex(sp[0], sp[1], sp[2], sp[3], context->log);
|
|
|
|
//NOTE: compute sign flip
|
|
seg->sign = 1;
|
|
|
|
if(curve.kind == MTL_CUBIC_SERPENTINE
|
|
|| curve.kind == MTL_CUBIC_CUSP)
|
|
{
|
|
seg->sign = (curve.d1 < 0) ? -1 : 1;
|
|
}
|
|
else if(curve.kind == MTL_CUBIC_LOOP)
|
|
{
|
|
float d1 = curve.d1;
|
|
float d2 = curve.d2;
|
|
float d3 = curve.d3;
|
|
|
|
float H0 = d3 * d1 - square(d2) + d1 * d2 * s0 - square(d1) * square(s0);
|
|
float H1 = d3 * d1 - square(d2) + d1 * d2 * s1 - square(d1) * square(s1);
|
|
float H = (abs(H0) > abs(H1)) ? H0 : H1;
|
|
seg->sign = (H * d1 > 0) ? -1 : 1;
|
|
}
|
|
|
|
if(sp[3].y > sp[0].y)
|
|
{
|
|
seg->sign *= -1;
|
|
}
|
|
|
|
//NOTE: bin to tiles
|
|
mtl_segment_bin_to_tiles(context, seg);
|
|
}
|
|
}
|
|
|
|
void mtl_cubic_setup(thread mtl_segment_setup_context* context, float2 p[4])
|
|
{
|
|
/*NOTE(martin): first convert the control points to power basis, multiplying by M3
|
|
|
|
| 1 0 0 0| |p0| |c0|
|
|
M3 = |-3 3 0 0|, B = |p1|, C = |c1| = M3*B
|
|
| 3 -6 3 0| |p2| |c2|
|
|
|-1 3 -3 1| |p3| |c3|
|
|
*/
|
|
float2 c[4] = {
|
|
p[0],
|
|
3.0 * (p[1] - p[0]),
|
|
3.0 * (p[0] + p[2] - 2 * p[1]),
|
|
3.0 * (p[1] - p[2]) + p[3] - p[0]
|
|
};
|
|
|
|
/*
|
|
mtl_log(context->log, "bezier basis: ");
|
|
log_cubic_bezier(p, context->log);
|
|
|
|
mtl_log(context->log, "power basis: ");
|
|
log_cubic_bezier(c, context->log);
|
|
*/
|
|
//NOTE: get classification, implicit matrix, double points and inflection points
|
|
mtl_cubic_info curve = mtl_cubic_classify(c, context->log);
|
|
|
|
if(curve.kind == MTL_CUBIC_DEGENERATE_LINE)
|
|
{
|
|
float2 l[2] = { p[0], p[3] };
|
|
mtl_line_setup(context, l);
|
|
return;
|
|
}
|
|
else if(curve.kind == MTL_CUBIC_DEGENERATE_QUADRATIC)
|
|
{
|
|
float2 quadPoint = float2(1.5 * p[1].x - 0.5 * p[0].x, 1.5 * p[1].y - 0.5 * p[0].y);
|
|
float2 q[3] = { p[0], quadPoint, p[3] };
|
|
mtl_quadratic_setup(context, q);
|
|
return;
|
|
}
|
|
|
|
//NOTE: get the roots of B'(s) = 3.c3.s^2 + 2.c2.s + c1
|
|
float roots[6];
|
|
int rootCount = mtl_quadratic_roots(3 * c[3].x, 2 * c[2].x, c[1].x, roots);
|
|
rootCount += mtl_quadratic_roots(3 * c[3].y, 2 * c[2].y, c[1].y, roots + rootCount);
|
|
|
|
//NOTE: add double points and inflection points to roots if finite
|
|
for(int i = 0; i < 2; i++)
|
|
{
|
|
if(curve.ts[i].y)
|
|
{
|
|
roots[rootCount] = curve.ts[i].x / curve.ts[i].y;
|
|
rootCount++;
|
|
}
|
|
}
|
|
|
|
//NOTE: sort roots
|
|
for(int i = 1; i < rootCount; i++)
|
|
{
|
|
float tmp = roots[i];
|
|
int j = i - 1;
|
|
while(j >= 0 && roots[j] > tmp)
|
|
{
|
|
roots[j + 1] = roots[j];
|
|
j--;
|
|
}
|
|
roots[j + 1] = tmp;
|
|
}
|
|
|
|
//NOTE: compute split points
|
|
float splits[8];
|
|
int splitCount = 0;
|
|
splits[0] = 0;
|
|
splitCount++;
|
|
for(int i = 0; i < rootCount; i++)
|
|
{
|
|
if(roots[i] > 0 && roots[i] < 1)
|
|
{
|
|
splits[splitCount] = roots[i];
|
|
splitCount++;
|
|
}
|
|
}
|
|
splits[splitCount] = 1;
|
|
splitCount++;
|
|
|
|
mtl_log(context->log, "monotonic segment count = ");
|
|
mtl_log_i32(context->log, splitCount - 1);
|
|
mtl_log(context->log, "\n");
|
|
|
|
//NOTE: for each monotonic segment, compute hull matrix and sign, and emit segment
|
|
for(int sliceIndex = 0; sliceIndex < splitCount - 1; sliceIndex++)
|
|
{
|
|
float s0 = splits[sliceIndex];
|
|
float s1 = splits[sliceIndex + 1];
|
|
float2 sp[4];
|
|
mtl_cubic_slice(p, s0, s1, sp);
|
|
|
|
/////////////////////// we should ensure that these have the same endpoints as the original curve and all endpoints are joining
|
|
|
|
mtl_log(context->log, "monotonic slice ");
|
|
mtl_log_i32(context->log, sliceIndex);
|
|
mtl_log(context->log, " ( ");
|
|
mtl_log_f32(context->log, s0);
|
|
mtl_log(context->log, " <= s <= ");
|
|
mtl_log_f32(context->log, s1);
|
|
mtl_log(context->log, " ): ");
|
|
log_cubic_bezier(sp, context->log);
|
|
|
|
mtl_cubic_emit(context, curve, p, s0, s1, sp);
|
|
}
|
|
}
|
|
|
|
kernel void mtl_segment_setup(constant int* elementCount [[buffer(0)]],
|
|
const device oc_mtl_path_elt* elementBuffer [[buffer(1)]],
|
|
device atomic_int* segmentCount [[buffer(2)]],
|
|
device oc_mtl_segment* segmentBuffer [[buffer(3)]],
|
|
const device oc_mtl_path_queue* pathQueueBuffer [[buffer(4)]],
|
|
device oc_mtl_tile_queue* tileQueueBuffer [[buffer(5)]],
|
|
device oc_mtl_tile_op* tileOpBuffer [[buffer(6)]],
|
|
device atomic_int* tileOpCount [[buffer(7)]],
|
|
constant int* tileOpMax [[buffer(8)]],
|
|
constant int* segmentMax [[buffer(9)]],
|
|
constant int* tileSize [[buffer(10)]],
|
|
constant float* scale [[buffer(11)]],
|
|
|
|
device char* logBuffer [[buffer(12)]],
|
|
device atomic_int* logOffsetBuffer [[buffer(13)]],
|
|
uint eltIndex [[thread_position_in_grid]])
|
|
{
|
|
const device oc_mtl_path_elt* elt = &elementBuffer[eltIndex];
|
|
const device oc_mtl_path_queue* pathQueue = &pathQueueBuffer[elt->pathIndex];
|
|
device oc_mtl_tile_queue* tileQueues = &tileQueueBuffer[pathQueue->tileQueues];
|
|
|
|
mtl_segment_setup_context setupCtx = {
|
|
.pathIndex = elt->pathIndex,
|
|
.segmentCount = segmentCount,
|
|
.segmentBuffer = segmentBuffer,
|
|
.pathQueue = pathQueue,
|
|
.tileQueues = tileQueues,
|
|
.tileOpBuffer = tileOpBuffer,
|
|
.tileOpCount = tileOpCount,
|
|
.tileOpMax = tileOpMax[0],
|
|
.segmentMax = segmentMax[0],
|
|
.tileSize = tileSize[0],
|
|
.log.buffer = logBuffer,
|
|
.log.offset = logOffsetBuffer,
|
|
.log.enabled = false,
|
|
};
|
|
|
|
switch(elt->kind)
|
|
{
|
|
case OC_MTL_LINE:
|
|
{
|
|
float2 p[2] = { elt->p[0] * scale[0], elt->p[1] * scale[0] };
|
|
mtl_log(setupCtx.log, "line: ");
|
|
log_line(p, setupCtx.log);
|
|
mtl_line_setup(&setupCtx, p);
|
|
}
|
|
break;
|
|
|
|
case OC_MTL_QUADRATIC:
|
|
{
|
|
float2 p[3] = { elt->p[0] * scale[0], elt->p[1] * scale[0], elt->p[2] * scale[0] };
|
|
mtl_log(setupCtx.log, "quadratic: ");
|
|
log_quadratic_bezier(p, setupCtx.log);
|
|
mtl_quadratic_setup(&setupCtx, p);
|
|
}
|
|
break;
|
|
|
|
case OC_MTL_CUBIC:
|
|
{
|
|
float2 p[4] = { elt->p[0] * scale[0], elt->p[1] * scale[0], elt->p[2] * scale[0], elt->p[3] * scale[0] };
|
|
mtl_log(setupCtx.log, "cubic: ");
|
|
log_cubic_bezier(p, setupCtx.log);
|
|
mtl_cubic_setup(&setupCtx, p);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
kernel void mtl_backprop(const device oc_mtl_path_queue* pathQueueBuffer [[buffer(0)]],
|
|
device oc_mtl_tile_queue* tileQueueBuffer [[buffer(1)]],
|
|
device char* logBuffer [[buffer(2)]],
|
|
device atomic_int* logOffsetBuffer [[buffer(3)]],
|
|
device int* segmentCount [[buffer(4)]],
|
|
uint pathIndex [[threadgroup_position_in_grid]],
|
|
uint localID [[thread_position_in_threadgroup]],
|
|
uint uid [[thread_position_in_grid]])
|
|
{
|
|
mtl_log_context log = { .buffer = logBuffer, .offset = logOffsetBuffer, .enabled = false };
|
|
|
|
if(uid == 0)
|
|
{
|
|
mtl_log(log, "segmentCount = ");
|
|
mtl_log_i32(log, segmentCount[0]);
|
|
mtl_log(log, "\n");
|
|
}
|
|
|
|
threadgroup atomic_int nextRowIndex;
|
|
if(localID == 0)
|
|
{
|
|
atomic_store_explicit(&nextRowIndex, 0, memory_order_relaxed);
|
|
}
|
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
int rowIndex = 0;
|
|
const device oc_mtl_path_queue* pathQueue = &pathQueueBuffer[pathIndex];
|
|
device oc_mtl_tile_queue* tiles = &tileQueueBuffer[pathQueue->tileQueues];
|
|
int rowSize = pathQueue->area.z;
|
|
int rowCount = pathQueue->area.w;
|
|
|
|
rowIndex = atomic_fetch_add_explicit(&nextRowIndex, 1, memory_order_relaxed);
|
|
while(rowIndex < rowCount)
|
|
{
|
|
device oc_mtl_tile_queue* row = &tiles[rowIndex * rowSize];
|
|
int sum = 0;
|
|
for(int x = rowSize - 1; x >= 0; x--)
|
|
{
|
|
device oc_mtl_tile_queue* tile = &row[x];
|
|
int offset = *(device int*)&tile->windingOffset;
|
|
*(device int*)(&tile->windingOffset) = sum;
|
|
sum += offset;
|
|
}
|
|
rowIndex = atomic_fetch_add_explicit(&nextRowIndex, 1, memory_order_relaxed);
|
|
}
|
|
}
|
|
|
|
kernel void mtl_merge(constant int* pathCount [[buffer(0)]],
|
|
const device oc_mtl_path* pathBuffer [[buffer(1)]],
|
|
const device oc_mtl_path_queue* pathQueueBuffer [[buffer(2)]],
|
|
const device oc_mtl_tile_queue* tileQueueBuffer [[buffer(3)]],
|
|
device oc_mtl_tile_op* tileOpBuffer [[buffer(4)]],
|
|
device atomic_int* tileOpCount [[buffer(5)]],
|
|
device MTLDispatchThreadgroupsIndirectArguments* dispatchBuffer [[buffer(6)]],
|
|
device oc_mtl_screen_tile* screenTilesBuffer [[buffer(7)]],
|
|
constant int* tileOpMax [[buffer(8)]],
|
|
constant int* tileSize [[buffer(9)]],
|
|
constant float* scale [[buffer(10)]],
|
|
device char* logBuffer [[buffer(11)]],
|
|
device atomic_int* logOffsetBuffer [[buffer(12)]],
|
|
uint2 threadCoord [[thread_position_in_grid]],
|
|
uint2 gridSize [[threads_per_grid]])
|
|
{
|
|
int2 tileCoord = int2(threadCoord);
|
|
int tileIndex = -1;
|
|
device int* nextLink = 0;
|
|
|
|
/*
|
|
mtl_log_context log = {.buffer = logBuffer,
|
|
.offset = logOffsetBuffer,
|
|
.enabled = true};
|
|
*/
|
|
dispatchBuffer[0].threadgroupsPerGrid[1] = 1;
|
|
dispatchBuffer[0].threadgroupsPerGrid[2] = 1;
|
|
|
|
for(int pathIndex = 0; pathIndex < pathCount[0]; pathIndex++)
|
|
{
|
|
const device oc_mtl_path_queue* pathQueue = &pathQueueBuffer[pathIndex];
|
|
int2 pathTileCoord = tileCoord - pathQueue->area.xy;
|
|
|
|
const device oc_mtl_path* path = &pathBuffer[pathIndex];
|
|
float xMax = min(path->box.z, path->clip.z);
|
|
int tileMaxX = xMax * scale[0] / tileSize[0];
|
|
int pathTileMaxX = tileMaxX - pathQueue->area.x;
|
|
|
|
if(pathTileCoord.x >= 0
|
|
&& pathTileCoord.x <= pathTileMaxX
|
|
&& pathTileCoord.y >= 0
|
|
&& pathTileCoord.y < pathQueue->area.w)
|
|
{
|
|
if(tileIndex < 0)
|
|
{
|
|
tileIndex = atomic_fetch_add_explicit((device atomic_uint*)&dispatchBuffer[0].threadgroupsPerGrid[0], 1, memory_order_relaxed);
|
|
screenTilesBuffer[tileIndex].tileCoord = uint2(tileCoord);
|
|
nextLink = &screenTilesBuffer[tileIndex].first;
|
|
*nextLink = -1;
|
|
}
|
|
|
|
int pathTileIndex = pathTileCoord.y * pathQueue->area.z + pathTileCoord.x;
|
|
const device oc_mtl_tile_queue* tileQueue = &tileQueueBuffer[pathQueue->tileQueues + pathTileIndex];
|
|
|
|
int windingOffset = atomic_load_explicit(&tileQueue->windingOffset, memory_order_relaxed);
|
|
int firstOpIndex = atomic_load_explicit(&tileQueue->first, memory_order_relaxed);
|
|
|
|
float4 tileBox = float4(tileCoord.x, tileCoord.y, tileCoord.x + 1, tileCoord.y + 1);
|
|
tileBox *= tileSize[0];
|
|
float4 clip = pathBuffer[pathIndex].clip * scale[0];
|
|
|
|
if(tileBox.x >= clip.z
|
|
|| tileBox.z < clip.x
|
|
|| tileBox.y >= clip.w
|
|
|| tileBox.w < clip.y)
|
|
{
|
|
//NOTE: tile is fully outside clip, cull it
|
|
}
|
|
else if(firstOpIndex == -1)
|
|
{
|
|
if(windingOffset & 1)
|
|
{
|
|
//NOTE: tile is full covered. Add path start op (with winding offset).
|
|
// Additionally if color is opaque and tile is fully inside clip, trim tile list.
|
|
int pathOpIndex = atomic_fetch_add_explicit(tileOpCount, 1, memory_order_relaxed);
|
|
|
|
if(pathOpIndex >= tileOpMax[0])
|
|
{
|
|
return;
|
|
}
|
|
|
|
device oc_mtl_tile_op* pathOp = &tileOpBuffer[pathOpIndex];
|
|
pathOp->kind = OC_MTL_OP_CLIP_FILL;
|
|
pathOp->next = -1;
|
|
pathOp->index = pathIndex;
|
|
pathOp->windingOffset = windingOffset;
|
|
|
|
*nextLink = pathOpIndex;
|
|
|
|
if(tileBox.x >= clip.x
|
|
&& tileBox.z < clip.z
|
|
&& tileBox.y >= clip.y
|
|
&& tileBox.w < clip.w)
|
|
{
|
|
pathOp->kind = OC_MTL_OP_FILL;
|
|
|
|
if(pathBuffer[pathIndex].color.a == 1 && pathBuffer[pathIndex].texture < 0)
|
|
{
|
|
screenTilesBuffer[tileIndex].first = pathOpIndex;
|
|
}
|
|
}
|
|
nextLink = &pathOp->next;
|
|
}
|
|
// else, tile is fully uncovered, skip path
|
|
}
|
|
else
|
|
{
|
|
//NOTE: add path start op (with winding offset)
|
|
int startOpIndex = atomic_fetch_add_explicit(tileOpCount, 1, memory_order_relaxed);
|
|
if(startOpIndex >= tileOpMax[0])
|
|
{
|
|
return;
|
|
}
|
|
|
|
device oc_mtl_tile_op* startOp = &tileOpBuffer[startOpIndex];
|
|
startOp->kind = OC_MTL_OP_START;
|
|
startOp->next = -1;
|
|
startOp->index = pathIndex;
|
|
startOp->windingOffset = windingOffset;
|
|
|
|
*nextLink = startOpIndex;
|
|
nextLink = &startOp->next;
|
|
|
|
//NOTE: chain remaining path ops to end of tile list
|
|
int lastOpIndex = tileQueue->last;
|
|
device oc_mtl_tile_op* lastOp = &tileOpBuffer[lastOpIndex];
|
|
*nextLink = firstOpIndex;
|
|
nextLink = &lastOp->next;
|
|
|
|
//NOTE: add path end op
|
|
int endOpIndex = atomic_fetch_add_explicit(tileOpCount, 1, memory_order_relaxed);
|
|
if(endOpIndex >= tileOpMax[0])
|
|
{
|
|
return;
|
|
}
|
|
|
|
device oc_mtl_tile_op* endOp = &tileOpBuffer[endOpIndex];
|
|
endOp->kind = OC_MTL_OP_END;
|
|
endOp->next = -1;
|
|
endOp->index = pathIndex;
|
|
|
|
*nextLink = endOpIndex;
|
|
nextLink = &endOp->next;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
kernel void mtl_raster(const device oc_mtl_screen_tile* screenTilesBuffer [[buffer(0)]],
|
|
const device oc_mtl_tile_op* tileOpBuffer [[buffer(1)]],
|
|
const device oc_mtl_path* pathBuffer [[buffer(2)]],
|
|
const device oc_mtl_segment* segmentBuffer [[buffer(3)]],
|
|
constant int* tileSize [[buffer(4)]],
|
|
constant float* scale [[buffer(5)]],
|
|
constant int* sampleCountBuffer [[buffer(6)]],
|
|
device char* logBuffer [[buffer(7)]],
|
|
device atomic_int* logOffsetBuffer [[buffer(8)]],
|
|
texture2d<float, access::write> outTexture [[texture(0)]],
|
|
array<texture2d<float>, OC_MTL_MAX_IMAGES_PER_BATCH> srcTextures [[texture(1)]],
|
|
uint2 threadGroupCoord [[threadgroup_position_in_grid]],
|
|
uint2 localCoord [[thread_position_in_threadgroup]])
|
|
{
|
|
/*
|
|
mtl_log_context log = {.buffer = logBuffer,
|
|
.offset = logOffsetBuffer,
|
|
.enabled = true};
|
|
*/
|
|
int tileIndex = int(threadGroupCoord.x);
|
|
uint2 tileCoord = screenTilesBuffer[tileIndex].tileCoord;
|
|
uint2 pixelCoord = tileCoord * tileSize[0] + localCoord.xy;
|
|
|
|
int opIndex = screenTilesBuffer[tileIndex].first;
|
|
|
|
const int OC_MTL_MAX_SAMPLE_COUNT = 8;
|
|
float2 sampleCoords[OC_MTL_MAX_SAMPLE_COUNT];
|
|
int sampleCount = sampleCountBuffer[0];
|
|
float2 centerCoord = float2(pixelCoord) + float2(0.5, 0.5);
|
|
|
|
if(sampleCount == 8)
|
|
{
|
|
sampleCount = 8;
|
|
sampleCoords[0] = centerCoord + float2(1, 3) / 16;
|
|
sampleCoords[1] = centerCoord + float2(-1, -3) / 16;
|
|
sampleCoords[2] = centerCoord + float2(5, -1) / 16;
|
|
sampleCoords[3] = centerCoord + float2(-3, 5) / 16;
|
|
sampleCoords[4] = centerCoord + float2(-5, -5) / 16;
|
|
sampleCoords[5] = centerCoord + float2(-7, 1) / 16;
|
|
sampleCoords[6] = centerCoord + float2(3, -7) / 16;
|
|
sampleCoords[7] = centerCoord + float2(7, 7) / 16;
|
|
}
|
|
else
|
|
{
|
|
sampleCount = 1;
|
|
sampleCoords[0] = centerCoord;
|
|
}
|
|
|
|
const int OC_MTL_MAX_SRC_SAMPLE_COUNT = 4;
|
|
const int srcSampleCount = 2;
|
|
|
|
const float2 imgSampleCoords[OC_MTL_MAX_SRC_SAMPLE_COUNT] = {
|
|
centerCoord + float2(-0.25, 0.25),
|
|
centerCoord + float2(+0.25, +0.25),
|
|
centerCoord + float2(+0.25, -0.25),
|
|
centerCoord + float2(-0.25, +0.25)
|
|
};
|
|
|
|
float4 color = { 0 };
|
|
int winding[OC_MTL_MAX_SAMPLE_COUNT] = { 0 };
|
|
|
|
while(opIndex != -1)
|
|
{
|
|
const device oc_mtl_tile_op* op = &tileOpBuffer[opIndex];
|
|
int pathIndex = op->index;
|
|
|
|
if(op->kind == OC_MTL_OP_START)
|
|
{
|
|
for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
|
|
{
|
|
winding[sampleIndex] = op->windingOffset;
|
|
}
|
|
}
|
|
else if(op->kind == OC_MTL_OP_SEGMENT)
|
|
{
|
|
const device oc_mtl_segment* seg = &segmentBuffer[op->index];
|
|
|
|
for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
|
|
{
|
|
float2 sampleCoord = sampleCoords[sampleIndex];
|
|
|
|
if((sampleCoord.y > seg->box.y)
|
|
&& (sampleCoord.y <= seg->box.w)
|
|
&& (mtl_side_of_segment(sampleCoord, seg) < 0))
|
|
{
|
|
winding[sampleIndex] += seg->windingIncrement;
|
|
}
|
|
|
|
if(op->crossRight)
|
|
{
|
|
if((seg->config == OC_MTL_BR || seg->config == OC_MTL_TL)
|
|
&& (sampleCoord.y > seg->box.w))
|
|
{
|
|
winding[sampleIndex] += seg->windingIncrement;
|
|
}
|
|
else if((seg->config == OC_MTL_BL || seg->config == OC_MTL_TR)
|
|
&& (sampleCoord.y > seg->box.y))
|
|
{
|
|
winding[sampleIndex] -= seg->windingIncrement;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
float4 nextColor = pathBuffer[pathIndex].color;
|
|
nextColor.rgb *= nextColor.a;
|
|
|
|
int textureIndex = pathBuffer[pathIndex].texture;
|
|
if(textureIndex >= 0 && textureIndex < OC_MTL_MAX_IMAGES_PER_BATCH)
|
|
{
|
|
constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear);
|
|
|
|
float4 texColor = { 0 };
|
|
for(int sampleIndex = 0; sampleIndex < srcSampleCount; sampleIndex++)
|
|
{
|
|
float2 sampleCoord = imgSampleCoords[sampleIndex];
|
|
float3 ph = float3(sampleCoord.xy, 1);
|
|
float2 uv = (pathBuffer[pathIndex].uvTransform * ph).xy;
|
|
|
|
texColor += srcTextures[textureIndex].sample(smp, uv);
|
|
}
|
|
texColor /= srcSampleCount;
|
|
texColor.rgb *= texColor.a;
|
|
nextColor *= texColor;
|
|
}
|
|
|
|
if(op->kind == OC_MTL_OP_FILL)
|
|
{
|
|
color = color * (1 - nextColor.a) + nextColor;
|
|
}
|
|
else
|
|
{
|
|
float4 clip = pathBuffer[pathIndex].clip * scale[0];
|
|
float coverage = 0;
|
|
|
|
for(int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
|
|
{
|
|
float2 sampleCoord = sampleCoords[sampleIndex];
|
|
|
|
if(sampleCoord.x >= clip.x
|
|
&& sampleCoord.x < clip.z
|
|
&& sampleCoord.y >= clip.y
|
|
&& sampleCoord.y < clip.w)
|
|
{
|
|
bool filled = op->kind == OC_MTL_OP_CLIP_FILL
|
|
|| (pathBuffer[pathIndex].cmd == OC_MTL_FILL && (winding[sampleIndex] & 1))
|
|
|| (pathBuffer[pathIndex].cmd == OC_MTL_STROKE && (winding[sampleIndex] != 0));
|
|
if(filled)
|
|
{
|
|
coverage++;
|
|
}
|
|
}
|
|
}
|
|
coverage /= sampleCount;
|
|
color = coverage * (color * (1 - nextColor.a) + nextColor) + (1 - coverage) * color;
|
|
}
|
|
}
|
|
opIndex = op->next;
|
|
}
|
|
|
|
/*
|
|
if((pixelCoord.x % tileSize[0] == 0) || (pixelCoord.y % tileSize[0] == 0))
|
|
{
|
|
color = float4(0, 0, 0, 1);
|
|
}
|
|
//*/
|
|
outTexture.write(color, pixelCoord);
|
|
}
|
|
|
|
//------------------------------------------------------------------------------------
|
|
// Blit shader
|
|
//------------------------------------------------------------------------------------
|
|
struct vs_out
|
|
{
|
|
float4 pos [[position]];
|
|
float2 uv;
|
|
};
|
|
|
|
vertex vs_out mtl_vertex_shader(ushort vid [[vertex_id]])
|
|
{
|
|
vs_out out;
|
|
out.uv = float2((vid << 1) & 2, vid & 2);
|
|
out.pos = float4(out.uv * float2(2, -2) + float2(-1, 1), 0, 1);
|
|
return (out);
|
|
}
|
|
|
|
fragment float4 mtl_fragment_shader(vs_out i [[stage_in]], texture2d<float, access::sample> tex [[texture(0)]])
|
|
{
|
|
constexpr sampler smp(mip_filter::nearest, mag_filter::linear, min_filter::linear);
|
|
return (tex.sample(smp, i.uv));
|
|
}
|