//NOTE: a work group is a single invocation; main() uses gl_WorkGroupID.x to select the path element it processes
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

precision mediump float;
//NOTE: std430 is the default layout of all the buffer blocks declared below
layout(std430) buffer;

//NOTE: input path elements (read-only). main() reads elements[elementBufferStart + gl_WorkGroupID.x]
layout(binding = 0) restrict readonly buffer elementBufferSSBO
{
    oc_gl_path_elt elements[];
}

elementBuffer;

//NOTE: elements[0] is the segment allocation counter, incremented with atomicAdd() in push_segment()
layout(binding = 1) coherent restrict buffer segmentCountBufferSSBO
{
    int elements[];
}

segmentCountBuffer;

//NOTE: segments written by push_segment() and the *_setup()/*_emit() functions, and read back by bin_to_tiles()
layout(binding = 2) restrict buffer segmentBufferSSBO
{
    oc_gl_segment elements[];
}

segmentBuffer;

//NOTE: per-path queues, indexed by the segment's pathIndex. bin_to_tiles() reads their area and tileQueues fields
layout(binding = 3) restrict buffer pathQueueBufferSSBO
{
    oc_gl_path_queue elements[];
}

pathQueueBuffer;

//NOTE: per-tile queues. bin_to_tiles() updates first (atomicExchange), last, and windingOffset (atomicAdd)
layout(binding = 4) coherent restrict buffer tileQueueBufferSSBO
{
    oc_gl_tile_queue elements[];
}

tileQueueBuffer;

//NOTE: elements[0] is the tile op allocation counter, incremented with atomicAdd() in bin_to_tiles()
layout(binding = 5) coherent restrict buffer tileOpCountBufferSSBO
{
    int elements[];
}

tileOpCountBuffer;

//NOTE: tile ops written by bin_to_tiles(), linked together through their next field
layout(binding = 6) restrict buffer tileOpBufferSSBO
{
    oc_gl_tile_op elements[];
}

tileOpBuffer;

//NOTE: scale is applied to every control point in main()
layout(location = 0) uniform float scale;
//NOTE: tileSize converts between segment box coordinates and tile coordinates in bin_to_tiles()
layout(location = 1) uniform uint tileSize;
//NOTE: elementBufferStart is the offset of the first element to process in elementBuffer
layout(location = 2) uniform int elementBufferStart;

/*NOTE: adds the segment at segIndex to the queue of every tile it touches.
        The tiles visited are those covered by the segment's bounding box, clipped to the tile area
        of the segment's path. A tile is considered touched if the segment crosses one of its edges,
        or if one of the two reference corners of the segment's box lies inside the tile.
*/
void bin_to_tiles(int segIndex)
{
    const oc_gl_segment seg = segmentBuffer.elements[segIndex];
    const oc_gl_path_queue pathQueue = pathQueueBuffer.elements[seg.pathIndex];

    ivec4 area = pathQueue.area;
    ivec4 covered = ivec4(seg.box) / int(tileSize);

    //NOTE: range of tiles to visit, relative to the origin of the path's area
    int firstX = max(0, covered.x - area.x);
    int firstY = max(0, covered.y - area.y);
    int lastX = min(covered.z - area.x, area.z - 1);
    int lastY = min(covered.w - area.y, area.w - 1);

    //NOTE: the two box corners tested for containment only depend on the segment's config,
    //      so we select them once, outside of the tile loops.
    vec2 s0, s1;
    if(seg.config == OC_GL_TL || seg.config == OC_GL_BR)
    {
        s0 = seg.box.xy;
        s1 = seg.box.zw;
    }
    else
    {
        s0 = seg.box.xw;
        s1 = seg.box.zy;
    }

    for(int ty = firstY; ty <= lastY; ty++)
    {
        for(int tx = firstX; tx <= lastX; tx++)
        {
            //NOTE: corners of the tile, in the same coordinates as the segment box
            vec2 tileMin = vec2(float(tx + area.x), float(ty + area.y)) * float(tileSize);
            vec2 tileMax = vec2(float(tx + area.x + 1), float(ty + area.y + 1)) * float(tileSize);

            int sideBL = side_of_segment(tileMin, seg);
            int sideBR = side_of_segment(vec2(tileMax.x, tileMin.y), seg);
            int sideTR = side_of_segment(tileMax, seg);
            int sideTL = side_of_segment(vec2(tileMin.x, tileMax.y), seg);

            //NOTE: an edge is crossed when its two corners are on strictly opposite sides of the segment
            bool crossL = (sideTL * sideBL < 0);
            bool crossR = (sideTR * sideBR < 0);
            bool crossT = (sideTL * sideTR < 0);
            bool crossB = (sideBL * sideBR < 0);

            //NOTE: containment is tested against the half-open box [tileMin, tileMax)
            bool s0Inside = all(greaterThanEqual(s0, tileMin)) && all(lessThan(s0, tileMax));
            bool s1Inside = all(greaterThanEqual(s1, tileMin)) && all(lessThan(s1, tileMax));

            if(!(crossL || crossR || crossT || crossB || s0Inside || s1Inside))
            {
                continue;
            }

            int opIndex = atomicAdd(tileOpCountBuffer.elements[0], 1);
            if(opIndex >= tileOpBuffer.elements.length())
            {
                //NOTE: the op buffer is full, the op is dropped (the counter has still been incremented)
                continue;
            }

            tileOpBuffer.elements[opIndex].kind = OC_GL_OP_SEGMENT;
            tileOpBuffer.elements[opIndex].index = segIndex;
            //NOTE: for segment ops, this field flags a crossing of the tile's right boundary
            tileOpBuffer.elements[opIndex].windingOffsetOrCrossRight = crossR ? 1 : 0;

            //NOTE: push the op at the head of the tile's list. If the list was empty, the op is also its tail.
            int queueIndex = pathQueue.tileQueues + ty * area.z + tx;

            tileOpBuffer.elements[opIndex].next = atomicExchange(tileQueueBuffer.elements[queueIndex].first,
                                                                 opIndex);
            if(tileOpBuffer.elements[opIndex].next == -1)
            {
                tileQueueBuffer.elements[queueIndex].last = opIndex;
            }

            //NOTE: a segment crossing the tile's bottom boundary contributes to the tile's winding offset
            if(crossB)
            {
                atomicAdd(tileQueueBuffer.elements[queueIndex].windingOffset, seg.windingIncrement);
            }
        }
    }
}

/*NOTE: allocates a slot in the segment buffer and fills the fields that are common to all segment
        kinds: kind, pathIndex, windingIncrement, box and config.

        p:         control points. p[0..1] are used for lines, p[0..2] for quadratics, p[0..3] for cubics.
        kind:      OC_GL_LINE, OC_GL_QUADRATIC or OC_GL_CUBIC.
        pathIndex: stored as-is in the segment.

        Returns the allocated segment index. The allocation counter is incremented even when the buffer
        is full, so callers must compare the result with segmentBuffer.elements.length() before using it.
*/
int push_segment(in vec2 p[4], int kind, int pathIndex)
{
    int segIndex = atomicAdd(segmentCountBuffer.elements[0], 1);

    if(segIndex < segmentBuffer.elements.length())
    {
        //NOTE: s and e are the end points of the segment, c is the control point used to choose the config
        vec2 s, c, e;

        switch(kind)
        {
            case OC_GL_LINE:
                s = p[0];
                c = p[0];
                e = p[1];
                break;

            case OC_GL_QUADRATIC:
                s = p[0];
                c = p[1];
                e = p[2];
                break;

            case OC_GL_CUBIC:
            {
                //NOTE: use the inner control point attached to the longest of the two end legs
                s = p[0];
                float sqrNorm0 = dot(p[1] - p[0], p[1] - p[0]);
                float sqrNorm1 = dot(p[3] - p[2], p[3] - p[2]);
                if(sqrNorm0 < sqrNorm1)
                {
                    c = p[2];
                }
                else
                {
                    c = p[1];
                }
                e = p[3];
            }
            break;
        }

        bool goingUp = e.y >= s.y;
        bool goingRight = e.x >= s.x;

        vec4 box = vec4(min(s.x, e.x),
                        min(s.y, e.y),
                        max(s.x, e.x),
                        max(s.y, e.y));

        segmentBuffer.elements[segIndex].kind = kind;
        segmentBuffer.elements[segIndex].pathIndex = pathIndex;
        segmentBuffer.elements[segIndex].windingIncrement = goingUp ? 1 : -1;
        segmentBuffer.elements[segIndex].box = box;

        /*NOTE: the config is chosen from the direction of travel, and from the side of the box diagonal
                on which the control point c lies. The original tests were

                    dy > alpha * dx          and    dy < ofs - alpha * dx

                with alpha = height / width and ofs = height. Since width >= 0, we multiply both sides
                by width instead of dividing by it. This avoids a division by zero when the box has no
                width, and gives the same answer as the IEEE evaluation of the original tests
                (including for zero-width boxes) without depending on Inf/NaN comparison semantics.
        */
        float dx = c.x - box.x;
        float dy = c.y - box.y;
        float width = box.z - box.x;
        float height = box.w - box.y;

        if(goingUp == goingRight)
        {
            if(kind == OC_GL_LINE)
            {
                segmentBuffer.elements[segIndex].config = OC_GL_BR;
            }
            else if(dy * width > height * dx)
            {
                segmentBuffer.elements[segIndex].config = OC_GL_TL;
            }
            else
            {
                segmentBuffer.elements[segIndex].config = OC_GL_BR;
            }
        }
        else
        {
            if(kind == OC_GL_LINE)
            {
                segmentBuffer.elements[segIndex].config = OC_GL_TR;
            }
            else if(dy * width < height * (width - dx))
            {
                segmentBuffer.elements[segIndex].config = OC_GL_BL;
            }
            else
            {
                segmentBuffer.elements[segIndex].config = OC_GL_TR;
            }
        }
    }
    return (segIndex);
}

//NOTE: small power helpers. The argument is expanded (and thus evaluated) 2 or 3 times,
//      so it should not have side effects.
#define square(x) ((x) * (x))
#define cube(x) ((x) * (x) * (x))

//NOTE: emits the line segment going from p[0] to p[1] for path pathIndex, and bins it to tiles.
//      Nothing is binned if the segment buffer is full.
void line_setup(vec2 p[4], int pathIndex)
{
    int segIndex = push_segment(p, OC_GL_LINE, pathIndex);
    if(segIndex >= segmentBuffer.elements.length())
    {
        return;
    }

    //NOTE: a line has no off-curve control point, so its start point is used as hull vertex
    segmentBuffer.elements[segIndex].hullVertex = p[0];
    bin_to_tiles(segIndex);
}

//NOTE: evaluates the blossom of the quadratic bézier curve with control points p[0..2] at (u, v),
//      using two successive rounds of linear interpolation (first with u, then with v).
vec2 quadratic_blossom(vec2 p[4], float u, float v)
{
    //NOTE: first round, interpolate consecutive control points with parameter u
    vec2 first = u * p[1] + (1 - u) * p[0];
    vec2 second = u * p[2] + (1 - u) * p[1];

    //NOTE: second round, interpolate the two intermediate points with parameter v
    return (v * second + (1 - v) * first);
}

//NOTE: computes in sp[0..2] the control points of the sub-curve of the quadratic bézier p[0..2]
//      restricted to the parameter interval (s0, s1). sp[3] is unused by quadratics and is set to zero.
void quadratic_slice(vec2 p[4], float s0, float s1, out vec2 sp[4])
{
    /*NOTE: using blossoms to compute sub-curve control points ensures that the third point
	        of sub-curve (s0, s1) and the first point of sub-curve (s1, s2) match.
	        However, due to numerical errors, the evaluation of B(s=0) might not be equal to
	        p[0] (and likewise, B(s=1) might not equal p[2]).
	        We handle that case explicitly to ensure that we don't create gaps in the paths.
	*/
    sp[0] = (s0 == 0) ? p[0] : quadratic_blossom(p, s0, s0);
    sp[1] = quadratic_blossom(p, s0, s1);
    sp[2] = (s1 == 1) ? p[2] : quadratic_blossom(p, s1, s1);

    //NOTE: sp is an out parameter, so every element we don't write is undefined for the caller.
    //      The whole array is later passed by value, so give the unused fourth point a defined value.
    sp[3] = vec2(0);
}

/*NOTE: computes the parameters at which the quadratic bézier p[0..2] must be split so that each
        piece is monotonic in both x and y.

        splits receives, in increasing order: 0, then the parameters in (0, 1) where the x or y
        derivative vanishes (at most two), then 1.
        Returns the number of values written to splits (between 2 and 4).
*/
int quadratic_monotonize(vec2 p[4], out float splits[4])
{
    //NOTE: the derivative is proportional to (p1 - p0) + t*(p2 - 2*p1 + p0), so it vanishes at
    //      t = (p0 - p1)/(p2 - 2*p1 + p0), computed independently for x and y
    vec2 r = (p[0] - p[1]) / (p[2] - 2 * p[1] + p[0]);

    //NOTE: order the two candidates
    bool swapped = (r.x > r.y);
    float lower = swapped ? r.y : r.x;
    float upper = swapped ? r.x : r.y;

    int count = 0;
    splits[count++] = 0;

    //NOTE: only keep candidates that fall strictly inside the curve's parameter range
    if(lower > 0 && lower < 1)
    {
        splits[count++] = lower;
    }
    if(upper > 0 && upper < 1)
    {
        splits[count++] = upper;
    }

    splits[count++] = 1;
    return (count);
}

//NOTE: builds a 3x3 matrix from the triangle (v0, v1, v2), scaled by the inverse of the determinant
//      det computed below. No check is made for det == 0 (collinear vertices).
mat3 barycentric_matrix(vec2 v0, vec2 v1, vec2 v2)
{
    float det = v0.x * (v1.y - v2.y) + v1.x * (v2.y - v0.y) + v2.x * (v0.y - v1.y);
    float invDet = 1 / det;

    //NOTE: matrix columns, before scaling
    vec3 col0 = vec3(v1.y - v2.y, v2.y - v0.y, v0.y - v1.y);
    vec3 col1 = vec3(v2.x - v1.x, v0.x - v2.x, v1.x - v0.x);
    vec3 col2 = vec3(v1.x * v2.y - v2.x * v1.y, v2.x * v0.y - v0.x * v2.y, v0.x * v1.y - v1.x * v0.y);

    return (mat3(col0 * invDet, col1 * invDet, col2 * invDet));
}

//NOTE: emits the quadratic segment with control points p[0..2] for path pathIndex: pushes the segment,
//      computes its implicit equation matrix and hull vertex, then bins it to tiles.
//      Nothing is done if the segment buffer is full.
void quadratic_emit(vec2 p[4], int pathIndex)
{
    int segIndex = push_segment(p, OC_GL_QUADRATIC, pathIndex);
    if(segIndex >= segmentBuffer.elements.length())
    {
        return;
    }

    //NOTE: the matrix is scaled by the inverse of this determinant (no check is made for det == 0)
    float det = p[0].x * (p[1].y - p[2].y) + p[1].x * (p[2].y - p[0].y) + p[2].x * (p[0].y - p[1].y);

    //NOTE: first row of the implicit equation matrix
    float a = p[0].y - p[1].y + 0.5 * (p[2].y - p[0].y);
    float b = p[1].x - p[0].x + 0.5 * (p[0].x - p[2].x);
    float c = p[0].x * p[1].y - p[1].x * p[0].y + 0.5 * (p[2].x * p[0].y - p[0].x * p[2].y);

    //NOTE: second row
    float d = p[0].y - p[1].y;
    float e = p[1].x - p[0].x;
    float f = p[0].x * p[1].y - p[1].x * p[0].y;

    //NOTE: the last coefficient changes sign for the TL and BL configs
    float flip = 1;
    if(segmentBuffer.elements[segIndex].config == OC_GL_TL
       || segmentBuffer.elements[segIndex].config == OC_GL_BL)
    {
        flip = -1;
    }
    float g = flip * (p[2].x * (p[0].y - p[1].y) + p[0].x * (p[1].y - p[2].y) + p[1].x * (p[2].y - p[0].y));

    //NOTE: the mat3 constructor takes columns, so each vec3 below is one column of the matrix
    mat3 implicitMatrix = mat3(vec3(a, d, 0.),
                               vec3(b, e, 0.),
                               vec3(c, f, g));

    segmentBuffer.elements[segIndex].implicitMatrix = (1 / det) * implicitMatrix;
    segmentBuffer.elements[segIndex].hullVertex = p[1];

    bin_to_tiles(segIndex);
}

//NOTE: splits the quadratic bézier p[0..2] into monotonic pieces and emits one segment per piece.
void quadratic_setup(vec2 p[4], int pathIndex)
{
    float splits[4];
    int splitCount = quadratic_monotonize(p, splits);

    //NOTE: each pair of consecutive split points delimits one sub-curve
    for(int i = 1; i < splitCount; i++)
    {
        vec2 piece[4];
        quadratic_slice(p, splits[i - 1], splits[i], piece);
        quadratic_emit(piece, pathIndex);
    }
}

/*NOTE: solves a*x^2 + b*x + c = 0, given a precomputed discriminant det = (b/2)^2 - a*c
        (as computed by quadratic_roots()).

        Returns the number of roots found (0, 1 or 2), which are written to r in increasing order.
        A zero discriminant is reported as a single root. Elements of r beyond the returned count
        must not be relied upon.
*/
int quadratic_roots_with_det(float a, float b, float c, float det, out float r[2])
{
    if(a == 0)
    {
        //NOTE: the equation is linear. It has one root, unless it is constant.
        if(b == 0)
        {
            return (0);
        }
        r[0] = -c / b;
        return (1);
    }

    if(!(det >= 0))
    {
        //NOTE: no real roots
        return (0);
    }

    int count = (det == 0) ? 1 : 2;
    float halfB = b / 2.0;

    /*NOTE: q is computed by adding two quantities of the same sign, and the roots are derived
            from q as (-q/a) and (-c/q), up to the sign conventions of each branch.
    */
    if(halfB > 0)
    {
        float q = halfB + sqrt(det);
        r[0] = -c / q;
        r[1] = -q / a;
    }
    else if(halfB < 0)
    {
        float q = -halfB + sqrt(det);
        r[0] = q / a;
        r[1] = c / q;
    }
    else
    {
        //NOTE: b is zero, roots are symmetric. Divide by whichever of a or q is appropriate
        //      given the relative magnitudes of a and c.
        float q = sqrt(-a * c);
        if(abs(a) >= abs(c))
        {
            r[0] = q / a;
            r[1] = -q / a;
        }
        else
        {
            r[0] = -c / q;
            r[1] = c / q;
        }
    }

    //NOTE: return the roots in increasing order
    if(count > 1 && r[0] > r[1])
    {
        float swapTmp = r[0];
        r[0] = r[1];
        r[1] = swapTmp;
    }
    return (count);
}

//NOTE: solves a*x^2 + b*x + c = 0. Computes the discriminant (b/2)^2 - a*c and defers to
//      quadratic_roots_with_det(). Returns the number of roots written to r.
int quadratic_roots(float a, float b, float c, out float r[2])
{
    return (quadratic_roots_with_det(a, b, c, (b * b) / 4. - a * c, r));
}

//NOTE: evaluates the blossom of the cubic bézier curve with control points p[0..3] at (u, v, w),
//      using three successive rounds of linear interpolation (with u, then v, then w).
vec2 cubic_blossom(vec2 p[4], float u, float v, float w)
{
    //NOTE: first round, interpolate consecutive control points with parameter u
    vec2 q0 = u * p[1] + (1 - u) * p[0];
    vec2 q1 = u * p[2] + (1 - u) * p[1];
    vec2 q2 = u * p[3] + (1 - u) * p[2];

    //NOTE: second round, with parameter v
    vec2 r0 = v * q1 + (1 - v) * q0;
    vec2 r1 = v * q2 + (1 - v) * q1;

    //NOTE: last round, with parameter w
    return (w * r1 + (1 - w) * r0);
}

//NOTE: computes in sp[0..3] the control points of the sub-curve of the cubic bézier p[0..3]
//      restricted to the parameter interval (s0, s1).
void cubic_slice(vec2 p[4], float s0, float s1, out vec2 sp[4])
{
    /*NOTE: computing the control points with blossoms guarantees that the last point of
	        sub-curve (s0, s1) coincides with the first point of sub-curve (s1, s2).
	        Numerical errors may however make B(s=0) differ from p[0] (and B(s=1) differ from p[3]),
	        so the curve's own end points are copied verbatim to avoid creating gaps in the paths.
	*/
    sp[0] = p[0];
    if(s0 != 0)
    {
        sp[0] = cubic_blossom(p, s0, s0, s0);
    }

    sp[1] = cubic_blossom(p, s0, s0, s1);
    sp[2] = cubic_blossom(p, s0, s1, s1);

    sp[3] = p[3];
    if(s1 != 1)
    {
        sp[3] = cubic_blossom(p, s1, s1, s1);
    }
}

//NOTE: curve categories stored in cubic_info.kind by cubic_classify()
//NOTE: initial value of kind, before classification
#define CUBIC_ERROR 0
#define CUBIC_SERPENTINE 1
#define CUBIC_CUSP 2
//NOTE: cusp with cusp at infinity
#define CUBIC_CUSP_INFINITY 3
#define CUBIC_LOOP 4
//NOTE: degenerate cases. cubic_setup() hands these over to quadratic_setup() and line_setup() respectively
#define CUBIC_DEGENERATE_QUADRATIC 5
#define CUBIC_DEGENERATE_LINE 6

//NOTE: result of cubic_classify()
struct cubic_info
{
    //NOTE: one of the CUBIC_* categories
    int kind;
    //NOTE: computed by cubic_classify() as transpose(invM3 * F). cubic_emit() pairs K[i] with control point p[i]
    mat4 K;
    //NOTE: homogeneous parameters (t, s) of the curve's special points. cubic_setup() splits the curve
    //      at t/s when s != 0
    vec2 ts[2];
    //NOTE: coefficients of the inflection points polynomial (see cubic_classify())
    float d1;
    float d2;
    float d3;
};

/*NOTE: classifies a cubic curve given its power basis coefficients c[0..3] (see cubic_setup()).

        Returns a cubic_info holding the curve category (kind), the coefficients d1, d2, d3 of the
        inflection points polynomial, the homogeneous parameters ts of the curve's special points,
        and the matrix K derived from the category-specific matrix F.

        For the degenerate categories (CUBIC_DEGENERATE_QUADRATIC and CUBIC_DEGENERATE_LINE), no F matrix
        or parameters are computed: K and ts are returned zeroed.
*/
cubic_info cubic_classify(vec2 c[4])
{
    cubic_info result;
    result.kind = CUBIC_ERROR;

    //NOTE: F and ts are only computed for non-degenerate curves. We zero them first, so that the K matrix
    //      computed at the end of this function and the returned ts never depend on uninitialized values.
    mat4 F = mat4(0);
    result.ts[0] = vec2(0, 0);
    result.ts[1] = vec2(0, 0);

    /*NOTE(martin):
		now, compute determinants d0, d1, d2, d3, which gives the coefficients of the
	        inflection points polynomial:

		I(t, s) = d0*t^3 - 3*d1*t^2*s + 3*d2*t*s^2 - d3*s^3

		The roots of this polynomial are the inflection points of the parametric curve, in homogeneous
		coordinates (ie we can have an inflection point at infinity with s=0).

		         |x3 y3 w3|              |x3 y3 w3|             |x3 y3 w3|              |x2 y2 w2|
		d0 = det |x2 y2 w2|    d1 = -det |x2 y2 w2|    d2 = det |x1 y1 w1|    d3 = -det |x1 y1 w1|
		         |x1 y1 w1|              |x0 y0 w0|             |x0 y0 w0|              |x0 y0 w0|

		In our case, the pi.w equal 1 (no point at infinity), so _in_the_power_basis_, w1 = w2 = w3 = 0 and w0 = 1
		(which also means d0 = 0)

		//WARN: there seems to be a mismatch between the signs of the d_i and the orientation test in the Loop-Blinn paper?
		//      flipping the sign of the d_i doesn't change the roots (and the implicit matrix), but it does change the orientation.
		//      Keeping the signs of the paper puts the interior on the left of parametric travel, unlike what's stated in the paper.
		//      this may very well be an error on my part that's cancelled by flipping the signs of the d_i though!
	*/

    float d1 = -(c[3].y * c[2].x - c[3].x * c[2].y);
    float d2 = -(c[3].x * c[1].y - c[3].y * c[1].x);
    float d3 = -(c[2].y * c[1].x - c[2].x * c[1].y);

    result.d1 = d1;
    result.d2 = d2;
    result.d3 = d3;

    //NOTE(martin): compute the second factor of the discriminant discr(I) = d1^2*(3*d2^2 - 4*d3*d1)
    float discrFactor2 = 3.0 * square(d2) - 4.0 * d3 * d1;

    //NOTE(martin): each following case gives the number of roots, hence the category of the parametric curve
    if(abs(d1) <= 1e-6 && abs(d2) <= 1e-6 && abs(d3) > 1e-6)
    {
        //NOTE(martin): quadratic degenerate case
        //NOTE: the control point of the equivalent quadratic curve is computed by the caller (see cubic_setup())
        result.kind = CUBIC_DEGENERATE_QUADRATIC;
    }
    else if(discrFactor2 >= 0 && abs(d1) > 1e-6)
    {
        //NOTE(martin): serpentine curve or cusp with inflection at infinity
        //              (these two cases are handled the same way).
        //NOTE(martin): compute the solutions (tl, sl), (tm, sm), and (tn, sn) of the inflection point equation
        float tmtl[2];
        quadratic_roots_with_det(1, -2 * d2, (4. / 3. * d1 * d3), (1. / 3.) * discrFactor2, tmtl);

        float tm = tmtl[0];
        float sm = 2 * d1;
        float tl = tmtl[1];
        float sl = 2 * d1;

        //NOTE: normalize the homogeneous parameters. The norms can't be zero since abs(d1) > 1e-6.
        float invNorm = 1 / sqrt(square(tm) + square(sm));
        tm *= invNorm;
        sm *= invNorm;

        invNorm = 1 / sqrt(square(tl) + square(sl));
        tl *= invNorm;
        sl *= invNorm;

        /*NOTE(martin):
			the power basis coefficients of points k,l,m,n are collected into the rows of the 4x4 matrix F:

				| tl*tm            tl^3        tm^3        1 |
				| -sm*tl - sl*tm   -3sl*tl^2   -3*sm*tm^2  0 |
				| sl*sm            3*sl^2*tl   3*sm^2*tm   0 |
				| 0                -sl^3       -sm^3       0 |
		*/
        //NOTE: d1 is known to be non-zero in this branch, so the sign of the discriminant factor
        //      alone tells a serpentine from a cusp.
        result.kind = (discrFactor2 > 0) ? CUBIC_SERPENTINE : CUBIC_CUSP;

        F = mat4(tl * tm, -sm * tl - sl * tm, sl * sm, 0,
                 cube(tl), -3 * sl * square(tl), 3 * square(sl) * tl, -cube(sl),
                 cube(tm), -3 * sm * square(tm), 3 * square(sm) * tm, -cube(sm),
                 1, 0, 0, 0);

        result.ts[0] = vec2(tm, sm);
        result.ts[1] = vec2(tl, sl);
    }
    else if(discrFactor2 < 0 && abs(d1) > 1e-6)
    {
        //NOTE(martin): loop curve
        result.kind = CUBIC_LOOP;

        float tetd[2];
        quadratic_roots_with_det(1, -2 * d2, 4 * (square(d2) - d1 * d3), -discrFactor2, tetd);

        float td = tetd[1];
        float sd = 2 * d1;
        float te = tetd[0];
        float se = 2 * d1;

        //NOTE: normalize the homogeneous parameters. The norms can't be zero since abs(d1) > 1e-6.
        float invNorm = 1 / sqrt(square(td) + square(sd));
        td *= invNorm;
        sd *= invNorm;

        invNorm = 1 / sqrt(square(te) + square(se));
        te *= invNorm;
        se *= invNorm;

        /*NOTE(martin):
			the power basis coefficients of points k,l,m,n are collected into the rows of the 4x4 matrix F:

				| td*te            td^2*te                 td*te^2                1 |
				| -se*td - sd*te   -se*td^2 - 2sd*te*td    -sd*te^2 - 2*se*td*te  0 |
				| sd*se            te*sd^2 + 2*se*td*sd    td*se^2 + 2*sd*te*se   0 |
				| 0                -sd^2*se                -sd*se^2               0 |
		*/
        F = mat4(td * te, -se * td - sd * te, sd * se, 0,
                 square(td) * te, -se * square(td) - 2 * sd * te * td, te * square(sd) + 2 * se * td * sd, -square(sd) * se,
                 td * square(te), -sd * square(te) - 2 * se * td * te, td * square(se) + 2 * sd * te * se, -sd * square(se),
                 1, 0, 0, 0);

        result.ts[0] = vec2(td, sd);
        result.ts[1] = vec2(te, se);
    }
    else if(d2 != 0)
    {
        //NOTE(martin): cusp with cusp at infinity
        float tl = d3;
        float sl = 3 * d2;

        //NOTE: the norm can't be zero since d2 != 0
        float invNorm = 1 / sqrt(square(tl) + square(sl));
        tl *= invNorm;
        sl *= invNorm;

        /*NOTE(martin):
			the power basis coefficients of points k,l,m,n are collected into the rows of the 4x4 matrix F:

				| tl    tl^3        1  1 |
				| -sl   -3sl*tl^2   0  0 |
				| 0     3*sl^2*tl   0  0 |
				| 0     -sl^3       0  0 |
		*/
        result.kind = CUBIC_CUSP_INFINITY;

        F = mat4(tl, -sl, 0, 0,
                 cube(tl), -3 * sl * square(tl), 3 * square(sl) * tl, -cube(sl),
                 1, 0, 0, 0,
                 1, 0, 0, 0);

        result.ts[0] = vec2(tl, sl);
        result.ts[1] = vec2(0, 0);
    }
    else
    {
        //NOTE(martin): line or point degenerate case
        result.kind = CUBIC_DEGENERATE_LINE;
    }

    /*
			F is then multiplied by M3^(-1) on the left which yields the bezier coefficients k, l, m, n
			at the control points.

			               | 1  0   0   0 |
				M3^(-1) =  | 1  1/3 0   0 |
				           | 1  2/3 1/3 0 |
					       | 1  1   1   1 |
	*/
    //NOTE: the mat4 constructor takes its arguments in column-major order
    mat4 invM3 = mat4(1, 1, 1, 1,
                      0, 1. / 3., 2. / 3., 1,
                      0, 0, 1. / 3., 1,
                      0, 0, 0, 1);

    result.K = transpose(invM3 * F);

    return (result);
}

/*NOTE: selects the hull vertex of the cubic with control points (p0, p1, p2, p3).

        This is the intersection of the lines (p0, p1) and (p2, p3):
		P = p0 + u(p1-p0)
		P = p2 + w(p3-p2)

        When the lines are nearly parallel, or when one of the legs (p0, p1) or (p2, p3) is very short,
        the intersection is not computed, and we return the inner control point attached to the
        longest leg instead.
*/
vec2 select_hull_vertex(vec2 p0, vec2 p1, vec2 p2, vec2 p3)
{
    float det = (p1.x - p0.x) * (p3.y - p2.y) - (p1.y - p0.y) * (p3.x - p2.x);
    float startLegSqr = dot(p1 - p0, p1 - p0);
    float endLegSqr = dot(p2 - p3, p2 - p3);

    bool illConditioned = abs(det) < 1e-3 || startLegSqr < 0.1 || endLegSqr < 0.1;
    if(illConditioned)
    {
        //NOTE: fallback, use the control point of the longest leg
        return ((startLegSqr < endLegSqr) ? p2 : p1);
    }

    //NOTE: solve for the parameter u of the intersection along the line (p0, p1)
    float u = ((p0.x - p2.x) * (p2.y - p3.y) - (p0.y - p2.y) * (p2.x - p3.x)) / det;
    return (p0 + u * (p1 - p0));
}

/*NOTE: emits one piece of a cubic curve for path pathIndex.

        curve:  classification of the whole curve, as returned by cubic_classify().
        p:      control points of the whole curve.
        s0, s1: parameter interval of the piece.
        sp:     control points of the piece (see cubic_slice()).

        Pushes the segment, computes its implicit matrix, hull vertex and sign, then bins it to tiles.
        Nothing is done if the segment buffer is full.
*/
void cubic_emit(cubic_info curve, vec2 p[4], float s0, float s1, vec2 sp[4], int pathIndex)
{
    int segIndex = push_segment(sp, OC_GL_CUBIC, pathIndex);
    if(segIndex >= segmentBuffer.elements.length())
    {
        return;
    }

    /*NOTE: select three control points of the whole curve (v0, v1, v2), along with the matching
            columns of curve.K (K[i] goes with p[i]).
            If the end points are distinct, we use both of them plus the inner control point attached
            to the longest leg. Otherwise, we use p[0] and the two inner control points.
    */
    //TODO: haul that up in caller
    float startLegSqr = dot(p[1] - p[0], p[1] - p[0]);
    float endLegSqr = dot(p[2] - p[3], p[2] - p[3]);
    bool distinctEnds = dot(p[0] - p[3], p[0] - p[3]) > 1e-5;

    vec2 v0 = p[0];
    vec2 v1;
    vec2 v2;
    mat3 K;

    if(!distinctEnds)
    {
        v1 = p[1];
        v2 = p[2];
        K = mat3(curve.K[0].xyz, curve.K[1].xyz, curve.K[2].xyz);
    }
    else if(startLegSqr >= endLegSqr)
    {
        v1 = p[3];
        v2 = p[1];
        K = mat3(curve.K[0].xyz, curve.K[3].xyz, curve.K[1].xyz);
    }
    else
    {
        v1 = p[3];
        v2 = p[2];
        K = mat3(curve.K[0].xyz, curve.K[3].xyz, curve.K[2].xyz);
    }

    //TODO: should we compute matrix relative to a base point to avoid loss of precision
    //      when computing barycentric matrix?
    segmentBuffer.elements[segIndex].implicitMatrix = K * barycentric_matrix(v0, v1, v2);
    segmentBuffer.elements[segIndex].hullVertex = select_hull_vertex(sp[0], sp[1], sp[2], sp[3]);

    //NOTE: compute sign flip. It stays at 1 for the categories that are not handled below.
    int flip = 1;

    if(curve.kind == CUBIC_SERPENTINE || curve.kind == CUBIC_CUSP)
    {
        flip = (curve.d1 < 0) ? -1 : 1;
    }
    else if(curve.kind == CUBIC_LOOP)
    {
        float d1 = curve.d1;
        float d2 = curve.d2;
        float d3 = curve.d3;

        //NOTE: evaluate H at both ends of the piece, and use the one with the largest magnitude
        float H0 = d3 * d1 - square(d2) + d1 * d2 * s0 - square(d1) * square(s0);
        float H1 = d3 * d1 - square(d2) + d1 * d2 * s1 - square(d1) * square(s1);
        float H = (abs(H0) > abs(H1)) ? H0 : H1;
        flip = (H * d1 > 0) ? -1 : 1;
    }

    //NOTE: the sign is inverted for pieces whose end point is above their start point
    if(sp[3].y > sp[0].y)
    {
        flip = -flip;
    }
    segmentBuffer.elements[segIndex].sign = flip;

    //NOTE: bin to tiles
    bin_to_tiles(segIndex);
}

/*NOTE: sets up the cubic bézier curve p[0..3] for path pathIndex.

        The curve is classified first. Degenerate curves are handed over to line_setup() or
        quadratic_setup(). Other curves are split at the parameters where their x or y derivative vanishes,
        and at the finite special points reported by cubic_classify(). One segment is emitted per piece.
*/
void cubic_setup(vec2 p[4], int pathIndex)
{
    /*NOTE(martin): first convert the control points to power basis, multiplying by M3

		     | 1  0  0  0|      |p0|      |c0|
		M3 = |-3  3  0  0|, B = |p1|, C = |c1| = M3*B
		     | 3 -6  3  0|      |p2|      |c2|
		     |-1  3 -3  1|      |p3|      |c3|
	*/
    vec2 c[4] = vec2[4](p[0],
                        3.0 * (p[1] - p[0]),
                        3.0 * (p[0] + p[2] - 2 * p[1]),
                        3.0 * (p[1] - p[2]) + p[3] - p[0]);

    //NOTE: get classification, implicit matrix, double points and inflection points
    cubic_info curve = cubic_classify(c);

    //NOTE: degenerate curves are rendered as a line or as a quadratic
    if(curve.kind == CUBIC_DEGENERATE_LINE)
    {
        vec2 linePoints[4] = vec2[4](p[0], p[3], vec2(0), vec2(0));
        line_setup(linePoints, pathIndex);
        return;
    }
    if(curve.kind == CUBIC_DEGENERATE_QUADRATIC)
    {
        //NOTE: the control point of the quadratic is p0 + 1.5*(p1-p0) = 1.5*p1 - 0.5*p0
        vec2 quadPoint = 1.5 * p[1] - 0.5 * p[0];
        vec2 quadPoints[4] = vec2[4](p[0], quadPoint, p[3], vec2(0));
        quadratic_setup(quadPoints, pathIndex);
        return;
    }

    //NOTE: get the roots of B'(s) = 3.c3.s^2 + 2.c2.s + c1, for x and y
    float rootsX[2];
    int rootCountX = quadratic_roots(3 * c[3].x, 2 * c[2].x, c[1].x, rootsX);

    float rootsY[2];
    int rootCountY = quadratic_roots(3 * c[3].y, 2 * c[2].y, c[1].y, rootsY);

    //NOTE: collect candidate split parameters (at most 2 for x, 2 for y, and 2 special points)
    float roots[6];
    int rootCount = 0;

    for(int i = 0; i < rootCountX; i++)
    {
        roots[rootCount++] = rootsX[i];
    }
    for(int i = 0; i < rootCountY; i++)
    {
        roots[rootCount++] = rootsY[i];
    }

    //NOTE: add double points and inflection points if they are finite
    for(int i = 0; i < 2; i++)
    {
        vec2 ts = curve.ts[i];
        if(ts.y != 0)
        {
            roots[rootCount++] = ts.x / ts.y;
        }
    }

    //NOTE: sort the candidates in increasing order (insertion sort)
    for(int i = 1; i < rootCount; i++)
    {
        float key = roots[i];
        int slot = i;
        while(slot > 0 && roots[slot - 1] > key)
        {
            roots[slot] = roots[slot - 1];
            slot--;
        }
        roots[slot] = key;
    }

    //NOTE: compute split points: 0, then the candidates that are strictly inside (0, 1), then 1
    float splits[8];
    int splitCount = 0;
    splits[splitCount++] = 0;

    for(int i = 0; i < rootCount; i++)
    {
        if(roots[i] > 0 && roots[i] < 1)
        {
            splits[splitCount++] = roots[i];
        }
    }
    splits[splitCount++] = 1;

    //NOTE: for each monotonic piece, compute its control points and emit a segment
    for(int i = 1; i < splitCount; i++)
    {
        float s0 = splits[i - 1];
        float s1 = splits[i];

        vec2 piece[4];
        cubic_slice(p, s0, s1, piece);
        cubic_emit(curve, p, s0, s1, piece, pathIndex);
    }
}

//NOTE: entry point. Each work group processes one path element: it scales the element's control points
//      and dispatches them to the setup function that matches the element's kind.
//      Elements of any other kind are ignored.
void main()
{
    int eltIndex = elementBufferStart + int(gl_WorkGroupID.x);
    oc_gl_path_elt elt = elementBuffer.elements[eltIndex];

    //NOTE: unused control points are set to zero
    if(elt.kind == OC_GL_LINE)
    {
        vec2 p[4] = vec2[4](elt.p[0] * scale, elt.p[1] * scale, vec2(0), vec2(0));
        line_setup(p, elt.pathIndex);
    }
    else if(elt.kind == OC_GL_QUADRATIC)
    {
        vec2 p[4] = vec2[4](elt.p[0] * scale, elt.p[1] * scale, elt.p[2] * scale, vec2(0));
        quadratic_setup(p, elt.pathIndex);
    }
    else if(elt.kind == OC_GL_CUBIC)
    {
        vec2 p[4] = vec2[4](elt.p[0] * scale, elt.p[1] * scale, elt.p[2] * scale, elt.p[3] * scale);
        cubic_setup(p, elt.pathIndex);
    }
}