2023-07-16 15:19:09 +00:00
25 changed files with 1820 additions and 178 deletions
--- a/.gitignore
+++ b/.gitignore
@ -18,6 +18,8 @@ Debug/*
 src/bindgen_core_api.c
 src/bindgen_gles_api.c
 *bind_gen.c
 sdk/io_stubs.c
 sdk/orca_surface.c
 *bind_gen.c
 .vscode/settings.json
--- a/build.bat
+++ b/build.bat
@ -13,7 +13,7 @@ if %target% == wasm3 (
 	set wasm3_sources=/I .\ext\wasm3\source\*.c
 	for %%f in ( .\ext\wasm3\source\*.c ) do (
-		cl /nologo /Zi /Zc:preprocessor /c /Fo:bin\obj\%%~nf.obj %wasm3_includes% %%f
+		cl /nologo /Zi /Zc:preprocessor /O2 /c /Fo:bin\obj\%%~nf.obj %wasm3_includes% %%f
 	)
 	lib /nologo /out:bin\wasm3.lib bin\obj\*.obj
 )
--- a/cstdlib/include/math.h
+++ b/cstdlib/include/math.h
@ -54,6 +54,9 @@ double      acos(double);
 double      ceil(double);
 double      cos(double);
 float       cosf(float);
 double      fabs(double);
 double      floor(double);
@ -62,7 +65,27 @@ double      fmod(double, double);
 double      pow(double, double);
 double      scalbn(double, int);
 double      sin(double);
 float       sinf(float);
 double      sqrt(double);
 float       sqrtf(float);
 #define M_E             2.7182818284590452354   /* e */
 #define M_LOG2E         1.4426950408889634074   /* log_2 e */
 #define M_LOG10E        0.43429448190325182765  /* log_10 e */
 #define M_LN2           0.69314718055994530942  /* log_e 2 */
 #define M_LN10          2.30258509299404568402  /* log_e 10 */
 #define M_PI            3.14159265358979323846  /* pi */
 #define M_PI_2          1.57079632679489661923  /* pi/2 */
 #define M_PI_4          0.78539816339744830962  /* pi/4 */
 #define M_1_PI          0.31830988618379067154  /* 1/pi */
 #define M_2_PI          0.63661977236758134308  /* 2/pi */
 #define M_2_SQRTPI      1.12837916709551257390  /* 2/sqrt(pi) */
 #define M_SQRT2         1.41421356237309504880  /* sqrt(2) */
 #define M_SQRT1_2       0.70710678118654752440  /* 1/sqrt(2) */
 #ifdef __cplusplus
 }
--- a/cstdlib/src/__cos.c
+++ b/cstdlib/src/__cos.c
@ -0,0 +1,71 @@
 /* origin: FreeBSD /usr/src/lib/msun/src/k_cos.c */
 /*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunSoft, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */
 /*
 * __cos( x,  y )
 * kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
 * Input x is assumed to be bounded by ~pi/4 in magnitude.
 * Input y is the tail of x.
 *
 * Algorithm
 *      1. Since cos(-x) = cos(x), we need only to consider positive x.
 *      2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
 *      3. cos(x) is approximated by a polynomial of degree 14 on
 *         [0,pi/4]
 *                                       4            14
 *              cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
 *         where the remez error is
 *
 *      |              2     4     6     8     10    12     14 |     -58
 *      |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  )| <= 2
 *      |                                                      |
 *
 *                     4     6     8     10    12     14
 *      4. let r = C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  , then
 *             cos(x) ~ 1 - x*x/2 + r
 *         since cos(x+y) ~ cos(x) - sin(x)*y
 *                        ~ cos(x) - x*y,
 *         a correction term is necessary in cos(x) and hence
 *              cos(x+y) = 1 - (x*x/2 - (r - x*y))
 *         For better accuracy, rearrange to
 *              cos(x+y) ~ w + (tmp + (r-x*y))
 *         where w = 1 - x*x/2 and tmp is a tiny correction term
 *         (1 - x*x/2 == w + tmp exactly in infinite precision).
 *         The exactness of w + tmp in infinite precision depends on w
 *         and tmp having the same precision as x.  If they have extra
 *         precision due to compiler bugs, then the extra precision is
 *         only good provided it is retained in all terms of the final
 *         expression for cos().  Retention happens in all cases tested
 *         under FreeBSD, so don't pessimize things by forcibly clipping
 *         any extra precision in w.
 */
 #include "libm.h"
 /* Coefficients C1..C6 of the degree-14 cosine polynomial described above. */
 static const double C1 =  4.16666666666666019037e-02; /* 0x3FA55555, 0x5555554C */
 static const double C2 = -1.38888888888741095749e-03; /* 0xBF56C16C, 0x16C15177 */
 static const double C3 =  2.48015872894767294178e-05; /* 0x3EFA01A0, 0x19CB1590 */
 static const double C4 = -2.75573143513906633035e-07; /* 0xBE927E4F, 0x809C52AD */
 static const double C5 =  2.08757232129817482790e-09; /* 0x3E21EE9E, 0xBDB4B1C4 */
 static const double C6 = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */

 /*
  * Kernel cosine of x+y, where x is the head and y the tail of the argument.
  * The caller is expected to have reduced x to roughly [-pi/4, pi/4].
  *
  * The result is assembled as head + (tmp + (r - x*y)) with head = 1 - x*x/2,
  * following step 4 of the algorithm notes above.
  */
 double __cos(double x, double y)
 {
 	const double_t x2 = x*x;
 	const double_t x4 = x2*x2;
 	/* poly = C1*x^2 + ... + C6*x^12; scaled by x2 in the return expression */
 	const double_t poly = x2*(C1+x2*(C2+x2*C3)) + x4*x4*(C4+x2*(C5+x2*C6));
 	const double_t half_x2 = 0.5*x2;
 	const double_t head = 1.0-half_x2;
 	/* (1.0-head)-half_x2 recovers the rounding error made when forming head */
 	return head + (((1.0-head)-half_x2) + (x2*poly-x*y));
 }
--- a/cstdlib/src/__cosdf.c
+++ b/cstdlib/src/__cosdf.c
@ -0,0 +1,35 @@
 /* origin: FreeBSD /usr/src/lib/msun/src/k_cosf.c */
 /*
 * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
 * Debugged and optimized by Bruce D. Evans.
 */
 /*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunPro, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */
 #include "libm.h"
 /* |cos(x) - c(x)| < 2**-34.1 (~[-5.37e-11, 5.295e-11]). */
 /* Polynomial coefficients for the float cosine kernel (error bound quoted above). */
 static const double C0 = -0x1ffffffd0c5e81.0p-54; /* -0.499999997251031003120 */
 static const double C1 =  0x155553e1053a42.0p-57; /*  0.0416666233237390631894 */
 static const double C2 = -0x16c087e80f1e27.0p-62; /* -0.00138867637746099294692 */
 static const double C3 =  0x199342e0ee5069.0p-68; /*  0.0000243904487962774090654 */

 /*
  * Cosine kernel for float results: evaluates 1 + C0*x^2 + C1*x^4 + C2*x^6
  * + C3*x^8 in double arithmetic and returns the sum converted to float.
  * The terms are grouped so that independent products can be evaluated in
  * parallel, as in __tandf.c.
  */
 float __cosdf(double x)
 {
 	const double_t x2 = x*x;
 	const double_t x4 = x2*x2;
 	const double_t tail = C2+x2*C3;
 	return ((1.0+x2*C0) + x4*C1) + (x4*x2)*tail;
 }
--- a/cstdlib/src/__math_invalidf.c
+++ b/cstdlib/src/__math_invalidf.c
@ -0,0 +1,6 @@
 #include "libm.h"
 /*
  * Returns (x - x) / (x - x).  For finite x the difference is zero, so under
  * IEEE 754 arithmetic the quotient is 0/0, i.e. NaN; for infinite or NaN x
  * the difference is already NaN and propagates.
  */
 float __math_invalidf(float x)
 {
 	float diff = x - x;
 	return diff / diff;
 }
--- a/cstdlib/src/__rem_pio2.c
+++ b/cstdlib/src/__rem_pio2.c
@ -0,0 +1,190 @@
 /* origin: FreeBSD /usr/src/lib/msun/src/e_rem_pio2.c */
 /*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunSoft, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 *
 * Optimized by Bruce D. Evans.
 */
 /* __rem_pio2(x,y)
 *
 * return the remainder of x rem pi/2 in y[0]+y[1]
 * use __rem_pio2_large() for large x
 */
 #include "libm.h"
 #if FLT_EVAL_METHOD==0 || FLT_EVAL_METHOD==1
 #define EPS DBL_EPSILON
 #elif FLT_EVAL_METHOD==2
 #define EPS LDBL_EPSILON
 #endif
 /*
 * invpio2:  53 bits of 2/pi
 * pio2_1:   first  33 bits of pi/2
 * pio2_1t:  pi/2 - pio2_1
 * pio2_2:   second 33 bits of pi/2
 * pio2_2t:  pi/2 - (pio2_1+pio2_2)
 * pio2_3:   third  33 bits of pi/2
 * pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3)
 */
 static const double
 toint   = 1.5/EPS,
 pio4    = 0x1.921fb54442d18p-1,
 invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
 pio2_1  = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
 pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
 pio2_2  = 6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */
 pio2_2t = 2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
 pio2_3  = 2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
 pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
 /*
  * Double-precision argument reduction: computes n and y[0]+y[1] such that
  * y[0]+y[1] ~ x - n*pi/2.
  *
  * caller must handle the case when reduction is not needed: |x| ~<= pi/4
  *
  * x     value to reduce
  * y     output pair; y[0] receives the head of the remainder, y[1] its tail
  *
  * Returns n.  For |x| up to ~9pi/4 this is the fixed value (+-1 .. +-4) of
  * the branch taken.  The medium path returns fn = rint(x/(pi/2)), adjusted
  * by one when the first remainder falls outside [-pio4, pio4], converted to
  * int.  The large path returns the value from __rem_pio2_large(), negated
  * when x is negative.  For inf or NaN both outputs are set to x-x and 0 is
  * returned.
  */
 int __rem_pio2(double x, double *y)
 {
 	union {double f; uint64_t i;} u = {x};
 	double_t z,w,t,r,fn;
 	double tx[3],ty[2];
 	uint32_t ix;
 	int sign, n, ex, ey, i;
 	/* sign bit of x */
 	sign = u.i>>63;
 	/* high 32 bits of x with the sign bit cleared */
 	ix = u.i>>32 & 0x7fffffff;
 	if (ix <= 0x400f6a7a) {  /* |x| ~<= 5pi/4 */
 		if ((ix & 0xfffff) == 0x921fb)  /* |x| ~= pi/2 or 2pi/2 */
 			goto medium;  /* cancellation -- use medium case */
 		if (ix <= 0x4002d97c) {  /* |x| ~<= 3pi/4 */
 			if (!sign) {
 				z = x - pio2_1;  /* one round good to 85 bits */
 				y[0] = z - pio2_1t;
 				y[1] = (z-y[0]) - pio2_1t;
 				return 1;
 			} else {
 				z = x + pio2_1;
 				y[0] = z + pio2_1t;
 				y[1] = (z-y[0]) + pio2_1t;
 				return -1;
 			}
 		} else {
 			if (!sign) {
 				z = x - 2*pio2_1;
 				y[0] = z - 2*pio2_1t;
 				y[1] = (z-y[0]) - 2*pio2_1t;
 				return 2;
 			} else {
 				z = x + 2*pio2_1;
 				y[0] = z + 2*pio2_1t;
 				y[1] = (z-y[0]) + 2*pio2_1t;
 				return -2;
 			}
 		}
 	}
 	if (ix <= 0x401c463b) {  /* |x| ~<= 9pi/4 */
 		if (ix <= 0x4015fdbc) {  /* |x| ~<= 7pi/4 */
 			if (ix == 0x4012d97c)  /* |x| ~= 3pi/2 */
 				goto medium;
 			if (!sign) {
 				z = x - 3*pio2_1;
 				y[0] = z - 3*pio2_1t;
 				y[1] = (z-y[0]) - 3*pio2_1t;
 				return 3;
 			} else {
 				z = x + 3*pio2_1;
 				y[0] = z + 3*pio2_1t;
 				y[1] = (z-y[0]) + 3*pio2_1t;
 				return -3;
 			}
 		} else {
 			if (ix == 0x401921fb)  /* |x| ~= 4pi/2 */
 				goto medium;
 			if (!sign) {
 				z = x - 4*pio2_1;
 				y[0] = z - 4*pio2_1t;
 				y[1] = (z-y[0]) - 4*pio2_1t;
 				return 4;
 			} else {
 				z = x + 4*pio2_1;
 				y[0] = z + 4*pio2_1t;
 				y[1] = (z-y[0]) + 4*pio2_1t;
 				return -4;
 			}
 		}
 	}
 	if (ix < 0x413921fb) {  /* |x| ~< 2^20*(pi/2), medium size */
 	/* the near-multiple-of-pi/2 cases above jump here as well */
 medium:
 		/* rint(x/(pi/2)) */
 		/* adding and then subtracting toint (1.5/EPS) leaves an integer-valued fn */
 		fn = (double_t)x*invpio2 + toint - toint;
 		n = (int32_t)fn;
 		r = x - fn*pio2_1;
 		w = fn*pio2_1t;  /* 1st round, good to 85 bits */
 		/* Matters with directed rounding. */
 		if (predict_false(r - w < -pio4)) {
 			n--;
 			fn--;
 			r = x - fn*pio2_1;
 			w = fn*pio2_1t;
 		} else if (predict_false(r - w > pio4)) {
 			n++;
 			fn++;
 			r = x - fn*pio2_1;
 			w = fn*pio2_1t;
 		}
 		y[0] = r - w;
 		u.f = y[0];
 		/* biased exponent fields of y[0] (ey) and of x (ex) */
 		ey = u.i>>52 & 0x7ff;
 		ex = ix>>20;
 		/* a large exponent drop from x to y[0] means many leading bits
 		 * cancelled, so refine with the next piece of pi/2 */
 		if (ex - ey > 16) { /* 2nd round, good to 118 bits */
 			t = r;
 			w = fn*pio2_2;
 			r = t - w;
 			w = fn*pio2_2t - ((t-r)-w);
 			y[0] = r - w;
 			u.f = y[0];
 			ey = u.i>>52 & 0x7ff;
 			if (ex - ey > 49) {  /* 3rd round, good to 151 bits, covers all cases */
 				t = r;
 				w = fn*pio2_3;
 				r = t - w;
 				w = fn*pio2_3t - ((t-r)-w);
 				y[0] = r - w;
 			}
 		}
 		/* tail: what was lost when r - w was rounded into y[0] */
 		y[1] = (r - y[0]) - w;
 		return n;
 	}
 	/*
 	 * all other (large) arguments
 	 */
 	if (ix >= 0x7ff00000) {  /* x is inf or NaN */
 		y[0] = y[1] = x - x;
 		return 0;
 	}
 	/* set z = scalbn(|x|,-ilogb(x)+23) */
 	u.f = x;
 	/* keep the low 52 bits, then install the biased exponent 0x3ff+23 */
 	u.i &= (uint64_t)-1>>12;
 	u.i |= (uint64_t)(0x3ff + 23)<<52;
 	z = u.f;
 	/* peel off two integer chunks, scaling the rest by 2^24 each time;
 	 * whatever remains goes into tx[2] */
 	for (i=0; i < 2; i++) {
 		tx[i] = (double)(int32_t)z;
 		z     = (z-tx[i])*0x1p24;
 	}
 	tx[i] = z;
 	/* skip zero terms, first term is non-zero */
 	while (tx[i] == 0.0)
 		i--;
 	/* e0 = exponent field of x minus (0x3ff+23); nx = i+1 chunks; prec = 1 */
 	n = __rem_pio2_large(tx,ty,(int)(ix>>20)-(0x3ff+23),i+1,1);
 	if (sign) {
 		y[0] = -ty[0];
 		y[1] = -ty[1];
 		return -n;
 	}
 	y[0] = ty[0];
 	y[1] = ty[1];
 	return n;
 }
--- a/cstdlib/src/__rem_pio2_large.c
+++ b/cstdlib/src/__rem_pio2_large.c
@ -0,0 +1,442 @@
 /* origin: FreeBSD /usr/src/lib/msun/src/k_rem_pio2.c */
 /*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunSoft, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */
 /*
 * __rem_pio2_large(x,y,e0,nx,prec)
 * double x[],y[]; int e0,nx,prec;
 *
 * __rem_pio2_large returns the last three binary digits of N with
 *              y = x - N*pi/2
 * so that |y| < pi/2.
 *
 * The method is to compute the integer (mod 8) and fraction parts of
 * (2/pi)*x without doing the full multiplication. In general we
 * skip the parts of the product that are known to be a huge integer (
 * more accurately, = 0 mod 8 ). Thus the number of operations is
 * independent of the exponent of the input.
 *
 * (2/pi) is represented by an array of 24-bit integers in ipio2[].
 *
 * Input parameters:
 *      x[]     The input value (must be positive) is broken into nx
 *              pieces of 24-bit integers in double precision format.
 *              x[i] will be the i-th 24 bit of x. The scaled exponent
 *              of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
 *              matches x up to 24 bits).
 *
 *              Example of breaking a double positive z into x[0]+x[1]+x[2]:
 *                      e0 = ilogb(z)-23
 *                      z  = scalbn(z,-e0)
 *              for i = 0,1,2
 *                      x[i] = floor(z)
 *                      z    = (z-x[i])*2**24
 *
 *
 *      y[]     output result in an array of double precision numbers.
 *              The dimension of y[] is:
 *                      24-bit  precision       1
 *                      53-bit  precision       2
 *                      64-bit  precision       2
 *                      113-bit precision       3
 *              The actual value is the sum of them. Thus for 113-bit
 *              precision, one may have to do something like:
 *
 *              long double t,w,r_head, r_tail;
 *              t = (long double)y[2] + (long double)y[1];
 *              w = (long double)y[0];
 *              r_head = t+w;
 *              r_tail = w - (r_head - t);
 *
 *      e0      The exponent of x[0]. Must be <= 16360 or you need to
 *              expand the ipio2 table.
 *
 *      nx      dimension of x[]
 *
 *      prec    an integer indicating the precision:
 *                      0       24  bits (single)
 *                      1       53  bits (double)
 *                      2       64  bits (extended)
 *                      3       113 bits (quad)
 *
 * External function:
 *      double scalbn(), floor();
 *
 *
 * Here is the description of some local variables:
 *
 *      jk      jk+1 is the initial number of terms of ipio2[] needed
 *              in the computation. The minimum and recommended value
 *              for jk is 3,4,4,6 for single, double, extended, and quad.
 *              jk+1 must be 2 larger than you might expect so that our
 *              recomputation test works. (Up to 24 bits in the integer
 *              part (the 24 bits of it that we compute) and 23 bits in
 *              the fraction part may be lost to cancelation before we
 *              recompute.)
 *
 *      jz      local integer variable indicating the number of
 *              terms of ipio2[] used.
 *
 *      jx      nx - 1
 *
 *      jv      index for pointing to the suitable ipio2[] for the
 *              computation. In general, we want
 *                      ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
 *              is an integer. Thus
 *                      e0-3-24*jv >= 0 or (e0-3)/24 >= jv
 *              Hence jv = max(0,(e0-3)/24).
 *
 *      jp      jp+1 is the number of terms in PIo2[] needed, jp = jk.
 *
 *      q[]     double array with integral value, representing the
 *              24-bits chunk of the product of x and 2/pi.
 *
 *      q0      the corresponding exponent of q[0]. Note that the
 *              exponent for q[i] would be q0-24*i.
 *
 *      PIo2[]  double precision array, obtained by cutting pi/2
 *              into 24 bits chunks.
 *
 *      f[]     ipio2[] in floating point
 *
 *      iq[]    integer array by breaking up q[] in 24-bits chunk.
 *
 *      fq[]    final product of x*(2/pi) in fq[0],..,fq[jk]
 *
 *      ih      integer. If >0 it indicates q[] is >= 0.5, hence
 *              it also indicates the *sign* of the result.
 *
 */
 /*
 * Constants:
 * The hexadecimal values are the intended ones for the following
 * constants. The decimal values may be used, provided that the
 * compiler will convert from decimal to binary accurately enough
 * to produce the hexadecimal values shown.
 */
 #include "libm.h"
 static const int init_jk[] = {3,4,4,6}; /* initial value for jk */
 /*
 * Table of constants for 2/pi, 396 Hex digits (476 decimal) of 2/pi
 *
 *              integer array, contains the (24*i)-th to (24*i+23)-th
 *              bit of 2/pi after binary point. The corresponding
 *              floating value is
 *
 *                      ipio2[i] * 2^(-24(i+1)).
 *
 * NB: This table must have at least (e0-3)/24 + jk terms.
 *     For quad precision (e0 <= 16360, jk = 6), this is 686.
 */
 static const int32_t ipio2[] = {
 0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
 0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
 0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
 0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
 0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
 0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
 0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
 0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
 0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
 0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
 0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
 #if LDBL_MAX_EXP > 1024
 0x47C419, 0xC367CD, 0xDCE809, 0x2A8359, 0xC4768B, 0x961CA6,
 0xDDAF44, 0xD15719, 0x053EA5, 0xFF0705, 0x3F7E33, 0xE832C2,
 0xDE4F98, 0x327DBB, 0xC33D26, 0xEF6B1E, 0x5EF89F, 0x3A1F35,
 0xCAF27F, 0x1D87F1, 0x21907C, 0x7C246A, 0xFA6ED5, 0x772D30,
 0x433B15, 0xC614B5, 0x9D19C3, 0xC2C4AD, 0x414D2C, 0x5D000C,
 0x467D86, 0x2D71E3, 0x9AC69B, 0x006233, 0x7CD2B4, 0x97A7B4,
 0xD55537, 0xF63ED7, 0x1810A3, 0xFC764D, 0x2A9D64, 0xABD770,
 0xF87C63, 0x57B07A, 0xE71517, 0x5649C0, 0xD9D63B, 0x3884A7,
 0xCB2324, 0x778AD6, 0x23545A, 0xB91F00, 0x1B0AF1, 0xDFCE19,
 0xFF319F, 0x6A1E66, 0x615799, 0x47FBAC, 0xD87F7E, 0xB76522,
 0x89E832, 0x60BFE6, 0xCDC4EF, 0x09366C, 0xD43F5D, 0xD7DE16,
 0xDE3B58, 0x929BDE, 0x2822D2, 0xE88628, 0x4D58E2, 0x32CAC6,
 0x16E308, 0xCB7DE0, 0x50C017, 0xA71DF3, 0x5BE018, 0x34132E,
 0x621283, 0x014883, 0x5B8EF5, 0x7FB0AD, 0xF2E91E, 0x434A48,
 0xD36710, 0xD8DDAA, 0x425FAE, 0xCE616A, 0xA4280A, 0xB499D3,
 0xF2A606, 0x7F775C, 0x83C2A3, 0x883C61, 0x78738A, 0x5A8CAF,
 0xBDD76F, 0x63A62D, 0xCBBFF4, 0xEF818D, 0x67C126, 0x45CA55,
 0x36D9CA, 0xD2A828, 0x8D61C2, 0x77C912, 0x142604, 0x9B4612,
 0xC459C4, 0x44C5C8, 0x91B24D, 0xF31700, 0xAD43D4, 0xE54929,
 0x10D5FD, 0xFCBE00, 0xCC941E, 0xEECE70, 0xF53E13, 0x80F1EC,
 0xC3E7B3, 0x28F8C7, 0x940593, 0x3E71C1, 0xB3092E, 0xF3450B,
 0x9C1288, 0x7B20AB, 0x9FB52E, 0xC29247, 0x2F327B, 0x6D550C,
 0x90A772, 0x1FE76B, 0x96CB31, 0x4A1679, 0xE27941, 0x89DFF4,
 0x9794E8, 0x84E6E2, 0x973199, 0x6BED88, 0x365F5F, 0x0EFDBB,
 0xB49A48, 0x6CA467, 0x427271, 0x325D8D, 0xB8159F, 0x09E5BC,
 0x25318D, 0x3974F7, 0x1C0530, 0x010C0D, 0x68084B, 0x58EE2C,
 0x90AA47, 0x02E774, 0x24D6BD, 0xA67DF7, 0x72486E, 0xEF169F,
 0xA6948E, 0xF691B4, 0x5153D1, 0xF20ACF, 0x339820, 0x7E4BF5,
 0x6863B2, 0x5F3EDD, 0x035D40, 0x7F8985, 0x295255, 0xC06437,
 0x10D86D, 0x324832, 0x754C5B, 0xD4714E, 0x6E5445, 0xC1090B,
 0x69F52A, 0xD56614, 0x9D0727, 0x50045D, 0xDB3BB4, 0xC576EA,
 0x17F987, 0x7D6B49, 0xBA271D, 0x296996, 0xACCCC6, 0x5414AD,
 0x6AE290, 0x89D988, 0x50722C, 0xBEA404, 0x940777, 0x7030F3,
 0x27FC00, 0xA871EA, 0x49C266, 0x3DE064, 0x83DD97, 0x973FA3,
 0xFD9443, 0x8C860D, 0xDE4131, 0x9D3992, 0x8C70DD, 0xE7B717,
 0x3BDF08, 0x2B3715, 0xA0805C, 0x93805A, 0x921110, 0xD8E80F,
 0xAF806C, 0x4BFFDB, 0x0F9038, 0x761859, 0x15A562, 0xBBCB61,
 0xB989C7, 0xBD4010, 0x04F2D2, 0x277549, 0xF6B6EB, 0xBB22DB,
 0xAA140A, 0x2F2689, 0x768364, 0x333B09, 0x1A940E, 0xAA3A51,
 0xC2A31D, 0xAEEDAF, 0x12265C, 0x4DC26D, 0x9C7A2D, 0x9756C0,
 0x833F03, 0xF6F009, 0x8C402B, 0x99316D, 0x07B439, 0x15200C,
 0x5BC3D8, 0xC492F5, 0x4BADC6, 0xA5CA4E, 0xCD37A7, 0x36A9E6,
 0x9492AB, 0x6842DD, 0xDE6319, 0xEF8C76, 0x528B68, 0x37DBFC,
 0xABA1AE, 0x3115DF, 0xA1AE00, 0xDAFB0C, 0x664D64, 0xB705ED,
 0x306529, 0xBF5657, 0x3AFF47, 0xB9F96A, 0xF3BE75, 0xDF9328,
 0x3080AB, 0xF68C66, 0x15CB04, 0x0622FA, 0x1DE4D9, 0xA4B33D,
 0x8F1B57, 0x09CD36, 0xE9424E, 0xA4BE13, 0xB52333, 0x1AAAF0,
 0xA8654F, 0xA5C1D2, 0x0F3F0B, 0xCD785B, 0x76F923, 0x048B7B,
 0x721789, 0x53A6C6, 0xE26E6F, 0x00EBEF, 0x584A9B, 0xB7DAC4,
 0xBA66AA, 0xCFCF76, 0x1D02D1, 0x2DF1B1, 0xC1998C, 0x77ADC3,
 0xDA4886, 0xA05DF7, 0xF480C6, 0x2FF0AC, 0x9AECDD, 0xBC5C3F,
 0x6DDED0, 0x1FC790, 0xB6DB2A, 0x3A25A3, 0x9AAF00, 0x9353AD,
 0x0457B6, 0xB42D29, 0x7E804B, 0xA707DA, 0x0EAA76, 0xA1597B,
 0x2A1216, 0x2DB7DC, 0xFDE5FA, 0xFEDB89, 0xFDBE89, 0x6C76E4,
 0xFCA906, 0x70803E, 0x156E85, 0xFF87FD, 0x073E28, 0x336761,
 0x86182A, 0xEABD4D, 0xAFE7B3, 0x6E6D8F, 0x396795, 0x5BBF31,
 0x48D784, 0x16DF30, 0x432DC7, 0x356125, 0xCE70C9, 0xB8CB30,
 0xFD6CBF, 0xA200A4, 0xE46C05, 0xA0DD5A, 0x476F21, 0xD21262,
 0x845CB9, 0x496170, 0xE0566B, 0x015299, 0x375550, 0xB7D51E,
 0xC4F133, 0x5F6E13, 0xE4305D, 0xA92E85, 0xC3B21D, 0x3632A1,
 0xA4B708, 0xD4B1EA, 0x21F716, 0xE4698F, 0x77FF27, 0x80030C,
 0x2D408D, 0xA0CD4F, 0x99A520, 0xD3A2B3, 0x0A5D2F, 0x42F9B4,
 0xCBDA11, 0xD0BE7D, 0xC1DB9B, 0xBD17AB, 0x81A2CA, 0x5C6A08,
 0x17552E, 0x550027, 0xF0147F, 0x8607E1, 0x640B14, 0x8D4196,
 0xDEBE87, 0x2AFDDA, 0xB6256B, 0x34897B, 0xFEF305, 0x9EBFB9,
 0x4F6A68, 0xA82A4A, 0x5AC44F, 0xBCF82D, 0x985AD7, 0x95C7F4,
 0x8D4D0D, 0xA63A20, 0x5F57A4, 0xB13F14, 0x953880, 0x0120CC,
 0x86DD71, 0xB6DEC9, 0xF560BF, 0x11654D, 0x6B0701, 0xACB08C,
 0xD0C0B2, 0x485551, 0x0EFB1E, 0xC37295, 0x3B06A3, 0x3540C0,
 0x7BDC06, 0xCC45E0, 0xFA294E, 0xC8CAD6, 0x41F3E8, 0xDE647C,
 0xD8649B, 0x31BED9, 0xC397A4, 0xD45877, 0xC5E369, 0x13DAF0,
 0x3C3ABA, 0x461846, 0x5F7555, 0xF5BDD2, 0xC6926E, 0x5D2EAC,
 0xED440E, 0x423E1C, 0x87C461, 0xE9FD29, 0xF3D6E7, 0xCA7C22,
 0x35916F, 0xC5E008, 0x8DD7FF, 0xE26A6E, 0xC6FDB0, 0xC10893,
 0x745D7C, 0xB2AD6B, 0x9D6ECD, 0x7B723E, 0x6A11C6, 0xA9CFF7,
 0xDF7329, 0xBAC9B5, 0x5100B7, 0x0DB2E2, 0x24BA74, 0x607DE5,
 0x8AD874, 0x2C150D, 0x0C1881, 0x94667E, 0x162901, 0x767A9F,
 0xBEFDFD, 0xEF4556, 0x367ED9, 0x13D9EC, 0xB9BA8B, 0xFC97C4,
 0x27A831, 0xC36EF1, 0x36C594, 0x56A8D8, 0xB5A8B4, 0x0ECCCF,
 0x2D8912, 0x34576F, 0x89562C, 0xE3CE99, 0xB920D6, 0xAA5E6B,
 0x9C2A3E, 0xCC5F11, 0x4A0BFD, 0xFBF4E1, 0x6D3B8E, 0x2C86E2,
 0x84D4E9, 0xA9B4FC, 0xD1EEEF, 0xC9352E, 0x61392F, 0x442138,
 0xC8D91B, 0x0AFC81, 0x6A4AFB, 0xD81C2F, 0x84B453, 0x8C994E,
 0xCC2254, 0xDC552A, 0xD6C6C0, 0x96190B, 0xB8701A, 0x649569,
 0x605A26, 0xEE523F, 0x0F117F, 0x11B5F4, 0xF5CBFC, 0x2DBC34,
 0xEEBC34, 0xCC5DE8, 0x605EDD, 0x9B8E67, 0xEF3392, 0xB817C9,
 0x9B5861, 0xBC57E1, 0xC68351, 0x103ED8, 0x4871DD, 0xDD1C2D,
 0xA118AF, 0x462C21, 0xD7F359, 0x987AD9, 0xC0549E, 0xFA864F,
 0xFC0656, 0xAE79E5, 0x362289, 0x22AD38, 0xDC9367, 0xAAE855,
 0x382682, 0x9BE7CA, 0xA40D51, 0xB13399, 0x0ED7A9, 0x480569,
 0xF0B265, 0xA7887F, 0x974C88, 0x36D1F9, 0xB39221, 0x4A827B,
 0x21CF98, 0xDC9F40, 0x5547DC, 0x3A74E1, 0x42EB67, 0xDF9DFE,
 0x5FD45E, 0xA4677B, 0x7AACBA, 0xA2F655, 0x23882B, 0x55BA41,
 0x086E59, 0x862A21, 0x834739, 0xE6E389, 0xD49EE5, 0x40FB49,
 0xE956FF, 0xCA0F1C, 0x8A59C5, 0x2BFA94, 0xC5C1D3, 0xCFC50F,
 0xAE5ADB, 0x86C547, 0x624385, 0x3B8621, 0x94792C, 0x876110,
 0x7B4C2A, 0x1A2C80, 0x12BF43, 0x902688, 0x893C78, 0xE4C4A8,
 0x7BDBE5, 0xC23AC4, 0xEAF426, 0x8A67F7, 0xBF920D, 0x2BA365,
 0xB1933D, 0x0B7CBD, 0xDC51A4, 0x63DD27, 0xDDE169, 0x19949A,
 0x9529A8, 0x28CE68, 0xB4ED09, 0x209F44, 0xCA984E, 0x638270,
 0x237C7E, 0x32B90F, 0x8EF5A7, 0xE75614, 0x08F121, 0x2A9DB5,
 0x4D7E6F, 0x5119A5, 0xABF9B5, 0xD6DF82, 0x61DD96, 0x023616,
 0x9F3AC4, 0xA1A283, 0x6DED72, 0x7A8D39, 0xA9B882, 0x5C326B,
 0x5B2746, 0xED3400, 0x7700D2, 0x55F4FC, 0x4D5901, 0x8071E0,
 #endif
 };
 static const double PIo2[] = {
  1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
  7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
  5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
  3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
  1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
  1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
  2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
  2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
 };
 /*
  * Reduce the value held in x[0..nx-1] (24-bit chunks, x[0] scaled by 2^e0)
  * modulo pi/2 using the ipio2[] table of 2/pi.
  *
  * x     input chunks
  * y     output pieces of the remainder; how many are written depends on
  *       prec (one for prec 0, two for prec 1 and 2, three for prec 3)
  * e0    exponent of x[0]
  * nx    number of entries in x[]
  * prec  precision selector; used directly as an index into init_jk[] and
  *       as the selector of the final switch, without a range check here
  *
  * Returns n & 7, the low three bits of the quotient.
  *
  * NOTE(review): the scratch arrays f, fq, q and iq hold 20 entries each and
  * nothing in this function checks jx+jk or jz+k against that size; this
  * presumably relies on the small nx/prec values passed by the callers.
  */
 int __rem_pio2_large(double *x, double *y, int e0, int nx, int prec)
 {
 	int32_t jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
 	double z,fw,f[20],fq[20],q[20];
 	/* initialize jk*/
 	jk = init_jk[prec];
 	jp = jk;
 	/* determine jx,jv,q0, note that 3>q0 */
 	jx = nx-1;
 	jv = (e0-3)/24;  if(jv<0) jv=0;
 	q0 = e0-24*(jv+1);
 	/* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
 	/* entries whose table index would be negative are filled with 0.0 */
 	j = jv-jx; m = jx+jk;
 	for (i=0; i<=m; i++,j++)
 		f[i] = j<0 ? 0.0 : (double)ipio2[j];
 	/* compute q[0],q[1],...q[jk] */
 	for (i=0; i<=jk; i++) {
 		for (j=0,fw=0.0; j<=jx; j++)
 			fw += x[j]*f[jx+i-j];
 		q[i] = fw;
 	}
 	jz = jk;
 	/* re-entered from below with a larger jz when every computed chunk is zero */
 recompute:
 	/* distill q[] into iq[] reversingly */
 	for (i=0,j=jz,z=q[jz]; j>0; i++,j--) {
 		fw    = (double)(int32_t)(0x1p-24*z);
 		iq[i] = (int32_t)(z - 0x1p24*fw);
 		z     = q[j-1]+fw;
 	}
 	/* compute n */
 	z  = scalbn(z,q0);       /* actual value of z */
 	z -= 8.0*floor(z*0.125); /* trim off integer >= 8 */
 	n  = (int32_t)z;
 	z -= (double)n;
 	ih = 0;
 	if (q0 > 0) {  /* need iq[jz-1] to determine n */
 		i  = iq[jz-1]>>(24-q0); n += i;
 		iq[jz-1] -= i<<(24-q0);
 		ih = iq[jz-1]>>(23-q0);
 	}
 	else if (q0 == 0) ih = iq[jz-1]>>23;
 	else if (z >= 0.5) ih = 2;
 	if (ih > 0) {  /* q > 0.5 */
 		n += 1; carry = 0;
 		for (i=0; i<jz; i++) {  /* compute 1-q */
 			j = iq[i];
 			if (carry == 0) {
 				if (j != 0) {
 					carry = 1;
 					iq[i] = 0x1000000 - j;
 				}
 			} else
 				iq[i] = 0xffffff - j;
 		}
 		if (q0 > 0) {  /* rare case: chance is 1 in 12 */
 			/* per the note above 3>q0, so 1 and 2 are the only positive values */
 			switch(q0) {
 			case 1:
 				iq[jz-1] &= 0x7fffff; break;
 			case 2:
 				iq[jz-1] &= 0x3fffff; break;
 			}
 		}
 		if (ih == 2) {
 			z = 1.0 - z;
 			if (carry != 0)
 				z -= scalbn(1.0,q0);
 		}
 	}
 	/* check if recomputation is needed */
 	if (z == 0.0) {
 		j = 0;
 		for (i=jz-1; i>=jk; i--) j |= iq[i];
 		if (j == 0) {  /* need recomputation */
 			for (k=1; iq[jk-k]==0; k++);  /* k = no. of terms needed */
 			for (i=jz+1; i<=jz+k; i++) {  /* add q[jz+1] to q[jz+k] */
 				f[jx+i] = (double)ipio2[jv+i];
 				for (j=0,fw=0.0; j<=jx; j++)
 					fw += x[j]*f[jx+i-j];
 				q[i] = fw;
 			}
 			jz += k;
 			goto recompute;
 		}
 	}
 	/* chop off zero terms */
 	if (z == 0.0) {
 		jz -= 1;
 		q0 -= 24;
 		while (iq[jz] == 0) {
 			jz--;
 			q0 -= 24;
 		}
 	} else { /* break z into 24-bit if necessary */
 		z = scalbn(z,-q0);
 		if (z >= 0x1p24) {
 			fw = (double)(int32_t)(0x1p-24*z);
 			iq[jz] = (int32_t)(z - 0x1p24*fw);
 			jz += 1;
 			q0 += 24;
 			iq[jz] = (int32_t)fw;
 		} else
 			iq[jz] = (int32_t)z;
 	}
 	/* convert integer "bit" chunk to floating-point value */
 	/* q[] is reused here: q[jz] gets weight 2^q0, each lower index 2^-24 less */
 	fw = scalbn(1.0,q0);
 	for (i=jz; i>=0; i--) {
 		q[i] = fw*(double)iq[i];
 		fw *= 0x1p-24;
 	}
 	/* compute PIo2[0,...,jp]*q[jz,...,0] */
 	for(i=jz; i>=0; i--) {
 		for (fw=0.0,k=0; k<=jp && k<=jz-i; k++)
 			fw += PIo2[k]*q[i+k];
 		fq[jz-i] = fw;
 	}
 	/* compress fq[] into y[] */
 	/* in every case the outputs are negated when ih is non-zero */
 	switch(prec) {
 	case 0:
 		fw = 0.0;
 		for (i=jz; i>=0; i--)
 			fw += fq[i];
 		y[0] = ih==0 ? fw : -fw;
 		break;
 	case 1:
 	case 2:
 		fw = 0.0;
 		for (i=jz; i>=0; i--)
 			fw += fq[i];
 		// TODO: drop excess precision here once double_t is used
 		fw = (double)fw;
 		y[0] = ih==0 ? fw : -fw;
 		/* y[1] = fq[0] minus the rounded sum, plus the remaining terms */
 		fw = fq[0]-fw;
 		for (i=1; i<=jz; i++)
 			fw += fq[i];
 		y[1] = ih==0 ? fw : -fw;
 		break;
 	case 3:  /* painful */
 		for (i=jz; i>0; i--) {
 			fw      = fq[i-1]+fq[i];
 			fq[i]  += fq[i-1]-fw;
 			fq[i-1] = fw;
 		}
 		for (i=jz; i>1; i--) {
 			fw      = fq[i-1]+fq[i];
 			fq[i]  += fq[i-1]-fw;
 			fq[i-1] = fw;
 		}
 		for (fw=0.0,i=jz; i>=2; i--)
 			fw += fq[i];
 		if (ih==0) {
 			y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
 		} else {
 			y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
 		}
 	}
 	/* only the low three bits of the quotient are reported */
 	return n&7;
 }
--- a/cstdlib/src/__rem_pio2f.c
+++ b/cstdlib/src/__rem_pio2f.c
@ -0,0 +1,86 @@
 /* origin: FreeBSD /usr/src/lib/msun/src/e_rem_pio2f.c */
 /*
 * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
 * Debugged and optimized by Bruce D. Evans.
 */
 /*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunPro, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */
 /* __rem_pio2f(x,y)
 *
 * return the remainder of x rem pi/2 in *y
 * use double precision for everything except passing x
 * use __rem_pio2_large() for large x
 */
 #include "libm.h"
 #if FLT_EVAL_METHOD==0 || FLT_EVAL_METHOD==1
 #define EPS DBL_EPSILON
 #elif FLT_EVAL_METHOD==2
 #define EPS LDBL_EPSILON
 #endif
 /*
 * invpio2:  53 bits of 2/pi
 * pio2_1:   first 25 bits of pi/2
 * pio2_1t:  pi/2 - pio2_1
 */
 static const double
 toint   = 1.5/EPS,
 pio4    = 0x1.921fb6p-1,
 invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
 pio2_1  = 1.57079631090164184570e+00, /* 0x3FF921FB, 0x50000000 */
 pio2_1t = 1.58932547735281966916e-08; /* 0x3E5110b4, 0x611A6263 */
 /*
  * Float argument reduction: stores x rem pi/2 in *y (as a double) and
  * returns the multiple of pi/2 that was removed.  Medium-sized inputs are
  * handled inline; larger finite inputs go through __rem_pio2_large() with a
  * single 24-bit chunk and prec 0.  For inf or NaN, *y is set to x-x and 0
  * is returned.
  */
 int __rem_pio2f(float x, double *y)
 {
 	union {float f; uint32_t i;} bits = {x};
 	const uint32_t absbits = bits.i & 0x7fffffff;

 	/* 25+53 bit pi is good enough for medium size */
 	if (absbits < 0x4dc90fdb) {  /* |x| ~< 2^28*(pi/2), medium size */
 		/* Use a specialized rint() to get the quotient. */
 		double_t quot = (double_t)x*invpio2 + toint - toint;
 		int steps = (int32_t)quot;

 		*y = x - quot*pio2_1 - quot*pio2_1t;
 		/* Matters with directed rounding. */
 		if (predict_false(*y < -pio4)) {
 			steps--;
 			quot--;
 			*y = x - quot*pio2_1 - quot*pio2_1t;
 		} else if (predict_false(*y > pio4)) {
 			steps++;
 			quot++;
 			*y = x - quot*pio2_1 - quot*pio2_1t;
 		}
 		return steps;
 	}

 	if (absbits >= 0x7f800000) {  /* x is inf or NaN */
 		*y = x-x;
 		return 0;
 	}

 	/* scale x into [2^23, 2^24-1] */
 	const int negative = bits.i>>31;
 	const int exp0 = (absbits>>23) - (0x7f+23);  /* ilogb(|x|)-23, positive */
 	double chunk[1], rem[1];

 	bits.i = absbits - (exp0<<23);
 	chunk[0] = bits.f;
 	const int large_steps = __rem_pio2_large(chunk,rem,exp0,1,0);

 	/* the large-argument helper works on |x|; restore the sign here */
 	*y = negative ? -rem[0] : rem[0];
 	return negative ? -large_steps : large_steps;
 }
--- a/cstdlib/src/__sin.c
+++ b/cstdlib/src/__sin.c
@ -0,0 +1,64 @@
 /* origin: FreeBSD /usr/src/lib/msun/src/k_sin.c */
 /*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunSoft, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */
 /* __sin( x, y, iy)
 * kernel sin function on ~[-pi/4, pi/4] (except on -0), pi/4 ~ 0.7854
 * Input x is assumed to be bounded by ~pi/4 in magnitude.
 * Input y is the tail of x.
 * Input iy indicates whether y is 0 (if iy=0, y is assumed to be 0).
 *
 * Algorithm
 *      1. Since sin(-x) = -sin(x), we need only to consider positive x.
 *      2. Callers must return sin(-0) = -0 without calling here since our
 *         odd polynomial is not evaluated in a way that preserves -0.
 *         Callers may do the optimization sin(x) ~ x for tiny x.
 *      3. sin(x) is approximated by a polynomial of degree 13 on
 *         [0,pi/4]
 *                               3            13
 *              sin(x) ~ x + S1*x + ... + S6*x
 *         where
 *
 *      |sin(x)         2     4     6     8     10     12  |     -58
 *      |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x  +S6*x   )| <= 2
 *      |  x                                               |
 *
 *      4. sin(x+y) = sin(x) + sin'(x')*y
 *                  ~ sin(x) + (1-x*x/2)*y
 *         For better accuracy, let
 *                   3      2      2      2      2
 *              r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
 *         then                   3    2
 *              sin(x) = x + (S1*x + (x *(r-y/2)+y))
 */
 #include "libm.h"
 /* Coefficients S1..S6 of the degree-13 sine polynomial described above. */
 static const double S1 = -1.66666666666666324348e-01; /* 0xBFC55555, 0x55555549 */
 static const double S2 =  8.33333333332248946124e-03; /* 0x3F811111, 0x1110F8A6 */
 static const double S3 = -1.98412698298579493134e-04; /* 0xBF2A01A0, 0x19C161D5 */
 static const double S4 =  2.75573137070700676789e-06; /* 0x3EC71DE3, 0x57B1FE7D */
 static const double S5 = -2.50507602534068634195e-08; /* 0xBE5AE5E6, 0x8A2B9CEB */
 static const double S6 =  1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */

 /*
  * Kernel sine of x+y, where x is the head and y the tail of the argument.
  * The caller is expected to have reduced x to roughly [-pi/4, pi/4].
  * iy == 0 selects the form that ignores y altogether; any other value
  * folds the tail into the result.
  */
 double __sin(double x, double y, int iy)
 {
 	const double_t x2 = x*x;
 	const double_t x4 = x2*x2;
 	const double_t poly = S2 + x2*(S3 + x2*S4) + x2*x4*(S5 + x2*S6);
 	const double_t x3 = x2*x;

 	if (iy != 0)
 		return x - ((x2*(0.5*y - x3*poly) - y) - x3*S1);
 	return x + x3*(S1 + x2*poly);
 }
--- a/cstdlib/src/__sindf.c
+++ b/cstdlib/src/__sindf.c
@ -0,0 +1,36 @@
 /* origin: FreeBSD /usr/src/lib/msun/src/k_sinf.c */
 /*
 * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
 * Optimized by Bruce D. Evans.
 */
 /*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunPro, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */
 #include "libm.h"
 /* |sin(x)/x - s(x)| < 2**-37.5 (~[-4.89e-12, 4.824e-12]). */
 /* S1..S4 are the coefficients of x^3, x^5, x^7 and x^9 in the polynomial. */
 static const double
 S1 = -0x15555554cbac77.0p-55, /* -0.166666666416265235595 */
 S2 =  0x111110896efbb2.0p-59, /*  0.0083333293858894631756 */
 S3 = -0x1a00f9e2cae774.0p-65, /* -0.000198393348360966317347 */
 S4 =  0x16cd878c3b46a7.0p-71; /*  0.0000027183114939898219064 */
 /*
  * Float kernel sine: evaluates x + S1*x^3 + S2*x^5 + S3*x^7 + S4*x^9 in
  * double precision and returns the result converted to float.
  * The callers in this file pass an argument already reduced to roughly
  * [-pi/4, pi/4].
  */
 float __sindf(double x)
 {
 	double_t r, s, w, z;
 	/* Try to optimize for parallel evaluation as in __tandf.c. */
 	z = x*x;        /* x^2 */
 	w = z*z;        /* x^4 */
 	r = S3 + z*S4;  /* high-order pair, multiplied by x^7 below */
 	s = z*x;        /* x^3 */
 	/* low-order terms and high-order terms are summed separately */
 	return (x + s*(S1 + z*S2)) + s*w*r;
 }
--- a/cstdlib/src/cos.c
+++ b/cstdlib/src/cos.c
@ -0,0 +1,77 @@
 /* origin: FreeBSD /usr/src/lib/msun/src/s_cos.c */
 /*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunPro, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */
 /* cos(x)
 * Return cosine function of x.
 *
 * kernel function:
 *      __sin           ... sine function on [-pi/4,pi/4]
 *      __cos           ... cosine function on [-pi/4,pi/4]
 *      __rem_pio2      ... argument reduction routine
 *
 * Method.
 *      Let S,C and T denote the sin, cos and tan respectively on
 *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
 *      in [-pi/4 , +pi/4], and let n = k mod 4.
 *      We have
 *
 *          n        sin(x)      cos(x)        tan(x)
 *     ----------------------------------------------------------
 *          0          S           C             T
 *          1          C          -S            -1/T
 *          2         -S          -C             T
 *          3         -C           S            -1/T
 *     ----------------------------------------------------------
 *
 * Special cases:
 *      Let trig be any of sin, cos, or tan.
 *      trig(+-INF)  is NaN, with signals;
 *      trig(NaN)    is that NaN;
 *
 * Accuracy:
 *      TRIG(x) returns trig(x) nearly rounded
 */
 #include "libm.h"
 /*
  * Cosine of x (radians), double precision.
  * Small arguments go straight to the __cos kernel; larger finite ones are
  * reduced with __rem_pio2 and dispatched on the quadrant; Inf/NaN yield NaN.
  */
 double cos(double x)
 {
 	double reduced[2];  /* reduced argument: head in [0], tail in [1] */
 	uint32_t hi;        /* high word of x with the sign bit cleared */
 	unsigned quadrant;
 	GET_HIGH_WORD(hi, x);
 	hi &= 0x7fffffff;
 	/* cos(Inf or NaN) is NaN */
 	if (hi >= 0x7ff00000)
 		return x-x;
 	/* |x| ~> pi/4: argument reduction, then pick the kernel by quadrant */
 	if (hi > 0x3fe921fb) {
 		quadrant = __rem_pio2(x, reduced);
 		quadrant &= 3;
 		if (quadrant == 0)
 			return  __cos(reduced[0], reduced[1]);
 		if (quadrant == 1)
 			return -__sin(reduced[0], reduced[1], 1);
 		if (quadrant == 2)
 			return -__cos(reduced[0], reduced[1]);
 		return  __sin(reduced[0], reduced[1], 1);
 	}
 	/* 2**-27 * sqrt(2) <= |x| ~< pi/4: the kernel needs no reduction */
 	if (hi >= 0x3e46a09e)
 		return __cos(x, 0);
 	/* |x| < 2**-27 * sqrt(2): raise inexact if x!=0 */
 	FORCE_EVAL(x + 0x1p120f);
 	return 1.0;
 }
--- a/cstdlib/src/cosf.c
+++ b/cstdlib/src/cosf.c
@ -0,0 +1,78 @@
 /* origin: FreeBSD /usr/src/lib/msun/src/s_cosf.c */
 /*
 * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
 * Optimized by Bruce D. Evans.
 */
 /*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunPro, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */
 #include "libm.h"
 /* Small multiples of pi/2 rounded to double precision. */
 static const double
 c1pio2 = 1*M_PI_2, /* 0x3FF921FB, 0x54442D18 */
 c2pio2 = 2*M_PI_2, /* 0x400921FB, 0x54442D18 */
 c3pio2 = 3*M_PI_2, /* 0x4012D97C, 0x7F3321D2 */
 c4pio2 = 4*M_PI_2; /* 0x401921FB, 0x54442D18 */
 /*
  * Cosine of x (radians), single precision.
  * Arguments up to ~9*pi/4 are shifted by a multiple of pi/2 in double
  * precision and handed to the __cosdf/__sindf kernels; larger finite
  * arguments go through __rem_pio2f; Inf/NaN yield NaN.
  */
 float cosf(float x)
 {
 	double reduced;
 	uint32_t bits;               /* float word of x, sign bit cleared below */
 	unsigned quadrant, negative;
 	GET_FLOAT_WORD(bits, x);
 	negative = bits >> 31;
 	bits &= 0x7fffffff;
 	if (bits <= 0x3f490fda) {  /* |x| ~<= pi/4 */
 		if (bits >= 0x39800000)  /* |x| >= 2**-12 */
 			return __cosdf(x);
 		/* raise inexact if x != 0 */
 		FORCE_EVAL(x + 0x1p120f);
 		return 1.0f;
 	}
 	if (bits <= 0x407b53d1) {  /* |x| ~<= 5*pi/4 */
 		if (bits > 0x4016cbe3)  /* |x|  ~> 3*pi/4 */
 			return -__cosdf(negative ? x+c2pio2 : x-c2pio2);
 		if (negative)
 			return __sindf(x + c1pio2);
 		return __sindf(c1pio2 - x);
 	}
 	if (bits <= 0x40e231d5) {  /* |x| ~<= 9*pi/4 */
 		if (bits > 0x40afeddf)  /* |x| ~> 7*pi/4 */
 			return __cosdf(negative ? x+c4pio2 : x-c4pio2);
 		if (negative)
 			return __sindf(-x - c3pio2);
 		return __sindf(x - c3pio2);
 	}
 	/* cos(Inf or NaN) is NaN */
 	if (bits >= 0x7f800000)
 		return x-x;
 	/* general argument reduction needed */
 	quadrant = __rem_pio2f(x,&reduced);
 	quadrant &= 3;
 	if (quadrant == 0)
 		return  __cosdf(reduced);
 	if (quadrant == 1)
 		return  __sindf(-reduced);
 	if (quadrant == 2)
 		return -__cosdf(reduced);
 	return  __sindf(reduced);
 }
--- a/cstdlib/src/libm.h
+++ b/cstdlib/src/libm.h
@ -108,6 +108,27 @@ do {                                              \
 #define SET_LOW_WORD(d,lo)                        \
  INSERT_WORDS(d, asuint64(d)>>32, lo)
 /* Store asuint(d) in w (presumably the raw 32-bit pattern of the float d;
    confirm against asuint's definition). */
 #define GET_FLOAT_WORD(w,d)                       \
 do {                                              \
  (w) = asuint(d);                                \
 } while (0)
 /* Store asfloat(w) in d (presumably the float whose bit pattern is w;
    confirm against asfloat's definition). */
 #define SET_FLOAT_WORD(d,w)                       \
 do {                                              \
  (d) = asfloat(w);                               \
 } while (0)
 /* Argument reduction and kernel routines shared by the trig functions. */
 /* NOTE(review): not defined in this part of the tree; presumably the
    back end used by __rem_pio2 for large arguments -- confirm. */
 int    __rem_pio2_large(double*,double*,int,int,int);
 /* Argument reduction for double: cos()/sin() use the return value mod 4 as
    the quadrant and the two-element output as head and tail. */
 int    __rem_pio2(double,double*);
 /* Kernel sine (x, tail y, iy flag saying whether y is used). */
 double __sin(double,double,int);
 /* Kernel cosine (x, tail y). */
 double __cos(double,double);
 /* Argument reduction for float: cosf()/sinf() use the return value mod 4 as
    the quadrant and the single double output as the reduced argument. */
 int    __rem_pio2f(float,double*);
 /* Float kernels; they take the reduced argument as a double. */
 float  __sindf(double);
 float  __cosdf(double);
 /* Called by sqrtf() for NaN and negative inputs. */
 float __math_invalidf(float);
 /* NOTE(review): the three helpers below are not used in the sources shown
    here; by name they look like overflow/underflow result helpers -- confirm. */
 double __math_xflow(uint32_t, double);
 double __math_uflow(uint32_t);
 double __math_oflow(uint32_t);
--- a/cstdlib/src/scalbn.c
+++ b/cstdlib/src/scalbn.c
@ -0,0 +1,33 @@
 #include <math.h>
 #include <stdint.h>
 /*
  * Returns x * 2^n.
  * An exponent outside the range of a single power-of-two double is applied
  * in up to two preliminary scaling passes; whatever is still left over is
  * clamped, by which point the value has already overflowed or underflowed.
  */
 double scalbn(double x, int n)
 {
 	union {double f; uint64_t i;} scale;
 	double_t acc = x;
 	int pass;
 	if (n > 1023) {
 		for (pass = 0; pass < 2 && n > 1023; pass++) {
 			acc *= 0x1p1023;
 			n -= 1023;
 		}
 		if (n > 1023)
 			n = 1023;
 	} else if (n < -1022) {
 		/* make sure final n < -53 to avoid double
 		   rounding in the subnormal range */
 		for (pass = 0; pass < 2 && n < -1022; pass++) {
 			acc *= 0x1p-1022 * 0x1p53;
 			n += 1022 - 53;
 		}
 		if (n < -1022)
 			n = -1022;
 	}
 	/* build 2^n directly from its biased exponent field */
 	scale.i = (uint64_t)(0x3ff+n)<<52;
 	x = acc * scale.f;
 	return x;
 }
--- a/cstdlib/src/sin.c
+++ b/cstdlib/src/sin.c
@ -0,0 +1,78 @@
 /* origin: FreeBSD /usr/src/lib/msun/src/s_sin.c */
 /*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunPro, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */
 /* sin(x)
 * Return sine function of x.
 *
 * kernel function:
 *      __sin            ... sine function on [-pi/4,pi/4]
 *      __cos            ... cosine function on [-pi/4,pi/4]
 *      __rem_pio2       ... argument reduction routine
 *
 * Method.
 *      Let S,C and T denote the sin, cos and tan respectively on
 *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
 *      in [-pi/4 , +pi/4], and let n = k mod 4.
 *      We have
 *
 *          n        sin(x)      cos(x)        tan(x)
 *     ----------------------------------------------------------
 *          0          S           C             T
 *          1          C          -S            -1/T
 *          2         -S          -C             T
 *          3         -C           S            -1/T
 *     ----------------------------------------------------------
 *
 * Special cases:
 *      Let trig be any of sin, cos, or tan.
 *      trig(+-INF)  is NaN, with signals;
 *      trig(NaN)    is that NaN;
 *
 * Accuracy:
 *      TRIG(x) returns trig(x) nearly rounded
 */
 #include "libm.h"
 /*
  * Sine of x (radians), double precision.
  * Small arguments go straight to the __sin kernel; larger finite ones are
  * reduced with __rem_pio2 and dispatched on the quadrant; Inf/NaN yield NaN.
  */
 double sin(double x)
 {
 	double reduced[2];  /* reduced argument: head in [0], tail in [1] */
 	uint32_t hi;        /* high word of x with the sign bit cleared */
 	unsigned quadrant;
 	GET_HIGH_WORD(hi, x);
 	hi &= 0x7fffffff;
 	/* sin(Inf or NaN) is NaN */
 	if (hi >= 0x7ff00000)
 		return x - x;
 	/* |x| ~> pi/4: argument reduction needed */
 	if (hi > 0x3fe921fb) {
 		quadrant = __rem_pio2(x, reduced);
 		quadrant &= 3;
 		if (quadrant == 0)
 			return  __sin(reduced[0], reduced[1], 1);
 		if (quadrant == 1)
 			return  __cos(reduced[0], reduced[1]);
 		if (quadrant == 2)
 			return -__sin(reduced[0], reduced[1], 1);
 		return -__cos(reduced[0], reduced[1]);
 	}
 	/* 2**-26 <= |x| ~< pi/4: the kernel needs no reduction */
 	if (hi >= 0x3e500000)
 		return __sin(x, 0.0, 0);
 	/* |x| < 2**-26: raise inexact if x != 0 and underflow if subnormal */
 	FORCE_EVAL(hi < 0x00100000 ? x/0x1p120f : x+0x1p120f);
 	return x;
 }
--- a/cstdlib/src/sinf.c
+++ b/cstdlib/src/sinf.c
@ -0,0 +1,76 @@
 /* origin: FreeBSD /usr/src/lib/msun/src/s_sinf.c */
 /*
 * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
 * Optimized by Bruce D. Evans.
 */
 /*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunPro, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */
 #include "libm.h"
 /* Small multiples of pi/2 rounded to double precision. */
 static const double
 s1pio2 = 1*M_PI_2, /* 0x3FF921FB, 0x54442D18 */
 s2pio2 = 2*M_PI_2, /* 0x400921FB, 0x54442D18 */
 s3pio2 = 3*M_PI_2, /* 0x4012D97C, 0x7F3321D2 */
 s4pio2 = 4*M_PI_2; /* 0x401921FB, 0x54442D18 */
 /*
  * Sine of x (radians), single precision.
  * Arguments up to ~9*pi/4 are shifted by a multiple of pi/2 in double
  * precision and handed to the __sindf/__cosdf kernels; larger finite
  * arguments go through __rem_pio2f; Inf/NaN yield NaN.
  */
 float sinf(float x)
 {
 	double reduced;
 	uint32_t bits;            /* float word of x, sign bit cleared below */
 	int quadrant, negative;
 	GET_FLOAT_WORD(bits, x);
 	negative = bits >> 31;
 	bits &= 0x7fffffff;
 	if (bits <= 0x3f490fda) {  /* |x| ~<= pi/4 */
 		if (bits >= 0x39800000)  /* |x| >= 2**-12 */
 			return __sindf(x);
 		/* raise inexact if x!=0 and underflow if subnormal */
 		FORCE_EVAL(bits < 0x00800000 ? x/0x1p120f : x+0x1p120f);
 		return x;
 	}
 	if (bits <= 0x407b53d1) {  /* |x| ~<= 5*pi/4 */
 		if (bits > 0x4016cbe3)  /* |x| ~> 3pi/4 */
 			return __sindf(negative ? -(x + s2pio2) : -(x - s2pio2));
 		if (negative)
 			return -__cosdf(x + s1pio2);
 		return __cosdf(x - s1pio2);
 	}
 	if (bits <= 0x40e231d5) {  /* |x| ~<= 9*pi/4 */
 		if (bits > 0x40afeddf)  /* |x| ~> 7*pi/4 */
 			return __sindf(negative ? x + s4pio2 : x - s4pio2);
 		if (negative)
 			return __cosdf(x + s3pio2);
 		return -__cosdf(x - s3pio2);
 	}
 	/* sin(Inf or NaN) is NaN */
 	if (bits >= 0x7f800000)
 		return x - x;
 	/* general argument reduction needed */
 	quadrant = __rem_pio2f(x, &reduced) & 3;
 	if (quadrant == 0)
 		return  __sindf(reduced);
 	if (quadrant == 1)
 		return  __cosdf(reduced);
 	if (quadrant == 2)
 		return  __sindf(-reduced);
 	return -__cosdf(reduced);
 }
--- a/cstdlib/src/sqrtf.c
+++ b/cstdlib/src/sqrtf.c
@ -0,0 +1,83 @@
 #include <stdint.h>
 #include <math.h>
 #include "libm.h"
 #include "sqrt_data.h"
 #define FENV_SUPPORT 1
 /* Returns the upper 32 bits of the full 64-bit product a*b. */
 static inline uint32_t mul32(uint32_t a, uint32_t b)
 {
 	const uint64_t product = (uint64_t)a * (uint64_t)b;
 	return (uint32_t)(product >> 32);
 }
 /* see sqrt.c for more detailed comments.  */
 /*
  * Square root of x, single precision, computed with 32-bit integer
  * arithmetic: the significand is normalized into m, an initial 1/sqrt(m)
  * estimate is read from __rsqrt_tab and refined, and the result is then
  * adjusted for rounding. +-0 and +Inf are returned unchanged; NaN and
  * negative inputs are passed to __math_invalidf().
  */
 float sqrtf(float x)
 {
 	uint32_t ix, m, m1, m0, even, ey;
 	ix = asuint(x);
 	/* single unsigned compare: true unless 0x00800000 <= ix < 0x7f800000 */
 	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) {
 		/* x < 0x1p-126 or inf or nan.  */
 		/* doubling drops the sign bit, so this matches +0 and -0 */
 		if (ix * 2 == 0)
 			return x;
 		/* +inf */
 		if (ix == 0x7f800000)
 			return x;
 		/* nan, or any value with the sign bit set */
 		if (ix > 0x7f800000)
 			return __math_invalidf(x);
 		/* x is subnormal, normalize it.  */
 		ix = asuint(x * 0x1p23f);
 		/* undo the 2^23 scaling in the exponent field */
 		ix -= 23 << 23;
 	}
 	/* x = 4^e m; with int e and m in [1, 4).  */
 	even = ix & 0x00800000;
 	m1 = (ix << 8) | 0x80000000;
 	m0 = (ix << 7) & 0x7fffffff;
 	m = even ? m0 : m1;
 	/* 2^e is the exponent part of the return value.  */
 	ey = ix >> 1;
 	ey += 0x3f800000 >> 1;
 	ey &= 0x7f800000;
 	/* compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 goldschmidt iterations.  */
 	static const uint32_t three = 0xc0000000;
 	uint32_t r, s, d, u, i;
 	/* table index: 7 bits taken from bit 17 upward of ix */
 	i = (ix >> 17) % 128;
 	r = (uint32_t)__rsqrt_tab[i] << 16;
 	/* |r*sqrt(m) - 1| < 0x1p-8 */
 	s = mul32(m, r);
 	/* |s/sqrt(m) - 1| < 0x1p-8 */
 	d = mul32(s, r);
 	u = three - d;
 	r = mul32(r, u) << 1;
 	/* |r*sqrt(m) - 1| < 0x1.7bp-16 */
 	s = mul32(s, u) << 1;
 	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
 	d = mul32(s, r);
 	u = three - d;
 	s = mul32(s, u);
 	/* -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31 */
 	s = (s - 1)>>6;
 	/* s < sqrt(m) < s + 0x1.08p-23 */
 	/* compute nearest rounded result.  */
 	uint32_t d0, d1, d2;
 	float y, t;
 	d0 = (m << 16) - s*s;
 	d1 = s - d0;
 	d2 = d1 + s + 1;
 	/* bump s by one when the top bit of d1 is set */
 	s += d1 >> 31;
 	/* keep the 23 fraction bits and merge in the exponent field */
 	s &= 0x007fffff;
 	s |= ey;
 	y = asfloat(s);
 	if (FENV_SUPPORT) {
 		/* handle rounding and inexact exception. */
 		/* t is 0 when d2 == 0, otherwise a tiny value whose sign bit is
 		   the top bit of d1^d2 */
 		uint32_t tiny = predict_false(d2==0) ? 0 : 0x01000000;
 		tiny |= (d1^d2) & 0x80000000;
 		t = asfloat(tiny);
 		y = eval_as_float(y + t);
 	}
 	return y;
 }
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 36c75a55dd1d843b29691293410344a9f48a9201
+Subproject commit 59fdc27ac6dd84af4c9ed505213dea0fec641c32
--- a/samples/pong/.gitignore
+++ b/samples/pong/.gitignore
@ -1 +1,3 @@
 Pong
 profile.dtrace
 profile.spall
--- a/samples/pong/build.sh
+++ b/samples/pong/build.sh
@ -11,6 +11,10 @@ else
  CLANG=clang
 fi
 STDLIB_DIR=../../cstdlib
 ORCA_SDK_DIR=../../sdk
 MILEPOST_DIR=../../milepost
 wasmFlags="--target=wasm32 \
  --no-standard-libraries \
  -fno-builtin \
@ -20,7 +24,9 @@ wasmFlags="--target=wasm32 \
  -O2 \
  -mbulk-memory \
  -D__ORCA__ \
-	   -isystem ../../cstdlib/include -I ../../sdk -I../../milepost/ext -I ../../milepost -I ../../milepost/src -I ../../milepost/src/util -I ../../milepost/src/platform -I../.."
+  -I $STDLIB_DIR/include \
  -I $ORCA_SDK_DIR \
  -I $MILEPOST_DIR/ext -I $MILEPOST_DIR -I $MILEPOST_DIR/src -I $MILEPOST_DIR/src/util -I $MILEPOST_DIR/src/platform"
 $CLANG $wasmFlags -o ./module.wasm ../../sdk/orca.c ../../cstdlib/src/*.c src/main.c
--- a/samples/pong/data/ball.png
+++ b/samples/pong/data/ball.png
--- a/samples/pong/data/underwater.jpg
+++ b/samples/pong/data/underwater.jpg
--- a/samples/pong/src/main.c
+++ b/samples/pong/src/main.c
@ -1,95 +1,95 @@
-/************************************************************//**
+#include <keys.h>
-*
+#include <graphics.h>
-*	@file: wasm_main.cpp
+#include <math.h>
 *	@author: Martin Fouilleul
 *	@date: 14/08/2022
 *	@revision:
 *
 *****************************************************************/
-#include"keys.h"
+#include <orca.h>
 #include"graphics.h"
-#include"orca.h"
+#define NUM_BLOCKS_PER_ROW 7
 #define NUM_BLOCKS 42 // 7 * 6
-#define M_PI 3.14159265358979323846
+#define BLOCKS_WIDTH 810.0f
 #define BLOCK_HEIGHT 30.0f
 #define BLOCKS_PADDING 15.0f
 #define BLOCKS_BOTTOM 300.0f
 const f32 BLOCK_WIDTH = (BLOCKS_WIDTH - ((NUM_BLOCKS_PER_ROW + 1) * BLOCKS_PADDING)) / NUM_BLOCKS_PER_ROW;
-extern float cosf(float x);
+#define PADDLE_MAX_LAUNCH_ANGLE 0.7f
 extern float sinf(float x);
 const mg_color paddleColor = {1, 0, 0, 1};
-mp_rect paddle = {200, 40, 200, 40};
+mp_rect paddle = {300, 50, 200, 24};
 const mg_color ballColor = {1, 1, 0, 1};
-mp_rect ball = {200, 200, 60, 60};
+mp_rect ball = {200, 200, 20, 20};
-vec2 velocity = {10, 10};
+vec2 velocity = {5, 5};
 // This is upside down from how it will actually be drawn.
 int blockHealth[NUM_BLOCKS] = {
    0, 1, 1, 1, 1, 1, 0,
    1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3,
    3, 3, 3, 3, 3, 3, 3,
 };
 vec2 frameSize = {100, 100};
 bool leftDown = false;
 bool rightDown = false;
 mg_canvas canvas;
 mg_surface surface;
 mg_canvas canvas;
 mg_image waterImage;
 mg_image ballImage;
 mg_image paddleImage;
 mg_font pongFont;
 // TODO(ben): Why is this here? Why isn't it forward-declared by some header?
 mg_surface mg_surface_main(void);
 f32 lerp(f32 a, f32 b, f32 t);
 mp_rect blockRect(int i);
 int checkCollision(mp_rect block);
 mg_mat2x3 flipY(mp_rect r);
 mg_mat2x3 flipYAt(vec2 pos);
 // Reads the whole file `filename` into memory allocated from `arena` and
 // returns it as a str8 of file_size() bytes.
 // A failed open is only logged; the size/read/close calls below still run
 // on the returned handle.
 // NOTE(review): what those calls do on a failed handle is not visible
 // here -- confirm that callers get an empty/nil string in that case.
 str8 loadFile(mem_arena* arena, str8 filename) {
     file_handle handle = file_open(filename, FILE_ACCESS_READ, 0);
     if (file_last_error(handle) != IO_OK) {
         log_error("Couldn't open file %s\n", str8_to_cstring(mem_scratch(), filename));
     }

     u64 byteCount = file_size(handle);
     char* contents = mem_arena_alloc(arena, byteCount);

     file_read(handle, byteCount, contents);
     file_close(handle);

     return str8_from_buffer(byteCount, contents);
 }
 ORCA_EXPORT void OnInit(void)
 {
 	//TODO create surface for main window
    surface = mg_surface_main();
    canvas = mg_canvas_create();
-	log_info("try allocating\n");
+    waterImage = mg_image_create_from_data(surface, loadFile(mem_scratch(), STR8("/underwater.jpg")), false);
    ballImage = mg_image_create_from_data(surface, loadFile(mem_scratch(), STR8("/ball.png")), false);
    paddleImage = mg_image_create_from_data(surface, loadFile(mem_scratch(), STR8("/wall.png")), false);
-	char* foo = malloc(1024);
+    if(mg_image_is_nil(waterImage))
 	free(foo);
 	log_info("allocated and freed 1024 bytes\n");
 	//NOTE: load ball texture
    {
-		file_handle file = file_open(STR8("/ball.png"), FILE_ACCESS_READ, 0);
+		log_error("coulnd't load ball image\n");
 		if(file_last_error(file) != IO_OK)
 		{
 			log_error("Couldn't open file ball.png\n");
    }
-		u64 size = file_size(file);
+    if(mg_image_is_nil(ballImage))
-		char* buffer = mem_arena_alloc(mem_scratch(), size);
+    {
-		file_read(file, size, buffer);
+		log_error("coulnd't load ball image\n");
-		file_close(file);
+    }
-		ballImage = mg_image_create_from_data(surface, str8_from_buffer(size, buffer), false);
+    if(mg_image_is_nil(paddleImage))
    {
 		log_error("coulnd't load paddle image\n");
    }
-	//NOTE: load paddle texture
+    str8 fontStr = loadFile(mem_scratch(), STR8("/Literata-SemiBoldItalic.ttf"));
 	{
 		file_handle file = file_open(STR8("/wall.png"), FILE_ACCESS_READ, 0);
 		if(file_last_error(file) != IO_OK)
 		{
 			log_error("Couldn't open file wall.png\n");
 		}
 		u64 size = file_size(file);
 		char* buffer = mem_arena_alloc(mem_scratch(), size);
 		file_read(file, size, buffer);
 		file_close(file);
 		paddleImage = mg_image_create_from_data(surface, str8_from_buffer(size, buffer), false);
 	}
 	//NOTE: load paddle texture
 	{
 		file_handle file = file_open(STR8("/Literata-SemiBoldItalic.ttf"), FILE_ACCESS_READ, 0);
 		if(file_last_error(file) != IO_OK)
 		{
 			log_error("Couldn't open file Literata-SemiBoldItalic.ttf\n");
 		}
 		u64 size = file_size(file);
 		char* buffer = mem_arena_alloc(mem_scratch(), size);
 		file_read(file, size, buffer);
 		file_close(file);
    unicode_range ranges[5] = {UNICODE_RANGE_BASIC_LATIN,
                               UNICODE_RANGE_C1_CONTROLS_AND_LATIN_1_SUPPLEMENT,
                               UNICODE_RANGE_LATIN_EXTENDED_A,
@ -97,8 +97,7 @@ ORCA_EXPORT void OnInit(void)
                               UNICODE_RANGE_SPECIALS};
    // NOTE(ben): Weird that images are "create from data" but fonts are "create from memory"
    // TODO: Decide whether we're using strings or explicit pointer + length
-		pongFont = mg_font_create_from_memory(size, (byte*)buffer, 5, ranges);
+    pongFont = mg_font_create_from_memory(fontStr.len, (byte*)fontStr.ptr, 5, ranges);
 	}
    mem_arena_clear(mem_scratch());
 }
@ -176,73 +175,291 @@ ORCA_EXPORT void OnFrameRefresh(void)
    ball.x = Clamp(ball.x, 0, frameSize.x - ball.w);
    ball.y = Clamp(ball.y, 0, frameSize.y - ball.h);
-    if(ball.x + ball.w >= frameSize.x)
+    if (ball.x + ball.w >= frameSize.x) {
-    {
+        velocity.x = -velocity.x;
 		velocity.x = -10;
    }
-    if(ball.x <= 0)
+    if (ball.x <= 0) {
-    {
+        velocity.x = -velocity.x;
 		velocity.x = +10;
    }
-    if(ball.y + ball.h >= frameSize.y)
+    if (ball.y + ball.h >= frameSize.y) {
-    {
+        velocity.y = -velocity.y;
 		velocity.y = -10;
    }
-    if(ball.y <= paddle.y + paddle.h
+    if (
        ball.y <= paddle.y + paddle.h
        && ball.x+ball.w >= paddle.x
        && ball.x <= paddle.x + paddle.w
-       && velocity.y < 0)
+        && velocity.y < 0
-    {
+    ) {
-		velocity.y *= -1;
+        f32 t = ((ball.x + ball.w/2) - paddle.x) / paddle.w;
        f32 launchAngle = lerp(-PADDLE_MAX_LAUNCH_ANGLE, PADDLE_MAX_LAUNCH_ANGLE, t);
        f32 speed = sqrtf(velocity.x*velocity.x + velocity.y*velocity.y);
        velocity = (vec2){
            sinf(launchAngle) * speed,
            cosf(launchAngle) * speed,
        };
        ball.y = paddle.y + paddle.h;
        log_info("PONG!");
    }
-    if(ball.y <= 0)
+    if (ball.y <= 0) {
    {
        ball.x = frameSize.x/2. - ball.w;
        ball.y = frameSize.y/2. - ball.h;
    }
    for (int i = 0; i < NUM_BLOCKS; i++) {
        if (blockHealth[i] <= 0) {
            continue;
        }
        mp_rect r = blockRect(i);
        int result = checkCollision(r);
        if (result) {
            log_info("Collision! direction=%d", result);
            blockHealth[i] -= 1;
            f32 vx = velocity.x;
            f32 vy = velocity.y;
            switch (result) {
            case 1:
            case 5:
                velocity.y = -vy;
                break;
            case 3:
            case 7:
                velocity.x = -vx;
                break;
            case 2:
            case 6:
                velocity.x = -vy;
                velocity.y = -vx;
                break;
            case 4:
            case 8:
                velocity.x = vy;
                velocity.y = vx;
                break;
            }
        }
    }
    mg_canvas_set_current(canvas);
-	mg_set_color_rgba(0, 1, 1, 1);
+    mg_set_color_rgba(10.0f/255.0f, 31.0f/255.0f, 72.0f/255.0f, 1);
    mg_clear();
-	mg_mat2x3 transform = {1, 0, 0,
+    mg_image_draw(waterImage, (mp_rect){0, 0, frameSize.x, frameSize.y});
 	                       0, -1, frameSize.y};
-	mg_matrix_push(transform);
+    mg_mat2x3 yUp = {
        1, 0, 0,
        0, -1, frameSize.y,
    };
-	mg_image_draw(paddleImage, paddle);
+    mg_matrix_push(yUp);
-	/*
+    {
-	mg_set_color(paddleColor);
+        for (int i = 0; i < NUM_BLOCKS; i++) {
-	mg_rectangle_fill(paddle.x, paddle.y, paddle.w, paddle.h);
+            if (blockHealth[i] <= 0) {
-	*/
+                continue;
            }
-	mg_image_draw(ballImage, ball);
+            mp_rect r = blockRect(i);
-	/*
+            mg_set_color_rgba(0, 0, 0, 0.2);
-	mg_set_color(ballColor);
+            mg_rounded_rectangle_fill(r.x, r.y-2, r.w, r.h, 4);
-	mg_circle_fill(ball.x+ball.w/2, ball.y + ball.w/2, ball.w/2.);
+            mg_set_color_rgba(0.9, 0.9, 0.9, 1);
-	*/
+            mg_rounded_rectangle_fill(r.x, r.y, r.w, r.h, 4);
-    mg_matrix_pop();
+            int fontSize = 18;
            str8 text = str8_pushf(mem_scratch(),
                "%d", blockHealth[i]
            );
            mp_rect textRect = mg_text_bounding_box(pongFont, fontSize, text);
            vec2 textPos = {
                r.x + r.w/2 - textRect.w/2,
                r.y + 9, // TODO: mg_text_bounding_box is returning extremely wack results for height.
            };
            mg_set_color_rgba(0, 0, 0, 1);
            mg_set_font(pongFont);
-	mg_set_font_size(14);
+            mg_set_font_size(18);
-	mg_move_to(10, 20);
+            mg_move_to(textPos.x, textPos.y);
-
+            mg_matrix_push(flipYAt(textPos));
-	str8 text = str8_pushf(mem_scratch(),
+            {
 		"wahoo I'm did a text. ball is at x = %f, y = %f",
 		ball.x, ball.y
 	);
                mg_text_outlines(text);
                mg_fill();
            }
            mg_matrix_pop();
        }
        mg_set_color_rgba(0.9, 0.9, 0.9, 1);
        mg_rounded_rectangle_fill(paddle.x, paddle.y, paddle.w, paddle.h, 4);
        mg_matrix_push(flipY(ball));
        {
            mg_image_draw(ballImage, ball);
        }
        mg_matrix_pop();
    }
    mg_matrix_pop();
    mg_surface_prepare(surface);
    mg_render(surface, canvas);
    mg_surface_present(surface);
 }
 mp_rect blockRect(int i) {
    int row = i / NUM_BLOCKS_PER_ROW;
    int col = i % NUM_BLOCKS_PER_ROW;
    return (mp_rect){
        BLOCKS_PADDING + (BLOCKS_PADDING + BLOCK_WIDTH) * col,
        BLOCKS_BOTTOM + (BLOCKS_PADDING + BLOCK_HEIGHT) * row,
        BLOCK_WIDTH,
        BLOCK_HEIGHT
    };
 }
 // Returns a cardinal direction 1-8 for the collision with the block, or zero
 // if no collision. 1 is straight up and directions proceed clockwise
 // (2 = up-right, 3 = right, 4 = down-right, 5 = down, 6 = down-left,
 // 7 = left, 8 = up-left).
 //
 // Reads the global `ball` rectangle and `velocity`; neither is modified.
 // The velocity components decide which sides of the ball are tested, in the
 // order: moving right, moving up, moving left, moving down. The first match
 // wins.
 int checkCollision(mp_rect block) {
     // Note that all the logic for this game has the origin in the bottom left.
     f32 ballx2 = ball.x + ball.w;
     f32 bally2 = ball.y + ball.h;
     f32 blockx2 = block.x + block.w;
     f32 blocky2 = block.y + block.h;
     if (
         ballx2 < block.x
         || blockx2 < ball.x
         || bally2 < block.y
         || blocky2 < ball.y
     ) {
         // Ball is fully outside block
         return 0;
     }
     // if (
     //     (block.x <= ball.x && ballx2 <= blockx2)
     //     && (block.y <= ball.y && bally2 <= blocky2)
     // ) {
     //     // Ball is fully inside block; do not consider as a collision
     //     return 0;
     // }
     // If moving right, the ball can bounce off its top right corner, right
     // side, or bottom right corner. Corner bounces occur if the block's bottom
     // left corner is in the ball's top right quadrant, or if the block's top
     // left corner is in the ball's bottom right quadrant. Otherwise, an edge
     // bounce occurs if the block's left edge falls in either of the ball's
     // right quadrants.
     //
     // This logic generalizes to other directions.
     //
     // We assume significant tunneling can't happen.
     vec2 ballCenter = (vec2){ball.x + ball.w/2, ball.y + ball.h/2};

     // Which half of the ball each block edge falls in. Every directional
     // test below is a combination of these four predicates.
     const bool leftEdgeInRightHalf = ballCenter.x <= block.x && block.x <= ballx2;
     const bool rightEdgeInLeftHalf = ball.x <= blockx2 && blockx2 <= ballCenter.x;
     const bool bottomEdgeInTopHalf = ballCenter.y <= block.y && block.y <= bally2;
     const bool topEdgeInBottomHalf = ball.y <= blocky2 && blocky2 <= ballCenter.y;

     // Moving right: top right corner, bottom right corner, then right edge
     if (velocity.x > 0 && leftEdgeInRightHalf) {
         if (bottomEdgeInTopHalf) { return 2; }
         if (topEdgeInBottomHalf) { return 4; }
         return 3;
     }
     // Moving up: top left corner, top right corner, then top edge
     if (velocity.y > 0 && bottomEdgeInTopHalf) {
         if (rightEdgeInLeftHalf) { return 8; }
         if (leftEdgeInRightHalf) { return 2; }
         return 1;
     }
     // Moving left: bottom left corner, top left corner, then left edge
     if (velocity.x < 0 && rightEdgeInLeftHalf) {
         if (topEdgeInBottomHalf) { return 6; }
         if (bottomEdgeInTopHalf) { return 8; }
         return 7;
     }
     // Moving down: bottom right corner, bottom left corner, then bottom edge
     if (velocity.y < 0 && topEdgeInBottomHalf) {
         if (leftEdgeInRightHalf) { return 4; }
         if (rightEdgeInLeftHalf) { return 6; }
         return 5;
     }
     return 0;
 }
 // Linear interpolation between a and b: the result is the weighted sum
 // (1 - t) * a + t * b. t is not clamped.
 f32 lerp(f32 a, f32 b, f32 t) {
     const f32 weightA = 1 - t;
     return weightA * a + t * b;
 }
 // Builds the transform that leaves x unchanged and maps y to
 // -y + (2 * r.y + r.h), i.e. a vertical mirror about the line
 // y = r.y + r.h/2.
 mg_mat2x3 flipY(mp_rect r) {
     mg_mat2x3 mirror = {
         1, 0, 0,
         0, -1, 2 * r.y + r.h,
     };
     return mirror;
 }
 // Builds the transform that leaves x unchanged and maps y to
 // -y + 2 * pos.y, i.e. a vertical mirror about the line y = pos.y.
 mg_mat2x3 flipYAt(vec2 pos) {
     mg_mat2x3 mirror = {
         1, 0, 0,
         0, -1, 2 * pos.y,
     };
     return mirror;
 }
--- a/samples/pong/trace.sh
+++ b/samples/pong/trace.sh
@ -0,0 +1,16 @@
 #!/bin/bash
 # Profiles the Pong app with DTrace and converts the samples to Spall.
 #
 # Requires dtrace2spall:
 #
 #   go install github.com/bvisness/dtrace2spall@latest
 #
 # `$(go env GOPATH)/bin` must also be on your PATH.

 # Step 1: sample the app's user stacks at 997 Hz into profile.dtrace
 sudo dtrace \
    -n 'profile-997 /pid == $target/ { @[timestamp, pid, tid, ustack(100)] = count(); }' \
    -x ustackframes=100 \
    -o profile.dtrace \
    -x aggsortkey -x aggsortkeypos=0 \
    -c ./Pong.app/Contents/MacOS/orca

 # Step 2: feed profile.dtrace to dtrace2spall on stdin, writing profile.spall
 dtrace2spall --freq 997 -o profile.spall --fields=_,pid,tid < profile.dtrace
		`@ -1 +1 @@`
			`Subproject commit 36c75a55dd1d843b29691293410344a9f48a9201`				`Subproject commit 59fdc27ac6dd84af4c9ed505213dea0fec641c32`