We can render text!

2023-06-28 20:43:52 -05:00 · 2023-06-28 20:43:52 -05:00 · b792476d36
parent b7e3e83f3e
commit b792476d36
15 changed files with 682 additions and 0 deletions
--- a/cstdlib/include/bits/float.h
+++ b/cstdlib/include/bits/float.h
@ -0,0 +1,16 @@
 #define FLT_EVAL_METHOD 0
 #define LDBL_TRUE_MIN 4.94065645841246544177e-324L
 #define LDBL_MIN 2.22507385850720138309e-308L
 #define LDBL_MAX 1.79769313486231570815e+308L
 #define LDBL_EPSILON 2.22044604925031308085e-16L
 #define LDBL_MANT_DIG 53
 #define LDBL_MIN_EXP (-1021)
 #define LDBL_MAX_EXP 1024
 #define LDBL_DIG 15
 #define LDBL_MIN_10_EXP (-307)
 #define LDBL_MAX_10_EXP 308
 #define DECIMAL_DIG 17
--- a/cstdlib/include/float.h
+++ b/cstdlib/include/float.h
@ -0,0 +1,52 @@
 #ifndef _FLOAT_H
 #define _FLOAT_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 int __flt_rounds(void);
 #define FLT_ROUNDS (__flt_rounds())
 #define FLT_RADIX 2
 #define FLT_TRUE_MIN 1.40129846432481707092e-45F
 #define FLT_MIN 1.17549435082228750797e-38F
 #define FLT_MAX 3.40282346638528859812e+38F
 #define FLT_EPSILON 1.1920928955078125e-07F
 #define FLT_MANT_DIG 24
 #define FLT_MIN_EXP (-125)
 #define FLT_MAX_EXP 128
 #define FLT_HAS_SUBNORM 1
 #define FLT_DIG 6
 #define FLT_DECIMAL_DIG 9
 #define FLT_MIN_10_EXP (-37)
 #define FLT_MAX_10_EXP 38
 #define DBL_TRUE_MIN 4.94065645841246544177e-324
 #define DBL_MIN 2.22507385850720138309e-308
 #define DBL_MAX 1.79769313486231570815e+308
 #define DBL_EPSILON 2.22044604925031308085e-16
 #define DBL_MANT_DIG 53
 #define DBL_MIN_EXP (-1021)
 #define DBL_MAX_EXP 1024
 #define DBL_HAS_SUBNORM 1
 #define DBL_DIG 15
 #define DBL_DECIMAL_DIG 17
 #define DBL_MIN_10_EXP (-307)
 #define DBL_MAX_10_EXP 308
 #define LDBL_HAS_SUBNORM 1
 #define LDBL_DECIMAL_DIG DECIMAL_DIG
 #include <bits/float.h>
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/cstdlib/include/math.h
+++ b/cstdlib/include/math.h
@ -1,3 +1,10 @@
 #ifndef _MATH_H
 #define _MATH_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 // NOTE(orca): not doing anything fancy for float_t and double_t
 typedef float float_t;
 typedef double double_t;
@ -5,6 +12,60 @@ typedef double double_t;
 #define NAN       __builtin_nanf("")
 #define INFINITY  __builtin_inff()
 #define FP_NAN       0
 #define FP_INFINITE  1
 #define FP_ZERO      2
 #define FP_SUBNORMAL 3
 #define FP_NORMAL    4
 int __fpclassify(double);
 int __fpclassifyf(float);
 int __fpclassifyl(long double);
 static __inline unsigned __FLOAT_BITS(float __f)
 {
 	union {float __f; unsigned __i;} __u;
 	__u.__f = __f;
 	return __u.__i;
 }
 static __inline unsigned long long __DOUBLE_BITS(double __f)
 {
 	union {double __f; unsigned long long __i;} __u;
 	__u.__f = __f;
 	return __u.__i;
 }
 #define fpclassify(x) ( \
 	sizeof(x) == sizeof(float) ? __fpclassifyf(x) : \
 	sizeof(x) == sizeof(double) ? __fpclassify(x) : \
 	__fpclassifyl(x) )
 #define isinf(x) ( \
 	sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) & 0x7fffffff) == 0x7f800000 : \
 	sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) & -1ULL>>1) == 0x7ffULL<<52 : \
 	__fpclassifyl(x) == FP_INFINITE)
 #define isnan(x) ( \
 	sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) & 0x7fffffff) > 0x7f800000 : \
 	sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) & -1ULL>>1) > 0x7ffULL<<52 : \
 	__fpclassifyl(x) == FP_NAN)
 double      acos(double);
 double      ceil(double);
 double      fabs(double);
 double      floor(double);
 double      fmod(double, double);
 double      pow(double, double);
 double      sqrt(double);
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/cstdlib/include/stdio.h
+++ b/cstdlib/include/stdio.h
@ -1,3 +1,10 @@
 #ifndef _STDIO_H
 #define _STDIO_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 struct _IO_FILE { char __x; };
 typedef struct _IO_FILE FILE;
@ -10,3 +17,9 @@ extern FILE *const stderr;
 #define stderr (stderr)
 int fprintf(FILE *__restrict, const char *__restrict, ...);
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/cstdlib/include/stdlib.h
+++ b/cstdlib/include/stdlib.h
@ -1,3 +1,16 @@
 #ifndef _STDLIB_H
 #define _STDLIB_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 _Noreturn void abort (void);
 int abs (int);
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/cstdlib/src/acos.c
+++ b/cstdlib/src/acos.c
@ -0,0 +1,101 @@
 /* origin: FreeBSD /usr/src/lib/msun/src/e_acos.c */
 /*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunSoft, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */
 /* acos(x)
 * Method :
 *      acos(x)  = pi/2 - asin(x)
 *      acos(-x) = pi/2 + asin(x)
 * For |x|<=0.5
 *      acos(x) = pi/2 - (x + x*x^2*R(x^2))     (see asin.c)
 * For x>0.5
 *      acos(x) = pi/2 - (pi/2 - 2asin(sqrt((1-x)/2)))
 *              = 2asin(sqrt((1-x)/2))
 *              = 2s + 2s*z*R(z)        ...z=(1-x)/2, s=sqrt(z)
 *              = 2f + (2c + 2s*z*R(z))
 *     where f=hi part of s, and c = (z-f*f)/(s+f) is the correction term
 *     for f so that f+c ~ sqrt(z).
 * For x<-0.5
 *      acos(x) = pi - 2asin(sqrt((1-|x|)/2))
 *              = pi - 0.5*(s+s*z*R(z)), where z=(1-|x|)/2,s=sqrt(z)
 *
 * Special cases:
 *      if x is NaN, return x itself;
 *      if |x|>1, return NaN with invalid signal.
 *
 * Function needed: sqrt
 */
 #include "libm.h"
 static const double
 pio2_hi = 1.57079632679489655800e+00, /* 0x3FF921FB, 0x54442D18 */
 pio2_lo = 6.12323399573676603587e-17, /* 0x3C91A626, 0x33145C07 */
 pS0 =  1.66666666666666657415e-01, /* 0x3FC55555, 0x55555555 */
 pS1 = -3.25565818622400915405e-01, /* 0xBFD4D612, 0x03EB6F7D */
 pS2 =  2.01212532134862925881e-01, /* 0x3FC9C155, 0x0E884455 */
 pS3 = -4.00555345006794114027e-02, /* 0xBFA48228, 0xB5688F3B */
 pS4 =  7.91534994289814532176e-04, /* 0x3F49EFE0, 0x7501B288 */
 pS5 =  3.47933107596021167570e-05, /* 0x3F023DE1, 0x0DFDF709 */
 qS1 = -2.40339491173441421878e+00, /* 0xC0033A27, 0x1C8A2D4B */
 qS2 =  2.02094576023350569471e+00, /* 0x40002AE5, 0x9C598AC8 */
 qS3 = -6.88283971605453293030e-01, /* 0xBFE6066C, 0x1B8D0159 */
 qS4 =  7.70381505559019352791e-02; /* 0x3FB3B8C5, 0xB12E9282 */
 static double R(double z)
 {
 	double_t p, q;
 	p = z*(pS0+z*(pS1+z*(pS2+z*(pS3+z*(pS4+z*pS5)))));
 	q = 1.0+z*(qS1+z*(qS2+z*(qS3+z*qS4)));
 	return p/q;
 }
 double acos(double x)
 {
 	double z,w,s,c,df;
 	uint32_t hx,ix;
 	GET_HIGH_WORD(hx, x);
 	ix = hx & 0x7fffffff;
 	/* |x| >= 1 or nan */
 	if (ix >= 0x3ff00000) {
 		uint32_t lx;
 		GET_LOW_WORD(lx,x);
 		if ((ix-0x3ff00000 | lx) == 0) {
 			/* acos(1)=0, acos(-1)=pi */
 			if (hx >> 31)
 				return 2*pio2_hi + 0x1p-120f;
 			return 0;
 		}
 		return 0/(x-x);
 	}
 	/* |x| < 0.5 */
 	if (ix < 0x3fe00000) {
 		if (ix <= 0x3c600000)  /* |x| < 2**-57 */
 			return pio2_hi + 0x1p-120f;
 		return pio2_hi - (x - (pio2_lo-x*R(x*x)));
 	}
 	/* x < -0.5 */
 	if (hx >> 31) {
 		z = (1.0+x)*0.5;
 		s = sqrt(z);
 		w = R(z)*s-pio2_lo;
 		return 2*(pio2_hi - (s+w));
 	}
 	/* x > 0.5 */
 	z = (1.0-x)*0.5;
 	s = sqrt(z);
 	df = s;
 	SET_LOW_WORD(df,0);
 	c = (z-df*df)/(s+df);
 	w = R(z)*s+c;
 	return 2*(df+w);
 }
--- a/cstdlib/src/ceil.c
+++ b/cstdlib/src/ceil.c
@ -0,0 +1,31 @@
 #include "libm.h"
 #if FLT_EVAL_METHOD==0 || FLT_EVAL_METHOD==1
 #define EPS DBL_EPSILON
 #elif FLT_EVAL_METHOD==2
 #define EPS LDBL_EPSILON
 #endif
 static const double_t toint = 1/EPS;
 double ceil(double x)
 {
 	union {double f; uint64_t i;} u = {x};
 	int e = u.i >> 52 & 0x7ff;
 	double_t y;
 	if (e >= 0x3ff+52 || x == 0)
 		return x;
 	/* y = int(x) - x, where int(x) is an integer neighbor of x */
 	if (u.i >> 63)
 		y = x - toint + toint - x;
 	else
 		y = x + toint - toint - x;
 	/* special case because of non-nearest rounding modes */
 	if (e <= 0x3ff-1) {
 		FORCE_EVAL(y);
 		return u.i >> 63 ? -0.0 : 1;
 	}
 	if (y < 0)
 		return x + y + 1;
 	return x + y;
 }
--- a/cstdlib/src/floor.c
+++ b/cstdlib/src/floor.c
@ -0,0 +1,31 @@
 #include "libm.h"
 #if FLT_EVAL_METHOD==0 || FLT_EVAL_METHOD==1
 #define EPS DBL_EPSILON
 #elif FLT_EVAL_METHOD==2
 #define EPS LDBL_EPSILON
 #endif
 static const double_t toint = 1/EPS;
 double floor(double x)
 {
 	union {double f; uint64_t i;} u = {x};
 	int e = u.i >> 52 & 0x7ff;
 	double_t y;
 	if (e >= 0x3ff+52 || x == 0)
 		return x;
 	/* y = int(x) - x, where int(x) is an integer neighbor of x */
 	if (u.i >> 63)
 		y = x - toint + toint - x;
 	else
 		y = x + toint - toint - x;
 	/* special case because of non-nearest rounding modes */
 	if (e <= 0x3ff-1) {
 		FORCE_EVAL(y);
 		return u.i >> 63 ? -1 : 0;
 	}
 	if (y > 0)
 		return x + y - 1;
 	return x + y;
 }
--- a/cstdlib/src/fmod.c
+++ b/cstdlib/src/fmod.c
@ -0,0 +1,68 @@
 #include <math.h>
 #include <stdint.h>
 double fmod(double x, double y)
 {
 	union {double f; uint64_t i;} ux = {x}, uy = {y};
 	int ex = ux.i>>52 & 0x7ff;
 	int ey = uy.i>>52 & 0x7ff;
 	int sx = ux.i>>63;
 	uint64_t i;
 	/* in the followings uxi should be ux.i, but then gcc wrongly adds */
 	/* float load/store to inner loops ruining performance and code size */
 	uint64_t uxi = ux.i;
 	if (uy.i<<1 == 0 || isnan(y) || ex == 0x7ff)
 		return (x*y)/(x*y);
 	if (uxi<<1 <= uy.i<<1) {
 		if (uxi<<1 == uy.i<<1)
 			return 0*x;
 		return x;
 	}
 	/* normalize x and y */
 	if (!ex) {
 		for (i = uxi<<12; i>>63 == 0; ex--, i <<= 1);
 		uxi <<= -ex + 1;
 	} else {
 		uxi &= -1ULL >> 12;
 		uxi |= 1ULL << 52;
 	}
 	if (!ey) {
 		for (i = uy.i<<12; i>>63 == 0; ey--, i <<= 1);
 		uy.i <<= -ey + 1;
 	} else {
 		uy.i &= -1ULL >> 12;
 		uy.i |= 1ULL << 52;
 	}
 	/* x mod y */
 	for (; ex > ey; ex--) {
 		i = uxi - uy.i;
 		if (i >> 63 == 0) {
 			if (i == 0)
 				return 0*x;
 			uxi = i;
 		}
 		uxi <<= 1;
 	}
 	i = uxi - uy.i;
 	if (i >> 63 == 0) {
 		if (i == 0)
 			return 0*x;
 		uxi = i;
 	}
 	for (; uxi>>52 == 0; uxi <<= 1, ex--);
 	/* scale result */
 	if (ex > 0) {
 		uxi -= 1ULL << 52;
 		uxi |= (uint64_t)ex << 52;
 	} else {
 		uxi >>= -ex + 1;
 	}
 	uxi |= (uint64_t)sx << 63;
 	ux.i = uxi;
 	return ux.f;
 }
--- a/cstdlib/src/libm.h
+++ b/cstdlib/src/libm.h
@ -1,4 +1,6 @@
 #include <stdint.h>
 #include <float.h>
 #include <math.h>
 #define WANT_ROUNDING 1
@ -30,11 +32,82 @@ static inline double eval_as_double(double x)
 	return y;
 }
 /* fp_force_eval ensures that the input value is computed when that's
   otherwise unused.  To prevent the constant folding of the input
   expression, an additional fp_barrier may be needed or a compilation
   mode that does so (e.g. -frounding-math in gcc). Then it can be
   used to evaluate an expression for its fenv side-effects only.   */
 #ifndef fp_force_evalf
 #define fp_force_evalf fp_force_evalf
 static inline void fp_force_evalf(float x)
 {
 	volatile float y;
 	y = x;
 }
 #endif
 #ifndef fp_force_eval
 #define fp_force_eval fp_force_eval
 static inline void fp_force_eval(double x)
 {
 	volatile double y;
 	y = x;
 }
 #endif
 #ifndef fp_force_evall
 #define fp_force_evall fp_force_evall
 static inline void fp_force_evall(long double x)
 {
 	volatile long double y;
 	y = x;
 }
 #endif
 #define FORCE_EVAL(x) do {                        \
 	if (sizeof(x) == sizeof(float)) {         \
 		fp_force_evalf(x);                \
 	} else if (sizeof(x) == sizeof(double)) { \
 		fp_force_eval(x);                 \
 	} else {                                  \
 		fp_force_evall(x);                \
 	}                                         \
 } while(0)
 #define asuint(f) ((union{float _f; uint32_t _i;}){f})._i
 #define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f
 #define asuint64(f) ((union{double _f; uint64_t _i;}){f})._i
 #define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f
 #define EXTRACT_WORDS(hi,lo,d)                    \
 do {                                              \
  uint64_t __u = asuint64(d);                     \
  (hi) = __u >> 32;                               \
  (lo) = (uint32_t)__u;                           \
 } while (0)
 #define GET_HIGH_WORD(hi,d)                       \
 do {                                              \
  (hi) = asuint64(d) >> 32;                       \
 } while (0)
 #define GET_LOW_WORD(lo,d)                        \
 do {                                              \
  (lo) = (uint32_t)asuint64(d);                   \
 } while (0)
 #define INSERT_WORDS(d,hi,lo)                     \
 do {                                              \
  (d) = asdouble(((uint64_t)(hi)<<32) | (uint32_t)(lo)); \
 } while (0)
 #define SET_HIGH_WORD(d,hi)                       \
  INSERT_WORDS(d, hi, (uint32_t)asuint64(d))
 #define SET_LOW_WORD(d,lo)                        \
  INSERT_WORDS(d, asuint64(d)>>32, lo)
 double __math_xflow(uint32_t, double);
 double __math_uflow(uint32_t);
 double __math_oflow(uint32_t);
--- a/cstdlib/src/sqrt.c
+++ b/cstdlib/src/sqrt.c
@ -0,0 +1,158 @@
 #include <stdint.h>
 #include <math.h>
 #include "libm.h"
 #include "sqrt_data.h"
 #define FENV_SUPPORT 1
 /* returns a*b*2^-32 - e, with error 0 <= e < 1.  */
 static inline uint32_t mul32(uint32_t a, uint32_t b)
 {
 	return (uint64_t)a*b >> 32;
 }
 /* returns a*b*2^-64 - e, with error 0 <= e < 3.  */
 static inline uint64_t mul64(uint64_t a, uint64_t b)
 {
 	uint64_t ahi = a>>32;
 	uint64_t alo = a&0xffffffff;
 	uint64_t bhi = b>>32;
 	uint64_t blo = b&0xffffffff;
 	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
 }
 double sqrt(double x)
 {
 	uint64_t ix, top, m;
 	/* special case handling.  */
 	ix = asuint64(x);
 	top = ix >> 52;
 	if (predict_false(top - 0x001 >= 0x7ff - 0x001)) {
 		/* x < 0x1p-1022 or inf or nan.  */
 		if (ix * 2 == 0)
 			return x;
 		if (ix == 0x7ff0000000000000)
 			return x;
 		if (ix > 0x7ff0000000000000)
 			return __math_invalid(x);
 		/* x is subnormal, normalize it.  */
 		ix = asuint64(x * 0x1p52);
 		top = ix >> 52;
 		top -= 52;
 	}
 	/* argument reduction:
 	   x = 4^e m; with integer e, and m in [1, 4)
 	   m: fixed point representation [2.62]
 	   2^e is the exponent part of the result.  */
 	int even = top & 1;
 	m = (ix << 11) | 0x8000000000000000;
 	if (even) m >>= 1;
 	top = (top + 0x3ff) >> 1;
 	/* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4)
 	   initial estimate:
 	   7bit table lookup (1bit exponent and 6bit significand).
 	   iterative approximation:
 	   using 2 goldschmidt iterations with 32bit int arithmetics
 	   and a final iteration with 64bit int arithmetics.
 	   details:
 	   the relative error (e = r0 sqrt(m)-1) of a linear estimate
 	   (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best,
 	   a table lookup is faster and needs one less iteration
 	   6 bit lookup table (128b) gives |e| < 0x1.f9p-8
 	   7 bit lookup table (256b) gives |e| < 0x1.fdp-9
 	   for single and double prec 6bit is enough but for quad
 	   prec 7bit is needed (or modified iterations). to avoid
 	   one more iteration >=13bit table would be needed (16k).
 	   a newton-raphson iteration for r is
 	     w = r*r
 	     u = 3 - m*w
 	     r = r*u/2
 	   can use a goldschmidt iteration for s at the end or
 	     s = m*r
 	   first goldschmidt iteration is
 	     s = m*r
 	     u = 3 - s*r
 	     r = r*u/2
 	     s = s*u/2
 	   next goldschmidt iteration is
 	     u = 3 - s*r
 	     r = r*u/2
 	     s = s*u/2
 	   and at the end r is not computed only s.
 	   they use the same amount of operations and converge at the
 	   same quadratic rate, i.e. if
 	     r1 sqrt(m) - 1 = e, then
 	     r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3
 	   the advantage of goldschmidt is that the mul for s and r
 	   are independent (computed in parallel), however it is not
 	   "self synchronizing": it only uses the input m in the
 	   first iteration so rounding errors accumulate. at the end
 	   or when switching to larger precision arithmetics rounding
 	   errors dominate so the first iteration should be used.
 	   the fixed point representations are
 	     m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30
 	   and after switching to 64 bit
 	     m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62  */
 	static const uint64_t three = 0xc0000000;
 	uint64_t r, s, d, u, i;
 	i = (ix >> 46) % 128;
 	r = (uint32_t)__rsqrt_tab[i] << 16;
 	/* |r sqrt(m) - 1| < 0x1.fdp-9 */
 	s = mul32(m>>32, r);
 	/* |s/sqrt(m) - 1| < 0x1.fdp-9 */
 	d = mul32(s, r);
 	u = three - d;
 	r = mul32(r, u) << 1;
 	/* |r sqrt(m) - 1| < 0x1.7bp-16 */
 	s = mul32(s, u) << 1;
 	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
 	d = mul32(s, r);
 	u = three - d;
 	r = mul32(r, u) << 1;
 	/* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */
 	r = r << 32;
 	s = mul64(m, r);
 	d = mul64(s, r);
 	u = (three<<32) - d;
 	s = mul64(s, u);  /* repr: 3.61 */
 	/* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */
 	s = (s - 2) >> 9; /* repr: 12.52 */
 	/* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */
 	/* s < sqrt(m) < s + 0x1.09p-52,
 	   compute nearest rounded result:
 	   the nearest result to 52 bits is either s or s+0x1p-52,
 	   we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m.  */
 	uint64_t d0, d1, d2;
 	double y, t;
 	d0 = (m << 42) - s*s;
 	d1 = s - d0;
 	d2 = d1 + s + 1;
 	s += d1 >> 63;
 	s &= 0x000fffffffffffff;
 	s |= top << 52;
 	y = asdouble(s);
 	if (FENV_SUPPORT) {
 		/* handle rounding modes and inexact exception:
 		   only (s+1)^2 == 2^42 m case is exact otherwise
 		   add a tiny value to cause the fenv effects.  */
 		uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000;
 		tiny |= (d1^d2) & 0x8000000000000000;
 		t = asdouble(tiny);
 		y = eval_as_double(y + t);
 	}
 	return y;
 }
--- a/cstdlib/src/sqrt_data.c
+++ b/cstdlib/src/sqrt_data.c
@ -0,0 +1,19 @@
 #include "sqrt_data.h"
 const uint16_t __rsqrt_tab[128] = {
 0xb451,0xb2f0,0xb196,0xb044,0xaef9,0xadb6,0xac79,0xab43,
 0xaa14,0xa8eb,0xa7c8,0xa6aa,0xa592,0xa480,0xa373,0xa26b,
 0xa168,0xa06a,0x9f70,0x9e7b,0x9d8a,0x9c9d,0x9bb5,0x9ad1,
 0x99f0,0x9913,0x983a,0x9765,0x9693,0x95c4,0x94f8,0x9430,
 0x936b,0x92a9,0x91ea,0x912e,0x9075,0x8fbe,0x8f0a,0x8e59,
 0x8daa,0x8cfe,0x8c54,0x8bac,0x8b07,0x8a64,0x89c4,0x8925,
 0x8889,0x87ee,0x8756,0x86c0,0x862b,0x8599,0x8508,0x8479,
 0x83ec,0x8361,0x82d8,0x8250,0x81c9,0x8145,0x80c2,0x8040,
 0xff02,0xfd0e,0xfb25,0xf947,0xf773,0xf5aa,0xf3ea,0xf234,
 0xf087,0xeee3,0xed47,0xebb3,0xea27,0xe8a3,0xe727,0xe5b2,
 0xe443,0xe2dc,0xe17a,0xe020,0xdecb,0xdd7d,0xdc34,0xdaf1,
 0xd9b3,0xd87b,0xd748,0xd61a,0xd4f1,0xd3cd,0xd2ad,0xd192,
 0xd07b,0xcf69,0xce5b,0xcd51,0xcc4a,0xcb48,0xca4a,0xc94f,
 0xc858,0xc764,0xc674,0xc587,0xc49d,0xc3b7,0xc2d4,0xc1f4,
 0xc116,0xc03c,0xbf65,0xbe90,0xbdbe,0xbcef,0xbc23,0xbb59,
 0xba91,0xb9cc,0xb90a,0xb84a,0xb78c,0xb6d0,0xb617,0xb560,
 };
--- a/cstdlib/src/sqrt_data.h
+++ b/cstdlib/src/sqrt_data.h
@ -0,0 +1,13 @@
 #ifndef _SQRT_DATA_H
 #define _SQRT_DATA_H
 #include <features.h>
 #include <stdint.h>
 /* if x in [1,2): i = (int)(64*x);
   if x in [2,4): i = (int)(32*x-64);
   __rsqrt_tab[i]*2^-16 is estimating 1/sqrt(x) with small relative error:
   |__rsqrt_tab[i]*0x1p-16*sqrt(x) - 1| < -0x1.fdp-9 < 2^-8 */
 extern const uint16_t __rsqrt_tab[128];
 #endif
--- a/samples/pong/data/Literata-SemiBoldItalic.ttf
+++ b/samples/pong/data/Literata-SemiBoldItalic.ttf
--- a/samples/pong/src/main.c
+++ b/samples/pong/src/main.c
@ -34,6 +34,7 @@ mg_canvas canvas;
 mg_surface surface;
 mg_image ballImage;
 mg_image paddleImage;
 mg_font pongFont;
 mg_surface mg_surface_main(void);
@ -78,6 +79,26 @@ ORCA_EXPORT void OnInit(void)
 		paddleImage = mg_image_create_from_data(surface, str8_from_buffer(size, buffer), false);
 	}
 	//NOTE: load paddle texture
 	{
 		file_handle file = file_open(STR8("/Literata-SemiBoldItalic.ttf"), FILE_ACCESS_READ, 0);
 		if(file_last_error(file) != IO_OK)
 		{
 			log_error("Couldn't open file Literata-SemiBoldItalic.ttf\n");
 		}
 		u64 size = file_size(file);
 		char* buffer = mem_arena_alloc(mem_scratch(), size);
 		file_read(file, size, buffer);
 		file_close(file);
 		unicode_range ranges[5] = {UNICODE_RANGE_BASIC_LATIN,
 	                           UNICODE_RANGE_C1_CONTROLS_AND_LATIN_1_SUPPLEMENT,
 	                           UNICODE_RANGE_LATIN_EXTENDED_A,
 	                           UNICODE_RANGE_LATIN_EXTENDED_B,
 	                           UNICODE_RANGE_SPECIALS};
 		// NOTE(ben): Weird that images are "create from data" but fonts are "create from memory"
 		// TODO: Decide whether we're using strings or explicit pointer + length
 		pongFont = mg_font_create_from_memory(size, (byte*)buffer, 5, ranges);
 	}
 	mem_arena_clear(mem_scratch());
 }
@ -209,6 +230,18 @@ ORCA_EXPORT void OnFrameRefresh(void)
    mg_matrix_pop();
 	mg_set_color_rgba(0, 0, 0, 1);
 	mg_set_font(pongFont);
 	mg_set_font_size(14);
 	mg_move_to(10, 20);
 	str8 text = str8_pushf(mem_scratch(),
 		"wahoo I'm did a text. ball is at x = %f, y = %f",
 		ball.x, ball.y
 	);
 	mg_text_outlines(text);
 	mg_fill();
    mg_surface_prepare(surface);
    mg_render(surface, canvas);
    mg_surface_present(surface);