diff --git a/cstdlib/include/bits/float.h b/cstdlib/include/bits/float.h
new file mode 100644
index 0000000..c4a655e
--- /dev/null
+++ b/cstdlib/include/bits/float.h
@@ -0,0 +1,16 @@
+#define FLT_EVAL_METHOD 0
+
+#define LDBL_TRUE_MIN 4.94065645841246544177e-324L
+#define LDBL_MIN 2.22507385850720138309e-308L
+#define LDBL_MAX 1.79769313486231570815e+308L
+#define LDBL_EPSILON 2.22044604925031308085e-16L
+
+#define LDBL_MANT_DIG 53
+#define LDBL_MIN_EXP (-1021)
+#define LDBL_MAX_EXP 1024
+
+#define LDBL_DIG 15
+#define LDBL_MIN_10_EXP (-307)
+#define LDBL_MAX_10_EXP 308
+
+#define DECIMAL_DIG 17
diff --git a/cstdlib/include/float.h b/cstdlib/include/float.h
new file mode 100644
index 0000000..713aadb
--- /dev/null
+++ b/cstdlib/include/float.h
@@ -0,0 +1,52 @@
+#ifndef _FLOAT_H
+#define _FLOAT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int __flt_rounds(void);
+#define FLT_ROUNDS (__flt_rounds())
+
+#define FLT_RADIX 2
+
+#define FLT_TRUE_MIN 1.40129846432481707092e-45F
+#define FLT_MIN 1.17549435082228750797e-38F
+#define FLT_MAX 3.40282346638528859812e+38F
+#define FLT_EPSILON 1.1920928955078125e-07F
+
+#define FLT_MANT_DIG 24
+#define FLT_MIN_EXP (-125)
+#define FLT_MAX_EXP 128
+#define FLT_HAS_SUBNORM 1
+
+#define FLT_DIG 6
+#define FLT_DECIMAL_DIG 9
+#define FLT_MIN_10_EXP (-37)
+#define FLT_MAX_10_EXP 38
+
+#define DBL_TRUE_MIN 4.94065645841246544177e-324
+#define DBL_MIN 2.22507385850720138309e-308
+#define DBL_MAX 1.79769313486231570815e+308
+#define DBL_EPSILON 2.22044604925031308085e-16
+
+#define DBL_MANT_DIG 53
+#define DBL_MIN_EXP (-1021)
+#define DBL_MAX_EXP 1024
+#define DBL_HAS_SUBNORM 1
+
+#define DBL_DIG 15
+#define DBL_DECIMAL_DIG 17
+#define DBL_MIN_10_EXP (-307)
+#define DBL_MAX_10_EXP 308
+
+#define LDBL_HAS_SUBNORM 1
+#define LDBL_DECIMAL_DIG DECIMAL_DIG
+
+#include <bits/float.h>
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/cstdlib/include/math.h b/cstdlib/include/math.h
index 77b813d..92a168e 100644
--- a/cstdlib/include/math.h
+++ b/cstdlib/include/math.h
@@ -1,3 +1,10 @@
+#ifndef _MATH_H
+#define _MATH_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 // NOTE(orca): not doing anything fancy for float_t and double_t
 typedef float float_t;
 typedef double double_t;
@@ -5,6 +12,60 @@ typedef double double_t;
 #define NAN       __builtin_nanf("")
 #define INFINITY  __builtin_inff()
 
+#define FP_NAN       0
+#define FP_INFINITE  1
+#define FP_ZERO      2
+#define FP_SUBNORMAL 3
+#define FP_NORMAL    4
+
+int __fpclassify(double);
+int __fpclassifyf(float);
+int __fpclassifyl(long double);
+
+static __inline unsigned __FLOAT_BITS(float __f)
+{
+	union {float __f; unsigned __i;} __u;
+	__u.__f = __f;
+	return __u.__i;
+}
+static __inline unsigned long long __DOUBLE_BITS(double __f)
+{
+	union {double __f; unsigned long long __i;} __u;
+	__u.__f = __f;
+	return __u.__i;
+}
+
+#define fpclassify(x) ( \
+	sizeof(x) == sizeof(float) ? __fpclassifyf(x) : \
+	sizeof(x) == sizeof(double) ? __fpclassify(x) : \
+	__fpclassifyl(x) )
+
+#define isinf(x) ( \
+	sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) & 0x7fffffff) == 0x7f800000 : \
+	sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) & -1ULL>>1) == 0x7ffULL<<52 : \
+	__fpclassifyl(x) == FP_INFINITE)
+
+#define isnan(x) ( \
+	sizeof(x) == sizeof(float) ? (__FLOAT_BITS(x) & 0x7fffffff) > 0x7f800000 : \
+	sizeof(x) == sizeof(double) ? (__DOUBLE_BITS(x) & -1ULL>>1) > 0x7ffULL<<52 : \
+	__fpclassifyl(x) == FP_NAN)
+
+double      acos(double);
+
+double      ceil(double);
+
 double      fabs(double);
 
+double      floor(double);
+
+double      fmod(double, double);
+
 double      pow(double, double);
+
+double      sqrt(double);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/cstdlib/include/stdio.h b/cstdlib/include/stdio.h
index 0ef60b4..53a3f35 100644
--- a/cstdlib/include/stdio.h
+++ b/cstdlib/include/stdio.h
@@ -1,3 +1,10 @@
+#ifndef _STDIO_H
+#define _STDIO_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct _IO_FILE { char __x; };
 typedef struct _IO_FILE FILE;
 
@@ -10,3 +17,9 @@ extern FILE *const stderr;
 #define stderr (stderr)
 
 int fprintf(FILE *__restrict, const char *__restrict, ...);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/cstdlib/include/stdlib.h b/cstdlib/include/stdlib.h
index 64d1b15..92f9469 100644
--- a/cstdlib/include/stdlib.h
+++ b/cstdlib/include/stdlib.h
@@ -1,3 +1,16 @@
+#ifndef _STDLIB_H
+#define _STDLIB_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 _Noreturn void abort (void);
 
 int abs (int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/cstdlib/src/acos.c b/cstdlib/src/acos.c
new file mode 100644
index 0000000..ea9c87b
--- /dev/null
+++ b/cstdlib/src/acos.c
@@ -0,0 +1,101 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_acos.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/* acos(x)
+ * Method :
+ *      acos(x)  = pi/2 - asin(x)
+ *      acos(-x) = pi/2 + asin(x)
+ * For |x|<=0.5
+ *      acos(x) = pi/2 - (x + x*x^2*R(x^2))     (see asin.c)
+ * For x>0.5
+ *      acos(x) = pi/2 - (pi/2 - 2asin(sqrt((1-x)/2)))
+ *              = 2asin(sqrt((1-x)/2))
+ *              = 2s + 2s*z*R(z)        ...z=(1-x)/2, s=sqrt(z)
+ *              = 2f + (2c + 2s*z*R(z))
+ *     where f=hi part of s, and c = (z-f*f)/(s+f) is the correction term
+ *     for f so that f+c ~ sqrt(z).
+ * For x<-0.5
+ *      acos(x) = pi - 2asin(sqrt((1-|x|)/2))
+ *              = pi - 0.5*(s+s*z*R(z)), where z=(1-|x|)/2,s=sqrt(z)
+ *
+ * Special cases:
+ *      if x is NaN, return x itself;
+ *      if |x|>1, return NaN with invalid signal.
+ *
+ * Function needed: sqrt
+ */
+
+#include "libm.h"
+
+static const double
+pio2_hi = 1.57079632679489655800e+00, /* 0x3FF921FB, 0x54442D18 */
+pio2_lo = 6.12323399573676603587e-17, /* 0x3C91A626, 0x33145C07 */
+pS0 =  1.66666666666666657415e-01, /* 0x3FC55555, 0x55555555 */
+pS1 = -3.25565818622400915405e-01, /* 0xBFD4D612, 0x03EB6F7D */
+pS2 =  2.01212532134862925881e-01, /* 0x3FC9C155, 0x0E884455 */
+pS3 = -4.00555345006794114027e-02, /* 0xBFA48228, 0xB5688F3B */
+pS4 =  7.91534994289814532176e-04, /* 0x3F49EFE0, 0x7501B288 */
+pS5 =  3.47933107596021167570e-05, /* 0x3F023DE1, 0x0DFDF709 */
+qS1 = -2.40339491173441421878e+00, /* 0xC0033A27, 0x1C8A2D4B */
+qS2 =  2.02094576023350569471e+00, /* 0x40002AE5, 0x9C598AC8 */
+qS3 = -6.88283971605453293030e-01, /* 0xBFE6066C, 0x1B8D0159 */
+qS4 =  7.70381505559019352791e-02; /* 0x3FB3B8C5, 0xB12E9282 */
+
+static double R(double z)
+{
+	double_t p, q;
+	p = z*(pS0+z*(pS1+z*(pS2+z*(pS3+z*(pS4+z*pS5)))));
+	q = 1.0+z*(qS1+z*(qS2+z*(qS3+z*qS4)));
+	return p/q;
+}
+
+double acos(double x)
+{
+	double z,w,s,c,df;
+	uint32_t hx,ix;
+
+	GET_HIGH_WORD(hx, x);
+	ix = hx & 0x7fffffff;
+	/* |x| >= 1 or nan */
+	if (ix >= 0x3ff00000) {
+		uint32_t lx;
+
+		GET_LOW_WORD(lx,x);
+		if ((ix-0x3ff00000 | lx) == 0) {
+			/* acos(1)=0, acos(-1)=pi */
+			if (hx >> 31)
+				return 2*pio2_hi + 0x1p-120f;
+			return 0;
+		}
+		return 0/(x-x);
+	}
+	/* |x| < 0.5 */
+	if (ix < 0x3fe00000) {
+		if (ix <= 0x3c600000)  /* |x| < 2**-57 */
+			return pio2_hi + 0x1p-120f;
+		return pio2_hi - (x - (pio2_lo-x*R(x*x)));
+	}
+	/* x < -0.5 */
+	if (hx >> 31) {
+		z = (1.0+x)*0.5;
+		s = sqrt(z);
+		w = R(z)*s-pio2_lo;
+		return 2*(pio2_hi - (s+w));
+	}
+	/* x > 0.5 */
+	z = (1.0-x)*0.5;
+	s = sqrt(z);
+	df = s;
+	SET_LOW_WORD(df,0);
+	c = (z-df*df)/(s+df);
+	w = R(z)*s+c;
+	return 2*(df+w);
+}
diff --git a/cstdlib/src/ceil.c b/cstdlib/src/ceil.c
new file mode 100644
index 0000000..b13e6f2
--- /dev/null
+++ b/cstdlib/src/ceil.c
@@ -0,0 +1,31 @@
+#include "libm.h"
+
+#if FLT_EVAL_METHOD==0 || FLT_EVAL_METHOD==1
+#define EPS DBL_EPSILON
+#elif FLT_EVAL_METHOD==2
+#define EPS LDBL_EPSILON
+#endif
+static const double_t toint = 1/EPS;
+
+double ceil(double x)
+{
+	union {double f; uint64_t i;} u = {x};
+	int e = u.i >> 52 & 0x7ff;
+	double_t y;
+
+	if (e >= 0x3ff+52 || x == 0)
+		return x;
+	/* y = int(x) - x, where int(x) is an integer neighbor of x */
+	if (u.i >> 63)
+		y = x - toint + toint - x;
+	else
+		y = x + toint - toint - x;
+	/* special case because of non-nearest rounding modes */
+	if (e <= 0x3ff-1) {
+		FORCE_EVAL(y);
+		return u.i >> 63 ? -0.0 : 1;
+	}
+	if (y < 0)
+		return x + y + 1;
+	return x + y;
+}
diff --git a/cstdlib/src/floor.c b/cstdlib/src/floor.c
new file mode 100644
index 0000000..14a31cd
--- /dev/null
+++ b/cstdlib/src/floor.c
@@ -0,0 +1,31 @@
+#include "libm.h"
+
+#if FLT_EVAL_METHOD==0 || FLT_EVAL_METHOD==1
+#define EPS DBL_EPSILON
+#elif FLT_EVAL_METHOD==2
+#define EPS LDBL_EPSILON
+#endif
+static const double_t toint = 1/EPS;
+
+double floor(double x)
+{
+	union {double f; uint64_t i;} u = {x};
+	int e = u.i >> 52 & 0x7ff;
+	double_t y;
+
+	if (e >= 0x3ff+52 || x == 0)
+		return x;
+	/* y = int(x) - x, where int(x) is an integer neighbor of x */
+	if (u.i >> 63)
+		y = x - toint + toint - x;
+	else
+		y = x + toint - toint - x;
+	/* special case because of non-nearest rounding modes */
+	if (e <= 0x3ff-1) {
+		FORCE_EVAL(y);
+		return u.i >> 63 ? -1 : 0;
+	}
+	if (y > 0)
+		return x + y - 1;
+	return x + y;
+}
diff --git a/cstdlib/src/fmod.c b/cstdlib/src/fmod.c
new file mode 100644
index 0000000..6849722
--- /dev/null
+++ b/cstdlib/src/fmod.c
@@ -0,0 +1,68 @@
+#include <math.h>
+#include <stdint.h>
+
+double fmod(double x, double y)
+{
+	union {double f; uint64_t i;} ux = {x}, uy = {y};
+	int ex = ux.i>>52 & 0x7ff;
+	int ey = uy.i>>52 & 0x7ff;
+	int sx = ux.i>>63;
+	uint64_t i;
+
+	/* in the followings uxi should be ux.i, but then gcc wrongly adds */
+	/* float load/store to inner loops ruining performance and code size */
+	uint64_t uxi = ux.i;
+
+	if (uy.i<<1 == 0 || isnan(y) || ex == 0x7ff)
+		return (x*y)/(x*y);
+	if (uxi<<1 <= uy.i<<1) {
+		if (uxi<<1 == uy.i<<1)
+			return 0*x;
+		return x;
+	}
+
+	/* normalize x and y */
+	if (!ex) {
+		for (i = uxi<<12; i>>63 == 0; ex--, i <<= 1);
+		uxi <<= -ex + 1;
+	} else {
+		uxi &= -1ULL >> 12;
+		uxi |= 1ULL << 52;
+	}
+	if (!ey) {
+		for (i = uy.i<<12; i>>63 == 0; ey--, i <<= 1);
+		uy.i <<= -ey + 1;
+	} else {
+		uy.i &= -1ULL >> 12;
+		uy.i |= 1ULL << 52;
+	}
+
+	/* x mod y */
+	for (; ex > ey; ex--) {
+		i = uxi - uy.i;
+		if (i >> 63 == 0) {
+			if (i == 0)
+				return 0*x;
+			uxi = i;
+		}
+		uxi <<= 1;
+	}
+	i = uxi - uy.i;
+	if (i >> 63 == 0) {
+		if (i == 0)
+			return 0*x;
+		uxi = i;
+	}
+	for (; uxi>>52 == 0; uxi <<= 1, ex--);
+
+	/* scale result */
+	if (ex > 0) {
+		uxi -= 1ULL << 52;
+		uxi |= (uint64_t)ex << 52;
+	} else {
+		uxi >>= -ex + 1;
+	}
+	uxi |= (uint64_t)sx << 63;
+	ux.i = uxi;
+	return ux.f;
+}
diff --git a/cstdlib/src/libm.h b/cstdlib/src/libm.h
index 9109265..f54ca1b 100644
--- a/cstdlib/src/libm.h
+++ b/cstdlib/src/libm.h
@@ -1,4 +1,6 @@
 #include <stdint.h>
+#include <float.h>
+#include <math.h>
 
 #define WANT_ROUNDING 1
 
@@ -30,11 +32,82 @@ static inline double eval_as_double(double x)
 	return y;
 }
 
+/* fp_force_eval ensures that the input value is computed when that's
+   otherwise unused.  To prevent the constant folding of the input
+   expression, an additional fp_barrier may be needed or a compilation
+   mode that does so (e.g. -frounding-math in gcc). Then it can be
+   used to evaluate an expression for its fenv side-effects only.   */
+
+#ifndef fp_force_evalf
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+	volatile float y;
+	y = x;
+}
+#endif
+
+#ifndef fp_force_eval
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+	volatile double y;
+	y = x;
+}
+#endif
+
+#ifndef fp_force_evall
+#define fp_force_evall fp_force_evall
+static inline void fp_force_evall(long double x)
+{
+	volatile long double y;
+	y = x;
+}
+#endif
+
+#define FORCE_EVAL(x) do {                        \
+	if (sizeof(x) == sizeof(float)) {         \
+		fp_force_evalf(x);                \
+	} else if (sizeof(x) == sizeof(double)) { \
+		fp_force_eval(x);                 \
+	} else {                                  \
+		fp_force_evall(x);                \
+	}                                         \
+} while(0)
+
 #define asuint(f) ((union{float _f; uint32_t _i;}){f})._i
 #define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f
 #define asuint64(f) ((union{double _f; uint64_t _i;}){f})._i
 #define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f
 
+#define EXTRACT_WORDS(hi,lo,d)                    \
+do {                                              \
+  uint64_t __u = asuint64(d);                     \
+  (hi) = __u >> 32;                               \
+  (lo) = (uint32_t)__u;                           \
+} while (0)
+
+#define GET_HIGH_WORD(hi,d)                       \
+do {                                              \
+  (hi) = asuint64(d) >> 32;                       \
+} while (0)
+
+#define GET_LOW_WORD(lo,d)                        \
+do {                                              \
+  (lo) = (uint32_t)asuint64(d);                   \
+} while (0)
+
+#define INSERT_WORDS(d,hi,lo)                     \
+do {                                              \
+  (d) = asdouble(((uint64_t)(hi)<<32) | (uint32_t)(lo)); \
+} while (0)
+
+#define SET_HIGH_WORD(d,hi)                       \
+  INSERT_WORDS(d, hi, (uint32_t)asuint64(d))
+
+#define SET_LOW_WORD(d,lo)                        \
+  INSERT_WORDS(d, asuint64(d)>>32, lo)
+
 double __math_xflow(uint32_t, double);
 double __math_uflow(uint32_t);
 double __math_oflow(uint32_t);
diff --git a/cstdlib/src/sqrt.c b/cstdlib/src/sqrt.c
new file mode 100644
index 0000000..5ba2655
--- /dev/null
+++ b/cstdlib/src/sqrt.c
@@ -0,0 +1,158 @@
+#include <stdint.h>
+#include <math.h>
+#include "libm.h"
+#include "sqrt_data.h"
+
+#define FENV_SUPPORT 1
+
+/* returns a*b*2^-32 - e, with error 0 <= e < 1.  */
+static inline uint32_t mul32(uint32_t a, uint32_t b)
+{
+	return (uint64_t)a*b >> 32;
+}
+
+/* returns a*b*2^-64 - e, with error 0 <= e < 3.  */
+static inline uint64_t mul64(uint64_t a, uint64_t b)
+{
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
+}
+
+double sqrt(double x)
+{
+	uint64_t ix, top, m;
+
+	/* special case handling.  */
+	ix = asuint64(x);
+	top = ix >> 52;
+	if (predict_false(top - 0x001 >= 0x7ff - 0x001)) {
+		/* x < 0x1p-1022 or inf or nan.  */
+		if (ix * 2 == 0)
+			return x;
+		if (ix == 0x7ff0000000000000)
+			return x;
+		if (ix > 0x7ff0000000000000)
+			return __math_invalid(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint64(x * 0x1p52);
+		top = ix >> 52;
+		top -= 52;
+	}
+
+	/* argument reduction:
+	   x = 4^e m; with integer e, and m in [1, 4)
+	   m: fixed point representation [2.62]
+	   2^e is the exponent part of the result.  */
+	int even = top & 1;
+	m = (ix << 11) | 0x8000000000000000;
+	if (even) m >>= 1;
+	top = (top + 0x3ff) >> 1;
+
+	/* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4)
+
+	   initial estimate:
+	   7bit table lookup (1bit exponent and 6bit significand).
+
+	   iterative approximation:
+	   using 2 goldschmidt iterations with 32bit int arithmetics
+	   and a final iteration with 64bit int arithmetics.
+
+	   details:
+
+	   the relative error (e = r0 sqrt(m)-1) of a linear estimate
+	   (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best,
+	   a table lookup is faster and needs one less iteration
+	   6 bit lookup table (128b) gives |e| < 0x1.f9p-8
+	   7 bit lookup table (256b) gives |e| < 0x1.fdp-9
+	   for single and double prec 6bit is enough but for quad
+	   prec 7bit is needed (or modified iterations). to avoid
+	   one more iteration >=13bit table would be needed (16k).
+
+	   a newton-raphson iteration for r is
+	     w = r*r
+	     u = 3 - m*w
+	     r = r*u/2
+	   can use a goldschmidt iteration for s at the end or
+	     s = m*r
+
+	   first goldschmidt iteration is
+	     s = m*r
+	     u = 3 - s*r
+	     r = r*u/2
+	     s = s*u/2
+	   next goldschmidt iteration is
+	     u = 3 - s*r
+	     r = r*u/2
+	     s = s*u/2
+	   and at the end r is not computed only s.
+
+	   they use the same amount of operations and converge at the
+	   same quadratic rate, i.e. if
+	     r1 sqrt(m) - 1 = e, then
+	     r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3
+	   the advantage of goldschmidt is that the mul for s and r
+	   are independent (computed in parallel), however it is not
+	   "self synchronizing": it only uses the input m in the
+	   first iteration so rounding errors accumulate. at the end
+	   or when switching to larger precision arithmetics rounding
+	   errors dominate so the first iteration should be used.
+
+	   the fixed point representations are
+	     m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30
+	   and after switching to 64 bit
+	     m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62  */
+
+	static const uint64_t three = 0xc0000000;
+	uint64_t r, s, d, u, i;
+
+	i = (ix >> 46) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r sqrt(m) - 1| < 0x1.fdp-9 */
+	s = mul32(m>>32, r);
+	/* |s/sqrt(m) - 1| < 0x1.fdp-9 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r sqrt(m) - 1| < 0x1.7bp-16 */
+	s = mul32(s, u) << 1;
+	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */
+	r = r << 32;
+	s = mul64(m, r);
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	s = mul64(s, u);  /* repr: 3.61 */
+	/* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */
+	s = (s - 2) >> 9; /* repr: 12.52 */
+	/* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */
+
+	/* s < sqrt(m) < s + 0x1.09p-52,
+	   compute nearest rounded result:
+	   the nearest result to 52 bits is either s or s+0x1p-52,
+	   we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m.  */
+	uint64_t d0, d1, d2;
+	double y, t;
+	d0 = (m << 42) - s*s;
+	d1 = s - d0;
+	d2 = d1 + s + 1;
+	s += d1 >> 63;
+	s &= 0x000fffffffffffff;
+	s |= top << 52;
+	y = asdouble(s);
+	if (FENV_SUPPORT) {
+		/* handle rounding modes and inexact exception:
+		   only (s+1)^2 == 2^42 m case is exact otherwise
+		   add a tiny value to cause the fenv effects.  */
+		uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000;
+		tiny |= (d1^d2) & 0x8000000000000000;
+		t = asdouble(tiny);
+		y = eval_as_double(y + t);
+	}
+	return y;
+}
diff --git a/cstdlib/src/sqrt_data.c b/cstdlib/src/sqrt_data.c
new file mode 100644
index 0000000..61bc22f
--- /dev/null
+++ b/cstdlib/src/sqrt_data.c
@@ -0,0 +1,19 @@
+#include "sqrt_data.h"
+const uint16_t __rsqrt_tab[128] = {
+0xb451,0xb2f0,0xb196,0xb044,0xaef9,0xadb6,0xac79,0xab43,
+0xaa14,0xa8eb,0xa7c8,0xa6aa,0xa592,0xa480,0xa373,0xa26b,
+0xa168,0xa06a,0x9f70,0x9e7b,0x9d8a,0x9c9d,0x9bb5,0x9ad1,
+0x99f0,0x9913,0x983a,0x9765,0x9693,0x95c4,0x94f8,0x9430,
+0x936b,0x92a9,0x91ea,0x912e,0x9075,0x8fbe,0x8f0a,0x8e59,
+0x8daa,0x8cfe,0x8c54,0x8bac,0x8b07,0x8a64,0x89c4,0x8925,
+0x8889,0x87ee,0x8756,0x86c0,0x862b,0x8599,0x8508,0x8479,
+0x83ec,0x8361,0x82d8,0x8250,0x81c9,0x8145,0x80c2,0x8040,
+0xff02,0xfd0e,0xfb25,0xf947,0xf773,0xf5aa,0xf3ea,0xf234,
+0xf087,0xeee3,0xed47,0xebb3,0xea27,0xe8a3,0xe727,0xe5b2,
+0xe443,0xe2dc,0xe17a,0xe020,0xdecb,0xdd7d,0xdc34,0xdaf1,
+0xd9b3,0xd87b,0xd748,0xd61a,0xd4f1,0xd3cd,0xd2ad,0xd192,
+0xd07b,0xcf69,0xce5b,0xcd51,0xcc4a,0xcb48,0xca4a,0xc94f,
+0xc858,0xc764,0xc674,0xc587,0xc49d,0xc3b7,0xc2d4,0xc1f4,
+0xc116,0xc03c,0xbf65,0xbe90,0xbdbe,0xbcef,0xbc23,0xbb59,
+0xba91,0xb9cc,0xb90a,0xb84a,0xb78c,0xb6d0,0xb617,0xb560,
+};
diff --git a/cstdlib/src/sqrt_data.h b/cstdlib/src/sqrt_data.h
new file mode 100644
index 0000000..619193c
--- /dev/null
+++ b/cstdlib/src/sqrt_data.h
@@ -0,0 +1,13 @@
+#ifndef _SQRT_DATA_H
+#define _SQRT_DATA_H
+
+#include <features.h>
+#include <stdint.h>
+
+/* if x in [1,2): i = (int)(64*x);
+   if x in [2,4): i = (int)(32*x-64);
+   __rsqrt_tab[i]*2^-16 is estimating 1/sqrt(x) with small relative error:
+   |__rsqrt_tab[i]*0x1p-16*sqrt(x) - 1| < -0x1.fdp-9 < 2^-8 */
+extern const uint16_t __rsqrt_tab[128];
+
+#endif
diff --git a/samples/pong/data/Literata-SemiBoldItalic.ttf b/samples/pong/data/Literata-SemiBoldItalic.ttf
new file mode 100644
index 0000000..d49c441
Binary files /dev/null and b/samples/pong/data/Literata-SemiBoldItalic.ttf differ
diff --git a/samples/pong/src/main.c b/samples/pong/src/main.c
index 7d3cbf8..6f78c3f 100644
--- a/samples/pong/src/main.c
+++ b/samples/pong/src/main.c
@@ -34,6 +34,7 @@ mg_canvas canvas;
 mg_surface surface;
 mg_image ballImage;
 mg_image paddleImage;
+mg_font pongFont;
 
 mg_surface mg_surface_main(void);
 
@@ -78,6 +79,26 @@ ORCA_EXPORT void OnInit(void)
 		paddleImage = mg_image_create_from_data(surface, str8_from_buffer(size, buffer), false);
 	}
 
+	//NOTE: load paddle texture
+	{
+		file_handle file = file_open(STR8("/Literata-SemiBoldItalic.ttf"), FILE_ACCESS_READ, 0);
+		if(file_last_error(file) != IO_OK)
+		{
+			log_error("Couldn't open file Literata-SemiBoldItalic.ttf\n");
+		}
+		u64 size = file_size(file);
+		char* buffer = mem_arena_alloc(mem_scratch(), size);
+		file_read(file, size, buffer);
+		file_close(file);
+		unicode_range ranges[5] = {UNICODE_RANGE_BASIC_LATIN,
+	                           UNICODE_RANGE_C1_CONTROLS_AND_LATIN_1_SUPPLEMENT,
+	                           UNICODE_RANGE_LATIN_EXTENDED_A,
+	                           UNICODE_RANGE_LATIN_EXTENDED_B,
+	                           UNICODE_RANGE_SPECIALS};
+		// NOTE(ben): Weird that images are "create from data" but fonts are "create from memory"
+		// TODO: Decide whether we're using strings or explicit pointer + length
+		pongFont = mg_font_create_from_memory(size, (byte*)buffer, 5, ranges);
+	}
 
 	mem_arena_clear(mem_scratch());
 }
@@ -209,6 +230,18 @@ ORCA_EXPORT void OnFrameRefresh(void)
 
     mg_matrix_pop();
 
+	mg_set_color_rgba(0, 0, 0, 1);
+	mg_set_font(pongFont);
+	mg_set_font_size(14);
+	mg_move_to(10, 20);
+
+	str8 text = str8_pushf(mem_scratch(),
+		"wahoo I'm did a text. ball is at x = %f, y = %f",
+		ball.x, ball.y
+	);
+	mg_text_outlines(text);
+	mg_fill();
+
     mg_surface_prepare(surface);
     mg_render(surface, canvas);
     mg_surface_present(surface);