Started the move to my utf library. Imported the library.

2016-12-03 11:20:22 -05:00 · 2016-12-03 11:20:22 -05:00 · c0f91058c4
parent 5fa7ba487e
commit c0f91058c4
2 changed files with 408 additions and 0 deletions
--- a/common/utf.c
+++ b/common/utf.c
@ -0,0 +1,347 @@
+// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
+// 10 november 2016
+#include "utf.h"
+
+// this code imitates Go's unicode/utf8 and unicode/utf16
+// the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)
+// it is also an imitation so we can license it under looser terms than the Go source
+#define badrune 0xFFFD
+
+// Encodes rune as UTF-8 into encoded and returns the number of bytes written (1 to 4).
+// encoded must have room for 4 bytes, the longest encoding produced here; no terminating '\0' is written.
+// Runes above 0x10FFFF and surrogates (0xD800 to 0xDFFF) are encoded as badrune instead.
+size_t utf8EncodeRune(uint32_t rune, char *encoded)
+{
+	uint8_t leadMarker;
+	size_t total, i;
+
+	// anything that is not a Unicode scalar value becomes the replacement character
+	if (rune > 0x10FFFF || (rune >= 0xD800 && rune <= 0xDFFF))
+		rune = badrune;
+
+	// pick the sequence length and the marker bits of the leading byte
+	if (rune <= 0x7F) {
+		// ASCII bytes represent themselves
+		total = 1;
+		leadMarker = 0x00;
+	} else if (rune <= 0x7FF) {
+		total = 2;
+		leadMarker = 0xC0;
+	} else if (rune <= 0xFFFF) {
+		total = 3;
+		leadMarker = 0xE0;
+	} else {
+		total = 4;
+		leadMarker = 0xF0;
+	}
+
+	// fill in the continuation bytes from last to first; each carries the next six low bits
+	for (i = total - 1; i > 0; i--) {
+		encoded[i] = (char) (uint8_t) (0x80 | (rune & 0x3F));
+		rune >>= 6;
+	}
+	// the bits that remain are few enough to fit below the marker in the leading byte
+	encoded[0] = (char) (uint8_t) (leadMarker | rune);
+	return total;
+}
+
+// Decodes one rune from the UTF-8 text at s, stores it in *rune, and returns a pointer to the byte after the ones consumed.
+// If nElem is nonzero, it is the number of bytes available at s (including the first one).
+// If nElem is zero, no length check is made; a '\0' byte is not a valid continuation byte, so it ends a sequence early.
+// On an invalid or truncated sequence, *rune is set to badrune and exactly one byte is consumed.
+const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune)
+{
+	uint8_t lead, cont;
+	uint8_t minCont, maxCont;
+	uint32_t value;
+	size_t nCont, k;
+
+	lead = (uint8_t) s[0];
+	if (lead <= 0x7F) {
+		// ASCII bytes represent themselves
+		*rune = lead;
+		return s + 1;
+	}
+	// below 0xC2: stray continuation bytes and the 2-byte overlong leads 0xC0 and 0xC1
+	// above 0xF4: leads for values > 0x10FFFF and the never-defined 0xFE and 0xFF
+	if (lead < 0xC2 || lead > 0xF4)
+		goto invalid;
+
+	// the leading byte tells us how many continuation bytes follow and holds the topmost bits
+	if (lead >= 0xF0) {
+		nCont = 3;
+		value = lead & 0x07;
+	} else if (lead >= 0xE0) {
+		nCont = 2;
+		value = lead & 0x0F;
+	} else {
+		nCont = 1;
+		value = lead & 0x1F;
+	}
+
+	// in bounded mode, make sure the whole sequence fits in what is left after the leading byte
+	if (nElem != 0 && nElem - 1 < nCont)
+		goto invalid;
+
+	// a few leading bytes narrow the range of the FIRST continuation byte
+	minCont = 0x80;
+	maxCont = 0xBF;
+	if (lead == 0xE0)
+		minCont = 0xA0;		// disallow 3-byte overlong equivalents
+	else if (lead == 0xED)
+		maxCont = 0x9F;		// disallow surrogate characters
+	else if (lead == 0xF0)
+		minCont = 0x90;		// disallow 4-byte overlong equivalents
+	else if (lead == 0xF4)
+		maxCont = 0x8F;		// disallow values > 0x10FFFF
+
+	// check each continuation byte and fold in its six payload bits
+	// the result is only stored once the whole sequence has checked out, so a failure consumes just the leading byte
+	for (k = 1; k <= nCont; k++) {
+		cont = (uint8_t) s[k];
+		if (cont < minCont || cont > maxCont)
+			goto invalid;
+		value = (value << 6) | (uint32_t) (cont & 0x3F);
+		// every continuation byte after the first uses the full range
+		minCont = 0x80;
+		maxCont = 0xBF;
+	}
+	*rune = value;
+	return s + 1 + nCont;
+
+invalid:
+	*rune = badrune;
+	return s + 1;
+}
+
+// Encodes rune as UTF-16 into encoded and returns the number of elements written (1 or 2).
+// encoded must have room for 2 elements, the size of a surrogate pair; no terminating 0 is written.
+// Runes above 0x10FFFF and surrogates (0xD800 to 0xDFFF) are encoded as badrune instead.
+size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded)
+{
+	uint32_t offset;
+
+	// anything that is not a Unicode scalar value becomes the replacement character
+	if (rune > 0x10FFFF || (rune >= 0xD800 && rune <= 0xDFFF))
+		rune = badrune;
+
+	// everything up to 0xFFFF represents itself
+	if (rune <= 0xFFFF) {
+		*encoded = (uint16_t) rune;
+		return 1;
+	}
+
+	// the rest become a surrogate pair: 20 bits of offset, top ten in the high surrogate, bottom ten in the low one
+	offset = rune - 0x10000;
+	encoded[0] = (uint16_t) (0xD800 | ((offset >> 10) & 0x3FF));
+	encoded[1] = (uint16_t) (0xDC00 | (offset & 0x3FF));
+	return 2;
+}
+
+// Decodes one rune from the UTF-16 text at s, stores it in *rune, and returns a pointer to the element after the ones consumed.
+// If nElem is 1, only s[0] is examined; for any other value s[1] may be read when s[0] is a high surrogate.
+// On an unpaired or out-of-order surrogate, *rune is set to badrune and exactly one element is consumed.
+const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune)
+{
+	uint32_t first, second;
+
+	first = s[0];
+	if (first < 0xD800 || first > 0xDFFF) {
+		// self-representing character
+		*rune = first;
+		return s + 1;
+	}
+	// a low surrogate cannot come first, and a high surrogate needs a second element to pair with
+	if (first >= 0xDC00 || nElem == 1) {
+		*rune = badrune;
+		return s + 1;
+	}
+	second = s[1];
+	if (second < 0xDC00 || second > 0xDFFF) {
+		// the high surrogate is not followed by a low one
+		*rune = badrune;
+		return s + 1;
+	}
+	// ten bits from each half, then undo the 0x10000 offset applied when encoding
+	*rune = (((first & 0x3FF) << 10) | (second & 0x3FF)) + 0x10000;
+	return s + 2;
+}
+
+// TODO find a way to reduce the code in all of these somehow
+// TODO find a way to remove u as well
+// Returns the number of runes in the UTF-8 text at s.
+// If nElem is nonzero, exactly nElem bytes are examined; if it is zero, s is read up to (not including) the first '\0'.
+// Each decode step counts as one rune, so a byte rejected by utf8DecodeRune() counts as one rune.
+size_t utf8RuneCount(const char *s, size_t nElem)
+{
+	const char *end;
+	size_t count;
+	uint32_t ignored;
+
+	count = 0;
+	if (nElem == 0) {
+		// '\0'-terminated text
+		while (*s != '\0') {
+			s = utf8DecodeRune(s, 0, &ignored);
+			count++;
+		}
+		return count;
+	}
+	// bounded text: walk until the end pointer, telling the decoder how much is left each time
+	end = s + nElem;
+	while (s != end) {
+		s = utf8DecodeRune(s, (size_t) (end - s), &ignored);
+		count++;
+	}
+	return count;
+}
+
+// Returns the number of UTF-16 elements needed to hold the UTF-8 text at s once converted.
+// If nElem is nonzero, exactly nElem bytes are examined; if it is zero, s is read up to (not including) the first '\0'.
+// The count does not include any terminating element.
+size_t utf8UTF16Count(const char *s, size_t nElem)
+{
+	const char *end;
+	size_t total;
+	uint32_t decoded;
+	uint16_t scratch[2];		// only the encoded length is wanted; the output is thrown away
+
+	total = 0;
+	if (nElem == 0) {
+		// '\0'-terminated text
+		while (*s != '\0') {
+			s = utf8DecodeRune(s, 0, &decoded);
+			total += utf16EncodeRune(decoded, scratch);
+		}
+		return total;
+	}
+	// bounded text: walk until the end pointer, telling the decoder how much is left each time
+	end = s + nElem;
+	while (s != end) {
+		s = utf8DecodeRune(s, (size_t) (end - s), &decoded);
+		total += utf16EncodeRune(decoded, scratch);
+	}
+	return total;
+}
+
+// Returns the number of runes in the UTF-16 text at s.
+// If nElem is nonzero, exactly nElem elements are examined; if it is zero, s is read up to (not including) the first 0 element.
+// Each decode step counts as one rune, so an element rejected by utf16DecodeRune() counts as one rune.
+size_t utf16RuneCount(const uint16_t *s, size_t nElem)
+{
+	const uint16_t *end;
+	size_t count;
+	uint32_t ignored;
+
+	count = 0;
+	if (nElem == 0) {
+		// 0-terminated text
+		while (*s != 0) {
+			s = utf16DecodeRune(s, 0, &ignored);
+			count++;
+		}
+		return count;
+	}
+	// bounded text: walk until the end pointer, telling the decoder how much is left each time
+	end = s + nElem;
+	while (s != end) {
+		s = utf16DecodeRune(s, (size_t) (end - s), &ignored);
+		count++;
+	}
+	return count;
+}
+
+// Returns the number of UTF-8 bytes needed to hold the UTF-16 text at s once converted.
+// If nElem is nonzero, exactly nElem elements are examined; if it is zero, s is read up to (not including) the first 0 element.
+// The count does not include any terminating '\0'.
+size_t utf16UTF8Count(const uint16_t *s, size_t nElem)
+{
+	const uint16_t *end;
+	size_t total;
+	uint32_t decoded;
+	char scratch[4];		// only the encoded length is wanted; the output is thrown away
+
+	total = 0;
+	if (nElem == 0) {
+		// 0-terminated text
+		while (*s != 0) {
+			s = utf16DecodeRune(s, 0, &decoded);
+			total += utf8EncodeRune(decoded, scratch);
+		}
+		return total;
+	}
+	// bounded text: walk until the end pointer, telling the decoder how much is left each time
+	end = s + nElem;
+	while (s != end) {
+		s = utf16DecodeRune(s, (size_t) (end - s), &decoded);
+		total += utf8EncodeRune(decoded, scratch);
+	}
+	return total;
+}
--- a/common/utf.h
+++ b/common/utf.h
@ -0,0 +1,61 @@
+// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
+// 10 november 2016
+
+// The include guard matters most for C++ under MSVC: the inline overloads at the bottom are definitions, and a second inclusion in the same translation unit would redefine them.
+#ifndef UTF_H_INCLUDED
+#define UTF_H_INCLUDED
+
+// The standard headers are pulled in before extern "C" is opened so that their contents are not wrapped in our linkage block.
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// if nElem == 0, assume the buffer has no upper limit and is '\0' terminated
+// otherwise, assume buffer is NOT '\0' terminated but is bounded by nElem *elements*
+
+// Encoding: write rune to encoded and return the number of elements written.
+// encoded needs room for 4 bytes (UTF-8) or 2 elements (UTF-16); no terminator is written.
+// Runes above 0x10FFFF and surrogates are encoded as U+FFFD.
+extern size_t utf8EncodeRune(uint32_t rune, char *encoded);
+extern size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded);
+
+// Decoding: read one rune from s into *rune and return a pointer just past the elements consumed.
+// Invalid input yields U+FFFD and consumes exactly one element.
+extern const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune);
+extern const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune);
+
+// Counting: the RuneCount functions return the number of runes in s; the other two return the number of elements s would occupy in the other encoding.
+// None of the counts include a terminator.
+extern size_t utf8RuneCount(const char *s, size_t nElem);
+extern size_t utf8UTF16Count(const char *s, size_t nElem);
+extern size_t utf16RuneCount(const uint16_t *s, size_t nElem);
+extern size_t utf16UTF8Count(const uint16_t *s, size_t nElem);
+
+#ifdef __cplusplus
+}
+
+// Provide overloads on Windows for using these functions with wchar_t and WCHAR when wchar_t is a keyword in C++ mode (the default).
+// Otherwise, you'd need to cast to pass a wchar_t pointer, WCHAR pointer, or equivalent to these functions.
+// We use __wchar_t to be independent of the setting; see https://blogs.msdn.microsoft.com/oldnewthing/20161201-00/?p=94836 (ironically posted one day after I initially wrote this code!).
+// TODO check this on MinGW-w64
+// TODO check this under /Wall
+// TODO C-style casts enough? or will that fail in /Wall?
+// TODO same for UniChar/unichar on Mac? if both are unsigned then we have nothing to worry about
+#if defined(_MSC_VER)
+
+inline size_t utf16EncodeRune(uint32_t rune, __wchar_t *encoded)
+{
+	return utf16EncodeRune(rune, reinterpret_cast<uint16_t *>(encoded));
+}
+
+inline const __wchar_t *utf16DecodeRune(const __wchar_t *s, size_t nElem, uint32_t *rune)
+{
+	const uint16_t *ret;
+
+	ret = utf16DecodeRune(reinterpret_cast<const uint16_t *>(s), nElem, rune);
+	return reinterpret_cast<const __wchar_t *>(ret);
+}
+
+inline size_t utf16RuneCount(const __wchar_t *s, size_t nElem)
+{
+	return utf16RuneCount(reinterpret_cast<const uint16_t *>(s), nElem);
+}
+
+inline size_t utf16UTF8Count(const __wchar_t *s, size_t nElem)
+{
+	return utf16UTF8Count(reinterpret_cast<const uint16_t *>(s), nElem);
+}
+
+#endif	// defined(_MSC_VER)
+
+#endif	// __cplusplus
+
+#endif	// UTF_H_INCLUDED