// libui/common/utf.c

// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
// 10 november 2016
#include "utf.h"

// this code imitates Go's unicode/utf8 and unicode/utf16
// the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)
// it is also an imitation so we can license it under looser terms than the Go source
#define badrune 0xFFFD

// utf8EncodeRune() writes the UTF-8 encoding of rune to encoded and returns the number of bytes written (1 to 4)
// encoded must have room for at least 4 bytes; no terminating NUL is written
// runes above 0x10FFFF and surrogate runes (0xD800 to 0xDFFF) are encoded as badrune instead
size_t utf8EncodeRune(uint32_t rune, char *encoded)
{
	uint8_t lead;
	size_t len, k;

	// anything that is not a Unicode scalar value becomes the replacement rune
	if (rune > 0x10FFFF || (rune >= 0xD800 && rune <= 0xDFFF))
		rune = badrune;

	// pick the encoded length and the marker bits of the leading byte
	if (rune < 0x80) {
		// ASCII bytes represent themselves
		len = 1;
		lead = 0x00;
	} else if (rune < 0x800) {
		len = 2;
		lead = 0xC0;
	} else if (rune < 0x10000) {
		len = 3;
		lead = 0xE0;
	} else {
		len = 4;
		lead = 0xF0;
	}

	// continuation bytes are filled in from last to first, six payload bits apiece
	for (k = len - 1; k > 0; k--) {
		encoded[k] = (uint8_t) (0x80 | (rune & 0x3F));
		rune >>= 6;
	}
	// what remains of rune now fits in the payload bits of the leading byte
	encoded[0] = (uint8_t) (lead | rune);
	return len;
}

// utf8DecodeRune() decodes the rune at the start of the UTF-8 string s into *rune and returns a pointer to the byte just past what was consumed
// nElem is the number of bytes available at s; if nElem is 0, the availability check is skipped (the counting functions use this for NUL-terminated strings, where a NUL fails the continuation byte test)
// if the sequence is invalid or cut short, *rune is set to badrune and exactly one byte is consumed
const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune)
{
	const uint8_t lead = (uint8_t) s[0];
	uint8_t lo, hi, cont;
	size_t nCont, k;
	uint32_t r;

	// ASCII bytes represent themselves
	if (lead <= 0x7F) {
		*rune = lead;
		return s + 1;
	}
	// 0x80 to 0xBF are continuation bytes, which cannot start a sequence
	// 0xC0 and 0xC1 could only start 2-byte overlong equivalents
	// 0xF5 and up could only start values > 0x10FFFF or were never defined
	if (lead < 0xC2 || lead > 0xF4)
		goto invalid;

	// the leading byte tells us how many continuation bytes follow and holds the topmost bits of the rune
	if (lead < 0xE0) {
		nCont = 1;
		r = lead & 0x1F;
	} else if (lead < 0xF0) {
		nCont = 2;
		r = lead & 0x0F;
	} else {
		nCont = 3;
		r = lead & 0x07;
	}

	// a few leading bytes narrow the range of the first continuation byte
	lo = 0x80;
	hi = 0xBF;
	if (lead == 0xE0)
		lo = 0xA0;		// no 3-byte overlong equivalents
	else if (lead == 0xED)
		hi = 0x9F;		// no surrogate characters
	else if (lead == 0xF0)
		lo = 0x90;		// no 4-byte overlong equivalents
	else if (lead == 0xF4)
		hi = 0x8F;		// no values > 0x10FFFF

	// when a length was given, make sure the whole sequence is actually there
	if (nElem != 0 && nElem - 1 < nCont)
		goto invalid;

	// check and accumulate the continuation bytes in a single pass
	// we stop reading at the first bad byte, so a NUL terminator is never read past
	for (k = 1; k <= nCont; k++) {
		cont = (uint8_t) s[k];
		if (cont < lo || cont > hi)
			goto invalid;
		r = (r << 6) | (cont & 0x3F);
		// the narrowed range only applies to the first continuation byte
		lo = 0x80;
		hi = 0xBF;
	}
	*rune = r;
	return s + 1 + nCont;

invalid:
	// **only** the initial byte is consumed on failure
	*rune = badrune;
	return s + 1;
}

// utf16EncodeRune() writes the UTF-16 encoding of rune to encoded and returns the number of elements written (1 or 2)
// encoded must have room for at least 2 elements
// runes above 0x10FFFF and surrogate runes (0xD800 to 0xDFFF) are encoded as badrune instead
size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded)
{
	uint32_t offset;

	// anything that is not a Unicode scalar value becomes the replacement rune
	if (rune > 0x10FFFF || (rune >= 0xD800 && rune <= 0xDFFF))
		rune = badrune;

	// runes that fit in 16 bits represent themselves
	if (rune <= 0xFFFF) {
		encoded[0] = (uint16_t) rune;
		return 1;
	}

	// everything else is split into a surrogate pair carrying 10 bits each, high half first
	offset = rune - 0x10000;
	encoded[0] = (uint16_t) (0xD800 | ((offset >> 10) & 0x3FF));
	encoded[1] = (uint16_t) (0xDC00 | (offset & 0x3FF));
	return 2;
}

// utf16DecodeRune() decodes the rune at the start of the UTF-16 string s into *rune and returns a pointer to the element just past what was consumed
// an nElem of exactly 1 means there is no second element to look at; any other value (including 0) lets s[1] be inspected
// a surrogate that is not part of a valid high/low pair sets *rune to badrune and consumes exactly one element
const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune)
{
	const uint16_t first = s[0];
	uint16_t second;

	// anything outside the surrogate range is a self-representing character
	if (first < 0xD800 || first > 0xDFFF) {
		*rune = first;
		return s + 1;
	}

	// a valid pair needs a high surrogate first, a second element to read, and a low surrogate there
	// a low surrogate in first position (out-of-order surrogates) fails the first test
	if (first <= 0xDBFF && nElem != 1) {
		second = s[1];
		if (second >= 0xDC00 && second <= 0xDFFF) {
			// each half contributes 10 bits; the pair counts up from 0x10000
			*rune = ((((uint32_t) first & 0x3FF) << 10) | (second & 0x3FF)) + 0x10000;
			return s + 2;
		}
	}

	// bad or incomplete surrogate pair
	*rune = badrune;
	return s + 1;
}

// TODO find a way to reduce the code duplicated across all of the counting functions
// TODO find a way to avoid the extra pointer needed to measure how much each decode consumed
// utf8RuneCount() returns the number of runes utf8DecodeRune() produces from the UTF-8 string s
// nElem is the number of bytes in s; if nElem is 0, s is read up to (and not including) its NUL terminator
size_t utf8RuneCount(const char *s, size_t nElem)
{
	size_t count = 0;
	uint32_t discarded;
	const char *next;

	if (nElem == 0) {
		// NUL-terminated: keep decoding until we are sitting on the terminator
		for (; *s != '\0'; s = utf8DecodeRune(s, 0, &discarded))
			count++;
		return count;
	}

	// explicit length: each decode uses up some of the remaining bytes
	do {
		next = utf8DecodeRune(s, nElem, &discarded);
		nElem -= (size_t) (next - s);
		s = next;
		count++;
	} while (nElem != 0);
	return count;
}

// utf8UTF16Count() returns the number of UTF-16 elements needed to hold the UTF-8 string s once converted
// nElem is the number of bytes in s; if nElem is 0, s is read up to (and not including) its NUL terminator
size_t utf8UTF16Count(const char *s, size_t nElem)
{
	size_t total = 0;
	uint32_t r;
	uint16_t scratch[2];		// only the length of each encoding is wanted
	const char *next;

	if (nElem == 0) {
		// NUL-terminated: keep decoding until we are sitting on the terminator
		while (*s != '\0') {
			s = utf8DecodeRune(s, 0, &r);
			total += utf16EncodeRune(r, scratch);
		}
		return total;
	}

	// explicit length: each decode uses up some of the remaining bytes
	do {
		next = utf8DecodeRune(s, nElem, &r);
		total += utf16EncodeRune(r, scratch);
		nElem -= (size_t) (next - s);
		s = next;
	} while (nElem != 0);
	return total;
}

// utf16RuneCount() returns the number of runes utf16DecodeRune() produces from the UTF-16 string s
// nElem is the number of elements in s; if nElem is 0, s is read up to (and not including) its zero terminator
size_t utf16RuneCount(const uint16_t *s, size_t nElem)
{
	size_t count = 0;
	uint32_t discarded;
	const uint16_t *next;

	if (nElem == 0) {
		// zero-terminated: keep decoding until we are sitting on the terminator
		for (; *s != 0; s = utf16DecodeRune(s, 0, &discarded))
			count++;
		return count;
	}

	// explicit length: each decode uses up some of the remaining elements
	do {
		next = utf16DecodeRune(s, nElem, &discarded);
		nElem -= (size_t) (next - s);
		s = next;
		count++;
	} while (nElem != 0);
	return count;
}

// utf16UTF8Count() returns the number of UTF-8 bytes needed to hold the UTF-16 string s once converted
// nElem is the number of elements in s; if nElem is 0, s is read up to (and not including) its zero terminator
size_t utf16UTF8Count(const uint16_t *s, size_t nElem)
{
	size_t total = 0;
	uint32_t r;
	char scratch[4];		// only the length of each encoding is wanted
	const uint16_t *next;

	if (nElem == 0) {
		// zero-terminated: keep decoding until we are sitting on the terminator
		while (*s != 0) {
			s = utf16DecodeRune(s, 0, &r);
			total += utf8EncodeRune(r, scratch);
		}
		return total;
	}

	// explicit length: each decode uses up some of the remaining elements
	do {
		next = utf16DecodeRune(s, nElem, &r);
		total += utf8EncodeRune(r, scratch);
		nElem -= (size_t) (next - s);
		s = next;
	} while (nElem != 0);
	return total;
}
Started the move to my utf library. Imported the library. 2016-12-03 10:20:22 -06:00			`// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/`
			`// 10 november 2016`
			`#include "utf.h"`

			`// this code imitates Go's unicode/utf8 and unicode/utf16`
			`// the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)`
			`// it is also an imitation so we can license it under looser terms than the Go source`
			`#define badrune 0xFFFD`

			`// encoded must be at most 4 bytes`
			`// TODO clean this code up somehow`
			`size_t utf8EncodeRune(uint32_t rune, char *encoded)`
			`{`
			`uint8_t b, c, d, e;`
			`size_t n;`

			`// not in the valid range for Unicode`
			`if (rune > 0x10FFFF)`
			`rune = badrune;`
			`// surrogate runes cannot be encoded`
			`if (rune >= 0xD800 && rune < 0xE000)`
			`rune = badrune;`

			`if (rune < 0x80) { // ASCII bytes represent themselves`
			`b = (uint8_t) (rune & 0xFF);`
			`n = 1;`
			`goto done;`
			`}`
			`if (rune < 0x800) { // two-byte encoding`
			`c = (uint8_t) (rune & 0x3F);`
			`c \|= 0x80;`
			`rune >>= 6;`
			`b = (uint8_t) (rune & 0x1F);`
			`b \|= 0xC0;`
			`n = 2;`
			`goto done;`
			`}`
			`if (rune < 0x10000) { // three-byte encoding`
			`d = (uint8_t) (rune & 0x3F);`
			`d \|= 0x80;`
			`rune >>= 6;`
			`c = (uint8_t) (rune & 0x3F);`
			`c \|= 0x80;`
			`rune >>= 6;`
			`b = (uint8_t) (rune & 0x0F);`
			`b \|= 0xE0;`
			`n = 3;`
			`goto done;`
			`}`
			`// otherwise use a four-byte encoding`
			`e = (uint8_t) (rune & 0x3F);`
			`e \|= 0x80;`
			`rune >>= 6;`
			`d = (uint8_t) (rune & 0x3F);`
			`d \|= 0x80;`
			`rune >>= 6;`
			`c = (uint8_t) (rune & 0x3F);`
			`c \|= 0x80;`
			`rune >>= 6;`
			`b = (uint8_t) (rune & 0x07);`
			`b \|= 0xF0;`
			`n = 4;`

			`done:`
			`encoded[0] = b;`
			`if (n > 1)`
			`encoded[1] = c;`
			`if (n > 2)`
			`encoded[2] = d;`
			`if (n > 3)`
			`encoded[3] = e;`
			`return n;`
			`}`

			`const char utf8DecodeRune(const char s, size_t nElem, uint32_t *rune)`
			`{`
			`uint8_t b, c;`
			`uint8_t lowestAllowed, highestAllowed;`
			`size_t i, expected;`
			`int bad;`

			`b = (uint8_t) (*s);`
			`if (b < 0x80) { // ASCII bytes represent themselves`
			`*rune = b;`
			`s++;`
			`return s;`
			`}`
			`// 0xC0 and 0xC1 cover 2-byte overlong equivalents`
			`// 0xF5 to 0xFD cover values > 0x10FFFF`
			`// 0xFE and 0xFF were never defined (always illegal)`
			`if (b < 0xC2 \|\| b > 0xF4) { // invalid`
			`*rune = badrune;`
			`s++;`
			`return s;`
			`}`

			`// this determines the range of allowed first continuation bytes`
			`lowestAllowed = 0x80;`
			`highestAllowed = 0xBF;`
			`switch (b) {`
			`case 0xE0:`
			`// disallow 3-byte overlong equivalents`
			`lowestAllowed = 0xA0;`
			`break;`
			`case 0xED:`
			`// disallow surrogate characters`
			`highestAllowed = 0x9F;`
			`break;`
			`case 0xF0:`
			`// disallow 4-byte overlong equivalents`
			`lowestAllowed = 0x90;`
			`break;`
			`case 0xF4:`
			`// disallow values > 0x10FFFF`
			`highestAllowed = 0x8F;`
			`break;`
			`}`

			`// and this determines how many continuation bytes are expected`
			`expected = 1;`
			`if (b >= 0xE0)`
			`expected++;`
			`if (b >= 0xF0)`
			`expected++;`
			`if (nElem != 0) { // are there enough bytes?`
			`nElem--;`
			`if (nElem < expected) { // nope`
			`*rune = badrune;`
			`s++;`
			`return s;`
			`}`
			`}`

			`// ensure that everything is correct`
			`// if not, only consume the initial byte`
			`bad = 0;`
			`for (i = 0; i < expected; i++) {`
			`c = (uint8_t) (s[1 + i]);`
			`if (c < lowestAllowed \|\| c > highestAllowed) {`
			`bad = 1;`
			`break;`
			`}`
			`// the old lowestAllowed and highestAllowed is only for the first continuation byte`
			`lowestAllowed = 0x80;`
			`highestAllowed = 0xBF;`
			`}`
			`if (bad) {`
			`*rune = badrune;`
			`s++;`
			`return s;`
			`}`

			`// now do the topmost bits`
			`if (b < 0xE0)`
			`*rune = b & 0x1F;`
			`else if (b < 0xF0)`
			`*rune = b & 0x0F;`
			`else`
			`*rune = b & 0x07;`
			`s++; // we can finally move on`

			`// now do the continuation bytes`
			`for (; expected; expected--) {`
			`c = (uint8_t) (*s);`
			`s++;`
			`c &= 0x3F; // strip continuation bits`
			`*rune <<= 6;`
			`*rune \|= c;`
			`}`

			`return s;`
			`}`

			`// encoded must have at most 2 elements`
			`size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded)`
			`{`
			`uint16_t low, high;`

			`// not in the valid range for Unicode`
			`if (rune > 0x10FFFF)`
			`rune = badrune;`
			`// surrogate runes cannot be encoded`
			`if (rune >= 0xD800 && rune < 0xE000)`
			`rune = badrune;`

			`if (rune < 0x10000) {`
			`encoded[0] = (uint16_t) rune;`
			`return 1;`
			`}`

			`rune -= 0x10000;`
			`low = (uint16_t) (rune & 0x3FF);`
			`rune >>= 10;`
			`high = (uint16_t) (rune & 0x3FF);`
			`encoded[0] = high \| 0xD800;`
			`encoded[1] = low \| 0xDC00;`
			`return 2;`
			`}`

			`// TODO see if this can be cleaned up somehow`
			`const uint16_t utf16DecodeRune(const uint16_t s, size_t nElem, uint32_t *rune)`
			`{`
			`uint16_t high, low;`

			`if (s < 0xD800 \|\| s >= 0xE000) {`
			`// self-representing character`
			`rune = s;`
			`s++;`
			`return s;`
			`}`
			`if (*s >= 0xDC00) {`
			`// out-of-order surrogates`
			`*rune = badrune;`
			`s++;`
			`return s;`
			`}`
			`if (nElem == 1) { // not enough elements`
			`*rune = badrune;`
			`s++;`
			`return s;`
			`}`
			`high = *s;`
			`high &= 0x3FF;`
			`if (s[1] < 0xDC00 \|\| s[1] >= 0xE000) {`
			`// bad surrogate pair`
			`*rune = badrune;`
			`s++;`
			`return s;`
			`}`
			`s++;`
			`low = *s;`
			`s++;`
			`low &= 0x3FF;`
			`*rune = high;`
			`*rune <<= 10;`
			`*rune \|= low;`
			`*rune += 0x10000;`
			`return s;`
			`}`

			`// TODO find a way to reduce the code in all of these somehow`
			`// TODO find a way to remove u as well`
			`size_t utf8RuneCount(const char *s, size_t nElem)`
			`{`
			`size_t len;`
			`uint32_t rune;`

			`if (nElem != 0) {`
			`const char t, u;`

			`len = 0;`
			`t = s;`
			`while (nElem != 0) {`
			`u = utf8DecodeRune(t, nElem, &rune);`
			`len++;`
			`nElem -= u - t;`
			`t = u;`
			`}`
			`return len;`
			`}`
			`len = 0;`
			`while (*s) {`
			`s = utf8DecodeRune(s, nElem, &rune);`
			`len++;`
			`}`
			`return len;`
			`}`

			`size_t utf8UTF16Count(const char *s, size_t nElem)`
			`{`
			`size_t len;`
			`uint32_t rune;`
			`uint16_t encoded[2];`

			`if (nElem != 0) {`
			`const char t, u;`

			`len = 0;`
			`t = s;`
			`while (nElem != 0) {`
			`u = utf8DecodeRune(t, nElem, &rune);`
			`len += utf16EncodeRune(rune, encoded);`
			`nElem -= u - t;`
			`t = u;`
			`}`
			`return len;`
			`}`
			`len = 0;`
			`while (*s) {`
			`s = utf8DecodeRune(s, nElem, &rune);`
			`len += utf16EncodeRune(rune, encoded);`
			`}`
			`return len;`
			`}`

			`size_t utf16RuneCount(const uint16_t *s, size_t nElem)`
			`{`
			`size_t len;`
			`uint32_t rune;`

			`if (nElem != 0) {`
			`const uint16_t t, u;`

			`len = 0;`
			`t = s;`
			`while (nElem != 0) {`
			`u = utf16DecodeRune(t, nElem, &rune);`
			`len++;`
			`nElem -= u - t;`
			`t = u;`
			`}`
			`return len;`
			`}`
			`len = 0;`
			`while (*s) {`
			`s = utf16DecodeRune(s, nElem, &rune);`
			`len++;`
			`}`
			`return len;`
			`}`

			`size_t utf16UTF8Count(const uint16_t *s, size_t nElem)`
			`{`
			`size_t len;`
			`uint32_t rune;`
			`char encoded[4];`

			`if (nElem != 0) {`
			`const uint16_t t, u;`

			`len = 0;`
			`t = s;`
			`while (nElem != 0) {`
			`u = utf16DecodeRune(t, nElem, &rune);`
			`len += utf8EncodeRune(rune, encoded);`
			`nElem -= u - t;`
			`t = u;`
			`}`
			`return len;`
			`}`
			`len = 0;`
			`while (*s) {`
			`s = utf16DecodeRune(s, nElem, &rune);`
			`len += utf8EncodeRune(rune, encoded);`
			`}`
			`return len;`
			`}`