// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
// 10 november 2016
//
// Imported by commit c0f91058c44e7c2ed449188ae6fda795b824933c
// (Pietro Gagliardi, 3 Dec 2016): "Started the move to my utf library.
// Imported the library." That commit adds common/utf.h and common/utf.c;
// the contents of both files follow, in that order.

// ---------------------------------------------------------------------
// common/utf.h
// ---------------------------------------------------------------------

// The guard keeps a second inclusion from redefining the inline C++ overloads.
#ifndef UTF_H_
#define UTF_H_

// Standard headers are included outside the extern "C" block.
#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// if nElem == 0, assume the buffer has no upper limit and is '\0' terminated
// otherwise, assume buffer is NOT '\0' terminated but is bounded by nElem *elements*

extern size_t utf8EncodeRune(uint32_t rune, char *encoded);
extern const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune);
extern size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded);
extern const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune);

extern size_t utf8RuneCount(const char *s, size_t nElem);
extern size_t utf8UTF16Count(const char *s, size_t nElem);
extern size_t utf16RuneCount(const uint16_t *s, size_t nElem);
extern size_t utf16UTF8Count(const uint16_t *s, size_t nElem);

#ifdef __cplusplus
}

// Provide overloads on Windows for using these functions with wchar_t and WCHAR when wchar_t is a keyword in C++ mode (the default).
// Otherwise, you'd need to cast to pass a wchar_t pointer, WCHAR pointer, or equivalent to these functions.
// We use __wchar_t to be independent of the setting; see https://blogs.msdn.microsoft.com/oldnewthing/20161201-00/?p=94836 (ironically posted one day after I initially wrote this code!).
// TODO check this on MinGW-w64
// TODO check this under /Wall
// TODO C-style casts enough? or will that fail in /Wall?
// TODO same for UniChar/unichar on Mac? if both are unsigned then we have nothing to worry about
#if defined(_MSC_VER)

inline size_t utf16EncodeRune(uint32_t rune, __wchar_t *encoded)
{
	return utf16EncodeRune(rune, reinterpret_cast<uint16_t *>(encoded));
}

inline const __wchar_t *utf16DecodeRune(const __wchar_t *s, size_t nElem, uint32_t *rune)
{
	const uint16_t *ret;

	ret = utf16DecodeRune(reinterpret_cast<const uint16_t *>(s), nElem, rune);
	return reinterpret_cast<const __wchar_t *>(ret);
}

inline size_t utf16RuneCount(const __wchar_t *s, size_t nElem)
{
	return utf16RuneCount(reinterpret_cast<const uint16_t *>(s), nElem);
}

inline size_t utf16UTF8Count(const __wchar_t *s, size_t nElem)
{
	return utf16UTF8Count(reinterpret_cast<const uint16_t *>(s), nElem);
}

#endif

#endif

#endif // UTF_H_

// ---------------------------------------------------------------------
// common/utf.c
// (when kept as a separate file, this part starts with #include "utf.h")
// ---------------------------------------------------------------------

// this code imitates Go's unicode/utf8 and unicode/utf16
// the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)
// it is also an imitation so we can license it under looser terms than the Go source

// the replacement character, substituted for anything that cannot be encoded or decoded
#define badrune 0xFFFD

// utf8EncodeRune() writes the UTF-8 encoding of rune to encoded and
// returns the number of bytes written (1 to 4).
// encoded must have room for at least 4 bytes.
// Values above 0x10FFFF and surrogates (0xD800..0xDFFF) are encoded as badrune.
size_t utf8EncodeRune(uint32_t rune, char *encoded)
{
	// out of Unicode range, or a surrogate: neither can be encoded
	if (rune > 0x10FFFF || (rune >= 0xD800 && rune < 0xE000))
		rune = badrune;

	if (rune < 0x80) {		// ASCII bytes represent themselves
		encoded[0] = (char) (uint8_t) rune;
		return 1;
	}
	if (rune < 0x800) {		// two-byte encoding
		encoded[0] = (char) (uint8_t) (0xC0 | ((rune >> 6) & 0x1F));
		encoded[1] = (char) (uint8_t) (0x80 | (rune & 0x3F));
		return 2;
	}
	if (rune < 0x10000) {		// three-byte encoding
		encoded[0] = (char) (uint8_t) (0xE0 | ((rune >> 12) & 0x0F));
		encoded[1] = (char) (uint8_t) (0x80 | ((rune >> 6) & 0x3F));
		encoded[2] = (char) (uint8_t) (0x80 | (rune & 0x3F));
		return 3;
	}
	// otherwise use a four-byte encoding
	encoded[0] = (char) (uint8_t) (0xF0 | ((rune >> 18) & 0x07));
	encoded[1] = (char) (uint8_t) (0x80 | ((rune >> 12) & 0x3F));
	encoded[2] = (char) (uint8_t) (0x80 | ((rune >> 6) & 0x3F));
	encoded[3] = (char) (uint8_t) (0x80 | (rune & 0x3F));
	return 4;
}

// utf8DecodeRune() decodes one rune starting at s, stores it in *rune, and
// returns a pointer to the first byte after what was consumed.
// If nElem != 0, at most nElem bytes starting at s are read; if nElem == 0,
// s is treated as '\0'-terminated.
// On any malformed, overlong, surrogate, out-of-range, or truncated sequence,
// *rune is set to badrune and **only** the initial byte is consumed.
const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune)
{
	uint8_t lead, cont;
	uint8_t lowestAllowed, highestAllowed;
	size_t i, expected;
	uint32_t r;

	lead = (uint8_t) (*s);
	if (lead < 0x80) {		// ASCII bytes represent themselves
		*rune = lead;
		return s + 1;
	}
	// 0x80 to 0xBF are continuation bytes, which cannot start a sequence
	// 0xC0 and 0xC1 cover 2-byte overlong equivalents
	// 0xF5 to 0xFD cover values > 0x10FFFF
	// 0xFE and 0xFF were never defined (always illegal)
	if (lead < 0xC2 || lead > 0xF4) {
		*rune = badrune;
		return s + 1;
	}

	// this determines the range of allowed first continuation bytes
	lowestAllowed = 0x80;
	highestAllowed = 0xBF;
	switch (lead) {
	case 0xE0:
		// disallow 3-byte overlong equivalents
		lowestAllowed = 0xA0;
		break;
	case 0xED:
		// disallow surrogate characters
		highestAllowed = 0x9F;
		break;
	case 0xF0:
		// disallow 4-byte overlong equivalents
		lowestAllowed = 0x90;
		break;
	case 0xF4:
		// disallow values > 0x10FFFF
		highestAllowed = 0x8F;
		break;
	default:
		break;
	}

	// the lead byte fixes both the number of continuation bytes and the topmost bits
	if (lead < 0xE0) {
		expected = 1;
		r = lead & 0x1F;
	} else if (lead < 0xF0) {
		expected = 2;
		r = lead & 0x0F;
	} else {
		expected = 3;
		r = lead & 0x07;
	}

	// in a bounded buffer, nElem includes the lead byte; are there enough bytes after it?
	if (nElem != 0 && nElem - 1 < expected) {
		*rune = badrune;
		return s + 1;
	}

	// validate and accumulate the continuation bytes
	// we stop at the first bad byte, so a '\0'-terminated buffer is never read past its terminator
	for (i = 1; i <= expected; i++) {
		cont = (uint8_t) (s[i]);
		if (cont < lowestAllowed || cont > highestAllowed) {
			*rune = badrune;
			return s + 1;
		}
		r = (r << 6) | (uint32_t) (cont & 0x3F);
		// the restricted range is only for the first continuation byte
		lowestAllowed = 0x80;
		highestAllowed = 0xBF;
	}

	*rune = r;
	return s + 1 + expected;
}

// utf16EncodeRune() writes the UTF-16 encoding of rune to encoded and
// returns the number of elements written (1 or 2).
// encoded must have room for at least 2 elements.
// Values above 0x10FFFF and surrogates (0xD800..0xDFFF) are encoded as badrune.
size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded)
{
	// out of Unicode range, or a surrogate: neither can be encoded
	if (rune > 0x10FFFF || (rune >= 0xD800 && rune < 0xE000))
		rune = badrune;

	if (rune < 0x10000) {
		encoded[0] = (uint16_t) rune;
		return 1;
	}

	// split the 20 bits above 0x10000 into a high and a low surrogate
	rune -= 0x10000;
	encoded[0] = (uint16_t) (0xD800 | ((rune >> 10) & 0x3FF));
	encoded[1] = (uint16_t) (0xDC00 | (rune & 0x3FF));
	return 2;
}

// utf16DecodeRune() decodes one rune starting at s, stores it in *rune, and
// returns a pointer to the first element after what was consumed.
// nElem == 1 means no element follows s[0]; any other value lets s[1] be read
// (for nElem == 0 the buffer is 0-terminated, and 0 is not a low surrogate).
// On an unpaired or out-of-order surrogate, *rune is set to badrune and only
// one element is consumed.
const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune)
{
	uint16_t high, low;

	high = s[0];
	if (high < 0xD800 || high >= 0xE000) {
		// self-representing character
		*rune = high;
		return s + 1;
	}
	if (high >= 0xDC00) {
		// out-of-order surrogates
		*rune = badrune;
		return s + 1;
	}
	if (nElem == 1) {
		// not enough elements
		*rune = badrune;
		return s + 1;
	}
	low = s[1];
	if (low < 0xDC00 || low >= 0xE000) {
		// bad surrogate pair
		*rune = badrune;
		return s + 1;
	}

	*rune = 0x10000 + (((uint32_t) (high & 0x3FF) << 10) | (uint32_t) (low & 0x3FF));
	return s + 2;
}

// The four counting functions below share one loop shape: when nElem != 0 the
// loop runs until nElem elements have been consumed; when nElem == 0 it runs
// until the terminating 0 element. The decoders never consume more than nElem
// elements of a bounded buffer, so the subtraction cannot wrap.

// utf8RuneCount() returns the number of runes in the UTF-8 buffer s.
// Each invalid byte counts as one rune (badrune).
size_t utf8RuneCount(const char *s, size_t nElem)
{
	const int bounded = (nElem != 0);
	const char *next;
	size_t len = 0;
	uint32_t rune;

	while (bounded ? (nElem != 0) : (*s != '\0')) {
		next = utf8DecodeRune(s, nElem, &rune);
		len++;
		if (bounded)
			nElem -= (size_t) (next - s);
		s = next;
	}
	return len;
}

// utf8UTF16Count() returns the number of uint16_t elements needed to hold
// the UTF-8 buffer s re-encoded as UTF-16 (terminator not included).
size_t utf8UTF16Count(const char *s, size_t nElem)
{
	const int bounded = (nElem != 0);
	const char *next;
	size_t len = 0;
	uint32_t rune;
	uint16_t encoded[2];

	while (bounded ? (nElem != 0) : (*s != '\0')) {
		next = utf8DecodeRune(s, nElem, &rune);
		len += utf16EncodeRune(rune, encoded);
		if (bounded)
			nElem -= (size_t) (next - s);
		s = next;
	}
	return len;
}

// utf16RuneCount() returns the number of runes in the UTF-16 buffer s.
// Each unpaired surrogate counts as one rune (badrune).
size_t utf16RuneCount(const uint16_t *s, size_t nElem)
{
	const int bounded = (nElem != 0);
	const uint16_t *next;
	size_t len = 0;
	uint32_t rune;

	while (bounded ? (nElem != 0) : (*s != 0)) {
		next = utf16DecodeRune(s, nElem, &rune);
		len++;
		if (bounded)
			nElem -= (size_t) (next - s);
		s = next;
	}
	return len;
}

// utf16UTF8Count() returns the number of bytes needed to hold the UTF-16
// buffer s re-encoded as UTF-8 (terminator not included).
size_t utf16UTF8Count(const uint16_t *s, size_t nElem)
{
	const int bounded = (nElem != 0);
	const uint16_t *next;
	size_t len = 0;
	uint32_t rune;
	char encoded[4];

	while (bounded ? (nElem != 0) : (*s != 0)) {
		next = utf16DecodeRune(s, nElem, &rune);
		len += utf8EncodeRune(rune, encoded);
		if (bounded)
			nElem -= (size_t) (next - s);
		s = next;
	}
	return len;
}