Started the move to my utf library. Imported the library.
This commit is contained in:
parent
5fa7ba487e
commit
c0f91058c4
|
@ -0,0 +1,347 @@
|
|||
// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
|
||||
// 10 november 2016
|
||||
#include "utf.h"
|
||||
|
||||
// this code imitates Go's unicode/utf8 and unicode/utf16
|
||||
// the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)
|
||||
// it is also an imitation so we can license it under looser terms than the Go source
|
||||
#define badrune 0xFFFD

// utf8EncodeRune() writes the UTF-8 encoding of rune to encoded and returns the number of bytes written (1 to 4).
// encoded must have room for at least 4 bytes; no terminating '\0' is written.
// runes above 0x10FFFF and surrogates (0xD800 to 0xDFFF) cannot be encoded; badrune is encoded in their place.
size_t utf8EncodeRune(uint32_t rune, char *encoded)
{
	// store through unsigned char so that bytes >= 0x80 are never converted to a possibly-signed char (that conversion is implementation-defined)
	unsigned char *out = (unsigned char *) encoded;
	unsigned char lead;
	size_t n, i;

	// not in the valid range for Unicode
	if (rune > 0x10FFFF)
		rune = badrune;
	// surrogate runes cannot be encoded
	if (rune >= 0xD800 && rune < 0xE000)
		rune = badrune;

	if (rune < 0x80) {		// ASCII bytes represent themselves
		out[0] = (unsigned char) rune;
		return 1;
	}

	// pick the sequence length and the marker bits of the lead byte
	if (rune < 0x800) {
		n = 2;
		lead = 0xC0;
	} else if (rune < 0x10000) {
		n = 3;
		lead = 0xE0;
	} else {
		n = 4;
		lead = 0xF0;
	}

	// continuation bytes carry 6 bits each; fill them from the last byte backward
	for (i = n - 1; i > 0; i--) {
		out[i] = (unsigned char) (0x80 | (rune & 0x3F));
		rune >>= 6;
	}
	// whatever is left of rune now fits in the payload bits of the lead byte
	out[0] = (unsigned char) (lead | rune);
	return n;
}
|
||||
|
||||
const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune)
|
||||
{
|
||||
uint8_t b, c;
|
||||
uint8_t lowestAllowed, highestAllowed;
|
||||
size_t i, expected;
|
||||
int bad;
|
||||
|
||||
b = (uint8_t) (*s);
|
||||
if (b < 0x80) { // ASCII bytes represent themselves
|
||||
*rune = b;
|
||||
s++;
|
||||
return s;
|
||||
}
|
||||
// 0xC0 and 0xC1 cover 2-byte overlong equivalents
|
||||
// 0xF5 to 0xFD cover values > 0x10FFFF
|
||||
// 0xFE and 0xFF were never defined (always illegal)
|
||||
if (b < 0xC2 || b > 0xF4) { // invalid
|
||||
*rune = badrune;
|
||||
s++;
|
||||
return s;
|
||||
}
|
||||
|
||||
// this determines the range of allowed first continuation bytes
|
||||
lowestAllowed = 0x80;
|
||||
highestAllowed = 0xBF;
|
||||
switch (b) {
|
||||
case 0xE0:
|
||||
// disallow 3-byte overlong equivalents
|
||||
lowestAllowed = 0xA0;
|
||||
break;
|
||||
case 0xED:
|
||||
// disallow surrogate characters
|
||||
highestAllowed = 0x9F;
|
||||
break;
|
||||
case 0xF0:
|
||||
// disallow 4-byte overlong equivalents
|
||||
lowestAllowed = 0x90;
|
||||
break;
|
||||
case 0xF4:
|
||||
// disallow values > 0x10FFFF
|
||||
highestAllowed = 0x8F;
|
||||
break;
|
||||
}
|
||||
|
||||
// and this determines how many continuation bytes are expected
|
||||
expected = 1;
|
||||
if (b >= 0xE0)
|
||||
expected++;
|
||||
if (b >= 0xF0)
|
||||
expected++;
|
||||
if (nElem != 0) { // are there enough bytes?
|
||||
nElem--;
|
||||
if (nElem < expected) { // nope
|
||||
*rune = badrune;
|
||||
s++;
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
// ensure that everything is correct
|
||||
// if not, **only** consume the initial byte
|
||||
bad = 0;
|
||||
for (i = 0; i < expected; i++) {
|
||||
c = (uint8_t) (s[1 + i]);
|
||||
if (c < lowestAllowed || c > highestAllowed) {
|
||||
bad = 1;
|
||||
break;
|
||||
}
|
||||
// the old lowestAllowed and highestAllowed is only for the first continuation byte
|
||||
lowestAllowed = 0x80;
|
||||
highestAllowed = 0xBF;
|
||||
}
|
||||
if (bad) {
|
||||
*rune = badrune;
|
||||
s++;
|
||||
return s;
|
||||
}
|
||||
|
||||
// now do the topmost bits
|
||||
if (b < 0xE0)
|
||||
*rune = b & 0x1F;
|
||||
else if (b < 0xF0)
|
||||
*rune = b & 0x0F;
|
||||
else
|
||||
*rune = b & 0x07;
|
||||
s++; // we can finally move on
|
||||
|
||||
// now do the continuation bytes
|
||||
for (; expected; expected--) {
|
||||
c = (uint8_t) (*s);
|
||||
s++;
|
||||
c &= 0x3F; // strip continuation bits
|
||||
*rune <<= 6;
|
||||
*rune |= c;
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
// encoded must have room for at least 2 elements; at most 2 are written
|
||||
size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded)
|
||||
{
|
||||
uint16_t low, high;
|
||||
|
||||
// not in the valid range for Unicode
|
||||
if (rune > 0x10FFFF)
|
||||
rune = badrune;
|
||||
// surrogate runes cannot be encoded
|
||||
if (rune >= 0xD800 && rune < 0xE000)
|
||||
rune = badrune;
|
||||
|
||||
if (rune < 0x10000) {
|
||||
encoded[0] = (uint16_t) rune;
|
||||
return 1;
|
||||
}
|
||||
|
||||
rune -= 0x10000;
|
||||
low = (uint16_t) (rune & 0x3FF);
|
||||
rune >>= 10;
|
||||
high = (uint16_t) (rune & 0x3FF);
|
||||
encoded[0] = high | 0xD800;
|
||||
encoded[1] = low | 0xDC00;
|
||||
return 2;
|
||||
}
|
||||
|
||||
// TODO see if this can be cleaned up somehow
|
||||
const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune)
|
||||
{
|
||||
uint16_t high, low;
|
||||
|
||||
if (*s < 0xD800 || *s >= 0xE000) {
|
||||
// self-representing character
|
||||
*rune = *s;
|
||||
s++;
|
||||
return s;
|
||||
}
|
||||
if (*s >= 0xDC00) {
|
||||
// out-of-order surrogates
|
||||
*rune = badrune;
|
||||
s++;
|
||||
return s;
|
||||
}
|
||||
if (nElem == 1) { // not enough elements
|
||||
*rune = badrune;
|
||||
s++;
|
||||
return s;
|
||||
}
|
||||
high = *s;
|
||||
high &= 0x3FF;
|
||||
if (s[1] < 0xDC00 || s[1] >= 0xE000) {
|
||||
// bad surrogate pair
|
||||
*rune = badrune;
|
||||
s++;
|
||||
return s;
|
||||
}
|
||||
s++;
|
||||
low = *s;
|
||||
s++;
|
||||
low &= 0x3FF;
|
||||
*rune = high;
|
||||
*rune <<= 10;
|
||||
*rune |= low;
|
||||
*rune += 0x10000;
|
||||
return s;
|
||||
}
|
||||
|
||||
// TODO find a way to reduce the code in all of these somehow
|
||||
// TODO find a way to remove u as well
|
||||
// utf8RuneCount() returns the number of runes utf8DecodeRune() produces from s.
// if nElem is zero, decoding stops at the first '\0' byte; otherwise exactly nElem bytes are decoded.
size_t utf8RuneCount(const char *s, size_t nElem)
{
	size_t count = 0;
	uint32_t r;
	const char *next;

	if (nElem == 0) {
		// unbounded: walk until the terminator
		for (; *s != '\0'; count++)
			s = utf8DecodeRune(s, 0, &r);
		return count;
	}

	// bounded: shrink nElem by however many bytes each rune consumed
	do {
		next = utf8DecodeRune(s, nElem, &r);
		count++;
		nElem -= (size_t) (next - s);
		s = next;
	} while (nElem != 0);
	return count;
}
|
||||
|
||||
// utf8UTF16Count() returns the number of UTF-16 elements utf16EncodeRune() would produce for the runes decoded from the UTF-8 buffer s.
// if nElem is zero, decoding stops at the first '\0' byte; otherwise exactly nElem bytes are decoded.
size_t utf8UTF16Count(const char *s, size_t nElem)
{
	size_t total = 0;
	uint32_t r;
	uint16_t scratch[2];		// only the returned length matters; the output is discarded
	const char *next;

	if (nElem == 0) {
		// unbounded: walk until the terminator
		while (*s != '\0') {
			s = utf8DecodeRune(s, 0, &r);
			total += utf16EncodeRune(r, scratch);
		}
		return total;
	}

	// bounded: shrink nElem by however many bytes each rune consumed
	do {
		next = utf8DecodeRune(s, nElem, &r);
		total += utf16EncodeRune(r, scratch);
		nElem -= (size_t) (next - s);
		s = next;
	} while (nElem != 0);
	return total;
}
|
||||
|
||||
// utf16RuneCount() returns the number of runes utf16DecodeRune() produces from s.
// if nElem is zero, decoding stops at the first zero element; otherwise exactly nElem elements are decoded.
size_t utf16RuneCount(const uint16_t *s, size_t nElem)
{
	size_t count = 0;
	uint32_t r;
	const uint16_t *next;

	if (nElem == 0) {
		// unbounded: walk until the terminator
		for (; *s != 0; count++)
			s = utf16DecodeRune(s, 0, &r);
		return count;
	}

	// bounded: shrink nElem by however many elements each rune consumed
	do {
		next = utf16DecodeRune(s, nElem, &r);
		count++;
		nElem -= (size_t) (next - s);
		s = next;
	} while (nElem != 0);
	return count;
}
|
||||
|
||||
// utf16UTF8Count() returns the number of UTF-8 bytes utf8EncodeRune() would produce for the runes decoded from the UTF-16 buffer s.
// if nElem is zero, decoding stops at the first zero element; otherwise exactly nElem elements are decoded.
size_t utf16UTF8Count(const uint16_t *s, size_t nElem)
{
	size_t total = 0;
	uint32_t r;
	char scratch[4];		// only the returned length matters; the output is discarded
	const uint16_t *next;

	if (nElem == 0) {
		// unbounded: walk until the terminator
		while (*s != 0) {
			s = utf16DecodeRune(s, 0, &r);
			total += utf8EncodeRune(r, scratch);
		}
		return total;
	}

	// bounded: shrink nElem by however many elements each rune consumed
	do {
		next = utf16DecodeRune(s, nElem, &r);
		total += utf8EncodeRune(r, scratch);
		nElem -= (size_t) (next - s);
		s = next;
	} while (nElem != 0);
	return total;
}
|
|
@ -0,0 +1,61 @@
|
|||
// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
|
||||
// 10 november 2016
|
||||
|
||||
// include guard: without it, a second inclusion in C++ on MSVC would redefine the inline overloads at the bottom of this file
#ifndef UTF_H_INCLUDED
#define UTF_H_INCLUDED

// the standard headers are included outside the extern "C" block so their own linkage declarations are left alone
#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// if nElem == 0, assume the buffer has no upper limit and is '\0' terminated
// otherwise, assume buffer is NOT '\0' terminated but is bounded by nElem *elements*

// the encode functions write to encoded and return the number of elements written
// the decode functions store the decoded rune in *rune and return a pointer to the next element to decode
extern size_t utf8EncodeRune(uint32_t rune, char *encoded);
extern const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune);
extern size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded);
extern const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune);

// the count functions return a number of runes (RuneCount) or the number of elements the buffer would take up in the other encoding (UTF16Count, UTF8Count)
extern size_t utf8RuneCount(const char *s, size_t nElem);
extern size_t utf8UTF16Count(const char *s, size_t nElem);
extern size_t utf16RuneCount(const uint16_t *s, size_t nElem);
extern size_t utf16UTF8Count(const uint16_t *s, size_t nElem);

#ifdef __cplusplus
}

// Provide overloads on Windows for using these functions with wchar_t and WCHAR when wchar_t is a keyword in C++ mode (the default).
// Otherwise, you'd need to cast to pass a wchar_t pointer, WCHAR pointer, or equivalent to these functions.
// We use __wchar_t to be independent of the setting; see https://blogs.msdn.microsoft.com/oldnewthing/20161201-00/?p=94836 (ironically posted one day after I initially wrote this code!).
// TODO check this on MinGW-w64
// TODO check this under /Wall
// TODO C-style casts enough? or will that fail in /Wall?
// TODO same for UniChar/unichar on Mac? if both are unsigned then we have nothing to worry about
#if defined(_MSC_VER)

// each overload below only casts its pointer and forwards to the uint16_t version

inline size_t utf16EncodeRune(uint32_t rune, __wchar_t *encoded)
{
	return utf16EncodeRune(rune, reinterpret_cast<uint16_t *>(encoded));
}

inline const __wchar_t *utf16DecodeRune(const __wchar_t *s, size_t nElem, uint32_t *rune)
{
	const uint16_t *ret;

	ret = utf16DecodeRune(reinterpret_cast<const uint16_t *>(s), nElem, rune);
	return reinterpret_cast<const __wchar_t *>(ret);
}

inline size_t utf16RuneCount(const __wchar_t *s, size_t nElem)
{
	return utf16RuneCount(reinterpret_cast<const uint16_t *>(s), nElem);
}

inline size_t utf16UTF8Count(const __wchar_t *s, size_t nElem)
{
	return utf16UTF8Count(reinterpret_cast<const uint16_t *>(s), nElem);
}

#endif		// _MSC_VER

#endif		// __cplusplus

#endif		// UTF_H_INCLUDED
|
Loading…
Reference in New Issue