// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/ // 10 november 2016 #include "utf.h" // this code imitates Go's unicode/utf8 and unicode/utf16 // the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not) // it is also an imitation so we can license it under looser terms than the Go source #define badrune 0xFFFD // encoded must be at most 4 bytes // TODO clean this code up somehow size_t utf8EncodeRune(uint32_t rune, char *encoded) { uint8_t b, c, d, e; size_t n; // not in the valid range for Unicode if (rune > 0x10FFFF) rune = badrune; // surrogate runes cannot be encoded if (rune >= 0xD800 && rune < 0xE000) rune = badrune; if (rune < 0x80) { // ASCII bytes represent themselves b = (uint8_t) (rune & 0xFF); n = 1; goto done; } if (rune < 0x800) { // two-byte encoding c = (uint8_t) (rune & 0x3F); c |= 0x80; rune >>= 6; b = (uint8_t) (rune & 0x1F); b |= 0xC0; n = 2; goto done; } if (rune < 0x10000) { // three-byte encoding d = (uint8_t) (rune & 0x3F); d |= 0x80; rune >>= 6; c = (uint8_t) (rune & 0x3F); c |= 0x80; rune >>= 6; b = (uint8_t) (rune & 0x0F); b |= 0xE0; n = 3; goto done; } // otherwise use a four-byte encoding e = (uint8_t) (rune & 0x3F); e |= 0x80; rune >>= 6; d = (uint8_t) (rune & 0x3F); d |= 0x80; rune >>= 6; c = (uint8_t) (rune & 0x3F); c |= 0x80; rune >>= 6; b = (uint8_t) (rune & 0x07); b |= 0xF0; n = 4; done: encoded[0] = b; if (n > 1) encoded[1] = c; if (n > 2) encoded[2] = d; if (n > 3) encoded[3] = e; return n; } const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune) { uint8_t b, c; uint8_t lowestAllowed, highestAllowed; size_t i, expected; int bad; b = (uint8_t) (*s); if (b < 0x80) { // ASCII bytes represent themselves *rune = b; s++; return s; } // 0xC0 and 0xC1 cover 2-byte overlong equivalents // 0xF5 to 0xFD cover values > 0x10FFFF // 0xFE and 0xFF were never defined (always illegal) if (b < 0xC2 || b > 0xF4) { // invalid *rune = badrune; s++; return s; } // this determines the range of allowed first continuation bytes lowestAllowed = 0x80; highestAllowed = 0xBF; switch (b) { case 0xE0: // disallow 3-byte overlong equivalents lowestAllowed = 0xA0; break; case 0xED: // disallow surrogate characters highestAllowed = 0x9F; break; case 0xF0: // disallow 4-byte overlong equivalents lowestAllowed = 0x90; break; case 0xF4: // disallow values > 0x10FFFF highestAllowed = 0x8F; break; } // and this determines how many continuation bytes are expected expected = 1; if (b >= 0xE0) expected++; if (b >= 0xF0) expected++; if (nElem != 0) { // are there enough bytes? nElem--; if (nElem < expected) { // nope *rune = badrune; s++; return s; } } // ensure that everything is correct // if not, **only** consume the initial byte bad = 0; for (i = 0; i < expected; i++) { c = (uint8_t) (s[1 + i]); if (c < lowestAllowed || c > highestAllowed) { bad = 1; break; } // the old lowestAllowed and highestAllowed is only for the first continuation byte lowestAllowed = 0x80; highestAllowed = 0xBF; } if (bad) { *rune = badrune; s++; return s; } // now do the topmost bits if (b < 0xE0) *rune = b & 0x1F; else if (b < 0xF0) *rune = b & 0x0F; else *rune = b & 0x07; s++; // we can finally move on // now do the continuation bytes for (; expected; expected--) { c = (uint8_t) (*s); s++; c &= 0x3F; // strip continuation bits *rune <<= 6; *rune |= c; } return s; } // encoded must have at most 2 elements size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded) { uint16_t low, high; // not in the valid range for Unicode if (rune > 0x10FFFF) rune = badrune; // surrogate runes cannot be encoded if (rune >= 0xD800 && rune < 0xE000) rune = badrune; if (rune < 0x10000) { encoded[0] = (uint16_t) rune; return 1; } rune -= 0x10000; low = (uint16_t) (rune & 0x3FF); rune >>= 10; high = (uint16_t) (rune & 0x3FF); encoded[0] = high | 0xD800; encoded[1] = low | 0xDC00; return 2; } // TODO see if this can be cleaned up somehow const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune) { uint16_t high, low; if (*s < 0xD800 || *s >= 0xE000) { // self-representing character *rune = *s; s++; return s; } if (*s >= 0xDC00) { // out-of-order surrogates *rune = badrune; s++; return s; } if (nElem == 1) { // not enough elements *rune = badrune; s++; return s; } high = *s; high &= 0x3FF; if (s[1] < 0xDC00 || s[1] >= 0xE000) { // bad surrogate pair *rune = badrune; s++; return s; } s++; low = *s; s++; low &= 0x3FF; *rune = high; *rune <<= 10; *rune |= low; *rune += 0x10000; return s; } // TODO find a way to reduce the code in all of these somehow // TODO find a way to remove u as well size_t utf8RuneCount(const char *s, size_t nElem) { size_t len; uint32_t rune; if (nElem != 0) { const char *t, *u; len = 0; t = s; while (nElem != 0) { u = utf8DecodeRune(t, nElem, &rune); len++; nElem -= u - t; t = u; } return len; } len = 0; while (*s) { s = utf8DecodeRune(s, nElem, &rune); len++; } return len; } size_t utf8UTF16Count(const char *s, size_t nElem) { size_t len; uint32_t rune; uint16_t encoded[2]; if (nElem != 0) { const char *t, *u; len = 0; t = s; while (nElem != 0) { u = utf8DecodeRune(t, nElem, &rune); len += utf16EncodeRune(rune, encoded); nElem -= u - t; t = u; } return len; } len = 0; while (*s) { s = utf8DecodeRune(s, nElem, &rune); len += utf16EncodeRune(rune, encoded); } return len; } size_t utf16RuneCount(const uint16_t *s, size_t nElem) { size_t len; uint32_t rune; if (nElem != 0) { const uint16_t *t, *u; len = 0; t = s; while (nElem != 0) { u = utf16DecodeRune(t, nElem, &rune); len++; nElem -= u - t; t = u; } return len; } len = 0; while (*s) { s = utf16DecodeRune(s, nElem, &rune); len++; } return len; } size_t utf16UTF8Count(const uint16_t *s, size_t nElem) { size_t len; uint32_t rune; char encoded[4]; if (nElem != 0) { const uint16_t *t, *u; len = 0; t = s; while (nElem != 0) { u = utf16DecodeRune(t, nElem, &rune); len += utf8EncodeRune(rune, encoded); nElem -= u - t; t = u; } return len; } len = 0; while (*s) { s = utf16DecodeRune(s, nElem, &rune); len += utf8EncodeRune(rune, encoded); } return len; }