From 526173bf76bb064084fa3f89e249bb40bf8c487b Mon Sep 17 00:00:00 2001
From: Pietro Gagliardi
Date: Sat, 3 Dec 2016 18:34:06 -0500
Subject: [PATCH] Started writing the implementation of uiAttributedString.
 Updated windows/graphemes.cpp for this new implementation. Also fixed up a
 bunch of subtle errors and other issues with windows/graphemes.cpp.

---
 common/attrstr.c      | 160 ++++++++++++++++++++++++++++++++++++++++++
 common/uipriv.h       |  12 +++-
 windows/graphemes.cpp |  80 ++++++++++++++-------
 3 files changed, 223 insertions(+), 29 deletions(-)
 create mode 100644 common/attrstr.c

diff --git a/common/attrstr.c b/common/attrstr.c
new file mode 100644
index 00000000..03a1dadc
--- /dev/null
+++ b/common/attrstr.c
@@ -0,0 +1,160 @@
+// 3 december 2016
+#include "../ui.h"
+#include "uipriv.h"
+
+struct uiAttributedString {
+	char *s;		// the text, stored as UTF-8
+	size_t len;		// length of s in bytes
+
+	// TODO attributes
+
+	// indiscriminately keep a UTF-16 copy of the string on all platforms so we can hand this off to the grapheme calculator
+	// this ensures no one platform has a speed advantage (sorry GTK+)
+	uint16_t *u16;
+	size_t u16len;		// length of u16 in UTF-16 code units
+
+	size_t *u8tou16;	// byte index in s -> index in u16 where the same character starts; len + 1 entries
+	size_t *u16tou8;	// index in u16 -> byte index in s where the same character starts; u16len + 1 entries
+
+	// this is lazily created to keep things from getting *too* slow
+	struct graphemes *graphemes;
+};
+// resize() records the new lengths (u8 bytes, u16 UTF-16 code units) and reallocates both text buffers and both index tables to match, each with one extra trailing element.
+static void resize(uiAttributedString *s, size_t u8, size_t u16)
+{
+	s->len = u8;
+	s->s = (char *) uiRealloc(s->s, (s->len + 1) * sizeof (char), "char[] (uiAttributedString)");
+	s->u8tou16 = (size_t *) uiRealloc(s->u8tou16, (s->len + 1) * sizeof (size_t), "size_t[] (uiAttributedString)");
+	s->u16len = u16;
+	s->u16 = (uint16_t *) uiRealloc(s->u16, (s->u16len + 1) * sizeof (uint16_t), "uint16_t[] (uiAttributedString)");
+	s->u16tou8 = (size_t *) uiRealloc(s->u16tou8, (s->u16len + 1) * sizeof (size_t), "size_t[] (uiAttributedString)");
+}
+// uiNewAttributedString() allocates a uiAttributedString and appends initialString to it as unattributed text; release it with uiFreeAttributedString().
+uiAttributedString *uiNewAttributedString(const char *initialString)
+{
+	uiAttributedString *s;
+
+	s = uiNew(uiAttributedString);	// NOTE(review): the append below reads s->len, s->u16len, and the buffer pointers, so this relies on uiNew() zero-filling - confirm
+	uiAttributedStringAppendUnattributed(s, initialString);
+	return s;
+}
+// recomputeGraphemes() builds the grapheme tables unless they are already cached, from the UTF-16 copy when graphemesTakesUTF16() is nonzero and from the UTF-8 text otherwise.
+static void recomputeGraphemes(uiAttributedString *s)
+{
+	if (s->graphemes != NULL)
+		return;
+	if (graphemesTakesUTF16()) {
+		s->graphemes = graphemes(s->u16, s->u16len);
+		return;
+	}
+	s->graphemes = graphemes(s->s, s->len);
+}
+// invalidateGraphemes() frees the cached grapheme tables, if any, so that the next grapheme query rebuilds them.
+static void invalidateGraphemes(uiAttributedString *s)
+{
+	if (s->graphemes == NULL)
+		return;
+	uiFree(s->graphemes->pointsToGraphemes);
+	uiFree(s->graphemes->graphemesToPoints);
+	uiFree(s->graphemes);
+	s->graphemes = NULL;
+}
+// uiFreeAttributedString() releases the cached grapheme tables, the index tables, both copies of the text, and finally s itself.
+void uiFreeAttributedString(uiAttributedString *s)
+{
+	invalidateGraphemes(s);
+	uiFree(s->u16tou8);
+	uiFree(s->u8tou16);
+	uiFree(s->u16);
+	uiFree(s->s);
+	uiFree(s);
+}
+// uiAttributedStringString() returns the UTF-8 text; the buffer belongs to s and is reallocated by appends. NOTE(review): nothing in this file writes the terminating NUL explicitly; presumably uiRealloc() zero-fills the extra byte - confirm
+const char *uiAttributedStringString(uiAttributedString *s)
+{
+	return s->s;
+}
+// uiAttributedStringAppendUnattributed() appends str to s, re-encoding it rune by rune so that what is stored is what the decoder produced, and extends the UTF-16 copy and both index tables to match.
+void uiAttributedStringAppendUnattributed(uiAttributedString *s, const char *str)
+{
+	const char *t;
+	uint32_t rune;
+	char buf[4];
+	uint16_t buf16[2];
+	size_t n, n16;
+	size_t old, old16;	// write positions in s->s and s->u16, starting at the old ends
+
+	// first figure out how much we need to grow by
+	// this includes post-validated UTF-8
+	t = str;
+	n = 0;
+	n16 = 0;
+	while (*t) {
+		t = utf8DecodeRune(t, 0, &rune);
+		n += utf8EncodeRune(rune, buf);
+		n16 += utf16EncodeRune(rune, buf16);
+	}
+
+	// and resize
+	old = s->len;
+	old16 = s->u16len;
+	resize(s, s->len + n, s->u16len + n16);
+
+	// and copy
+	while (*str) {
+		str = utf8DecodeRune(str, 0, &rune);
+		n = utf8EncodeRune(rune, buf);
+		n16 = utf16EncodeRune(rune, buf16);
+		s->s[old] = buf[0];
+		s->u8tou16[old] = old16;	// every byte of the rune maps to the rune's first UTF-16 code unit
+		if (n > 1) {
+			s->s[old + 1] = buf[1];
+			s->u8tou16[old + 1] = old16;
+		}
+		if (n > 2) {
+			s->s[old + 2] = buf[2];
+			s->u8tou16[old + 2] = old16;
+		}
+		if (n > 3) {
+			s->s[old + 3] = buf[3];
+			s->u8tou16[old + 3] = old16;
+		}
+		s->u16[old16] = buf16[0];
+		s->u16tou8[old16] = old;	// both halves of a surrogate pair map to the rune's first byte
+		if (n16 > 1) {
+			s->u16[old16 + 1] = buf16[1];
+			s->u16tou8[old16 + 1] = old;
+		}
+		old += n;
+		old16 += n16;
+	}
+	// and have an index for the end of the string
+	s->u8tou16[old] = old16;
+	s->u16tou8[old16] = old;
+
+	invalidateGraphemes(s);
+}
+// uiAttributedStringNumGraphemes() returns the number of graphemes in s, building the grapheme tables first if they are not cached.
+// TODO figure out if we should count the grapheme past the end
+size_t uiAttributedStringNumGraphemes(uiAttributedString *s)
+{
+	recomputeGraphemes(s);
+	return s->graphemes->len;
+}
+// uiAttributedStringByteIndexToGrapheme() returns the grapheme index recorded for byte offset pos of the UTF-8 text; pos is not range-checked.
+size_t uiAttributedStringByteIndexToGrapheme(uiAttributedString *s, size_t pos)
+{
+	recomputeGraphemes(s);
+	if (graphemesTakesUTF16())
+		pos = s->u8tou16[pos];	// the tables are indexed by UTF-16 code unit in this case
+	return s->graphemes->pointsToGraphemes[pos];
+}
+// uiAttributedStringGraphemeToByteIndex() returns the byte offset in the UTF-8 text recorded for grapheme index pos; pos is not range-checked.
+size_t uiAttributedStringGraphemeToByteIndex(uiAttributedString *s, size_t pos)
+{
+	recomputeGraphemes(s);
+	pos = s->graphemes->graphemesToPoints[pos];
+	if (graphemesTakesUTF16())
+		pos = s->u16tou8[pos];	// convert the UTF-16 index back to a byte offset
+	return pos;
+}
diff --git a/common/uipriv.h b/common/uipriv.h
index f22a08a6..1fe00b16 100644
--- a/common/uipriv.h
+++ b/common/uipriv.h
@@ -1,6 +1,4 @@
 // 6 april 2015
-// TODO can extern "C"s nest?
-#include "utf.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -8,6 +6,7 @@ extern "C" {
 
 #include
 #include "controlsigs.h"
+#include "utf.h"
 
 extern uiInitOptions options;
 
@@ -56,6 +55,15 @@ extern void fallbackSkew(uiDrawMatrix *, double, double, double, double);
 extern void scaleCenter(double, double, double *, double *);
 extern void fallbackTransformSize(uiDrawMatrix *, double *, double *);
 
+// for attrstr.c
+struct graphemes {
+	size_t len;			// number of graphemes
+	size_t *pointsToGraphemes;	// index into the string given to graphemes() -> grapheme index
+	size_t *graphemesToPoints;	// grapheme index -> index into the string given to graphemes()
+};
+extern int graphemesTakesUTF16(void);	// nonzero if graphemes() wants the UTF-16 copy of the text instead of the UTF-8 one
+extern struct graphemes *graphemes(void *s, size_t len);	// len counts elements of whichever encoding s is in
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/windows/graphemes.cpp b/windows/graphemes.cpp
index 355e4037..d4fc1e1b 100644
--- a/windows/graphemes.cpp
+++ b/windows/graphemes.cpp
@@ -3,9 +3,14 @@
 
 // We could use CharNext() to generate grapheme cluster boundaries, but it doesn't handle surrogate pairs properly (see http://archives.miloush.net/michkap/archive/2008/12/16/9223301.html).
 // So let's use Uniscribe (see http://archives.miloush.net/michkap/archive/2005/01/14/352802.html)
-// See also http://www.catch22.net/tuts/uniscribe-mysteries and http://www.catch22.net/tuts/keyboard-navigation for more details.
+// See also http://www.catch22.net/tuts/uniscribe-mysteries, http://www.catch22.net/tuts/keyboard-navigation, and https://maxradi.us/documents/uniscribe/ for more details.
 
-static HRESULT itemize(WCHAR *msg, size_t len, SCRIPT_ITEM **out, int *outn)
+int graphemesTakesUTF16(void)
+{
+	return 1;		// graphemes() below works on WCHAR strings
+}
+
+static HRESULT itemize(WCHAR *s, size_t len, SCRIPT_ITEM **out, int *outn)
 {
 	SCRIPT_CONTROL sc;
 	SCRIPT_STATE ss;
@@ -20,8 +25,8 @@ static HRESULT itemize(WCHAR *msg, size_t len, SCRIPT_ITEM **out, int *outn)
 
 	maxItems = len + 2;
 	for (;;) {
-		items = new SCRIPT_ITEM[maxItems];
-		hr = ScriptItemize(msg, len,
+		items = new SCRIPT_ITEM[maxItems + 1];	// one more than the maxItems passed below, for the terminal item
+		hr = ScriptItemize(s, len,
 			maxItems,
 			&sc, &ss,
 			items, &n);
@@ -39,42 +44,63 @@ static HRESULT itemize(WCHAR *msg, size_t len, SCRIPT_ITEM **out, int *outn)
 	return S_OK;
 }
 
-size_t *graphemes(WCHAR *msg)
+// graphemes() finds the grapheme boundaries of the len WCHARs at s and returns them as a struct graphemes allocated with uiNew()/uiAlloc(); the caller owns the result.
+struct graphemes *graphemes(void *s, size_t len)
 {
-	size_t len;
+	struct graphemes *g;
+	WCHAR *str = (WCHAR *) s;
 	SCRIPT_ITEM *items;
-	int i, n;
-	size_t *out;
-	size_t *op;
-	SCRIPT_LOGATTR *logattr;
-	int j, nn;
+	int nItems;
+	int curItemIndex;
+	int nCharsInCurItem, i;
+	size_t *pPTG, *pGTP;
 	HRESULT hr;
 
-	len = wcslen(msg);
-	hr = itemize(msg, len, &items, &n);
+	g = uiNew(struct graphemes);
+	hr = itemize(str, len, &items, &nItems);	// NOTE(review): ScriptItemize() is documented to reject an empty string, so len == 0 presumably lands in the error path below - confirm
 	if (hr != S_OK)
 		logHRESULT(L"error itemizing string for finding grapheme cluster boundaries", hr);
+	// one script item can hold many graphemes, so nItems is not the grapheme count; size both tables for the worst case of one grapheme per WCHAR (plus the end entry) and set g->len once the graphemes have been counted
+	g->pointsToGraphemes = (size_t *) uiAlloc((len + 1) * sizeof (size_t), "size_t[] (graphemes)");
+	// note that there are actually nItems + 1 elements in items
+	// items[nItems] is the item one past the end
+	g->graphemesToPoints = (size_t *) uiAlloc((len + 1) * sizeof (size_t), "size_t[] (graphemes)");
 
-	// should be enough; 2 more just to be safe
-	out = (size_t *) uiAlloc((len + 2) * sizeof (size_t), "size_t[]");
-	op = out;
+	pPTG = g->pointsToGraphemes;
+	pGTP = g->graphemesToPoints;
+	for (curItemIndex = 0; curItemIndex < nItems; curItemIndex++) {
+		SCRIPT_ITEM *curItem, *nextItem;
+		SCRIPT_LOGATTR *logattr;
+		size_t *curGTP;
 
-	// note that there are actually n + 1 elements in items
-	for (i = 0; i < n; i++) {
-		nn = items[i + 1].iCharPos - items[i].iCharPos;
-		logattr = new SCRIPT_LOGATTR[nn];
-		hr = ScriptBreak(msg + items[i].iCharPos, nn,
-			&(items[i].a), logattr);
+		curItem = items + curItemIndex;
+		nextItem = curItem + 1;
+
+		nCharsInCurItem = nextItem->iCharPos - curItem->iCharPos;
+
+		logattr = new SCRIPT_LOGATTR[nCharsInCurItem];
+		hr = ScriptBreak(str + curItem->iCharPos, nCharsInCurItem,
+			&(curItem->a), logattr);
 		if (hr != S_OK)
 			logHRESULT(L"error breaking string for finding grapheme cluster boundaries", hr);
-		for (j = 0; j < nn; j++)
-			if (logattr[j].fCharStop != 0)
-				*op++ = items[i].iCharPos + j;
+
+		// TODO can we merge these loops somehow?
+		curGTP = pGTP;
+		for (i = 0; i < nCharsInCurItem; i++)
+			if (logattr[i].fCharStop != 0)
+				*pGTP++ = curItem->iCharPos + i;
+		for (i = 0; i < nCharsInCurItem; i++) {
+			if (i != 0 && logattr[i].fCharStop != 0)	// a later char stop starts the next grapheme; units before it stay with the current one (assumes an item always begins on a char stop - confirm)
+				curGTP++;
+			*pPTG++ = curGTP - g->graphemesToPoints;
+		}
+
 		delete[] logattr;
 	}
 	// and handle the last item for the end of the string
-	*op++ = items[i].iCharPos;
+	*pPTG = g->len = pGTP - g->graphemesToPoints;	// every char stop has been counted by now; the end position maps to grapheme index g->len
+	*pGTP = items[nItems].iCharPos;	// and grapheme index g->len maps back to the end position
 
 	delete[] items;
-	return out;
+	return g;
 }