Gave up and rewrote Windows graphemes.cpp to use CharNextW() instead of Uniscribe, since I can't manage Uniscribe memory properly, it seems. The CharNextW() bug is relatively painless to work around anyway. Next commit will drop Uniscribe from the headers and build system.
This commit is contained in:
parent
5a1c733841
commit
c4b6149ec2
|
@ -1,109 +1,59 @@
|
|||
// 25 may 2016
|
||||
#include "uipriv_windows.hpp"
|
||||
|
||||
// We could use CharNext() to generate grapheme cluster boundaries, but it doesn't handle surrogate pairs properly (see http://archives.miloush.net/michkap/archive/2008/12/16/9223301.html).
|
||||
// So let's use Uniscribe (see http://archives.miloush.net/michkap/archive/2005/01/14/352802.html)
|
||||
// See also http://www.catch22.net/tuts/uniscribe-mysteries, http://www.catch22.net/tuts/keyboard-navigation, and https://maxradi.us/documents/uniscribe/ for more details.
|
||||
|
||||
// TODO the DirectWrite equivalent appears to be https://msdn.microsoft.com/en-us/library/windows/desktop/dd316625(v=vs.85).aspx but is somehow somewhat more complicated to use than Uniscribe is! maybe the PadWrite sample uses it? or should we just keep using Uniscribe?
|
||||
// We could use CharNextW() to generate grapheme cluster boundaries, but it doesn't handle surrogate pairs properly (see http://archives.miloush.net/michkap/archive/2008/12/16/9223301.html).
|
||||
// We could also use Uniscribe (see http://archives.miloush.net/michkap/archive/2005/01/14/352802.html, http://www.catch22.net/tuts/uniscribe-mysteries, http://www.catch22.net/tuts/keyboard-navigation, and https://maxradi.us/documents/uniscribe/), but its rules for buffer sizes are convoluted.
|
||||
// Let's just deal with the CharNextW() bug.
|
||||
|
||||
// graphemesTakesUTF16() always returns 1.
// NOTE(review): presumably this tells platform-independent callers that graphemes() on Windows expects UTF-16 (WCHAR) text rather than UTF-8 — confirm against the declaration in the shared header.
int graphemesTakesUTF16(void)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
// itemize() runs ScriptItemize() over the len WCHARs at s, using zeroed SCRIPT_CONTROL and SCRIPT_STATE structures.
// The item limit starts at len + 2 (the buffer is allocated with one extra slot) and is doubled and retried each time ScriptItemize() returns E_OUTOFMEMORY.
// On S_OK, *out receives the new[]-allocated item array and *outn the item count; the caller must release *out with delete[].
// Any other HRESULT is returned as-is after the buffer is freed; *out and *outn are left untouched in that case.
static HRESULT itemize(WCHAR *s, size_t len, SCRIPT_ITEM **out, int *outn)
|
||||
{
|
||||
SCRIPT_CONTROL sc;
|
||||
SCRIPT_STATE ss;
|
||||
SCRIPT_ITEM *items;
|
||||
size_t maxItems;
|
||||
int n;
|
||||
HRESULT hr;
|
||||
|
||||
maxItems = len + 2;
|
||||
for (;;) {
|
||||
// make sure these are zero-initialized to avoid mangling the text
|
||||
ZeroMemory(&sc, sizeof (SCRIPT_CONTROL));
|
||||
ZeroMemory(&ss, sizeof (SCRIPT_STATE));
|
||||
items = new SCRIPT_ITEM[maxItems + 1];
|
||||
hr = ScriptItemize(s, len,
|
||||
maxItems,
|
||||
&sc, &ss,
|
||||
items, &n);
|
||||
if (hr == S_OK)
|
||||
break;
|
||||
// otherwise either an error or not enough room
|
||||
delete[] items;
|
||||
if (hr != E_OUTOFMEMORY)
|
||||
return hr;
|
||||
maxItems *= 2; // add some more and try again
|
||||
}
|
||||
|
||||
*out = items;
|
||||
*outn = n;
|
||||
return S_OK;
|
||||
}
|
||||
|
||||
// graphemes() allocates a struct graphemes for the WCHAR string s and fills its two index tables:
// pointsToGraphemes (one entry per WCHAR offset; len + 1 entries allocated) and graphemesToPoints (one entry per grapheme; g->len + 1 entries allocated), then returns it.
// NOTE(review): this span is a rendered diff without +/- markers. Lines from the Uniscribe implementation (itemize()/ScriptBreak()) and from the CharNextW() implementation are interleaved (str is declared twice, g->len is assigned twice, the final *pGTP is stored twice), so it does not compile as shown — consult the repository for the actual function body.
// NOTE(review): the CharNextW() lines walk s until a L'\0' terminator, so that version presumably requires s to be NUL-terminated — TODO confirm with callers.
struct graphemes *graphemes(void *s, size_t len)
|
||||
{
|
||||
struct graphemes *g;
|
||||
WCHAR *str = (WCHAR *) s;
|
||||
SCRIPT_ITEM *items;
|
||||
int nItems;
|
||||
int curItemIndex;
|
||||
int nCharsInCurItem;
|
||||
WCHAR *str;
|
||||
size_t *pPTG, *pGTP;
|
||||
HRESULT hr;
|
||||
|
||||
g = uiNew(struct graphemes);
|
||||
|
||||
hr = itemize(str, len, &items, &nItems);
|
||||
if (hr != S_OK)
|
||||
logHRESULT(L"error itemizing string for finding grapheme cluster boundaries", hr);
|
||||
g->len = nItems;
|
||||
g->len = 0;
|
||||
str = (WCHAR *) s;
|
||||
while (*str != L'\0') {
|
||||
g->len++;
|
||||
str = CharNextW(str);
|
||||
// no need to worry about surrogates if we're just counting
|
||||
}
|
||||
|
||||
g->pointsToGraphemes = (size_t *) uiAlloc((len + 1) * sizeof (size_t), "size_t[] (graphemes)");
|
||||
// note that there are actually nItems + 1 elements in items
|
||||
// items[nItems] is the grapheme one past the end
|
||||
g->graphemesToPoints = (size_t *) uiAlloc((g->len + 1) * sizeof (size_t), "size_t[] (graphemes)");
|
||||
|
||||
pPTG = g->pointsToGraphemes;
|
||||
pGTP = g->graphemesToPoints;
|
||||
for (curItemIndex = 0; curItemIndex < nItems; curItemIndex++) {
|
||||
SCRIPT_ITEM *curItem, *nextItem;
|
||||
SCRIPT_LOGATTR *logattr;
|
||||
size_t *curGTP;
|
||||
int i;
|
||||
str = (WCHAR *) s;
|
||||
while (*str != L'\0') {
|
||||
WCHAR *next, *p;
|
||||
ptrdiff_t nextoff;
|
||||
|
||||
curItem = items + curItemIndex;
|
||||
nextItem = curItem + 1;
|
||||
// as part of the bug, we need to make sure we only call CharNextW() on low halves, otherwise it'll return the same low half forever
|
||||
nextoff = 0;
|
||||
if (IS_HIGH_SURROGATE(*str))
|
||||
nextoff = 1;
|
||||
next = CharNextW(str + nextoff);
|
||||
if (IS_LOW_SURROGATE(*next))
|
||||
next--;
|
||||
|
||||
nCharsInCurItem = nextItem->iCharPos - curItem->iCharPos;
|
||||
*pGTP = pPTG - g->pointsToGraphemes;
|
||||
for (p = str; p < next; p++)
|
||||
*pPTG++ = pGTP - g->graphemesToPoints;
|
||||
pGTP++;
|
||||
|
||||
logattr = new SCRIPT_LOGATTR[nCharsInCurItem];
|
||||
hr = ScriptBreak(str + curItem->iCharPos, nCharsInCurItem,
|
||||
&(curItem->a), logattr);
|
||||
if (hr != S_OK)
|
||||
logHRESULT(L"error breaking string for finding grapheme cluster boundaries", hr);
|
||||
|
||||
// TODO can we merge these loops somehow?
|
||||
curGTP = pGTP;
|
||||
for (i = 0; i < nCharsInCurItem; i++)
|
||||
if (logattr[i].fCharStop != 0)
|
||||
*pGTP++ = curItem->iCharPos + i;
|
||||
for (i = 0; i < nCharsInCurItem; i++) {
|
||||
*pPTG++ = curGTP - g->graphemesToPoints;
|
||||
if (logattr[i].fCharStop != 0)
|
||||
curGTP++;
|
||||
}
|
||||
|
||||
delete[] logattr;
|
||||
str = next;
|
||||
}
|
||||
// and handle the last item for the end of the string
|
||||
*pGTP = items[nItems].iCharPos;
|
||||
*pGTP = pPTG - g->pointsToGraphemes;
|
||||
*pPTG = pGTP - g->graphemesToPoints;
|
||||
// TODO is any of the above broken?... also for all platforms, are the last few bytes not covered?
|
||||
|
||||
delete[] items;
|
||||
return g;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue