Gave up and rewrote Windows graphemes.cpp to use CharNextW() instead of Uniscribe, since I can't manage Uniscribe memory properly, it seems. The CharNextW() bug is relatively painless to work around anyway. Next commit will drop Uniscribe from the headers and build system.

2017-02-10 22:12:37 -05:00 · 2017-02-10 22:12:37 -05:00 · c4b6149ec2
parent 5a1c733841
commit c4b6149ec2
1 changed file with 29 additions and 79 deletions
--- a/windows/graphemes.cpp
+++ b/windows/graphemes.cpp
@@ -1,109 +1,59 @@
 // 25 may 2016
 #include "uipriv_windows.hpp"

-// We could use CharNext() to generate grapheme cluster boundaries, but it doesn't handle surrogate pairs properly (see http://archives.miloush.net/michkap/archive/2008/12/16/9223301.html).
-// So let's use Uniscribe (see http://archives.miloush.net/michkap/archive/2005/01/14/352802.html)
-// See also http://www.catch22.net/tuts/uniscribe-mysteries, http://www.catch22.net/tuts/keyboard-navigation, and https://maxradi.us/documents/uniscribe/ for more details.
-
-// TODO the DirectWrite equivalent appears to be https://msdn.microsoft.com/en-us/library/windows/desktop/dd316625(v=vs.85).aspx but is somehow somewhat more complicated to use than Uniscribe is! maybe the PadWrite sample uses it? or should we just keep using Uniscribe?
+// We could use CharNextW() to generate grapheme cluster boundaries, but it doesn't handle surrogate pairs properly (see http://archives.miloush.net/michkap/archive/2008/12/16/9223301.html).
+// We could also use Uniscribe (see http://archives.miloush.net/michkap/archive/2005/01/14/352802.html, http://www.catch22.net/tuts/uniscribe-mysteries, http://www.catch22.net/tuts/keyboard-navigation, and https://maxradi.us/documents/uniscribe/), but its rules for buffer sizes are convoluted.
+// Let's just deal with the CharNextW() bug.

 int graphemesTakesUTF16(void)
 {
 	return 1;
 }

-static HRESULT itemize(WCHAR *s, size_t len, SCRIPT_ITEM **out, int *outn)
-{
-	SCRIPT_CONTROL sc;
-	SCRIPT_STATE ss;
-	SCRIPT_ITEM *items;
-	size_t maxItems;
-	int n;
-	HRESULT hr;
-
-	maxItems = len + 2;
-	for (;;) {
-		// make sure these are zero-initialized to avoid mangling the text
-		ZeroMemory(&sc, sizeof (SCRIPT_CONTROL));
-		ZeroMemory(&ss, sizeof (SCRIPT_STATE));
-		items = new SCRIPT_ITEM[maxItems + 1];
-		hr = ScriptItemize(s, len,
-			maxItems,
-			&sc, &ss,
-			items, &n);
-		if (hr == S_OK)
-			break;
-		// otherwise either an error or not enough room
-		delete[] items;
-		if (hr != E_OUTOFMEMORY)
-			return hr;
-		maxItems *= 2;		// add some more and try again
-	}
-
-	*out = items;
-	*outn = n;
-	return S_OK;
-}
-
 struct graphemes *graphemes(void *s, size_t len)
 {
 	struct graphemes *g;
-	WCHAR *str = (WCHAR *) s;
-	SCRIPT_ITEM *items;
-	int nItems;
-	int curItemIndex;
-	int nCharsInCurItem;
+	WCHAR *str;
 	size_t *pPTG, *pGTP;
-	HRESULT hr;

 	g = uiNew(struct graphemes);

-	hr = itemize(str, len, &items, &nItems);
-	if (hr != S_OK)
-		logHRESULT(L"error itemizing string for finding grapheme cluster boundaries", hr);
-	g->len = nItems;
+	g->len = 0;
+	str = (WCHAR *) s;
+	while (*str != L'\0') {
+		g->len++;
+		str = CharNextW(str);
+		// no need to worry about surrogates if we're just counting
+	}
+
 	g->pointsToGraphemes = (size_t *) uiAlloc((len + 1) * sizeof (size_t), "size_t[] (graphemes)");
-	// note that there are actually nItems + 1 elements in items
-	// items[nItems] is the grapheme one past the end
 	g->graphemesToPoints = (size_t *) uiAlloc((g->len + 1) * sizeof (size_t), "size_t[] (graphemes)");

 	pPTG = g->pointsToGraphemes;
 	pGTP = g->graphemesToPoints;
-	for (curItemIndex = 0; curItemIndex < nItems; curItemIndex++) {
-		SCRIPT_ITEM *curItem, *nextItem;
-		SCRIPT_LOGATTR *logattr;
-		size_t *curGTP;
-		int i;
+	str = (WCHAR *) s;
+	while (*str != L'\0') {
+		WCHAR *next, *p;
+		ptrdiff_t nextoff;

-		curItem = items + curItemIndex;
-		nextItem = curItem + 1;
+		// as part of the bug, we need to make sure we only call CharNextW() on low halves, otherwise it'll return the same low half forever
+		nextoff = 0;
+		if (IS_HIGH_SURROGATE(*str))
+			nextoff = 1;
+		next = CharNextW(str + nextoff);
+		if (IS_LOW_SURROGATE(*next))
+			next--;

-		nCharsInCurItem = nextItem->iCharPos - curItem->iCharPos;
+		*pGTP = pPTG - g->pointsToGraphemes;
+		for (p = str; p < next; p++)
+			*pPTG++ = pGTP - g->graphemesToPoints;
+		pGTP++;

-		logattr = new SCRIPT_LOGATTR[nCharsInCurItem];
-		hr = ScriptBreak(str + curItem->iCharPos, nCharsInCurItem,
-			&(curItem->a), logattr);
-		if (hr != S_OK)
-			logHRESULT(L"error breaking string for finding grapheme cluster boundaries", hr);
-
-		// TODO can we merge these loops somehow?
-		curGTP = pGTP;
-		for (i = 0; i < nCharsInCurItem; i++)
-			if (logattr[i].fCharStop != 0)
-				*pGTP++ = curItem->iCharPos + i;
-		for (i = 0; i < nCharsInCurItem; i++) {
-			*pPTG++ = curGTP - g->graphemesToPoints;
-			if (logattr[i].fCharStop != 0)
-				curGTP++;
-		}
-
-		delete[] logattr;
+		str = next;
 	}
 	// and handle the last item for the end of the string
-	*pGTP = items[nItems].iCharPos;
+	*pGTP = pPTG - g->pointsToGraphemes;
 	*pPTG = pGTP - g->graphemesToPoints;
-	// TODO is any of the above broken?... also for all platforms, are the last few bytes not covered?

-	delete[] items;
 	return g;
 }