Bug 1105644 - Lowercase words before passing them to libhyphen, so as to match patterns fully. r=smontagu

This commit is contained in:
Jonathan Kew 2015-02-18 09:25:51 +00:00
parent ef5fa9a320
commit 6aba8af82d

View File

@ -80,9 +80,46 @@ nsHyphenator::Hyphenate(const nsAString& aString,
}
if (inWord) {
const char16_t *begin = aString.BeginReading();
NS_ConvertUTF16toUTF8 utf8(begin + wordStart,
wordLimit - wordStart);
// Convert the word to utf-8 for libhyphen, lowercasing it as we go
// so that it will match the (lowercased) patterns (bug 1105644).
nsAutoCString utf8;
const char16_t *begin = aString.BeginReading() + wordStart;
const char16_t *end = aString.BeginReading() + wordLimit;
while (begin < end) {
uint32_t ch = *begin++;
if (NS_IS_HIGH_SURROGATE(ch)) {
if (begin < end && NS_IS_LOW_SURROGATE(*begin)) {
ch = SURROGATE_TO_UCS4(ch, *begin++);
} else {
ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR
}
} else if (NS_IS_LOW_SURROGATE(ch)) {
ch = 0xfffd; // unpaired surrogate
}
// XXX What about language-specific casing? Consider Turkish I/i...
// In practice, it looks like the current patterns will not be
// affected by this, as they treat dotted and undotted i similarly.
ch = ToLowerCase(ch);
if (ch < 0x80) { // U+0000 - U+007F
utf8.Append(ch);
} else if (ch < 0x0800) { // U+0080 - U+07FF
utf8.Append(0xC0 | (ch >> 6));
utf8.Append(0x80 | (0x003F & ch));
} else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF
utf8.Append(0xE0 | (ch >> 12));
utf8.Append(0x80 | (0x003F & (ch >> 6)));
utf8.Append(0x80 | (0x003F & ch));
} else {
utf8.Append(0xF0 | (ch >> 18));
utf8.Append(0x80 | (0x003F & (ch >> 12)));
utf8.Append(0x80 | (0x003F & (ch >> 6)));
utf8.Append(0x80 | (0x003F & ch));
}
}
nsAutoTArray<char,200> utf8hyphens;
utf8hyphens.SetLength(utf8.Length() + 5);
char **rep = nullptr;