Bug 1105644 - Lowercase words before passing them to libhyphen, so as to match patterns fully. r=smontagu

2024-09-13 09:24:08 -07:00 · 2015-02-18 09:25:51 +00:00 · 2015-02-18 09:25:51 +00:00 · 6aba8af82d
commit 6aba8af82d
parent ef5fa9a320
1 changed files with 40 additions and 3 deletions
--- a/intl/hyphenation/nsHyphenator.cpp
+++ b/intl/hyphenation/nsHyphenator.cpp
@ -80,9 +80,46 @@ nsHyphenator::Hyphenate(const nsAString& aString,
    }

    if (inWord) {
-      const char16_t *begin = aString.BeginReading();
-      NS_ConvertUTF16toUTF8 utf8(begin + wordStart,
-                                 wordLimit - wordStart);
+      // Convert the word to utf-8 for libhyphen, lowercasing it as we go
+      // so that it will match the (lowercased) patterns (bug 1105644).
+      nsAutoCString utf8;
+      const char16_t *begin = aString.BeginReading() + wordStart;
+      const char16_t *end = aString.BeginReading() + wordLimit;
+      while (begin < end) {
+        uint32_t ch = *begin++;
+
+        if (NS_IS_HIGH_SURROGATE(ch)) {
+          if (begin < end && NS_IS_LOW_SURROGATE(*begin)) {
+            ch = SURROGATE_TO_UCS4(ch, *begin++);
+          } else {
+            ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR
+          }
+        } else if (NS_IS_LOW_SURROGATE(ch)) {
+          ch = 0xfffd; // unpaired surrogate
+        }
+
+        // XXX What about language-specific casing? Consider Turkish I/i...
+        // In practice, it looks like the current patterns will not be
+        // affected by this, as they treat dotted and undotted i similarly.
+        ch = ToLowerCase(ch);
+
+        if (ch < 0x80) { // U+0000 - U+007F
+          utf8.Append(ch);
+        } else if (ch < 0x0800) { // U+0100 - U+07FF
+          utf8.Append(0xC0 | (ch >> 6));
+          utf8.Append(0x80 | (0x003F & ch));
+        } else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF
+          utf8.Append(0xE0 | (ch >> 12));
+          utf8.Append(0x80 | (0x003F & (ch >> 6)));
+          utf8.Append(0x80 | (0x003F & ch));
+        } else {
+          utf8.Append(0xF0 | (ch >> 18));
+          utf8.Append(0x80 | (0x003F & (ch >> 12)));
+          utf8.Append(0x80 | (0x003F & (ch >> 6)));
+          utf8.Append(0x80 | (0x003F & ch));
+        }
+      }
+
      nsAutoTArray<char,200> utf8hyphens;
      utf8hyphens.SetLength(utf8.Length() + 5);
      char **rep = nullptr;