gecko/intl/hyphenation/nsHyphenator.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "nsHyphenator.h"

#include <stdlib.h>

#include "nsIFile.h"
#include "nsUTF8Utils.h"
#include "nsUnicodeProperties.h"
#include "nsUnicharUtilCIID.h"
#include "nsIURI.h"

#include "hyphen.h"

// Construct a hyphenator from the hyphenation resource identified by aURI.
// The URI spec string is handed to hnj_hyphen_load(); if the spec cannot be
// obtained, or loading fails, mDict stays null and IsValid() reports false.
nsHyphenator::nsHyphenator(nsIURI *aURI)
  : mDict(nullptr)
{
  nsCString spec;
  const nsresult status = aURI->GetSpec(spec);
  if (!NS_FAILED(status)) {
    mDict = hnj_hyphen_load(spec.get());
#ifdef DEBUG
    // Debug builds announce each successfully loaded pattern file.
    if (mDict != nullptr) {
      printf("loaded hyphenation patterns from %s\n", spec.get());
    }
#endif
  }
}

// Release the dictionary obtained in the constructor, if there is one.
nsHyphenator::~nsHyphenator()
{
  if (!mDict) {
    // Nothing was ever loaded, so there is nothing to free.
    return;
  }
  hnj_hyphen_free((HyphenDict*)mDict);
  mDict = nullptr;
}

// Report whether a hyphenation dictionary is currently held (mDict non-null).
bool
nsHyphenator::IsValid()
{
  return mDict ? true : false;
}

/**
 * Find hyphenation points in aString using the loaded libhyphen dictionary.
 *
 * aHyphens is resized to aString.Length() and cleared to false. The text is
 * then split into "words" (maximal runs of characters whose general category
 * is Letter or Mark); each word is lowercased, converted to UTF-8 and passed
 * to hnj_hyphen_hyphenate2(). For every character of the word whose libhyphen
 * result byte has its low bit set, aHyphens[offset] is set to true, where
 * offset is the UTF-16 code unit index at which that character starts.
 *
 * @param aString  the text to examine.
 * @param aHyphens receives one flag per UTF-16 code unit of aString.
 * @return NS_ERROR_OUT_OF_MEMORY if aHyphens cannot be resized; NS_OK
 *         otherwise. A word for which libhyphen reports an error is simply
 *         left without any hyphenation flags.
 */
nsresult
nsHyphenator::Hyphenate(const nsAString& aString,
                        FallibleTArray<bool>& aHyphens)
{
  if (!aHyphens.SetLength(aString.Length())) {
    return NS_ERROR_OUT_OF_MEMORY;
  }
  memset(aHyphens.Elements(), false, aHyphens.Length() * sizeof(bool));

  bool inWord = false;
  uint32_t wordStart = 0, wordLimit = 0;
  uint32_t chLen;
  for (uint32_t i = 0; i < aString.Length(); i += chLen) {
    uint32_t ch = aString[i];
    chLen = 1;

    // Combine a surrogate pair into a single code point so that it is
    // classified (and stepped over) as one character.
    if (NS_IS_HIGH_SURROGATE(ch)) {
      if (i + 1 < aString.Length() && NS_IS_LOW_SURROGATE(aString[i+1])) {
        ch = SURROGATE_TO_UCS4(ch, aString[i+1]);
        chLen = 2;
      } else {
        NS_WARNING("unpaired surrogate found during hyphenation");
      }
    }

    nsIUGenCategory::nsUGenCategory cat = mozilla::unicode::GetGenCategory(ch);
    if (cat == nsIUGenCategory::kLetter || cat == nsIUGenCategory::kMark) {
      if (!inWord) {
        inWord = true;
        wordStart = i;
      }
      wordLimit = i + chLen;
      // Keep accumulating unless this was the last character of the string,
      // in which case we fall through and process the final word now.
      if (i + chLen < aString.Length()) {
        continue;
      }
    }

    if (inWord) {
      // Convert the word to utf-8 for libhyphen, lowercasing it as we go
      // so that it will match the (lowercased) patterns (bug 1105644).
      nsAutoCString utf8;
      const char16_t* const begin = aString.BeginReading();
      const char16_t *cur = begin + wordStart;
      const char16_t *end = begin + wordLimit;
      while (cur < end) {
        uint32_t wordCh = *cur++;

        if (NS_IS_HIGH_SURROGATE(wordCh)) {
          if (cur < end && NS_IS_LOW_SURROGATE(*cur)) {
            wordCh = SURROGATE_TO_UCS4(wordCh, *cur++);
          } else {
            wordCh = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR
          }
        } else if (NS_IS_LOW_SURROGATE(wordCh)) {
          wordCh = 0xfffd; // unpaired surrogate
        }

        // XXX What about language-specific casing? Consider Turkish I/i...
        // In practice, it looks like the current patterns will not be
        // affected by this, as they treat dotted and undotted i similarly.
        wordCh = ToLowerCase(wordCh);

        if (wordCh < 0x80) { // U+0000 - U+007F
          utf8.Append(wordCh);
        } else if (wordCh < 0x0800) { // U+0080 - U+07FF
          utf8.Append(0xC0 | (wordCh >> 6));
          utf8.Append(0x80 | (0x003F & wordCh));
        } else if (wordCh < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF
          utf8.Append(0xE0 | (wordCh >> 12));
          utf8.Append(0x80 | (0x003F & (wordCh >> 6)));
          utf8.Append(0x80 | (0x003F & wordCh));
        } else { // U+10000 and above: four-byte sequence
          utf8.Append(0xF0 | (wordCh >> 18));
          utf8.Append(0x80 | (0x003F & (wordCh >> 12)));
          utf8.Append(0x80 | (0x003F & (wordCh >> 6)));
          utf8.Append(0x80 | (0x003F & wordCh));
        }
      }

      nsAutoTArray<char,200> utf8hyphens;
      utf8hyphens.SetLength(utf8.Length() + 5);
      // Out-parameters for non-standard hyphenation data. We do not use the
      // contents, but per the libhyphen documentation the library may
      // allocate these arrays and leaves it to the caller to free them.
      char **rep = nullptr;
      int *pos = nullptr;
      int *cut = nullptr;
      int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict,
                                      utf8.BeginReading(), utf8.Length(),
                                      utf8hyphens.Elements(), nullptr,
                                      &rep, &pos, &cut);
      if (!err) {
        // Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer
        // from utf8 code unit indexing (which would match the utf8 input
        // string directly) to Unicode character indexing.
        // We then need to convert this to utf16 code unit offsets for Gecko.
        const char *hyphPtr = utf8hyphens.Elements();
        const char16_t *cur = begin + wordStart;
        const char16_t *end = begin + wordLimit;
        while (cur < end) {
          if (*hyphPtr & 0x01) {
            aHyphens[cur - begin] = true;
          }
          cur++;
          // Step over the trailing half of a valid surrogate pair so that
          // one libhyphen entry corresponds to one Unicode character.
          if (cur < end && NS_IS_LOW_SURROGATE(*cur) &&
              NS_IS_HIGH_SURROGATE(*(cur-1)))
          {
            cur++;
          }
          hyphPtr++;
        }
      }

      // Release anything libhyphen allocated for this word; without this,
      // every word hyphenated with a dictionary that has non-standard
      // hyphenation rules would leak these arrays. All three pointers start
      // out null, and free(nullptr) is a no-op, so this is safe whether or
      // not the call succeeded or allocated anything.
      if (rep) {
        for (uint32_t j = 0; j < utf8.Length(); ++j) {
          free(rep[j]);
        }
        free(rep);
      }
      free(pos);
      free(cut);
    }

    inWord = false;
  }

  return NS_OK;
}
bug 846732 - replace tri-license boilerplate with MPL2 in our hyphenation code. r=gerv 2013-03-01 05:41:30 -08:00			`/* -- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -- */`
			`/* This Source Code Form is subject to the terms of the Mozilla Public`
			`* License, v. 2.0. If a copy of the MPL was not distributed with this`
			`* file, You can obtain one at http://mozilla.org/MPL/2.0/. */`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00
			`#include "nsHyphenator.h"`
			`#include "nsIFile.h"`
			`#include "nsUTF8Utils.h"`
bug 724826 pt 3 - replace users of nsIUGenCategory service within libxul with direct access to mozilla::unicode::GetGenCategory. r=smontagu 2012-02-24 02:15:46 -08:00			`#include "nsUnicodeProperties.h"`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`#include "nsUnicharUtilCIID.h"`
bug 655337 - part 1 - use nsIURI rather than nsIFile to specify hyphenation resources. r=mfinkle,smontagu 2011-10-06 08:06:32 -07:00			`#include "nsIURI.h"`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00
			`#include "hyphen.h"`

bug 655337 - part 1 - use nsIURI rather than nsIFile to specify hyphenation resources. r=mfinkle,smontagu 2011-10-06 08:06:32 -07:00			`nsHyphenator::nsHyphenator(nsIURI *aURI)`
Bug 777292 part 2 - Change all nsnull to nullptr 2012-07-30 07:20:58 -07:00			`: mDict(nullptr)`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`{`
bug 655337 - part 1 - use nsIURI rather than nsIFile to specify hyphenation resources. r=mfinkle,smontagu 2011-10-06 08:06:32 -07:00			`nsCString uriSpec;`
			`nsresult rv = aURI->GetSpec(uriSpec);`
bug 685214 - use URI spec rather than file path to specify hyphenation dictionary, and read using nsIInputStream rather than stdio. r=bsmedberg 2011-09-14 12:20:26 -07:00			`if (NS_FAILED(rv)) {`
			`return;`
			`}`
bug 655337 - part 1 - use nsIURI rather than nsIFile to specify hyphenation resources. r=mfinkle,smontagu 2011-10-06 08:06:32 -07:00			`mDict = hnj_hyphen_load(uriSpec.get());`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`#ifdef DEBUG`
			`if (mDict) {`
bug 655337 - part 1 - use nsIURI rather than nsIFile to specify hyphenation resources. r=mfinkle,smontagu 2011-10-06 08:06:32 -07:00			`printf("loaded hyphenation patterns from %s\n", uriSpec.get());`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`}`
			`#endif`
			`}`

			`nsHyphenator::~nsHyphenator()`
			`{`
Bug 777292 part 2 - Change all nsnull to nullptr 2012-07-30 07:20:58 -07:00			`if (mDict != nullptr) {`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`hnj_hyphen_free((HyphenDict*)mDict);`
Bug 777292 part 2 - Change all nsnull to nullptr 2012-07-30 07:20:58 -07:00			`mDict = nullptr;`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`}`
			`}`

Bug 675553 - Switch from PRBool to bool on a CLOSED TREE , r=bsmedberg,khuey,bz,cjones --HG-- rename : tools/trace-malloc/bloatblame.c => tools/trace-malloc/bloatblame.cpp 2011-09-28 23:19:26 -07:00			`bool`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`nsHyphenator::IsValid()`
			`{`
Bug 777292 part 2 - Change all nsnull to nullptr 2012-07-30 07:20:58 -07:00			`return (mDict != nullptr);`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`}`

			`nsresult`
			`nsHyphenator::Hyphenate(const nsAString& aString,`
Bug 969871 - Use fallible allocation for text hyphenation; r=jfkthame It seems like the sizes for these data structures can be controlled from Web content, and we are already prepared to deal with OOM conditions, except that we are using infallible allocations by mistake. 2014-02-08 10:10:44 -08:00			`FallibleTArray<bool>& aHyphens)`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`{`
			`if (!aHyphens.SetLength(aString.Length())) {`
			`return NS_ERROR_OUT_OF_MEMORY;`
			`}`
Bug 1109972: nsHyphenator should not assume sizeof(bool) == sizeof(char). r=dbaron 2014-12-10 17:10:35 -08:00			`memset(aHyphens.Elements(), false, aHyphens.Length() * sizeof(bool));`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00
Bug 675553 - Switch from PRBool to bool on a CLOSED TREE , r=bsmedberg,khuey,bz,cjones --HG-- rename : tools/trace-malloc/bloatblame.c => tools/trace-malloc/bloatblame.cpp 2011-09-28 23:19:26 -07:00			`bool inWord = false;`
Bug 579517 - Part 1: Automated conversion of NSPR numeric types to stdint types in Gecko; r=bsmedberg This patch was generated by a script. Here's the source of the script for future reference: function convert() { echo "Converting $1 to $2..." find . ! -wholename "nsprpub" \ ! -wholename "security/nss" \ ! -wholename "/.hg" \ ! -wholename "obj-ff-dbg" \ ! -name nsXPCOMCID.h \ ! -name prtypes.h \ -type f \ \( -iname ".cpp" \ -o -iname ".h" \ -o -iname ".c" \ -o -iname ".cc" \ -o -iname ".idl" \ -o -iname ".ipdl" \ -o -iname ".ipdlh" \ -o -iname "*.mm" \) \| \ xargs -n 1 sed -i -e "s/\b$1\b/$2/g" } convert PRInt8 int8_t convert PRUint8 uint8_t convert PRInt16 int16_t convert PRUint16 uint16_t convert PRInt32 int32_t convert PRUint32 uint32_t convert PRInt64 int64_t convert PRUint64 uint64_t convert PRIntn int convert PRUintn unsigned convert PRSize size_t convert PROffset32 int32_t convert PROffset64 int64_t convert PRPtrdiff ptrdiff_t convert PRFloat64 double 2012-08-22 08:56:38 -07:00			`uint32_t wordStart = 0, wordLimit = 0;`
			`uint32_t chLen;`
			`for (uint32_t i = 0; i < aString.Length(); i += chLen) {`
			`uint32_t ch = aString[i];`
bug 672472 - convert hyphenation-point offsets correctly from Unicode characters to UTF16 code units. r=smontagu 2011-07-20 03:15:06 -07:00			`chLen = 1;`

			`if (NS_IS_HIGH_SURROGATE(ch)) {`
			`if (i + 1 < aString.Length() && NS_IS_LOW_SURROGATE(aString[i+1])) {`
			`ch = SURROGATE_TO_UCS4(ch, aString[i+1]);`
			`chLen = 2;`
			`} else {`
			`NS_WARNING("unpaired surrogate found during hyphenation");`
			`}`
			`}`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00
bug 724826 pt 3 - replace users of nsIUGenCategory service within libxul with direct access to mozilla::unicode::GetGenCategory. r=smontagu 2012-02-24 02:15:46 -08:00			`nsIUGenCategory::nsUGenCategory cat = mozilla::unicode::GetGenCategory(ch);`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`if (cat == nsIUGenCategory::kLetter \|\| cat == nsIUGenCategory::kMark) {`
			`if (!inWord) {`
Bug 690892 - Replace PR_TRUE/PR_FALSE with true/false on mozilla-central; rs=dbaron Landing on a CLOSED TREE 2011-10-17 07:59:28 -07:00			`inWord = true;`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`wordStart = i;`
			`}`
bug 672472 - convert hyphenation-point offsets correctly from Unicode characters to UTF16 code units. r=smontagu 2011-07-20 03:15:06 -07:00			`wordLimit = i + chLen;`
			`if (i + chLen < aString.Length()) {`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`continue;`
			`}`
			`}`

			`if (inWord) {`
Bug 1105644 - Lowercase words before passing them to libhyphen, so as to match patterns fully. r=smontagu 2015-02-16 02:28:54 -08:00			`// Convert the word to utf-8 for libhyphen, lowercasing it as we go`
			`// so that it will match the (lowercased) patterns (bug 1105644).`
			`nsAutoCString utf8;`
			`const char16_t* const begin = aString.BeginReading();`
			`const char16_t *cur = begin + wordStart;`
			`const char16_t *end = begin + wordLimit;`
			`while (cur < end) {`
			`uint32_t ch = *cur++;`

			`if (NS_IS_HIGH_SURROGATE(ch)) {`
			`if (cur < end && NS_IS_LOW_SURROGATE(*cur)) {`
			`ch = SURROGATE_TO_UCS4(ch, *cur++);`
			`} else {`
			`ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR`
			`}`
			`} else if (NS_IS_LOW_SURROGATE(ch)) {`
			`ch = 0xfffd; // unpaired surrogate`
			`}`

			`// XXX What about language-specific casing? Consider Turkish I/i...`
			`// In practice, it looks like the current patterns will not be`
			`// affected by this, as they treat dotted and undotted i similarly.`
			`ch = ToLowerCase(ch);`

			`if (ch < 0x80) { // U+0000 - U+007F`
			`utf8.Append(ch);`
			`} else if (ch < 0x0800) { // U+0100 - U+07FF`
			`utf8.Append(0xC0 \| (ch >> 6));`
			`utf8.Append(0x80 \| (0x003F & ch));`
			`} else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF`
			`utf8.Append(0xE0 \| (ch >> 12));`
			`utf8.Append(0x80 \| (0x003F & (ch >> 6)));`
			`utf8.Append(0x80 \| (0x003F & ch));`
			`} else {`
			`utf8.Append(0xF0 \| (ch >> 18));`
			`utf8.Append(0x80 \| (0x003F & (ch >> 12)));`
			`utf8.Append(0x80 \| (0x003F & (ch >> 6)));`
			`utf8.Append(0x80 \| (0x003F & ch));`
			`}`
			`}`

bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`nsAutoTArray<char,200> utf8hyphens;`
			`utf8hyphens.SetLength(utf8.Length() + 5);`
Bug 777292 part 2 - Change all nsnull to nullptr 2012-07-30 07:20:58 -07:00			`char **rep = nullptr;`
			`int *pos = nullptr;`
			`int *cut = nullptr;`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict,`
			`utf8.BeginReading(), utf8.Length(),`
Bug 777292 part 2 - Change all nsnull to nullptr 2012-07-30 07:20:58 -07:00			`utf8hyphens.Elements(), nullptr,`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`&rep, &pos, &cut);`
			`if (!err) {`
bug 672472 - convert hyphenation-point offsets correctly from Unicode characters to UTF16 code units. r=smontagu 2011-07-20 03:15:06 -07:00			`// Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer`
			`// from utf8 code unit indexing (which would match the utf8 input`
			`// string directly) to Unicode character indexing.`
			`// We then need to convert this to utf16 code unit offsets for Gecko.`
			`const char *hyphPtr = utf8hyphens.Elements();`
Bug 927728 - Part 1: Replace PRUnichar with char16_t; r=roc This patch was automatically generated by the following script: #!/bin/bash # Command to convert PRUnichar to char16_t function convert() { echo "Converting $1 to $2..." find . ! -wholename "nsprpub" \ ! -wholename "security/nss" \ ! -wholename "modules/libmar" \ ! -wholename "/.hg" \ ! -wholename "obj-ff-dbg" \ ! -name prtypes.h \ ! -name Char16.h \ -type f \ \( -iname ".cpp" \ -o -iname ".h" \ -o -iname ".c" \ -o -iname ".cc" \ -o -iname ".idl" \ -o -iname ".ipdl" \ -o -iname ".ipdlh" \ -o -iname "*.mm" \) \| \ xargs -n 1 sed -i -e "s/\b$1\b/$2/g" } convert PRUnichar char16_t 2014-01-04 07:02:17 -08:00			`const char16_t *cur = begin + wordStart;`
			`const char16_t *end = begin + wordLimit;`
bug 672472 - convert hyphenation-point offsets correctly from Unicode characters to UTF16 code units. r=smontagu 2011-07-20 03:15:06 -07:00			`while (cur < end) {`
			`if (*hyphPtr & 0x01) {`
Bug 690892 - Replace PR_TRUE/PR_FALSE with true/false on mozilla-central; rs=dbaron Landing on a CLOSED TREE 2011-10-17 07:59:28 -07:00			`aHyphens[cur - begin] = true;`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`}`
bug 672472 - convert hyphenation-point offsets correctly from Unicode characters to UTF16 code units. r=smontagu 2011-07-20 03:15:06 -07:00			`cur++;`
			`if (cur < end && NS_IS_LOW_SURROGATE(*cur) &&`
			`NS_IS_HIGH_SURROGATE(*(cur-1)))`
			`{`
			`cur++;`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`}`
bug 672472 - convert hyphenation-point offsets correctly from Unicode characters to UTF16 code units. r=smontagu 2011-07-20 03:15:06 -07:00			`hyphPtr++;`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`}`
			`}`
			`}`

Bug 690892 - Replace PR_TRUE/PR_FALSE with true/false on mozilla-central; rs=dbaron Landing on a CLOSED TREE 2011-10-17 07:59:28 -07:00			`inWord = false;`
bug 253317 - part 4 - implement nsHyphenationManager and nsHyphenator classes. r=smontagu sr=roc 2011-05-04 04:29:45 -07:00			`}`

			`return NS_OK;`
			`}`