bug 763703 - optimize Unicode property lookup and gfxScriptItemizer::Next. r=smontagu

2024-09-13 09:24:08 -07:00 · 2012-06-20 20:58:18 +01:00 · 2012-06-20 20:58:18 +01:00 · 368eaccf10
commit 368eaccf10
parent d0c563d64a
7 changed files with 98 additions and 141 deletions
--- a/gfx/thebes/gfxScriptItemizer.cpp
+++ b/gfx/thebes/gfxScriptItemizer.cpp
@ -48,13 +48,10 @@
 */

 #include "gfxScriptItemizer.h"
-#include "gfxFontUtils.h" // for the FindHighestBit function
 #include "nsUnicodeProperties.h"

 #include "nsCharTraits.h"

-#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
-
 #define MOD(sp) ((sp) % PAREN_STACK_DEPTH)
 #define LIMIT_INC(sp) (((sp) < PAREN_STACK_DEPTH)? (sp) + 1 : PAREN_STACK_DEPTH)
 #define INC(sp,count) (MOD((sp) + (count)))
@ -66,61 +63,14 @@
 #define TOP() (parenStack[parenSP])
 #define SYNC_FIXUP() (fixupCount = 0)

-
-static const PRUint16 pairedChars[] = {
-    0x0028, 0x0029, /* ascii paired punctuation */
-    0x003c, 0x003e,
-    0x005b, 0x005d,
-    0x007b, 0x007d,
-    0x00ab, 0x00bb, /* guillemets */
-    0x2018, 0x2019, /* general punctuation */
-    0x201c, 0x201d,
-    0x2039, 0x203a,
-    0x207d, 0x207e, /* superscripts and subscripts */
-    0x208d, 0x208e,
-    0x275b, 0x275c, /* dingbat quotes and brackets */
-    0x275d, 0x275e,
-    0x2768, 0x2769,
-    0x276a, 0x276b,
-    0x276c, 0x276d,
-    0x276e, 0x276f,
-    0x2770, 0x2771,
-    0x2772, 0x2773,
-    0x2774, 0x2775,
-    /* omitted: lots of potentially-paired math symbols */
-    0x2e22, 0x2e23, /* supplemental punctuation */
-    0x2e24, 0x2e25,
-    0x2e26, 0x2e27,
-    0x2e28, 0x2e29,
-    0x3008, 0x3009, /* chinese paired punctuation */
-    0x300a, 0x300b,
-    0x300c, 0x300d,
-    0x300e, 0x300f,
-    0x3010, 0x3011,
-    0x3014, 0x3015,
-    0x3016, 0x3017,
-    0x3018, 0x3019,
-    0x301a, 0x301b,
-    0xfe59, 0xfe5a, /* small form variants */
-    0xfe5b, 0xfe5c,
-    0xfe5d, 0xfe5e,
-    0xfe64, 0xfe65,
-    0xff08, 0xff09, /* half-width and full-width forms */
-    0xff1c, 0xff1e,
-    0xff3b, 0xff3d,
-    0xff5b, 0xff5d,
-    0xff5f, 0xff60,
-    0xff62, 0xff63
-};
-
 void
-gfxScriptItemizer::push(PRInt32 pairIndex, PRInt32 scriptCode)
+gfxScriptItemizer::push(PRUint32 endPairChar, PRInt32 scriptCode)
 {
    pushCount  = LIMIT_INC(pushCount);
    fixupCount = LIMIT_INC(fixupCount);

    parenSP = INC1(parenSP);
-    parenStack[parenSP].pairIndex  = pairIndex;
+    parenStack[parenSP].endPairChar = endPairChar;
    parenStack[parenSP].scriptCode = scriptCode;
 }

@ -157,43 +107,23 @@ gfxScriptItemizer::fixup(PRInt32 scriptCode)
    }
 }

-static PRInt32
-getPairIndex(PRUint32 ch)
-{
-    PRInt32 pairedCharCount = ARRAY_SIZE(pairedChars);
-    PRInt32 pairedCharPower = mozilla::FindHighestBit(pairedCharCount);
-    PRInt32 pairedCharExtra = pairedCharCount - pairedCharPower;
-
-    PRInt32 probe = pairedCharPower;
-    PRInt32 pairIndex = 0;
-
-    if (ch >= pairedChars[pairedCharExtra]) {
-        pairIndex = pairedCharExtra;
-    }
-
-    while (probe > 1) {
-        probe >>= 1;
-
-        if (ch >= pairedChars[pairIndex + probe]) {
-            pairIndex += probe;
-        }
-    }
-
-    if (pairedChars[pairIndex] != ch) {
-        pairIndex = -1;
-    }
-
-    return pairIndex;
-}
-
-static bool
-sameScript(PRInt32 runScript, PRInt32 currCharScript)
+static inline bool
+SameScript(PRInt32 runScript, PRInt32 currCharScript)
 {
    return runScript <= MOZ_SCRIPT_INHERITED ||
           currCharScript <= MOZ_SCRIPT_INHERITED ||
           currCharScript == runScript;
 }

+// Return whether the char has a mirrored-pair counterpart.
+// NOTE: this relies on nsCharProps1 records (see nsUnicodeProperties) using
+// mMirrorOffsetIndex == 0 to mean "no mirror"; update it if that ever changes.
+static inline bool
+HasMirroredChar(PRUint32 aCh)
+{
+    return GetCharProps1(aCh).mMirrorOffsetIndex != 0;
+}
+
 gfxScriptItemizer::gfxScriptItemizer(const PRUnichar *src, PRUint32 length)
    : textPtr(src), textLength(length)
 {
@ -224,63 +154,64 @@ gfxScriptItemizer::Next(PRUint32& aRunStart, PRUint32& aRunLimit,
    for (scriptStart = scriptLimit; scriptLimit < textLength; scriptLimit += 1) {
        PRUint32 ch;
        PRInt32 sc;
-        PRInt32 pairIndex;
        PRUint32 startOfChar = scriptLimit;

        ch = textPtr[scriptLimit];

-        /*
-         * MODIFICATION for Gecko - clear the paired-character stack
-         * when we see a space character, because we cannot trust
-         * context outside the current "word" when doing textrun
-         * construction
-         */
-        if (ch == 0x20) {
-            while (STACK_IS_NOT_EMPTY()) {
-                pop();
-            }
-            sc = MOZ_SCRIPT_COMMON;
-            pairIndex = -1;
-        } else {
-            /* decode UTF-16 (may be surrogate pair) */
-            if (NS_IS_HIGH_SURROGATE(ch) && scriptLimit < textLength - 1) {
-                PRUint32 low = textPtr[scriptLimit + 1];
-                if (NS_IS_LOW_SURROGATE(low)) {
-                    ch = SURROGATE_TO_UCS4(ch, low);
-                    scriptLimit += 1;
-                }
+        /* decode UTF-16 (may be surrogate pair) */
+        if (NS_IS_HIGH_SURROGATE(ch) && scriptLimit < textLength - 1) {
+            PRUint32 low = textPtr[scriptLimit + 1];
+            if (NS_IS_LOW_SURROGATE(low)) {
+                ch = SURROGATE_TO_UCS4(ch, low);
+                scriptLimit += 1;
            }
+        }

-            sc = mozilla::unicode::GetScriptCode(ch);
+        // Get the nsCharProps2 record for the current character,
+        // so we can read the script and (if needed) the general category
+        // with a single multi-level lookup instead of two.
+        // NOTE that this means we're relying on an implementation detail
+        // of the nsUnicodeProperties tables, and might have to revise this
+        // if the nsCharProps records used there are modified in the future.
+        const nsCharProps2& charProps = GetCharProps2(ch);

-            pairIndex = getPairIndex(ch);
+        // Initialize gc to UNASSIGNED; we only fetch the true general category
+        // for script=COMMON chars, as UNASSIGNED never matches the checks below.
+        PRUint8 gc = HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;

+        sc = charProps.mScriptCode;
+        if (sc == MOZ_SCRIPT_COMMON) {
            /*
             * Paired character handling:
             *
             * if it's an open character, push it onto the stack.
             * if it's a close character, find the matching open on the
             * stack, and use that script code. Any non-matching open
-             * characters above it on the stack will be poped.
+             * characters above it on the stack will be popped.
+             *
+             * We only do this if the script is COMMON; for chars with
+             * specific script assignments, we just use them as-is.
             */
-            if (pairIndex >= 0) {
-                if ((pairIndex & 1) == 0) {
-                    push(pairIndex, scriptCode);
-                } else {
-                    PRInt32 pi = pairIndex & ~1;
+            gc = charProps.mCategory;
+            if (gc == HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION) {
+                PRUint32 endPairChar = mozilla::unicode::GetMirroredChar(ch);
+                if (endPairChar != ch) {
+                    push(endPairChar, scriptCode);
+                }
+            } else if (gc == HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION &&
+                HasMirroredChar(ch))
+            {
+                while (STACK_IS_NOT_EMPTY() && TOP().endPairChar != ch) {
+                    pop();
+                }

-                    while (STACK_IS_NOT_EMPTY() && TOP().pairIndex != pi) {
-                        pop();
-                    }
-
-                    if (STACK_IS_NOT_EMPTY()) {
-                        sc = TOP().scriptCode;
-                    }
+                if (STACK_IS_NOT_EMPTY()) {
+                    sc = TOP().scriptCode;
                }
            }
        }

-        if (sameScript(scriptCode, sc)) {
+        if (SameScript(scriptCode, sc)) {
            if (scriptCode <= MOZ_SCRIPT_INHERITED &&
                sc > MOZ_SCRIPT_INHERITED)
            {
@ -292,7 +223,8 @@ gfxScriptItemizer::Next(PRUint32& aRunStart, PRUint32& aRunLimit,
             * if this character is a close paired character,
             * pop the matching open character from the stack
             */
-            if (pairIndex >= 0 && (pairIndex & 1) != 0) {
+            if (gc == HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION &&
+                HasMirroredChar(ch)) {
                pop();
            }
        } else {
--- a/gfx/thebes/gfxScriptItemizer.h
+++ b/gfx/thebes/gfxScriptItemizer.h
@ -77,13 +77,13 @@ protected:
        fixupCount  =  0;
    }

-    void push(PRInt32 pairIndex, PRInt32 scriptCode);
+    void push(PRUint32 endPairChar, PRInt32 scriptCode);
    void pop();
    void fixup(PRInt32 scriptCode);

    struct ParenStackEntry {
-        PRInt32 pairIndex;
-        PRInt32 scriptCode;
+        PRUint32 endPairChar;
+        PRInt32  scriptCode;
    };

    const PRUnichar *textPtr;
--- a/intl/unicharutil/tools/genUnicodePropertyData.pl
+++ b/intl/unicharutil/tools/genUnicodePropertyData.pl
@ -622,6 +622,7 @@ $versionInfo

 #ifndef NS_UNICODE_SCRIPT_CODES
 #define NS_UNICODE_SCRIPT_CODES
+
 __END

 print DATA_TABLES "static const PRUint32 sScriptCodeToTag[] = {\n";
@ -640,12 +641,14 @@ for (my $i = 0; $i < scalar @offsets; ++$i) {
 }
 print DATA_TABLES "};\n\n";

+print HEADER "#pragma pack(1)\n\n";
+
 sub sprintCharProps1
 {
  my $usv = shift;
  return sprintf("{%d,%d,%d}, ", $mirror[$usv], $hangul[$usv], $combining[$usv]);
 }
-&genTables("CharProp1", "struct nsCharProps1 {\n  unsigned char  mMirrorOffsetIndex:5;\n  unsigned char mHangulType:3;\n  unsigned char mCombiningClass:8;\n};",
+&genTables("CharProp1", "struct nsCharProps1 {\n  unsigned char mMirrorOffsetIndex:5;\n  unsigned char mHangulType:3;\n  unsigned char mCombiningClass:8;\n};",
           "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);

 sub sprintCharProps2
@ -658,6 +661,8 @@ sub sprintCharProps2
 &genTables("CharProp2", "struct nsCharProps2 {\n  unsigned char mScriptCode:8;\n  unsigned char mEAW:3;\n  unsigned char mCategory:5;\n  unsigned char mBidiCategory:5;\n  unsigned char mXidmod:4;\n  signed char mNumericValue:5;\n  unsigned char mHanVariant:2;\n};",
           "nsCharProps2", 11, 5, \&sprintCharProps2, 16, 4, 1);

+print HEADER "#pragma pack()\n\n";
+
 sub sprintHanVariants
 {
  my $baseUsv = shift;
--- a/intl/unicharutil/util/nsUnicodeProperties.cpp
+++ b/intl/unicharutil/util/nsUnicodeProperties.cpp
@ -16,7 +16,7 @@
 #define UNICODE_LIMIT     0x110000


-nsCharProps1
+const nsCharProps1&
 GetCharProps1(PRUint32 aCh)
 {
    if (aCh < UNICODE_BMP_LIMIT) {
@ -30,13 +30,15 @@ GetCharProps1(PRUint32 aCh)
    }

    // Default values for unassigned
-    nsCharProps1 undefined = {0,       // Index to mirrored char offsets
-                              0,       // Hangul Syllable type
-                              0};      // Combining class
+    static const nsCharProps1 undefined = {
+        0,       // Index to mirrored char offsets
+        0,       // Hangul Syllable type
+        0        // Combining class
+    };
    return undefined;
 }

-nsCharProps2
+const nsCharProps2&
 GetCharProps2(PRUint32 aCh)
 {
    if (aCh < UNICODE_BMP_LIMIT) {
@ -51,13 +53,14 @@ GetCharProps2(PRUint32 aCh)

    NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range");
    // Default values for unassigned
-    nsCharProps2 undefined = {
+    static const nsCharProps2 undefined = {
        MOZ_SCRIPT_UNKNOWN,                      // Script code
        0,                                       // East Asian Width
        HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED,  // General Category
        eCharType_LeftToRight,                   // Bidi Category
        mozilla::unicode::XIDMOD_NOT_CHARS,      // Xidmod
-        -1                                       // Numeric Value
+        -1,                                      // Numeric Value
+        mozilla::unicode::HVT_NotHan             // Han variant
    };
    return undefined;
 }
--- a/intl/unicharutil/util/nsUnicodeProperties.h
+++ b/intl/unicharutil/util/nsUnicodeProperties.h
@ -11,8 +11,8 @@
 #include "nsIUGenCategory.h"
 #include "nsUnicodeScriptCodes.h"

-nsCharProps1 GetCharProps1(PRUint32 aCh);
-nsCharProps2 GetCharProps2(PRUint32 aCh);
+const nsCharProps1& GetCharProps1(PRUint32 aCh);
+const nsCharProps2& GetCharProps2(PRUint32 aCh);

 namespace mozilla {

--- a/intl/unicharutil/util/nsUnicodePropertyData.cpp
+++ b/intl/unicharutil/util/nsUnicodePropertyData.cpp
@ -1,11 +1,17 @@

-/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
- * This Source Code Form is subject to the terms of the Mozilla Public
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

 /*
- * Created on Mon Apr 23 20:03:29 2012 from UCD data files with version info:
+ * Derived from the Unicode Character Database by genUnicodePropertyData.pl
+ *
+ * For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
+ */
+
+/*
+ * Created on Mon Jun 11 21:04:54 2012 from UCD data files with version info:
 *

 # Date: 2012-01-26, 22:03:00 GMT [KW]
--- a/intl/unicharutil/util/nsUnicodeScriptCodes.h
+++ b/intl/unicharutil/util/nsUnicodeScriptCodes.h
@ -1,11 +1,17 @@

-/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
- * This Source Code Form is subject to the terms of the Mozilla Public
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

 /*
- * Created on Mon Apr 23 20:03:29 2012 from UCD data files with version info:
+ * Derived from the Unicode Character Database by genUnicodePropertyData.pl
+ *
+ * For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
+ */
+
+/*
+ * Created on Mon Jun 11 21:04:54 2012 from UCD data files with version info:
 *

 # Date: 2012-01-26, 22:03:00 GMT [KW]
@ -48,8 +54,11 @@ for the Unicode Character Database (UCD) for Unicode 6.1.0.

 #ifndef NS_UNICODE_SCRIPT_CODES
 #define NS_UNICODE_SCRIPT_CODES
+
+#pragma pack(1)
+
 struct nsCharProps1 {
-  unsigned char  mMirrorOffsetIndex:5;
+  unsigned char mMirrorOffsetIndex:5;
  unsigned char mHangulType:3;
  unsigned char mCombiningClass:8;
 };
@ -64,6 +73,8 @@ struct nsCharProps2 {
  unsigned char mHanVariant:2;
 };

+#pragma pack()
+
 enum {
  MOZ_SCRIPT_COMMON = 0,
  MOZ_SCRIPT_INHERITED = 1,