Bug 502789: correct regexp->native compiler of handling case-insensitive matching of characters with multiple lower-case forms, r=lw

2024-09-13 09:24:08 -07:00 · 2009-07-13 12:41:30 -07:00 · 2009-07-13 12:41:30 -07:00 · a10caff6bb
commit a10caff6bb
parent 427ece476a
1 changed files with 98 additions and 32 deletions
--- a/js/src/jsregexp.cpp
+++ b/js/src/jsregexp.cpp
@ -2090,43 +2090,107 @@ class RegExpNativeCompiler {

    LIns* compileFlatSingleChar(jschar ch, LIns* pos, LInsList& fails) 
    {
-        /* 
-         * Fast case-insensitive test for ASCII letters: convert text
-         * char to lower case by bit-or-ing in 32 and compare.
-         */
-        JSBool useFastCI = JS_FALSE;
-        jschar ch2 = ch;              /* 2nd char to test for if ci */
-        if (cs->flags & JSREG_FOLD) {
-            if ((L'A' <= ch && ch <= L'Z') || (L'a' <= ch && ch <= L'z')) {
-                ch |= 32;
-                ch2 = ch;
-                useFastCI = JS_TRUE;
-            } else if (JS_TOLOWER(ch) != ch) {
-                ch2 = JS_TOLOWER(ch);
-                ch = JS_TOUPPER(ch);
-            }
-        }
-
        LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_lt, pos, cpend), 0);
        fails.add(to_fail);
        LIns* text_ch = lir->insLoad(LIR_ldcs, pos, 0);
-        LIns* comp_ch = useFastCI ? 
-            lir->ins2(LIR_or, text_ch, lir->insImm(32)) : 
-            text_ch;
-        if (ch == ch2) {
-            fails.add(lir->insBranch(LIR_jf, lir->ins2(LIR_eq, comp_ch, lir->insImm(ch)), 0));
-        } else {
-            LIns* to_ok = lir->insBranch(LIR_jt, lir->ins2(LIR_eq, comp_ch, lir->insImm(ch)), 0);
-            fails.add(lir->insBranch(LIR_jf, lir->ins2(LIR_eq, comp_ch, lir->insImm(ch2)), 0));
-            if (!targetCurrentPoint(to_ok))
-                return NULL;
+
+        // Extra characters that need to be compared against when doing folding.
+        struct extra {
+            jschar ch;
+            LIns   *match;
+        };
+        extra extras[5];
+        int   nextras = 0;
+
+        if (cs->flags & JSREG_FOLD) {
+            ch = JS_TOUPPER(ch);
+            jschar lch = JS_TOLOWER(ch);
+
+            if (ch != lch) {
+                if (L'A' <= ch && ch <= L'Z') {
+                    // Fast conversion of text character to lower case by OR-ing with 32.
+                    text_ch = lir->ins2(LIR_or, text_ch, lir->insImm(32));
+                    // These ASCII letters have 2 lower-case forms. We put the ASCII one in
+                    // |extras| so it is tested first, because we expect that to be the common
+                    // case. Note that the code points of the non-ASCII forms both have the
+                    // 32 bit set, so it is OK to compare against the OR-32-converted text char.
+                    ch = lch;
+                    if (ch == L'i') {
+                        extras[nextras++].ch = ch;
+                        ch = 0x131;
+                    } else if (ch == L's') {
+                        extras[nextras++].ch = ch;
+                        ch = 0x17f;
+                    }
+                    goto gen;
+                } else if (0x01c4 <= ch && ch <= 0x1e60) {
+                    // The following group of conditionals handles characters that have 1 or 2
+                    // lower-case forms in addition to JS_TOLOWER(ch).
+                    if (ch <= 0x1f1) {                 // DZ,LJ,NJ
+                        if (ch == 0x01c4) {
+                            extras[nextras++].ch = 0x01c5;
+                        } else if (ch == 0x01c7) {
+                            extras[nextras++].ch = 0x01c8;
+                        } else if (ch == 0x01ca) {
+                            extras[nextras++].ch = 0x01cb;
+                        } else if (ch == 0x01f1) {
+                            extras[nextras++].ch = 0x01f2;
+                        }
+                    } else if (ch < 0x0392) {          // no extra lower-case forms in this range
+                    } else if (ch <= 0x03a6) {         // Greek
+                        if (ch == 0x0392) {
+                            extras[nextras++].ch = 0x03d0;
+                        } else if (ch == 0x0395) {
+                            extras[nextras++].ch = 0x03f5;
+                        } else if (ch == 0x0398) {
+                            extras[nextras++].ch = 0x03d1;
+                        } else if (ch == 0x0399) {
+                            extras[nextras++].ch = 0x0345;
+                            extras[nextras++].ch = 0x1fbe;
+                        } else if (ch == 0x039a) {
+                            extras[nextras++].ch = 0x03f0;
+                        } else if (ch == 0x039c) {
+                            extras[nextras++].ch = 0xb5;
+                        } else if (ch == 0x03a0) {
+                            extras[nextras++].ch = 0x03d6;
+                        } else if (ch == 0x03a1) {
+                            extras[nextras++].ch = 0x03f1;
+                        } else if (ch == 0x03a3) {
+                            extras[nextras++].ch = 0x03c2;
+                        } else if (ch == 0x03a6) {
+                            extras[nextras++].ch = 0x03d5;
+                        }
+                    } else if (ch == 0x1e60) {         // S with dot above
+                        extras[nextras++].ch = 0x1e9b;
+                    }
+                }
+
+                extras[nextras++].ch = lch;
+            }
        }

+    gen:
+        for (int i = 0; i < nextras; ++i) {
+            LIns *test = lir->ins2(LIR_eq, text_ch, lir->insImm(extras[i].ch));
+            LIns *branch = lir->insBranch(LIR_jt, test, 0);
+            extras[i].match = branch;
+        }
+            
+        fails.add(lir->insBranch(LIR_jf, lir->ins2(LIR_eq, text_ch, lir->insImm(ch)), 0));
+
+        for (int i = 0; i < nextras; ++i) {
+            if (!targetCurrentPoint(extras[i].match))
+                return NULL;
+        }
        return lir->ins2(LIR_piadd, pos, lir->insImm(2));
    }

-    LIns* compileFlatDoubleChar(jschar ch1, jschar ch2, LIns* pos, 
-                                LInsList& fails) 
+    JS_INLINE bool hasCases(jschar ch) 
+    {
+        return JS_TOLOWER(ch) != JS_TOUPPER(ch);
+    }
+
+    LIns* compileFlatDoubleChar(jschar ch1, jschar ch2, LIns* pos, LInsList& fails)
    {
 #ifdef IS_BIG_ENDIAN
        uint32 word = (ch1 << 16) | ch2;
@ -2140,9 +2204,11 @@ class RegExpNativeCompiler {
        JSBool useFastCI = JS_FALSE;
        union { jschar c[2]; uint32 i; } mask;
        if (cs->flags & JSREG_FOLD) {
-            JSBool mask1 = (L'A' <= ch1 && ch1 <= L'Z') || (L'a' <= ch1 && ch1 <= L'z');
-            JSBool mask2 = (L'A' <= ch2 && ch2 <= L'Z') || (L'a' <= ch2 && ch2 <= L'z');
-            if ((!mask1 && JS_TOLOWER(ch1) != ch1) || (!mask2 && JS_TOLOWER(ch2) != ch2)) {
+            jschar uch1 = JS_TOUPPER(ch1);
+            jschar uch2 = JS_TOUPPER(ch2);
+            JSBool mask1 = (L'A' <= uch1 && uch1 <= L'Z' && uch1 != L'I' && uch1 != L'S');
+            JSBool mask2 = (L'A' <= uch2 && uch2 <= L'Z' && uch2 != L'I' && uch2 != L'S');
+            if ((!mask1 && hasCases(ch1)) || (!mask2 && hasCases(ch2))) {
                pos = compileFlatSingleChar(ch1, pos, fails);
                if (!pos) return NULL;
                return compileFlatSingleChar(ch2, pos, fails);