Bug 145975 - Implement nsCaseInsensitiveUTF8StringComparator r=smontagu

--HG-- extra : rebase_source : 8e5d63a6a5ca50b6e95a80421ad96a1d4a52ae77
2024-09-13 09:24:08 -07:00 · 2010-08-31 18:03:40 -07:00 · 2010-08-31 18:03:40 -07:00 · 8b20bfbcec
commit 8b20bfbcec
parent 02ef2fe17a
15 changed files with 572 additions and 210 deletions
--- a/intl/unicharutil/src/casetable.h
+++ b/intl/unicharutil/src/casetable.h
@ -363,3 +363,16 @@ static const PRUint32 gCaseBlocks [8] = {
 0x00000000,
 0x80000000
 };
+
+// We map x -> x, except for upper-case letters,
+// which we map to their lower-case equivalents.
+static const PRUint8 gASCIIToLower [128] = {
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+};
--- a/intl/unicharutil/tests/UnicharSelfTest.cpp
+++ b/intl/unicharutil/tests/UnicharSelfTest.cpp
@ -281,6 +281,103 @@ static PRUnichar t4result[T4LEN+2] =  {
  0x0041 ,  // Dummy entry to prevent overflow
  0x00  
 };
+ 
+static unsigned char t6lhs[] = {
+  0x31 ,       //  0
+  0x19 ,       //  1
+  0x43 ,       //  2
+  0x67 ,       //  3
+  0xC3, 0x88 , //  4
+  0xC3, 0xA9 , //  5
+  0xC5, 0x87 , //  6
+  0xC7, 0x84 , //  7
+  0xC7, 0x86 , //  8
+  0xC7, 0x85 , //  9
+  0xCF, 0x80 ,  // 10
+  0xCE, 0xB2 ,  // 11
+  0xD0, 0xB8 ,  // 12
+  0xD2, 0xA5 ,  // 13
+  0xD7, 0x90 ,  // 14
+  0xE0, 0xA8, 0xA0 ,  // 15
+  0xE3, 0x82, 0xB0 ,  // 16
+  0xE5, 0x86, 0x85 ,  // 17
+  0xEC, 0x80, 0xA1 ,  // 18
+  0xEF, 0xBD, 0x88 ,  // 19
+  0xC7, 0x87 ,  // 20
+  0xC7, 0x88 ,  // 21
+  0xC7, 0x89 ,  // 22
+  0xC7, 0x8A ,  // 23
+  0xC7, 0x8B ,  // 24
+  0xC7, 0x8C ,  // 25
+  0xC7, 0xB1 ,  // 26
+  0xC7, 0xB2 ,  // 27
+  0xC7, 0xB3 ,  // 28
+  0xC9, 0x90 ,  // 29
+  0xC9, 0xB1 ,  // 30
+  0xEA, 0x99, 0x81 ,  // 31
+  0x00  
+};
+
+static unsigned char t6rhs[] =  {
+  0x31 ,  //  0
+  0x19 ,  //  1
+  0x43 ,  //  2
+  0x47 ,  //  3
+  0xC3, 0x88 ,  //  4
+  0xC3, 0x89 ,  //  5
+  0xC5, 0x87 ,  //  6
+  0xC7, 0x84 ,  //  7
+  0xC7, 0x84 ,  //  8
+  0xC7, 0x84 ,  //  9
+  0xCE, 0xA0 ,  // 10
+  0xCE, 0x92 ,  // 11
+  0xD0, 0x98 ,  // 12
+  0xD2, 0xA4 ,  // 13
+  0xD7, 0x90 ,  // 14
+  0xE0, 0xA8, 0xA0 ,  // 15
+  0xE3, 0x82, 0xB0 ,  // 16
+  0xE5, 0x86, 0x85 ,  // 17
+  0xEC, 0x80, 0xA1 ,  // 18
+  0xEF, 0xBC, 0xA8 ,  // 19
+  0xC7, 0x87 ,  // 20
+  0xC7, 0x87 ,  // 21
+  0xC7, 0x87 ,  // 22
+  0xC7, 0x8a ,  // 23
+  0xC7, 0x8a ,  // 24
+  0xC7, 0x8a ,  // 25
+  0xC7, 0xB1 ,  // 26
+  0xC7, 0xB1 ,  // 27
+  0xC7, 0xB1 ,  // 28
+  0xE2, 0xB1, 0xAF ,  // 29
+  0xE2, 0xB1, 0xAE ,  // 30
+  0xEA, 0x99, 0x80 ,  // 31
+  0x00  
+};
+
+static const char *t7lhs = "aBcDeFGHIJKL1!!2!!a!uuuu";
+static const char *t7rhs = "AbCdEFghijkL1!!2!!A!UUuU";
+
+static const char *t8lhs = "aazzz";
+static const char *t8rhs = "aBa";
+
+static const char *t9lhs = "@a";
+static const char *t9rhs = "`a";
+
+bool CharByCharCompareEqual(const char *a, const char *b,
+                            PRUint32 aLen, PRUint32 bLen)
+{
+  // Do basically a CaseInsensitiveCompare(), but using
+  // CaseInsensitiveUTF8CharsEqual().
+
+  const char *aEnd = a + aLen;
+  const char *bEnd = b + bLen;
+  while (a < aEnd && b < bEnd) {
+    PRBool err;
+    if (!CaseInsensitiveUTF8CharsEqual(a, b, aEnd, bEnd, &a, &b, &err) || err)
+      return PR_FALSE;
+  }
+  return PR_TRUE;
+}

 void TestCaseConversion()
 {
@ -291,7 +388,7 @@ void TestCaseConversion()
  int i;
  PRUnichar buf[256];

-  printf("Test 2 - ToUpper(PRUnichar, PRUnichar*):\n");
+  printf("Test 1 - ToUpper(PRUnichar, PRUnichar*):\n");
  for(i=0;i < T2LEN ; i++)
  {
    PRUnichar ch = ToUpperCase(t2data[i]);
@ -300,7 +397,7 @@ void TestCaseConversion()
  }


-  printf("Test 3 - ToLower(PRUnichar, PRUnichar*):\n");
+  printf("Test 2 - ToLower(PRUnichar, PRUnichar*):\n");
  for(i=0;i < T3LEN; i++)
  {
    PRUnichar ch = ToLowerCase(t3data[i]);
@ -308,7 +405,7 @@ void TestCaseConversion()
      printf("\tFailed!! result unexpected %d\n", i);
  }

-  printf("Test 4 - ToTitle(PRUnichar, PRUnichar*):\n");
+  printf("Test 3 - ToTitle(PRUnichar, PRUnichar*):\n");
  for(i=0;i < T4LEN; i++)
  {
    PRUnichar ch = ToTitleCase(t4data[i]);
@ -316,7 +413,7 @@ void TestCaseConversion()
      printf("\tFailed!! result unexpected %d\n", i);
  }

-  printf("Test 5 - ToUpper(PRUnichar*, PRUnichar*, PRUint32):\n");
+  printf("Test 4 - ToUpper(PRUnichar*, PRUnichar*, PRUint32):\n");
  ToUpperCase(t2data, buf, T2LEN);
  for(i = 0; i < T2LEN; i++)
  {
@ -327,7 +424,7 @@ void TestCaseConversion()
     }
  }

-  printf("Test 6 - ToLower(PRUnichar*, PRUnichar*, PRUint32):\n");
+  printf("Test 5 - ToLower(PRUnichar*, PRUnichar*, PRUint32):\n");
  ToLowerCase(t3data, buf, T3LEN);
  for(i = 0; i < T3LEN; i++)
  {
@ -338,11 +435,85 @@ void TestCaseConversion()
     }
  }

+  printf("Test 6 - CaseInsensitiveCompare UTF-8 (1):\n");
+  if (CaseInsensitiveCompare((char*)t6lhs, (char*)t6rhs, sizeof(t6lhs), sizeof(t6rhs)))
+    printf("\tFailed!\n");
+  if (!CharByCharCompareEqual((char*)t6lhs, (char*)t6rhs, sizeof(t6lhs), sizeof(t6rhs)))
+    printf("\tFailed character-by-character comparison!\n");
+
+  printf("Test 7 - CaseInsensitiveCompare UTF-8 (2):\n");
+  if (CaseInsensitiveCompare(t7lhs, t7rhs, strlen(t7lhs), strlen(t7rhs)))
+    printf("\tFailed!\n");
+  if (!CharByCharCompareEqual(t7lhs, t7rhs, sizeof(t7lhs), sizeof(t7rhs)))
+    printf("\tFailed character-by-character comparison!\n");
+
+  printf("Test 8a - CaseInsensitiveCompare UTF-8 (3):\n");
+  if (CaseInsensitiveCompare(t8lhs, t8rhs, strlen(t8lhs), strlen(t8rhs)) != -1)
+    printf("\tFailed!\n");
+  if (CharByCharCompareEqual(t8lhs, t8rhs, strlen(t8lhs), strlen(t8rhs)))
+    printf("\tFailed character-by-character comparison!\n");
+
+  printf("Test 8b - CaseInsensitiveCompare UTF-8 (4):\n");
+  if (CaseInsensitiveCompare(t8rhs, t8lhs, strlen(t8rhs), strlen(t8lhs)) != 1)
+    printf("\tFailed!\n");
+
+  // This test may seem a bit strange.  But it's actually an easy bug to make
+  // if we tried to be clever and say that two ASCII characters x and y are
+  // case-insensitively equal if (x & ~0x20) == (y & ~0x20).
+  printf("Test 9 - CaseInsensitiveCompare UTF-8 (5):\n");
+  if (CaseInsensitiveCompare(t9rhs, t9lhs, strlen(t9lhs), strlen(t9rhs)) != 1)
+    printf("\tFailed!\n");
+  if (CharByCharCompareEqual(t9lhs, t9rhs, strlen(t9lhs), strlen(t9rhs)))
+    printf("\tFailed character-by-character comparison!\n");
+
  printf("===========================\n");
  printf("Finish case conversion test\n");
  printf("===========================\n");
 }

+static void FuzzOneInvalidCaseConversion()
+{
+  PRUint32 aLen = rand() % 32;
+  PRUint32 bLen = rand() % 32;
+
+  // We could use a static length-32 buffer for these, but then Valgrind
+  // wouldn't be able to detect errors.
+  unsigned char *aBuf = (unsigned char*)malloc(aLen * sizeof(unsigned char));
+  unsigned char *bBuf = (unsigned char*)malloc(bLen * sizeof(unsigned char));
+
+  for (PRUint32 i = 0; i < aLen; i++) {
+    aBuf[i] = rand() & 0xff;
+  }
+
+  for (PRUint32 i = 0; i < bLen; i++) {
+    bBuf[i] = rand() & 0xff;
+  }
+
+  CaseInsensitiveCompare((char*)aBuf, (char*)bBuf, aLen, bLen);
+  CharByCharCompareEqual((char*)aBuf, (char*)bBuf, aLen, bLen);
+
+  free(aBuf);
+  free(bBuf);
+}
+
+static void FuzzCaseConversion()
+{
+  printf("==========================\n");
+  printf("Start fuzz case conversion\n");
+  printf("==========================\n");
+
+  srand(0);
+
+  printf("Fuzzing invalid UTF8 data...\n");
+  for (PRUint32 i = 0; i < 100000; i++) {
+    FuzzOneInvalidCaseConversion();
+  }
+
+  printf("===========================\n");
+  printf("Finish fuzz case conversion\n");
+  printf("===========================\n");
+}
+
 static void TestEntityConversion(PRUint32 version)
 {
  printf("==============================\n");
@ -567,6 +738,10 @@ int main(int argc, char** argv) {

   // --------------------------------------------

+   FuzzCaseConversion();
+
+   // --------------------------------------------
+
   TestEntityConversion(nsIEntityConverter::html40);

   // --------------------------------------------
--- a/intl/unicharutil/tools/README.txt
+++ b/intl/unicharutil/tools/README.txt
@ -1,32 +1,3 @@
+Instructions for using these tools are on the Mozilla Wiki.

-* How to generate various properties files in intl/unicharutils/tables and
-  header files in intl/unicharutils/src
-   ( written by Jungshik Shin for bug 210502 
-     https://bugzilla.mozilla.org/show_bug.cgi?id=210502 on 2005-04-05 )
-
-1. Grab the latest version of idnkit at http://www.nic.ad.jp/en/idn/index.html
-   (http://www.nic.ad.jp/ja/idn/idnkit/download/index.html )
-
-2. There are three files we need in the kit:
-   generate_normalize_data.pl, UCD.pm and SparseMap.pm
-
-3. a. Download the following Unicode data files  :
-     CaseFolding.txt,CompositionExclusions.txt, 
-     SpecialCasing.txt, UnicodeData.txt
-
-   b. Rename UnicodeData.txt to UnicodeData-Latest.txt
-
-   The latest version is, as of this writing, in 
-   ftp://ftp.unicode.org/Public/4.1.0/ucd
-
-4. a. Run generate_normalize_data.pl and save the output to a temporary file
-   b. Edit the file 
-      - remove the case folding part (search for 'Lowercase' and delete 
-        all the lines following it) because we have separate scripts for that, 
-      - replace 'unsigned short' and 'unsigned long' with 'PRUnichar' and 
-        'PRUint32'
-   c. Replace the actual source part (after the license) of 
-      intl/unicharutil/src/normalization_data.h with the file you edited.
-
-5. Generate casetable.h and cattable.h with  gencasetable.pl and gencattable.pl
-   Just running them will put casetable.h and cattable.h in the right place.
+https://wiki.mozilla.org/I18n:Updating_Unicode_version
--- a/intl/unicharutil/tools/gencasetable.pl
+++ b/intl/unicharutil/tools/gencasetable.pl
@ -383,6 +383,38 @@ for($idx=0;$idx<8;$idx++)
      printf OUT "\n";
   }
 }
+print OUT "};\n\n";
+
+######################################################################
+#
+# Print out gASCIIToLower table
+#
+######################################################################
+print OUT "// We map x -> x, except for upper-case letters,\n";
+print OUT "// which we map to their lower-case equivalents.\n";
+print OUT "static const PRUint8 gASCIIToLower [128] = {\n";
+
+# Map x -> x, except for upper-case letters, which we map to lower-case
+# letters.
+for($idx=0; $idx < 128; $idx++)
+{
+  if ($idx % 16 == 0) {
+    print OUT "    "
+  }
+
+  if (65 <= $idx && $idx <= 90) {
+    printf OUT "0x%02x", ($idx + 0x20);
+  } else {
+    printf OUT "0x%02x", $idx;
+  }
+
+  if (($idx+1) % 16 != 0) {
+    print OUT ", ";
+  }
+  else {
+    print OUT ",\n";
+  }
+}
 print OUT "};\n";


--- a/intl/unicharutil/util/nsUnicharUtils.cpp
+++ b/intl/unicharutil/util/nsUnicharUtils.cpp
@ -46,10 +46,11 @@
 #include "nsServiceManagerUtils.h"
 #include "nsXPCOMStrings.h"
 #include "casetable.h"
+#include "nsUTF8Utils.h"

 #include <ctype.h>

-// For gUpperToTitle 
+// For gUpperToTitle
 enum {
  kUpperIdx =0,
  kTitleIdx
@ -62,18 +63,19 @@ enum {
  kDiffIdx
 };

-#define IS_ASCII(u)       ( 0x0000 == ((u) & 0xFF80))
-#define IS_ASCII_UPPER(u) ((0x0041 <= (u)) && ( (u) <= 0x005a))
-#define IS_ASCII_LOWER(u) ((0x0061 <= (u)) && ( (u) <= 0x007a))
+#define IS_ASCII(u)       ((u) < 0x80)
+#define IS_ASCII_UPPER(u) (('A' <= (u)) && ( (u) <= 'Z' ))
+#define IS_ASCII_LOWER(u) (('a' <= (u)) && ( (u) <= 'z'))
 #define IS_ASCII_ALPHA(u) (IS_ASCII_UPPER(u) || IS_ASCII_LOWER(u))
-#define IS_ASCII_SPACE(u) ( 0x0020 == (u) )
+#define IS_ASCII_SPACE(u) ( ' ' == (u) )

 #define IS_NOCASE_CHAR(u)  (0==(1&(gCaseBlocks[(u)>>13]>>(0x001F&((u)>>8)))))
-  
+
 // Size of Tables

-#define CASE_MAP_CACHE_SIZE 0x40
-#define CASE_MAP_CACHE_MASK 0x3F
+// Changing these numbers may break UTF-8 caching.  Be careful!
+#define CASE_MAP_CACHE_SIZE 0x100
+#define CASE_MAP_CACHE_MASK 0xFF

 struct nsCompressedMap {
  const PRUnichar *mTable;
@ -83,75 +85,94 @@ struct nsCompressedMap {

  PRUnichar Map(PRUnichar aChar)
  {
-    // no need to worry about thread safety since cached values are
-    // not objects but primitive data types which could be 
-    // accessed in atomic operations. We need to access
-    // the whole 32 bit of cachedData at once in order to make it
-    // thread safe. Never access bits from mCache directly.
+    // We don't need explicit locking here since the cached values are int32s,
+    // which are read and written atomically.  The following code is threadsafe
+    // because we never access bits from mCache directly -- we always first
+    // read the entire entry into a local variable and then mask off the bits
+    // we're interested in.

+    // Check the 256-byte cache first and bail with our answer if we can.
    PRUint32 cachedData = mCache[aChar & CASE_MAP_CACHE_MASK];
-    if(aChar == ((cachedData >> 16) & 0x0000FFFF))
-      return (cachedData & 0x0000FFFF);
+    if (aChar == ((cachedData >> 16) & 0x0000FFFF))
+      return cachedData & 0x0000FFFF;

-    // try the last index first
-    // store into local variable so we can be thread safe
-    PRUint32 base = mLastBase; 
+    // Now try the last index we looked up, storing it into a local variable
+    // for thread-safety.
+    PRUint32 base = mLastBase;
    PRUnichar res = 0;
-    
-    if (( aChar <=  ((mTable[base+kSizeEveryIdx] >> 8) + 
-                  mTable[base+kLowIdx])) &&
-        ( mTable[base+kLowIdx]  <= aChar )) 
-    {
-        // Hit the last base
-        if(((mTable[base+kSizeEveryIdx] & 0x00FF) > 0) && 
+
+    // Does this character fit in the slot?
+    if ((aChar <= ((mTable[base+kSizeEveryIdx] >> 8) +
+                   mTable[base+kLowIdx])) &&
+        (mTable[base+kLowIdx] <= aChar)) {
+
+      // This character uses the same base as our last lookup, so the
+      // conversion is easy.
+      if (((mTable[base+kSizeEveryIdx] & 0x00FF) > 0) &&
          (0 != ((aChar - mTable[base+kLowIdx]) % 
-                (mTable[base+kSizeEveryIdx] & 0x00FF))))
-        {
-          res = aChar;
-        } else {
-          res = aChar + mTable[base+kDiffIdx];
-        }
+                 (mTable[base+kSizeEveryIdx] & 0x00FF))))
+      {
+        res = aChar;
+      } else {
+        res = aChar + mTable[base+kDiffIdx];
+      }
+
    } else {
-        res = this->Lookup(0, (mSize/2), mSize-1, aChar);
+      // Do the full lookup.
+      res = this->Lookup(0, mSize/2, mSize-1, aChar);
    }

+    // Cache the result and return.
    mCache[aChar & CASE_MAP_CACHE_MASK] =
-        (((aChar << 16) & 0xFFFF0000) | (0x0000FFFF & res));
+        ((aChar << 16) & 0xFFFF0000) | (0x0000FFFF & res);
    return res;
  }

+  // Takes as arguments the left bound, middle, right bound, and character to
+  // search for.  Executes a binary search.
  PRUnichar Lookup(PRUint32 l,
                   PRUint32 m,
                   PRUint32 r,
                   PRUnichar aChar)
  {
-    PRUint32 base = m*3;
-    if ( aChar >  ((mTable[base+kSizeEveryIdx] >> 8) + 
+    PRUint32 base = m*3; // Every line in the table is 3 units wide.
+
+    // Is aChar past the top of the current table entry?  (The upper byte of
+    // the 'every' entry contains the offset to the end of this entry.)
+    if (aChar > ((mTable[base+kSizeEveryIdx] >> 8) + 
                  mTable[base+kLowIdx])) 
    {
-      if( l > m )
+      if (l > m || l == r)
        return aChar;
+      // Advance one round.
      PRUint32 newm = (m+r+1)/2;
-      if(newm == m)
+      if (newm == m)
        newm++;
-      return this->Lookup(m+1, newm , r, aChar);
-      
-    } else if ( mTable[base+kLowIdx]  > aChar ) {
-      if( r < m )
+      return this->Lookup(m+1, newm, r, aChar);
+
+    // Is aChar below the bottom of the current table entry?
+    } else if (mTable[base+kLowIdx] > aChar) {
+      if (r < m || l == r)
        return aChar;
+      // Advance one round
      PRUint32 newm = (l+m-1)/2;
      if(newm == m)
        newm++;
      return this->Lookup(l, newm, m-1, aChar);

-    } else  {
-      if(((mTable[base+kSizeEveryIdx] & 0x00FF) > 0) && 
-        (0 != ((aChar - mTable[base+kLowIdx]) % 
-                (mTable[base+kSizeEveryIdx] & 0x00FF))))
+    // We've found the entry aChar should live in.
+    } else {
+      // Determine if aChar falls in a gap.  (The lower byte of the 'every'
+      // entry contains n for which every nth character from the base is a
+      // character of interest.)
+      if (((mTable[base+kSizeEveryIdx] & 0x00FF) > 0) && 
+          (0 != ((aChar - mTable[base+kLowIdx]) % 
+                 (mTable[base+kSizeEveryIdx] & 0x00FF))))
      {
        return aChar;
      }
-      mLastBase = base; // cache the base
+      // If aChar doesn't fall in the gap, cache and convert.
+      mLastBase = base;
      return aChar + mTable[base+kDiffIdx];
    }
  }
@ -167,6 +188,29 @@ static nsCompressedMap gLowerMap = {
  gToLowerItems
 };

+// We want ToLowerCase(PRUnichar) and ToLowerCaseASCII(PRUnichar) to be fast
+// when they're called from within the case-insensitive comparators, so we
+// define inlined versions.
+static NS_ALWAYS_INLINE PRUnichar
+ToLowerCase_inline(PRUnichar aChar)
+{
+  if (IS_ASCII(aChar)) {
+    return gASCIIToLower[aChar];
+  } else if (IS_NOCASE_CHAR(aChar)) {
+     return aChar;
+  }
+
+  return gLowerMap.Map(aChar);
+}
+
+static NS_ALWAYS_INLINE PRUnichar
+ToLowerCaseASCII_inline(const PRUnichar aChar)
+{
+  if (IS_ASCII(aChar))
+    return gASCIIToLower[aChar];
+  return aChar;
+}
+
 void
 ToLowerCase(nsAString& aString)
 {
@ -189,9 +233,7 @@ ToLowerCase(const nsAString& aSource,
 PRUnichar
 ToLowerCaseASCII(const PRUnichar aChar)
 {
-  if (IS_ASCII_UPPER(aChar))
-    return aChar + 0x0020;
-  return aChar;
+  return ToLowerCaseASCII_inline(aChar);
 }

 void
@ -218,115 +260,58 @@ ToUpperCase(const nsAString& aSource,
 PRInt32
 nsCaseInsensitiveStringComparator::operator()(const PRUnichar* lhs,
                                              const PRUnichar* rhs,
-                                              PRUint32 aLength) const
+                                              PRUint32 lLength,
+                                              PRUint32 rLength) const
 {
-  return CaseInsensitiveCompare(lhs, rhs, aLength);
+  return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) :
+         (lLength > rLength) ? 1 : -1;
 }

 PRInt32
-nsCaseInsensitiveStringComparator::operator()(PRUnichar lhs,
-                                              PRUnichar rhs) const
+nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs,
+                                                  const char* rhs,
+                                                  PRUint32 lLength,
+                                                  PRUint32 rLength) const
 {
-  // see if they're an exact match first
-  if (lhs == rhs)
-    return 0;
-  
-  lhs = ToLowerCase(lhs);
-  rhs = ToLowerCase(rhs);
-  
-  if (lhs == rhs)
-    return 0;
-  else if (lhs < rhs)
-    return -1;
-  else
-    return 1;
+  return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
 }

 PRInt32
 nsASCIICaseInsensitiveStringComparator::operator()(const PRUnichar* lhs,
                                                   const PRUnichar* rhs,
-                                                   PRUint32 aLength) const
+                                                   PRUint32 lLength,
+                                                   PRUint32 rLength) const
 {
-  while (aLength) {
+  if (lLength != rLength) {
+    if (lLength > rLength)
+      return 1;
+    return -1;
+  }
+
+  while (rLength) {
    PRUnichar l = *lhs++;
    PRUnichar r = *rhs++;
    if (l != r) {
-      l = ToLowerCaseASCII(l);
-      r = ToLowerCaseASCII(r);
+      l = ToLowerCaseASCII_inline(l);
+      r = ToLowerCaseASCII_inline(r);

      if (l > r)
        return 1;
      else if (r > l)
        return -1;
    }
-    aLength--;
+    rLength--;
  }

  return 0;
 }

-PRInt32
-nsASCIICaseInsensitiveStringComparator::operator()(PRUnichar lhs,
-                                                   PRUnichar rhs) const
-{
-  // see if they're an exact match first
-  if (lhs == rhs)
-    return 0;
-  
-  lhs = ToLowerCaseASCII(lhs);
-  rhs = ToLowerCaseASCII(rhs);
-  
-  if (lhs == rhs)
-    return 0;
-  else if (lhs < rhs)
-    return -1;
-  else
-    return 1;
-}
-
-
 #endif // MOZILLA_INTERNAL_API

-PRInt32
-CaseInsensitiveCompare(const PRUnichar *a,
-                       const PRUnichar *b,
-                       PRUint32 len)
-{
-  NS_ASSERTION(a && b, "Do not pass in invalid pointers!");
-  
-  if (len) {
-    do {
-      PRUnichar c1 = *a++;
-      PRUnichar c2 = *b++;
-      
-      if (c1 != c2) {
-        c1 = ToLowerCase(c1);
-        c2 = ToLowerCase(c2);
-        if (c1 != c2) {
-          if (c1 < c2) {
-            return -1;
-          }
-          return 1;
-        }
-      }
-    } while (--len != 0);
-  }
-  return 0;
-}
-
 PRUnichar
 ToLowerCase(PRUnichar aChar)
 {
-  if (IS_ASCII(aChar)) {
-    if (IS_ASCII_UPPER(aChar))
-      return aChar + 0x0020;
-    else
-      return aChar;
-  } else if (IS_NOCASE_CHAR(aChar)) {
-     return aChar;
-  }
-
-  return gLowerMap.Map(aChar);
+  return ToLowerCase_inline(aChar);
 }

 void
@ -342,7 +327,7 @@ ToUpperCase(PRUnichar aChar)
 {
  if (IS_ASCII(aChar)) {
    if (IS_ASCII_LOWER(aChar))
-      return aChar - 0x0020;
+      return aChar - 0x20;
    else
      return aChar;
  } else if (IS_NOCASE_CHAR(aChar)) {
@ -380,7 +365,7 @@ ToTitleCase(PRUnichar aChar)
  }

  PRUnichar upper = gUpperMap.Map(aChar);
-  
+
  if (0x01C0 == ( upper & 0xFFC0)) {
    for (PRUint32 i = 0 ; i < gUpperToTitleItems; i++) {
      if (upper == gUpperToTitle[(i*2)+kUpperIdx]) {
@ -391,3 +376,167 @@ ToTitleCase(PRUnichar aChar)

  return upper;
 }
+
+PRInt32
+CaseInsensitiveCompare(const PRUnichar *a,
+                       const PRUnichar *b,
+                       PRUint32 len)
+{
+  NS_ASSERTION(a && b, "Do not pass in invalid pointers!");
+
+  if (len) {
+    do {
+      PRUnichar c1 = *a++;
+      PRUnichar c2 = *b++;
+
+      if (c1 != c2) {
+        c1 = ToLowerCase_inline(c1);
+        c2 = ToLowerCase_inline(c2);
+        if (c1 != c2) {
+          if (c1 < c2) {
+            return -1;
+          }
+          return 1;
+        }
+      }
+    } while (--len != 0);
+  }
+  return 0;
+}
+
+// Calculates the codepoint of the UTF8 sequence starting at aStr.  Sets aNext
+// to the byte following the end of the sequence.
+//
+// If the sequence is invalid, or if computing the codepoint would take us off
+// the end of the string (as marked by aEnd), returns -1 and does not set
+// aNext.  Note that this function doesn't check that aStr < aEnd -- it assumes
+// you've done that already.
+static NS_ALWAYS_INLINE PRUint32
+GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext)
+{
+  // Convert to unsigned char so that stuffing chars into PRUint32s doesn't
+  // sign extend.
+  const unsigned char *str = (unsigned char*)aStr;
+
+  if (UTF8traits::isASCII(str[0])) {
+    // It's ASCII; just convert to lower-case and return it.
+    *aNext = aStr + 1;
+    return gASCIIToLower[*str];
+  }
+  if (UTF8traits::is2byte(str[0]) && NS_LIKELY(aStr + 1 < aEnd)) {
+    // It's a two-byte sequence, so it looks like
+    //  110XXXXX 10XXXXXX.
+    // This is definitely in the BMP, so we can store straightaway into a
+    // PRUint16.
+
+    PRUint16 c;
+    c  = (str[0] & 0x1F) << 6;
+    c += (str[1] & 0x3F);
+
+    if (!IS_NOCASE_CHAR(c))
+      c = gLowerMap.Map(c);
+
+    *aNext = aStr + 2;
+    return c;
+  }
+  if (UTF8traits::is3byte(str[0]) && NS_LIKELY(aStr + 2 < aEnd)) {
+    // It's a three-byte sequence, so it looks like
+    //  1110XXXX 10XXXXXX 10XXXXXX.
+    // This will just barely fit into 16-bits, so store into a PRUint16.
+
+    PRUint16 c;
+    c  = (str[0] & 0x0F) << 12;
+    c += (str[1] & 0x3F) << 6;
+    c += (str[2] & 0x3F);
+
+    if (!IS_NOCASE_CHAR(c))
+      c = gLowerMap.Map(c);
+
+    *aNext = aStr + 3;
+    return c;
+  }
+  if (UTF8traits::is4byte(str[0]) && NS_LIKELY(aStr + 3 < aEnd)) {
+    // It's a four-byte sequence, so it looks like
+    //   11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.
+    // Unless this is an overlong sequence, the codepoint it encodes definitely
+    // isn't in the BMP, so we don't bother trying to convert it to lower-case.
+
+    PRUint32 c;
+    c  = (str[0] & 0x07) << 18;
+    c += (str[1] & 0x3F) << 12;
+    c += (str[2] & 0x3F) << 6;
+    c += (str[3] & 0x3F);
+
+    *aNext = aStr + 4;
+    return c;
+  }
+
+  // Hm, we don't understand this sequence.
+  return -1;
+}
+
+PRInt32 CaseInsensitiveCompare(const char *aLeft,
+                               const char *aRight,
+                               PRUint32 aLeftBytes,
+                               PRUint32 aRightBytes)
+{
+  const char *leftEnd = aLeft + aLeftBytes;
+  const char *rightEnd = aRight + aRightBytes;
+
+  while (aLeft < leftEnd && aRight < rightEnd) {
+    PRUint32 leftChar = GetLowerUTF8Codepoint(aLeft, leftEnd, &aLeft);
+    if (NS_UNLIKELY(leftChar == PRUint32(-1)))
+      return -1;
+
+    PRUint32 rightChar = GetLowerUTF8Codepoint(aRight, rightEnd, &aRight);
+    if (NS_UNLIKELY(rightChar == PRUint32(-1)))
+      return -1;
+
+    // Now leftChar and rightChar are lower-case, so we can compare them.
+    if (leftChar != rightChar) {
+      if (leftChar > rightChar)
+        return 1;
+      return -1;
+    }
+  }
+
+  // Make sure that if one string is longer than the other we return the
+  // correct result.
+  if (aLeft < leftEnd)
+    return 1;
+  if (aRight < rightEnd)
+    return -1;
+
+  return 0;
+}
+
+PRBool
+CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
+                              const char* aLeftEnd, const char* aRightEnd,
+                              const char** aLeftNext, const char** aRightNext,
+                              PRBool* aErr)
+{
+  NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
+  NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
+  NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
+  NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
+  NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");
+
+  PRUint32 leftChar = GetLowerUTF8Codepoint(aLeft, aLeftEnd, aLeftNext);
+  if (NS_UNLIKELY(leftChar == PRUint32(-1))) {
+    *aErr = PR_TRUE;
+    return PR_FALSE;
+  }
+
+  PRUint32 rightChar = GetLowerUTF8Codepoint(aRight, aRightEnd, aRightNext);
+  if (NS_UNLIKELY(rightChar == PRUint32(-1))) {
+    *aErr = PR_TRUE;
+    return PR_FALSE;
+  }
+
+  // Can't have an error past this point.
+  *aErr = PR_FALSE;
+
+  return leftChar == rightChar;
+}
+
--- a/intl/unicharutil/util/nsUnicharUtils.h
+++ b/intl/unicharutil/util/nsUnicharUtils.h
@ -78,9 +78,17 @@ class nsCaseInsensitiveStringComparator : public nsStringComparator
 public:
  virtual PRInt32 operator() (const PRUnichar*,
                              const PRUnichar*,
-                              PRUint32 aLength) const;
-  virtual PRInt32 operator() (PRUnichar,
-                              PRUnichar) const;
+                              PRUint32,
+                              PRUint32) const;
+};
+
+class nsCaseInsensitiveUTF8StringComparator : public nsCStringComparator
+{
+public:
+  virtual PRInt32 operator() (const char*,
+                              const char*,
+                              PRUint32,
+                              PRUint32) const;
 };

 class nsCaseInsensitiveStringArrayComparator
@ -97,9 +105,8 @@ class nsASCIICaseInsensitiveStringComparator : public nsStringComparator
 public:
  virtual int operator() (const PRUnichar*,
                          const PRUnichar*,
-                          PRUint32 aLength) const;
-  virtual int operator() (PRUnichar,
-                          PRUnichar) const;
+                          PRUint32,
+                          PRUint32) const;
 };

 inline PRBool
@ -126,4 +133,33 @@ CaseInsensitiveFindInReadable(const nsAString& aPattern,
 PRInt32
 CaseInsensitiveCompare(const PRUnichar *a, const PRUnichar *b, PRUint32 len);

+PRInt32
+CaseInsensitiveCompare(const char* aLeft, const char* aRight,
+                       PRUint32 aLeftBytes, PRUint32 aRightBytes);
+
+/**
+ * This function determines whether the UTF-8 sequence pointed to by aLeft is
+ * case-insensitively-equal to the UTF-8 sequence pointed to by aRight.
+ *
+ * aLeftEnd marks the first memory location past aLeft that is not part of
+ * aLeft; aRightEnd similarly marks the end of aRight.
+ *
+ * The function assumes that aLeft < aLeftEnd and aRight < aRightEnd.
+ *
+ * The function stores the addresses of the next characters in the sequence
+ * into aLeftNext and aRightNext.  It's up to the caller to make sure that the
+ * returned pointers are valid -- i.e. the function may return aLeftNext >=
+ * aLeftEnd or aRightNext >= aRightEnd.
+ *
+ * If the function encounters invalid text, it sets aErr to true and returns
+ * false, possibly leaving aLeftNext and aRightNext uninitialized.  If the
+ * function returns true, aErr is guaranteed to be false and both aLeftNext and
+ * aRightNext are guaranteed to be initialized.
+ */
+PRBool
+CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
+                              const char* aLeftEnd, const char* aRightEnd,
+                              const char** aLeftNext, const char** aRightNext,
+                              PRBool* aErr);
+
 #endif  /* nsUnicharUtils_h__ */
--- a/parser/htmlparser/src/nsScannerString.cpp
+++ b/parser/htmlparser/src/nsScannerString.cpp
@ -594,7 +594,7 @@ FindInReadable( const nsAString& aPattern,
          {
              // fast inner loop (that's what it's called, not what it is) looks for a potential match
            while ( aSearchStart != aSearchEnd &&
-                    compare(*aPatternStart, *aSearchStart) )
+                    compare(aPatternStart.get(), aSearchStart.get(), 1, 1) )
              ++aSearchStart;

              // if we broke out of the `fast' loop because we're out of string ... we're done: no match
@ -631,7 +631,7 @@ FindInReadable( const nsAString& aPattern,

                  // else if we mismatched ... it's time to advance to the next search position
                  //  and get back into the `fast' loop
-                if ( compare(*testPattern, *testSearch) )
+                if ( compare(testPattern.get(), testSearch.get(), 1, 1) )
                  {
                    ++aSearchStart;
                    break;
--- a/toolkit/components/places/src/SQLFunctions.cpp
+++ b/toolkit/components/places/src/SQLFunctions.cpp
@ -140,7 +140,7 @@ namespace places {
                           testSourceItr(sourceStart);

      // Keep trying to match the token one by one until it doesn't match.
-      while (!caseInsensitiveCompare(*testTokenItr, *testSourceItr)) {
+      while (!caseInsensitiveCompare(testTokenItr, testSourceItr, 1, 1)) {
        // We matched something, so move down one.
        testTokenItr++;
        testSourceItr++;
--- a/widget/src/windows/nsWindow.cpp
+++ b/widget/src/windows/nsWindow.cpp
@ -6437,7 +6437,7 @@ StringCaseInsensitiveEquals(const PRUnichar* aChars1, const PRUint32 aNumChars1,
    return PR_FALSE;

  nsCaseInsensitiveStringComparator comp;
-  return comp(aChars1, aChars2, aNumChars1) == 0;
+  return comp(aChars1, aChars2, aNumChars1, aNumChars2) == 0;
 }

 UINT nsWindow::MapFromNativeToDOM(UINT aNativeKeyCode)
--- a/xpcom/string/public/nsAString.h
+++ b/xpcom/string/public/nsAString.h
@ -80,8 +80,7 @@ class NS_COM nsCaseInsensitiveCStringComparator
    public:
      typedef char char_type;

-      virtual int operator()( const char_type*, const char_type*, PRUint32 length ) const;
-      virtual int operator()( char_type, char_type ) const;
+      virtual int operator()( const char_type*, const char_type*, PRUint32, PRUint32 ) const;
  };

 class nsCaseInsensitiveCStringArrayComparator
--- a/xpcom/string/public/nsTSubstring.h
+++ b/xpcom/string/public/nsTSubstring.h
@ -50,8 +50,7 @@ class NS_COM nsTStringComparator_CharT

      nsTStringComparator_CharT() {}

-      virtual int operator()( const char_type*, const char_type*, PRUint32 length ) const = 0;
-      virtual int operator()( char_type, char_type ) const = 0;
+      virtual int operator()( const char_type*, const char_type*, PRUint32, PRUint32 ) const = 0;
  };


@ -66,8 +65,7 @@ class NS_COM nsTDefaultStringComparator_CharT

      nsTDefaultStringComparator_CharT() {}

-      virtual int operator()( const char_type*, const char_type*, PRUint32 length ) const;
-      virtual int operator()( char_type, char_type ) const;
+      virtual int operator()( const char_type*, const char_type*, PRUint32, PRUint32 ) const;
  };

  /**
--- a/xpcom/string/src/nsReadableUtils.cpp
+++ b/xpcom/string/src/nsReadableUtils.cpp
@ -802,7 +802,7 @@ FindInReadable_Impl( const StringT& aPattern, IteratorT& aSearchStart, IteratorT
          {
              // fast inner loop (that's what it's called, not what it is) looks for a potential match
            while ( aSearchStart != aSearchEnd &&
-                    compare(*aPatternStart, *aSearchStart) )
+                    compare(aPatternStart.get(), aSearchStart.get(), 1, 1) )
              ++aSearchStart;

              // if we broke out of the `fast' loop because we're out of string ... we're done: no match
@ -839,7 +839,7 @@ FindInReadable_Impl( const StringT& aPattern, IteratorT& aSearchStart, IteratorT

                  // else if we mismatched ... it's time to advance to the next search position
                  //  and get back into the `fast' loop
-                if ( compare(*testPattern, *testSearch) )
+                if ( compare(testPattern.get(), testSearch.get(), 1, 1) )
                  {
                    ++aSearchStart;
                    break;
@ -871,7 +871,7 @@ RFindInReadable_Impl( const StringT& aPattern, IteratorT& aSearchStart, Iterator
        --searchEnd;
    
          // Check last character, if a match, explore further from here
-        if ( compare(*patternEnd, *searchEnd) == 0 )
+        if ( compare(patternEnd.get(), searchEnd.get(), 1, 1) == 0 )
          {  
              // We're at a potential match, let's see if we really hit one
            IteratorT testPattern(patternEnd);
@ -900,7 +900,7 @@ RFindInReadable_Impl( const StringT& aPattern, IteratorT& aSearchStart, Iterator
                --testPattern;
                --testSearch;
              }
-            while ( compare(*testPattern, *testSearch) == 0 );
+            while ( compare(testPattern.get(), testSearch.get(), 1, 1) == 0 );
          }
      }

--- a/xpcom/string/src/nsStringComparator.cpp
+++ b/xpcom/string/src/nsStringComparator.cpp
@ -53,23 +53,17 @@


 int
-nsCaseInsensitiveCStringComparator::operator()( const char_type* lhs, const char_type* rhs, PRUint32 aLength ) const
+nsCaseInsensitiveCStringComparator::operator()( const char_type* lhs,
+                                                const char_type* rhs,
+                                                PRUint32 lLength,
+                                                PRUint32 rLength ) const
  {
-    PRInt32 result=PRInt32(PL_strncasecmp(lhs, rhs, aLength));
+    if (lLength != rLength)
+      return (lLength > rLength) ? 1 : -1;
+    PRInt32 result=PRInt32(PL_strncasecmp(lhs, rhs, lLength));
    //Egads. PL_strncasecmp is returning *very* negative numbers.
    //Some folks expect -1,0,1, so let's temper its enthusiasm.
    if (result<0) 
      result=-1;
    return result;
  }
-
-int
-nsCaseInsensitiveCStringComparator::operator()( char lhs, char rhs ) const
-  {
-    if (lhs == rhs) return 0;
-    
-    lhs = tolower(lhs);
-    rhs = tolower(rhs);
-
-    return lhs - rhs;
-  }
--- a/xpcom/string/src/nsTStringComparator.cpp
+++ b/xpcom/string/src/nsTStringComparator.cpp
@ -53,7 +53,7 @@ Compare( const nsTSubstring_CharT::base_string_type& lhs, const nsTSubstring_Cha
    size_type lengthToCompare = NS_MIN(lLength, rLength);

    int result;
-    if ( (result = comp(leftIter.get(), rightIter.get(), lengthToCompare)) == 0 )
+    if ( (result = comp(leftIter.get(), rightIter.get(), lengthToCompare, lengthToCompare)) == 0 )
      {
        if ( lLength < rLength )
          result = -1;
@ -67,13 +67,8 @@ Compare( const nsTSubstring_CharT::base_string_type& lhs, const nsTSubstring_Cha
  }

 int
-nsTDefaultStringComparator_CharT::operator()( const char_type* lhs, const char_type* rhs, PRUint32 aLength ) const
+nsTDefaultStringComparator_CharT::operator()( const char_type* lhs, const char_type* rhs, PRUint32 lLength, PRUint32 rLength) const
  {
-    return nsCharTraits<CharT>::compare(lhs, rhs, aLength);
+    return (lLength == rLength) ? nsCharTraits<CharT>::compare(lhs, rhs, lLength) :
+           (lLength > rLength) ? 1 : -1;
  }
-
-int
-nsTDefaultStringComparator_CharT::operator()( char_type lhs, char_type rhs) const
-  {
-    return lhs - rhs;
-  } 
--- a/xpcom/string/src/nsTSubstring.cpp
+++ b/xpcom/string/src/nsTSubstring.cpp
@ -605,7 +605,7 @@ nsTSubstring_CharT::Equals( const self_type& str ) const
 PRBool
 nsTSubstring_CharT::Equals( const self_type& str, const comparator_type& comp ) const
  {
-    return mLength == str.mLength && comp(mData, str.mData, mLength) == 0;
+    return mLength == str.mLength && comp(mData, str.mData, mLength, str.mLength) == 0;
  }

 PRBool
@ -635,7 +635,7 @@ nsTSubstring_CharT::Equals( const char_type* data, const comparator_type& comp )

    // XXX avoid length calculation?
    size_type length = char_traits::length(data);
-    return mLength == length && comp(mData, data, mLength) == 0;
+    return mLength == length && comp(mData, data, mLength, length) == 0;
  }

 PRBool