Bug 570975 - Don't convert UTF-8 strings to UTF-16 in MatchAutoCompleteFunction r=sdwilsh, a2.0=blocking

--HG--
extra : rebase_source : 70dd986a655e1670fec86e21c8cea7a596e3b0ac
This commit is contained in:
Justin Lebar 2010-08-31 18:03:40 -07:00
parent 8b20bfbcec
commit 241ab761e5
2 changed files with 225 additions and 126 deletions

View File

@ -43,9 +43,161 @@
#include "nsEscape.h"
#include "mozIPlacesAutoComplete.h"
#include "SQLFunctions.h"
#include "nsUTF8Utils.h"
using namespace mozilla::storage;
////////////////////////////////////////////////////////////////////////////////
//// Anonymous Helpers
namespace {
typedef nsACString::const_char_iterator const_char_iterator;
/**
 * Decide where the next boundary-anchored match attempt should start.
 *
 * If the byte at aStart is an ASCII letter ([a-zA-Z]), skip over it and over
 * the run of lowercase ASCII letters ([a-z]) that follows, stopping at aEnd
 * at the latest.  Because only lowercase letters are skipped, an uppercase
 * letter ends the run, which is what lets CamelCase words be matched piece
 * by piece.
 *
 * If the byte at aStart is anything else, the caller-supplied aNext is
 * returned unchanged.
 *
 * @param aStart the position to start from; it is dereferenced, so it must
 *               point at a readable byte
 * @param aNext  the position of the character following aStart in the UTF-8
 *               sequence, as already computed by the caller
 * @param aEnd   the first byte which is not part of the sequence
 *
 * @return the position at which the next word-boundary match attempt should
 *         begin
 */
static
NS_ALWAYS_INLINE const_char_iterator
nextWordBoundary(const_char_iterator const aStart,
                 const_char_iterator const aNext,
                 const_char_iterator const aEnd) {
  // Anything that is not an ASCII letter is handled by the caller's own
  // notion of "next character".
  if (!(('a' <= *aStart && *aStart <= 'z') ||
        ('A' <= *aStart && *aStart <= 'Z'))) {
    return aNext;
  }

  // We stop at the first byte that is not a lowercase ASCII letter, so a
  // plain byte-wise scan is enough here; no UTF-8 decoding is required.
  const_char_iterator pos = aStart + 1;
  while (pos < aEnd && 'a' <= *pos && *pos <= 'z') {
    ++pos;
  }
  return pos;
}
/**
 * Selects how findInString moves on to its next candidate position after a
 * match attempt fails.
 */
enum FindInStringBehavior {
// Resume at the position computed by nextWordBoundary, so only matches that
// begin on a word boundary are considered.
eFindOnBoundary,
// Resume at the very next character, so a match may begin anywhere.
eFindAnywhere
};
/**
 * Shared implementation behind findAnywhere and findOnBoundary.
 *
 * The two searches differ only in how the search position advances after a
 * failed match attempt, so they are written as one routine.  Both are
 * performance-critical, and testing aBehavior at run time makes them a bit
 * slower; this function is therefore NS_ALWAYS_INLINE and we rely on the
 * compiler to optimize the aBehavior check away.
 *
 * Characters are compared with CaseInsensitiveUTF8CharsEqual.
 *
 * @param aToken
 *        The token we're searching for.  Must not be empty.
 * @param aSourceString
 *        The string in which we're searching.
 * @param aBehavior
 *        eFindOnBoundary if we should only consider matches which begin on
 *        word boundaries, or eFindAnywhere if we should consider matches
 *        which begin anywhere.
 *
 * @return true if aToken was found in aSourceString; false otherwise,
 *         including when a character comparison reports an error.
 */
static
NS_ALWAYS_INLINE bool
findInString(const nsDependentCSubstring &aToken,
             const nsACString &aSourceString,
             FindInStringBehavior aBehavior)
{
  // CaseInsensitiveUTF8CharsEqual assumes that both of its inputs contain at
  // least one byte, so an empty token must never get this far.
  NS_PRECONDITION(!aToken.IsEmpty(), "Don't search for an empty token!");

  // Nothing can be found in an empty source.
  if (aSourceString.IsEmpty()) {
    return false;
  }

  const_char_iterator const tokenBegin = aToken.BeginReading();
  const_char_iterator const tokenEnd = aToken.EndReading();
  const_char_iterator const sourceEnd = aSourceString.EndReading();
  const_char_iterator candidate = aSourceString.BeginReading();

  // The source is non-empty, so this loop body runs at least once.
  while (candidate < sourceEnd) {
    // |candidate| is where this match attempt begins (a word boundary when
    // aBehavior == eFindOnBoundary).  Compare the first character of the
    // token against the character at |candidate|; the same call also hands
    // back the position of the following character in both strings.
    const_char_iterator afterFirstChar, tokenPos;
    PRBool failed;
    if (CaseInsensitiveUTF8CharsEqual(candidate, tokenBegin,
                                      sourceEnd, tokenEnd,
                                      &afterFirstChar, &tokenPos, &failed)) {
      // No need to look at |failed| in here: a comparison that hits an error
      // also returns false, and the check below the if picks it up.
      const_char_iterator sourcePos = afterFirstChar;
      do {
        if (tokenPos >= tokenEnd) {
          // Every character of the token has been matched.
          return true;
        }
        if (sourcePos >= sourceEnd) {
          // The source ran out in the middle of the token, so no later
          // starting position can succeed either.
          return false;
        }
        // Keep going for as long as the next pair of characters agrees; a
        // mismatch (or an error) ends this attempt.
      } while (CaseInsensitiveUTF8CharsEqual(sourcePos, tokenPos,
                                             sourceEnd, tokenEnd,
                                             &sourcePos, &tokenPos, &failed));
    }

    // Give up entirely if any comparison above reported an error.
    if (NS_UNLIKELY(failed)) {
      return false;
    }

    // This attempt did not match.  When restricted to word boundaries, jump
    // to the next one; otherwise step forward by a single character using
    // the position saved from the first comparison.
    candidate = (aBehavior == eFindOnBoundary)
              ? nextWordBoundary(candidate, afterFirstChar, sourceEnd)
              : afterFirstChar;
  }

  return false;
}
} // End anonymous namespace
namespace mozilla {
namespace places {
@ -73,117 +225,87 @@ namespace places {
/* static */
void
MatchAutoCompleteFunction::fixupURISpec(const nsCString &aURISpec,
nsString &_fixedSpec)
nsCString &_fixedSpec)
{
nsCString unescapedSpec;
(void)NS_UnescapeURL(aURISpec, esc_SkipControl | esc_AlwaysCopy,
unescapedSpec);
// If this unescaped string is valid UTF-8, we'll convert it. Otherwise,
// we will simply convert our original string.
// If this unescaped string is valid UTF-8, we'll use it. Otherwise,
// we will simply use our original string.
NS_ASSERTION(_fixedSpec.IsEmpty(),
"Passing a non-empty string as an out parameter!");
if (IsUTF8(unescapedSpec))
CopyUTF8toUTF16(unescapedSpec, _fixedSpec);
_fixedSpec.Assign(unescapedSpec);
else
CopyUTF8toUTF16(aURISpec, _fixedSpec);
_fixedSpec.Assign(aURISpec);
if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("http://")))
if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("http://")))
_fixedSpec.Cut(0, 7);
else if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("https://")))
else if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("https://")))
_fixedSpec.Cut(0, 8);
else if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("ftp://")))
else if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("ftp://")))
_fixedSpec.Cut(0, 6);
if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("www.")))
if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("www.")))
_fixedSpec.Cut(0, 4);
}
/* static */
bool
MatchAutoCompleteFunction::findAnywhere(const nsDependentSubstring &aToken,
const nsAString &aSourceString)
MatchAutoCompleteFunction::findAnywhere(const nsDependentCSubstring &aToken,
const nsACString &aSourceString)
{
return !!CaseInsensitiveFindInReadable(aToken, aSourceString);
// We can't use FindInReadable here; it works only for ASCII.
return findInString(aToken, aSourceString, eFindAnywhere);
}
/* static */
bool
MatchAutoCompleteFunction::findBeginning(const nsDependentSubstring &aToken,
const nsAString &aSourceString)
MatchAutoCompleteFunction::findOnBoundary(const nsDependentCSubstring &aToken,
const nsACString &aSourceString)
{
return !!StringBeginsWith(aSourceString, aToken,
nsCaseInsensitiveStringComparator());
return findInString(aToken, aSourceString, eFindOnBoundary);
}
/* static */
bool
MatchAutoCompleteFunction::findOnBoundary(const nsDependentSubstring &aToken,
const nsAString &aSourceString)
MatchAutoCompleteFunction::findBeginning(const nsDependentCSubstring &aToken,
const nsACString &aSourceString)
{
// We cannot match anything if there is nothing to search.
if (aSourceString.IsEmpty())
return false;
NS_PRECONDITION(!aToken.IsEmpty(), "Don't search for an empty token!");
// Define a const instance of this class so it is created once.
const nsCaseInsensitiveStringComparator caseInsensitiveCompare;
// We can't use StringBeginsWith here, unfortunately. Although it will
// happily take a case-insensitive UTF8 comparator, it eventually calls
// nsACString::Equals, which checks that the two strings contain the same
// number of bytes before calling the comparator. This is clearly not what
// we want.
const_wchar_iterator tokenStart(aToken.BeginReading()),
tokenEnd(aToken.EndReading()),
sourceStart(aSourceString.BeginReading()),
sourceEnd(aSourceString.EndReading());
const_char_iterator tokenStart(aToken.BeginReading()),
tokenEnd(aToken.EndReading()),
sourceStart(aSourceString.BeginReading()),
sourceEnd(aSourceString.EndReading());
// The start of aSourceString is considered a word boundary, so start there.
do {
// We are on a word boundary, so start by copying the iterators.
const_wchar_iterator testTokenItr(tokenStart),
testSourceItr(sourceStart);
PRBool dummy;
while (sourceStart < sourceEnd &&
CaseInsensitiveUTF8CharsEqual(sourceStart, tokenStart,
sourceEnd, tokenEnd,
&sourceStart, &tokenStart, &dummy)) {
// Keep trying to match the token one by one until it doesn't match.
while (!caseInsensitiveCompare(testTokenItr, testSourceItr, 1, 1)) {
// We matched something, so move down one.
testTokenItr++;
testSourceItr++;
// Matched the full token, so we are done!
if (testTokenItr == tokenEnd)
return true;
// However, if we ran into the end of the source while matching the
// token, we will not find it.
if (testSourceItr == sourceEnd)
return false;
// We found the token!
if (tokenStart >= tokenEnd) {
return true;
}
}
// Always advance our starting iterator, and if we are not currently on a
// word boundary, advance to the next word boundary.
if (!isWordBoundary(ToLowerCase(*sourceStart++)))
sourceStart = nextWordBoundary(sourceStart, sourceEnd);
} while (sourceStart != sourceEnd);
// We don't need to check CaseInsensitiveUTF8CharsEqual's error condition
// (stored in |dummy|), since the function will return false if it
// encounters an error.
return false;
}
/* static */
MatchAutoCompleteFunction::const_wchar_iterator
MatchAutoCompleteFunction::nextWordBoundary(const_wchar_iterator aStart,
const_wchar_iterator aEnd)
{
while (aStart != aEnd && !isWordBoundary(*aStart))
aStart++;
return aStart;
}
/* static */
bool
MatchAutoCompleteFunction::isWordBoundary(const PRUnichar &aChar)
{
// Only check lowercase alphabetic characters so we can match CamelCase
// words. This means that matches will happen after an upper-case
// character.
return !(PRUnichar('a') <= aChar && aChar <= PRUnichar('z'));
}
/* static */
MatchAutoCompleteFunction::searchFunctionPtr
MatchAutoCompleteFunction::getSearchFunction(PRInt32 aBehavior)
@ -217,15 +339,15 @@ namespace places {
#define HAS_BEHAVIOR(aBitName) \
(searchBehavior & mozIPlacesAutoComplete::BEHAVIOR_##aBitName)
nsAutoString searchString;
(void)aArguments->GetString(kArgSearchString, searchString);
nsCAutoString searchString;
(void)aArguments->GetUTF8String(kArgSearchString, searchString);
nsCString url;
(void)aArguments->GetUTF8String(kArgIndexURL, url);
// We only want to filter javascript: URLs if we are not supposed to search
// for them, and the search does not start with "javascript:".
if (!HAS_BEHAVIOR(JAVASCRIPT) &&
!StringBeginsWith(searchString, NS_LITERAL_STRING("javascript:")) &&
!StringBeginsWith(searchString, NS_LITERAL_CSTRING("javascript:")) &&
StringBeginsWith(url, NS_LITERAL_CSTRING("javascript:"))) {
NS_IF_ADDREF(*_result = new IntegerVariant(0));
NS_ENSURE_TRUE(*_result, NS_ERROR_OUT_OF_MEMORY);
@ -235,8 +357,8 @@ namespace places {
PRInt32 visitCount = aArguments->AsInt32(kArgIndexVisitCount);
bool typed = aArguments->AsInt32(kArgIndexTyped) ? true : false;
bool bookmark = aArguments->AsInt32(kArgIndexBookmark) ? true : false;
nsAutoString tags;
(void)aArguments->GetString(kArgIndexTags, tags);
nsCAutoString tags;
(void)aArguments->GetUTF8String(kArgIndexTags, tags);
PRInt32 openPageCount = aArguments->AsInt32(kArgIndexOpenPageCount);
// Make sure we match all the filter requirements. If a given restriction
@ -255,21 +377,21 @@ namespace places {
}
// Clean up our URI spec and prepare it for searching.
nsString fixedURI;
nsCString fixedURI;
fixupURISpec(url, fixedURI);
// Obtain our search function.
PRInt32 matchBehavior = aArguments->AsInt32(kArgIndexMatchBehavior);
searchFunctionPtr searchFunction = getSearchFunction(matchBehavior);
nsAutoString title;
(void)aArguments->GetString(kArgIndexTitle, title);
nsCAutoString title;
(void)aArguments->GetUTF8String(kArgIndexTitle, title);
// Determine if every token matches either the bookmark title, tags, page
// title, or page URL.
nsWhitespaceTokenizer tokenizer(searchString);
nsCWhitespaceTokenizer tokenizer(searchString);
while (matches && tokenizer.hasMoreTokens()) {
const nsDependentSubstring &token = tokenizer.nextToken();
const nsDependentCSubstring &token = tokenizer.nextToken();
bool matchTags = searchFunction(token, tags);
bool matchTitle = searchFunction(token, title);

View File

@ -15,7 +15,8 @@
* The Original Code is Places code.
*
* The Initial Developer of the Original Code is
* Mozilla Corporation.
* the Mozilla Foundation.
*
* Portions created by the Initial Developer are Copyright (C) 2009
* the Initial Developer. All Rights Reserved.
*
@ -118,10 +119,10 @@ private:
/**
* Typedefs
*/
typedef bool (*searchFunctionPtr)(const nsDependentSubstring &aToken,
const nsAString &aSourceString);
typedef bool (*searchFunctionPtr)(const nsDependentCSubstring &aToken,
const nsACString &aSourceString);
typedef nsAString::const_char_iterator const_wchar_iterator;
typedef nsACString::const_char_iterator const_char_iterator;
/**
* Obtains the search function to match on.
@ -133,6 +134,18 @@ private:
*/
static searchFunctionPtr getSearchFunction(PRInt32 aBehavior);
/**
* Tests if aSourceString starts with aToken.
*
* @param aToken
* The string to search for.
* @param aSourceString
* The string to search.
* @return true if found, false otherwise.
*/
static bool findBeginning(const nsDependentCSubstring &aToken,
const nsACString &aSourceString);
/**
* Searches aSourceString for aToken anywhere in the string in a case-
* insensitive way.
@ -143,20 +156,8 @@ private:
* The string to search.
* @return true if found, false otherwise.
*/
static bool findAnywhere(const nsDependentSubstring &aToken,
const nsAString &aSourceString);
/**
* Tests if aSourceString starts with aToken.
*
* @param aToken
* The string to search for.
* @param aSourceString
* The string to search.
* @return true if found, false otherwise.
*/
static bool findBeginning(const nsDependentSubstring &aToken,
const nsAString &aSourceString);
static bool findAnywhere(const nsDependentCSubstring &aToken,
const nsACString &aSourceString);
/**
* Tests if aToken is found on a word boundary in aSourceString.
@ -167,33 +168,9 @@ private:
* The string to search.
* @return true if found, false otherwise.
*/
static bool findOnBoundary(const nsDependentSubstring &aToken,
const nsAString &aSourceString);
static bool findOnBoundary(const nsDependentCSubstring &aToken,
const nsACString &aSourceString);
/**
* Obtains an iterator to the next word boundary as defined by isWordBoundary.
*
* @param aStart
* An iterator pointing to the start of the string.
* @param aEnd
* An iterator pointing to the end of the string.
* @return an iterator pointing to the next word boundary.
*/
static const_wchar_iterator nextWordBoundary(const_wchar_iterator aStart,
const_wchar_iterator aEnd);
/**
* Determines if aChar is a word boundary. A 'word boundary' is anything that
* is not used to build up a word from a string of characters. We are very
* conservative here because anything that we do not list will be treated as a
* word boundary. This means searching for that not-actually-a-word-boundary
* character can still be matched in the middle of a word.
*
* @param aChar
* The Unicode character to check against.
* @return true if the character is considered a word boundary, false
* otherwise.
*/
static inline bool isWordBoundary(const PRUnichar &aChar);
/**
* Fixes a URI's spec such that it is ready to be searched. This includes
@ -205,7 +182,7 @@ private:
* @param _fixedSpec
* An out parameter that is the fixed up string.
*/
static void fixupURISpec(const nsCString &aURISpec, nsString &_fixedSpec);
static void fixupURISpec(const nsCString &aURISpec, nsCString &_fixedSpec);
};
} // namespace places