Add new bidi control characters defined in Unicode 6.3. Bug 922530, r=jfkthame

This commit is contained in:
Simon Montagu 2013-10-02 16:07:02 +03:00
parent b570e91848
commit 74c3067fe1
12 changed files with 76 additions and 34 deletions

View File

@ -2155,7 +2155,7 @@ function losslessDecodeURI(aURI) {
// except ZWNJ (U+200C) and ZWJ (U+200D) (bug 582186).
// This includes all bidirectional formatting characters.
// (RFC 3987 sections 3.2 and 4.1 paragraph 6)
value = value.replace(/[\u00ad\u034f\u115f-\u1160\u17b4-\u17b5\u180b-\u180d\u200b\u200e-\u200f\u202a-\u202e\u2060-\u206f\u3164\ufe00-\ufe0f\ufeff\uffa0\ufff0-\ufff8]|\ud834[\udd73-\udd7a]|[\udb40-\udb43][\udc00-\udfff]/g,
value = value.replace(/[\u00ad\u034f\u061c\u115f-\u1160\u17b4-\u17b5\u180b-\u180d\u200b\u200e-\u200f\u202a-\u202e\u2060-\u206f\u3164\ufe00-\ufe0f\ufeff\uffa0\ufff0-\ufff8]|\ud834[\udd73-\udd7a]|[\udb40-\udb43][\udc00-\udfff]/g,
encodeURIComponent);
return value;
}

View File

@ -423,7 +423,7 @@ nsTextFragment::UpdateBidiFlag(const PRUnichar* aBuffer, uint32_t aLength)
PRUnichar ch2 = *cp++;
utf32Char = SURROGATE_TO_UCS4(ch1, ch2);
}
if (UTF32_CHAR_IS_BIDI(utf32Char) || IS_BIDI_CONTROL_CHAR(utf32Char)) {
if (UTF32_CHAR_IS_BIDI(utf32Char) || IsBidiControl(utf32Char)) {
mState.mIsBidi = true;
break;
}

View File

@ -4177,9 +4177,9 @@ gfxFontGroup::IsInvalidChar(PRUnichar ch)
if (ch <= 0x9f) {
return true;
}
return ((ch & 0xFF00) == 0x2000 /* Unicode control character */ &&
(ch == 0x200B/*ZWSP*/ || ch == 0x2028/*LSEP*/ || ch == 0x2029/*PSEP*/ ||
IS_BIDI_CONTROL_CHAR(ch)));
return (((ch & 0xFF00) == 0x2000 /* Unicode control character */ &&
(ch == 0x200B/*ZWSP*/ || ch == 0x2028/*LSEP*/ || ch == 0x2029/*PSEP*/)) ||
IsBidiControl(ch));
}
bool
@ -5343,11 +5343,14 @@ gfxShapedText::SetGlyphs(uint32_t aIndex, CompressedGlyph aGlyph,
#define ZWNJ 0x200C
#define ZWJ 0x200D
// U+061C ARABIC LETTER MARK is expected to be added to XIDMOD_DEFAULT_IGNORABLE
// in a future Unicode update. Add it manually for now
#define ALM 0x061C
// Returns true when aChar is to be treated as a default-ignorable character:
// either GetIdentifierModification() classifies it as
// XIDMOD_DEFAULT_IGNORABLE, or it is one of the code points listed
// explicitly here (ZWNJ, ZWJ and ALM).
static inline bool
IsDefaultIgnorable(uint32_t aChar)
{
    // A single return expression: the ALM comparison must be part of the
    // returned value, not a separate statement after the return.
    return GetIdentifierModification(aChar) == XIDMOD_DEFAULT_IGNORABLE ||
           aChar == ZWNJ || aChar == ZWJ || aChar == ALM;
}
void

View File

@ -82,17 +82,6 @@ nsresult HandleNumbers(PRUnichar* aBuffer, uint32_t aSize, uint32_t aNumFlag)
return NS_OK;
}
#define LRM_CHAR 0x200e  // U+200E LEFT-TO-RIGHT MARK
#define LRE_CHAR 0x202a  // U+202A LEFT-TO-RIGHT EMBEDDING
#define RLO_CHAR 0x202e  // U+202E RIGHT-TO-LEFT OVERRIDE
/**
 * Report whether aChar is a bidi control character.
 *
 * This method is used when stripping Bidi control characters for
 * display, so it returns true for LRM, RLM, LRE, RLE, PDF, LRO and RLO.
 *
 * @param aChar the code point to classify
 * @return true if aChar is one of the characters listed above
 */
bool IsBidiControl(uint32_t aChar)
{
  // LRE..RLO is the contiguous range U+202A..U+202E.
  if (LRE_CHAR <= aChar && aChar <= RLO_CHAR) {
    return true;
  }
  // LRM (U+200E) and RLM (U+200F) differ only in the lowest bit. Mask off
  // just that bit; a narrower mask would also discard the high byte and
  // accept out-of-range values such as 0x0100200E.
  return (aChar & 0xfffffffeu) == LRM_CHAR;
}
bool HasRTLChars(const nsAString& aString)
{
// This is used to determine whether to enable bidi if a string has

View File

@ -82,16 +82,28 @@ typedef enum nsCharType nsCharType;
/**
* Given a UTF-32 codepoint
* return true if the codepoint is a Bidi control character (LRE, RLE, PDF, LRO, RLO, LRM, RLM)
* return false, otherwise
* return true if the codepoint is a Bidi control character (LRM, RLM, ALM;
* LRE, RLE, PDF, LRO, RLO; LRI, RLI, FSI, PDI).
* Return false, otherwise
*/
bool IsBidiControl(uint32_t aChar);
#define LRM_CHAR 0x200e  // U+200E LEFT-TO-RIGHT MARK
#define LRE_CHAR 0x202a  // U+202A LEFT-TO-RIGHT EMBEDDING
#define RLO_CHAR 0x202e  // U+202E RIGHT-TO-LEFT OVERRIDE
#define LRI_CHAR 0x2066  // U+2066 LEFT-TO-RIGHT ISOLATE
#define PDI_CHAR 0x2069  // U+2069 POP DIRECTIONAL ISOLATE
#define ALM_CHAR 0x061C  // U+061C ARABIC LETTER MARK
// Classifies aChar as a bidi control character: the marks LRM, RLM and ALM,
// the embedding/override range LRE..RLO, and the isolate range LRI..PDI.
inline bool IsBidiControl(uint32_t aChar) {
  // LRM (U+200E) and RLM (U+200F) differ only in the lowest bit, so clearing
  // that single bit matches both. The mask keeps all other 31 bits so that
  // out-of-range values such as 0x0100200E are not misclassified.
  return ((LRE_CHAR <= aChar && aChar <= RLO_CHAR) ||
          (LRI_CHAR <= aChar && aChar <= PDI_CHAR) ||
          (aChar == ALM_CHAR) ||
          (aChar & 0xfffffffeu) == LRM_CHAR);
}
/**
* Given an nsString.
* @return true if the string contains right-to-left characters
*/
bool HasRTLChars(const nsAString& aString);
bool HasRTLChars(const nsAString& aString);
// --------------------------------------------------
// IBMBIDI
@ -202,8 +214,6 @@ typedef enum nsCharType nsCharType;
(c) >= 0x08a0 ) )
#define IS_ARABIC_ALPHABETIC(c) (IS_ARABIC_CHAR(c) && \
!(IS_HINDI_DIGIT(c) || IS_FARSI_DIGIT(c) || IS_ARABIC_SEPARATOR(c)))
#define IS_BIDI_CONTROL_CHAR(c) (((0x202a <= (c)) && ((c) <= 0x202e)) \
|| ((c) == 0x200e) || ((c) == 0x200f))
/**
* The codepoint ranges in the following macros are based on the blocks

View File

@ -21,11 +21,7 @@ static bool IsDiscardable(PRUnichar ch, uint32_t* aFlags)
*aFlags |= nsTextFrameUtils::TEXT_HAS_SHY;
return true;
}
if ((ch & 0xFF00) != 0x2000) {
// Not a Bidi control character
return false;
}
return IS_BIDI_CONTROL_CHAR(ch);
return IsBidiControl(ch);
}
static bool IsDiscardable(uint8_t ch, uint32_t* aFlags)

View File

@ -17,11 +17,6 @@ struct nsStyleText;
#define CH_SHY 173
#define CH_CJKSP 12288 // U+3000 IDEOGRAPHIC SPACE (CJK Full-Width Space)
#define CH_LRM 8206 //<!ENTITY lrm CDATA "&#8206;" -- left-to-right mark, U+200E NEW RFC 2070 -->
#define CH_RLM 8207 //<!ENTITY rlm CDATA "&#8207;" -- right-to-left mark, U+200F NEW RFC 2070 -->
#define CH_LRE 8234 //<!CDATA "&#8234;" -- left-to-right embedding, U+202A -->
#define CH_RLO 8238 //<!CDATA "&#8238;" -- right-to-left override, U+202E -->
class nsTextFrameUtils {
public:
// These constants are used as textrun flags for textframe textruns.

View File

@ -0,0 +1,13 @@
<!DOCTYPE HTML>
<html>
<head>
<meta charset="utf-8">
<title>ALM character</title>
<style type="text/css">
p { font-size: 2em; }
</style>
<!--
  Static page whose paragraph is the plain text "a2+1" at 2em, with no
  bidi control characters in it.
  NOTE(review): presumably this is the reference rendering that the
  U+061C ARABIC LETTER MARK test page is compared against; confirm
  against the reftest list entry for bug 922530.
-->
</head>
<body>
<p>a2+1</p>
</body>
</html>

View File

@ -0,0 +1,25 @@
<!DOCTYPE HTML>
<html>
<head>
<meta charset="utf-8">
<title>ALM character</title>
<style type="text/css">
p { font-size: 2em; }
</style>
<!--
Test that U+061C functions as an invisible Arabic character.
In the test text it is followed by two European numbers with a
European separator between them. By rules W2, W3 and W6 of the
Bidi algorithm (http://unicode.org/reports/tr9/#W2) the
European numbers change to Arabic numbers, the ARABIC LETTER
MARK changes to R and the separator changes to Other Neutral,
and then by rule N1 the Arabic numbers make the separator
change to R, so the final visual order is "a2+1&#x061c;", but
the &#x061c; itself should not be visible.
-->
</head>
<body>
<p>a&#x061c;1+2</p>
</body>
</html>

View File

@ -138,3 +138,4 @@ skip-if(B2G) == 726420-1.html 726420-1-ref.html
== 746987-4.html 746987-4-ref.html
== 779003-1.html 779003-1-ref.html
== 779003-1-dynamic.html 779003-1-ref.html
== 922530-1.html 922530-1-ref.html

View File

@ -12,6 +12,9 @@ const testcases = [
// non-XID character
["I♥NY", "xn--iny-zx5a", false, false, false],
// new non-XID character in Unicode 6.3
["حلا\u061cل", "xn--bgbvr6gc", false, false, false],
// U+30FB KATAKANA MIDDLE DOT is excluded from non-XID characters (bug 857490)
["乾燥肌・石けん", "xn--08j4gylj12hz80b0uhfup", false, true, true],

View File

@ -1127,11 +1127,18 @@ nsExternalAppHandler::nsExternalAppHandler(nsIMIMEInfo * aMIMEInfo,
// Remove unsafe bidi characters which might have spoofing implications (bug 511521).
const PRUnichar unsafeBidiCharacters[] = {
PRUnichar(0x061c), // Arabic Letter Mark
PRUnichar(0x200e), // Left-to-Right Mark
PRUnichar(0x200f), // Right-to-Left Mark
PRUnichar(0x202a), // Left-to-Right Embedding
PRUnichar(0x202b), // Right-to-Left Embedding
PRUnichar(0x202c), // Pop Directional Formatting
PRUnichar(0x202d), // Left-to-Right Override
PRUnichar(0x202e) // Right-to-Left Override
PRUnichar(0x202e), // Right-to-Left Override
PRUnichar(0x2066), // Left-to-Right Isolate
PRUnichar(0x2067), // Right-to-Left Isolate
PRUnichar(0x2068), // First Strong Isolate
PRUnichar(0x2069) // Pop Directional Isolate
};
for (uint32_t i = 0; i < ArrayLength(unsafeBidiCharacters); ++i) {
mSuggestedFileName.ReplaceChar(unsafeBidiCharacters[i], '_');