Bug 724538 - When ICU is available in the build, replace most of nsCharProps2 fields with ICU property accessors. r=emk

This commit is contained in:
Jonathan Kew 2016-01-13 15:45:22 +00:00
parent d75819d824
commit 5d8aeb6596
6 changed files with 220 additions and 268 deletions

View File

@ -1625,9 +1625,14 @@ gfxPangoFontGroup::FindFontForChar(uint32_t aCh, uint32_t aPrevCh,
nextFont = 1;
}
// Pango, GLib, and Thebes (but not harfbuzz!) all happen to use the same
// script codes, so we can just cast the value here.
const PangoScript script = static_cast<PangoScript>(aRunScript);
// Our MOZ_SCRIPT_* codes may not match the PangoScript enumeration values
// (if we're using ICU's codes), so convert by mapping through ISO 15924 tag.
// Note that PangoScript is defined to be compatible with GUnicodeScript:
// https://developer.gnome.org/pango/stable/pango-Scripts-and-Languages.html#PangoScript
const hb_tag_t scriptTag = GetScriptTagForCode(aRunScript);
const PangoScript script =
(const PangoScript)g_unicode_script_from_iso15924(scriptTag);
// Might be nice to call pango_language_includes_script only once for the
// run rather than for each character.
PangoLanguage *scriptLang;
@ -1654,19 +1659,6 @@ gfxPangoFontGroup::FindFontForChar(uint32_t aCh, uint32_t aPrevCh,
return nullptr;
}
// Sanity-check: spot-check a few constants to confirm that Thebes and
// Pango script codes really do match
#define CHECK_SCRIPT_CODE(script) \
PR_STATIC_ASSERT(int32_t(MOZ_SCRIPT_##script) == \
int32_t(PANGO_SCRIPT_##script))
CHECK_SCRIPT_CODE(COMMON);
CHECK_SCRIPT_CODE(INHERITED);
CHECK_SCRIPT_CODE(ARABIC);
CHECK_SCRIPT_CODE(LATIN);
CHECK_SCRIPT_CODE(UNKNOWN);
CHECK_SCRIPT_CODE(NKO);
/**
** gfxFcFont
**/

View File

@ -158,19 +158,11 @@ gfxScriptItemizer::Next(uint32_t& aRunStart, uint32_t& aRunLimit,
}
}
// Get the nsCharProps2 record for the current character,
// so we can read the script and (if needed) the gen category
// without needing to do two multi-level lookups.
// NOTE that this means we're relying on an implementation detail
// of the nsUnicodeProperties tables, and might have to revise this
// if the nsCharProps records used there are modified in future.
const nsCharProps2& charProps = GetCharProps2(ch);
// Initialize gc to UNASSIGNED; we'll only set it to the true GC
// if the character has script=COMMON, otherwise we don't care.
uint8_t gc = HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
sc = charProps.mScriptCode;
sc = GetScriptCode(ch);
if (sc == MOZ_SCRIPT_COMMON) {
/*
* Paired character handling:
@ -183,7 +175,7 @@ gfxScriptItemizer::Next(uint32_t& aRunStart, uint32_t& aRunLimit,
* We only do this if the script is COMMON; for chars with
* specific script assignments, we just use them as-is.
*/
gc = charProps.mCategory;
GetGeneralCategory(ch);
if (gc == HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION) {
uint32_t endPairChar = mozilla::unicode::GetMirroredChar(ch);
if (endPairChar != ch) {

View File

@ -40,8 +40,9 @@
#
# (2) Run this tool using a command line of the form
#
# perl genUnicodePropertyData.pl \
# /path/to/harfbuzz/src \
# perl genUnicodePropertyData.pl \
# /path/to/harfbuzz/src \
# /path/to/icu/common/unicode \
# /path/to/UCD-directory
#
# This will generate (or overwrite!) the files
@ -54,15 +55,17 @@
use strict;
use List::Util qw(first);
if ($#ARGV != 1) {
if ($#ARGV != 2) {
print <<__EOT;
# Run this tool using a command line of the form
#
# perl genUnicodePropertyData.pl \\
# /path/to/harfbuzz/src \\
# perl genUnicodePropertyData.pl \\
# /path/to/harfbuzz/src \\
# /path/to/icu/common/unicode \\
# /path/to/UCD-directory
#
# where harfbuzz/src is the directory containing harfbuzz .cc and .hh files,
# icu/common/unicode is the directory containing ICU 'common' public headers,
# and UCD-directory is a directory containing the current Unicode Character
# Database files (UnicodeData.txt, etc), available from
# http://www.unicode.org/Public/UNIDATA/, with additional resources as
@ -78,190 +81,20 @@ __EOT
exit 0;
}
# load HB_Script and HB_Category constants
my $HARFBUZZ = $ARGV[0];
my $ICU = $ARGV[1];
my $UNICODE = $ARGV[2];
# NOTE that HB_SCRIPT_* constants are now "tag" values, NOT sequentially-allocated
# script codes as used by Glib/Pango/etc.
# We therefore define a set of MOZ_SCRIPT_* constants that are script _codes_
# compatible with those libraries, and map these to HB_SCRIPT_* _tags_ as needed.
# load HB_Category constants
# CHECK that this matches Pango source (as found for example at
# http://git.gnome.org/browse/pango/tree/pango/pango-script.h)
# for as many codes as that defines (currently up through Unicode 5.1)
# and the GLib enumeration
# http://developer.gnome.org/glib/2.30/glib-Unicode-Manipulation.html#GUnicodeScript
# (currently defined up through Unicode 6.0).
# Constants beyond these may be regarded as unstable for now, but we don't actually
# depend on the specific values.
my %scriptCode = (
INVALID => -1,
COMMON => 0,
INHERITED => 1,
ARABIC => 2,
ARMENIAN => 3,
BENGALI => 4,
BOPOMOFO => 5,
CHEROKEE => 6,
COPTIC => 7,
CYRILLIC => 8,
DESERET => 9,
DEVANAGARI => 10,
ETHIOPIC => 11,
GEORGIAN => 12,
GOTHIC => 13,
GREEK => 14,
GUJARATI => 15,
GURMUKHI => 16,
HAN => 17,
HANGUL => 18,
HEBREW => 19,
HIRAGANA => 20,
KANNADA => 21,
KATAKANA => 22,
KHMER => 23,
LAO => 24,
LATIN => 25,
MALAYALAM => 26,
MONGOLIAN => 27,
MYANMAR => 28,
OGHAM => 29,
OLD_ITALIC => 30,
ORIYA => 31,
RUNIC => 32,
SINHALA => 33,
SYRIAC => 34,
TAMIL => 35,
TELUGU => 36,
THAANA => 37,
THAI => 38,
TIBETAN => 39,
CANADIAN_ABORIGINAL => 40,
YI => 41,
TAGALOG => 42,
HANUNOO => 43,
BUHID => 44,
TAGBANWA => 45,
# unicode 4.0 additions
BRAILLE => 46,
CYPRIOT => 47,
LIMBU => 48,
OSMANYA => 49,
SHAVIAN => 50,
LINEAR_B => 51,
TAI_LE => 52,
UGARITIC => 53,
# unicode 4.1 additions
NEW_TAI_LUE => 54,
BUGINESE => 55,
GLAGOLITIC => 56,
TIFINAGH => 57,
SYLOTI_NAGRI => 58,
OLD_PERSIAN => 59,
KHAROSHTHI => 60,
# unicode 5.0 additions
UNKNOWN => 61,
BALINESE => 62,
CUNEIFORM => 63,
PHOENICIAN => 64,
PHAGS_PA => 65,
NKO => 66,
# unicode 5.1 additions
KAYAH_LI => 67,
LEPCHA => 68,
REJANG => 69,
SUNDANESE => 70,
SAURASHTRA => 71,
CHAM => 72,
OL_CHIKI => 73,
VAI => 74,
CARIAN => 75,
LYCIAN => 76,
LYDIAN => 77,
# unicode 5.2 additions
AVESTAN => 78,
BAMUM => 79,
EGYPTIAN_HIEROGLYPHS => 80,
IMPERIAL_ARAMAIC => 81,
INSCRIPTIONAL_PAHLAVI => 82,
INSCRIPTIONAL_PARTHIAN => 83,
JAVANESE => 84,
KAITHI => 85,
LISU => 86,
MEETEI_MAYEK => 87,
OLD_SOUTH_ARABIAN => 88,
OLD_TURKIC => 89,
SAMARITAN => 90,
TAI_THAM => 91,
TAI_VIET => 92,
# unicode 6.0 additions
BATAK => 93,
BRAHMI => 94,
MANDAIC => 95,
# unicode 6.1 additions
CHAKMA => 96,
MEROITIC_CURSIVE => 97,
MEROITIC_HIEROGLYPHS => 98,
MIAO => 99,
SHARADA => 100,
SORA_SOMPENG => 101,
TAKRI => 102,
# unicode 7.0 additions
BASSA_VAH => 103,
CAUCASIAN_ALBANIAN => 104,
DUPLOYAN => 105,
ELBASAN => 106,
GRANTHA => 107,
KHOJKI => 108,
KHUDAWADI => 109,
LINEAR_A => 110,
MAHAJANI => 111,
MANICHAEAN => 112,
MENDE_KIKAKUI => 113,
MODI => 114,
MRO => 115,
NABATAEAN => 116,
OLD_NORTH_ARABIAN => 117,
OLD_PERMIC => 118,
PAHAWH_HMONG => 119,
PALMYRENE => 120,
PAU_CIN_HAU => 121,
PSALTER_PAHLAVI => 122,
SIDDHAM => 123,
TIRHUTA => 124,
WARANG_CITI => 125,
# unicode 8.0 additions
AHOM => 126,
ANATOLIAN_HIEROGLYPHS => 127,
HATRAN => 128,
MULTANI => 129,
OLD_HUNGARIAN => 130,
SIGNWRITING => 131,
# additional "script" code, not from Unicode (but matches ISO 15924's Zmth tag)
MATHEMATICAL_NOTATION => 132,
);
my $sc = -1;
my $cc = -1;
my %catCode;
my @scriptCodeToTag;
my @scriptCodeToName;
sub readHarfBuzzHeader
{
my $file = shift;
open FH, "< $ARGV[0]/$file" or die "can't open harfbuzz header $ARGV[0]/$file\n";
open FH, "< $HARFBUZZ/$file" or die "can't open harfbuzz header $HARFBUZZ/$file\n";
while (<FH>) {
s/CANADIAN_SYLLABICS/CANADIAN_ABORIGINAL/; # harfbuzz and unicode disagree on this name :(
if (m/HB_SCRIPT_([A-Z_]+)\s*=\s*HB_TAG\s*\(('.','.','.','.')\)\s*,/) {
unless (exists $scriptCode{$1}) {
warn "unknown script name $1 found in $file\n";
next;
}
$sc = $scriptCode{$1};
$scriptCodeToTag[$sc] = $2;
$scriptCodeToName[$sc] = $1;
}
if (m/HB_UNICODE_GENERAL_CATEGORY_([A-Z_]+)/) {
$cc++;
$catCode{$1} = $cc;
@ -270,16 +103,40 @@ sub readHarfBuzzHeader
close FH;
}
&readHarfBuzzHeader("hb-common.h");
&readHarfBuzzHeader("hb-unicode.h");
die "didn't find HarfBuzz script codes\n" if $sc == -1;
die "didn't find HarfBuzz category codes\n" if $cc == -1;
# Additional code not present in HarfBuzz headers:
$sc = $scriptCode{"MATHEMATICAL_NOTATION"};
$scriptCodeToTag[$sc] = "'Z','m','t','h'";
$scriptCodeToName[$sc] = "MATHEMATICAL_NOTATION";
my %scriptCode;
my @scriptCodeToTag;
my @scriptCodeToName;
my $sc = -1;
# Scan one ICU public header for USCRIPT_* enumerators.
#
# Argument: the header's file name, resolved relative to the $ICU directory.
#
# Every line of the form
#     USCRIPT_<NAME> = <number>, /* <Abcd> */
# is recorded in three places: %scriptCode{NAME} gets the number, and
# @scriptCodeToTag / @scriptCodeToName get the four-letter tag and the name
# at that numeric index.  $sc is left holding the most recent number matched,
# so it remains -1 (its initial value) if no enumerator was found.
#
# Dies if the header cannot be opened.
sub readIcuHeader
{
    my $file = shift;

    # Three-argument open on a lexical handle: the path is never parsed for
    # mode characters, and the handle cannot clash with the bareword FH that
    # the rest of this script uses.
    open my $fh, '<', "$ICU/$file" or die "can't open ICU header $ICU/$file\n";

    while (<$fh>) {
        # adjust for ICU vs UCD naming discrepancies
        s/LANNA/TAI_THAM/;
        s/MEITEI_MAYEK/MEETEI_MAYEK/;
        s/ORKHON/OLD_TURKIC/;
        s/MENDE/MENDE_KIKAKUI/;
        s/SIGN_WRITING/SIGNWRITING/;

        # $1 = script name, $2 = numeric code, $3 = four-letter tag taken
        # from the trailing comment.
        if (m|USCRIPT_([A-Z_]+)\s*=\s*([0-9]+),\s*/\*\s*([A-Z][a-z]{3})\s*\*/|) {
            $sc = $2;
            $scriptCode{$1} = $sc;
            $scriptCodeToTag[$sc] = $3;
            $scriptCodeToName[$sc] = $1;
        }
    }

    close $fh;
}
&readIcuHeader("uscript.h");
die "didn't find ICU script codes\n" if $sc == -1;
my %xidmodCode = (
'Recommended' => 0,
@ -317,9 +174,9 @@ my %bidicategoryCode = (
"PDF" => "16", # Pop Directional Format
"NSM" => "17", # Non-Spacing Mark
"BN" => "18", # Boundary Neutral
"LRI" => "19", # Left-to-Right Isolate
"RLI" => "20", # Right-to-left Isolate
"FSI" => "21", # First Strong Isolate
"FSI" => "19", # First Strong Isolate
"LRI" => "20", # Left-to-Right Isolate
"RLI" => "21", # Right-to-left Isolate
"PDI" => "22" # Pop Direcitonal Isolate
);
@ -404,7 +261,7 @@ my %ucd2hb = (
# read ReadMe.txt
my @versionInfo;
open FH, "< $ARGV[1]/ReadMe.txt" or die "can't open Unicode ReadMe.txt file\n";
open FH, "< $UNICODE/ReadMe.txt" or die "can't open Unicode ReadMe.txt file\n";
while (<FH>) {
chomp;
push @versionInfo, $_;
@ -418,7 +275,7 @@ my $kLowerToUpper = 0x10000000;
my $kCaseMapCharMask = 0x001fffff;
# read UnicodeData.txt
open FH, "< $ARGV[1]/UnicodeData.txt" or die "can't open UCD file UnicodeData.txt\n";
open FH, "< $UNICODE/UnicodeData.txt" or die "can't open UCD file UnicodeData.txt\n";
while (<FH>) {
chomp;
my @fields = split /;/;
@ -490,7 +347,7 @@ while (<FH>) {
close FH;
# read Scripts.txt
open FH, "< $ARGV[1]/Scripts.txt" or die "can't open UCD file Scripts.txt\n";
open FH, "< $UNICODE/Scripts.txt" or die "can't open UCD file Scripts.txt\n";
push @versionInfo, "";
while (<FH>) {
chomp;
@ -500,8 +357,8 @@ while (<FH>) {
while (<FH>) {
if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^ ]+)/) {
my $script = uc($3);
warn "unknown script $script" unless exists $scriptCode{$script};
$script = $scriptCode{$script};
warn "unknown ICU script $script" unless exists $scriptCode{$script};
# Reassign the $script declared a few lines up (script name -> numeric code);
# a second "my" in the same scope would mask that earlier declaration.
$script = $scriptCode{$script};
my $start = hex "0x$1";
my $end = (defined $2) ? hex "0x$2" : $start;
for (my $i = $start; $i <= $end; ++$i) {
@ -515,7 +372,7 @@ close FH;
my @offsets = ();
push @offsets, 0;
open FH, "< $ARGV[1]/BidiMirroring.txt" or die "can't open UCD file BidiMirroring.txt\n";
open FH, "< $UNICODE/BidiMirroring.txt" or die "can't open UCD file BidiMirroring.txt\n";
push @versionInfo, "";
while (<FH>) {
chomp;
@ -543,7 +400,7 @@ my %pairedBracketTypeCode = (
'O' => 1,
'C' => 2
);
open FH, "< $ARGV[1]/BidiBrackets.txt" or die "can't open UCD file BidiBrackets.txt\n";
open FH, "< $UNICODE/BidiBrackets.txt" or die "can't open UCD file BidiBrackets.txt\n";
push @versionInfo, "";
while (<FH>) {
chomp;
@ -570,7 +427,7 @@ my %hangulType = (
'LV' => 0x03,
'LVT' => 0x07
);
open FH, "< $ARGV[1]/HangulSyllableType.txt" or die "can't open UCD file HangulSyllableType.txt\n";
open FH, "< $UNICODE/HangulSyllableType.txt" or die "can't open UCD file HangulSyllableType.txt\n";
push @versionInfo, "";
while (<FH>) {
chomp;
@ -593,7 +450,7 @@ while (<FH>) {
close FH;
# read xidmodifications.txt
open FH, "< $ARGV[1]/security/xidmodifications.txt" or die "can't open UCD file xidmodifications.txt\n";
open FH, "< $UNICODE/security/xidmodifications.txt" or die "can't open UCD file xidmodifications.txt\n";
push @versionInfo, "";
while (<FH>) {
chomp;
@ -616,7 +473,7 @@ while (<FH>) {
}
close FH;
open FH, "< $ARGV[1]/Unihan_Variants.txt" or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip)\n";
open FH, "< $UNICODE/Unihan_Variants.txt" or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip)\n";
push @versionInfo, "";
while (<FH>) {
chomp;
@ -653,7 +510,7 @@ while (<FH>) {
close FH;
# read VerticalOrientation-13.txt
open FH, "< $ARGV[1]/vertical/VerticalOrientation-13.txt" or die "can't open UTR50 data file VerticalOrientation-13.txt\n";
open FH, "< $UNICODE/vertical/VerticalOrientation-13.txt" or die "can't open UTR50 data file VerticalOrientation-13.txt\n";
push @versionInfo, "";
while (<FH>) {
chomp;
@ -732,21 +589,25 @@ $versionInfo
__END
print DATA_TABLES "#if !ENABLE_INTL_API\n";
print DATA_TABLES "static const uint32_t sScriptCodeToTag[] = {\n";
for (my $i = 0; $i < scalar @scriptCodeToTag; ++$i) {
printf DATA_TABLES " HB_TAG(%s)", $scriptCodeToTag[$i];
printf DATA_TABLES " HB_TAG('%c','%c','%c','%c')", unpack('cccc', $scriptCodeToTag[$i]);
print DATA_TABLES $i < $#scriptCodeToTag ? ",\n" : "\n";
}
print DATA_TABLES "};\n\n";
print DATA_TABLES "};\n";
print DATA_TABLES "#endif\n\n";
our $totalData = 0;
print DATA_TABLES "#if !ENABLE_INTL_API\n";
print DATA_TABLES "static const int16_t sMirrorOffsets[] = {\n";
for (my $i = 0; $i < scalar @offsets; ++$i) {
printf DATA_TABLES " $offsets[$i]";
print DATA_TABLES $i < $#offsets ? ",\n" : "\n";
}
print DATA_TABLES "};\n\n";
print DATA_TABLES "};\n";
print DATA_TABLES "#endif\n\n";
print HEADER "#pragma pack(1)\n\n";
@ -762,11 +623,26 @@ struct nsCharProps1 {
unsigned char mCombiningClass:8;
};
/;
print DATA_TABLES "#ifndef ENABLE_INTL_API\n";
&genTables("CharProp1", $type, "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);
print DATA_TABLES "#endif\n\n";
&genTables("#if !ENABLE_INTL_API", "#endif",
"CharProp1", $type, "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);
sub sprintCharProps2
# Build the three-member initializer text "{a,b,c}," for one entry.
#
# Argument: the index used to look up @pairedBracketType,
# @verticalOrientation and @xidmod (presumably a code point -- the
# caller is not visible here).
# Returns: the formatted string, with the values in that order.
sub sprintCharProps2_short
{
    my ($usv) = @_;
    my @members = (
        $pairedBracketType[$usv],
        $verticalOrientation[$usv],
        $xidmod[$usv],
    );
    return sprintf("{%d,%d,%d},", @members);
}
$type = q/
struct nsCharProps2 {
unsigned char mPairedBracketType:2;
unsigned char mVertOrient:2;
unsigned char mXidmod:4;
};
/;
&genTables("#if ENABLE_INTL_API", "#endif",
"CharProp2", $type, "nsCharProps2", 9, 7, \&sprintCharProps2_short, 16, 1, 1);
sub sprintCharProps2_full
{
my $usv = shift;
return sprintf("{%d,%d,%d,%d,%d,%d,%d},",
@ -785,7 +661,8 @@ struct nsCharProps2 {
unsigned char mVertOrient:2;
};
|;
&genTables("CharProp2", $type, "nsCharProps2", 11, 5, \&sprintCharProps2, 16, 4, 1);
&genTables("#if !ENABLE_INTL_API", "#endif",
"CharProp2", $type, "nsCharProps2", 11, 5, \&sprintCharProps2_full, 16, 4, 1);
print HEADER "#pragma pack()\n\n";
@ -800,21 +677,22 @@ sub sprintHanVariants
}
return sprintf("0x%02x,", $val);
}
&genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
## Han Variant data currently unused but may be needed in future, see bug 857481
## &genTables("", "", "HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
# Format one @fullWidth entry as a four-digit hex literal followed by a comma.
#
# Argument: the index into @fullWidth.
# Returns: a string such as "0x0000,".
sub sprintFullWidth
{
    my ($usv) = @_;
    my $entry = $fullWidth[$usv];
    return sprintf("0x%04x,", $entry);
}
&genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
&genTables("", "", "FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
# Format one @casemap entry as an eight-digit hex literal followed by a comma.
#
# Argument: the index into @casemap.
# Returns: a string such as "0x00000000,".
sub sprintCasemap
{
    my ($usv) = @_;
    my $entry = $casemap[$usv];
    return sprintf("0x%08x,", $entry);
}
&genTables("CaseMap", "", "uint32_t", 11, 5, \&sprintCasemap, 1, 4, 1);
&genTables("", "", "CaseMap", "", "uint32_t", 11, 5, \&sprintCasemap, 1, 4, 1);
print STDERR "Total data = $totalData\n";
@ -826,8 +704,16 @@ printf DATA_TABLES "const uint32_t kCaseMapCharMask = 0x%08x;\n\n", $kCaseMapCha
sub genTables
{
my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;
my ($guardBegin, $guardEnd,
$prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;
if ($typedef ne '') {
print HEADER "$guardBegin\n";
print HEADER "$typedef\n";
print HEADER "$guardEnd\n\n";
}
print DATA_TABLES "\n$guardBegin\n";
print DATA_TABLES "#define k${prefix}MaxPlane $maxPlane\n";
print DATA_TABLES "#define k${prefix}IndexBits $indexBits\n";
print DATA_TABLES "#define k${prefix}CharBits $charBits\n";
@ -888,8 +774,6 @@ sub genTables
}
print DATA_TABLES "};\n\n";
print HEADER "$typedef\n\n" if $typedef ne '';
my $pageLen = $charsPerPage / $charsPerEntry;
print DATA_TABLES "static const $type s${prefix}Values[$chCount][$pageLen] = {\n";
for (my $i = 0; $i < scalar @char; ++$i) {
@ -897,7 +781,8 @@ sub genTables
print DATA_TABLES $char[$i];
print DATA_TABLES $i < $#char ? "},\n" : "}\n";
}
print DATA_TABLES "};\n\n";
print DATA_TABLES "};\n";
print DATA_TABLES "$guardEnd\n";
my $dataSize = $pmCount * $indexLen * $pmBits/8 +
$chCount * $pageLen * $bytesPerEntry +

View File

@ -14,7 +14,8 @@
* for the detailed definition of the following categories
*
* The values here must match the equivalents in %bidicategoryCode in
* mozilla/intl/unicharutil/tools/genUnicodePropertyData.pl
* mozilla/intl/unicharutil/tools/genUnicodePropertyData.pl,
* and must also match the values used by ICU's UCharDirection.
*/
enum nsCharType {
@ -37,9 +38,9 @@ enum nsCharType {
eCharType_PopDirectionalFormat = 16,
eCharType_DirNonSpacingMark = 17,
eCharType_BoundaryNeutral = 18,
eCharType_LeftToRightIsolate = 19,
eCharType_RightToLeftIsolate = 20,
eCharType_FirstStrongIsolate = 21,
eCharType_FirstStrongIsolate = 19,
eCharType_LeftToRightIsolate = 20,
eCharType_RightToLeftIsolate = 21,
eCharType_PopDirectionalIsolate = 22,
eCharType_CharTypeCount
};

View File

@ -11,12 +11,12 @@
#if ENABLE_INTL_API
#include "unicode/uchar.h"
#include "unicode/uscript.h"
#endif
#define UNICODE_BMP_LIMIT 0x10000
#define UNICODE_LIMIT 0x110000
#ifndef ENABLE_INTL_API
static const nsCharProps1&
GetCharProps1(uint32_t aCh)
@ -56,14 +56,21 @@ GetCharProps2(uint32_t aCh)
NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range");
// Default values for unassigned
using namespace mozilla::unicode;
static const nsCharProps2 undefined = {
MOZ_SCRIPT_UNKNOWN, // Script code
0, // East Asian Width
HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // General Category
eCharType_LeftToRight, // Bidi Category
mozilla::unicode::XIDMOD_NOT_CHARS, // Xidmod
-1, // Numeric Value
mozilla::unicode::HVT_NotHan // Han variant
#if ENABLE_INTL_API
PAIRED_BRACKET_TYPE_NONE,
VERTICAL_ORIENTATION_R,
XIDMOD_NOT_CHARS
#else
MOZ_SCRIPT_UNKNOWN,
PAIRED_BRACKET_TYPE_NONE,
HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED,
eCharType_LeftToRight,
XIDMOD_NOT_CHARS,
-1, // Numeric Value
VERTICAL_ORIENTATION_R
#endif
};
return undefined;
}
@ -93,7 +100,7 @@ to provide the most compact storage, depending on the distribution
of values.
*/
nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = {
const nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = {
/*
* The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
* of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-unicode.h.
@ -130,6 +137,69 @@ nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = {
/* SPACE_SEPARATOR */ nsIUGenCategory::kSeparator
};
// Use "#if" rather than "#ifdef" so this guard agrees with the one around the
// ICU header includes and with the functions that index this table; with
// "#ifdef" a build that defines ENABLE_INTL_API as 0 would compile the table
// without U_CHAR_CATEGORY_COUNT being declared.
#if ENABLE_INTL_API
/*
 * Maps ICU general-category values (the index, as returned by u_charType)
 * to the corresponding HB_UNICODE_GENERAL_CATEGORY_* constant.
 * The trailing comment on each row names the ICU enumerator for that index.
 */
const hb_unicode_general_category_t sICUtoHBcategory[U_CHAR_CATEGORY_COUNT] = {
  HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // U_GENERAL_OTHER_TYPES = 0,
  HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER, // U_UPPERCASE_LETTER = 1,
  HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER, // U_LOWERCASE_LETTER = 2,
  HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER, // U_TITLECASE_LETTER = 3,
  HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER, // U_MODIFIER_LETTER = 4,
  HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER, // U_OTHER_LETTER = 5,
  HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK, // U_NON_SPACING_MARK = 6,
  HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK, // U_ENCLOSING_MARK = 7,
  HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK, // U_COMBINING_SPACING_MARK = 8,
  HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER, // U_DECIMAL_DIGIT_NUMBER = 9,
  HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER, // U_LETTER_NUMBER = 10,
  HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER, // U_OTHER_NUMBER = 11,
  HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR, // U_SPACE_SEPARATOR = 12,
  HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR, // U_LINE_SEPARATOR = 13,
  HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR, // U_PARAGRAPH_SEPARATOR = 14,
  HB_UNICODE_GENERAL_CATEGORY_CONTROL, // U_CONTROL_CHAR = 15,
  HB_UNICODE_GENERAL_CATEGORY_FORMAT, // U_FORMAT_CHAR = 16,
  HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE, // U_PRIVATE_USE_CHAR = 17,
  HB_UNICODE_GENERAL_CATEGORY_SURROGATE, // U_SURROGATE = 18,
  HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION, // U_DASH_PUNCTUATION = 19,
  HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION, // U_START_PUNCTUATION = 20,
  HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION, // U_END_PUNCTUATION = 21,
  HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION, // U_CONNECTOR_PUNCTUATION = 22,
  HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION, // U_OTHER_PUNCTUATION = 23,
  HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL, // U_MATH_SYMBOL = 24,
  HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL, // U_CURRENCY_SYMBOL = 25,
  HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL, // U_MODIFIER_SYMBOL = 26,
  HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL, // U_OTHER_SYMBOL = 27,
  HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION, // U_INITIAL_PUNCTUATION = 28,
  HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION, // U_FINAL_PUNCTUATION = 29,
};
#endif
// Returns the detailed General Category of aCh as an
// HB_UNICODE_GENERAL_CATEGORY_* value.
uint8_t GetGeneralCategory(uint32_t aCh) {
#if !ENABLE_INTL_API
  // No ICU: read the category from our own generated property table.
  return GetCharProps2(aCh).mCategory;
#else
  // ICU build: ask ICU for its category, then translate it to the HarfBuzz
  // enumeration through the lookup table.
  const int8_t icuCategory = u_charType(aCh);
  return sICUtoHBcategory[icuCategory];
#endif
}
// Returns the bidi category of aCh as an nsCharType.
nsCharType GetBidiCat(uint32_t aCh) {
#if !ENABLE_INTL_API
  // No ICU: read the category from our own generated property table.
  const nsCharProps2& props = GetCharProps2(aCh);
  return nsCharType(props.mBidiCategory);
#else
  // ICU build: the ICU direction value is converted directly, with no
  // translation table.
  // NOTE(review): this assumes nsCharType's numbering equals ICU's
  // UCharDirection -- confirm against the enum definition.
  const UCharDirection direction = u_charDirection(aCh);
  return nsCharType(direction);
#endif
}
// Returns the numeric value of aCh, or -1 when (in the ICU build) its
// numeric type is neither decimal nor digit.
int8_t GetNumericValue(uint32_t aCh) {
#if !ENABLE_INTL_API
  // No ICU: read the value from our own generated property table.
  return GetCharProps2(aCh).mNumericValue;
#else
  switch (UNumericType(u_getIntPropertyValue(aCh, UCHAR_NUMERIC_TYPE))) {
    case U_NT_DECIMAL:
    case U_NT_DIGIT:
      // Only these two numeric types are reported to callers.
      return int8_t(u_getNumericValue(aCh));
    default:
      return -1;
  }
#endif
}
uint32_t
GetMirroredChar(uint32_t aCh)
{
@ -160,14 +230,30 @@ GetCombiningClass(uint32_t aCh)
#endif
}
// Returns the script code of aCh: ICU's value when ICU is available,
// otherwise the code stored in our own generated property table.
uint8_t
GetScriptCode(uint32_t aCh)
{
#if !ENABLE_INTL_API
  return GetCharProps2(aCh).mScriptCode;
#else
  // The status value is required by the ICU signature but is not inspected.
  UErrorCode status = U_ZERO_ERROR;
  const UScriptCode script = uscript_getScript(aCh, &status);
  return script;
#endif
}
// Converts a script code into its four-letter tag packed with HB_TAG.
// Returns 0 when the code has no tag (out of range or unknown to ICU).
uint32_t
GetScriptTagForCode(int32_t aScriptCode)
{
#if ENABLE_INTL_API
  const char* tag = uscript_getShortName(UScriptCode(aScriptCode));
  if (!tag) {
    // ICU has no short name for this code; don't dereference a null
    // pointer. 0 mirrors what the table-based path returns for a bad code.
    return 0;
  }
  return HB_TAG(tag[0], tag[1], tag[2], tag[3]);
#else
  // The unsigned cast turns negative codes into very large values, so a
  // single comparison rejects them too. The test must be ">=": an index
  // equal to the array length is already one past the last element.
  if (uint32_t(aScriptCode) >= ArrayLength(sScriptCodeToTag)) {
    return 0;
  }
  return sScriptCodeToTag[aScriptCode];
#endif
}
PairedBracketType GetPairedBracketType(uint32_t aCh)
@ -254,6 +340,7 @@ GetTitlecaseForAll(uint32_t aCh)
return aCh;
}
#if 0 // currently unused - bug 857481
HanVariantType
GetHanVariant(uint32_t aCh)
{
@ -272,6 +359,7 @@ GetHanVariant(uint32_t aCh)
// extract the appropriate 2-bit field from the value
return HanVariantType((v >> ((aCh & 3) * 2)) & 3);
}
#endif
uint32_t
GetFullWidth(uint32_t aCh)

View File

@ -16,7 +16,7 @@ namespace mozilla {
namespace unicode {
extern nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[];
extern const nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[];
// Return whether the char has a mirrored-pair counterpart.
uint32_t GetMirroredChar(uint32_t aCh);
@ -26,25 +26,19 @@ bool HasMirroredChar(uint32_t aChr);
uint8_t GetCombiningClass(uint32_t aCh);
// returns the detailed General Category in terms of HB_UNICODE_* values
inline uint8_t GetGeneralCategory(uint32_t aCh) {
return GetCharProps2(aCh).mCategory;
}
uint8_t GetGeneralCategory(uint32_t aCh);
// returns the simplified Gen Category as defined in nsIUGenCategory
inline nsIUGenCategory::nsUGenCategory GetGenCategory(uint32_t aCh) {
return sDetailedToGeneralCategory[GetGeneralCategory(aCh)];
}
inline uint8_t GetScriptCode(uint32_t aCh) {
return GetCharProps2(aCh).mScriptCode;
}
nsCharType GetBidiCat(uint32_t aCh);
uint8_t GetScriptCode(uint32_t aCh);
uint32_t GetScriptTagForCode(int32_t aScriptCode);
inline nsCharType GetBidiCat(uint32_t aCh) {
return nsCharType(GetCharProps2(aCh).mBidiCategory);
}
/* This MUST match the values assigned by genUnicodePropertyData.pl! */
enum VerticalOrientation {
VERTICAL_ORIENTATION_U = 0,
@ -93,10 +87,9 @@ inline XidmodType GetIdentifierModification(uint32_t aCh) {
* To restrict to decimal digits, the caller should also check whether
* GetGeneralCategory returns HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER
*/
inline int8_t GetNumericValue(uint32_t aCh) {
return GetCharProps2(aCh).mNumericValue;
}
int8_t GetNumericValue(uint32_t aCh);
#if 0 // currently unused - bug 857481
enum HanVariantType {
HVT_NotHan = 0x0,
HVT_SimplifiedOnly = 0x1,
@ -105,6 +98,7 @@ enum HanVariantType {
};
HanVariantType GetHanVariant(uint32_t aCh);
#endif
uint32_t GetFullWidth(uint32_t aCh);