bug 744357 - implement mappings from Unicode's SpecialCasing.txt for text-transform. r=smontagu

This commit is contained in:
Jonathan Kew 2012-04-24 18:53:39 +01:00
parent 8bd558debc
commit 15627b7207
6 changed files with 607 additions and 46 deletions

View File

@ -0,0 +1,287 @@
#!/usr/bin/env perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
# This tool is used to extract "special" (one-to-many) case mappings
# into a form that can be used by nsTextRunTransformations.
use strict;
# Exactly two arguments are required: UnicodeData.txt and SpecialCasing.txt.
# The generated C++ source is written to standard output (normally redirected
# into nsSpecialCasingData.cpp), so the usage text goes to STDERR and the exit
# status is non-zero: a mistaken invocation can then neither end up inside the
# generated file nor look like a successful run to a calling script.
if (@ARGV != 2) {
    print STDERR <<__EOT;
# Run this tool using a command line of the form
#
# perl genSpecialCasingData.pl UnicodeData.txt SpecialCasing.txt
#
# The nsSpecialCasingData.cpp file will be written to standard output.
#
# This tool will also write up-to-date versions of the test files
# all-{upper,lower,title}.html
# and corresponding -ref files in the current directory.
#
__EOT
    exit 1;
}
# Simple (one-to-one) case mappings read from UnicodeData.txt, keyed by the
# numeric code point; each value is the hex string taken from the file.
my %allLower;
my %allUpper;
my %allTitle;
# Canonical decomposition (space-separated hex code points) => the
# precomposed character (hex) that decomposes to it.
my %compositions;
# Third field (general category) of each character, keyed by code point.
my %gc;

# Three-argument open with a lexical handle: the file name is used verbatim
# (never parsed for mode characters or surrounding whitespace), and the
# reason for a failure is reported through $!.
open my $unicodeData, '<', $ARGV[0]
    or die "can't open $ARGV[0] (should be UnicodeData.txt): $!\n";
while (my $line = <$unicodeData>) {
    chomp $line;
    my @fields = split /;/, $line;
    next if ($fields[1] =~ /</); # ignore ranges etc
    my $usv = hex "0x$fields[0]";
    # Fields 12/13/14 are the simple upper/lower/title mappings; split drops
    # trailing empty fields, so a missing entry compares equal to ''.
    $allUpper{$usv} = $fields[12] if $fields[12] ne '';
    $allLower{$usv} = $fields[13] if $fields[13] ne '';
    $allTitle{$usv} = $fields[14] if $fields[14] ne '';
    $gc{$usv} = $fields[2];
    # we only care about non-singleton canonical decomps: skip empty ones,
    # tagged (compatibility) ones containing '<', and single code points
    my $decomp = $fields[5];
    next if $decomp eq '' or $decomp =~ /</ or not $decomp =~ / /;
    $compositions{$decomp} = sprintf("%04X", $usv);
}
close $unicodeData;
# One-to-many case mappings from SpecialCasing.txt, keyed by numeric code
# point; values are space-separated hex code points.
my %specialLower;
my %specialUpper;
my %specialTitle;
# Trailing comment of each accepted SpecialCasing.txt entry (used as the
# character's name in the generated output), keyed by numeric code point.
my %charName;
# "SpecialCasing-..." and "Date:" header comments, echoed into the output.
my @headerLines;

# Three-argument open with a lexical handle: the file name is used verbatim
# and the reason for a failure is reported through $!.
open my $specialCasing, '<', $ARGV[1]
    or die "can't open $ARGV[1] (should be SpecialCasing.txt): $!\n";
while (my $line = <$specialCasing>) {
    chomp $line;
    # Only trust $1 when the match succeeded; after a failed match it does
    # not describe the current line.
    my $comment = $line =~ m/#\s*(.+)$/ ? $1 : '';
    if ($comment =~ /^(SpecialCasing-|Date:)/) {
        push @headerLines, $comment;
        next;
    }
    # Strip the comment and the final field terminator, then skip lines
    # that have nothing left.
    $line =~ s/#.*//;
    $line =~ s/;\s*$//;
    next if $line eq '';
    my @fields = split /; */, $line;
    # Only entries with exactly four fields (code; lower; title; upper) are
    # used; entries carrying an extra condition field are skipped.
    next unless @fields == 4;
    my $usv = hex "0x$fields[0]";
    addIfSpecial(\%specialLower, $usv, $fields[1]);
    addIfSpecial(\%specialTitle, $usv, $fields[2]);
    addIfSpecial(\%specialUpper, $usv, $fields[3]);
    $charName{$usv} = $comment;
}
close $specialCasing;
# --- Write nsSpecialCasingData.cpp to standard output ---
# Prologue: license, provenance note and the includes the generated code uses.
print <<__END__;
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
/* Auto-generated from files in the Unicode Character Database
by genSpecialCasingData.pl - do not edit! */
#include "nsSpecialCasingData.h"
#include "mozilla/Util.h" // for ArrayLength
#include <stdlib.h> // for bsearch
__END__
# Echo the version/date header comments collected from SpecialCasing.txt.
foreach my $headerLine (@headerLines) {
    print "/* $headerLine */\n";
}
print "using mozilla::unicode::MultiCharMapping;\n";
# One table per kind of mapping, in the order Lower, Upper, Title.
foreach my $table (['Lower', \%specialLower],
                   ['Upper', \%specialUpper],
                   ['Title', \%specialTitle]) {
    printMappings(@$table);
}
# Lookup code: a bsearch comparator plus one accessor per table.
print <<__END__;
static int CompareMCM(const void* aKey, const void* aElement)
{
const PRUint32 ch = *static_cast<const PRUint32*>(aKey);
const MultiCharMapping* mcm = static_cast<const MultiCharMapping*>(aElement);
return int(ch) - int(mcm->mOriginalChar);
}
#define MAKE_SPECIAL_CASE_ACCESSOR(which) \\
const MultiCharMapping* \\
Special##which(PRUint32 aChar) \\
{ \\
const void* p = bsearch(&aChar, CaseSpecials_##which, \\
mozilla::ArrayLength(CaseSpecials_##which), \\
sizeof(MultiCharMapping), CompareMCM); \\
return static_cast<const MultiCharMapping*>(p); \\
}
namespace mozilla {
namespace unicode {
MAKE_SPECIAL_CASE_ACCESSOR(Lower)
MAKE_SPECIAL_CASE_ACCESSOR(Upper)
MAKE_SPECIAL_CASE_ACCESSOR(Title)
} // namespace unicode
} // namespace mozilla
__END__
# --- Write the test files ---
# Overlay the one-to-many mappings on the simple ones so that each test
# covers every character that has a mapping of either kind.
foreach my $pair ([\%allLower, \%specialLower],
                  [\%allUpper, \%specialUpper],
                  [\%allTitle, \%specialTitle]) {
    addSpecialsTo(@$pair);
}
# Font referenced by the @font-face rule of every generated test page.
my $testFont = "../fonts/dejavu-sans/DejaVuSans.ttf";
genTest('lower', \%allLower);
genTest('upper', \%allUpper);
genTitleTest();
# Print one C++ table, "CaseSpecials_<whichMapping>", to standard output.
#   $whichMapping - suffix of the table name ('Lower', 'Upper' or 'Title')
#   $hash         - ref to a hash of numeric code point => space-separated
#                   hex code points of its mapping
# Entries are emitted in ascending code point order. Each entry carries
# exactly three mapped values: shorter mappings are padded with 0x0000 and
# anything beyond the third code point is not emitted. The trailing comment
# is the character's entry in %charName.
sub printMappings {
    my ($whichMapping, $hash) = @_;
    print "static const MultiCharMapping CaseSpecials_${whichMapping}[] = {\n";
    for my $usv (sort { $a <=> $b } keys %$hash) {
        # Pad with empty strings so there are always three slots; an empty
        # slot becomes hex "0x0", i.e. 0.
        my @mapped = map { hex "0x0$_" }
                     (split(/ /, $hash->{$usv}), ('') x 3)[0 .. 2];
        printf " { 0x%04x, {0x%04x, 0x%04x, 0x%04x} }, // %s\n",
               $usv, @mapped, "$charName{$usv}";
    }
    print "};\n\n";
}
# Record a mapping in %$hash if it is "special", i.e. one-to-many.
#   $hash    - ref to the hash receiving the mapping
#   $usv     - numeric code point being mapped
#   $mapping - space-separated hex code points it maps to
# Mappings without a space (a single code point) are ignored. For the
# others, a leading sequence that is the canonical decomposition of some
# character (per %compositions) is replaced by that precomposed character
# before the mapping is stored.
sub addIfSpecial {
    my ($hash, $usv, $mapping) = @_;
    return unless $mapping =~ / /;
    # only do compositions that start with the initial char
    #
    # Repeat until nothing more matches, so the outcome does not depend on
    # the order in which the hash hands out its keys. Every substitution
    # replaces two or more code points by one, so this terminates.
    my $composed = 1;
    while ($composed) {
        $composed = 0;
        foreach my $decomp (keys %compositions) {
            # \Q..\E: match the key literally. The lookahead keeps a key from
            # matching only the leading digits of a longer code point.
            if ($mapping =~ s/^\Q$decomp\E(?![0-9A-Fa-f])/$compositions{$decomp}/) {
                $composed = 1;
            }
        }
    }
    $hash->{$usv} = $mapping;
}
# Copy every entry of %$specials into %$hash, replacing any entry that is
# already there under the same key. %$specials itself is left untouched.
#   $hash     - ref to the hash being updated
#   $specials - ref to the hash supplying the entries
sub addSpecialsTo {
    my ($hash, $specials) = @_;
    # keys and values walk an unmodified hash in the same order, so one
    # hash-slice assignment copies each key with its own value.
    @{$hash}{ keys %$specials } = values %$specials;
    return;
}
# Write a test page and its reference page into the current directory.
#   $whichMapping - 'lower' or 'upper'; used in the file names and in the
#                   text-transform value ("<whichMapping>case")
#   $hash         - ref to a hash of numeric code point => space-separated
#                   hex code points of its mapping
# all-<whichMapping>.html lists every key of %$hash as a character reference
# inside a paragraph with the text-transform applied.
# all-<whichMapping>-ref.html lists the mapped code points with no transform.
# Both pages use the font named by $testFont, and a line gets a trailing
# HTML comment when %charName has an entry for that character.
# Dies if either file cannot be opened or completely written.
sub genTest {
    my ($whichMapping, $hash) = @_;

    my $testFile = "all-${whichMapping}.html";
    open my $testOut, '>', $testFile
        or die "can't write $testFile: $!\n";
    print {$testOut} <<__END__;
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
<style type="text/css">
\@font-face { font-family: foo; src: url($testFont); }
p { font-family: foo; text-transform: ${whichMapping}case; }
</style>
</head>
<body>
<p>
__END__
    foreach my $key (sort { $a <=> $b } keys %$hash) {
        printf {$testOut} "&#x%04X;", $key;
        print {$testOut} " <!-- $charName{$key} -->" if exists $charName{$key};
        print {$testOut} "\n";
    }
    print {$testOut} <<__END__;
</p>
</body>
</html>
__END__
    # Buffered write errors only surface at close, so check it.
    close $testOut or die "error writing $testFile: $!\n";

    my $refFile = "all-${whichMapping}-ref.html";
    open my $refOut, '>', $refFile
        or die "can't write $refFile: $!\n";
    print {$refOut} <<__END__;
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
<style type="text/css">
\@font-face { font-family: foo; src: url($testFont); }
p { font-family: foo; }
</style>
</head>
<body>
<p>
__END__
    foreach my $key (sort { $a <=> $b } keys %$hash) {
        # One character reference per code point of the mapping.
        print {$refOut} join('', map { sprintf("&#x%s;", $_) } split(/ /, $hash->{$key}));
        print {$refOut} " <!-- $charName{$key} -->" if exists $charName{$key};
        print {$refOut} "\n";
    }
    print {$refOut} <<__END__;
</p>
</body>
</html>
__END__
    close $refOut or die "error writing $refFile: $!\n";
}
# Write all-title.html and all-title-ref.html into the current directory.
# Takes no arguments; reads %allTitle, %allUpper, %gc, %charName and
# $testFont.
# The test page lists every key of %allTitle followed by a lowercase "x",
# in a paragraph styled text-transform: capitalize. The reference page
# lists what is expected after the transform (see the loop below).
# Dies if either file cannot be opened or completely written.
sub genTitleTest {
    my $testFile = "all-title.html";
    open my $testOut, '>', $testFile
        or die "can't write $testFile: $!\n";
    print {$testOut} <<__END__;
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
<style type="text/css">
\@font-face { font-family: foo; src: url($testFont); }
p { font-family: foo; text-transform: capitalize; }
</style>
</head>
<body>
<p>
__END__
    foreach my $key (sort { $a <=> $b } keys %allTitle) {
        printf {$testOut} "&#x%04X;x", $key;
        print {$testOut} " <!-- $charName{$key} -->" if exists $charName{$key};
        print {$testOut} "\n";
    }
    print {$testOut} <<__END__;
</p>
</body>
</html>
__END__
    # Buffered write errors only surface at close, so check it.
    close $testOut or die "error writing $testFile: $!\n";

    my $refFile = "all-title-ref.html";
    open my $refOut, '>', $refFile
        or die "can't write $refFile: $!\n";
    print {$refOut} <<__END__;
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
<style type="text/css">
\@font-face { font-family: foo; src: url($testFont); }
p { font-family: foo; }
</style>
</head>
<body>
<p>
__END__
    foreach my $key (sort { $a <=> $b } keys %allTitle) {
        # capitalize is only applied to characters with GC=L* or N*...
        if ($gc{$key} =~ /^[LN]/) {
            # ...and those that are already uppercase are not transformed
            if (exists $allUpper{$key}) {
                print {$refOut} join('', map { sprintf("&#x%s;", $_) } split(/ /, $allTitle{$key}));
            } else {
                printf {$refOut} "&#x%04X;", $key;
            }
            # the "x" that follows stays lowercase
            print {$refOut} "x";
        } else {
            # the character itself is left alone, and the "x" after it is
            # the one expected in uppercase
            printf {$refOut} "&#x%04X;X", $key;
        }
        print {$refOut} " <!-- $charName{$key} -->" if exists $charName{$key};
        print {$refOut} "\n";
    }
    print {$refOut} <<__END__;
</p>
</body>
</html>
__END__
    close $refOut or die "error writing $refFile: $!\n";
}

View File

@ -62,6 +62,7 @@ SDK_HEADERS = \
EXPORTS = \
nsBidiUtils.h \
nsSpecialCasingData.h \
nsUnicodeProperties.h \
nsUnicodeScriptCodes.h \
$(NULL)
@ -69,6 +70,7 @@ EXPORTS = \
CPPSRCS = \
nsUnicharUtils.cpp \
nsBidiUtils.cpp \
nsSpecialCasingData.cpp \
nsUnicodeProperties.cpp \
$(NULL)

View File

@ -0,0 +1,202 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
/* Auto-generated from files in the Unicode Character Database
by genSpecialCasingData.pl - do not edit! */
#include "nsSpecialCasingData.h"
#include "mozilla/Util.h" // for ArrayLength
#include <stdlib.h> // for bsearch
/* SpecialCasing-6.1.0.txt */
/* Date: 2011-11-27, 05:10:51 GMT [MD] */
using mozilla::unicode::MultiCharMapping;
static const MultiCharMapping CaseSpecials_Lower[] = {
{ 0x0130, {0x0069, 0x0307, 0x0000} }, // LATIN CAPITAL LETTER I WITH DOT ABOVE
};
static const MultiCharMapping CaseSpecials_Upper[] = {
{ 0x00df, {0x0053, 0x0053, 0x0000} }, // LATIN SMALL LETTER SHARP S
{ 0x0149, {0x02bc, 0x004e, 0x0000} }, // LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
{ 0x01f0, {0x004a, 0x030c, 0x0000} }, // LATIN SMALL LETTER J WITH CARON
{ 0x0390, {0x03aa, 0x0301, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
{ 0x03b0, {0x03ab, 0x0301, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
{ 0x0587, {0x0535, 0x0552, 0x0000} }, // ARMENIAN SMALL LIGATURE ECH YIWN
{ 0x1e96, {0x0048, 0x0331, 0x0000} }, // LATIN SMALL LETTER H WITH LINE BELOW
{ 0x1e97, {0x0054, 0x0308, 0x0000} }, // LATIN SMALL LETTER T WITH DIAERESIS
{ 0x1e98, {0x0057, 0x030a, 0x0000} }, // LATIN SMALL LETTER W WITH RING ABOVE
{ 0x1e99, {0x0059, 0x030a, 0x0000} }, // LATIN SMALL LETTER Y WITH RING ABOVE
{ 0x1e9a, {0x0041, 0x02be, 0x0000} }, // LATIN SMALL LETTER A WITH RIGHT HALF RING
{ 0x1f50, {0x03a5, 0x0313, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH PSILI
{ 0x1f52, {0x03a5, 0x0313, 0x0300} }, // GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
{ 0x1f54, {0x03a5, 0x0313, 0x0301} }, // GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
{ 0x1f56, {0x03a5, 0x0313, 0x0342} }, // GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
{ 0x1f80, {0x1f08, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
{ 0x1f81, {0x1f09, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
{ 0x1f82, {0x1f0a, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
{ 0x1f83, {0x1f0b, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
{ 0x1f84, {0x1f0c, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
{ 0x1f85, {0x1f0d, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
{ 0x1f86, {0x1f0e, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
{ 0x1f87, {0x1f0f, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
{ 0x1f88, {0x1f08, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
{ 0x1f89, {0x1f09, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
{ 0x1f8a, {0x1f0a, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
{ 0x1f8b, {0x1f0b, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
{ 0x1f8c, {0x1f0c, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
{ 0x1f8d, {0x1f0d, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
{ 0x1f8e, {0x1f0e, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
{ 0x1f8f, {0x1f0f, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
{ 0x1f90, {0x1f28, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
{ 0x1f91, {0x1f29, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
{ 0x1f92, {0x1f2a, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
{ 0x1f93, {0x1f2b, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
{ 0x1f94, {0x1f2c, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
{ 0x1f95, {0x1f2d, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
{ 0x1f96, {0x1f2e, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
{ 0x1f97, {0x1f2f, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
{ 0x1f98, {0x1f28, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
{ 0x1f99, {0x1f29, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
{ 0x1f9a, {0x1f2a, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
{ 0x1f9b, {0x1f2b, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
{ 0x1f9c, {0x1f2c, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
{ 0x1f9d, {0x1f2d, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
{ 0x1f9e, {0x1f2e, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
{ 0x1f9f, {0x1f2f, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
{ 0x1fa0, {0x1f68, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
{ 0x1fa1, {0x1f69, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
{ 0x1fa2, {0x1f6a, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
{ 0x1fa3, {0x1f6b, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
{ 0x1fa4, {0x1f6c, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
{ 0x1fa5, {0x1f6d, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
{ 0x1fa6, {0x1f6e, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
{ 0x1fa7, {0x1f6f, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
{ 0x1fa8, {0x1f68, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
{ 0x1fa9, {0x1f69, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
{ 0x1faa, {0x1f6a, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
{ 0x1fab, {0x1f6b, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
{ 0x1fac, {0x1f6c, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
{ 0x1fad, {0x1f6d, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
{ 0x1fae, {0x1f6e, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
{ 0x1faf, {0x1f6f, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
{ 0x1fb2, {0x1fba, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
{ 0x1fb3, {0x0391, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
{ 0x1fb4, {0x0386, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
{ 0x1fb6, {0x0391, 0x0342, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH PERISPOMENI
{ 0x1fb7, {0x0391, 0x0342, 0x0399} }, // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
{ 0x1fbc, {0x0391, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
{ 0x1fc2, {0x1fca, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
{ 0x1fc3, {0x0397, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
{ 0x1fc4, {0x0389, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
{ 0x1fc6, {0x0397, 0x0342, 0x0000} }, // GREEK SMALL LETTER ETA WITH PERISPOMENI
{ 0x1fc7, {0x0397, 0x0342, 0x0399} }, // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
{ 0x1fcc, {0x0397, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
{ 0x1fd2, {0x03aa, 0x0300, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
{ 0x1fd3, {0x03aa, 0x0301, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
{ 0x1fd6, {0x0399, 0x0342, 0x0000} }, // GREEK SMALL LETTER IOTA WITH PERISPOMENI
{ 0x1fd7, {0x03aa, 0x0342, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
{ 0x1fe2, {0x03ab, 0x0300, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
{ 0x1fe3, {0x03ab, 0x0301, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
{ 0x1fe4, {0x03a1, 0x0313, 0x0000} }, // GREEK SMALL LETTER RHO WITH PSILI
{ 0x1fe6, {0x03a5, 0x0342, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH PERISPOMENI
{ 0x1fe7, {0x03ab, 0x0342, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
{ 0x1ff2, {0x1ffa, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
{ 0x1ff3, {0x03a9, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
{ 0x1ff4, {0x038f, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
{ 0x1ff6, {0x03a9, 0x0342, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH PERISPOMENI
{ 0x1ff7, {0x03a9, 0x0342, 0x0399} }, // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
{ 0x1ffc, {0x03a9, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
{ 0xfb00, {0x0046, 0x0046, 0x0000} }, // LATIN SMALL LIGATURE FF
{ 0xfb01, {0x0046, 0x0049, 0x0000} }, // LATIN SMALL LIGATURE FI
{ 0xfb02, {0x0046, 0x004c, 0x0000} }, // LATIN SMALL LIGATURE FL
{ 0xfb03, {0x0046, 0x0046, 0x0049} }, // LATIN SMALL LIGATURE FFI
{ 0xfb04, {0x0046, 0x0046, 0x004c} }, // LATIN SMALL LIGATURE FFL
{ 0xfb05, {0x0053, 0x0054, 0x0000} }, // LATIN SMALL LIGATURE LONG S T
{ 0xfb06, {0x0053, 0x0054, 0x0000} }, // LATIN SMALL LIGATURE ST
{ 0xfb13, {0x0544, 0x0546, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN NOW
{ 0xfb14, {0x0544, 0x0535, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN ECH
{ 0xfb15, {0x0544, 0x053b, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN INI
{ 0xfb16, {0x054e, 0x0546, 0x0000} }, // ARMENIAN SMALL LIGATURE VEW NOW
{ 0xfb17, {0x0544, 0x053d, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN XEH
};
static const MultiCharMapping CaseSpecials_Title[] = {
{ 0x00df, {0x0053, 0x0073, 0x0000} }, // LATIN SMALL LETTER SHARP S
{ 0x0149, {0x02bc, 0x004e, 0x0000} }, // LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
{ 0x01f0, {0x004a, 0x030c, 0x0000} }, // LATIN SMALL LETTER J WITH CARON
{ 0x0390, {0x03aa, 0x0301, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
{ 0x03b0, {0x03ab, 0x0301, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
{ 0x0587, {0x0535, 0x0582, 0x0000} }, // ARMENIAN SMALL LIGATURE ECH YIWN
{ 0x1e96, {0x0048, 0x0331, 0x0000} }, // LATIN SMALL LETTER H WITH LINE BELOW
{ 0x1e97, {0x0054, 0x0308, 0x0000} }, // LATIN SMALL LETTER T WITH DIAERESIS
{ 0x1e98, {0x0057, 0x030a, 0x0000} }, // LATIN SMALL LETTER W WITH RING ABOVE
{ 0x1e99, {0x0059, 0x030a, 0x0000} }, // LATIN SMALL LETTER Y WITH RING ABOVE
{ 0x1e9a, {0x0041, 0x02be, 0x0000} }, // LATIN SMALL LETTER A WITH RIGHT HALF RING
{ 0x1f50, {0x03a5, 0x0313, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH PSILI
{ 0x1f52, {0x03a5, 0x0313, 0x0300} }, // GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
{ 0x1f54, {0x03a5, 0x0313, 0x0301} }, // GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
{ 0x1f56, {0x03a5, 0x0313, 0x0342} }, // GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
{ 0x1fb2, {0x1fba, 0x0345, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
{ 0x1fb4, {0x0386, 0x0345, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
{ 0x1fb6, {0x0391, 0x0342, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH PERISPOMENI
{ 0x1fb7, {0x0391, 0x0342, 0x0345} }, // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
{ 0x1fc2, {0x1fca, 0x0345, 0x0000} }, // GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
{ 0x1fc4, {0x0389, 0x0345, 0x0000} }, // GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
{ 0x1fc6, {0x0397, 0x0342, 0x0000} }, // GREEK SMALL LETTER ETA WITH PERISPOMENI
{ 0x1fc7, {0x0397, 0x0342, 0x0345} }, // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
{ 0x1fd2, {0x03aa, 0x0300, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
{ 0x1fd3, {0x03aa, 0x0301, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
{ 0x1fd6, {0x0399, 0x0342, 0x0000} }, // GREEK SMALL LETTER IOTA WITH PERISPOMENI
{ 0x1fd7, {0x03aa, 0x0342, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
{ 0x1fe2, {0x03ab, 0x0300, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
{ 0x1fe3, {0x03ab, 0x0301, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
{ 0x1fe4, {0x03a1, 0x0313, 0x0000} }, // GREEK SMALL LETTER RHO WITH PSILI
{ 0x1fe6, {0x03a5, 0x0342, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH PERISPOMENI
{ 0x1fe7, {0x03ab, 0x0342, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
{ 0x1ff2, {0x1ffa, 0x0345, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
{ 0x1ff4, {0x038f, 0x0345, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
{ 0x1ff6, {0x03a9, 0x0342, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH PERISPOMENI
{ 0x1ff7, {0x03a9, 0x0342, 0x0345} }, // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
{ 0xfb00, {0x0046, 0x0066, 0x0000} }, // LATIN SMALL LIGATURE FF
{ 0xfb01, {0x0046, 0x0069, 0x0000} }, // LATIN SMALL LIGATURE FI
{ 0xfb02, {0x0046, 0x006c, 0x0000} }, // LATIN SMALL LIGATURE FL
{ 0xfb03, {0x0046, 0x0066, 0x0069} }, // LATIN SMALL LIGATURE FFI
{ 0xfb04, {0x0046, 0x0066, 0x006c} }, // LATIN SMALL LIGATURE FFL
{ 0xfb05, {0x0053, 0x0074, 0x0000} }, // LATIN SMALL LIGATURE LONG S T
{ 0xfb06, {0x0053, 0x0074, 0x0000} }, // LATIN SMALL LIGATURE ST
{ 0xfb13, {0x0544, 0x0576, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN NOW
{ 0xfb14, {0x0544, 0x0565, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN ECH
{ 0xfb15, {0x0544, 0x056b, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN INI
{ 0xfb16, {0x054e, 0x0576, 0x0000} }, // ARMENIAN SMALL LIGATURE VEW NOW
{ 0xfb17, {0x0544, 0x056d, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN XEH
};
static int CompareMCM(const void* aKey, const void* aElement)
{
const PRUint32 ch = *static_cast<const PRUint32*>(aKey);
const MultiCharMapping* mcm = static_cast<const MultiCharMapping*>(aElement);
return int(ch) - int(mcm->mOriginalChar);
}
#define MAKE_SPECIAL_CASE_ACCESSOR(which) \
const MultiCharMapping* \
Special##which(PRUint32 aChar) \
{ \
const void* p = bsearch(&aChar, CaseSpecials_##which, \
mozilla::ArrayLength(CaseSpecials_##which), \
sizeof(MultiCharMapping), CompareMCM); \
return static_cast<const MultiCharMapping*>(p); \
}
namespace mozilla {
namespace unicode {
MAKE_SPECIAL_CASE_ACCESSOR(Lower)
MAKE_SPECIAL_CASE_ACCESSOR(Upper)
MAKE_SPECIAL_CASE_ACCESSOR(Title)
} // namespace unicode
} // namespace mozilla

View File

@ -0,0 +1,26 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
// Include guard: without it, a translation unit that pulls this header in
// more than once (directly or through another header) would see
// MultiCharMapping defined twice.
#ifndef nsSpecialCasingData_h_
#define nsSpecialCasingData_h_

#include "prtypes.h"
namespace mozilla {
namespace unicode {
// Multi-character mappings (from SpecialCasing.txt) map a single Unicode
// value to a sequence of 2 or 3 Unicode characters. There are currently none
// defined outside the BMP, so we can use PRUnichar here. Unused trailing
// positions in mMappedChars are set to 0.
struct MultiCharMapping {
PRUnichar mOriginalChar;   // the character being mapped
PRUnichar mMappedChars[3]; // its replacement, padded with trailing 0s
};
// Return a pointer to the special case mapping for the given character;
// returns NULL if no such mapping is defined.
const MultiCharMapping* SpecialUpper(PRUint32 aCh);
const MultiCharMapping* SpecialLower(PRUint32 aCh);
const MultiCharMapping* SpecialTitle(PRUint32 aCh);
} // namespace unicode
} // namespace mozilla

#endif /* nsSpecialCasingData_h_ */

View File

@ -37,6 +37,7 @@
INTL_UNICHARUTIL_UTIL_LCPPSRCS = \
nsUnicharUtils.cpp \
nsBidiUtils.cpp \
nsSpecialCasingData.cpp \
nsUnicodeProperties.cpp \
$(NULL)

View File

@ -47,8 +47,7 @@
#include "nsContentUtils.h"
#include "nsUnicharUtils.h"
#include "nsUnicodeProperties.h"
#define SZLIG 0x00DF
#include "nsSpecialCasingData.h"
// Unicode characters needing special casing treatment in tr/az languages
#define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE 0x0130
@ -158,11 +157,18 @@ nsTransformingTextRunFactory::MakeTextRun(const PRUint8* aString, PRUint32 aLeng
* are identical.
*
* This is used for text-transform:uppercase when we encounter a SZLIG,
* whose uppercase form is "SS".
* whose uppercase form is "SS", or other ligature or precomposed form
* that expands to multiple codepoints during case transformation.
*
* This function is unable to merge characters when they occur in different
* glyph runs. It's hard to see how this could happen, but if it does, we just
* discard the characters-to-merge.
* glyph runs. This only happens in tricky edge cases where a character was
* decomposed by case-mapping (e.g. there's no precomposed uppercase version
* of an accented lowercase letter), and then font-matching caused the
* diacritics to be assigned to a different font than the base character.
* In this situation, the diacritic(s) get discarded, which is less than
* ideal, but they probably weren't going to render very well anyway.
* Bug 543200 will improve this by making font-matching operate on entire
* clusters instead of individual codepoints.
*
* For simplicity, this produces a textrun containing all DetailedGlyphs,
* no simple glyphs. So don't call it unless you really have merging to do.
@ -188,9 +194,11 @@ MergeCharactersInTextRun(gfxTextRun* aDest, gfxTextRun* aSrc,
bool anyMissing = false;
PRUint32 mergeRunStart = iter.GetStringStart();
PRUint32 k;
for (k = iter.GetStringStart(); k < iter.GetStringEnd(); ++k) {
const gfxTextRun::CompressedGlyph g = aSrc->GetCharacterGlyphs()[k];
const gfxTextRun::CompressedGlyph *srcGlyphs = aSrc->GetCharacterGlyphs();
gfxTextRun::CompressedGlyph mergedGlyph = srcGlyphs[mergeRunStart];
PRUint32 stringEnd = iter.GetStringEnd();
for (PRUint32 k = iter.GetStringStart(); k < stringEnd; ++k) {
const gfxTextRun::CompressedGlyph g = srcGlyphs[k];
if (g.IsSimpleGlyph()) {
if (!anyMissing) {
gfxTextRun::DetailedGlyph details;
@ -210,40 +218,39 @@ MergeCharactersInTextRun(gfxTextRun* aDest, gfxTextRun* aSrc,
}
}
// We could teach this method to handle merging of characters that aren't
// cluster starts or ligature group starts, but this is really only used
// to merge S's (uppercase &szlig;), so it's not worth it.
if (k + 1 < iter.GetStringEnd() && aCharsToMerge[k + 1]) {
NS_ASSERTION(g.IsClusterStart() && g.IsLigatureGroupStart(),
"Don't know how to merge this stuff");
// next char is supposed to merge with current, so loop without
// writing current merged glyph to the destination
continue;
}
NS_ASSERTION(mergeRunStart == k ||
(g.IsClusterStart() && g.IsLigatureGroupStart()),
"Don't know how to merge this stuff");
// If the start of the merge run is actually a character that should
// have been merged with the previous character (this can happen
// if there's a font change in the middle of a szlig, for example),
// if there's a font change in the middle of a case-mapped character,
// that decomposed into a sequence of base+diacritics, for example),
// just discard the entire merge run. See comment at start of this
// function.
NS_WARN_IF_FALSE(!aCharsToMerge[mergeRunStart],
"unable to merge across a glyph run boundary, "
"glyph(s) discarded");
if (!aCharsToMerge[mergeRunStart]) {
gfxTextRun::CompressedGlyph mergedGlyphs =
aSrc->GetCharacterGlyphs()[mergeRunStart];
if (anyMissing) {
mergedGlyphs.SetMissing(glyphs.Length());
mergedGlyph.SetMissing(glyphs.Length());
} else {
mergedGlyphs.SetComplex(true, true, glyphs.Length());
mergedGlyph.SetComplex(mergedGlyph.IsClusterStart(),
mergedGlyph.IsLigatureGroupStart(),
glyphs.Length());
}
aDest->SetGlyphs(offset, mergedGlyphs, glyphs.Elements());
aDest->SetGlyphs(offset, mergedGlyph, glyphs.Elements());
++offset;
}
glyphs.Clear();
anyMissing = false;
mergeRunStart = k + 1;
if (mergeRunStart < stringEnd) {
mergedGlyph = srcGlyphs[mergeRunStart];
}
}
NS_ASSERTION(glyphs.Length() == 0,
"Leftover glyphs, don't request merging of the last character with its next!");
@ -310,7 +317,7 @@ nsFontVariantTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun,
ch = SURROGATE_TO_UCS4(ch, str[i + 1]);
}
PRUint32 ch2 = ToUpperCase(ch);
isLowercase = ch != ch2 || ch == SZLIG;
isLowercase = ch != ch2 || mozilla::unicode::SpecialUpper(ch);
} else {
// Don't transform the character! I.e., pretend that it's not lowercase
}
@ -399,7 +406,8 @@ nsCaseTransformTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun,
PRUint8 style = mAllUppercase ? NS_STYLE_TEXT_TRANSFORM_UPPERCASE
: styleContext->GetStyleText()->mTextTransform;
bool extraChar = false;
int extraChars = 0;
const mozilla::unicode::MultiCharMapping *mcm;
if (NS_IS_HIGH_SURROGATE(ch) && i < length - 1 && NS_IS_LOW_SURROGATE(str[i + 1])) {
ch = SURROGATE_TO_UCS4(ch, str[i + 1]);
@ -420,11 +428,19 @@ nsCaseTransformTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun,
switch (style) {
case NS_STYLE_TEXT_TRANSFORM_LOWERCASE:
if (languageSpecificCasing == eTurkish && ch == 'I') {
ch = LATIN_SMALL_LETTER_DOTLESS_I;
prevIsLetter = true;
sigmaIndex = PRUint32(-1);
break;
if (languageSpecificCasing == eTurkish) {
if (ch == 'I') {
ch = LATIN_SMALL_LETTER_DOTLESS_I;
prevIsLetter = true;
sigmaIndex = PRUint32(-1);
break;
}
if (ch == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
ch = 'i';
prevIsLetter = true;
sigmaIndex = PRUint32(-1);
break;
}
}
// Special lowercasing behavior for Greek Sigma: note that this is listed
@ -473,8 +489,6 @@ nsCaseTransformTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun,
break;
}
ch = ToLowerCase(ch);
// ignore diacritics for the purpose of contextual sigma mapping;
// otherwise, reset prevIsLetter appropriately and clear the
// sigmaIndex marker
@ -482,19 +496,40 @@ nsCaseTransformTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun,
prevIsLetter = (cat == nsIUGenCategory::kLetter);
sigmaIndex = PRUint32(-1);
}
mcm = mozilla::unicode::SpecialLower(ch);
if (mcm) {
int j = 0;
while (j < 2 && mcm->mMappedChars[j + 1]) {
convertedString.Append(mcm->mMappedChars[j]);
++extraChars;
++j;
}
ch = mcm->mMappedChars[j];
break;
}
ch = ToLowerCase(ch);
break;
case NS_STYLE_TEXT_TRANSFORM_UPPERCASE:
if (ch == SZLIG) {
convertedString.Append('S');
extraChar = true;
ch = 'S';
break;
}
if (languageSpecificCasing == eTurkish && ch == 'i') {
ch = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
break;
}
mcm = mozilla::unicode::SpecialUpper(ch);
if (mcm) {
int j = 0;
while (j < 2 && mcm->mMappedChars[j + 1]) {
convertedString.Append(mcm->mMappedChars[j]);
++extraChars;
++j;
}
ch = mcm->mMappedChars[j];
break;
}
ch = ToUpperCase(ch);
break;
@ -506,12 +541,6 @@ nsCaseTransformTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun,
}
capitalizeDutchIJ = false;
if (i < aTextRun->mCapitalize.Length() && aTextRun->mCapitalize[i]) {
if (ch == SZLIG) {
convertedString.Append('S');
extraChar = true;
ch = 'S';
break;
}
if (languageSpecificCasing == eTurkish && ch == 'i') {
ch = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
break;
@ -521,6 +550,19 @@ nsCaseTransformTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun,
capitalizeDutchIJ = true;
break;
}
mcm = mozilla::unicode::SpecialTitle(ch);
if (mcm) {
int j = 0;
while (j < 2 && mcm->mMappedChars[j + 1]) {
convertedString.Append(mcm->mMappedChars[j]);
++extraChars;
++j;
}
ch = mcm->mMappedChars[j];
break;
}
ch = ToTitleCase(ch);
}
break;
@ -540,11 +582,12 @@ nsCaseTransformTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun,
canBreakBeforeArray.AppendElement(false);
}
if (extraChar) {
while (extraChars > 0) {
++extraCharsCount;
charsToMergeArray.AppendElement(true);
styleArray.AppendElement(styleContext);
canBreakBeforeArray.AppendElement(false);
--extraChars;
}
}