gecko/intl/icu/source/common/stringtriebuilder.cpp
Jeff Walden 805dd78c93 Bug 924839 - Update our embedded ICU to 52.1, plus a very few local patches. r=lots of people, see subsequent lines in this commit message for the original subcomponents (merged together for landing), and the original bug for the original patch divisions
Bug 924839 - Remove a patch already part of ICU 52.1.  See http://bugs.icu-project.org/trac/ticket/10283 but also note the relevant code was removed completely upstream.  r=glandium
* * *
Bug 924839 - Remove another patch already part of ICU 52.1.  See http://bugs.icu-project.org/trac/ticket/10290 for that.  r=gaston
* * *
Bug 924839 - Remove another patch already in ICU 52.1.  See http://bugs.icu-project.org/trac/ticket/10045 for more.  r=Norbert
* * *
Bug 924839 - Remove another patch already applied upstream.  See http://bugs.icu-project.org/trac/changeset/32937 for more.  r=gaston
* * *
Bug 924839 - Update the ICU update script to update to 52.1, *without* applying any of our local patches.  r=glandium
* * *
Bug 924839 - Make the ICU update script only do updating within intl/icu/source and nowhere else.  r=glandium
* * *
Bug 924839 - Implement the changes that would be made by |cd intl/; ./update-icu.sh http://source.icu-project.org/repos/icu/icu/tags/release-52-1/;|, run with the prior changesets' changes made (thus not applying any of our local patches).  These changes don't actually work without subsequent adjustments, but this provides a codebase upon which those adjustments can be made, for the purpose of generating local patches to be kept in intl/icu-patches/.  rs=the-usual-suspects
* * *
Bug 924839 - Update the bug 899722 local patch to make runConfigureICU not override CC/CXX on BSD systems.  r=gaston
* * *
Bug 924839 - Update the bug 724533 patch that makes ICU builds with MozillaBuild on Windows.  r=glandium
* * *
Bug 924839 - Import an upstream patch fixing the genrb tool to properly handle the -R (--omitCollationRules) option.  See http://bugs.icu-project.org/trac/ticket/10043 for the original bug report and a link to the ultimate upstream landing.  r=Norbert
* * *
Bug 924839 - Import the upstream fix for http://bugs.icu-project.org/trac/ticket/10486 so that ICU with -DU_USING_ICU_NAMESPACE=0 will compile on Windows.  r=Norbert
* * *
Bug 924839 - Adjust the update script to update ICU, then to apply all local patches (rather than skipping the second step).  Thus if the update script is properly run, now, the final result should be no changes at all to the tree.  NOT REVIEWED YET
* * *
Bug 924839 - Update jstests that depend on CLDR locale data to match CLDR 24.  r=Norbert
2013-11-12 16:23:48 -08:00

617 lines
20 KiB
C++

/*
*******************************************************************************
* Copyright (C) 2010-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: stringtriebuilder.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010dec24
* created by: Markus W. Scherer
*/
#include "utypeinfo.h" // for 'typeid' to work
#include "unicode/utypes.h"
#include "unicode/stringtriebuilder.h"
#include "uassert.h"
#include "uhash.h"
U_CDECL_BEGIN
static int32_t U_CALLCONV
hashStringTrieNode(const UHashTok key) {
return icu::StringTrieBuilder::hashNode(key.pointer);
}
static UBool U_CALLCONV
equalStringTrieNodes(const UHashTok key1, const UHashTok key2) {
return icu::StringTrieBuilder::equalNodes(key1.pointer, key2.pointer);
}
U_CDECL_END
U_NAMESPACE_BEGIN
StringTrieBuilder::StringTrieBuilder() : nodes(NULL) {}
StringTrieBuilder::~StringTrieBuilder() {
deleteCompactBuilder();
}
void
StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return;
}
nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, NULL,
sizeGuess, &errorCode);
if(U_SUCCESS(errorCode)) {
if(nodes==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
} else {
uhash_setKeyDeleter(nodes, uprv_deleteUObject);
}
}
}
void
StringTrieBuilder::deleteCompactBuilder() {
uhash_close(nodes);
nodes=NULL;
}
void
StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength,
UErrorCode &errorCode) {
if(buildOption==USTRINGTRIE_BUILD_FAST) {
writeNode(0, elementsLength, 0);
} else /* USTRINGTRIE_BUILD_SMALL */ {
createCompactBuilder(2*elementsLength, errorCode);
Node *root=makeNode(0, elementsLength, 0, errorCode);
if(U_SUCCESS(errorCode)) {
root->markRightEdgesFirst(-1);
root->write(*this);
}
deleteCompactBuilder();
}
}
// Requires start<limit,
// and all strings of the [start..limit[ elements must be sorted and
// have a common prefix of length unitIndex.
int32_t
StringTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) {
UBool hasValue=FALSE;
int32_t value=0;
int32_t type;
if(unitIndex==getElementStringLength(start)) {
// An intermediate or final value.
value=getElementValue(start++);
if(start==limit) {
return writeValueAndFinal(value, TRUE); // final-value node
}
hasValue=TRUE;
}
// Now all [start..limit[ strings are longer than unitIndex.
int32_t minUnit=getElementUnit(start, unitIndex);
int32_t maxUnit=getElementUnit(limit-1, unitIndex);
if(minUnit==maxUnit) {
// Linear-match node: All strings have the same character at unitIndex.
int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
writeNode(start, limit, lastUnitIndex);
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
int32_t length=lastUnitIndex-unitIndex;
int32_t maxLinearMatchLength=getMaxLinearMatchLength();
while(length>maxLinearMatchLength) {
lastUnitIndex-=maxLinearMatchLength;
length-=maxLinearMatchLength;
writeElementUnits(start, lastUnitIndex, maxLinearMatchLength);
write(getMinLinearMatch()+maxLinearMatchLength-1);
}
writeElementUnits(start, unitIndex, length);
type=getMinLinearMatch()+length-1;
} else {
// Branch node.
int32_t length=countElementUnits(start, limit, unitIndex);
// length>=2 because minUnit!=maxUnit.
writeBranchSubNode(start, limit, unitIndex, length);
if(--length<getMinLinearMatch()) {
type=length;
} else {
write(length);
type=0;
}
}
return writeValueAndType(hasValue, value, type);
}
// start<limit && all strings longer than unitIndex &&
// length different units at unitIndex
int32_t
StringTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) {
UChar middleUnits[kMaxSplitBranchLevels];
int32_t lessThan[kMaxSplitBranchLevels];
int32_t ltLength=0;
while(length>getMaxBranchLinearSubNodeLength()) {
// Branch on the middle unit.
// First, find the middle unit.
int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
// Encode the less-than branch first.
middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit
lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2);
++ltLength;
// Continue for the greater-or-equal branch.
start=i;
length=length-length/2;
}
// For each unit, find its elements array start and whether it has a final value.
int32_t starts[kMaxBranchLinearSubNodeLength];
UBool isFinal[kMaxBranchLinearSubNodeLength-1];
int32_t unitNumber=0;
do {
int32_t i=starts[unitNumber]=start;
UChar unit=getElementUnit(i++, unitIndex);
i=indexOfElementWithNextUnit(i, unitIndex, unit);
isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start);
start=i;
} while(++unitNumber<length-1);
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
starts[unitNumber]=start;
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minUnit sub-node first,
// then its jump delta would be larger.
// Instead we write the minUnit sub-node last, for a shorter delta.
int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1];
do {
--unitNumber;
if(!isFinal[unitNumber]) {
jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1);
}
} while(unitNumber>0);
// The maxUnit sub-node is written as the very last one because we do
// not jump for it at all.
unitNumber=length-1;
writeNode(start, limit, unitIndex+1);
int32_t offset=write(getElementUnit(start, unitIndex));
// Write the rest of this node's unit-value pairs.
while(--unitNumber>=0) {
start=starts[unitNumber];
int32_t value;
if(isFinal[unitNumber]) {
// Write the final value for the one string ending with this unit.
value=getElementValue(start);
} else {
// Write the delta to the start position of the sub-node.
value=offset-jumpTargets[unitNumber];
}
writeValueAndFinal(value, isFinal[unitNumber]);
offset=write(getElementUnit(start, unitIndex));
}
// Write the split-branch nodes.
while(ltLength>0) {
--ltLength;
writeDeltaTo(lessThan[ltLength]);
offset=write(middleUnits[ltLength]);
}
return offset;
}
// Requires start<limit,
// and all strings of the [start..limit[ elements must be sorted and
// have a common prefix of length unitIndex.
StringTrieBuilder::Node *
StringTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
UBool hasValue=FALSE;
int32_t value=0;
if(unitIndex==getElementStringLength(start)) {
// An intermediate or final value.
value=getElementValue(start++);
if(start==limit) {
return registerFinalValue(value, errorCode);
}
hasValue=TRUE;
}
Node *node;
// Now all [start..limit[ strings are longer than unitIndex.
int32_t minUnit=getElementUnit(start, unitIndex);
int32_t maxUnit=getElementUnit(limit-1, unitIndex);
if(minUnit==maxUnit) {
// Linear-match node: All strings have the same character at unitIndex.
int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode);
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
int32_t length=lastUnitIndex-unitIndex;
int32_t maxLinearMatchLength=getMaxLinearMatchLength();
while(length>maxLinearMatchLength) {
lastUnitIndex-=maxLinearMatchLength;
length-=maxLinearMatchLength;
node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode);
nextNode=registerNode(node, errorCode);
}
node=createLinearMatchNode(start, unitIndex, length, nextNode);
} else {
// Branch node.
int32_t length=countElementUnits(start, limit, unitIndex);
// length>=2 because minUnit!=maxUnit.
Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode);
node=new BranchHeadNode(length, subNode);
}
if(hasValue && node!=NULL) {
if(matchNodesCanHaveValues()) {
((ValueNode *)node)->setValue(value);
} else {
node=new IntermediateValueNode(value, registerNode(node, errorCode));
}
}
return registerNode(node, errorCode);
}
// start<limit && all strings longer than unitIndex &&
// length different units at unitIndex
StringTrieBuilder::Node *
StringTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
int32_t length, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
UChar middleUnits[kMaxSplitBranchLevels];
Node *lessThan[kMaxSplitBranchLevels];
int32_t ltLength=0;
while(length>getMaxBranchLinearSubNodeLength()) {
// Branch on the middle unit.
// First, find the middle unit.
int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
// Create the less-than branch.
middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit
lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode);
++ltLength;
// Continue for the greater-or-equal branch.
start=i;
length=length-length/2;
}
if(U_FAILURE(errorCode)) {
return NULL;
}
ListBranchNode *listNode=new ListBranchNode();
if(listNode==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
// For each unit, find its elements array start and whether it has a final value.
int32_t unitNumber=0;
do {
int32_t i=start;
UChar unit=getElementUnit(i++, unitIndex);
i=indexOfElementWithNextUnit(i, unitIndex, unit);
if(start==i-1 && unitIndex+1==getElementStringLength(start)) {
listNode->add(unit, getElementValue(start));
} else {
listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode));
}
start=i;
} while(++unitNumber<length-1);
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
UChar unit=getElementUnit(start, unitIndex);
if(start==limit-1 && unitIndex+1==getElementStringLength(start)) {
listNode->add(unit, getElementValue(start));
} else {
listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode));
}
Node *node=registerNode(listNode, errorCode);
// Create the split-branch nodes.
while(ltLength>0) {
--ltLength;
node=registerNode(
new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode);
}
return node;
}
StringTrieBuilder::Node *
StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
delete newNode;
return NULL;
}
if(newNode==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
const UHashElement *old=uhash_find(nodes, newNode);
if(old!=NULL) {
delete newNode;
return (Node *)old->key.pointer;
}
// If uhash_puti() returns a non-zero value from an equivalent, previously
// registered node, then uhash_find() failed to find that and we will leak newNode.
#if U_DEBUG
int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue.
#endif
uhash_puti(nodes, newNode, 1, &errorCode);
U_ASSERT(oldValue==0);
if(U_FAILURE(errorCode)) {
delete newNode;
return NULL;
}
return newNode;
}
StringTrieBuilder::Node *
StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
FinalValueNode key(value);
const UHashElement *old=uhash_find(nodes, &key);
if(old!=NULL) {
return (Node *)old->key.pointer;
}
Node *newNode=new FinalValueNode(value);
if(newNode==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
// If uhash_puti() returns a non-zero value from an equivalent, previously
// registered node, then uhash_find() failed to find that and we will leak newNode.
#if U_DEBUG
int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue.
#endif
uhash_puti(nodes, newNode, 1, &errorCode);
U_ASSERT(oldValue==0);
if(U_FAILURE(errorCode)) {
delete newNode;
return NULL;
}
return newNode;
}
UBool
StringTrieBuilder::hashNode(const void *node) {
return ((const Node *)node)->hashCode();
}
UBool
StringTrieBuilder::equalNodes(const void *left, const void *right) {
return *(const Node *)left==*(const Node *)right;
}
UBool
StringTrieBuilder::Node::operator==(const Node &other) const {
return this==&other || (typeid(*this)==typeid(other) && hash==other.hash);
}
int32_t
StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
offset=edgeNumber;
}
return edgeNumber;
}
UBool
StringTrieBuilder::FinalValueNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!Node::operator==(other)) {
return FALSE;
}
const FinalValueNode &o=(const FinalValueNode &)other;
return value==o.value;
}
void
StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) {
offset=builder.writeValueAndFinal(value, TRUE);
}
UBool
StringTrieBuilder::ValueNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!Node::operator==(other)) {
return FALSE;
}
const ValueNode &o=(const ValueNode &)other;
return hasValue==o.hasValue && (!hasValue || value==o.value);
}
UBool
StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!ValueNode::operator==(other)) {
return FALSE;
}
const IntermediateValueNode &o=(const IntermediateValueNode &)other;
return next==o.next;
}
int32_t
StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
}
return edgeNumber;
}
void
StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) {
next->write(builder);
offset=builder.writeValueAndFinal(value, FALSE);
}
UBool
StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!ValueNode::operator==(other)) {
return FALSE;
}
const LinearMatchNode &o=(const LinearMatchNode &)other;
return length==o.length && next==o.next;
}
int32_t
StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
}
return edgeNumber;
}
UBool
StringTrieBuilder::ListBranchNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!Node::operator==(other)) {
return FALSE;
}
const ListBranchNode &o=(const ListBranchNode &)other;
for(int32_t i=0; i<length; ++i) {
if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) {
return FALSE;
}
}
return TRUE;
}
int32_t
StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
firstEdgeNumber=edgeNumber;
int32_t step=0;
int32_t i=length;
do {
Node *edge=equal[--i];
if(edge!=NULL) {
edgeNumber=edge->markRightEdgesFirst(edgeNumber-step);
}
// For all but the rightmost edge, decrement the edge number.
step=1;
} while(i>0);
offset=edgeNumber;
}
return edgeNumber;
}
void
StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) {
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minUnit sub-node first,
// then its jump delta would be larger.
// Instead we write the minUnit sub-node last, for a shorter delta.
int32_t unitNumber=length-1;
Node *rightEdge=equal[unitNumber];
int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
do {
--unitNumber;
if(equal[unitNumber]!=NULL) {
equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
}
} while(unitNumber>0);
// The maxUnit sub-node is written as the very last one because we do
// not jump for it at all.
unitNumber=length-1;
if(rightEdge==NULL) {
builder.writeValueAndFinal(values[unitNumber], TRUE);
} else {
rightEdge->write(builder);
}
offset=builder.write(units[unitNumber]);
// Write the rest of this node's unit-value pairs.
while(--unitNumber>=0) {
int32_t value;
UBool isFinal;
if(equal[unitNumber]==NULL) {
// Write the final value for the one string ending with this unit.
value=values[unitNumber];
isFinal=TRUE;
} else {
// Write the delta to the start position of the sub-node.
U_ASSERT(equal[unitNumber]->getOffset()>0);
value=offset-equal[unitNumber]->getOffset();
isFinal=FALSE;
}
builder.writeValueAndFinal(value, isFinal);
offset=builder.write(units[unitNumber]);
}
}
UBool
StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!Node::operator==(other)) {
return FALSE;
}
const SplitBranchNode &o=(const SplitBranchNode &)other;
return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual;
}
int32_t
StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
firstEdgeNumber=edgeNumber;
edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber);
offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1);
}
return edgeNumber;
}
void
StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) {
// Encode the less-than branch first.
lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
// Encode the greater-or-equal branch last because we do not jump for it at all.
greaterOrEqual->write(builder);
// Write this node.
U_ASSERT(lessThan->getOffset()>0);
builder.writeDeltaTo(lessThan->getOffset()); // less-than
offset=builder.write(unit);
}
UBool
StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!ValueNode::operator==(other)) {
return FALSE;
}
const BranchHeadNode &o=(const BranchHeadNode &)other;
return length==o.length && next==o.next;
}
int32_t
StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
}
return edgeNumber;
}
void
StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) {
next->write(builder);
if(length<=builder.getMinLinearMatch()) {
offset=builder.writeValueAndType(hasValue, value, length-1);
} else {
builder.write(length-1);
offset=builder.writeValueAndType(hasValue, value, 0);
}
}
U_NAMESPACE_END