gecko/intl/icu/source/common/uiter.cpp
Jeff Walden 805dd78c93 Bug 924839 - Update our embedded ICU to 52.1, plus a very few local patches. r=lots of people, see subsequent lines in this commit message for the original subcomponents (merged together for landing), and the original bug for the original patch divisions
Bug 924839 - Remove a patch already part of ICU 52.1.  See http://bugs.icu-project.org/trac/ticket/10283 but also note the relevant code was removed completely upstream.  r=glandium
* * *
Bug 924839 - Remove another patch already part of ICU 52.1.  See http://bugs.icu-project.org/trac/ticket/10290 for that.  r=gaston
* * *
Bug 924839 - Remove another patch already in ICU 52.1.  See http://bugs.icu-project.org/trac/ticket/10045 for more.  r=Norbert
* * *
Bug 924839 - Remove another patch already applied upstream.  See http://bugs.icu-project.org/trac/changeset/32937 for more.  r=gaston
* * *
Bug 924839 - Update the ICU update script to update to 52.1, *without* applying any of our local patches.  r=glandium
* * *
Bug 924839 - Make the ICU update script only do updating within intl/icu/source and nowhere else.  r=glandium
* * *
Bug 924839 - Implement the changes that would be made by |cd intl/; ./update-icu.sh http://source.icu-project.org/repos/icu/icu/tags/release-52-1/;|, run with the prior changesets' changes made (thus not applying any of our local patches).  These changes don't actually work without subsequent adjustments, but this provides a codebase upon which those adjustments can be made, for the purpose of generating local patches to be kept in intl/icu-patches/.  rs=the-usual-suspects
* * *
Bug 924839 - Update the bug 899722 local patch to make runConfigureICU not override CC/CXX on BSD systems.  r=gaston
* * *
Bug 924839 - Update the bug 724533 patch that makes ICU build with MozillaBuild on Windows.  r=glandium
* * *
Bug 924839 - Import an upstream patch fixing the genrb tool to properly handle the -R (--omitCollationRules) option.  See http://bugs.icu-project.org/trac/ticket/10043 for the original bug report and a link to the ultimate upstream landing.  r=Norbert
* * *
Bug 924839 - Import the upstream fix for http://bugs.icu-project.org/trac/ticket/10486 so that ICU with -DU_USING_ICU_NAMESPACE=0 will compile on Windows.  r=Norbert
* * *
Bug 924839 - Adjust the update script to update ICU, then to apply all local patches (rather than skipping the second step).  Thus if the update script is properly run, now, the final result should be no changes at all to the tree.  NOT REVIEWED YET
* * *
Bug 924839 - Update jstests that depend on CLDR locale data to match CLDR 24.  r=Norbert
2013-11-12 16:23:48 -08:00

1107 lines
31 KiB
C++

/*
*******************************************************************************
*
* Copyright (C) 2002-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uiter.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jan18
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/chariter.h"
#include "unicode/rep.h"
#include "unicode/uiter.h"
#include "unicode/utf.h"
#include "unicode/utf8.h"
#include "unicode/utf16.h"
#include "cstring.h"
U_NAMESPACE_USE
/* Evaluates to true if the integer value n has its lowest bit clear. */
#define IS_EVEN(n) (((n)&1)==0)
/*
 * Evaluates to true if the address held in pointer p is even.
 * The argument is parenthesized before the cast so that arbitrary pointer
 * expressions (e.g. conditional expressions) can be passed safely.
 */
#define IS_POINTER_EVEN(p) IS_EVEN((size_t)(p))
U_CDECL_BEGIN
/* No-Op UCharIterator implementation for illegal input --------------------- */
/* No-op getIndex: reports index 0 for every origin; the arguments are ignored. */
static int32_t U_CALLCONV
noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
    return 0;
}
/* No-op move: never moves anywhere and always reports index 0. */
static int32_t U_CALLCONV
noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
    return 0;
}
/*
 * No-op hasNext: there is never another code unit.
 * noopIterator installs this function for both hasNext and hasPrevious.
 */
static UBool U_CALLCONV
noopHasNext(UCharIterator * /*iter*/) {
    return FALSE;
}
/*
 * No-op current: always yields U_SENTINEL.
 * noopIterator installs this function for current, next and previous.
 */
static UChar32 U_CALLCONV
noopCurrent(UCharIterator * /*iter*/) {
    return U_SENTINEL;
}
/* No-op getState: there is no state to save, so always return UITER_NO_STATE. */
static uint32_t U_CALLCONV
noopGetState(const UCharIterator * /*iter*/) {
    return UITER_NO_STATE;
}
/*
 * No-op setState: there is no state that could be restored, so report
 * U_UNSUPPORTED_ERROR.
 *
 * Like the other setState implementations in this file, this does nothing
 * when pErrorCode is NULL or already indicates a failure, so that a direct
 * call through the function pointer neither dereferences NULL nor
 * overwrites an earlier error.
 */
static void U_CALLCONV
noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) {
    if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) {
        *pErrorCode=U_UNSUPPORTED_ERROR;
    }
}
/*
 * Template for an iterator over illegal input: all data fields are zero
 * and every function is one of the no-op implementations.
 */
static const UCharIterator noopIterator={
    0,              /* context */
    0,              /* length */
    0,              /* start */
    0,              /* index */
    0,              /* limit */
    0,              /* reservedField */
    noopGetIndex,
    noopMove,
    noopHasNext,    /* hasNext */
    noopHasNext,    /* hasPrevious */
    noopCurrent,    /* current */
    noopCurrent,    /* next */
    noopCurrent,    /* previous */
    NULL,           /* reserved function slot */
    noopGetState,
    noopSetState
};
/* UCharIterator implementation for simple strings -------------------------- */
/*
* This is an implementation of a code unit (UChar) iterator
* for UChar * strings.
*
* The UCharIterator.context field holds a pointer to the string.
*/
/*
 * getIndex for iterators that keep their position in the
 * start/index/limit/length fields.
 * Returns the field selected by origin (0 for UITER_ZERO),
 * or -1 if origin is not a valid value.
 */
static int32_t U_CALLCONV
stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
    int32_t result;

    switch(origin) {
    case UITER_ZERO:
        result=0;
        break;
    case UITER_START:
        result=iter->start;
        break;
    case UITER_CURRENT:
        result=iter->index;
        break;
    case UITER_LIMIT:
        result=iter->limit;
        break;
    case UITER_LENGTH:
        result=iter->length;
        break;
    default:
        /* invalid origin; callers are not expected to get here */
        result=-1;
        break;
    }
    return result;
}
/*
 * move for iterators that keep their position in the
 * start/index/limit/length fields.
 * Computes origin+delta, pins the result to [start, limit],
 * stores it as the new index and returns it.
 * Returns -1 without moving if origin is not a valid value.
 */
static int32_t U_CALLCONV
stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
    int32_t base;

    /* select the position that delta is relative to */
    if(origin==UITER_ZERO) {
        base=0;
    } else if(origin==UITER_START) {
        base=iter->start;
    } else if(origin==UITER_CURRENT) {
        base=iter->index;
    } else if(origin==UITER_LIMIT) {
        base=iter->limit;
    } else if(origin==UITER_LENGTH) {
        base=iter->length;
    } else {
        return -1; /* Error */
    }

    int32_t target=base+delta;

    /* pin to the iteration range */
    if(target<iter->start) {
        target=iter->start;
    } else if(target>iter->limit) {
        target=iter->limit;
    }

    iter->index=target;
    return target;
}
/* hasNext: TRUE while the current index is before the limit. */
static UBool U_CALLCONV
stringIteratorHasNext(UCharIterator *iter) {
    return iter->limit>iter->index;
}
/* hasPrevious: TRUE while the current index is after the start. */
static UBool U_CALLCONV
stringIteratorHasPrevious(UCharIterator *iter) {
    return iter->start<iter->index;
}
/*
 * current: returns the UChar at the current index without moving,
 * or U_SENTINEL if the index is at (or beyond) the limit.
 */
static UChar32 U_CALLCONV
stringIteratorCurrent(UCharIterator *iter) {
    if(iter->index>=iter->limit) {
        return U_SENTINEL;
    }
    const UChar *text=(const UChar *)(iter->context);
    return text[iter->index];
}
/*
 * next: returns the UChar at the current index and then advances by one,
 * or returns U_SENTINEL (without moving) if the index is at the limit.
 */
static UChar32 U_CALLCONV
stringIteratorNext(UCharIterator *iter) {
    if(iter->index>=iter->limit) {
        return U_SENTINEL;
    }
    const UChar *text=(const UChar *)(iter->context);
    UChar32 c=text[iter->index];
    ++iter->index;
    return c;
}
/*
 * previous: steps back by one and returns the UChar at the new index,
 * or returns U_SENTINEL (without moving) if the index is at the start.
 */
static UChar32 U_CALLCONV
stringIteratorPrevious(UCharIterator *iter) {
    if(iter->index<=iter->start) {
        return U_SENTINEL;
    }
    const UChar *text=(const UChar *)(iter->context);
    --iter->index;
    return text[iter->index];
}
/* getState: the saved state is simply the current index. */
static uint32_t U_CALLCONV
stringIteratorGetState(const UCharIterator *iter) {
    int32_t current=iter->index;
    return (uint32_t)current;
}
/*
 * setState: restores the index from a state value produced by
 * stringIteratorGetState().
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 * Sets U_ILLEGAL_ARGUMENT_ERROR if iter is NULL and
 * U_INDEX_OUTOFBOUNDS_ERROR if the state lies outside [start, limit].
 */
static void U_CALLCONV
stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }
    if(iter==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    int32_t target=(int32_t)state;
    if(target<iter->start || iter->limit<target) {
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return;
    }
    iter->index=target;
}
/*
 * Template for the UChar string iterator.
 * The data fields start out as zero; uiter_setString() copies this
 * template and then fills in context, length and limit.
 */
static const UCharIterator stringIterator={
    0,              /* context */
    0,              /* length */
    0,              /* start */
    0,              /* index */
    0,              /* limit */
    0,              /* reservedField */
    stringIteratorGetIndex,
    stringIteratorMove,
    stringIteratorHasNext,
    stringIteratorHasPrevious,
    stringIteratorCurrent,
    stringIteratorNext,
    stringIteratorPrevious,
    NULL,           /* reserved function slot */
    stringIteratorGetState,
    stringIteratorSetState
};
/*
 * Set up iter to iterate over the UChar string s.
 * length is the number of UChars, or -1 to have it determined with u_strlen().
 * If s is NULL or length is less than -1, iter becomes the no-op iterator.
 * Nothing happens if iter itself is NULL.
 */
U_CAPI void U_EXPORT2
uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) {
    if(iter==NULL) {
        return;
    }
    if(s==NULL || length<-1) {
        *iter=noopIterator;
        return;
    }

    *iter=stringIterator;
    iter->context=s;
    iter->length= length>=0 ? length : u_strlen(s);
    iter->limit=iter->length;
}
/* UCharIterator implementation for UTF-16BE strings ------------------------ */
/*
* This is an implementation of a code unit (UChar) iterator
* for UTF-16BE strings, i.e., strings in byte-vectors where
* each UChar is stored as a big-endian pair of bytes.
*
* The UCharIterator.context field holds a pointer to the string.
* Everything works just like with a normal UChar iterator (uiter_setString),
* except that UChars are assembled from byte pairs.
*/
/* internal helper function */
/*
 * Assemble the code unit with the given UChar index from its two bytes:
 * the byte at offset 2*index is the high byte, the following one the low byte.
 * The caller is responsible for index being in range.
 */
static inline UChar32
utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
    const uint8_t *bytes=(const uint8_t *)iter->context;
    UChar high=(UChar)bytes[2*index];
    UChar low=(UChar)bytes[2*index+1];
    return (high<<8)|low;
}
/*
 * current: returns the code unit at the current index without moving,
 * or U_SENTINEL if the index is at (or beyond) the limit.
 */
static UChar32 U_CALLCONV
utf16BEIteratorCurrent(UCharIterator *iter) {
    int32_t pos=iter->index;
    if(pos>=iter->limit) {
        return U_SENTINEL;
    }
    return utf16BEIteratorGet(iter, pos);
}
/*
 * next: advances the index by one and returns the code unit that was
 * at the old index, or returns U_SENTINEL (without moving) at the limit.
 */
static UChar32 U_CALLCONV
utf16BEIteratorNext(UCharIterator *iter) {
    int32_t pos=iter->index;
    if(pos>=iter->limit) {
        return U_SENTINEL;
    }
    iter->index=pos+1;
    return utf16BEIteratorGet(iter, pos);
}
/*
 * previous: steps the index back by one and returns the code unit at the
 * new index, or returns U_SENTINEL (without moving) at the start.
 */
static UChar32 U_CALLCONV
utf16BEIteratorPrevious(UCharIterator *iter) {
    if(iter->index<=iter->start) {
        return U_SENTINEL;
    }
    int32_t pos=iter->index-1;
    iter->index=pos;
    return utf16BEIteratorGet(iter, pos);
}
/*
 * Template for the UTF-16BE iterator.
 * Index handling and state are shared with the UChar string iterator;
 * only the functions that read code units are specific to UTF-16BE.
 */
static const UCharIterator utf16BEIterator={
    0,              /* context */
    0,              /* length */
    0,              /* start */
    0,              /* index */
    0,              /* limit */
    0,              /* reservedField */
    stringIteratorGetIndex,
    stringIteratorMove,
    stringIteratorHasNext,
    stringIteratorHasPrevious,
    utf16BEIteratorCurrent,
    utf16BEIteratorNext,
    utf16BEIteratorPrevious,
    NULL,           /* reserved function slot */
    stringIteratorGetState,
    stringIteratorSetState
};
/*
* Count the number of UChars in a UTF-16BE string before a terminating UChar NUL,
* i.e., before a pair of 0 bytes where the first 0 byte is at an even
* offset from s.
*/
/*
 * Returns the number of 2-byte units in s before the first unit
 * whose two bytes are both 0.
 */
static int32_t
utf16BE_strlen(const char *s) {
    if(!IS_POINTER_EVEN(s)) {
        /* odd address: step through the byte pairs by hand */
        const char *p;
        for(p=s; p[0]!=0 || p[1]!=0; p+=2) {
        }
        return (int32_t)((p-s)/2);
    }
    /*
     * even address: let u_strlen() do the search;
     * looking for a UChar NUL gives the same answer regardless of
     * the machine's byte order
     */
    return u_strlen((const UChar *)s);
}
/*
 * Set up iter to iterate over the UTF-16BE byte string s.
 * length counts bytes and must be even, or -1 if the string is terminated
 * by a UChar NUL (a pair of 0 bytes at an even offset from s).
 * If s is NULL or length is invalid, iter becomes the no-op iterator.
 * Nothing happens if iter itself is NULL.
 */
U_CAPI void U_EXPORT2
uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
    if(iter==NULL) {
        return;
    }
    /* allow only even-length strings (the input length counts bytes) */
    if(s==NULL || !(length==-1 || (length>=0 && IS_EVEN(length)))) {
        *iter=noopIterator;
        return;
    }

    /*
     * Convert the byte count into a UChar count but keep -1 as the
     * "NUL-terminated" marker. This deliberately avoids right-shifting
     * a negative value, whose result is implementation-defined.
     */
    if(length>=0) {
        length/=2;
    }

    if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
        /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */
        uiter_setString(iter, (const UChar *)s, length);
        return;
    }

    *iter=utf16BEIterator;
    iter->context=s;
    if(length>=0) {
        iter->length=length;
    } else {
        iter->length=utf16BE_strlen(s);
    }
    iter->limit=iter->length;
}
/* UCharIterator wrapper around CharacterIterator --------------------------- */
/*
* This is wrapper code around a C++ CharacterIterator to
* look like a C UCharIterator.
*
* The UCharIterator.context field holds a pointer to the CharacterIterator.
*/
/*
 * getIndex for the CharacterIterator wrapper: asks the wrapped
 * CharacterIterator for the index selected by origin
 * (0 for UITER_ZERO), or returns -1 if origin is not a valid value.
 */
static int32_t U_CALLCONV
characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
    if(origin==UITER_ZERO) {
        return 0;
    }
    if(origin==UITER_START) {
        return ((CharacterIterator *)(iter->context))->startIndex();
    }
    if(origin==UITER_CURRENT) {
        return ((CharacterIterator *)(iter->context))->getIndex();
    }
    if(origin==UITER_LIMIT) {
        return ((CharacterIterator *)(iter->context))->endIndex();
    }
    if(origin==UITER_LENGTH) {
        return ((CharacterIterator *)(iter->context))->getLength();
    }
    /* invalid origin; callers are not expected to get here */
    return -1;
}
/*
 * move for the CharacterIterator wrapper.
 * UITER_START, UITER_CURRENT and UITER_LIMIT are forwarded to
 * CharacterIterator::move() with the origin value cast to EOrigin.
 * UITER_ZERO and UITER_LENGTH are handled with setIndex() followed by
 * getIndex() to report where the iterator ended up.
 * Returns -1 if origin is not a valid value.
 */
static int32_t U_CALLCONV
characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
    CharacterIterator *wrapped=(CharacterIterator *)(iter->context);
    int32_t result;

    switch(origin) {
    case UITER_START:
    case UITER_CURRENT:
    case UITER_LIMIT:
        result=wrapped->move(delta, (CharacterIterator::EOrigin)origin);
        break;
    case UITER_ZERO:
        wrapped->setIndex(delta);
        result=wrapped->getIndex();
        break;
    case UITER_LENGTH:
        wrapped->setIndex(wrapped->getLength()+delta);
        result=wrapped->getIndex();
        break;
    default:
        /* invalid origin; callers are not expected to get here */
        result=-1;
        break;
    }
    return result;
}
/* hasNext: forwards to the wrapped CharacterIterator. */
static UBool U_CALLCONV
characterIteratorHasNext(UCharIterator *iter) {
    CharacterIterator *wrapped=(CharacterIterator *)(iter->context);
    return wrapped->hasNext();
}
/* hasPrevious: forwards to the wrapped CharacterIterator. */
static UBool U_CALLCONV
characterIteratorHasPrevious(UCharIterator *iter) {
    CharacterIterator *wrapped=(CharacterIterator *)(iter->context);
    return wrapped->hasPrevious();
}
/*
 * current: returns the wrapped iterator's current code unit.
 * A result of 0xffff is only passed through if the wrapped iterator
 * still has a next code unit; otherwise it is reported as U_SENTINEL.
 */
static UChar32 U_CALLCONV
characterIteratorCurrent(UCharIterator *iter) {
    CharacterIterator *wrapped=(CharacterIterator *)(iter->context);
    UChar32 c=wrapped->current();
    if(c==0xffff && !wrapped->hasNext()) {
        return U_SENTINEL;
    }
    return c;
}
/*
 * next: returns the result of nextPostInc() on the wrapped iterator,
 * or U_SENTINEL if the wrapped iterator has no next code unit.
 */
static UChar32 U_CALLCONV
characterIteratorNext(UCharIterator *iter) {
    CharacterIterator *wrapped=(CharacterIterator *)(iter->context);
    if(!wrapped->hasNext()) {
        return U_SENTINEL;
    }
    return wrapped->nextPostInc();
}
/*
 * previous: returns the result of previous() on the wrapped iterator,
 * or U_SENTINEL if the wrapped iterator has no previous code unit.
 */
static UChar32 U_CALLCONV
characterIteratorPrevious(UCharIterator *iter) {
    CharacterIterator *wrapped=(CharacterIterator *)(iter->context);
    if(!wrapped->hasPrevious()) {
        return U_SENTINEL;
    }
    return wrapped->previous();
}
/* getState: the saved state is the wrapped iterator's current index. */
static uint32_t U_CALLCONV
characterIteratorGetState(const UCharIterator *iter) {
    CharacterIterator *wrapped=(CharacterIterator *)(iter->context);
    return wrapped->getIndex();
}
/*
 * setState: restores the wrapped iterator's index from a state value
 * produced by characterIteratorGetState().
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 * Sets U_ILLEGAL_ARGUMENT_ERROR if iter or its context is NULL and
 * U_INDEX_OUTOFBOUNDS_ERROR if the state lies outside
 * [startIndex(), endIndex()] of the wrapped iterator.
 */
static void U_CALLCONV
characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }
    if(iter==NULL || iter->context==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    CharacterIterator *wrapped=(CharacterIterator *)(iter->context);
    int32_t target=(int32_t)state;
    if(target<wrapped->startIndex() || wrapped->endIndex()<target) {
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return;
    }
    wrapped->setIndex(target);
}
/*
 * Template for the CharacterIterator wrapper.
 * uiter_setCharacterIterator() copies this template and stores the
 * CharacterIterator pointer in context; the other data fields stay zero.
 */
static const UCharIterator characterIteratorWrapper={
    0,              /* context */
    0,              /* length */
    0,              /* start */
    0,              /* index */
    0,              /* limit */
    0,              /* reservedField */
    characterIteratorGetIndex,
    characterIteratorMove,
    characterIteratorHasNext,
    characterIteratorHasPrevious,
    characterIteratorCurrent,
    characterIteratorNext,
    characterIteratorPrevious,
    NULL,           /* reserved function slot */
    characterIteratorGetState,
    characterIteratorSetState
};
/*
 * Set up iter as a wrapper around the C++ CharacterIterator charIter.
 * If charIter is NULL, iter becomes the no-op iterator.
 * Nothing happens if iter itself is NULL.
 */
U_CAPI void U_EXPORT2
uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
    if(iter==NULL) {
        return;
    }
    if(charIter==NULL) {
        *iter=noopIterator;
        return;
    }
    *iter=characterIteratorWrapper;
    iter->context=charIter;
}
/* UCharIterator wrapper around Replaceable --------------------------------- */
/*
* This is an implementation of a code unit (UChar) iterator
* based on a Replaceable object.
*
* The UCharIterator.context field holds a pointer to the Replaceable.
* UCharIterator.length and UCharIterator.index hold Replaceable.length()
* and the iteration index.
*/
/*
 * current: returns charAt(index) of the Replaceable without moving,
 * or U_SENTINEL if the index is at (or beyond) the limit.
 */
static UChar32 U_CALLCONV
replaceableIteratorCurrent(UCharIterator *iter) {
    if(iter->index>=iter->limit) {
        return U_SENTINEL;
    }
    Replaceable *rep=(Replaceable *)(iter->context);
    return rep->charAt(iter->index);
}
/*
 * next: returns charAt() for the current index and advances the index
 * by one, or returns U_SENTINEL (without moving) at the limit.
 */
static UChar32 U_CALLCONV
replaceableIteratorNext(UCharIterator *iter) {
    if(iter->index>=iter->limit) {
        return U_SENTINEL;
    }
    Replaceable *rep=(Replaceable *)(iter->context);
    int32_t pos=iter->index;
    iter->index=pos+1;
    return rep->charAt(pos);
}
/*
 * previous: steps the index back by one and returns charAt() for the
 * new index, or returns U_SENTINEL (without moving) at the start.
 */
static UChar32 U_CALLCONV
replaceableIteratorPrevious(UCharIterator *iter) {
    if(iter->index<=iter->start) {
        return U_SENTINEL;
    }
    Replaceable *rep=(Replaceable *)(iter->context);
    int32_t pos=iter->index-1;
    iter->index=pos;
    return rep->charAt(pos);
}
/*
 * Template for the Replaceable iterator.
 * Index handling and state are shared with the UChar string iterator;
 * only the functions that read code units go through Replaceable::charAt().
 */
static const UCharIterator replaceableIterator={
    0,              /* context */
    0,              /* length */
    0,              /* start */
    0,              /* index */
    0,              /* limit */
    0,              /* reservedField */
    stringIteratorGetIndex,
    stringIteratorMove,
    stringIteratorHasNext,
    stringIteratorHasPrevious,
    replaceableIteratorCurrent,
    replaceableIteratorNext,
    replaceableIteratorPrevious,
    NULL,           /* reserved function slot */
    stringIteratorGetState,
    stringIteratorSetState
};
/*
 * Set up iter to iterate over the Replaceable object rep.
 * length and limit are both set from rep->length() at this time.
 * If rep is NULL, iter becomes the no-op iterator.
 * Nothing happens if iter itself is NULL.
 */
U_CAPI void U_EXPORT2
uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
    if(iter==NULL) {
        return;
    }
    if(rep==NULL) {
        *iter=noopIterator;
        return;
    }
    *iter=replaceableIterator;
    iter->context=rep;
    iter->length=rep->length();
    iter->limit=iter->length;
}
/* UCharIterator implementation for UTF-8 strings --------------------------- */
/*
* Possible, probably necessary only for an implementation for arbitrary
* converters:
* Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
* This would require turning reservedFn into a close function and
* introducing a uiter_close(iter).
*/
#define UITER_CNV_CAPACITY 16
/*
* Minimal implementation:
* Maintain a single-UChar buffer for an additional surrogate.
* The caller must not modify start and limit because they are used internally.
*
* Use UCharIterator fields as follows:
* context pointer to UTF-8 string
* length UTF-16 length of the string; -1 until lazy evaluation
* start current UTF-8 index
* index current UTF-16 index; may be -1="unknown" after setState()
* limit UTF-8 length of the string
* reservedField supplementary code point
*
* Since UCharIterator delivers 16-bit code units, the iteration can be
* currently in the middle of the byte sequence for a supplementary code point.
* In this case, reservedField will contain that code point and start will
* point to after the corresponding byte sequence. The UTF-16 index will be
* one less than what it would otherwise be corresponding to the UTF-8 index.
* Otherwise, reservedField will be 0.
*/
/*
* Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
* Add implementations that do not call strlen() for iteration but check for NUL.
*/
/*
 * getIndex for the UTF-8 iterator; all results are UTF-16 indexes.
 *
 * UITER_ZERO and UITER_START are always 0.
 * UITER_CURRENT: if the UTF-16 index is unknown (<0, after setState()),
 * it is recomputed by counting from the beginning of the string up to the
 * current UTF-8 index, and cached in iter->index.
 * UITER_LIMIT and UITER_LENGTH: if the UTF-16 length is not known yet (<0),
 * it is computed by counting to the end of the string and cached in
 * iter->length; an unknown UTF-16 index is computed and cached on the way.
 * Any other origin yields -1.
 */
static int32_t U_CALLCONV
utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
    const uint8_t *text=(const uint8_t *)iter->context;
    UChar32 c;
    int32_t bytePos, byteEnd, units;

    if(origin==UITER_ZERO || origin==UITER_START) {
        return 0;
    }

    if(origin==UITER_CURRENT) {
        if(iter->index>=0) {
            return iter->index;
        }
        /* unknown UTF-16 index: count code units from the beginning up to the UTF-8 index */
        bytePos=0;
        units=0;
        byteEnd=iter->start;
        while(bytePos<byteEnd) {
            U8_NEXT_OR_FFFD(text, bytePos, byteEnd, c);
            units+=U16_LENGTH(c);
        }
        /* in case setState() did not put us on a code point boundary */
        iter->start=bytePos;
        if(bytePos==iter->limit) {
            /* we counted the whole string, so the length is known now as well */
            iter->length=units;
        }
        if(iter->reservedField!=0) {
            /* in the middle of a supplementary code point */
            --units;
        }
        iter->index=units;
        return iter->index;
    }

    if(origin==UITER_LIMIT || origin==UITER_LENGTH) {
        if(iter->length>=0) {
            return iter->length;
        }
        if(iter->index<0) {
            /*
             * unknown UTF-16 index: first count from the beginning
             * up to the current UTF-8 index
             */
            bytePos=0;
            units=0;
            byteEnd=iter->start;
            while(bytePos<byteEnd) {
                U8_NEXT_OR_FFFD(text, bytePos, byteEnd, c);
                units+=U16_LENGTH(c);
            }
            /* in case setState() did not put us on a code point boundary */
            iter->start=bytePos;
            /* cache the UTF-16 index; one less when inside a surrogate pair */
            if(iter->reservedField!=0) {
                iter->index=units-1;
            } else {
                iter->index=units;
            }
        } else {
            /* start counting at the known position */
            bytePos=iter->start;
            units=iter->index;
            if(iter->reservedField!=0) {
                ++units;
            }
        }
        /* count the rest of the string */
        byteEnd=iter->limit;
        while(bytePos<byteEnd) {
            U8_NEXT_OR_FFFD(text, bytePos, byteEnd, c);
            units+=U16_LENGTH(c);
        }
        iter->length=units;
        return iter->length;
    }

    /* invalid origin; callers are not expected to get here */
    return -1;
}
/*
 * move for the UTF-8 iterator.
 *
 * Moves the iteration position by delta UTF-16 code units relative to origin,
 * pinning to the beginning and the end of the string.
 *
 * Fields used (see the description of the UTF-8 iterator fields in this file):
 * iter->start is the current UTF-8 index, iter->index the current UTF-16
 * index (or <0 if unknown), iter->length the UTF-16 length (or <0 if not yet
 * known), iter->limit the UTF-8 length, and iter->reservedField a
 * supplementary code point whose lead surrogate has been passed but whose
 * trail surrogate has not (0 otherwise).
 *
 * Returns the new UTF-16 index if it is known, UITER_UNKNOWN_INDEX if the
 * move was performed but the resulting UTF-16 index is not known,
 * or -1 if origin is not a valid value.
 */
static int32_t U_CALLCONV
utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
const uint8_t *s;
UChar32 c;
int32_t pos; /* requested UTF-16 index */
int32_t i; /* UTF-8 index */
UBool havePos; /* TRUE if pos is an absolute UTF-16 index, FALSE if only delta is usable */
/* calculate the requested UTF-16 index */
switch(origin) {
case UITER_ZERO:
case UITER_START:
pos=delta;
havePos=TRUE;
/* iter->index<0 (unknown) is possible */
break;
case UITER_CURRENT:
if(iter->index>=0) {
pos=iter->index+delta;
havePos=TRUE;
} else {
/* the current UTF-16 index is unknown after setState(), use only delta */
pos=0;
havePos=FALSE;
}
break;
case UITER_LIMIT:
case UITER_LENGTH:
if(iter->length>=0) {
pos=iter->length+delta;
havePos=TRUE;
} else {
/* pin to the end, avoid counting the length */
iter->index=-1;
iter->start=iter->limit;
iter->reservedField=0;
if(delta>=0) {
return UITER_UNKNOWN_INDEX;
} else {
/* the current UTF-16 index is unknown, use only delta */
pos=0;
havePos=FALSE;
}
}
break;
default:
return -1; /* Error */
}
if(havePos) {
/* shortcuts: pinning to the edges of the string */
if(pos<=0) {
iter->index=iter->start=iter->reservedField=0;
return 0;
} else if(iter->length>=0 && pos>=iter->length) {
iter->index=iter->length;
iter->start=iter->limit;
iter->reservedField=0;
return iter->index;
}
/* minimize the number of U8_NEXT/PREV operations */
if(iter->index<0 || pos<iter->index/2) {
/* go forward from the start instead of backward from the current index */
iter->index=iter->start=iter->reservedField=0;
} else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
/*
* if we have the UTF-16 index and length and the new position is
* closer to the end than the current index,
* then go backward from the end instead of forward from the current index
*/
iter->index=iter->length;
iter->start=iter->limit;
iter->reservedField=0;
}
/* from here on, delta is relative to the (possibly reset) current position */
delta=pos-iter->index;
if(delta==0) {
return iter->index; /* nothing to do */
}
} else {
/* move relative to unknown UTF-16 index */
if(delta==0) {
return UITER_UNKNOWN_INDEX; /* nothing to do */
} else if(-delta>=iter->start) {
/* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
iter->index=iter->start=iter->reservedField=0;
return 0;
} else if(delta>=(iter->limit-iter->start)) {
/* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
iter->index=iter->length; /* may or may not be <0 (unknown) */
iter->start=iter->limit;
iter->reservedField=0;
return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
}
}
/* delta!=0 */
/* move towards the requested position, pin to the edges of the string */
s=(const uint8_t *)iter->context;
pos=iter->index; /* could be <0 (unknown) */
i=iter->start;
if(delta>0) {
/* go forward */
int32_t limit=iter->limit;
if(iter->reservedField!=0) {
/* step over the pending trail surrogate; its bytes are already behind i */
iter->reservedField=0;
++pos;
--delta;
}
while(delta>0 && i<limit) {
U8_NEXT_OR_FFFD(s, i, limit, c);
if(c<=0xffff) {
++pos;
--delta;
} else if(delta>=2) {
pos+=2;
delta-=2;
} else /* delta==1 */ {
/* stop in the middle of a supplementary code point */
iter->reservedField=c;
++pos;
break; /* delta=0; */
}
}
if(i==limit) {
/* reached the end of the string: derive whichever of length/index is still unknown from the other */
if(iter->length<0 && iter->index>=0) {
iter->length= iter->reservedField==0 ? pos : pos+1;
} else if(iter->index<0 && iter->length>=0) {
iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
}
}
} else /* delta<0 */ {
/* go backward */
if(iter->reservedField!=0) {
iter->reservedField=0;
i-=4; /* we stayed behind the supplementary code point; go before it now */
--pos;
++delta;
}
while(delta<0 && i>0) {
U8_PREV_OR_FFFD(s, 0, i, c);
if(c<=0xffff) {
--pos;
++delta;
} else if(delta<=-2) {
pos-=2;
delta+=2;
} else /* delta==-1 */ {
/* stop in the middle of a supplementary code point */
i+=4; /* back to behind this supplementary code point for consistent state */
iter->reservedField=c;
--pos;
break; /* delta=0; */
}
}
}
/* store the new UTF-8 index */
iter->start=i;
if(iter->index>=0) {
return iter->index=pos;
} else {
/* we started with index<0 (unknown) so pos is bogus */
if(i<=1) {
return iter->index=i; /* reached the beginning */
} else {
/* we still don't know the UTF-16 index */
return UITER_UNKNOWN_INDEX;
}
}
}
/*
 * hasNext: TRUE if there are UTF-8 bytes left to read, or if the trail
 * surrogate of a supplementary code point is still pending.
 */
static UBool U_CALLCONV
utf8IteratorHasNext(UCharIterator *iter) {
    return iter->reservedField!=0 || iter->start<iter->limit;
}
/* hasPrevious: TRUE if the current UTF-8 index is after the beginning of the string. */
static UBool U_CALLCONV
utf8IteratorHasPrevious(UCharIterator *iter) {
    return 0<iter->start;
}
/*
 * current: returns the UTF-16 code unit at the current position without
 * changing the iterator.
 * In the middle of a supplementary code point this is its trail surrogate;
 * otherwise the next code point is decoded from a copy of the UTF-8 index
 * and either the code point itself (BMP) or its lead surrogate is returned.
 * Returns U_SENTINEL at the end of the string.
 */
static UChar32 U_CALLCONV
utf8IteratorCurrent(UCharIterator *iter) {
    if(iter->reservedField!=0) {
        return U16_TRAIL(iter->reservedField);
    }
    if(iter->start>=iter->limit) {
        return U_SENTINEL;
    }

    const uint8_t *text=(const uint8_t *)iter->context;
    UChar32 c;
    int32_t peek=iter->start; /* work on a copy so that iter->start does not move */
    U8_NEXT_OR_FFFD(text, peek, iter->limit, c);
    if(c>0xffff) {
        return U16_LEAD(c);
    }
    return c;
}
/*
 * next: returns the UTF-16 code unit at the current position and moves
 * forward by one code unit; returns U_SENTINEL at the end of the string.
 *
 * A supplementary code point is delivered in two calls: the first one
 * consumes its UTF-8 bytes, remembers the code point in reservedField and
 * returns the lead surrogate; the second one returns the trail surrogate
 * and clears reservedField.
 * The UTF-16 index is only updated while it is known (>=0); when the end
 * of the string is reached, an unknown length or index is derived from
 * the other value if that one is known.
 */
static UChar32 U_CALLCONV
utf8IteratorNext(UCharIterator *iter) {
    if(iter->reservedField!=0) {
        /* second half of a supplementary code point */
        UChar trail=U16_TRAIL(iter->reservedField);
        iter->reservedField=0;
        if(iter->index>=0) {
            ++iter->index;
        }
        return trail;
    }
    if(iter->start>=iter->limit) {
        return U_SENTINEL;
    }

    const uint8_t *text=(const uint8_t *)iter->context;
    UChar32 c;
    U8_NEXT_OR_FFFD(text, iter->start, iter->limit, c);

    UBool atEnd= iter->start==iter->limit;
    if(iter->index>=0) {
        int32_t newIndex=iter->index+1;
        iter->index=newIndex;
        if(iter->length<0 && atEnd) {
            /* now the UTF-16 length is known, too */
            iter->length= c<=0xffff ? newIndex : newIndex+1;
        }
    } else if(atEnd && iter->length>=0) {
        /* the UTF-16 index was unknown but can be derived from the length */
        iter->index= c<=0xffff ? iter->length : iter->length-1;
    }

    if(c>0xffff) {
        /* remember the code point so that the trail surrogate can be delivered next */
        iter->reservedField=c;
        return U16_LEAD(c);
    }
    return c;
}
/*
 * previous: moves backward by one UTF-16 code unit and returns it;
 * returns U_SENTINEL at the beginning of the string.
 *
 * A supplementary code point is delivered in two calls: the first one
 * returns the trail surrogate, remembers the code point in reservedField
 * and leaves the UTF-8 index behind the code point; the second one returns
 * the lead surrogate, clears reservedField and moves the UTF-8 index
 * before the code point.
 * The UTF-16 index is decremented while it is >0; otherwise it is set from
 * the UTF-8 index once that has come within one byte of the beginning.
 */
static UChar32 U_CALLCONV
utf8IteratorPrevious(UCharIterator *iter) {
    if(iter->reservedField!=0) {
        /* first half of a supplementary code point */
        UChar lead=U16_LEAD(iter->reservedField);
        iter->reservedField=0;
        iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
        if(iter->index>0) {
            --iter->index;
        }
        return lead;
    }
    if(iter->start<=0) {
        return U_SENTINEL;
    }

    const uint8_t *text=(const uint8_t *)iter->context;
    UChar32 c;
    U8_PREV_OR_FFFD(text, 0, iter->start, c);

    if(iter->index>0) {
        --iter->index;
    } else if(iter->start<=1) {
        iter->index= c<=0xffff ? iter->start : iter->start+1;
    }

    if(c>0xffff) {
        iter->start+=4; /* back to behind this supplementary code point for consistent state */
        iter->reservedField=c;
        return U16_TRAIL(c);
    }
    return c;
}
/*
 * getState for the UTF-8 iterator.
 * The state is the current UTF-8 index shifted left by one bit, with the
 * lowest bit set if the iterator is in the middle of a supplementary code
 * point (reservedField!=0).
 *
 * The index is converted to uint32_t before shifting so that the shift is
 * performed on an unsigned value; shifting the signed int32_t would
 * overflow (undefined behavior) for UTF-8 indexes of 2^30 and above.
 */
static uint32_t U_CALLCONV
utf8IteratorGetState(const UCharIterator *iter) {
    uint32_t state=(uint32_t)iter->start<<1;
    if(iter->reservedField!=0) {
        state|=1;
    }
    return state;
}
/*
 * setState for the UTF-8 iterator; state is a value as produced by
 * utf8IteratorGetState(): UTF-8 index in the upper 31 bits, "in the middle
 * of a supplementary code point" flag in the lowest bit.
 *
 * Does nothing if pErrorCode is NULL or already indicates a failure, or if
 * state equals the current state.
 * Sets U_ILLEGAL_ARGUMENT_ERROR if iter is NULL.
 * Sets U_INDEX_OUTOFBOUNDS_ERROR if the UTF-8 index is out of range
 * (it must be at least 4 when the flag is set), or if the flag is set but
 * the code point before the UTF-8 index is not supplementary.
 * After a successful call the UTF-16 index is unknown (-1) unless the
 * UTF-8 index is 0 or 1.
 */
static void U_CALLCONV
utf8IteratorSetState(UCharIterator *iter,
                     uint32_t state,
                     UErrorCode *pErrorCode)
{
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }
    if(iter==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if(state==utf8IteratorGetState(iter)) {
        /* setting to the current state: no-op */
        return;
    }

    int32_t bytePos=(int32_t)(state>>1);    /* UTF-8 index */
    UBool inSurrogatePair= (state&1)!=0;    /* requires bytePos>=4 */
    int32_t minPos= inSurrogatePair ? 4 : 0;
    if(bytePos<minPos || iter->limit<bytePos) {
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return;
    }

    /* restore the UTF-8 byte index */
    iter->start=bytePos;
    /* the UTF-16 index is only known very close to the beginning */
    iter->index= bytePos<=1 ? bytePos : -1;

    if(!inSurrogatePair) {
        iter->reservedField=0;
        return;
    }

    /* bytePos>=4 was verified above; fetch the code point that we are in the middle of */
    const uint8_t *text=(const uint8_t *)iter->context;
    UChar32 c;
    U8_PREV_OR_FFFD(text, 0, bytePos, c);
    if(c<=0xffff) {
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    } else {
        iter->reservedField=c;
    }
}
/*
 * Template for the UTF-8 iterator.
 * uiter_setUTF8() copies this template and then fills in
 * context, limit and length.
 */
static const UCharIterator utf8Iterator={
    0,              /* context */
    0,              /* length */
    0,              /* start */
    0,              /* index */
    0,              /* limit */
    0,              /* reservedField */
    utf8IteratorGetIndex,
    utf8IteratorMove,
    utf8IteratorHasNext,
    utf8IteratorHasPrevious,
    utf8IteratorCurrent,
    utf8IteratorNext,
    utf8IteratorPrevious,
    NULL,           /* reserved function slot */
    utf8IteratorGetState,
    utf8IteratorSetState
};
/*
 * Set up iter to iterate over the UTF-8 string s.
 * length is the number of bytes, or -1 to have it determined with
 * uprv_strlen().
 * The UTF-16 length is set right away only for strings of at most one
 * byte; otherwise it is left at -1 and gets computed later on demand.
 * If s is NULL or length is less than -1, iter becomes the no-op iterator.
 * Nothing happens if iter itself is NULL.
 */
U_CAPI void U_EXPORT2
uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
    if(iter==NULL) {
        return;
    }
    if(s==NULL || length<-1) {
        *iter=noopIterator;
        return;
    }

    *iter=utf8Iterator;
    iter->context=s;
    iter->limit= length>=0 ? length : (int32_t)uprv_strlen(s);
    if(iter->limit<=1) {
        iter->length=iter->limit;
    } else {
        iter->length=-1;
    }
}
/* Helper functions --------------------------------------------------------- */
/*
 * Returns the code point at the current position without changing the
 * position in the end.
 * If the current code unit is a lead surrogate followed by a trail
 * surrogate, or a trail surrogate preceded by a lead surrogate, the
 * supplementary code point is returned; otherwise the code unit itself
 * (which may be an unpaired surrogate or U_SENTINEL).
 */
U_CAPI UChar32 U_EXPORT2
uiter_current32(UCharIterator *iter) {
    UChar32 c=iter->current(iter);
    if(!U16_IS_SURROGATE(c)) {
        return c;
    }

    UChar32 other;
    if(U16_IS_SURROGATE_LEAD(c)) {
        /*
         * look at the following code unit;
         * c!=U_SENTINEL, so we are not at the limit
         */
        iter->move(iter, 1, UITER_CURRENT);
        other=iter->current(iter);
        if(U16_IS_TRAIL(other)) {
            c=U16_GET_SUPPLEMENTARY(c, other);
        }
        /* go back to where we were */
        iter->move(iter, -1, UITER_CURRENT);
    } else {
        /* trail surrogate: look at the preceding code unit */
        other=iter->previous(iter);
        if(U16_IS_LEAD(other)) {
            c=U16_GET_SUPPLEMENTARY(other, c);
        }
        if(other>=0) {
            /* previous() did move, go back to where we were */
            iter->move(iter, 1, UITER_CURRENT);
        }
    }
    return c;
}
/*
 * Returns the code point at the current position and moves behind it.
 * A lead surrogate followed by a trail surrogate is combined into a
 * supplementary code point; if a lead surrogate is followed by some other
 * code unit, that code unit is "unread" again and the lead surrogate is
 * returned by itself.
 */
U_CAPI UChar32 U_EXPORT2
uiter_next32(UCharIterator *iter) {
    UChar32 c=iter->next(iter);
    if(!U16_IS_LEAD(c)) {
        return c;
    }

    UChar32 following=iter->next(iter);
    if(U16_IS_TRAIL(following)) {
        return U16_GET_SUPPLEMENTARY(c, following);
    }
    if(following>=0) {
        /* unmatched first surrogate, undo index movement */
        iter->move(iter, -1, UITER_CURRENT);
    }
    return c;
}
/*
 * Moves before the code point that precedes the current position and
 * returns it.
 * A trail surrogate preceded by a lead surrogate is combined into a
 * supplementary code point; if a trail surrogate is preceded by some other
 * code unit, that code unit is "unread" again and the trail surrogate is
 * returned by itself.
 */
U_CAPI UChar32 U_EXPORT2
uiter_previous32(UCharIterator *iter) {
    UChar32 c=iter->previous(iter);
    if(!U16_IS_TRAIL(c)) {
        return c;
    }

    UChar32 preceding=iter->previous(iter);
    if(U16_IS_LEAD(preceding)) {
        return U16_GET_SUPPLEMENTARY(preceding, c);
    }
    if(preceding>=0) {
        /* unmatched second surrogate, undo index movement */
        iter->move(iter, 1, UITER_CURRENT);
    }
    return c;
}
/*
 * Returns the iterator's state via its getState function,
 * or UITER_NO_STATE if iter is NULL or has no getState function.
 */
U_CAPI uint32_t U_EXPORT2
uiter_getState(const UCharIterator *iter) {
    if(iter!=NULL && iter->getState!=NULL) {
        return iter->getState(iter);
    }
    return UITER_NO_STATE;
}
/*
 * Restores the iterator's state via its setState function.
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 * Sets U_ILLEGAL_ARGUMENT_ERROR if iter is NULL and
 * U_UNSUPPORTED_ERROR if the iterator has no setState function.
 */
U_CAPI void U_EXPORT2
uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }
    if(iter==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if(iter->setState==NULL) {
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return;
    }
    iter->setState(iter, state, pErrorCode);
}
U_CDECL_END