mirror of
https://gitlab.winehq.org/wine/wine-gecko.git
synced 2024-09-13 09:24:08 -07:00
1370 lines
48 KiB
C++
1370 lines
48 KiB
C++
// Copyright 2013 Google Inc. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
//
|
|
// State Table follower for scanning UTF-8 strings without converting to
|
|
// 32- or 16-bit Unicode values.
|
|
//
|
|
|
|
#ifdef COMPILER_MSVC
|
|
// MSVC warns: warning C4309: 'initializing' : truncation of constant value
|
|
// But the value is in fact not truncated. 0xFF still comes out 0xFF at
|
|
// runtime.
|
|
#pragma warning ( disable : 4309 )
|
|
#endif
|
|
|
|
#include "utf8statetable.h"
|
|
|
|
#include <stdint.h> // for uintptr_t
|
|
#include <string.h> // for NULL, memcpy, memmove
|
|
|
|
#include "integral_types.h" // for uint8, uint32, int8
|
|
#include "stringpiece.h"
|
|
#include "offsetmap.h"
|
|
|
|
|
|
namespace CLD2 {
|
|
|
|
static const int kReplaceAndResumeFlag = 0x80; // Bit in del byte to distinguish
|
|
// optional next-state field
|
|
// after replacement text
|
|
static const int kHtmlPlaintextFlag = 0x80; // Bit in add byte to distinguish
|
|
// HTML replacement vs. plaintext
|
|
|
|
|
|
/**
|
|
* This code implements a little interpreter for UTF8 state
|
|
* tables. There are three kinds of quite-similar state tables,
|
|
* property, scanning, and replacement. Each state in one of
|
|
* these tables consists of an array of 256 or 64 one-byte
|
|
* entries. The state is subscripted by an incoming source byte,
|
|
* and the entry either specifies the next state or specifies an
|
|
* action. Space-optimized tables have full 256-entry states for
|
|
* the first byte of a UTF-8 character, but only 64-entry states
|
|
* for continuation bytes. Space-optimized tables may only be
|
|
* used with source input that has been checked to be
|
|
* structurally- (or stronger interchange-) valid.
|
|
*
|
|
* A property state table has an unsigned one-byte property for
|
|
* each possible UTF-8 character. One-byte character properties
|
|
* are in the state[0] array, while for other lengths the
|
|
* state[0] array gives the next state, which contains the
|
|
* property value for two-byte characters or yet another state
|
|
* for longer ones. The code simply loads the right number of
|
|
* next-state values, then returns the final byte as property
|
|
* value. There are no actions specified in property tables.
|
|
* States are typically shared for multi-byte UTF-8 characters
|
|
* that all have the same property value.
|
|
*
|
|
* A scanning state table has entries that are either a
|
|
* next-state specifier for bytes that are accepted by the
|
|
* scanner, or an exit action for the last byte of each
|
|
* character that is rejected by the scanner.
|
|
*
|
|
* Scanning long strings involves a tight loop that picks up one
|
|
* byte at a time and follows next-state value back to state[0]
|
|
* for each accepted UTF-8 character. Scanning stops at the end
|
|
* of the string or at the first character encountered that has
|
|
* an exit action such as "reject". Timing information is given
|
|
* below.
|
|
*
|
|
* Since so much of Google's text is 7-bit-ASCII values
|
|
* (approximately 94% of the bytes of web documents), the
|
|
* scanning interpreter has two speed optimizations. One checks
|
|
* 8 bytes at a time to see if they are all in the range lo..hi,
|
|
* as specified in constants in the overall statetable object.
|
|
* The check involves ORing together four 4-byte values that
|
|
* overflow into the high bit of some byte when a byte is out of
|
|
* range. For seven-bit-ASCII, lo is 0x20 and hi is 0x7E. This
|
|
* loop is about 8x faster than the one-byte-at-a-time loop.
|
|
*
|
|
* If checking for exit bytes in the 0x00-0x1F and 7F range is
|
|
* unneeded, an even faster loop just looks at the high bits of
|
|
* 8 bytes at once, and is about 1.33x faster than the lo..hi
|
|
* loop.
|
|
*
|
|
* Exit from the scanning routines backs up to the first byte of
|
|
* the rejected character, so the text spanned is always a
|
|
* complete number of UTF-8 characters. The normal scanning exit
|
|
* is at the first rejected character, or at the end of the
|
|
* input text. Scanning also exits on any detected ill-formed
|
|
* character or at a special do-again action built into some
|
|
* exit-optimized tables. The do-again action gets back to the
|
|
* top of the scanning loop to retry eight-byte ASCII scans. It
|
|
* is typically put into state tables after four seven-bit-ASCII
|
|
* characters in a row are seen, to allow restarting the fast
|
|
* scan after some slower processing of multi-byte characters.
|
|
*
|
|
* A replacement state table is similar to a scanning state
|
|
* table but has more extensive actions. The default
|
|
* byte-at-a-time loop copies one byte from source to
|
|
* destination and goes to the next state. The replacement
|
|
* actions overwrite 1-3 bytes of the destination with different
|
|
* bytes, possibly shortening the output by 1 or 2 bytes. The
|
|
* replacement bytes come from within the state table, from
|
|
* dummy states inserted just after any state that contains a
|
|
* replacement action. This gives a quick address calculation for
|
|
* the replacement byte(s) and gives some cache locality.
|
|
*
|
|
* Additional replacement actions use one or two bytes from
|
|
* within dummy states to index a side table of more-extensive
|
|
* replacements. The side table specifies a length of 0..15
|
|
* destination bytes to overwrite and a length of 0..127 bytes
|
|
* to overwrite them with, plus the actual replacement bytes.
|
|
*
|
|
* This side table uses one extra bit to specify a pair of
|
|
* replacements, the first to be used in an HTML context and the
|
|
* second to be used in a plaintext context. This allows
|
|
 * replacements that are spelled with "&lt;" in the former
|
|
* context and "<" in the latter.
|
|
*
|
|
* The side table also uses an extra bit to specify a non-zero
|
|
* next state after a replacement. This allows a combination
|
|
* replacement and state change, used to implement a limited
|
|
* version of the Boyer-Moore algorithm for multi-character
|
|
* replacement without backtracking. This is useful when there
|
|
* are overlapping replacements, such as ch => x and also c =>
|
|
* y, the latter to be used only if the character after c is not
|
|
 * h. In this case, the state[0] table's entry for c would
|
|
* change c to y and also have a next-state of say n, and the
|
|
* state[n] entry for h would specify a replacement of the two
|
|
* bytes yh by x. No backtracking is needed.
|
|
*
|
|
* A replacement table may also include the exit actions of a
|
|
* scanning state table, so some character sequences can
|
|
* terminate early.
|
|
*
|
|
* During replacement, an optional data structure called an
|
|
* offset map can be updated to reflect each change in length
|
|
* between source and destination. This offset map can later be
|
|
* used to map destination-string offsets to corresponding
|
|
* source-string offsets or vice versa.
|
|
*
|
|
* The routines below also have variants in which state-table
|
|
* entries are all two bytes instead of one byte. This allows
|
|
* tables with more than 240 total states, but takes up twice as
|
|
* much space per state.
|
|
*
|
|
**/
|
|
|
|
// Return true if current Tbl pointer is within state0 range
|
|
// Note that unsigned compare checks both ends of range simultaneously
|
|
static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
|
|
const uint8* Tbl0 = &st->state_table[st->state0];
|
|
return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
|
|
}
|
|
|
|
static inline bool InStateZero_2(const UTF8ReplaceObj_2* st,
|
|
const unsigned short int* Tbl) {
|
|
const unsigned short int* Tbl0 = &st->state_table[st->state0];
|
|
// Word difference, not byte difference
|
|
return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
|
|
}
|
|
|
|
// UTF8PropObj, UTF8ScanObj, UTF8ReplaceObj are all typedefs of
|
|
// UTF8StateMachineObj.
|
|
|
|
static bool IsPropObj(const UTF8StateMachineObj& obj) {
|
|
return obj.fast_state == NULL
|
|
&& obj.max_expand == 0;
|
|
}
|
|
|
|
static bool IsPropObj_2(const UTF8StateMachineObj_2& obj) {
|
|
return obj.fast_state == NULL
|
|
&& obj.max_expand == 0;
|
|
}
|
|
|
|
static bool IsScanObj(const UTF8StateMachineObj& obj) {
|
|
return obj.fast_state != NULL
|
|
&& obj.max_expand == 0;
|
|
}
|
|
|
|
static bool IsReplaceObj(const UTF8StateMachineObj& obj) {
|
|
// Normally, obj.fast_state != NULL, but the handwritten tables
|
|
// in utf8statetable_unittest don't handle fast_states.
|
|
return obj.max_expand > 0;
|
|
}
|
|
|
|
static bool IsReplaceObj_2(const UTF8StateMachineObj_2& obj) {
|
|
return obj.max_expand > 0;
|
|
}
|
|
|
|
// Look up property of one UTF-8 character and advance over it
|
|
// Return 0 if input length is zero
|
|
// Return 0 and advance one byte if input is ill-formed
|
|
uint8 UTF8GenericProperty(const UTF8PropObj* st,
|
|
const uint8** src,
|
|
int* srclen) {
|
|
if (*srclen <= 0) {
|
|
return 0;
|
|
}
|
|
|
|
const uint8* lsrc = *src;
|
|
const uint8* Tbl_0 = &st->state_table[st->state0];
|
|
const uint8* Tbl = Tbl_0;
|
|
int e;
|
|
int eshift = st->entry_shift;
|
|
|
|
// Short series of tests faster than switch, optimizes 7-bit ASCII
|
|
unsigned char c = lsrc[0];
|
|
if (static_cast<signed char>(c) >= 0) { // one byte
|
|
e = Tbl[c];
|
|
*src += 1;
|
|
*srclen -= 1;
|
|
} else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
*src += 2;
|
|
*srclen -= 2;
|
|
} else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[2]];
|
|
*src += 3;
|
|
*srclen -= 3;
|
|
}else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[2]];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[3]];
|
|
*src += 4;
|
|
*srclen -= 4;
|
|
} else { // Ill-formed
|
|
e = 0;
|
|
*src += 1;
|
|
*srclen -= 1;
|
|
}
|
|
return e;
|
|
}
|
|
|
|
bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src) {
|
|
const uint8* lsrc = reinterpret_cast<const uint8*>(src);
|
|
const uint8* Tbl_0 = &st.state_table[st.state0];
|
|
const uint8* Tbl = Tbl_0;
|
|
int e;
|
|
int eshift = st.entry_shift;
|
|
|
|
// Short series of tests faster than switch, optimizes 7-bit ASCII
|
|
unsigned char c = lsrc[0];
|
|
if (static_cast<signed char>(c) >= 0) { // one byte
|
|
e = Tbl[c];
|
|
} else if ((c & 0xe0) == 0xc0) { // two bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
} else if ((c & 0xf0) == 0xe0) { // three bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[2]];
|
|
} else { // four bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[2]];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[3]];
|
|
}
|
|
return e;
|
|
}
|
|
|
|
|
|
// BigOneByte versions are needed for tables > 240 states, but most
|
|
// won't need the TwoByte versions.
|
|
// Internally, to next-to-last offset is multiplied by 16 and the last
|
|
// offset is relative instead of absolute.
|
|
// Look up property of one UTF-8 character and advance over it
|
|
// Return 0 if input length is zero
|
|
// Return 0 and advance one byte if input is ill-formed
|
|
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
|
|
const uint8** src,
|
|
int* srclen) {
|
|
if (*srclen <= 0) {
|
|
return 0;
|
|
}
|
|
|
|
const uint8* lsrc = *src;
|
|
const uint8* Tbl_0 = &st->state_table[st->state0];
|
|
const uint8* Tbl = Tbl_0;
|
|
int e;
|
|
int eshift = st->entry_shift;
|
|
|
|
// Short series of tests faster than switch, optimizes 7-bit ASCII
|
|
unsigned char c = lsrc[0];
|
|
if (static_cast<signed char>(c) >= 0) { // one byte
|
|
e = Tbl[c];
|
|
*src += 1;
|
|
*srclen -= 1;
|
|
} else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
*src += 2;
|
|
*srclen -= 2;
|
|
} else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
|
|
e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
|
|
Tbl = &Tbl[e << eshift]; // Relative +/-
|
|
e = Tbl[lsrc[2]];
|
|
*src += 3;
|
|
*srclen -= 3;
|
|
}else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
|
|
e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
|
|
Tbl = &Tbl[e << eshift]; // Relative +/-
|
|
e = Tbl[lsrc[3]];
|
|
*src += 4;
|
|
*srclen -= 4;
|
|
} else { // Ill-formed
|
|
e = 0;
|
|
*src += 1;
|
|
*srclen -= 1;
|
|
}
|
|
return e;
|
|
}
|
|
|
|
// BigOneByte versions are needed for tables > 240 states, but most
|
|
// won't need the TwoByte versions.
|
|
bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src) {
|
|
const uint8* lsrc = reinterpret_cast<const uint8*>(src);
|
|
const uint8* Tbl_0 = &st.state_table[st.state0];
|
|
const uint8* Tbl = Tbl_0;
|
|
int e;
|
|
int eshift = st.entry_shift;
|
|
|
|
// Short series of tests faster than switch, optimizes 7-bit ASCII
|
|
unsigned char c = lsrc[0];
|
|
if (static_cast<signed char>(c) >= 0) { // one byte
|
|
e = Tbl[c];
|
|
} else if ((c & 0xe0) == 0xc0) { // two bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
} else if ((c & 0xf0) == 0xe0) { // three bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
|
|
e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
|
|
Tbl = &Tbl[e << eshift]; // Relative +/-
|
|
e = Tbl[lsrc[2]];
|
|
} else { // four bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
|
|
e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
|
|
Tbl = &Tbl[e << eshift]; // Relative +/-
|
|
e = Tbl[lsrc[3]];
|
|
}
|
|
return e;
|
|
}
|
|
|
|
|
|
// TwoByte versions are needed for tables > 240 states
|
|
// Look up property of one UTF-8 character and advance over it
|
|
// Return 0 if input length is zero
|
|
// Return 0 and advance one byte if input is ill-formed
|
|
uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
|
|
const uint8** src,
|
|
int* srclen) {
|
|
if (*srclen <= 0) {
|
|
return 0;
|
|
}
|
|
|
|
const uint8* lsrc = *src;
|
|
const unsigned short* Tbl_0 = &st->state_table[st->state0];
|
|
const unsigned short* Tbl = Tbl_0;
|
|
int e;
|
|
int eshift = st->entry_shift;
|
|
|
|
// Short series of tests faster than switch, optimizes 7-bit ASCII
|
|
unsigned char c = lsrc[0];
|
|
if (static_cast<signed char>(c) >= 0) { // one byte
|
|
e = Tbl[c];
|
|
*src += 1;
|
|
*srclen -= 1;
|
|
} else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
*src += 2;
|
|
*srclen -= 2;
|
|
} else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[2]];
|
|
*src += 3;
|
|
*srclen -= 3;
|
|
}else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[2]];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[3]];
|
|
*src += 4;
|
|
*srclen -= 4;
|
|
} else { // Ill-formed
|
|
e = 0;
|
|
*src += 1;
|
|
*srclen -= 1;
|
|
}
|
|
return e;
|
|
}
|
|
|
|
// TwoByte versions are needed for tables > 240 states
|
|
bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src) {
|
|
const uint8* lsrc = reinterpret_cast<const uint8*>(src);
|
|
const unsigned short* Tbl_0 = &st.state_table[st.state0];
|
|
const unsigned short* Tbl = Tbl_0;
|
|
int e;
|
|
int eshift = st.entry_shift;
|
|
|
|
// Short series of tests faster than switch, optimizes 7-bit ASCII
|
|
unsigned char c = lsrc[0];
|
|
if (static_cast<signed char>(c) >= 0) { // one byte
|
|
e = Tbl[c];
|
|
} else if ((c & 0xe0) == 0xc0) { // two bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
} else if ((c & 0xf0) == 0xe0) { // three bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[2]];
|
|
} else { // four bytes
|
|
e = Tbl[c];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[1]];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[2]];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
e = Tbl[lsrc[3]];
|
|
}
|
|
return e;
|
|
}
|
|
|
|
|
|
// Approximate speeds on 2.8 GHz Pentium 4:
|
|
// GenericScan 1-byte loop 300 MB/sec *
|
|
// GenericScan 4-byte loop 1200 MB/sec
|
|
// GenericScan 8-byte loop 2400 MB/sec *
|
|
// GenericScanFastAscii 4-byte loop 3000 MB/sec
|
|
// GenericScanFastAscii 8-byte loop 3200 MB/sec *
|
|
//
|
|
// * Implemented below. FastAscii loop is memory-bandwidth constrained.
|
|
|
|
// Scan a UTF-8 stringpiece based on state table.
|
|
// Always scan complete UTF-8 characters
|
|
// Set number of bytes scanned. Return reason for exiting
|
|
int UTF8GenericScan(const UTF8ScanObj* st,
|
|
const StringPiece& str,
|
|
int* bytes_consumed) {
|
|
int eshift = st->entry_shift; // 6 (space optimized) or 8
|
|
// int nEntries = (1 << eshift); // 64 or 256 entries per state
|
|
|
|
const uint8* isrc =
|
|
reinterpret_cast<const uint8*>(str.data());
|
|
const uint8* src = isrc;
|
|
const int len = str.length();
|
|
const uint8* srclimit = isrc + len;
|
|
const uint8* srclimit8 = srclimit - 7;
|
|
*bytes_consumed = 0;
|
|
if (len == 0) return kExitOK;
|
|
|
|
const uint8* Tbl_0 = &st->state_table[st->state0];
|
|
|
|
DoAgain:
|
|
// Do state-table scan
|
|
int e = 0;
|
|
uint8 c;
|
|
|
|
// Do fast for groups of 8 identity bytes.
|
|
// This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
|
|
// including slowing slightly on cr/lf/ht
|
|
//----------------------------
|
|
const uint8* Tbl2 = &st->fast_state[0];
|
|
uint32 losub = st->losub;
|
|
uint32 hiadd = st->hiadd;
|
|
while (src < srclimit8) {
|
|
uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
|
|
uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
|
|
src += 8;
|
|
// This is a fast range check for all bytes in [lowsub..0x80-hiadd)
|
|
uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
|
|
(s4567 - losub) | (s4567 + hiadd);
|
|
if ((temp & 0x80808080) != 0) {
|
|
// We typically end up here on cr/lf/ht; src was incremented
|
|
int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
|
|
(Tbl2[src[-6]] | Tbl2[src[-5]]);
|
|
if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange
|
|
e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
|
|
(Tbl2[src[-2]] | Tbl2[src[-1]]);
|
|
if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange
|
|
// Else OK, go around again
|
|
}
|
|
}
|
|
//----------------------------
|
|
|
|
// Byte-at-a-time scan
|
|
//----------------------------
|
|
const uint8* Tbl = Tbl_0;
|
|
while (src < srclimit) {
|
|
c = *src;
|
|
e = Tbl[c];
|
|
src++;
|
|
if (e >= kExitIllegalStructure) {break;}
|
|
Tbl = &Tbl_0[e << eshift];
|
|
}
|
|
//----------------------------
|
|
|
|
|
|
// Exit possibilities:
|
|
// Some exit code, !state0, back up over last char
|
|
// Some exit code, state0, back up one byte exactly
|
|
// source consumed, !state0, back up over partial char
|
|
// source consumed, state0, exit OK
|
|
// For illegal byte in state0, avoid backup up over PREVIOUS char
|
|
// For truncated last char, back up to beginning of it
|
|
|
|
if (e >= kExitIllegalStructure) {
|
|
// Back up over exactly one byte of rejected/illegal UTF-8 character
|
|
src--;
|
|
// Back up more if needed
|
|
if (!InStateZero(st, Tbl)) {
|
|
do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
|
|
}
|
|
} else if (!InStateZero(st, Tbl)) {
|
|
// Back up over truncated UTF-8 character
|
|
e = kExitIllegalStructure;
|
|
do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
|
|
} else {
|
|
// Normal termination, source fully consumed
|
|
e = kExitOK;
|
|
}
|
|
|
|
if (e == kExitDoAgain) {
|
|
// Loop back up to the fast scan
|
|
goto DoAgain;
|
|
}
|
|
|
|
*bytes_consumed = src - isrc;
|
|
return e;
|
|
}
|
|
|
|
// Scan a UTF-8 stringpiece based on state table.
|
|
// Always scan complete UTF-8 characters
|
|
// Set number of bytes scanned. Return reason for exiting
|
|
// OPTIMIZED for case of 7-bit ASCII 0000..007f all valid
|
|
int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
|
|
const StringPiece& str,
|
|
int* bytes_consumed) {
|
|
const uint8* isrc =
|
|
reinterpret_cast<const uint8*>(str.data());
|
|
const uint8* src = isrc;
|
|
const int len = str.length();
|
|
const uint8* srclimit = isrc + len;
|
|
const uint8* srclimit8 = srclimit - 7;
|
|
*bytes_consumed = 0;
|
|
if (len == 0) return kExitOK;
|
|
|
|
int n;
|
|
int rest_consumed;
|
|
int exit_reason;
|
|
do {
|
|
// Skip 8 bytes of ASCII at a whack; no endianness issue
|
|
while ((src < srclimit8) &&
|
|
(((reinterpret_cast<const uint32*>(src)[0] |
|
|
reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
|
|
src += 8;
|
|
}
|
|
// Run state table on the rest
|
|
n = src - isrc;
|
|
StringPiece str2(str.data() + n, str.length() - n);
|
|
exit_reason = UTF8GenericScan(st, str2, &rest_consumed);
|
|
src += rest_consumed;
|
|
} while ( exit_reason == kExitDoAgain );
|
|
|
|
*bytes_consumed = src - isrc;
|
|
return exit_reason;
|
|
}
|
|
|
|
// Hack to change halfwidth katakana to match an old UTF8CharToLower()
|
|
|
|
// Return number of src bytes skipped
|
|
static int DoSpecialFixup(const unsigned char c,
|
|
const unsigned char** srcp, const unsigned char* srclimit,
|
|
unsigned char** dstp, unsigned char* dstlimit) {
|
|
return 0;
|
|
}
|
|
|
|
|
|
// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
|
|
// and doing text replacements.
|
|
// DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below
|
|
// Needs caller to loop on kExitDoAgain
|
|
//
// Parameters:
//   st              replacement state table (one-byte entries) plus its
//                   remap_base / remap_string side tables
//   istr            source bytes
//   ostr            destination buffer; it is written in place through a
//                   const_cast of ostr.data(), ostr itself is not resized
//   is_plain_text   for paired side-table entries, selects which entry of
//                   the pair is applied
//   bytes_consumed  out: source bytes consumed (zeroed on entry)
//   bytes_filled    out: destination bytes written (zeroed on entry)
//   chars_changed   out: number of replacements applied (zeroed on entry)
//   offsetmap       optional, may be NULL; receives Copy/Insert/Delete
//                   records describing each length change
// Returns kExitOK when the source was fully consumed in state0,
// kExitIllegalStructure for a truncated final character, kExitDstSpaceFull
// when the destination cannot hold the result, or any other exit code from
// the table that the switch below does not treat as a replacement.
static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st,
|
|
const StringPiece& istr,
|
|
StringPiece& ostr,
|
|
bool is_plain_text,
|
|
int* bytes_consumed,
|
|
int* bytes_filled,
|
|
int* chars_changed,
|
|
OffsetMap* offsetmap) {
|
|
int eshift = st->entry_shift;
|
|
int nEntries = (1 << eshift); // 64 or 256 entries per state
|
|
const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());
|
|
const int ilen = istr.length();
|
|
// copystart marks the first source byte not yet reported to offsetmap
const uint8* copystart = isrc;
|
|
const uint8* src = isrc;
|
|
const uint8* srclimit = src + ilen;
|
|
*bytes_consumed = 0;
|
|
*bytes_filled = 0;
|
|
*chars_changed = 0;
|
|
|
|
const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());
|
|
const int olen = ostr.length();
|
|
uint8* dst = const_cast<uint8*>(odst);
|
|
uint8* dstlimit = dst + olen;
|
|
|
|
int total_changed = 0;
|
|
|
|
// Invariant condition during replacements:
|
|
// remaining dst size >= remaining src size
|
|
// If it does not even hold on entry, give up before copying anything;
// the three output counts stay at zero.
if ((dstlimit - dst) < (srclimit - src)) {
|
|
if (offsetmap != NULL) {
|
|
offsetmap->Copy(src - copystart);
|
|
copystart = src;
|
|
}
|
|
return kExitDstSpaceFull;
|
|
}
|
|
const uint8* Tbl_0 = &st->state_table[st->state0];
|
|
|
|
// Re-entry point after a replacement: restart in state0
Do_state_table:
|
|
// Do state-table scan, copying as we go
|
|
const uint8* Tbl = Tbl_0;
|
|
int e = 0;
|
|
uint8 c = 0;
|
|
|
|
// Re-entry point that keeps the Tbl chosen by a replace-and-resume entry
Do_state_table_newe:
|
|
|
|
//----------------------------
|
|
// Each byte is stored to dst before its table entry is examined, so the
// replacement actions below patch bytes that are already in the output
// (dst[-1], dst[-2], ...).
while (src < srclimit) {
|
|
c = *src;
|
|
e = Tbl[c];
|
|
*dst = c;
|
|
src++;
|
|
dst++;
|
|
if (e >= kExitIllegalStructure) {break;}
|
|
Tbl = &Tbl_0[e << eshift];
|
|
}
|
|
//----------------------------
|
|
|
|
// Exit possibilities:
|
|
// Replacement code, do the replacement and loop
|
|
// Some other exit code, state0, back up one byte exactly
|
|
// Some other exit code, !state0, back up over last char
|
|
// source consumed, state0, exit OK
|
|
// source consumed, !state0, back up over partial char
|
|
// For illegal byte in state0, avoid backing up over PREVIOUS char
|
|
// For truncated last char, back up to beginning of it
|
|
|
|
if (e >= kExitIllegalStructure) {
|
|
// Switch on exit code; most loop back to top
|
|
// offset accumulates the index into st->remap_base for the
// kExitReplaceOffset* cases
int offset = 0;
|
|
switch (e) {
|
|
// These all make the output string the same size or shorter
|
|
// No checking needed
|
|
case kExitReplace31: // del 3, add 1 bytes to change
|
|
dst -= 2;
|
|
if (offsetmap != NULL) {
|
|
offsetmap->Copy(src - copystart - 2);
|
|
offsetmap->Delete(2);
|
|
copystart = src;
|
|
}
|
|
dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
|
|
total_changed++;
|
|
goto Do_state_table;
|
|
case kExitReplace32: // del 3, add 2 bytes to change
|
|
dst--;
|
|
if (offsetmap != NULL) {
|
|
offsetmap->Copy(src - copystart - 1);
|
|
offsetmap->Delete(1);
|
|
copystart = src;
|
|
}
|
|
dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)];
|
|
dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
|
|
total_changed++;
|
|
goto Do_state_table;
|
|
case kExitReplace21: // del 2, add 1 bytes to change
|
|
dst--;
|
|
if (offsetmap != NULL) {
|
|
offsetmap->Copy(src - copystart - 1);
|
|
offsetmap->Delete(1);
|
|
copystart = src;
|
|
}
|
|
dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
|
|
total_changed++;
|
|
goto Do_state_table;
|
|
// Same-length replacements: output length is unchanged, so offsetmap is
// not touched. Replacement bytes are read from Tbl at multiples of
// nEntries past the current entry.
case kExitReplace3: // update 3 bytes to change
|
|
dst[-3] = (unsigned char)Tbl[c + (nEntries * 3)];
|
|
// Fall into next case
|
|
case kExitReplace2: // update 2 bytes to change
|
|
dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)];
|
|
// Fall into next case
|
|
case kExitReplace1: // update 1 byte to change
|
|
dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
|
|
total_changed++;
|
|
goto Do_state_table;
|
|
case kExitReplace1S0: // update 1 byte to change, 256-entry state
|
|
dst[-1] = (unsigned char)Tbl[c + (256 * 1)];
|
|
total_changed++;
|
|
goto Do_state_table;
|
|
// These can make the output string longer than the input
|
|
case kExitReplaceOffset2:
|
|
// High byte of the two-byte remap_base index
if ((nEntries != 256) && InStateZero(st, Tbl)) {
|
|
// For space-optimized table, we need multiples of 256 bytes
|
|
// in state0 and multiples of nEntries in other states
|
|
offset += ((unsigned char)Tbl[c + (256 * 2)] << 8);
|
|
} else {
|
|
offset += ((unsigned char)Tbl[c + (nEntries * 2)] << 8);
|
|
}
|
|
// Fall into next case
|
|
case kExitSpecial: // Apply special fixups [read: hacks]
|
|
case kExitReplaceOffset1:
|
|
// Low byte of the remap_base index
if ((nEntries != 256) && InStateZero(st, Tbl)) {
|
|
// For space-optimized table, we need multiples of 256 bytes
|
|
// in state0 and multiples of nEntries in other states
|
|
offset += (unsigned char)Tbl[c + (256 * 1)];
|
|
} else {
|
|
offset += (unsigned char)Tbl[c + (nEntries * 1)];
|
|
}
|
|
{
|
|
const RemapEntry* re = &st->remap_base[offset];
|
|
// The flag bits share the length bytes; mask them off to get the lengths
int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;
|
|
int add_len = re->add_bytes & ~kHtmlPlaintextFlag;
|
|
|
|
// Special-case non-HTML replacement of five sensitive entities
|
|
// " & ' < >
|
|
// 0022 0026 0027 003c 003e
|
|
// A replacement creating one of these is expressed as a pair of
|
|
// entries, one for HTML output and one for plaintext output.
|
|
// The first of the pair has the high bit of add_bytes set.
|
|
if (re->add_bytes & kHtmlPlaintextFlag) {
|
|
// Use this entry for plain text
|
|
if (!is_plain_text) {
|
|
// Use very next entry for HTML text (same back/delete length)
|
|
re = &st->remap_base[offset + 1];
|
|
add_len = re->add_bytes & ~kHtmlPlaintextFlag;
|
|
}
|
|
}
|
|
|
|
int string_offset = re->bytes_offset;
|
|
// After the replacement, need (dstlimit - newdst) >= (srclimit - src)
|
|
uint8* newdst = dst - del_len + add_len;
|
|
if ((dstlimit - newdst) < (srclimit - src)) {
|
|
// Won't fit; don't do the replacement. Caller may realloc and retry
|
|
e = kExitDstSpaceFull;
|
|
break; // exit, backing up over this char for later retry
|
|
}
|
|
dst -= del_len;
|
|
memcpy(dst, &st->remap_string[string_offset], add_len);
|
|
dst += add_len;
|
|
total_changed++;
|
|
// Only a net length change is recorded in offsetmap
if (offsetmap != NULL) {
|
|
if (add_len > del_len) {
|
|
offsetmap->Copy(src - copystart);
|
|
offsetmap->Insert(add_len - del_len);
|
|
copystart = src;
|
|
} else if (add_len < del_len) {
|
|
offsetmap->Copy(src - copystart + add_len - del_len);
|
|
offsetmap->Delete(del_len - add_len);
|
|
copystart = src;
|
|
}
|
|
}
|
|
if (re->delete_bytes & kReplaceAndResumeFlag) {
|
|
// There is a non-zero target state at the end of the
|
|
// replacement string
|
|
e = st->remap_string[string_offset + add_len];
|
|
Tbl = &Tbl_0[e << eshift];
|
|
goto Do_state_table_newe;
|
|
}
|
|
}
|
|
// NOTE(review): on this path e is still one of the three case labels
// above, so the kExitRejectAlt test below looks unreachable -- confirm
// before relying on it.
if (e == kExitRejectAlt) {break;}
|
|
if (e != kExitSpecial) {goto Do_state_table;}
|
|
|
|
// case kExitSpecial: // Apply special fixups [read: hacks]
|
|
// In this routine, do either UTF8CharToLower()
|
|
// fullwidth/halfwidth mapping or
|
|
// voiced mapping or
|
|
// semi-voiced mapping
|
|
|
|
// First, do EXIT_REPLACE_OFFSET1 action (above)
|
|
// Second: do additional code fixup
|
|
{
|
|
// srcdel is the number of source bytes the fixup reports as skipped
int srcdel = DoSpecialFixup(c, &src, srclimit, &dst, dstlimit);
|
|
if (offsetmap != NULL) {
|
|
if (srcdel != 0) {
|
|
offsetmap->Copy(src - copystart - srcdel);
|
|
offsetmap->Delete(srcdel);
|
|
copystart = src;
|
|
}
|
|
}
|
|
}
|
|
goto Do_state_table;
|
|
|
|
case kExitIllegalStructure: // structurally illegal byte; quit
|
|
case kExitReject: // NUL or illegal code encountered; quit
|
|
case kExitRejectAlt: // Apply replacement, then exit
|
|
default: // and all other exits
|
|
break;
|
|
} // End switch (e)
|
|
|
|
// Exit possibilities:
|
|
// Some other exit code, state0, back up one byte exactly
|
|
// Some other exit code, !state0, back up over last char
|
|
|
|
// Back up over exactly one byte of rejected/illegal UTF-8 character
|
|
src--;
|
|
dst--;
|
|
// Back up more if needed
|
|
if (!InStateZero(st, Tbl)) {
|
|
do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
|
|
}
|
|
} else if (!InStateZero(st, Tbl)) {
|
|
// src >= srclimit, !state0
|
|
// Back up over truncated UTF-8 character
|
|
e = kExitIllegalStructure;
|
|
do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
|
|
} else {
|
|
// src >= srclimit, state0
|
|
// Normal termination, source fully consumed
|
|
e = kExitOK;
|
|
}
|
|
|
|
// Report any source bytes copied since the last offsetmap record
if (offsetmap != NULL) {
|
|
if (src > copystart) {
|
|
offsetmap->Copy(src - copystart);
|
|
copystart = src;
|
|
}
|
|
}
|
|
|
|
// Possible return values here:
|
|
// kExitDstSpaceFull caller may realloc and retry from middle
|
|
// kExitIllegalStructure caller may overwrite/truncate
|
|
// kExitOK all done and happy
|
|
// kExitReject caller may overwrite/truncate
|
|
// kExitDoAgain LOOP NOT DONE; caller must retry from middle
|
|
// (may do fast ASCII loop first)
|
|
// kExitPlaceholder -unused-
|
|
// kExitNone -unused-
|
|
*bytes_consumed = src - isrc;
|
|
*bytes_filled = dst - odst;
|
|
*chars_changed = total_changed;
|
|
return e;
|
|
}
|
|
|
|
// TwoByte versions are needed for tables > 240 states, such
|
|
// as the table for full Unicode 4.1 canonical + compatibility mapping
|
|
|
|
// Scan a UTF-8 stringpiece based on state table with two-byte entries,
|
|
// copying to output stringpiece
|
|
// and doing text replacements.
|
|
// DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below
|
|
// Needs caller to loop on kExitDoAgain
|
|
static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st,
|
|
const StringPiece& istr,
|
|
StringPiece& ostr,
|
|
bool is_plain_text,
|
|
int* bytes_consumed,
|
|
int* bytes_filled,
|
|
int* chars_changed,
|
|
OffsetMap* offsetmap) {
|
|
int eshift = st->entry_shift;
|
|
int nEntries = (1 << eshift); // 64 or 256 entries per state
|
|
const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());
|
|
const int ilen = istr.length();
|
|
const uint8* copystart = isrc;
|
|
const uint8* src = isrc;
|
|
const uint8* srclimit = src + ilen;
|
|
*bytes_consumed = 0;
|
|
*bytes_filled = 0;
|
|
*chars_changed = 0;
|
|
|
|
const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());
|
|
const int olen = ostr.length();
|
|
uint8* dst = const_cast<uint8*>(odst);
|
|
uint8* dstlimit = dst + olen;
|
|
|
|
*chars_changed = 0;
|
|
|
|
int total_changed = 0;
|
|
|
|
int src_lll = srclimit - src;
|
|
int dst_lll = dstlimit - dst;
|
|
|
|
|
|
// Invariant condition during replacements:
|
|
// remaining dst size >= remaining src size
|
|
if ((dstlimit - dst) < (srclimit - src)) {
|
|
if (offsetmap != NULL) {
|
|
offsetmap->Copy(src - copystart);
|
|
copystart = src;
|
|
}
|
|
return kExitDstSpaceFull_2;
|
|
}
|
|
const unsigned short* Tbl_0 = &st->state_table[st->state0];
|
|
|
|
Do_state_table_2:
|
|
// Do state-table scan, copying as we go
|
|
const unsigned short* Tbl = Tbl_0;
|
|
int e = 0;
|
|
uint8 c = 0;
|
|
|
|
Do_state_table_newe_2:
|
|
|
|
//----------------------------
|
|
while (src < srclimit) {
|
|
c = *src;
|
|
e = Tbl[c];
|
|
*dst = c;
|
|
src++;
|
|
dst++;
|
|
if (e >= kExitIllegalStructure_2) {break;}
|
|
Tbl = &Tbl_0[e << eshift];
|
|
}
|
|
//----------------------------
|
|
src_lll = src - isrc;
|
|
dst_lll = dst - odst;
|
|
|
|
// Exit possibilities:
|
|
// Replacement code, do the replacement and loop
|
|
// Some other exit code, state0, back up one byte exactly
|
|
// Some other exit code, !state0, back up over last char
|
|
// source consumed, state0, exit OK
|
|
// source consumed, !state0, back up over partial char
|
|
// For illegal byte in state0, avoid backup up over PREVIOUS char
|
|
// For truncated last char, back up to beginning of it
|
|
|
|
if (e >= kExitIllegalStructure_2) {
|
|
// Switch on exit code; most loop back to top
|
|
int offset = 0;
|
|
switch (e) {
|
|
// These all make the output string the same size or shorter
|
|
// No checking needed
|
|
case kExitReplace31_2: // del 2, add 1 bytes to change
|
|
dst -= 2;
|
|
if (offsetmap != NULL) {
|
|
offsetmap->Copy(src - copystart - 2);
|
|
offsetmap->Delete(2);
|
|
copystart = src;
|
|
}
|
|
dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
|
|
total_changed++;
|
|
goto Do_state_table_2;
|
|
case kExitReplace32_2: // del 3, add 2 bytes to change
|
|
dst--;
|
|
if (offsetmap != NULL) {
|
|
offsetmap->Copy(src - copystart - 1);
|
|
offsetmap->Delete(1);
|
|
copystart = src;
|
|
}
|
|
dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);
|
|
dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
|
|
total_changed++;
|
|
goto Do_state_table_2;
|
|
case kExitReplace21_2: // del 2, add 1 bytes to change
|
|
dst--;
|
|
if (offsetmap != NULL) {
|
|
offsetmap->Copy(src - copystart - 1);
|
|
offsetmap->Delete(1);
|
|
copystart = src;
|
|
}
|
|
dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
|
|
total_changed++;
|
|
goto Do_state_table_2;
|
|
case kExitReplace3_2: // update 3 bytes to change
|
|
dst[-3] = (unsigned char)(Tbl[c + (nEntries * 2)] & 0xff);
|
|
// Fall into next case
|
|
case kExitReplace2_2: // update 2 bytes to change
|
|
dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);
|
|
// Fall into next case
|
|
case kExitReplace1_2: // update 1 byte to change
|
|
dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
|
|
total_changed++;
|
|
goto Do_state_table_2;
|
|
case kExitReplace1S0_2: // update 1 byte to change, 256-entry state
|
|
dst[-1] = (unsigned char)(Tbl[c + (256 * 1)] & 0xff);
|
|
total_changed++;
|
|
goto Do_state_table_2;
|
|
// These can make the output string longer than the input
|
|
case kExitReplaceOffset2_2:
|
|
if ((nEntries != 256) && InStateZero_2(st, Tbl)) {
|
|
// For space-optimized table, we need multiples of 256 bytes
|
|
// in state0 and multiples of nEntries in other states
|
|
offset += ((unsigned char)(Tbl[c + (256 * 1)] >> 8 & 0xff) << 8);
|
|
} else {
|
|
offset += ((unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff) << 8);
|
|
}
|
|
// Fall into next case
|
|
case kExitReplaceOffset1_2:
|
|
if ((nEntries != 256) && InStateZero_2(st, Tbl)) {
|
|
// For space-optimized table, we need multiples of 256 bytes
|
|
// in state0 and multiples of nEntries in other states
|
|
offset += (unsigned char)(Tbl[c + (256 * 1)] & 0xff);
|
|
} else {
|
|
offset += (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
|
|
}
|
|
{
|
|
const RemapEntry* re = &st->remap_base[offset];
|
|
int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;
|
|
int add_len = re->add_bytes & ~kHtmlPlaintextFlag;
|
|
// Special-case non-HTML replacement of five sensitive entities
|
|
// " & ' < >
|
|
// 0022 0026 0027 003c 003e
|
|
// A replacement creating one of these is expressed as a pair of
|
|
// entries, one for HTML output and one for plaintext output.
|
|
// The first of the pair has the high bit of add_bytes set.
|
|
if (re->add_bytes & kHtmlPlaintextFlag) {
|
|
// Use this entry for plain text
|
|
if (!is_plain_text) {
|
|
// Use very next entry for HTML text (same back/delete length)
|
|
re = &st->remap_base[offset + 1];
|
|
add_len = re->add_bytes & ~kHtmlPlaintextFlag;
|
|
}
|
|
}
|
|
|
|
// After the replacement, need (dstlimit - dst) >= (srclimit - src)
|
|
int string_offset = re->bytes_offset;
|
|
// After the replacement, need (dstlimit - newdst) >= (srclimit - src)
|
|
uint8* newdst = dst - del_len + add_len;
|
|
if ((dstlimit - newdst) < (srclimit - src)) {
|
|
// Won't fit; don't do the replacement. Caller may realloc and retry
|
|
e = kExitDstSpaceFull_2;
|
|
break; // exit, backing up over this char for later retry
|
|
}
|
|
dst -= del_len;
|
|
memcpy(dst, &st->remap_string[string_offset], add_len);
|
|
dst += add_len;
|
|
if (offsetmap != NULL) {
|
|
if (add_len > del_len) {
|
|
offsetmap->Copy(src - copystart);
|
|
offsetmap->Insert(add_len - del_len);
|
|
copystart = src;
|
|
} else if (add_len < del_len) {
|
|
offsetmap->Copy(src - copystart + add_len - del_len);
|
|
offsetmap->Delete(del_len - add_len);
|
|
copystart = src;
|
|
}
|
|
}
|
|
if (re->delete_bytes & kReplaceAndResumeFlag) {
|
|
// There is a two-byte non-zero target state at the end of the
|
|
// replacement string
|
|
uint8 c1 = st->remap_string[string_offset + add_len];
|
|
uint8 c2 = st->remap_string[string_offset + add_len + 1];
|
|
e = (c1 << 8) | c2;
|
|
Tbl = &Tbl_0[e << eshift];
|
|
total_changed++;
|
|
goto Do_state_table_newe_2;
|
|
}
|
|
}
|
|
total_changed++;
|
|
if (e == kExitRejectAlt_2) {break;}
|
|
goto Do_state_table_2;
|
|
|
|
case kExitSpecial_2: // NO special fixups [read: hacks]
|
|
case kExitIllegalStructure_2: // structurally illegal byte; quit
|
|
case kExitReject_2: // NUL or illegal code encountered; quit
|
|
// and all other exits
|
|
default:
|
|
break;
|
|
} // End switch (e)
|
|
|
|
// Exit possibilities:
|
|
// Some other exit code, state0, back up one byte exactly
|
|
// Some other exit code, !state0, back up over last char
|
|
|
|
// Back up over exactly one byte of rejected/illegal UTF-8 character
|
|
src--;
|
|
dst--;
|
|
// Back up more if needed
|
|
if (!InStateZero_2(st, Tbl)) {
|
|
do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
|
|
}
|
|
} else if (!InStateZero_2(st, Tbl)) {
|
|
// src >= srclimit, !state0
|
|
// Back up over truncated UTF-8 character
|
|
e = kExitIllegalStructure_2;
|
|
|
|
do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
|
|
} else {
|
|
// src >= srclimit, state0
|
|
// Normal termination, source fully consumed
|
|
e = kExitOK_2;
|
|
}
|
|
|
|
if (offsetmap != NULL) {
|
|
if (src > copystart) {
|
|
offsetmap->Copy(src - copystart);
|
|
copystart = src;
|
|
}
|
|
}
|
|
|
|
|
|
// Possible return values here:
|
|
// kExitDstSpaceFull_2 caller may realloc and retry from middle
|
|
// kExitIllegalStructure_2 caller my overwrite/truncate
|
|
// kExitOK_2 all done and happy
|
|
// kExitReject_2 caller may overwrite/truncate
|
|
// kExitDoAgain_2 LOOP NOT DONE; caller must retry from middle
|
|
// (may do fast ASCII loop first)
|
|
// kExitPlaceholder_2 -unused-
|
|
// kExitNone_2 -unused-
|
|
*bytes_consumed = src - isrc;
|
|
*bytes_filled = dst - odst;
|
|
*chars_changed = total_changed;
|
|
return e;
|
|
}
|
|
|
|
|
|
// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
|
|
// and doing text replacements.
|
|
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
|
|
// Always scan complete UTF-8 characters
|
|
// Set number of bytes consumed from input, number filled to output.
|
|
// Return reason for exiting
|
|
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
|
const StringPiece& istr,
|
|
StringPiece& ostr,
|
|
bool is_plain_text,
|
|
int* bytes_consumed,
|
|
int* bytes_filled,
|
|
int* chars_changed,
|
|
OffsetMap* offsetmap) {
|
|
StringPiece local_istr(istr.data(), istr.length());
|
|
StringPiece local_ostr(ostr.data(), ostr.length());
|
|
int total_consumed = 0;
|
|
int total_filled = 0;
|
|
int total_changed = 0;
|
|
int local_bytes_consumed, local_bytes_filled, local_chars_changed;
|
|
int e;
|
|
do {
|
|
e = UTF8GenericReplaceInternal(st,
|
|
local_istr, local_ostr, is_plain_text,
|
|
&local_bytes_consumed, &local_bytes_filled,
|
|
&local_chars_changed,
|
|
offsetmap);
|
|
local_istr.remove_prefix(local_bytes_consumed);
|
|
local_ostr.remove_prefix(local_bytes_filled);
|
|
total_consumed += local_bytes_consumed;
|
|
total_filled += local_bytes_filled;
|
|
total_changed += local_chars_changed;
|
|
} while ( e == kExitDoAgain );
|
|
*bytes_consumed = total_consumed;
|
|
*bytes_filled = total_filled;
|
|
*chars_changed = total_changed;
|
|
return e;
|
|
}
|
|
|
|
// Older version without offsetmap
|
|
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
|
const StringPiece& istr,
|
|
StringPiece& ostr,
|
|
bool is_plain_text,
|
|
int* bytes_consumed,
|
|
int* bytes_filled,
|
|
int* chars_changed) {
|
|
return UTF8GenericReplace(st,
|
|
istr,
|
|
ostr,
|
|
is_plain_text,
|
|
bytes_consumed,
|
|
bytes_filled,
|
|
chars_changed,
|
|
NULL);
|
|
}
|
|
|
|
// Older version without is_plain_text or offsetmap
|
|
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
|
const StringPiece& istr,
|
|
StringPiece& ostr,
|
|
int* bytes_consumed,
|
|
int* bytes_filled,
|
|
int* chars_changed) {
|
|
bool is_plain_text = false;
|
|
return UTF8GenericReplace(st,
|
|
istr,
|
|
ostr,
|
|
is_plain_text,
|
|
bytes_consumed,
|
|
bytes_filled,
|
|
chars_changed,
|
|
NULL);
|
|
}
|
|
|
|
// Scan a UTF-8 stringpiece based on state table with two-byte entries,
|
|
// copying to output stringpiece
|
|
// and doing text replacements.
|
|
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
|
|
// Always scan complete UTF-8 characters
|
|
// Set number of bytes consumed from input, number filled to output.
|
|
// Return reason for exiting
|
|
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
|
const StringPiece& istr,
|
|
StringPiece& ostr,
|
|
bool is_plain_text,
|
|
int* bytes_consumed,
|
|
int* bytes_filled,
|
|
int* chars_changed,
|
|
OffsetMap* offsetmap) {
|
|
StringPiece local_istr(istr.data(), istr.length());
|
|
StringPiece local_ostr(ostr.data(), ostr.length());
|
|
int total_consumed = 0;
|
|
int total_filled = 0;
|
|
int total_changed = 0;
|
|
int local_bytes_consumed, local_bytes_filled, local_chars_changed;
|
|
int e;
|
|
do {
|
|
e = UTF8GenericReplaceInternalTwoByte(st,
|
|
local_istr, local_ostr, is_plain_text,
|
|
&local_bytes_consumed,
|
|
&local_bytes_filled,
|
|
&local_chars_changed,
|
|
offsetmap);
|
|
local_istr.remove_prefix(local_bytes_consumed);
|
|
local_ostr.remove_prefix(local_bytes_filled);
|
|
total_consumed += local_bytes_consumed;
|
|
total_filled += local_bytes_filled;
|
|
total_changed += local_chars_changed;
|
|
} while ( e == kExitDoAgain_2 );
|
|
*bytes_consumed = total_consumed;
|
|
*bytes_filled = total_filled;
|
|
*chars_changed = total_changed;
|
|
|
|
return e - kExitOK_2 + kExitOK;
|
|
}
|
|
|
|
// Older version without offsetmap
|
|
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
|
const StringPiece& istr,
|
|
StringPiece& ostr,
|
|
bool is_plain_text,
|
|
int* bytes_consumed,
|
|
int* bytes_filled,
|
|
int* chars_changed) {
|
|
return UTF8GenericReplaceTwoByte(st,
|
|
istr,
|
|
ostr,
|
|
is_plain_text,
|
|
bytes_consumed,
|
|
bytes_filled,
|
|
chars_changed,
|
|
NULL);
|
|
}
|
|
|
|
// Older version without is_plain_text or offsetmap
|
|
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
|
const StringPiece& istr,
|
|
StringPiece& ostr,
|
|
int* bytes_consumed,
|
|
int* bytes_filled,
|
|
int* chars_changed) {
|
|
bool is_plain_text = false;
|
|
return UTF8GenericReplaceTwoByte(st,
|
|
istr,
|
|
ostr,
|
|
is_plain_text,
|
|
bytes_consumed,
|
|
bytes_filled,
|
|
chars_changed,
|
|
NULL);
|
|
}
|
|
|
|
|
|
|
|
// Adjust a stringpiece to encompass complete UTF-8 characters.
|
|
// The data pointer will be increased by 0..3 bytes to get to a character
|
|
// boundary, and the length will then be decreased by 0..3 bytes
|
|
// to encompass the last complete character.
|
|
void UTF8TrimToChars(StringPiece* istr) {
|
|
const char* src = istr->data();
|
|
int len = istr->length();
|
|
// Exit if empty string
|
|
if (len == 0) {
|
|
return;
|
|
}
|
|
|
|
// Exit on simple, common case
|
|
if ( ((src[0] & 0xc0) != 0x80) &&
|
|
(static_cast<signed char>(src[len - 1]) >= 0) ) {
|
|
// First byte is not a continuation and last byte is 7-bit ASCII -- done
|
|
return;
|
|
}
|
|
|
|
// Adjust the back end, len > 0
|
|
const char* srclimit = src + len;
|
|
// Backscan over any ending continuation bytes to find last char start
|
|
const char* s = srclimit - 1; // Last byte of the string
|
|
while ((src <= s) && ((*s & 0xc0) == 0x80)) {
|
|
s--;
|
|
}
|
|
// Include entire last char if it fits
|
|
if (src <= s) {
|
|
int last_char_len = UTF8OneCharLen(s);
|
|
if (s + last_char_len <= srclimit) {
|
|
// Last char fits, so include it, else exclude it
|
|
s += last_char_len;
|
|
}
|
|
}
|
|
if (s != srclimit) {
|
|
// s is one byte beyond the last full character, if any
|
|
istr->remove_suffix(srclimit - s);
|
|
// Exit if now empty string
|
|
if (istr->length() == 0) {
|
|
return;
|
|
}
|
|
}
|
|
|
|
// Adjust the front end, len > 0
|
|
len = istr->length();
|
|
srclimit = src + len;
|
|
s = src; // First byte of the string
|
|
// Scan over any beginning continuation bytes to find first char start
|
|
while ((s < srclimit) && ((*s & 0xc0) == 0x80)) {
|
|
s++;
|
|
}
|
|
if (s != src) {
|
|
// s is at the first full character, if any
|
|
istr->remove_prefix(s - src);
|
|
}
|
|
}
|
|
|
|
} // End namespace CLD2
|