//------------------------------------------------------------------------------
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
//------------------------------------------------------------------------------

namespace System.Web.Security.AntiXss {
    using System;
    using System.Collections.Generic;
    using System.Diagnostics;

    /// <summary>
    /// Reads individual scalar values from a UTF-16 input string.
    /// </summary>
    /// <remarks>
    /// For performance reasons, this is a mutable struct. Use caution when capturing instances of this type:
    /// copying the struct copies the current offset, so advancing a copy does not advance the original.
    /// An instance that was never initialized through the constructor (for example
    /// <c>default(Utf16StringReader)</c>) has no input string and reports end-of-input immediately.
    /// </remarks>
    internal struct Utf16StringReader {
        /// <summary>
        /// Starting code point for the UTF-16 leading surrogates.
        /// </summary>
        private const char LeadingSurrogateStart = '\uD800';

        /// <summary>
        /// Starting code point for the UTF-16 trailing surrogates.
        /// </summary>
        private const char TrailingSurrogateStart = '\uDC00';

        /// <summary>
        /// The Unicode replacement character U+FFFD.
        /// </summary>
        /// <remarks>
        /// For more info, see http://www.unicode.org/charts/PDF/UFFF0.pdf.
        /// </remarks>
        private const int UnicodeReplacementCharacterCodePoint = '\uFFFD';

        /// <summary>
        /// The current offset into '_input'.
        /// </summary>
        private int _currentOffset;

        /// <summary>
        /// The input string we're iterating on. Null only when the struct was not
        /// created through the constructor (or a null was passed in a build where
        /// the constructor's Debug.Assert is not active).
        /// </summary>
        private readonly string _input;

        /// <summary>
        /// Initializes the reader with the given UTF-16 input string.
        /// </summary>
        /// <param name="input">The input string to decompose into scalar values.
        /// Expected to be non-null; this is checked with Debug.Assert only.</param>
        public Utf16StringReader(string input) {
            Debug.Assert(input != null);

            _input = input;
            _currentOffset = 0;
        }

        /// <summary>
        /// Similar to Char.ConvertToUtf32, but slightly faster in tight loops since parameter checks are not done.
        /// </summary>
        /// <param name="leadingSurrogate">The UTF-16 leading surrogate character.</param>
        /// <param name="trailingSurrogate">The UTF-16 trailing surrogate character.</param>
        /// <returns>The scalar value resulting from combining these two surrogate characters.</returns>
        /// <remarks>
        /// The caller must ensure that the inputs are valid surrogate characters. If not,
        /// the output of this routine is undefined.
        /// </remarks>
        private static int ConvertToUtf32(char leadingSurrogate, char trailingSurrogate) {
            Debug.Assert(Char.IsHighSurrogate(leadingSurrogate), "'leadingSurrogate' was not a high surrogate.");
            Debug.Assert(Char.IsLowSurrogate(trailingSurrogate), "'trailingSurrogate' was not a low surrogate.");

            // Each surrogate contributes 10 bits (0x400 values); the combined 20-bit
            // value is offset by 0x10000, the first supplementary-plane code point.
            return (int)((leadingSurrogate - LeadingSurrogateStart) * 0x400 + (trailingSurrogate - TrailingSurrogateStart)) + 0x10000;
        }

        /// <summary>
        /// Determines whether a given code point is a valid Unicode scalar value.
        /// </summary>
        /// <param name="codePoint">The code point whose validity is to be checked.</param>
        /// <returns>True if the input is a valid Unicode scalar value, false otherwise.</returns>
        private static bool IsValidUnicodeScalarValue(int codePoint) {
            // Valid scalar values are U+0000 .. U+D7FF and U+E000 .. U+10FFFF.
            // See: http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf, D76
            return (0 <= codePoint && codePoint <= 0xD7FF)
                || (0xE000 <= codePoint && codePoint <= 0x10FFFF);
        }

        /// <summary>
        /// Reads the next scalar value from the input string.
        /// </summary>
        /// <returns>
        /// The next scalar value. If the input string contains invalid UTF-16, the
        /// return value is the Unicode replacement character U+FFFD. If the end of the string
        /// is reached (or the reader has no input string), returns -1.
        /// </returns>
        public int ReadNextScalarValue() {
            // A struct can exist without its constructor having run, in which case
            // '_input' is null. Report EOF rather than throwing NullReferenceException.
            string input = _input;
            if (input == null || _currentOffset >= input.Length) {
                return -1; // EOF
            }

            char thisCodeUnit = input[_currentOffset++];
            int thisCodePoint = thisCodeUnit;

            if (Char.IsHighSurrogate(thisCodeUnit)) {
                if (_currentOffset < input.Length) {
                    char nextCodeUnit = input[_currentOffset];
                    if (Char.IsLowSurrogate(nextCodeUnit)) {
                        // We encountered a high (leading) surrogate followed by a low
                        // (trailing) surrogate. Bump '_currentOffset' up by one more
                        // since we're consuming both code units.
                        _currentOffset++;
                        thisCodePoint = ConvertToUtf32(thisCodeUnit, nextCodeUnit);
                    }
                }
            }

            if (IsValidUnicodeScalarValue(thisCodePoint)) {
                return thisCodePoint;
            }
            else {
                // ERROR: This code point was either an unmatched high (leading)
                // surrogate or an unmatched low (trailing) surrogate, neither of
                // which maps to a valid Unicode scalar value. Per the Unicode
                // specification (Ch. 3, C10), we replace with U+FFFD.
                return UnicodeReplacementCharacterCodePoint;
            }
        }
    }
}