//------------------------------------------------------------------------------
//
//     Copyright (c) Microsoft Corporation. All rights reserved.
//
//------------------------------------------------------------------------------

namespace System.Web.Security.AntiXss
{
    using System;
    using System.Collections.Generic;
    using System.Diagnostics;

    /// <summary>
    /// Walks a UTF-16 string and yields one Unicode scalar value per call.
    /// </summary>
    /// <remarks>
    /// For performance reasons, this is a mutable struct: every read advances
    /// the internal offset. Use caution when capturing or copying instances of
    /// this type, since a copy carries its own independent offset.
    /// </remarks>
    internal struct Utf16StringReader
    {
        /// <summary>
        /// First code unit of the UTF-16 leading (high) surrogate range.
        /// </summary>
        private const char LeadingSurrogateStart = '\uD800';

        /// <summary>
        /// First code unit of the UTF-16 trailing (low) surrogate range.
        /// </summary>
        private const char TrailingSurrogateStart = '\uDC00';

        /// <summary>
        /// The Unicode replacement character U+FFFD, returned in place of
        /// ill-formed input.
        /// </summary>
        /// <remarks>
        /// For more info, see http://www.unicode.org/charts/PDF/UFFF0.pdf.
        /// </remarks>
        private const int UnicodeReplacementCharacterCodePoint = '\uFFFD';

        /// <summary>
        /// Index into <c>_input</c> of the next code unit to be consumed.
        /// </summary>
        private int _currentOffset;

        /// <summary>
        /// The string being decomposed into scalar values.
        /// </summary>
        private readonly string _input;

        /// <summary>
        /// Creates a reader positioned at the start of the given UTF-16 string.
        /// </summary>
        /// <param name="input">The string to decompose into scalar values. Must not be null
        /// (checked by a debug-only assertion).</param>
        public Utf16StringReader(string input)
        {
            Debug.Assert(input != null);

            _currentOffset = 0;
            _input = input;
        }

        /// <summary>
        /// Combines a surrogate pair into a single scalar value. Similar to
        /// Char.ConvertToUtf32, but without parameter validation so that it is
        /// slightly faster in tight loops.
        /// </summary>
        /// <param name="leadingSurrogate">The UTF-16 leading (high) surrogate.</param>
        /// <param name="trailingSurrogate">The UTF-16 trailing (low) surrogate.</param>
        /// <returns>The scalar value encoded by the two surrogates.</returns>
        /// <remarks>
        /// The caller must ensure that both arguments are valid surrogates of the
        /// right kind; this is only asserted in debug builds. Otherwise the
        /// result is undefined.
        /// </remarks>
        private static int ConvertToUtf32(char leadingSurrogate, char trailingSurrogate)
        {
            Debug.Assert(Char.IsHighSurrogate(leadingSurrogate), "'leadingSurrogate' was not a high surrogate.");
            Debug.Assert(Char.IsLowSurrogate(trailingSurrogate), "'trailingSurrogate' was not a low surrogate.");

            // Each surrogate contributes its offset from the start of its range:
            // the leading one is scaled by 0x400, the trailing one is added as-is,
            // and the sum is rebased onto 0x10000.
            int leadOffset = leadingSurrogate - LeadingSurrogateStart;
            int trailOffset = trailingSurrogate - TrailingSurrogateStart;

            return (leadOffset * 0x400) + trailOffset + 0x10000;
        }

        /// <summary>
        /// Tests whether a code point is a valid Unicode scalar value.
        /// </summary>
        /// <param name="codePoint">The code point to check.</param>
        /// <returns>True if the code point lies in U+0000..U+D7FF or
        /// U+E000..U+10FFFF; false otherwise.</returns>
        private static bool IsValidUnicodeScalarValue(int codePoint)
        {
            // See: http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf, D76
            // Anything outside U+0000 .. U+10FFFF is rejected outright.
            if (codePoint < 0 || codePoint > 0x10FFFF)
            {
                return false;
            }

            // Within that range, only the surrogate gap U+D800 .. U+DFFF is excluded.
            return codePoint <= 0xD7FF || codePoint >= 0xE000;
        }

        /// <summary>
        /// Consumes and returns the next scalar value from the input string.
        /// </summary>
        /// <returns>
        /// The next scalar value; the Unicode replacement character U+FFFD if the
        /// input is ill-formed at the current position (an unpaired surrogate);
        /// or -1 once the end of the string has been reached.
        /// </returns>
        public int ReadNextScalarValue()
        {
            if (_currentOffset >= _input.Length)
            {
                return -1; // EOF
            }

            char firstUnit = _input[_currentOffset];
            _currentOffset++;

            int scalarValue = firstUnit;

            // A surrogate pair can only start with a high (leading) surrogate and
            // needs at least one more code unit after it.
            bool pairIsPossible = Char.IsHighSurrogate(firstUnit) && _currentOffset < _input.Length;
            if (pairIsPossible)
            {
                char secondUnit = _input[_currentOffset];
                if (Char.IsLowSurrogate(secondUnit))
                {
                    // Well-formed pair: consume the trailing surrogate as well.
                    // If the next unit is not a low surrogate it is left in place
                    // and gets processed by the following call.
                    _currentOffset++;
                    scalarValue = ConvertToUtf32(firstUnit, secondUnit);
                }
            }

            // An unmatched high or low surrogate does not map to a valid scalar
            // value. Per the Unicode specification (Ch. 3, C10), it is replaced
            // with U+FFFD.
            return IsValidUnicodeScalarValue(scalarValue)
                ? scalarValue
                : UnicodeReplacementCharacterCodePoint;
        }
    }
}