mcs/class/referencesource/System.Web/Security/AntiXss/Utf16StringReader.cs

//------------------------------------------------------------------------------
// <copyright file="Utf16StringReader.cs" company="Microsoft">
//     Copyright (c) Microsoft Corporation.  All rights reserved.
// </copyright>
//------------------------------------------------------------------------------

namespace System.Web.Security.AntiXss {
    using System;
    using System.Collections.Generic;
    using System.Diagnostics;

    /// <summary>
    /// Forward-only cursor over a UTF-16 string that yields one Unicode scalar value per read.
    /// </summary>
    /// <remarks>
    /// This type is a struct whose read position changes on every call to
    /// <see cref="ReadNextScalarValue"/>. Copying an instance copies the position too, so a
    /// copy and its original advance independently; be careful when capturing or passing it by value.
    /// </remarks>
    internal struct Utf16StringReader {

        /// <summary>
        /// First code unit of the UTF-16 leading (high) surrogate range.
        /// </summary>
        private const char LeadingSurrogateStart = '\uD800';

        /// <summary>
        /// First code unit of the UTF-16 trailing (low) surrogate range.
        /// </summary>
        private const char TrailingSurrogateStart = '\uDC00';

        /// <summary>
        /// Code point of the Unicode replacement character, U+FFFD.
        /// </summary>
        /// <remarks>
        /// For more info, see http://www.unicode.org/charts/PDF/UFFF0.pdf.
        /// </remarks>
        private const int UnicodeReplacementCharacterCodePoint = '\uFFFD';

        /// <summary>
        /// Index into '_input' of the next code unit that has not been consumed yet.
        /// </summary>
        private int _currentOffset;

        /// <summary>
        /// The string being decoded.
        /// </summary>
        private readonly string _input;

        /// <summary>
        /// Creates a reader positioned at the first code unit of the given string.
        /// </summary>
        /// <param name="input">The UTF-16 string to split into scalar values. Expected to be
        /// non-null; this is only verified by a debug assertion.</param>
        public Utf16StringReader(string input) {
            Debug.Assert(input != null);

            _currentOffset = 0;
            _input = input;
        }

        /// <summary>
        /// Combines a surrogate pair into the supplementary-plane scalar value it encodes.
        /// Comparable to Char.ConvertToUtf32, except that the arguments are checked only by
        /// debug assertions, which keeps it cheap inside tight loops.
        /// </summary>
        /// <param name="leadingSurrogate">The high (leading) surrogate code unit.</param>
        /// <param name="trailingSurrogate">The low (trailing) surrogate code unit.</param>
        /// <returns>The scalar value encoded by the pair.</returns>
        /// <remarks>Callers are responsible for passing a genuine high/low surrogate pair;
        /// for any other input the result is undefined.</remarks>
        private static int ConvertToUtf32(char leadingSurrogate, char trailingSurrogate) {
            Debug.Assert(Char.IsHighSurrogate(leadingSurrogate), "'leadingSurrogate' was not a high surrogate.");
            Debug.Assert(Char.IsLowSurrogate(trailingSurrogate), "'trailingSurrogate' was not a low surrogate.");

            // Each surrogate contributes 10 bits of payload; the leading one supplies the
            // upper 10 bits, and the combined value is offset past the BMP (0x10000).
            int upperBits = leadingSurrogate - LeadingSurrogateStart;
            int lowerBits = trailingSurrogate - TrailingSurrogateStart;
            return (int)(upperBits * 0x400 + lowerBits) + 0x10000;
        }

        /// <summary>
        /// Tests whether a code point is a Unicode scalar value, i.e. lies within
        /// U+0000 .. U+10FFFF but outside the surrogate range U+D800 .. U+DFFF.
        /// </summary>
        /// <param name="codePoint">The code point to test.</param>
        /// <returns>True when the code point is a scalar value; false otherwise.</returns>
        private static bool IsValidUnicodeScalarValue(int codePoint) {
            // Scalar values are U+0000 .. U+D7FF and U+E000 .. U+10FFFF.
            // See: http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf, D76
            if (codePoint < 0 || codePoint > 0x10FFFF) {
                return false; // outside the Unicode code space entirely
            }

            // Inside the code space, only the surrogate block is excluded.
            return codePoint < 0xD800 || codePoint > 0xDFFF;
        }

        /// <summary>
        /// Consumes and returns the next scalar value from the input string.
        /// </summary>
        /// <returns>The decoded scalar value; U+FFFD (the Unicode replacement character) when
        /// the code unit at the current position is a surrogate that is not part of a
        /// well-formed pair; or -1 once the whole string has been consumed.</returns>
        public int ReadNextScalarValue() {
            if (_currentOffset >= _input.Length) {
                return -1; // EOF
            }

            char firstUnit = _input[_currentOffset];
            _currentOffset++;
            int scalar = firstUnit;

            // Only a high surrogate that still has a code unit after it can begin a pair.
            if (Char.IsHighSurrogate(firstUnit) && _currentOffset < _input.Length) {
                char secondUnit = _input[_currentOffset];
                if (Char.IsLowSurrogate(secondUnit)) {
                    // Well-formed pair: both code units are consumed by this read, so
                    // step past the trailing surrogate as well.
                    _currentOffset++;
                    scalar = ConvertToUtf32(firstUnit, secondUnit);
                }
            }

            if (!IsValidUnicodeScalarValue(scalar)) {
                // At this point 'scalar' is a lone surrogate (an unpaired high or an
                // unpaired low), which is not a scalar value. Per the Unicode
                // specification (Ch. 3, C10), substitute U+FFFD.
                return UnicodeReplacementCharacterCodePoint;
            }

            return scalar;
        }

    }
}