Imported Upstream version 4.6.0.125

Former-commit-id: a2155e9bd80020e49e72e86c44da02a8ac0e57a4
2016-08-03 10:59:49 +00:00
parent a569aebcfd
commit e79aa3c0ed
17047 changed files with 3137615 additions and 392334 deletions
--- a/mcs/class/referencesource/System.Web/Security/AntiXss/Utf16StringReader.cs
+++ b/mcs/class/referencesource/System.Web/Security/AntiXss/Utf16StringReader.cs
@@ -0,0 +1,126 @@
+//------------------------------------------------------------------------------
+// <copyright file="Utf16StringReader.cs" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//------------------------------------------------------------------------------
+
+namespace System.Web.Security.AntiXss {
+    using System;
+    using System.Collections.Generic;
+    using System.Diagnostics;
+
+    /// <summary>
+    /// Reads individual scalar values from a UTF-16 input string.
+    /// </summary>
+    /// <remarks>
+    /// For performance reasons, this is a mutable struct. Use caution when capturing instances of this type.
+    /// </remarks>
+    internal struct Utf16StringReader {
+
+        /// <summary>
+        /// Starting code point for the UTF-16 leading surrogates.
+        /// </summary>
+        private const char LeadingSurrogateStart = '\uD800';
+
+        /// <summary>
+        /// Starting code point for the UTF-16 trailing surrogates.
+        /// </summary>
+        private const char TrailingSurrogateStart = '\uDC00';
+
+        /// <summary>
+        /// The Unicode replacement character U+FFFD.
+        /// </summary>
+        /// <remarks>
+        /// For more info, see http://www.unicode.org/charts/PDF/UFFF0.pdf.
+        /// </remarks>
+        private const int UnicodeReplacementCharacterCodePoint = '\uFFFD';
+
+        /// <summary>
+        /// The current offset into '_input'.
+        /// </summary>
+        private int _currentOffset;
+
+        /// <summary>
+        /// The input string we're iterating on.
+        /// </summary>
+        private readonly string _input;
+
+        /// <summary>
+        /// Initializes the reader with the given UTF-16 input string.
+        /// </summary>
+        /// <param name="input">The input string to decompose into scalar values.</param>
+        public Utf16StringReader(string input) {
+            Debug.Assert(input != null);
+
+            _input = input;
+            _currentOffset = 0;
+        }
+
+        /// <summary>
+        /// Similar to Char.ConvertToUtf32, but slightly faster in tight loops since parameter checks are not done.
+        /// </summary>
+        /// <param name="leadingSurrogate">The UTF-16 leading surrogate character.</param>
+        /// <param name="trailingSurrogate">The UTF-16 trailing surrogate character.</param>
+        /// <returns>The scalar value resulting from combining these two surrogate characters.</returns>
+        /// <remarks>The caller must ensure that the inputs are valid surrogate characters. If not,
+        /// the output of this routine is undefined.</remarks>
+        private static int ConvertToUtf32(char leadingSurrogate, char trailingSurrogate) {
+            Debug.Assert(Char.IsHighSurrogate(leadingSurrogate), "'leadingSurrogate' was not a high surrogate.");
+            Debug.Assert(Char.IsLowSurrogate(trailingSurrogate), "'trailingSurrogate' was not a low surrogate.");
+
+            return (int)((leadingSurrogate - LeadingSurrogateStart) * 0x400 + (trailingSurrogate - TrailingSurrogateStart)) + 0x10000;
+        }
+
+        /// <summary>
+        /// Determines whether a given code point is a valid Unicode scalar value.
+        /// </summary>
+        /// <param name="codePoint">The code point whose validity is to be checked.</param>
+        /// <returns>True if the input is a valid Unicode scalar value, false otherwise.</returns>
+        private static bool IsValidUnicodeScalarValue(int codePoint) {
+            // Valid scalar values are U+0000 .. U+D7FF and U+E000 .. U+10FFFF.
+            // See: http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf, D76
+            return (0 <= codePoint && codePoint <= 0xD7FF)
+                || (0xE000 <= codePoint && codePoint <= 0x10FFFF);
+        }
+
+        /// <summary>
+        /// Reads the next scalar value from the input string.
+        /// </summary>
+        /// <returns>The next scalar value. If the input string contains invalid UTF-16, the
+        /// return value is the Unicode replacement character U+FFFD. If the end of the string
+        /// is reached, returns -1.</returns>
+        public int ReadNextScalarValue() {
+            if (_currentOffset >= _input.Length) {
+                return -1; // EOF
+            }
+
+            char thisCodeUnit = _input[_currentOffset++];
+            int thisCodePoint = thisCodeUnit;
+
+            if (Char.IsHighSurrogate(thisCodeUnit)) {
+                if (_currentOffset < _input.Length) {
+                    char nextCodeUnit = _input[_currentOffset];
+                    if (Char.IsLowSurrogate(nextCodeUnit)) {
+                        // We encountered a high (leading) surrogate followed by a low
+                        // (trailing) surrogate. Bump '_currentOffset' up by one more
+                        // since we're consuming both code units.
+                        _currentOffset++;
+                        thisCodePoint = ConvertToUtf32(thisCodeUnit, nextCodeUnit);
+                    }
+                }
+            }
+
+            if (IsValidUnicodeScalarValue(thisCodePoint)) {
+                return thisCodePoint;
+            }
+            else {
+                // ERROR: This code point was either an unmatched high (leading)
+                // surrogate or an unmatched low (trailing) surrogate, neither of
+                // which maps to a valid Unicode scalar value. Per the Unicode
+                // specification (Ch. 3, C10), we replace with U+FFFD.
+                return UnicodeReplacementCharacterCodePoint;
+            }
+        }
+
+    }
+}