You've already forked linux-packaging-mono
							
							
		
			
				
	
	
		
			127 lines
		
	
	
		
			5.4 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
			
		
		
	
	
			127 lines
		
	
	
		
			5.4 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
| //------------------------------------------------------------------------------
 | |
| // <copyright file="Utf16StringReader.cs" company="Microsoft">
 | |
| //     Copyright (c) Microsoft Corporation.  All rights reserved.
 | |
| // </copyright>
 | |
| //------------------------------------------------------------------------------
 | |
| 
 | |
namespace System.Web.Security.AntiXss {
    using System;
    using System.Collections.Generic;
    using System.Diagnostics;

    /// <summary>
    /// Forward-only cursor that decodes a UTF-16 string one Unicode scalar value at a time.
    /// </summary>
    /// <remarks>
    /// This type is intentionally a mutable struct for performance: every successful read
    /// advances an internal offset. Copying an instance copies that offset, so take care
    /// when capturing or passing instances by value.
    /// </remarks>
    internal struct Utf16StringReader {

        /// <summary>
        /// First code unit of the UTF-16 leading (high) surrogate range.
        /// </summary>
        private const char LeadingSurrogateStart = '\uD800';

        /// <summary>
        /// First code unit of the UTF-16 trailing (low) surrogate range.
        /// </summary>
        private const char TrailingSurrogateStart = '\uDC00';

        /// <summary>
        /// Code point of the Unicode replacement character, U+FFFD.
        /// </summary>
        /// <remarks>
        /// See http://www.unicode.org/charts/PDF/UFFF0.pdf for details.
        /// </remarks>
        private const int UnicodeReplacementCharacterCodePoint = '\uFFFD';

        /// <summary>
        /// Index into '_input' of the next code unit that has not been consumed yet.
        /// </summary>
        private int _currentOffset;

        /// <summary>
        /// The string being decoded.
        /// </summary>
        private readonly string _input;

        /// <summary>
        /// Creates a reader positioned at the start of the supplied UTF-16 string.
        /// </summary>
        /// <param name="input">The string to break down into scalar values. Null is only
        /// rejected by a debug assertion.</param>
        public Utf16StringReader(string input) {
            Debug.Assert(input != null);

            _currentOffset = 0;
            _input = input;
        }

        /// <summary>
        /// Combines a surrogate pair into a single scalar value. Comparable to Char.ConvertToUtf32,
        /// but skips argument validation so it is slightly cheaper in tight loops.
        /// </summary>
        /// <param name="leadingSurrogate">The UTF-16 leading (high) surrogate.</param>
        /// <param name="trailingSurrogate">The UTF-16 trailing (low) surrogate.</param>
        /// <returns>The scalar value encoded by the two surrogates.</returns>
        /// <remarks>Callers are responsible for passing a genuine high/low surrogate pair;
        /// the result is undefined otherwise (only debug assertions guard the inputs).</remarks>
        private static int ConvertToUtf32(char leadingSurrogate, char trailingSurrogate) {
            Debug.Assert(Char.IsHighSurrogate(leadingSurrogate), "'leadingSurrogate' was not a high surrogate.");
            Debug.Assert(Char.IsLowSurrogate(trailingSurrogate), "'trailingSurrogate' was not a low surrogate.");

            // Each surrogate contributes 10 bits; the pair encodes an offset from U+10000.
            int highTenBits = leadingSurrogate - LeadingSurrogateStart;
            int lowTenBits = trailingSurrogate - TrailingSurrogateStart;
            int offsetFromSupplementaryBase = highTenBits * 0x400 + lowTenBits;
            return offsetFromSupplementaryBase + 0x10000;
        }

        /// <summary>
        /// Tells whether a code point is a Unicode scalar value.
        /// </summary>
        /// <param name="codePoint">The code point to test.</param>
        /// <returns>True for U+0000..U+D7FF and U+E000..U+10FFFF; false for anything else.</returns>
        private static bool IsValidUnicodeScalarValue(int codePoint) {
            // See: http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf, D76
            if (codePoint < 0 || codePoint > 0x10FFFF) {
                // Outside the Unicode code space entirely.
                return false;
            }

            // Inside the code space, only the surrogate range U+D800..U+DFFF is excluded.
            return codePoint < 0xD800 || codePoint > 0xDFFF;
        }

        /// <summary>
        /// Decodes and consumes the next scalar value from the input string.
        /// </summary>
        /// <returns>The decoded scalar value; U+FFFD (the Unicode replacement character) when
        /// the code unit at the current position is an unpaired surrogate; or -1 once the end
        /// of the string has been reached.</returns>
        public int ReadNextScalarValue() {
            string text = _input;
            int position = _currentOffset;

            if (position >= text.Length) {
                return -1; // EOF
            }

            char firstUnit = text[position];
            position++;
            int scalar = firstUnit;

            // A high (leading) surrogate only forms a pair when a low (trailing)
            // surrogate immediately follows it within the string.
            bool pairFollows = Char.IsHighSurrogate(firstUnit)
                && position < text.Length
                && Char.IsLowSurrogate(text[position]);

            if (pairFollows) {
                // Both code units are consumed, so step past the trailing one too.
                scalar = ConvertToUtf32(firstUnit, text[position]);
                position++;
            }

            _currentOffset = position;

            // Anything that is still not a scalar value at this point is an unmatched
            // high or low surrogate. Per the Unicode specification (Ch. 3, C10), such
            // input is replaced with U+FFFD.
            return IsValidUnicodeScalarValue(scalar)
                ? scalar
                : UnicodeReplacementCharacterCodePoint;
        }

    }
}
 |