------------------------------------------------------------------------------ -- XML/Ada - An XML suite for Ada95 -- -- -- -- Copyright (C) 2001-2017, AdaCore -- -- -- -- This library is free software; you can redistribute it and/or modify it -- -- under terms of the GNU General Public License as published by the Free -- -- Software Foundation; either version 3, or (at your option) any later -- -- version. This library is distributed in the hope that it will be useful, -- -- but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHAN- -- -- TABILITY or FITNESS FOR A PARTICULAR PURPOSE. -- -- -- -- As a special exception under Section 7 of GPL version 3, you are granted -- -- additional permissions described in the GCC Runtime Library Exception, -- -- version 3.1, as published by the Free Software Foundation. -- -- -- -- You should have received a copy of the GNU General Public License and -- -- a copy of the GCC Runtime Library Exception along with this program; -- -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- -- . -- -- -- ------------------------------------------------------------------------------ with Unicode.CES.Utf32; use Unicode.CES.Utf32; with Unicode.CCS; use Unicode.CCS; package body Unicode.CES.Utf8 is function Internal_Convert (Str : Utf8_String; Convert : Unicode.CCS.Conversion_Function := Identity'Access; Order : Byte_Order := Default_Byte_Order) return Utf8_String; -- Internal function used to convert character sets type Byte is mod 2 ** 8; Utf8_Skip_Data : constant array (Character) of Byte := (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1); -- Skip-byte depending on the first byte of a utf8 string. Utf8_Mask : constant array (Character) of Byte := (16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#07#, 16#07#, 16#07#, 16#07#, 16#07#, 16#07#, 16#07#, 16#07#, 16#03#, 16#03#, 16#03#, 16#03#, 16#01#, 16#01#, 0, 0); -- Mask to decode the first byte of utf8. -- This was generated automatically with the following algorithm: -- if C < 128 then Mask := 16#7f#; -- elsif (C and 16#E0#) = 16#C0# then Mask := 16#1f#; -- elsif (C and 16#F0#) = 16#E0# then Mask := 16#0f#; -- elsif (C and 16#F8#) = 16#F0# then Mask := 16#07#; -- elsif (C and 16#FC#) = 16#F8# then Mask := 16#03#; -- elsif (C and 16#FE#) = 16#FC# then Mask := 16#01#; function Shift_Left (Value : Unicode_Char; Amount : Natural) return Unicode_Char; pragma Import (Intrinsic, Shift_Left); function Shift_Right (Value : Unicode_Char; Amount : Natural) return Unicode_Char; pragma Import (Intrinsic, Shift_Right); -------------------- -- Utf8_Next_Char -- -------------------- function Utf8_Next_Char (Str : Utf8_String; Index : Natural) return Natural is begin return Index + Natural (Utf8_Skip_Data (Str (Index))); end Utf8_Next_Char; ------------------------- -- Utf8_Find_Next_Char -- ------------------------- function Utf8_Find_Next_Char (Str : Utf8_String; Index : Natural) return Natural is Idx : Natural := Index; C : Unicode_Char; begin while Idx <= Str'Last loop C := Character'Pos (Str (Idx)); if (C and 16#c0#) /= 16#80# then return Idx; end if; Idx := Idx + 1; end loop; return Str'Last + 1; end Utf8_Find_Next_Char; --------------------- -- Utf8_Prev_Char -- --------------------- function Utf8_Prev_Char (Str : Utf8_String; Index : Natural) return Natural is C : Unicode_Char; Idx : Natural := Index; begin while Idx > Str'First loop Idx := Idx - 1; C := Character'Pos (Str (Idx)); if (C and 16#c0#) /= 16#80# then return Idx; end if; end loop; return Str'First - 1; end Utf8_Prev_Char; ------------------- -- Utf8_Get_Char -- ------------------- procedure Utf8_Get_Char (Str : Utf8_String; Index : in out Positive; Char : out Unicode_Char) is -- pragma Suppress (All_Checks); Len : Natural; Val : Unicode_Char; C : Unicode_Char := Character'Pos (Str (Index)); Mask : constant Byte := Utf8_Mask (Str (Index)); begin if Mask = 0 then Char := Unicode_Char'Last; return; end if; Val := C and Unicode_Char (Mask); Len := Index + Natural (Utf8_Skip_Data (Str (Index))) - 1; if Str'Last < Len then Char := Unicode_Char'Last; return; end if; for Count in Index + 1 .. Len loop C := Character'Pos (Str (Count)); if (C and 16#C0#) /= 16#80# then Char := Unicode_Char'Last; return; end if; Val := Shift_Left (Val, 6) or (C and 16#3f#); end loop; Index := Len + 1; Char := Val; end Utf8_Get_Char; ----------- -- Width -- ----------- function Width (Char : Unicode_Char) return Natural is begin if Char < 16#80# then return 1; elsif Char < 16#800# then return 2; elsif Char < 16#10000# then return 3; elsif Char < 16#200000# then return 4; elsif Char < 16#4000000# then return 5; else return 6; end if; end Width; ------------ -- Encode -- ------------ procedure Encode (Char : Unicode_Char; Output : in out Byte_Sequence; Index : in out Natural) is Len : Natural; Mask : Unicode_Char; Val : Unicode_Char := Char; First, Last : Natural; begin if Char < 16#80# then Len := 1; Mask := 16#0#; elsif Char < 16#800# then Len := 2; Mask := 16#C0#; elsif Char < 16#10000# then Len := 3; Mask := 16#E0#; elsif Char < 16#200000# then Len := 4; Mask := 16#F0#; elsif Char < 16#4000000# then Len := 5; Mask := 16#F8#; else Len := 6; Mask := 16#FC#; end if; First := Index + 2; Last := Index + Len; for J in reverse First .. Last loop Output (J) := Character'Val ((Val and 16#3f#) or 16#80#); Val := Shift_Right (Val, 6); end loop; Output (Index + 1) := Character'Val (Val or Mask); Index := Last; end Encode; ---------- -- Read -- ---------- procedure Read (Str : Utf8_String; Index : in out Positive; Char : out Unicode_Char) is begin Utf8_Get_Char (Str, Index, Char); if Char = Unicode_Char'Last then raise Invalid_Encoding; end if; end Read; ------------ -- Length -- ------------ function Length (Str : Utf8_String) return Natural is Pos : Natural := Str'First; Length : Natural := 0; C : Unicode_Char; begin while Pos <= Str'Last loop Read (Str, Pos, C); Length := Length + 1; end loop; return Length; end Length; ---------------- -- From_Utf32 -- ---------------- function From_Utf32 (Str : Unicode.CES.Utf32.Utf32_String) return Utf8_String is -- Allocate worst case Result : Utf8_String (1 .. (Str'Length / Utf32_Char_Width) * 6); J : Positive := Str'First; R_Index : Natural := Result'First - 1; C : Unicode_Char; begin while J <= Str'Last loop Unicode.CES.Utf32.Read (Str, J, C); Encode (C, Result, R_Index); end loop; return Result (1 .. R_Index); end From_Utf32; -------------- -- To_Utf32 -- -------------- function To_Utf32 (Str : Utf8_String) return Unicode.CES.Utf32.Utf32_LE_String is -- Allocate worst case Result : Utf32_LE_String (1 .. Str'Length * Utf32_Char_Width); J : Positive := Str'First; R_Index : Natural := Result'First - 1; C : Unicode_Char; begin while J <= Str'Last loop Read (Str, J, C); Unicode.CES.Utf32.Encode (C, Result, R_Index); end loop; return Result (1 .. R_Index); end To_Utf32; ---------------------- -- Internal_Convert -- ---------------------- function Internal_Convert (Str : Utf8_String; Convert : Unicode.CCS.Conversion_Function := Identity'Access; Order : Byte_Order := Default_Byte_Order) return Utf8_String is pragma Warnings (Off, Order); Offset : Natural := 0; BOM : Bom_Type; C : Unicode_Char; J : Natural; begin Read_Bom (Str, Offset, BOM); case BOM is when Utf8_All | Unknown => null; when others => raise Invalid_Encoding; end case; if Convert = Identity'Access then return Str (Str'First + Offset .. Str'Last); else declare -- Allocate worst case for the string Result : Utf8_String (1 .. Str'Length * 6); R_Index : Natural := Result'First - 1; begin J := Str'First + Offset; while J <= Str'Last loop Read (Str, J, C); Encode (Convert (C), Result, R_Index); end loop; return Result (1 .. R_Index); end; end if; end Internal_Convert; ------------------- -- To_Unicode_LE -- ------------------- function To_Unicode_LE (Str : Utf8_String; Cs : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set; Order : Byte_Order := Default_Byte_Order) return Utf8_String is begin return Internal_Convert (Str, Cs.To_Unicode, Order); end To_Unicode_LE; ----------- -- To_CS -- ----------- function To_CS (Str : Utf8_String; Cs : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set; Order : Byte_Order := Default_Byte_Order) return Utf8_String is begin return Internal_Convert (Str, Cs.To_CS, Order); end To_CS; end Unicode.CES.Utf8;