mirror of
https://github.com/AdaCore/xmlada.git
synced 2026-02-12 12:30:28 -08:00
383 lines
13 KiB
Ada
383 lines
13 KiB
Ada
------------------------------------------------------------------------------
|
|
-- XML/Ada - An XML suite for Ada95 --
|
|
-- --
|
|
-- Copyright (C) 2001-2017, AdaCore --
|
|
-- --
|
|
-- This library is free software; you can redistribute it and/or modify it --
|
|
-- under terms of the GNU General Public License as published by the Free --
|
|
-- Software Foundation; either version 3, or (at your option) any later --
|
|
-- version. This library is distributed in the hope that it will be useful, --
|
|
-- but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHAN- --
|
|
-- TABILITY or FITNESS FOR A PARTICULAR PURPOSE. --
|
|
-- --
|
|
-- As a special exception under Section 7 of GPL version 3, you are granted --
|
|
-- additional permissions described in the GCC Runtime Library Exception, --
|
|
-- version 3.1, as published by the Free Software Foundation. --
|
|
-- --
|
|
-- You should have received a copy of the GNU General Public License and --
|
|
-- a copy of the GCC Runtime Library Exception along with this program; --
|
|
-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
|
|
-- <http://www.gnu.org/licenses/>. --
|
|
-- --
|
|
------------------------------------------------------------------------------
|
|
|
|
with Unicode.CES.Utf32; use Unicode.CES.Utf32;
|
|
with Unicode.CCS; use Unicode.CCS;
|
|
|
|
package body Unicode.CES.Utf8 is
|
|
|
|
function Internal_Convert
|
|
(Str : Utf8_String;
|
|
Convert : Unicode.CCS.Conversion_Function := Identity'Access;
|
|
Order : Byte_Order := Default_Byte_Order) return Utf8_String;
|
|
-- Internal function used to convert character sets
|
|
|
|
type Byte is mod 2 ** 8;
|
|
Utf8_Skip_Data : constant array (Character) of Byte :=
|
|
(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
|
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
|
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6,
|
|
6, 1, 1);
|
|
-- Skip-byte depending on the first byte of a utf8 string.
|
|
|
|
Utf8_Mask : constant array (Character) of Byte :=
|
|
(16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
|
|
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
|
|
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
|
|
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
|
|
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
|
|
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
|
|
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
|
|
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
|
|
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
|
|
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
|
|
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
|
|
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
|
|
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
|
|
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
|
|
16#7f#, 16#7f#, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#,
|
|
16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#,
|
|
16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#,
|
|
16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#,
|
|
16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#,
|
|
16#0f#, 16#0f#, 16#0f#, 16#07#, 16#07#, 16#07#, 16#07#, 16#07#, 16#07#,
|
|
16#07#, 16#07#, 16#03#, 16#03#, 16#03#, 16#03#, 16#01#, 16#01#, 0, 0);
|
|
-- Mask to decode the first byte of utf8.
|
|
-- This was generated automatically with the following algorithm:
|
|
-- if C < 128 then Mask := 16#7f#;
|
|
-- elsif (C and 16#E0#) = 16#C0# then Mask := 16#1f#;
|
|
-- elsif (C and 16#F0#) = 16#E0# then Mask := 16#0f#;
|
|
-- elsif (C and 16#F8#) = 16#F0# then Mask := 16#07#;
|
|
-- elsif (C and 16#FC#) = 16#F8# then Mask := 16#03#;
|
|
-- elsif (C and 16#FE#) = 16#FC# then Mask := 16#01#;
|
|
|
|
function Shift_Left
|
|
(Value : Unicode_Char; Amount : Natural) return Unicode_Char;
|
|
pragma Import (Intrinsic, Shift_Left);
|
|
|
|
function Shift_Right
|
|
(Value : Unicode_Char; Amount : Natural) return Unicode_Char;
|
|
pragma Import (Intrinsic, Shift_Right);
|
|
|
|
--------------------
|
|
-- Utf8_Next_Char --
|
|
--------------------
|
|
|
|
function Utf8_Next_Char
|
|
(Str : Utf8_String; Index : Natural) return Natural is
|
|
begin
|
|
return Index + Natural (Utf8_Skip_Data (Str (Index)));
|
|
end Utf8_Next_Char;
|
|
|
|
-------------------------
|
|
-- Utf8_Find_Next_Char --
|
|
-------------------------
|
|
|
|
function Utf8_Find_Next_Char
|
|
(Str : Utf8_String; Index : Natural) return Natural
|
|
is
|
|
Idx : Natural := Index;
|
|
C : Unicode_Char;
|
|
begin
|
|
while Idx <= Str'Last loop
|
|
C := Character'Pos (Str (Idx));
|
|
if (C and 16#c0#) /= 16#80# then
|
|
return Idx;
|
|
end if;
|
|
Idx := Idx + 1;
|
|
end loop;
|
|
return Str'Last + 1;
|
|
end Utf8_Find_Next_Char;
|
|
|
|
---------------------
|
|
-- Utf8_Prev_Char --
|
|
---------------------
|
|
|
|
function Utf8_Prev_Char
|
|
(Str : Utf8_String; Index : Natural) return Natural
|
|
is
|
|
C : Unicode_Char;
|
|
Idx : Natural := Index;
|
|
begin
|
|
while Idx > Str'First loop
|
|
Idx := Idx - 1;
|
|
C := Character'Pos (Str (Idx));
|
|
if (C and 16#c0#) /= 16#80# then
|
|
return Idx;
|
|
end if;
|
|
end loop;
|
|
return Str'First - 1;
|
|
end Utf8_Prev_Char;
|
|
|
|
-------------------
|
|
-- Utf8_Get_Char --
|
|
-------------------
|
|
|
|
procedure Utf8_Get_Char
|
|
(Str : Utf8_String; Index : in out Positive; Char : out Unicode_Char)
|
|
is
|
|
-- pragma Suppress (All_Checks);
|
|
Len : Natural;
|
|
Val : Unicode_Char;
|
|
C : Unicode_Char := Character'Pos (Str (Index));
|
|
Mask : constant Byte := Utf8_Mask (Str (Index));
|
|
begin
|
|
if Mask = 0 then
|
|
Char := Unicode_Char'Last;
|
|
return;
|
|
end if;
|
|
|
|
Val := C and Unicode_Char (Mask);
|
|
Len := Index + Natural (Utf8_Skip_Data (Str (Index))) - 1;
|
|
|
|
if Str'Last < Len then
|
|
Char := Unicode_Char'Last;
|
|
return;
|
|
end if;
|
|
|
|
for Count in Index + 1 .. Len loop
|
|
C := Character'Pos (Str (Count));
|
|
if (C and 16#C0#) /= 16#80# then
|
|
Char := Unicode_Char'Last;
|
|
return;
|
|
end if;
|
|
Val := Shift_Left (Val, 6) or (C and 16#3f#);
|
|
end loop;
|
|
|
|
Index := Len + 1;
|
|
Char := Val;
|
|
end Utf8_Get_Char;
|
|
|
|
-----------
|
|
-- Width --
|
|
-----------
|
|
|
|
function Width (Char : Unicode_Char) return Natural is
|
|
begin
|
|
if Char < 16#80# then
|
|
return 1;
|
|
elsif Char < 16#800# then
|
|
return 2;
|
|
elsif Char < 16#10000# then
|
|
return 3;
|
|
elsif Char < 16#200000# then
|
|
return 4;
|
|
elsif Char < 16#4000000# then
|
|
return 5;
|
|
else
|
|
return 6;
|
|
end if;
|
|
end Width;
|
|
|
|
------------
|
|
-- Encode --
|
|
------------
|
|
|
|
procedure Encode
|
|
(Char : Unicode_Char;
|
|
Output : in out Byte_Sequence;
|
|
Index : in out Natural)
|
|
is
|
|
Len : Natural;
|
|
Mask : Unicode_Char;
|
|
Val : Unicode_Char := Char;
|
|
First, Last : Natural;
|
|
begin
|
|
if Char < 16#80# then
|
|
Len := 1;
|
|
Mask := 16#0#;
|
|
elsif Char < 16#800# then
|
|
Len := 2;
|
|
Mask := 16#C0#;
|
|
elsif Char < 16#10000# then
|
|
Len := 3;
|
|
Mask := 16#E0#;
|
|
elsif Char < 16#200000# then
|
|
Len := 4;
|
|
Mask := 16#F0#;
|
|
elsif Char < 16#4000000# then
|
|
Len := 5;
|
|
Mask := 16#F8#;
|
|
else
|
|
Len := 6;
|
|
Mask := 16#FC#;
|
|
end if;
|
|
|
|
First := Index + 2;
|
|
Last := Index + Len;
|
|
for J in reverse First .. Last loop
|
|
Output (J) := Character'Val ((Val and 16#3f#) or 16#80#);
|
|
Val := Shift_Right (Val, 6);
|
|
end loop;
|
|
Output (Index + 1) := Character'Val (Val or Mask);
|
|
Index := Last;
|
|
end Encode;
|
|
|
|
----------
|
|
-- Read --
|
|
----------
|
|
|
|
procedure Read
|
|
(Str : Utf8_String;
|
|
Index : in out Positive;
|
|
Char : out Unicode_Char) is
|
|
begin
|
|
Utf8_Get_Char (Str, Index, Char);
|
|
if Char = Unicode_Char'Last then
|
|
raise Invalid_Encoding;
|
|
end if;
|
|
end Read;
|
|
|
|
------------
|
|
-- Length --
|
|
------------
|
|
|
|
function Length (Str : Utf8_String) return Natural is
|
|
Pos : Natural := Str'First;
|
|
Length : Natural := 0;
|
|
C : Unicode_Char;
|
|
begin
|
|
while Pos <= Str'Last loop
|
|
Read (Str, Pos, C);
|
|
Length := Length + 1;
|
|
end loop;
|
|
return Length;
|
|
end Length;
|
|
|
|
----------------
|
|
-- From_Utf32 --
|
|
----------------
|
|
|
|
function From_Utf32 (Str : Unicode.CES.Utf32.Utf32_String)
|
|
return Utf8_String
|
|
is
|
|
-- Allocate worst case
|
|
Result : Utf8_String (1 .. (Str'Length / Utf32_Char_Width) * 6);
|
|
J : Positive := Str'First;
|
|
R_Index : Natural := Result'First - 1;
|
|
C : Unicode_Char;
|
|
begin
|
|
while J <= Str'Last loop
|
|
Unicode.CES.Utf32.Read (Str, J, C);
|
|
Encode (C, Result, R_Index);
|
|
end loop;
|
|
|
|
return Result (1 .. R_Index);
|
|
end From_Utf32;
|
|
|
|
--------------
|
|
-- To_Utf32 --
|
|
--------------
|
|
|
|
function To_Utf32 (Str : Utf8_String)
|
|
return Unicode.CES.Utf32.Utf32_LE_String
|
|
is
|
|
-- Allocate worst case
|
|
Result : Utf32_LE_String (1 .. Str'Length * Utf32_Char_Width);
|
|
J : Positive := Str'First;
|
|
R_Index : Natural := Result'First - 1;
|
|
C : Unicode_Char;
|
|
begin
|
|
while J <= Str'Last loop
|
|
Read (Str, J, C);
|
|
Unicode.CES.Utf32.Encode (C, Result, R_Index);
|
|
end loop;
|
|
return Result (1 .. R_Index);
|
|
end To_Utf32;
|
|
|
|
----------------------
|
|
-- Internal_Convert --
|
|
----------------------
|
|
|
|
function Internal_Convert
|
|
(Str : Utf8_String;
|
|
Convert : Unicode.CCS.Conversion_Function := Identity'Access;
|
|
Order : Byte_Order := Default_Byte_Order) return Utf8_String
|
|
is
|
|
pragma Warnings (Off, Order);
|
|
Offset : Natural := 0;
|
|
BOM : Bom_Type;
|
|
C : Unicode_Char;
|
|
J : Natural;
|
|
begin
|
|
Read_Bom (Str, Offset, BOM);
|
|
|
|
case BOM is
|
|
when Utf8_All | Unknown => null;
|
|
when others => raise Invalid_Encoding;
|
|
end case;
|
|
|
|
if Convert = Identity'Access then
|
|
return Str (Str'First + Offset .. Str'Last);
|
|
else
|
|
declare
|
|
-- Allocate worst case for the string
|
|
Result : Utf8_String (1 .. Str'Length * 6);
|
|
R_Index : Natural := Result'First - 1;
|
|
begin
|
|
J := Str'First + Offset;
|
|
while J <= Str'Last loop
|
|
Read (Str, J, C);
|
|
Encode (Convert (C), Result, R_Index);
|
|
end loop;
|
|
return Result (1 .. R_Index);
|
|
end;
|
|
end if;
|
|
end Internal_Convert;
|
|
|
|
-------------------
|
|
-- To_Unicode_LE --
|
|
-------------------
|
|
|
|
function To_Unicode_LE
|
|
(Str : Utf8_String;
|
|
Cs : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
|
|
Order : Byte_Order := Default_Byte_Order) return Utf8_String is
|
|
begin
|
|
return Internal_Convert (Str, Cs.To_Unicode, Order);
|
|
end To_Unicode_LE;
|
|
|
|
-----------
|
|
-- To_CS --
|
|
-----------
|
|
|
|
function To_CS
|
|
(Str : Utf8_String;
|
|
Cs : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
|
|
Order : Byte_Order := Default_Byte_Order) return Utf8_String is
|
|
begin
|
|
return Internal_Convert (Str, Cs.To_CS, Order);
|
|
end To_CS;
|
|
end Unicode.CES.Utf8;
|