Files
xmlada/unicode/unicode-ces-utf8.adb
Emmanuel Briot 53dc51bf5c Update copyright notices
Change-Id: Iae1283292b460ead48dedc565d6f24259e497e69
2017-01-03 10:02:38 +01:00

383 lines
13 KiB
Ada

------------------------------------------------------------------------------
-- XML/Ada - An XML suite for Ada95 --
-- --
-- Copyright (C) 2001-2017, AdaCore --
-- --
-- This library is free software; you can redistribute it and/or modify it --
-- under terms of the GNU General Public License as published by the Free --
-- Software Foundation; either version 3, or (at your option) any later --
-- version. This library is distributed in the hope that it will be useful, --
-- but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHAN- --
-- TABILITY or FITNESS FOR A PARTICULAR PURPOSE. --
-- --
-- As a special exception under Section 7 of GPL version 3, you are granted --
-- additional permissions described in the GCC Runtime Library Exception, --
-- version 3.1, as published by the Free Software Foundation. --
-- --
-- You should have received a copy of the GNU General Public License and --
-- a copy of the GCC Runtime Library Exception along with this program; --
-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
-- <http://www.gnu.org/licenses/>. --
-- --
------------------------------------------------------------------------------
with Unicode.CES.Utf32; use Unicode.CES.Utf32;
with Unicode.CCS; use Unicode.CCS;
package body Unicode.CES.Utf8 is
function Internal_Convert
(Str : Utf8_String;
Convert : Unicode.CCS.Conversion_Function := Identity'Access;
Order : Byte_Order := Default_Byte_Order) return Utf8_String;
-- Internal function used to convert character sets
type Byte is mod 2 ** 8;
Utf8_Skip_Data : constant array (Character) of Byte :=
(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6,
6, 1, 1);
-- Skip-byte depending on the first byte of a utf8 string.
Utf8_Mask : constant array (Character) of Byte :=
(16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#, 16#7f#,
16#7f#, 16#7f#, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#,
16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#,
16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#,
16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#1f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#,
16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#, 16#0f#,
16#0f#, 16#0f#, 16#0f#, 16#07#, 16#07#, 16#07#, 16#07#, 16#07#, 16#07#,
16#07#, 16#07#, 16#03#, 16#03#, 16#03#, 16#03#, 16#01#, 16#01#, 0, 0);
-- Mask to decode the first byte of utf8.
-- This was generated automatically with the following algorithm:
-- if C < 128 then Mask := 16#7f#;
-- elsif (C and 16#E0#) = 16#C0# then Mask := 16#1f#;
-- elsif (C and 16#F0#) = 16#E0# then Mask := 16#0f#;
-- elsif (C and 16#F8#) = 16#F0# then Mask := 16#07#;
-- elsif (C and 16#FC#) = 16#F8# then Mask := 16#03#;
-- elsif (C and 16#FE#) = 16#FC# then Mask := 16#01#;
function Shift_Left
(Value : Unicode_Char; Amount : Natural) return Unicode_Char;
pragma Import (Intrinsic, Shift_Left);
function Shift_Right
(Value : Unicode_Char; Amount : Natural) return Unicode_Char;
pragma Import (Intrinsic, Shift_Right);
--------------------
-- Utf8_Next_Char --
--------------------
function Utf8_Next_Char
(Str : Utf8_String; Index : Natural) return Natural is
begin
return Index + Natural (Utf8_Skip_Data (Str (Index)));
end Utf8_Next_Char;
-------------------------
-- Utf8_Find_Next_Char --
-------------------------
function Utf8_Find_Next_Char
(Str : Utf8_String; Index : Natural) return Natural
is
Idx : Natural := Index;
C : Unicode_Char;
begin
while Idx <= Str'Last loop
C := Character'Pos (Str (Idx));
if (C and 16#c0#) /= 16#80# then
return Idx;
end if;
Idx := Idx + 1;
end loop;
return Str'Last + 1;
end Utf8_Find_Next_Char;
---------------------
-- Utf8_Prev_Char --
---------------------
function Utf8_Prev_Char
(Str : Utf8_String; Index : Natural) return Natural
is
C : Unicode_Char;
Idx : Natural := Index;
begin
while Idx > Str'First loop
Idx := Idx - 1;
C := Character'Pos (Str (Idx));
if (C and 16#c0#) /= 16#80# then
return Idx;
end if;
end loop;
return Str'First - 1;
end Utf8_Prev_Char;
-------------------
-- Utf8_Get_Char --
-------------------
procedure Utf8_Get_Char
(Str : Utf8_String; Index : in out Positive; Char : out Unicode_Char)
is
-- pragma Suppress (All_Checks);
Len : Natural;
Val : Unicode_Char;
C : Unicode_Char := Character'Pos (Str (Index));
Mask : constant Byte := Utf8_Mask (Str (Index));
begin
if Mask = 0 then
Char := Unicode_Char'Last;
return;
end if;
Val := C and Unicode_Char (Mask);
Len := Index + Natural (Utf8_Skip_Data (Str (Index))) - 1;
if Str'Last < Len then
Char := Unicode_Char'Last;
return;
end if;
for Count in Index + 1 .. Len loop
C := Character'Pos (Str (Count));
if (C and 16#C0#) /= 16#80# then
Char := Unicode_Char'Last;
return;
end if;
Val := Shift_Left (Val, 6) or (C and 16#3f#);
end loop;
Index := Len + 1;
Char := Val;
end Utf8_Get_Char;
-----------
-- Width --
-----------
function Width (Char : Unicode_Char) return Natural is
begin
if Char < 16#80# then
return 1;
elsif Char < 16#800# then
return 2;
elsif Char < 16#10000# then
return 3;
elsif Char < 16#200000# then
return 4;
elsif Char < 16#4000000# then
return 5;
else
return 6;
end if;
end Width;
------------
-- Encode --
------------
procedure Encode
(Char : Unicode_Char;
Output : in out Byte_Sequence;
Index : in out Natural)
is
Len : Natural;
Mask : Unicode_Char;
Val : Unicode_Char := Char;
First, Last : Natural;
begin
if Char < 16#80# then
Len := 1;
Mask := 16#0#;
elsif Char < 16#800# then
Len := 2;
Mask := 16#C0#;
elsif Char < 16#10000# then
Len := 3;
Mask := 16#E0#;
elsif Char < 16#200000# then
Len := 4;
Mask := 16#F0#;
elsif Char < 16#4000000# then
Len := 5;
Mask := 16#F8#;
else
Len := 6;
Mask := 16#FC#;
end if;
First := Index + 2;
Last := Index + Len;
for J in reverse First .. Last loop
Output (J) := Character'Val ((Val and 16#3f#) or 16#80#);
Val := Shift_Right (Val, 6);
end loop;
Output (Index + 1) := Character'Val (Val or Mask);
Index := Last;
end Encode;
----------
-- Read --
----------
procedure Read
(Str : Utf8_String;
Index : in out Positive;
Char : out Unicode_Char) is
begin
Utf8_Get_Char (Str, Index, Char);
if Char = Unicode_Char'Last then
raise Invalid_Encoding;
end if;
end Read;
------------
-- Length --
------------
function Length (Str : Utf8_String) return Natural is
Pos : Natural := Str'First;
Length : Natural := 0;
C : Unicode_Char;
begin
while Pos <= Str'Last loop
Read (Str, Pos, C);
Length := Length + 1;
end loop;
return Length;
end Length;
----------------
-- From_Utf32 --
----------------
function From_Utf32 (Str : Unicode.CES.Utf32.Utf32_String)
return Utf8_String
is
-- Allocate worst case
Result : Utf8_String (1 .. (Str'Length / Utf32_Char_Width) * 6);
J : Positive := Str'First;
R_Index : Natural := Result'First - 1;
C : Unicode_Char;
begin
while J <= Str'Last loop
Unicode.CES.Utf32.Read (Str, J, C);
Encode (C, Result, R_Index);
end loop;
return Result (1 .. R_Index);
end From_Utf32;
--------------
-- To_Utf32 --
--------------
function To_Utf32 (Str : Utf8_String)
return Unicode.CES.Utf32.Utf32_LE_String
is
-- Allocate worst case
Result : Utf32_LE_String (1 .. Str'Length * Utf32_Char_Width);
J : Positive := Str'First;
R_Index : Natural := Result'First - 1;
C : Unicode_Char;
begin
while J <= Str'Last loop
Read (Str, J, C);
Unicode.CES.Utf32.Encode (C, Result, R_Index);
end loop;
return Result (1 .. R_Index);
end To_Utf32;
----------------------
-- Internal_Convert --
----------------------
function Internal_Convert
(Str : Utf8_String;
Convert : Unicode.CCS.Conversion_Function := Identity'Access;
Order : Byte_Order := Default_Byte_Order) return Utf8_String
is
pragma Warnings (Off, Order);
Offset : Natural := 0;
BOM : Bom_Type;
C : Unicode_Char;
J : Natural;
begin
Read_Bom (Str, Offset, BOM);
case BOM is
when Utf8_All | Unknown => null;
when others => raise Invalid_Encoding;
end case;
if Convert = Identity'Access then
return Str (Str'First + Offset .. Str'Last);
else
declare
-- Allocate worst case for the string
Result : Utf8_String (1 .. Str'Length * 6);
R_Index : Natural := Result'First - 1;
begin
J := Str'First + Offset;
while J <= Str'Last loop
Read (Str, J, C);
Encode (Convert (C), Result, R_Index);
end loop;
return Result (1 .. R_Index);
end;
end if;
end Internal_Convert;
-------------------
-- To_Unicode_LE --
-------------------
function To_Unicode_LE
(Str : Utf8_String;
Cs : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
Order : Byte_Order := Default_Byte_Order) return Utf8_String is
begin
return Internal_Convert (Str, Cs.To_Unicode, Order);
end To_Unicode_LE;
-----------
-- To_CS --
-----------
function To_CS
(Str : Utf8_String;
Cs : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
Order : Byte_Order := Default_Byte_Order) return Utf8_String is
begin
return Internal_Convert (Str, Cs.To_CS, Order);
end To_CS;
end Unicode.CES.Utf8;