Files
xmlada/unicode/unicode-ces-utf16.adb
Emmanuel Briot 53dc51bf5c Update copyright notices
Change-Id: Iae1283292b460ead48dedc565d6f24259e497e69
2017-01-03 10:02:38 +01:00

347 lines
10 KiB
Ada

------------------------------------------------------------------------------
-- XML/Ada - An XML suite for Ada95 --
-- --
-- Copyright (C) 2001-2017, AdaCore --
-- --
-- This library is free software; you can redistribute it and/or modify it --
-- under terms of the GNU General Public License as published by the Free --
-- Software Foundation; either version 3, or (at your option) any later --
-- version. This library is distributed in the hope that it will be useful, --
-- but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHAN- --
-- TABILITY or FITNESS FOR A PARTICULAR PURPOSE. --
-- --
-- As a special exception under Section 7 of GPL version 3, you are granted --
-- additional permissions described in the GCC Runtime Library Exception, --
-- version 3.1, as published by the Free Software Foundation. --
-- --
-- You should have received a copy of the GNU General Public License and --
-- a copy of the GCC Runtime Library Exception along with this program; --
-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
-- <http://www.gnu.org/licenses/>. --
-- --
------------------------------------------------------------------------------
with Unicode.CES.Utf32; use Unicode.CES.Utf32;
with Unicode.CCS; use Unicode.CCS;
package body Unicode.CES.Utf16 is
------------
-- Encode --
------------
procedure Encode
(Char : Unicode_Char;
Output : in out Byte_Sequence;
Index : in out Natural)
is
C, D : Unicode_Char;
begin
if Char < 16#10000# then
C := Char and 16#00FF#;
D := (Char and 16#FF00#) / (2 ** 8);
Output (Index + 1) := Character'Val (C);
Output (Index + 2) := Character'Val (D);
Index := Index + 2;
else
C := 16#D800#
+ ((Char - 16#10000#) and 2#11111111110000000000#) / (2 ** 10);
D := 16#DC00#
+ ((Char - 16#10000#) and 2#1111111111#);
Output (Index + 1) := Character'Val (C and 16#00FF#);
Output (Index + 2) := Character'Val ((C and 16#FF00#) / (2 ** 8));
Output (Index + 3) := Character'Val (D and 16#00FF#);
Output (Index + 4) := Character'Val ((D and 16#FF00#) / (2 ** 8));
Index := Index + 4;
end if;
end Encode;
---------------
-- Encode_BE --
---------------
procedure Encode_BE
(Char : Unicode_Char;
Output : in out Byte_Sequence;
Index : in out Natural)
is
C, D : Unicode_Char;
begin
if Char < 16#10000# then
C := Char and 16#00FF#;
D := (Char and 16#FF00#) / (2 ** 8);
Output (Index + 1) := Character'Val (D);
Output (Index + 2) := Character'Val (C);
Index := Index + 2;
else
C := 16#D800#
+ ((Char - 16#10000#) and 2#11111111110000000000#) / (2 ** 10);
D := 16#DC00#
+ ((Char - 16#10000#) and 2#1111111111#);
Output (Index + 1) := Character'Val ((C and 16#FF00#) / (2 ** 8));
Output (Index + 2) := Character'Val (C and 16#00FF#);
Output (Index + 3) := Character'Val ((D and 16#FF00#) / (2 ** 8));
Output (Index + 4) := Character'Val (D and 16#00FF#);
Index := Index + 4;
end if;
end Encode_BE;
----------
-- Read --
----------
procedure Read
(Str : Utf16_LE_String;
Index : in out Positive;
Char : out Unicode_Char)
is
C, D : Unicode_Char;
begin
if Index + 1 > Str'Last then
raise Incomplete_Encoding;
end if;
C := Character'Pos (Str (Index + 1)) * 256 + Character'Pos (Str (Index));
-- High surrogate value
if C in 16#D800# .. 16#DBFF# then
if Index + 3 > Str'Last then
raise Incomplete_Encoding;
end if;
D := Character'Pos (Str (Index + 3)) * 256
+ Character'Pos (Str (Index + 2));
-- Not a low surrogate ?
if not (D in 16#DC00# .. 16#DFFF#) then
raise Invalid_Encoding;
end if;
C := C and 2#1111111111#;
D := D and 2#1111111111#;
Char := C * 2#10000000000# + D + 16#10000#;
Index := Index + 4;
else
Char := C;
Index := Index + 2;
end if;
end Read;
-------------
-- Read_BE --
-------------
procedure Read_BE
(Str : Utf16_BE_String;
Index : in out Positive;
Char : out Unicode_Char)
is
C, D : Unicode_Char;
begin
if Index + 1 > Str'Last then
raise Incomplete_Encoding;
end if;
C := Character'Pos (Str (Index)) * 256 + Character'Pos (Str (Index + 1));
-- High surrogate value
if C in 16#D800# .. 16#DBFF# then
if Index + 3 > Str'Last then
raise Incomplete_Encoding;
end if;
D := Character'Pos (Str (Index + 2)) * 256
+ Character'Pos (Str (Index + 3));
-- Not a low surrogate ?
if not (D in 16#DC00# .. 16#DFFF#) then
raise Invalid_Encoding;
end if;
C := C and 2#1111111111#;
D := D and 2#1111111111#;
Char := C * 2#10000000000# + D + 16#10000#;
Index := Index + 4;
else
Char := C;
Index := Index + 2;
end if;
end Read_BE;
-----------
-- Width --
-----------
function Width (Char : Unicode_Char) return Natural is
begin
if Char >= 16#10000# then
return 4;
else
return 2;
end if;
end Width;
------------
-- Length --
------------
function Length (Str : Utf16_String) return Natural is
Pos : Natural := Str'First;
Len : Natural := 0;
C : Unicode_Char;
begin
while Pos <= Str'Last loop
Read (Str, Pos, C);
Len := Len + 1;
end loop;
return Len;
end Length;
----------------
-- From_Utf32 --
----------------
function From_Utf32
(Str : Unicode.CES.Utf32.Utf32_LE_String)
return Utf16_LE_String
is
Result : Utf16_LE_String (1 .. (Str'Length / Utf32_Char_Width) * 4);
J : Positive := Str'First;
R_Index : Natural := Result'First - 1;
C : Unicode_Char;
begin
while J <= Str'Last loop
Unicode.CES.Utf32.Read (Str, J, C);
Encode (C, Result, R_Index);
end loop;
return Result (1 .. R_Index);
end From_Utf32;
--------------
-- To_Utf32 --
--------------
function To_Utf32
(Str : Utf16_LE_String)
return Unicode.CES.Utf32.Utf32_LE_String
is
Result : Utf32_LE_String (1 .. (Str'Length / 2) * Utf32_Char_Width);
J : Natural := Str'First;
R_Index : Natural := Result'First - 1;
C : Unicode_Char;
begin
while J <= Str'Last loop
Read (Str, J, C);
Unicode.CES.Utf32.Encode (C, Result, R_Index);
end loop;
return Result (1 .. R_Index);
end To_Utf32;
-------------------
-- To_Unicode_LE --
-------------------
-- ??? Note: this assumes that the original character and its
-- conversion are encoded on the same length, which is always
-- right so far with Unicode.
function To_Unicode_LE
(Str : Utf16_String;
Cs : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
Order : Byte_Order := Default_Byte_Order) return Utf16_LE_String
is
BOM : Bom_Type;
Offset : Natural := 0;
O : Byte_Order := Order;
J : Natural := Str'First;
S : Utf16_LE_String (1 .. Str'Length);
C : Unicode_Char;
begin
Read_Bom (Str, Offset, BOM);
case BOM is
when Utf16_LE => O := Low_Byte_First;
when Utf16_BE => O := High_Byte_First;
when Unknown => null;
when others => raise Invalid_Encoding;
end case;
if O = Low_Byte_First then
if Cs.To_Unicode = Identity'Access then
return Str (Str'First + Offset .. Str'Last);
else
J := J + Offset - 1;
while J <= Str'Last loop
Read (Str, J, C);
Encode (Cs.To_Unicode (C), S, J);
end loop;
return S (S'First + Offset .. S'Last);
end if;
else
J := J + Offset;
if Cs.To_Unicode = Identity'Access then
while J <= Str'Last loop
S (J + 1) := Str (J);
S (J) := Str (J + 1);
J := J + 2;
end loop;
else
J := J - 1;
while J <= Str'Last loop
Read_BE (Str, J, C);
Encode (Cs.To_Unicode (C), S, J);
end loop;
return S (S'First + Offset .. S'Last);
end if;
return S (S'First + Offset .. S'Last);
end if;
end To_Unicode_LE;
-----------
-- To_CS --
-----------
function To_CS
(Str : Utf16_LE_String;
Cs : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
Order : Byte_Order := Default_Byte_Order) return Utf16_String
is
pragma Warnings (Off, Order);
Offset : constant Natural := 0;
J : Natural := Str'First;
S : Utf16_LE_String (1 .. Str'Length);
C : Unicode_Char;
begin
if Order = Low_Byte_First then
if Cs.To_CS = Identity'Access then
return Str (Str'First + Offset .. Str'Last);
else
J := J + Offset - 1;
while J <= Str'Last loop
Read (Str, J, C);
Encode (Cs.To_CS (C), S, J);
end loop;
return S (S'First + Offset .. S'Last);
end if;
else
J := J + Offset;
if Cs.To_CS = Identity'Access then
while J <= Str'Last loop
S (J + 1) := Str (J);
S (J) := Str (J + 1);
J := J + 2;
end loop;
else
J := J - 1;
while J <= Str'Last loop
Read (Str, J, C);
Encode_BE (Cs.To_CS (C), S, J);
end loop;
return S (S'First + Offset .. S'Last);
end if;
return S (S'First + Offset .. S'Last);
end if;
end To_CS;
end Unicode.CES.Utf16;