Files
VSS/source/text/implementation/vss-implementation-utf8_normalization.adb
Vadim Godunko 120eea9eac Revert "Merge branch 'topic/vadim/text' into 'master'"
This reverts merge request !394
2025-08-21 11:03:15 +00:00

1972 lines
78 KiB
Ada

--
-- Copyright (C) 2023-2025, AdaCore
--
-- SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
--
with VSS.Implementation.UCD_Normalization_Common;
with VSS.Implementation.Strings;
with VSS.Implementation.UTF8_Strings.Mutable_Operations;
package body VSS.Implementation.UTF8_Normalization is
use type VSS.Implementation.Strings.Character_Count;
use type VSS.Unicode.UTF8_Code_Unit_Offset;
procedure Append_Reordered
(Result_Data : in out
VSS.Implementation.UTF8_Strings.UTF8_String_Data;
Result_Size : in out VSS.Unicode.UTF8_Code_Unit_Count;
Decomposition_Data :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Data_Offset_Array;
Last_CCC :
in out VSS.Implementation.UCD_Normalization_UTF8.CCC_Values;
Source_Storage :
VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array;
Source_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Source_Size : VSS.Unicode.UTF8_Code_Unit_Count);
-- Append full decomposition mapping specified by Info to the end of the
-- Result_Data with canonical reordering when necessary.
function Get_Decomposition_Information
(Decomposition_Data :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Data_Offset_Array;
Code : VSS.Unicode.Code_Point)
return VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information;
-- Returns decomposition information for given data and character.
----------------------
-- Append_Reordered --
----------------------
procedure Append_Reordered
(Result_Data : in out
VSS.Implementation.UTF8_Strings.UTF8_String_Data;
Result_Size : in out VSS.Unicode.UTF8_Code_Unit_Count;
Decomposition_Data :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Data_Offset_Array;
Last_CCC :
in out VSS.Implementation.UCD_Normalization_UTF8.CCC_Values;
Source_Storage :
VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array;
Source_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Source_Size : VSS.Unicode.UTF8_Code_Unit_Count)
is
use all type VSS.Implementation.UCD_Normalization_UTF8.CCC_Values;
procedure Reorder_And_Insert
(Result_Data : in out VSS.Implementation.UTF8_Strings.UTF8_String_Data;
Result_Size : in out VSS.Unicode.UTF8_Code_Unit_Count;
CCC : VSS.Implementation.UCD_Normalization_UTF8.CCC_Values;
Storage : VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array;
Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Size : VSS.Unicode.UTF8_Code_Unit_Count);
-- Insert given encoded character into the string preserving canonical
-- ordering.
------------------------
-- Reorder_And_Insert --
------------------------
procedure Reorder_And_Insert
(Result_Data : in out VSS.Implementation.UTF8_Strings.UTF8_String_Data;
Result_Size : in out VSS.Unicode.UTF8_Code_Unit_Count;
CCC : VSS.Implementation.UCD_Normalization_UTF8.CCC_Values;
Storage : VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array;
Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Size : VSS.Unicode.UTF8_Code_Unit_Count)
is
Previous_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Previous_Code : VSS.Unicode.Code_Point;
Previous_Info :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information;
Insert_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
begin
Previous_Offset := Result_Size;
loop
Insert_Offset := Previous_Offset;
exit when Previous_Offset = 0;
VSS.Implementation.UTF8_Strings.Unchecked_Backward_Decode
(Result_Data, Previous_Offset, Previous_Code);
Previous_Info :=
Get_Decomposition_Information
(Decomposition_Data, Previous_Code);
exit when Previous_Info.CCC = CCC_NR
or Previous_Info.CCC <= CCC;
end loop;
VSS.Implementation.UTF8_Strings.Mutable_Operations.Unchecked_Insert
(Result_Data,
Result_Size,
Insert_Offset,
Storage,
Offset,
Size,
1);
end Reorder_And_Insert;
Next_Offset : VSS.Unicode.UTF8_Code_Unit_Offset := Source_Offset;
Current_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Current_Code : VSS.Unicode.Code_Point;
Current_Info :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information;
begin
loop
Current_Offset := Next_Offset;
exit when Next_Offset >= Source_Offset + Source_Size;
VSS.Implementation.UTF8_Encoding.Unchecked_Decode_Forward
(Source_Storage, Next_Offset, Current_Code);
Current_Info :=
Get_Decomposition_Information (Decomposition_Data, Current_Code);
if Last_CCC > Current_Info.CCC then
Reorder_And_Insert
(Result_Data,
Result_Size,
Current_Info.CCC,
Source_Storage,
Current_Offset,
Next_Offset - Current_Offset);
else
pragma Warnings (Off);
-- Disable warnings, this code is never executed for now,
-- however, may need to be completed later.
Last_CCC := Current_Info.CCC;
raise Program_Error;
pragma Warnings (On);
end if;
end loop;
end Append_Reordered;
-----------------
-- Decomposite --
-----------------
procedure Decompose
(Source_Storage :
VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array;
Source_Size : VSS.Unicode.UTF8_Code_Unit_Count;
Decomposition_Data :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Data_Offset_Array;
Result_Data : out
VSS.Implementation.UTF8_Strings.UTF8_String_Data)
is
use all type VSS.Implementation.UCD_Normalization_UTF8.CCC_Values;
use type VSS.Unicode.Code_Point;
procedure Reorder_And_Insert
(Result_Data : in out VSS.Implementation.UTF8_Strings.UTF8_String_Data;
Result_Size : in out VSS.Unicode.UTF8_Code_Unit_Count;
CCC : VSS.Implementation.UCD_Normalization_UTF8.CCC_Values;
Storage : VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array;
Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Size : VSS.Unicode.UTF8_Code_Unit_Count);
-- Insert given encoded character into the string preserving canonical
-- ordering.
------------------------
-- Reorder_And_Insert --
------------------------
procedure Reorder_And_Insert
(Result_Data : in out VSS.Implementation.UTF8_Strings.UTF8_String_Data;
Result_Size : in out VSS.Unicode.UTF8_Code_Unit_Count;
CCC : VSS.Implementation.UCD_Normalization_UTF8.CCC_Values;
Storage : VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array;
Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Size : VSS.Unicode.UTF8_Code_Unit_Count)
is
Previous_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Previous_Code : VSS.Unicode.Code_Point;
Previous_Info :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information;
Insert_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
begin
Previous_Offset := Result_Size;
loop
Insert_Offset := Previous_Offset;
exit when Previous_Offset = 0;
VSS.Implementation.UTF8_Strings.Unchecked_Backward_Decode
(Result_Data, Previous_Offset, Previous_Code);
Previous_Info :=
Get_Decomposition_Information
(Decomposition_Data, Previous_Code);
exit when Previous_Info.CCC = CCC_NR
or Previous_Info.CCC <= CCC;
end loop;
VSS.Implementation.UTF8_Strings.Mutable_Operations.Unchecked_Insert
(Result_Data,
Result_Size,
Insert_Offset,
Storage,
Offset,
Size,
1);
end Reorder_And_Insert;
Result_Size : VSS.Unicode.UTF8_Code_Unit_Count := 0;
Code : VSS.Unicode.Code_Point;
Info :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information;
Start : VSS.Unicode.UTF8_Code_Unit_Offset;
Out_Start : VSS.Unicode.UTF8_Code_Unit_Offset;
Offset : VSS.Unicode.UTF8_Code_Unit_Offset := 0;
Length : VSS.Implementation.Strings.Character_Count;
Last_CCC : VSS.Implementation.UCD_Normalization_UTF8.CCC_Values :=
VSS.Implementation.UCD_Normalization_UTF8.CCC_NR;
begin
loop
-- Check whether source string is in normalization form, and attempt
-- to lookup for maximum length of normalized data.
Start := Offset;
Length := 0;
loop
Out_Start := Offset;
exit when Offset >= Source_Size;
VSS.Implementation.UTF8_Encoding.Unchecked_Decode_Forward
(Source_Storage, Offset, Code);
Info := Get_Decomposition_Information (Decomposition_Data, Code);
exit when not Info.Decomposition_QC;
-- Copy data and run normalization
exit when Last_CCC > Info.CCC and Info.CCC /= CCC_NR;
-- Copy data and run reordering
Last_CCC := Info.CCC;
Length := Length + 1;
end loop;
-- Copy found normalized data if any
if Start /= Out_Start then
VSS.Implementation.UTF8_Strings.Mutable_Operations.Unchecked_Append
(Result_Data,
Result_Size,
Source_Storage,
Start,
Out_Start - Start,
Length);
end if;
exit when Out_Start >= Source_Size;
-- Source text has been processed completely, exit.
if Info.Decomposition_QC then
-- Apply canonical ordering algoriphm to the next character in
-- the source string.
Reorder_And_Insert
(Result_Data,
Result_Size,
Info.CCC,
Source_Storage,
Out_Start,
Offset - Out_Start);
else
-- Apply decomposition mapping
if Code in 16#AC00# .. 16#AC00# + 11_172 then
-- Hangul syllables are decomposed algorithmically.
declare
use type VSS.Unicode.UTF8_Code_Unit;
S_Base : constant := 16#AC00#;
L_Base : constant := 16#1100#;
V_Base : constant := 16#1161#;
T_Base : constant := 16#11A7#;
T_Count : constant := 28;
N_Count : constant := 588; -- V_Count * T_Count
S_Index : constant VSS.Unicode.Code_Point := Code - S_Base;
L_Index : constant VSS.Unicode.Code_Point :=
S_Index / N_Count;
V_Index : constant VSS.Unicode.Code_Point :=
(S_Index mod N_Count) / T_Count;
T_Index : constant VSS.Unicode.Code_Point :=
S_Index mod T_Count;
L_Part : constant VSS.Unicode.Code_Point :=
L_Base + L_Index;
V_Part : constant VSS.Unicode.Code_Point :=
V_Base + V_Index;
T_Part : constant VSS.Unicode.Code_Point :=
T_Base + T_Index;
Aux : VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array
(0 .. 8);
begin
-- First byte of encoded sequence for all characters of the
-- decomposition is always 16#E1#, thus don't compute it and
-- ignore corresponding bits in L_Part/V_Part/T_Part.
Aux (0) := 16#E1#;
Aux (1) :=
16#80#
or VSS.Unicode.UTF8_Code_Unit
((L_Part / 16#40#) mod 16#40#);
Aux (2) :=
16#80# or VSS.Unicode.UTF8_Code_Unit (L_Part mod 16#40#);
Aux (3) := 16#E1#;
Aux (4) :=
16#80#
or VSS.Unicode.UTF8_Code_Unit
((V_Part / 16#40#) mod 16#40#);
Aux (5) :=
16#80# or VSS.Unicode.UTF8_Code_Unit (V_Part mod 16#40#);
if T_Index = 0 then
VSS.Implementation.UTF8_Strings.Mutable_Operations
.Unchecked_Append
(Result_Data,
Result_Size,
Aux,
0,
6,
2);
else
Aux (6) := 16#E1#;
Aux (7) :=
16#80#
or VSS.Unicode.UTF8_Code_Unit
((T_Part / 16#40#) mod 16#40#);
Aux (8) :=
16#80#
or VSS.Unicode.UTF8_Code_Unit (T_Part mod 16#40#);
VSS.Implementation.UTF8_Strings.Mutable_Operations
.Unchecked_Append
(Result_Data,
Result_Size,
Aux,
0,
9,
3);
end if;
Last_CCC := CCC_NR;
end;
elsif Info.First_CCC /= CCC_NR
and Last_CCC /= CCC_NR
and Last_CCC > Info.First_CCC
then
-- Reordering is necessary
Append_Reordered
(Result_Data,
Result_Size,
Decomposition_Data,
Last_CCC,
VSS.Implementation.UCD_Normalization_UTF8.UTF8_Data_Table,
Info.Offset,
Info.Size);
else
VSS.Implementation.UTF8_Strings.Mutable_Operations
.Unchecked_Append
(Result_Data,
Result_Size,
VSS.Implementation.UCD_Normalization_UTF8.UTF8_Data_Table,
Info.Offset,
Info.Size,
Info.Length);
Last_CCC := Info.Last_CCC;
end if;
end if;
exit when Offset >= Source_Size;
end loop;
end Decompose;
---------------------------
-- Decompose_And_Compose --
---------------------------
procedure Decompose_And_Compose
(Source_Storage :
VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array;
Source_Size : VSS.Unicode.UTF8_Code_Unit_Count;
Decomposition_Data :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Data_Offset_Array;
Result_Data : out
VSS.Implementation.UTF8_Strings.UTF8_String_Data)
is
use type
VSS.Implementation.UCD_Normalization_Common.First_Mapping_Code_Offset;
use type
VSS.Implementation.UCD_Normalization_Common.Last_Mapping_Code_Offset;
use all type VSS.Implementation.UCD_Normalization_UTF8.CCC_Values;
use all type
VSS.Implementation.UCD_Normalization_UTF8.Composition_Quick_Check;
use type VSS.Unicode.Code_Point;
S_Base : constant := 16#AC00#;
L_Base : constant := 16#1100#;
V_Base : constant := 16#1161#;
T_Base : constant := 16#11A7#;
V_Count : constant := 21;
T_Count : constant := 28;
N_Count : constant := V_Count * T_Count; -- 588;
function Backward
(Data : VSS.Implementation.UTF8_Strings.UTF8_String_Data;
From : VSS.Unicode.UTF8_Code_Unit_Offset;
Offset : out VSS.Unicode.UTF8_Code_Unit_Offset;
Size : out VSS.Unicode.UTF8_Code_Unit_Count;
Code : out VSS.Unicode.Code_Point;
Info : out
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information)
return Boolean;
-- Decode previous character, obtain its normalization information
-- and return True on success. Return False when given character is
-- first character of the string.
function Has_Decomposition
(Info : VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information)
return Boolean;
-- Return True when character has full decomposition.
procedure Apply_Decomposition
(Result_Data : in out
VSS.Implementation.UTF8_Strings.UTF8_String_Data;
Result_Size : in out VSS.Unicode.UTF8_Code_Unit_Count;
From_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
From_Size : VSS.Unicode.UTF8_Code_Unit_Count;
From_Info :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information;
Skip_Compositon : out Boolean;
Starter_Offset : out VSS.Unicode.UTF8_Code_Unit_Offset;
Starter_Size : out VSS.Unicode.UTF8_Code_Unit_Count;
Starter_Code : out VSS.Unicode.Code_Point;
Starter_Info : out
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information;
Previous_CCC : out
VSS.Implementation.UCD_Normalization_UTF8.CCC_Values);
-- Apply decomposition to the slice of the string starting from given
-- offset.
procedure Apply_Canonical_Composition
(Result_Data : in out VSS.Implementation.UTF8_Strings.UTF8_String_Data;
Result_Size : in out VSS.Unicode.UTF8_Code_Unit_Count;
From_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
From_Size : VSS.Unicode.UTF8_Code_Unit_Count;
From_Code : VSS.Unicode.Code_Point;
From_Info :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information)
with Pre => From_Info.CCC = CCC_NR;
-- Apply canonical composition to the slice of the string starting
-- from the given position till the end of the string. Character at
-- the position must be starter.
procedure Lookup_Starter
(Result_Data : VSS.Implementation.UTF8_Strings.UTF8_String_Data;
Result_Size : VSS.Unicode.UTF8_Code_Unit_Offset;
Need_Decomposition : out Boolean;
Skip_Composition : out Boolean;
Starter_Found : out Boolean;
Starter_Offset : out VSS.Unicode.UTF8_Code_Unit_Offset;
Starter_Size : out VSS.Unicode.UTF8_Code_Unit_Count;
Starter_Code : out VSS.Unicode.Code_Point;
Starter_Info : out
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information);
-- Lookup for the last character in the string that is starter
-- or its decomposition will provide starter.
---------------------------------
-- Apply_Canonical_Composition --
---------------------------------
procedure Apply_Canonical_Composition
(Result_Data : in out VSS.Implementation.UTF8_Strings.UTF8_String_Data;
Result_Size : in out VSS.Unicode.UTF8_Code_Unit_Count;
From_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
From_Size : VSS.Unicode.UTF8_Code_Unit_Count;
From_Code : VSS.Unicode.Code_Point;
From_Info :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information)
is
Previous_CCC :
VSS.Implementation.UCD_Normalization_UTF8.CCC_Values;
Current_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Current_Size : VSS.Unicode.UTF8_Code_Unit_Count;
Current_Code : VSS.Unicode.Code_Point;
Current_Info :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information;
Current_Consumed : Boolean;
Next_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Current_Starter_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Current_Starter_Size : VSS.Unicode.UTF8_Code_Unit_Count;
Current_Starter_Code : VSS.Unicode.Code_Point;
Current_Starter_Info :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information;
New_Starter_Code : VSS.Unicode.Code_Point;
New_Starter_Buffer :
VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array (0 .. 3);
New_Starter_Size : VSS.Unicode.UTF8_Code_Unit_Count;
begin
Previous_CCC := CCC_NR;
Next_Offset := From_Offset + From_Size;
Current_Starter_Offset := From_Offset;
Current_Starter_Size := From_Size;
Current_Starter_Code := From_Code;
Current_Starter_Info := From_Info;
loop
exit when Next_Offset >= Result_Size;
Current_Offset := Next_Offset;
VSS.Implementation.UTF8_Strings.Unchecked_Decode_Forward
(Result_Data, Next_Offset, Current_Code);
Current_Info :=
Get_Decomposition_Information
(Decomposition_Data, Current_Code);
Current_Size := Next_Offset - Current_Offset;
Current_Consumed := False;
-- Current character may compose with the starter
if Current_Starter_Info.First_Index /= 0
and Current_Info.Last_Index /= 0
then
if Previous_CCC < Current_Info.CCC then
-- Current character is not blocked from the starter and can
-- be last character of the decomposition mapping of the
-- some primary composite; thus lookup for primary composite.
New_Starter_Code :=
VSS.Implementation.UCD_Normalization_Common
.Composition_Mapping
(Current_Info.Last_Index,
Current_Starter_Info.First_Index);
if New_Starter_Code /= 0 then
-- Starter and current character is decomposition mapping
-- of the primary composite, remove current character and
-- replace starter by the primary composite found.
VSS.Implementation.UTF8_Strings.Mutable_Operations
.Unchecked_Delete
(Result_Data,
Result_Size,
Current_Offset,
Current_Size,
1);
VSS.Implementation.UTF8_Encoding.Encode
(New_Starter_Code,
New_Starter_Size,
New_Starter_Buffer (0),
New_Starter_Buffer (1),
New_Starter_Buffer (2),
New_Starter_Buffer (3));
VSS.Implementation.UTF8_Strings.Mutable_Operations
.Unchecked_Replace
(Result_Data,
Result_Size,
Current_Starter_Offset,
Current_Starter_Size,
1,
New_Starter_Buffer,
New_Starter_Buffer'First,
New_Starter_Size,
1);
Next_Offset :=
Next_Offset + New_Starter_Size
- Current_Starter_Size - Current_Size;
Current_Starter_Size := New_Starter_Size;
Current_Starter_Code := New_Starter_Code;
Current_Starter_Info :=
Get_Decomposition_Information
(Decomposition_Data, New_Starter_Code);
Current_Consumed := True;
else
Previous_CCC := Current_Info.CCC;
end if;
elsif Previous_CCC = CCC_NR and Current_Info.CCC = CCC_NR then
-- Two starters
New_Starter_Code :=
VSS.Implementation.UCD_Normalization_Common
.Composition_Mapping
(Current_Info.Last_Index,
Current_Starter_Info.First_Index);
if New_Starter_Code /= 0 then
-- Starter and current character is decomposition mapping
-- of the primary composite, remove current character and
-- replace starter by the primary composite found.
VSS.Implementation.UTF8_Strings.Mutable_Operations
.Unchecked_Delete
(Result_Data,
Result_Size,
Current_Offset,
Current_Size,
1);
VSS.Implementation.UTF8_Encoding.Encode
(New_Starter_Code,
New_Starter_Size,
New_Starter_Buffer (0),
New_Starter_Buffer (1),
New_Starter_Buffer (2),
New_Starter_Buffer (3));
VSS.Implementation.UTF8_Strings.Mutable_Operations
.Unchecked_Replace
(Result_Data,
Result_Size,
Current_Starter_Offset,
Current_Starter_Size,
1,
New_Starter_Buffer,
New_Starter_Buffer'First,
New_Starter_Size,
1);
Next_Offset :=
Next_Offset + New_Starter_Size
- Current_Starter_Size - Current_Size;
Current_Starter_Size := New_Starter_Size;
Current_Starter_Code := New_Starter_Code;
Current_Starter_Info :=
Get_Decomposition_Information
(Decomposition_Data, New_Starter_Code);
Current_Consumed := True;
end if;
end if;
elsif Current_Code in 16#1161# .. 16#1175#
and Current_Starter_Code in 16#1100# .. 16#1112#
then
-- Hangul Syllable Composition:
--
-- Leading consonant + Vowel => LV_Syllable
declare
Starter_Buffer :
VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array
(0 .. 3);
L_Index : VSS.Unicode.Code_Point;
V_Index : VSS.Unicode.Code_Point;
LV_Index : VSS.Unicode.Code_Point;
begin
L_Index := Current_Starter_Code - L_Base;
V_Index := Current_Code - V_Base;
LV_Index := L_Index * N_Count + V_Index * T_Count;
Current_Starter_Code := S_Base + LV_Index;
Current_Starter_Info :=
Get_Decomposition_Information
(Decomposition_Data, Current_Starter_Code);
-- Encoded size of all possible characters are, same, so
-- reuse variable.
VSS.Implementation.UTF8_Strings.Mutable_Operations
.Unchecked_Delete
(Result_Data,
Result_Size,
Current_Offset,
Current_Size,
1);
VSS.Implementation.UTF8_Encoding.Encode
(Current_Starter_Code,
Current_Starter_Size,
Starter_Buffer (0),
Starter_Buffer (1),
Starter_Buffer (2),
Starter_Buffer (3));
VSS.Implementation.UTF8_Strings.Mutable_Operations
.Unchecked_Replace
(Result_Data,
Result_Size,
Current_Starter_Offset,
Current_Starter_Size,
1,
Starter_Buffer,
Starter_Buffer'First,
Current_Starter_Size,
1);
Next_Offset := Next_Offset - Current_Size;
Current_Consumed := True;
end;
elsif Current_Code in 16#11A8# .. 16#11C2#
and Current_Starter_Code in 16#AC00# .. 16#D7A3#
then
-- Hangul Syllable Composition:
--
-- LV_Syllable + Trailing consonant => LVT_Syllable
declare
Starter_Buffer :
VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array
(0 .. 3);
S_Index : VSS.Unicode.Code_Point;
T_Index : VSS.Unicode.Code_Point;
begin
S_Index := Current_Starter_Code - S_Base;
T_Index := S_Index mod T_Count;
if T_Index = 0 then
-- Starter is LV_Syllable, can compose with
-- current T_Jamo.
T_Index := Current_Code - T_Base;
Current_Starter_Code := Current_Starter_Code + T_Index;
Current_Starter_Info :=
Get_Decomposition_Information
(Decomposition_Data, Current_Starter_Code);
-- Encoded size of all possible characters are, same, so
-- reuse variable.
VSS.Implementation.UTF8_Strings.Mutable_Operations
.Unchecked_Delete
(Result_Data,
Result_Size,
Current_Offset,
Current_Size,
1);
VSS.Implementation.UTF8_Encoding.Encode
(Current_Starter_Code,
Current_Starter_Size,
Starter_Buffer (0),
Starter_Buffer (1),
Starter_Buffer (2),
Starter_Buffer (3));
VSS.Implementation.UTF8_Strings.Mutable_Operations
.Unchecked_Replace
(Result_Data,
Result_Size,
Current_Starter_Offset,
Current_Starter_Size,
1,
Starter_Buffer,
Starter_Buffer'First,
Current_Starter_Size,
1);
Next_Offset := Next_Offset - Current_Size;
Current_Consumed := True;
end if;
end;
end if;
if not Current_Consumed then
Previous_CCC := Current_Info.CCC;
-- Current character is new starter
if Current_Info.CCC = CCC_NR then
Current_Starter_Offset := Current_Offset;
Current_Starter_Size := Current_Size;
Current_Starter_Code := Current_Code;
Current_Starter_Info := Current_Info;
end if;
end if;
end loop;
end Apply_Canonical_Composition;
-------------------------
-- Apply_Decomposition --
-------------------------
procedure Apply_Decomposition
(Result_Data : in out
VSS.Implementation.UTF8_Strings.UTF8_String_Data;
Result_Size : in out VSS.Unicode.UTF8_Code_Unit_Count;
From_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
From_Size : VSS.Unicode.UTF8_Code_Unit_Count;
From_Info :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information;
Skip_Compositon : out Boolean;
Starter_Offset : out VSS.Unicode.UTF8_Code_Unit_Offset;
Starter_Size : out VSS.Unicode.UTF8_Code_Unit_Count;
Starter_Code : out VSS.Unicode.Code_Point;
Starter_Info : out
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information;
Previous_CCC : out
VSS.Implementation.UCD_Normalization_UTF8.CCC_Values)
is
Current_Offset : VSS.Unicode.UTF8_Code_Unit_Offset := From_Offset;
Current_Size : VSS.Unicode.UTF8_Code_Unit_Count;
Current_Code : VSS.Unicode.Code_Point;
Current_Info :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information :=
From_Info;
Next_Offset : VSS.Unicode.UTF8_Code_Unit_Offset :=
From_Offset + From_Size;
begin
-- By convention, this subprogram called only when there is at least
-- one character is available.
Skip_Compositon := True;
Previous_CCC := CCC_NR;
loop
if Current_Info.Has_Starter then
-- XXX By convention, when full decomposition has starter
-- character(s) the first characters of the decomposition is
-- starter too. This should be checked by code generator.
if Has_Decomposition (Current_Info) then
Current_Size := Next_Offset - Current_Offset;
VSS.Implementation.UTF8_Strings.Mutable_Operations
.Unchecked_Replace
(Result_Data,
Result_Size,
Current_Offset,
Current_Size,
1,
VSS.Implementation.UCD_Normalization_UTF8
.UTF8_Data_Table,
Current_Info.Offset,
Current_Info.Size,
Current_Info.Length);
Next_Offset :=
Next_Offset + Current_Info.Size - Current_Size;
Previous_CCC := Current_Info.Last_CCC;
else
raise Program_Error;
end if;
else
-- Not a starter, do decompositon with canonical reordering.
if Has_Decomposition (Current_Info) then
raise Program_Error;
else
if Previous_CCC <= Current_Info.CCC then
-- No decomposition, canonical ordering preserved.
Previous_CCC := Current_Info.CCC;
else
-- Violation of canonical reordering, move character
-- into appropriate position.
declare
Previous_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Previous_Code : VSS.Unicode.Code_Point;
Previous_Info :
VSS.Implementation.UCD_Normalization_UTF8
.Mapping_Information;
Insert_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
begin
Current_Size := Next_Offset - Current_Offset;
Previous_Offset := Current_Offset;
loop
Insert_Offset := Previous_Offset;
VSS.Implementation.UTF8_Strings
.Unchecked_Backward_Decode
(Result_Data, Previous_Offset, Previous_Code);
Previous_Info :=
Get_Decomposition_Information
(Decomposition_Data, Previous_Code);
exit when Previous_Info.CCC = CCC_NR
or Previous_Info.CCC <= Current_Info.CCC;
end loop;
VSS.Implementation.UTF8_Strings.Mutable_Operations
.Unchecked_Move_Slice
(Result_Data,
Current_Offset,
Current_Size,
Insert_Offset);
end;
end if;
end if;
end if;
exit when Next_Offset >= Result_Size;
Current_Offset := Next_Offset;
VSS.Implementation.UTF8_Strings.Unchecked_Decode_Forward
(Result_Data, Next_Offset, Current_Code);
Current_Info :=
Get_Decomposition_Information (Decomposition_Data, Current_Code);
end loop;
Next_Offset := From_Offset;
VSS.Implementation.UTF8_Strings.Unchecked_Decode_Forward
(Result_Data, Next_Offset, Current_Code);
Current_Info :=
Get_Decomposition_Information (Decomposition_Data, Current_Code);
if Current_Info.CCC = CCC_NR then
Skip_Compositon := Current_Info.First_Index = 0;
Starter_Offset := From_Offset;
Starter_Size := Next_Offset - From_Offset;
Starter_Code := Current_Code;
Starter_Info := Current_Info;
end if;
end Apply_Decomposition;
--------------
-- Backward --
--------------
function Backward
(Data : VSS.Implementation.UTF8_Strings.UTF8_String_Data;
From : VSS.Unicode.UTF8_Code_Unit_Offset;
Offset : out VSS.Unicode.UTF8_Code_Unit_Offset;
Size : out VSS.Unicode.UTF8_Code_Unit_Count;
Code : out VSS.Unicode.Code_Point;
Info : out
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information)
return Boolean is
begin
if From = 0 then
return False;
end if;
Offset := From;
VSS.Implementation.UTF8_Strings.Unchecked_Backward_Decode
(Data, Offset, Code);
Info := Get_Decomposition_Information (Decomposition_Data, Code);
Size := From - Offset;
return True;
end Backward;
-----------------------
-- Has_Decomposition --
-----------------------
function Has_Decomposition
(Info : VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information)
return Boolean is
begin
return Info.Size /= 0;
end Has_Decomposition;
--------------------
-- Lookup_Starter --
--------------------
procedure Lookup_Starter
(Result_Data : VSS.Implementation.UTF8_Strings.UTF8_String_Data;
Result_Size : VSS.Unicode.UTF8_Code_Unit_Offset;
Need_Decomposition : out Boolean;
Skip_Composition : out Boolean;
Starter_Found : out Boolean;
Starter_Offset : out VSS.Unicode.UTF8_Code_Unit_Offset;
Starter_Size : out VSS.Unicode.UTF8_Code_Unit_Count;
Starter_Code : out VSS.Unicode.Code_Point;
Starter_Info : out
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information)
is
Current_Offset : VSS.Unicode.UTF8_Code_Unit_Offset := Result_Size;
Current_Size : VSS.Unicode.UTF8_Code_Unit_Count;
Current_Code : VSS.Unicode.Code_Point;
Current_Info :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information;
begin
Need_Decomposition := False;
Skip_Composition := True;
Starter_Found := False;
if Result_Size /= 0 then
loop
exit when not Backward
(Result_Data,
Current_Offset,
Current_Offset,
Current_Size,
Current_Code,
Current_Info);
Need_Decomposition :=
Need_Decomposition or Has_Decomposition (Current_Info);
if Current_Info.Has_Starter then
Skip_Composition := Current_Info.First_Index = 0;
Starter_Found := True;
Starter_Offset := Current_Offset;
Starter_Size := Current_Size;
Starter_Code := Current_Code;
Starter_Info := Current_Info;
exit;
end if;
end loop;
end if;
end Lookup_Starter;
Result_Size : VSS.Unicode.UTF8_Code_Unit_Count := 0;
Source_Current_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Source_Next_Offset : VSS.Unicode.UTF8_Code_Unit_Offset := 0;
Source_Copy_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Source_Length_Delta : VSS.Implementation.Strings.Character_Count;
Source_Code : VSS.Unicode.Code_Point;
Source_Info :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information;
Last_CCC :
VSS.Implementation.UCD_Normalization_UTF8.CCC_Values :=
CCC_NR;
begin
loop
exit when Source_Next_Offset >= Source_Size;
Source_Copy_Offset := Source_Next_Offset;
Source_Length_Delta := 0;
-- Lookup source string for the longest slice already in
-- normalization form.
loop
Source_Current_Offset := Source_Next_Offset;
exit when Source_Next_Offset >= Source_Size;
VSS.Implementation.UTF8_Encoding.Unchecked_Decode_Forward
(Source_Storage, Source_Next_Offset, Source_Code);
Source_Info :=
Get_Decomposition_Information (Decomposition_Data, Source_Code);
case Source_Info.Composition_QC is
when No =>
-- Character that cannot ever occur in the respective
-- normalization form.
exit;
when Maybe =>
-- Character that may occur in the respective
-- normalization form, depending on the context.
exit;
when Yes =>
-- Character that may occur in the respective
-- normalization form.
if Last_CCC > Source_Info.CCC
and Source_Info.CCC /= CCC_NR
then
-- Violation of the canonical ordering.
exit;
end if;
end case;
Last_CCC := Source_Info.CCC;
Source_Length_Delta := Source_Length_Delta + 1;
end loop;
-- Copy found normalized data
if Source_Copy_Offset /= Source_Current_Offset then
VSS.Implementation.UTF8_Strings.Mutable_Operations.Unchecked_Append
(Result_Data,
Result_Size,
Source_Storage,
Source_Copy_Offset,
Source_Current_Offset - Source_Copy_Offset,
Source_Length_Delta);
end if;
exit when Source_Current_Offset >= Source_Size;
case Source_Info.Composition_QC is
when No =>
-- Character that cannot ever occur in the respective
-- normalization form.
if Source_Info.Has_Starter then
pragma Assert (Source_Info.First_CCC = CCC_NR);
pragma Assert (Source_Info.Last_Index = 0);
pragma Assert (Has_Decomposition (Source_Info));
declare
Starter_Code : VSS.Unicode.Code_Point;
Next_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Skip_Composition : Boolean;
Starter_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Starter_Size : VSS.Unicode.UTF8_Code_Unit_Count;
Starter_Info :
VSS.Implementation.UCD_Normalization_UTF8
.Mapping_Information;
begin
Starter_Offset := Result_Size;
VSS.Implementation.UTF8_Strings.Mutable_Operations
.Unchecked_Append
(Result_Data,
Result_Size,
VSS.Implementation.UCD_Normalization_UTF8
.UTF8_Data_Table,
Source_Info.Offset,
Source_Info.Size,
Source_Info.Length);
Last_CCC := Source_Info.Last_CCC;
-- If last starter in full decomposition can be first
-- character of the pair of canonical decomposition of
-- the primary composite - process all following
-- characters till next starter/end of string and do
-- composition.
--
-- XXX this is not implemented yet
-- XXX There are 2 bits available in Info record, can it
-- be used to encode length of the starter in the full
-- decomposition?
Next_Offset := Starter_Offset;
VSS.Implementation.UTF8_Strings.Unchecked_Decode_Forward
(Result_Data, Next_Offset, Starter_Code);
Starter_Size := Next_Offset - Starter_Offset;
Starter_Info :=
Get_Decomposition_Information
(Decomposition_Data, Starter_Code);
Skip_Composition := False;
pragma Assert (Starter_Info.Last_Index = 0);
if not Skip_Composition then
loop
exit when Source_Next_Offset >= Source_Size;
Source_Current_Offset := Source_Next_Offset;
VSS.Implementation.UTF8_Encoding
.Unchecked_Decode_Forward
(Source_Storage,
Source_Next_Offset,
Source_Code);
Source_Info :=
Get_Decomposition_Information
(Decomposition_Data, Source_Code);
-- Append decompositions of all characters from
-- the source string till next starter.
if Source_Info.Has_Starter then
-- It is tested previously, and need to be
-- analyzed: first character in the full
-- decomposition should be starter by our
-- convention, but it may composite with the
-- previous starter. NF * C_QC may be used
-- here to do check, or Last_Index /= 0...
raise Program_Error;
end if;
if Has_Decomposition (Source_Info) then
raise Program_Error;
-- if Source_Info.Has_Starter then
-- raise Program_Error;
--
-- else
-- if Last_CCC <= Source_Info.First_CCC then
-- Unchecked_Append
-- (Result_Data,
-- Result_Size,
-- VSS.Implementation.UCD_Normalization_UTF8
-- .UTF8_Data_Table,
-- Source_Info.Offset,
-- Source_Info.Size,
-- Source_Info.Length);
-- Last_CCC := Source_Info.Last_CCC;
--
-- else
-- raise Program_Error;
-- end if;
--
-- end if;
else
-- if Source_Info.CCC = CCC_NR then
-- raise Program_Error;
--
-- else
if Last_CCC <= Source_Info.CCC then
VSS.Implementation.UTF8_Strings.
Mutable_Operations.Unchecked_Append
(Result_Data,
Result_Size,
Source_Storage,
Source_Current_Offset,
Source_Next_Offset
- Source_Current_Offset,
1);
Last_CCC := Source_Info.CCC;
else
raise Program_Error;
-- XXX Only single character is processed
-- here, thus Append_Reordered do
-- unnecessary action, because it can
-- process sequence of characters. So, it
-- is possible to optimize code here by
-- adding another subprogram to process
-- single character only.
-- Append_Reordered
-- (Result_Data,
-- Result_Size,
-- Decomposition_Data,
-- Last_CCC,
-- Source_Storage,
-- Source_Current_Offset,
-- Source_Next_Offset
-- - Source_Current_Offset);
end if;
end if;
end loop;
-- Do canonical composition
Apply_Canonical_Composition
(Result_Data,
Result_Size,
Starter_Offset,
Starter_Size,
Starter_Code,
Starter_Info);
end if;
end;
else
-- XXX Duplicated with Maybe/non-starter case?
declare
Need_Decomposition : Boolean;
Skip_Composition : Boolean;
Starter_Found : Boolean;
Starter_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Starter_Size : VSS.Unicode.UTF8_Code_Unit_Count;
Starter_Code : VSS.Unicode.Code_Point;
Starter_Info :
VSS.Implementation.UCD_Normalization_UTF8
.Mapping_Information;
begin
-- Lookup backward till starter character will be found
Lookup_Starter
(Result_Data,
Result_Size,
Need_Decomposition,
Skip_Composition,
Starter_Found,
Starter_Offset,
Starter_Size,
Starter_Code,
Starter_Info);
if Need_Decomposition then
declare
Aux_Starter_Info : constant
VSS.Implementation.UCD_Normalization_UTF8
.Mapping_Information := Starter_Info;
-- Workaround for GNAT warning about overlapped
-- parameters.
begin
Apply_Decomposition
(Result_Data,
Result_Size,
Starter_Offset,
Starter_Size,
Aux_Starter_Info,
Skip_Composition,
Starter_Offset,
Starter_Size,
Starter_Code,
Starter_Info,
Last_CCC);
end;
end if;
-- Append decompositions of all characters from the
-- source string till next starter.
loop
if Has_Decomposition (Source_Info) then
if Source_Info.First_CCC = CCC_NR then
VSS.Implementation.UTF8_Strings
.Mutable_Operations.Unchecked_Append
(Result_Data,
Result_Size,
VSS.Implementation.UCD_Normalization_UTF8
.UTF8_Data_Table,
Source_Info.Offset,
Source_Info.Size,
Source_Info.Length);
Last_CCC := Source_Info.Last_CCC;
exit;
elsif Last_CCC <= Source_Info.First_CCC then
VSS.Implementation.UTF8_Strings
.Mutable_Operations.Unchecked_Append
(Result_Data,
Result_Size,
VSS.Implementation.UCD_Normalization_UTF8
.UTF8_Data_Table,
Source_Info.Offset,
Source_Info.Size,
Source_Info.Length);
Last_CCC := Source_Info.Last_CCC;
else
raise Program_Error;
end if;
else
if Source_Info.CCC = CCC_NR then
VSS.Implementation.UTF8_Strings
.Mutable_Operations.Unchecked_Append
(Result_Data,
Result_Size,
Source_Storage,
Source_Current_Offset,
Source_Next_Offset - Source_Current_Offset,
1);
Last_CCC := Source_Info.CCC;
exit;
elsif Last_CCC <= Source_Info.CCC then
VSS.Implementation.UTF8_Strings
.Mutable_Operations.Unchecked_Append
(Result_Data,
Result_Size,
Source_Storage,
Source_Current_Offset,
Source_Next_Offset - Source_Current_Offset,
1);
Last_CCC := Source_Info.CCC;
else
-- XXX Only single character is processed here,
-- thus Append_Reordered do unnecessary action,
-- because it can process sequence of
-- characters. So, it is possible to optimize
-- code here by adding another subprogram to
-- process single character only.
Append_Reordered
(Result_Data,
Result_Size,
Decomposition_Data,
Last_CCC,
Source_Storage,
Source_Current_Offset,
Source_Next_Offset - Source_Current_Offset);
end if;
end if;
exit when Source_Next_Offset >= Source_Size;
Source_Current_Offset := Source_Next_Offset;
VSS.Implementation.UTF8_Encoding
.Unchecked_Decode_Forward
(Source_Storage, Source_Next_Offset, Source_Code);
Source_Info :=
Get_Decomposition_Information
(Decomposition_Data, Source_Code);
end loop;
-- Do canonical composition when necessary
if not Skip_Composition then
Apply_Canonical_Composition
(Result_Data,
Result_Size,
Starter_Offset,
Starter_Size,
Starter_Code,
Starter_Info);
end if;
end;
end if;
when Maybe =>
-- Character that may occur in the respective normalization
-- form, depending on the context.
if Source_Info.Has_Starter then
if Has_Decomposition (Source_Info) then
raise Program_Error;
elsif Source_Code in 16#1161# .. 16#1175# then
pragma Assert (Source_Info.Last_Index = 0);
-- Algorithmic composition, character must not compose
-- with previous character in ordinary way.
declare
Starter_Code : VSS.Unicode.Code_Point;
Starter_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Starter_Size : VSS.Unicode.UTF8_Code_Unit_Count;
Starter_Buffer :
VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array
(0 .. 3);
L_Index : VSS.Unicode.Code_Point;
V_Index : VSS.Unicode.Code_Point;
LV_Index : VSS.Unicode.Code_Point;
Append_Source : Boolean := True;
begin
if Result_Size /= 0 then
Starter_Offset := Result_Size;
VSS.Implementation.UTF8_Strings.
Unchecked_Backward_Decode
(Result_Data, Starter_Offset, Starter_Code);
if Starter_Code in 16#1100# .. 16#1112# then
L_Index := Starter_Code - L_Base;
V_Index := Source_Code - V_Base;
LV_Index :=
L_Index * N_Count + V_Index * T_Count;
Starter_Code := S_Base + LV_Index;
VSS.Implementation.UTF8_Encoding.Encode
(Starter_Code,
Starter_Size,
Starter_Buffer (0),
Starter_Buffer (1),
Starter_Buffer (2),
Starter_Buffer (3));
-- Encoded size of all possible characters are,
-- same, so use it.
VSS.Implementation.UTF8_Strings
.Mutable_Operations.Unchecked_Replace
(Result_Data,
Result_Size,
Starter_Offset,
Starter_Size,
1,
Starter_Buffer,
Starter_Buffer'First,
Starter_Size,
1);
Append_Source := False;
end if;
end if;
if Append_Source then
-- Composition is impossible, append character
-- to the result and reset canonical combining
-- class of the last character.
VSS.Implementation.UTF8_Strings
.Mutable_Operations.Unchecked_Append
(Result_Data,
Result_Size,
Source_Storage,
Source_Current_Offset,
Source_Next_Offset - Source_Current_Offset,
1);
Last_CCC := CCC_NR;
end if;
end;
elsif Source_Code in 16#11A8# .. 16#11C2# then
pragma Assert (Source_Info.Last_Index = 0);
-- Algorithmic composition, character must not compose
-- with previous character in ordinary way.
declare
Starter_Code : VSS.Unicode.Code_Point;
Starter_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Starter_Size : VSS.Unicode.UTF8_Code_Unit_Count;
Starter_Buffer :
VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array
(0 .. 3);
S_Index : VSS.Unicode.Code_Point;
T_Index : VSS.Unicode.Code_Point;
Append_Source : Boolean := True;
begin
if Result_Size /= 0 then
Starter_Offset := Result_Size;
VSS.Implementation.UTF8_Strings
.Unchecked_Backward_Decode
(Result_Data, Starter_Offset, Starter_Code);
if Starter_Code in 16#AC00# .. 16#D7A3# then
S_Index := Starter_Code - S_Base;
T_Index := S_Index mod T_Count;
if T_Index = 0 then
-- Starter is LV_Syllable, can compose with
-- current T_Jamo.
T_Index := Source_Code - T_Base;
Starter_Code := Starter_Code + T_Index;
VSS.Implementation.UTF8_Encoding.Encode
(Starter_Code,
Starter_Size,
Starter_Buffer (0),
Starter_Buffer (1),
Starter_Buffer (2),
Starter_Buffer (3));
-- Encoded size of all possible characters
-- are same, so use it.
VSS.Implementation.UTF8_Strings
.Mutable_Operations.Unchecked_Replace
(Result_Data,
Result_Size,
Starter_Offset,
Starter_Size,
1,
Starter_Buffer,
Starter_Buffer'First,
Starter_Size,
1);
Append_Source := False;
end if;
end if;
end if;
if Append_Source then
-- Composition is impossible, append character
-- to the result and reset canonical combining
-- class of the last character.
VSS.Implementation.UTF8_Strings
.Mutable_Operations.Unchecked_Append
(Result_Data,
Result_Size,
Source_Storage,
Source_Current_Offset,
Source_Next_Offset - Source_Current_Offset,
1);
Last_CCC := CCC_NR;
end if;
end;
else
pragma Assert (Source_Info.Last_Index /= 0);
-- Character may compose with previous character.
declare
Starter_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Starter_Size : VSS.Unicode.UTF8_Code_Unit_Count;
Starter_Code : VSS.Unicode.Code_Point;
Starter_Info :
VSS.Implementation.UCD_Normalization_UTF8
.Mapping_Information;
begin
Starter_Offset := Result_Size;
VSS.Implementation.UTF8_Strings
.Unchecked_Backward_Decode
(Result_Data, Starter_Offset, Starter_Code);
Starter_Info :=
Get_Decomposition_Information
(Decomposition_Data, Starter_Code);
if Starter_Info.Has_Starter then
-- XXX Don't need to decomposite previous
-- character ???
-- if Has_Decomposition (Starter_Info) then
-- raise Program_Error;
--
-- else
if Starter_Info.First_Index /= 0 then
Starter_Size := Result_Size - Starter_Offset;
VSS.Implementation.UTF8_Strings
.Mutable_Operations.Unchecked_Append
(Result_Data,
Result_Size,
Source_Storage,
Source_Current_Offset,
Source_Next_Offset - Source_Current_Offset,
1);
Apply_Canonical_Composition
(Result_Data,
Result_Size,
Starter_Offset,
Starter_Size,
Starter_Code,
Starter_Info);
end if;
else
VSS.Implementation.UTF8_Strings
.Mutable_Operations.Unchecked_Append
(Result_Data,
Result_Size,
Source_Storage,
Source_Current_Offset,
Source_Next_Offset - Source_Current_Offset,
1);
Last_CCC := CCC_NR;
end if;
end;
end if;
else
declare
Need_Decomposition : Boolean;
Skip_Composition : Boolean;
Starter_Found : Boolean;
Starter_Offset : VSS.Unicode.UTF8_Code_Unit_Offset;
Starter_Size : VSS.Unicode.UTF8_Code_Unit_Count;
Starter_Code : VSS.Unicode.Code_Point;
Starter_Info :
VSS.Implementation.UCD_Normalization_UTF8
.Mapping_Information;
begin
-- Lookup backward till starter character will be found
Lookup_Starter
(Result_Data,
Result_Size,
Need_Decomposition,
Skip_Composition,
Starter_Found,
Starter_Offset,
Starter_Size,
Starter_Code,
Starter_Info);
if Need_Decomposition then
declare
Aux_Starter_Info : constant
VSS.Implementation.UCD_Normalization_UTF8
.Mapping_Information := Starter_Info;
-- Workaround of GNAT warning about overlapping
-- parameters.
begin
Apply_Decomposition
(Result_Data,
Result_Size,
Starter_Offset,
Starter_Size,
Aux_Starter_Info,
Skip_Composition,
Starter_Offset,
Starter_Size,
Starter_Code,
Starter_Info,
Last_CCC);
end;
end if;
-- Append decompositions of all characters from the
-- source string till next starter.
loop
-- if Source_Info.Has_Starter then
-- It is tested previously, and need to be
-- analyzed: first character in the full decomposition
-- should be starter by our convention, but it may
-- composite with the previous starter. NF*C_QC may
-- be used here to do check, or Last_Index /= 0...
-- raise Program_Error;
-- end if;
if Has_Decomposition (Source_Info) then
if Source_Info.First_CCC = CCC_NR then
raise Program_Error;
elsif Last_CCC <= Source_Info.First_CCC then
raise Program_Error;
else
Append_Reordered
(Result_Data,
Result_Size,
Decomposition_Data,
Last_CCC,
VSS.Implementation.UCD_Normalization_UTF8
.UTF8_Data_Table,
Source_Info.Offset,
Source_Info.Size);
end if;
else
if Source_Info.CCC = CCC_NR then
VSS.Implementation.UTF8_Strings
.Mutable_Operations.Unchecked_Append
(Result_Data,
Result_Size,
Source_Storage,
Source_Current_Offset,
Source_Next_Offset - Source_Current_Offset,
1);
Last_CCC := Source_Info.CCC;
exit;
elsif Last_CCC <= Source_Info.CCC then
VSS.Implementation.UTF8_Strings
.Mutable_Operations.Unchecked_Append
(Result_Data,
Result_Size,
Source_Storage,
Source_Current_Offset,
Source_Next_Offset - Source_Current_Offset,
1);
Last_CCC := Source_Info.CCC;
else
-- XXX Only single character is processed here,
-- thus Append_Reordered do unnecessary action,
-- because it can process sequence of
-- characters. So, it is possible to optimize
-- code here by adding another subprogram to
-- process single character only.
Append_Reordered
(Result_Data,
Result_Size,
Decomposition_Data,
Last_CCC,
Source_Storage,
Source_Current_Offset,
Source_Next_Offset - Source_Current_Offset);
end if;
end if;
exit when Source_Next_Offset >= Source_Size;
Source_Current_Offset := Source_Next_Offset;
VSS.Implementation.UTF8_Encoding
.Unchecked_Decode_Forward
(Source_Storage, Source_Next_Offset, Source_Code);
Source_Info :=
Get_Decomposition_Information
(Decomposition_Data, Source_Code);
end loop;
-- Do canonical composition when necessary
if not Skip_Composition then
Apply_Canonical_Composition
(Result_Data,
Result_Size,
Starter_Offset,
Starter_Size,
Starter_Code,
Starter_Info);
end if;
end;
end if;
when Yes =>
if Last_CCC <= Source_Info.CCC and Source_Info.CCC = CCC_NR then
-- Precondition, should be removed.
raise Program_Error;
end if;
-- XXX Only single character is appended here! May be
-- optimized?
Append_Reordered
(Result_Data,
Result_Size,
Decomposition_Data,
Last_CCC,
Source_Storage,
Source_Current_Offset,
Source_Next_Offset - Source_Current_Offset);
end case;
end loop;
end Decompose_And_Compose;
-----------------------------------
-- Get_Decomposition_Information --
-----------------------------------
function Get_Decomposition_Information
(Decomposition_Data :
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Data_Offset_Array;
Code : VSS.Unicode.Code_Point)
return VSS.Implementation.UCD_Normalization_UTF8.Mapping_Information
is
use type VSS.Implementation.UCD_Normalization_UTF8.Mapping_Data_Offset;
use type VSS.Unicode.Code_Point;
Group :
constant VSS.Implementation.UCD_Normalization_UTF8.Mapping_Group :=
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Group
(Code
/ VSS.Implementation.UCD_Normalization_UTF8.Mapping_Group_Size);
Offset : constant
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Data_Offset :=
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Data_Offset
(Code
mod VSS.Implementation.UCD_Normalization_UTF8.Mapping_Group_Size);
begin
return
VSS.Implementation.UCD_Normalization_UTF8.Mapping_Data_Table
(Decomposition_Data (Group) + Offset);
end Get_Decomposition_Information;
---------------
-- Normalize --
---------------
procedure Normalize
(Text : VSS.Implementation.UTF8_Strings.UTF8_String_Data;
Form : VSS.Implementation.Normalization_Form;
Result : out VSS.Implementation.UTF8_Strings.UTF8_String_Data) is
begin
VSS.Implementation.UTF8_Strings.Mutable_Operations.Initialize
(Result, Text.Size);
if Text.Size = 0 then
return;
end if;
declare
Storage : constant
VSS.Implementation.UTF8_Encoding.UTF8_Code_Unit_Array
(0 .. Text.Size)
with Import, Address => Text.Storage_Address;
begin
case Form is
when VSS.Implementation.Normalization_Form_D =>
VSS.Implementation.UTF8_Normalization.Decompose
(Storage,
Text.Size,
VSS.Implementation.UCD_Normalization_UTF8.Canonical_Index,
Result);
when VSS.Implementation.Normalization_Form_C =>
VSS.Implementation.UTF8_Normalization.Decompose_And_Compose
(Storage,
Text.Size,
VSS.Implementation.UCD_Normalization_UTF8.Canonical_Index,
Result);
when VSS.Implementation.Normalization_Form_KD =>
VSS.Implementation.UTF8_Normalization.Decompose
(Storage,
Text.Size,
VSS.Implementation.UCD_Normalization_UTF8
.Compatibility_Index,
Result);
when VSS.Implementation.Normalization_Form_KC =>
VSS.Implementation.UTF8_Normalization.Decompose_And_Compose
(Storage,
Text.Size,
VSS.Implementation.UCD_Normalization_UTF8
.Compatibility_Index,
Result);
end case;
end;
end Normalize;
end VSS.Implementation.UTF8_Normalization;