From 3c93ca46a2cd021c1756bd71e447fe68c547c2f5 Mon Sep 17 00:00:00 2001 From: Simon Montagu Date: Fri, 16 Nov 2012 11:33:15 -0800 Subject: [PATCH] Fix an error in the UTF-8 state machine table, and clean up the table generally. Bug 811363, r=jfkthame. --- .../universalchardet/src/base/nsMBCSSM.cpp | 114 ++++----- intl/chardet/tools/genutf8.pl | 241 +++++++----------- intl/chardet/tools/genverifier.pm | 34 +-- 3 files changed, 167 insertions(+), 222 deletions(-) diff --git a/extensions/universalchardet/src/base/nsMBCSSM.cpp b/extensions/universalchardet/src/base/nsMBCSSM.cpp index 773c335c18b..e3c9e480e37 100644 --- a/extensions/universalchardet/src/base/nsMBCSSM.cpp +++ b/extensions/universalchardet/src/base/nsMBCSSM.cpp @@ -403,79 +403,65 @@ const SMModel SJISSMModel = { static const uint32_t UTF8_cls [ 256 / 8 ] = { -//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 -PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as a legal value -PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f -PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 -PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f -PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 -PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f -PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 -PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f -PCK4BITS(1,1,1,1,1,1,1,1), // 40 - 47 -PCK4BITS(1,1,1,1,1,1,1,1), // 48 - 4f -PCK4BITS(1,1,1,1,1,1,1,1), // 50 - 57 -PCK4BITS(1,1,1,1,1,1,1,1), // 58 - 5f -PCK4BITS(1,1,1,1,1,1,1,1), // 60 - 67 -PCK4BITS(1,1,1,1,1,1,1,1), // 68 - 6f -PCK4BITS(1,1,1,1,1,1,1,1), // 70 - 77 -PCK4BITS(1,1,1,1,1,1,1,1), // 78 - 7f -PCK4BITS(2,2,2,2,3,3,3,3), // 80 - 87 -PCK4BITS(4,4,4,4,4,4,4,4), // 88 - 8f -PCK4BITS(4,4,4,4,4,4,4,4), // 90 - 97 -PCK4BITS(4,4,4,4,4,4,4,4), // 98 - 9f -PCK4BITS(5,5,5,5,5,5,5,5), // a0 - a7 -PCK4BITS(5,5,5,5,5,5,5,5), // a8 - af -PCK4BITS(5,5,5,5,5,5,5,5), // b0 - b7 -PCK4BITS(5,5,5,5,5,5,5,5), // b8 - bf -PCK4BITS(0,0,6,6,6,6,6,6), // c0 - c7 -PCK4BITS(6,6,6,6,6,6,6,6), // c8 - cf -PCK4BITS(6,6,6,6,6,6,6,6), // d0 - d7 -PCK4BITS(6,6,6,6,6,6,6,6), // d8 - df 
-PCK4BITS(7,8,8,8,8,8,8,8), // e0 - e7 -PCK4BITS(8,8,8,8,8,9,8,8), // e8 - ef -PCK4BITS(10,11,11,11,11,11,11,11), // f0 - f7 -PCK4BITS(12,13,13,13,14,15,0,0) // f8 - ff +PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07 +PCK4BITS( 1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f +PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17 +PCK4BITS( 1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f +PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27 +PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f +PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37 +PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f +PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 40 - 47 +PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 48 - 4f +PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 50 - 57 +PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 58 - 5f +PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 60 - 67 +PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 68 - 6f +PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 70 - 77 +PCK4BITS( 1, 1, 1, 1, 1, 1, 1, 1), // 78 - 7f +PCK4BITS( 2, 2, 2, 2, 2, 2, 2, 2), // 80 - 87 +PCK4BITS( 2, 2, 2, 2, 2, 2, 2, 2), // 88 - 8f +PCK4BITS( 3, 3, 3, 3, 3, 3, 3, 3), // 90 - 97 +PCK4BITS( 3, 3, 3, 3, 3, 3, 3, 3), // 98 - 9f +PCK4BITS( 4, 4, 4, 4, 4, 4, 4, 4), // a0 - a7 +PCK4BITS( 4, 4, 4, 4, 4, 4, 4, 4), // a8 - af +PCK4BITS( 4, 4, 4, 4, 4, 4, 4, 4), // b0 - b7 +PCK4BITS( 4, 4, 4, 4, 4, 4, 4, 4), // b8 - bf +PCK4BITS( 0, 0, 5, 5, 5, 5, 5, 5), // c0 - c7 +PCK4BITS( 5, 5, 5, 5, 5, 5, 5, 5), // c8 - cf +PCK4BITS( 5, 5, 5, 5, 5, 5, 5, 5), // d0 - d7 +PCK4BITS( 5, 5, 5, 5, 5, 5, 5, 5), // d8 - df +PCK4BITS( 6, 7, 7, 7, 7, 7, 7, 7), // e0 - e7 +PCK4BITS( 7, 7, 7, 7, 7, 8, 7, 7), // e8 - ef +PCK4BITS( 9,10,10,10,11, 0, 0, 0), // f0 - f7 +PCK4BITS( 0, 0, 0, 0, 0, 0, 0, 0) // f8 - ff }; -static const uint32_t UTF8_st [ 26] = { -PCK4BITS(eError,eStart,eError,eError,eError,eError, 12, 10),//00-07 -PCK4BITS( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//18-1f 
-PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//20-27 -PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//28-2f -PCK4BITS(eError,eError, 5, 5, 5, 5,eError,eError),//30-37 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//38-3f -PCK4BITS(eError,eError,eError, 5, 5, 5,eError,eError),//40-47 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//48-4f -PCK4BITS(eError,eError, 7, 7, 7, 7,eError,eError),//50-57 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//58-5f -PCK4BITS(eError,eError,eError,eError, 7, 7,eError,eError),//60-67 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//68-6f -PCK4BITS(eError,eError, 9, 9, 9, 9,eError,eError),//70-77 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//78-7f -PCK4BITS(eError,eError,eError,eError,eError, 9,eError,eError),//80-87 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//88-8f -PCK4BITS(eError,eError, 12, 12, 12, 12,eError,eError),//90-97 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//98-9f -PCK4BITS(eError,eError,eError,eError,eError, 12,eError,eError),//a0-a7 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//a8-af -PCK4BITS(eError,eError, 12, 12, 12,eError,eError,eError),//b0-b7 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//b8-bf -PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eError,eError),//c0-c7 -PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) //c8-cf +static const uint32_t UTF8_st [ 15] = { +PCK4BITS(eError,eStart,eError,eError,eError, 3, 4, 5), // 00 - 07 +PCK4BITS( 6, 7, 8, 9,eError,eError,eError,eError), // 08 - 0f +PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError), // 10 - 17 +PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe), // 18 - 1f +PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart), // 20 - 27 
+PCK4BITS(eStart,eError,eError,eError,eError,eError,eError,eError), // 28 - 2f +PCK4BITS(eError,eError,eError,eError, 3,eError,eError,eError), // 30 - 37 +PCK4BITS(eError,eError,eError,eError,eError,eError, 3, 3), // 38 - 3f +PCK4BITS( 3,eError,eError,eError,eError,eError,eError,eError), // 40 - 47 +PCK4BITS(eError,eError, 3, 3,eError,eError,eError,eError), // 48 - 4f +PCK4BITS(eError,eError,eError,eError,eError,eError,eError, 5), // 50 - 57 +PCK4BITS( 5,eError,eError,eError,eError,eError,eError,eError), // 58 - 5f +PCK4BITS(eError,eError, 5, 5, 5,eError,eError,eError), // 60 - 67 +PCK4BITS(eError,eError,eError,eError,eError,eError, 5,eError), // 68 - 6f +PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) // 70 - 77 }; -static const uint32_t UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3, - 3, 3, 4, 4, 5, 5, 6, 6 }; +static const uint32_t UTF8CharLenTable[] = {0, 1, 0, 0, 0, 2, 3, 3, 3, 4, 4, 4}; const SMModel UTF8SMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls }, - 16, + 12, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st }, CHAR_LEN_TABLE(UTF8CharLenTable), "UTF-8", }; - diff --git a/intl/chardet/tools/genutf8.pl b/intl/chardet/tools/genutf8.pl index e7a118c0853..437dd535b82 100644 --- a/intl/chardet/tools/genutf8.pl +++ b/intl/chardet/tools/genutf8.pl @@ -12,132 +12,105 @@ my(@utf8_cls); my(@utf8_st); my($utf8_ver); -# -# -# UTF8 encode the UCS4 into 1 to 6 bytes -# +# +# +# UTF8 encode the UCS4 into 1 to 4 bytes +# # 1 byte 00 00 00 00 00 00 00 7f # 2 bytes 00 00 00 80 00 00 07 ff # 3 bytes 00 00 08 00 00 00 ff ff -# 4 bytes 00 01 00 00 00 1f ff ff -# 5 bytes 00 20 00 00 03 ff ff ff -# 6 bytes 04 00 00 00 7f ff ff ff -# -# Howerver, since Surrogate area should not be encoded into UTF8 as +# 4 bytes 00 01 00 00 00 10 ff ff +# +# However, since Surrogate area should not be encoded into UTF8 as # a Surrogate pair, we can remove the surrogate area from UTF8 -# +# # 1 byte 00 00 00 00 00 00 00 7f # 2 
bytes 00 00 00 80 00 00 07 ff # 3 bytes 00 00 08 00 00 00 d7 ff # 00 00 e0 00 00 00 ff ff -# 4 bytes 00 01 00 00 00 1f ff ff -# 5 bytes 00 20 00 00 03 ff ff ff -# 6 bytes 04 00 00 00 7f ff ff ff -# -# Now we break them into 6 bits group for 2-6 bytes UTF8 -# +# 4 bytes 00 01 00 00 00 10 ff ff +# +# Now we break them into 6 bits group for 2-4 bytes UTF8 +# # 1 byte 00 7f # 2 bytes 02 00 1f 3f # 3 bytes 00 20 00 0d 1f 3f # 0e 00 00 0f 3f 3f -# 4 bytes 00 20 00 00 07 3f 3f 3f -# 5 bytes 00 08 00 00 00 03 3f 3f 3f 3f -# 6 bytes 00 04 00 00 00 00 01 3f 3f 3f 3f 3f -# +# 4 bytes 00 10 00 00 04 0f 3f 3f +# # Break down more -# +# # 1 byte 00 7f # 2 bytes 02 00 1f 3f # 3 bytes 00 20 00 00 3f 3f # 01 00 00 0c 3f 3f # 0d 00 00 0d 1f 3f # 0e 00 00 0f 3f 3f -# 4 bytes 00 20 00 00 00 3f 3f 3f -# 01 00 00 00 07 3f 3f 3f -# 5 bytes 00 08 00 00 00 00 3f 3f 3f 3f -# 01 00 00 00 00 03 3f 3f 3f 3f -# 6 bytes 00 04 00 00 00 00 00 3f 3f 3f 3f 3f -# 01 00 00 00 00 00 01 3f 3f 3f 3f 3f -# -# Now, add +# 4 bytes 00 10 00 00 00 3f 3f 3f +# 01 00 00 00 03 3f 3f 3f +# 04 00 00 00 04 0f 3f 3f +# +# Now, add # c0 to the lead byte of 2 bytes UTF8 # e0 to the lead byte of 3 bytes UTF8 # f0 to the lead byte of 4 bytes UTF8 -# f8 to the lead byte of 5 bytes UTF8 -# fc to the lead byte of 6 bytes UTF8 -# 80 to the trail bytes of 2 - 6 bytes UTF8 -# +# 80 to the trail bytes +# # 1 byte 00 7f # 2 bytes c2 80 df bf # 3 bytes e0 a0 80 e0 bf bf # e1 80 80 ec bf bf # ed 80 80 ed 9f bf # ee 80 80 ef bf bf -# 4 bytes f0 a0 80 80 f0 bf bf bf -# f1 80 80 80 f7 bf bf bf -# 5 bytes f8 88 80 80 80 f8 bf bf bf bf -# f9 80 80 80 80 fb bf bf bf bf -# 6 bytes fc 84 80 80 80 80 fc bf bf bf bf bf -# fd 80 80 80 80 80 fd bf bf bf bf bf -# -# +# 4 bytes f0 90 80 80 f0 bf bf bf +# f1 80 80 80 f3 bf bf bf +# f4 80 80 80 f4 8f bf bf +# +# # Now we can construct our state diagram -# -# 0:0x00,0x0e,0x0f,0x1b->Error +# +# 0:0x0e,0x0f,0x1b->Error # 0:[0-0x7f]->0 -# 0:fd->3 -# 0:fc->4 -# 0:[f9-fb]->5 -# 0:f8->6 -# 0:[f1-f7]->7 
-# 0:f0->8 -# 0:[e1-ecee-ef]->9 -# 0:e0->10 -# 0:ed->11 -# 0:[c2-df]->12 +# 0:[c2-df]->3 +# 0:e0->4 +# 0:[e1-ec, ee-ef]->5 +# 0:ed->6 +# 0:f0->7 +# 0:[f1-f3]->8 +# 0:f4->9 # 0:*->Error -# 3:[80-bf]->5 +# 3:[80-bf]->0 # 3:*->Error -# 4:[84-bf]->5 +# 4:[a0-bf]->3 # 4:*->Error -# 5:[80-bf]->7 +# 5:[80-bf]->3 # 5:*->Error -# 6:[88-bf]->7 +# 6:[80-9f]->3 # 6:*->Error -# 7:[80-bf]->9 +# 7:[90-bf]->5 # 7:*->Error -# 8:[a0-bf]->9 +# 8:[80-bf]->5 # 8:*->Error -# 9:[80-bf]->12 +# 9:[80-8f]->5 # 9:*->Error -# 10:[a0-bf]->12 -# 10:*->Error -# 11:[80-9f]->12 -# 11:*->Error -# 12:[80-bf]->0 -# 12:*->Error -# +# # Now, we classified chars into class -# +# # 00,0e,0f,1b:k0 # 01-0d,10-1a,1c-7f:k1 -# 80-83:k2 -# 84-87:k3 -# 88-9f:k4 -# a0-bf:k5 +# 80-8f:k2 +# 90-9f:k3 +# a0-bf:k4 # c0-c1:k0 -# c2-df:k6 -# e0:k7 -# e1-ec:k8 -# ed:k9 -# ee-ef:k8 -# f0:k10 -# f1-f7:k11 -# f8:k12 -# f9-fb:k13 -# fc:k14 -# fd:k15 -# fe-ff:k0 +# c2-df:k5 +# e0:k6 +# e1-ec:k7 +# ed:k8 +# ee-ef:k7 +# f0:k9 +# f1-f3:k10 +# f4:k11 +# f5-ff:k0 # # Now, let's put them into array form @@ -148,84 +121,68 @@ my($utf8_ver); [ 0x01 , 0x0d , 1 ], [ 0x10 , 0x1a , 1 ], [ 0x1c , 0x7f , 1 ], - [ 0x80 , 0x83 , 2 ], - [ 0x84 , 0x87 , 3 ], - [ 0x88 , 0x9f , 4 ], - [ 0xa0 , 0xbf , 5 ], + [ 0x80 , 0x8f , 2 ], + [ 0x90 , 0x9f , 3 ], + [ 0xa0 , 0xbf , 4 ], [ 0xc0 , 0xc1 , 0 ], - [ 0xc2 , 0xdf , 6 ], - [ 0xe0 , 0xe0 , 7 ], - [ 0xe1 , 0xec , 8 ], - [ 0xed , 0xed , 9 ], - [ 0xee , 0xef , 8 ], - [ 0xf0 , 0xf0 , 10 ], - [ 0xf1 , 0xf7 , 11 ], - [ 0xf8 , 0xf8 , 12 ], - [ 0xf9 , 0xfb , 13 ], - [ 0xfc , 0xfc , 14 ], - [ 0xfd , 0xfd , 15 ], - [ 0xfe , 0xff , 0 ], + [ 0xc2 , 0xdf , 5 ], + [ 0xe0 , 0xe0 , 6 ], + [ 0xe1 , 0xec , 7 ], + [ 0xed , 0xed , 8 ], + [ 0xee , 0xef , 7 ], + [ 0xf0 , 0xf0 , 9 ], + [ 0xf1 , 0xf3 , 10 ], + [ 0xf4 , 0xf4 , 11 ], + [ 0xf5 , 0xff , 0 ], ); -# -# Now, we write the state diagram in class -# +# +# Now, we write the state diagram in class +# # 0:k0->Error # 0:k1->0 -# 0:k15->3 -# 0:k14->4 -# 0:k13->5 -# 
0:k12->6 -# 0:k11->7 +# 0:k5->3 +# 0:k6->4 +# 0:k7->5 +# 0:k8->6 +# 0:k9->7 # 0:k10->8 -# 0:k8->9 -# 0:k7->10 -# 0:k9->11 -# 0:k6->12 +# 0:k11->9 # 0:*->Error -# 3:k2,k3,k4,k5->5 +# 3:k2,k3,k4->0 # 3:*->Error -# 4:k3,k4,k5->5 +# 4:k4->3 # 4:*->Error -# 5:k2,k3,k4,k5->7 +# 5:k2,k3,k4->3 # 5:*->Error -# 6:k4,k5->7 +# 6:k2,k3->3 # 6:*->Error -# 7:k2,k3,k4,k5->9 +# 7:k3,k4->5 # 7:*->Error -# 8:k5->9 +# 8:k2,k3,k4->5 # 8:*->Error -# 9:k2,k3,k4,k5->12 +# 9:k2->5 # 9:*->Error -# 10:k5->12 -# 10:*->Error -# 11:k2,k3,k4->12 -# 11:*->Error -# 12:k2,k3,k4,k5->0 -# 12:*->Error -# +# # Now, let's put them into array -# +# package genverifier; @utf8_st = ( -# 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - 1, 0, 1, 1, 1, 1,12,10, 9,11, 8, 7, 6, 5, 4, 3, # state 0 Start - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 1 Error - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # state 2 ItsMe - 1, 1, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 3 - 1, 1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 4 - 1, 1, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 5 - 1, 1, 1, 1, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 6 - 1, 1, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 7 - 1, 1, 1, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 8 - 1, 1,12,12,12,12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 9 - 1, 1, 1, 1, 1,12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 10 - 1, 1,12,12,12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 11 - 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 12 +# 0 1 2 3 4 5 6 7 8 9 10 11 + 1, 0, 1, 1, 1, 3, 4, 5, 6, 7, 8, 9, # state 0 Start + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 1 Error + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # state 2 ItsMe + 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, # state 3 + 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, # state 4 + 1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, # state 5 + 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, # state 6 + 1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 7 + 1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 8 + 1, 1, 5, 1, 1, 1, 1, 1, 
1, 1, 1, 1, # state 9 ); -$utf8_ver = genverifier::GenVerifier("UTF8", "UTF-8", \@utf8_cls, 16, \@utf8_st); +$utf8_ver = genverifier::GenVerifier("UTF8", "UTF-8", \@utf8_cls, 12, \@utf8_st); print $utf8_ver; diff --git a/intl/chardet/tools/genverifier.pm b/intl/chardet/tools/genverifier.pm index b9e7b958326..8ccfef4d691 100644 --- a/intl/chardet/tools/genverifier.pm +++ b/intl/chardet/tools/genverifier.pm @@ -54,13 +54,13 @@ sub GenStatePkg { sub GenPkg { my($name, $bits, $tbl) = @_; my($ret); - $ret = " {\n" . - " eIdxSft" . $bits . "bits, \n" . - " eSftMsk" . $bits . "bits, \n" . - " eBitSft" . $bits . "bits, \n" . - " eUnitMsk" . $bits . "bits, \n" . - " " . $name . $tbl . " \n" . - " }"; + $ret = " {" . + "eIdxSft" . $bits . "bits, " . + "eSftMsk" . $bits . "bits, " . + "eBitSft" . $bits . "bits, " . + "eUnitMsk" . $bits . "bits, " . + $name . $tbl . "" . + " }"; return $ret; }; ##-------------------------------------------------------------- @@ -75,7 +75,7 @@ sub Gen4BitsClass { $ret .= "PCK4BITS("; for($j = $i; $j < $i + 8; $j++) { $cls = &GetClass($j,$clstbl); - $ret .= sprintf("%d", $cls) ; + $ret .= sprintf("%2d", $cls) ; if($j != ($i+7)) { $ret .= ","; } @@ -85,7 +85,7 @@ sub Gen4BitsClass { } else { $ret .= "),"; } - $ret .= sprintf(" // %02x - %02x \n", $i, ($i+7)); + $ret .= sprintf(" // %02x - %02x\n", $i, ($i+7)); } $ret .= "};\n"; return $ret; @@ -101,16 +101,18 @@ sub GenVerifier { $ret .= "\n\n"; $ret .= Gen4BitsState($name, $st); $ret .= "\n\n"; - $ret .= "static nsVerifier ns" . $name . "Verifier = {\n"; - $ret .= ' "' . $charset . '",' . "\n"; + $ret .= "const SMModel " . $name . "SMModel = {\n"; $ret .= GenClassPkg($name, 4); $ret .= ",\n"; - $ret .= " " . $numcls; + $ret .= " " . $numcls; $ret .= ",\n"; $ret .= GenStatePkg($name, 4); - $ret .= "\n};\n"; + $ret .= ",\n"; + $ret .= " " . "CHAR_LEN_TABLE(" . $name . "CharLenTable),\n"; + $ret .= ' "' . $charset . '",' . 
"\n"; + $ret .= "};\n"; return $ret; - + }; ##-------------------------------------------------------------- sub Gen4BitsState { @@ -141,7 +143,7 @@ sub Gen4BitsState { } else { $ret .= "),"; } - $ret .= sprintf("//%02x-%02x \n", $i, ($i+7)); + $ret .= sprintf(" // %02x - %02x\n", $i, ($i+7)); } $ret .= "};\n"; return $ret; @@ -150,7 +152,7 @@ sub Gen4BitsState { sub GenNote { my($ret) = << "END_NOTE"; -/* +/* * DO NOT EDIT THIS DOCUMENT MANUALLY !!! * THIS FILE IS AUTOMATICALLY GENERATED BY THE TOOLS UNDER * mozilla/intl/chardet/tools/