This patch changes the default behaviour of the builtin charmap
codec to not apply Latin-1 mappings for keys which are not found
in the mapping dictionaries, but instead treat them as undefined
mappings.

The patch was originally written by Martin v. Loewis with some
additional (cosmetic) changes and an updated test script
by Marc-Andre Lemburg.

The standard codecs were recreated from the most current files
available at the Unicode.org site using the Tools/scripts/gencodec.py
tool.

This patch closes the bugs #116285 and #119960.
This commit is contained in:
Marc-André Lemburg
2001-01-03 21:29:14 +00:00
parent b55b7bb3ab
commit a866df806d
56 changed files with 424 additions and 293 deletions

View File

@@ -539,6 +539,21 @@ def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
sr.file_encoding = file_encoding
return sr
### Helpers for charmap-based codecs
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

        rng is iterated exactly once; each element is used both as
        a key and as its own value, so elements must be hashable.
        Elements that occur more than once yield a single entry.
        An empty rng yields an empty dictionary.

    """
    res = {}
    for i in rng:
        # Identity mapping: every element maps to itself.
        res[i] = i
    return res
### Tests
if __name__ == '__main__':

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP037.TXT'.
""" Python Character Mapping Codec generated from 'CP037.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0004: 0x009c, # CONTROL
0x0005: 0x0009, # HORIZONTAL TABULATION
0x0006: 0x0086, # CONTROL
@@ -273,7 +273,7 @@ decoding_map = {
0x00fd: 0x00d9, # LATIN CAPITAL LETTER U WITH GRAVE
0x00fe: 0x00da, # LATIN CAPITAL LETTER U WITH ACUTE
0x00ff: 0x009f, # CONTROL
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP1006.TXT'.
""" Python Character Mapping Codec generated from 'CP1006.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x00a1: 0x06f0, # EXTENDED ARABIC-INDIC DIGIT ZERO
0x00a2: 0x06f1, # EXTENDED ARABIC-INDIC DIGIT ONE
0x00a3: 0x06f2, # EXTENDED ARABIC-INDIC DIGIT TWO
@@ -131,7 +131,7 @@ decoding_map = {
0x00fd: 0xfbae, # ARABIC LETTER YEH BARREE ISOLATED FORM
0x00fe: 0xfe7c, # ARABIC SHADDA ISOLATED FORM
0x00ff: 0xfe7d, # ARABIC SHADDA MEDIAL FORM
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP1026.TXT'.
""" Python Character Mapping Codec generated from 'CP1026.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0004: 0x009c, # CONTROL
0x0005: 0x0009, # HORIZONTAL TABULATION
0x0006: 0x0086, # CONTROL
@@ -273,7 +273,7 @@ decoding_map = {
0x00fd: 0x00d9, # LATIN CAPITAL LETTER U WITH GRAVE
0x00fe: 0x00da, # LATIN CAPITAL LETTER U WITH ACUTE
0x00ff: 0x009f, # CONTROL
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP1250.TXT'.
""" Python Character Mapping Codec generated from 'CP1250.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0080: 0x20ac, # EURO SIGN
0x0081: None, # UNDEFINED
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -116,7 +116,7 @@ decoding_map = {
0x00fb: 0x0171, # LATIN SMALL LETTER U WITH DOUBLE ACUTE
0x00fe: 0x0163, # LATIN SMALL LETTER T WITH CEDILLA
0x00ff: 0x02d9, # DOT ABOVE
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP1251.TXT'.
""" Python Character Mapping Codec generated from 'CP1251.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0080: 0x0402, # CYRILLIC CAPITAL LETTER DJE
0x0081: 0x0403, # CYRILLIC CAPITAL LETTER GJE
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -150,7 +150,7 @@ decoding_map = {
0x00fd: 0x044d, # CYRILLIC SMALL LETTER E
0x00fe: 0x044e, # CYRILLIC SMALL LETTER YU
0x00ff: 0x044f, # CYRILLIC SMALL LETTER YA
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP1252.TXT'.
""" Python Character Mapping Codec generated from 'CP1252.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0080: 0x20ac, # EURO SIGN
0x0081: None, # UNDEFINED
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -69,7 +69,7 @@ decoding_map = {
0x009d: None, # UNDEFINED
0x009e: 0x017e, # LATIN SMALL LETTER Z WITH CARON
0x009f: 0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP1253.TXT'.
""" Python Character Mapping Codec generated from 'CP1253.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0080: 0x20ac, # EURO SIGN
0x0081: None, # UNDEFINED
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -144,7 +144,7 @@ decoding_map = {
0x00fd: 0x03cd, # GREEK SMALL LETTER UPSILON WITH TONOS
0x00fe: 0x03ce, # GREEK SMALL LETTER OMEGA WITH TONOS
0x00ff: None, # UNDEFINED
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP1254.TXT'.
""" Python Character Mapping Codec generated from 'CP1254.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0080: 0x20ac, # EURO SIGN
0x0081: None, # UNDEFINED
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -75,7 +75,7 @@ decoding_map = {
0x00f0: 0x011f, # LATIN SMALL LETTER G WITH BREVE
0x00fd: 0x0131, # LATIN SMALL LETTER DOTLESS I
0x00fe: 0x015f, # LATIN SMALL LETTER S WITH CEDILLA
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP1255.TXT'.
""" Python Character Mapping Codec generated from 'CP1255.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0080: 0x20ac, # EURO SIGN
0x0081: None, # UNDEFINED
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -136,7 +136,7 @@ decoding_map = {
0x00fd: 0x200e, # LEFT-TO-RIGHT MARK
0x00fe: 0x200f, # RIGHT-TO-LEFT MARK
0x00ff: None, # UNDEFINED
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP1256.TXT'.
""" Python Character Mapping Codec generated from 'CP1256.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0080: 0x20ac, # EURO SIGN
0x0081: 0x067e, # ARABIC LETTER PEH
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -122,7 +122,7 @@ decoding_map = {
0x00fd: 0x200e, # LEFT-TO-RIGHT MARK
0x00fe: 0x200f, # RIGHT-TO-LEFT MARK
0x00ff: 0x06d2, # ARABIC LETTER YEH BARREE
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP1257.TXT'.
""" Python Character Mapping Codec generated from 'CP1257.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0080: 0x20ac, # EURO SIGN
0x0081: None, # UNDEFINED
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -124,7 +124,7 @@ decoding_map = {
0x00fd: 0x017c, # LATIN SMALL LETTER Z WITH DOT ABOVE
0x00fe: 0x017e, # LATIN SMALL LETTER Z WITH CARON
0x00ff: 0x02d9, # DOT ABOVE
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP1258.TXT'.
""" Python Character Mapping Codec generated from 'CP1258.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0080: 0x20ac, # EURO SIGN
0x0081: None, # UNDEFINED
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -83,7 +83,7 @@ decoding_map = {
0x00f5: 0x01a1, # LATIN SMALL LETTER O WITH HORN
0x00fd: 0x01b0, # LATIN SMALL LETTER U WITH HORN
0x00fe: 0x20ab, # DONG SIGN
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP424.TXT'.
""" Python Character Mapping Codec generated from 'CP424.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0004: 0x009c, # SELECT
0x0005: 0x0009, # HORIZONTAL TABULATION
0x0006: 0x0086, # REQUIRED NEW LINE
@@ -273,7 +273,7 @@ decoding_map = {
0x00fd: None, # UNDEFINED
0x00fe: None, # UNDEFINED
0x00ff: 0x009f, # EIGHT ONES
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP437.TXT'.
""" Python Character Mapping Codec generated from 'CP437.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0080: 0x00c7, # LATIN CAPITAL LETTER C WITH CEDILLA
0x0081: 0x00fc, # LATIN SMALL LETTER U WITH DIAERESIS
0x0082: 0x00e9, # LATIN SMALL LETTER E WITH ACUTE
@@ -165,7 +165,7 @@ decoding_map = {
0x00fd: 0x00b2, # SUPERSCRIPT TWO
0x00fe: 0x25a0, # BLACK SQUARE
0x00ff: 0x00a0, # NO-BREAK SPACE
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP500.TXT'.
""" Python Character Mapping Codec generated from 'CP500.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0004: 0x009c, # CONTROL
0x0005: 0x0009, # HORIZONTAL TABULATION
0x0006: 0x0086, # CONTROL
@@ -273,7 +273,7 @@ decoding_map = {
0x00fd: 0x00d9, # LATIN CAPITAL LETTER U WITH GRAVE
0x00fe: 0x00da, # LATIN CAPITAL LETTER U WITH ACUTE
0x00ff: 0x009f, # CONTROL
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP737.TXT'.
""" Python Character Mapping Codec generated from 'CP737.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0080: 0x0391, # GREEK CAPITAL LETTER ALPHA
0x0081: 0x0392, # GREEK CAPITAL LETTER BETA
0x0082: 0x0393, # GREEK CAPITAL LETTER GAMMA
@@ -165,7 +165,7 @@ decoding_map = {
0x00fd: 0x00b2, # SUPERSCRIPT TWO
0x00fe: 0x25a0, # BLACK SQUARE
0x00ff: 0x00a0, # NO-BREAK SPACE
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP775.TXT'.
""" Python Character Mapping Codec generated from 'CP775.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0080: 0x0106, # LATIN CAPITAL LETTER C WITH ACUTE
0x0081: 0x00fc, # LATIN SMALL LETTER U WITH DIAERESIS
0x0082: 0x00e9, # LATIN SMALL LETTER E WITH ACUTE
@@ -165,7 +165,7 @@ decoding_map = {
0x00fd: 0x00b2, # SUPERSCRIPT TWO
0x00fe: 0x25a0, # BLACK SQUARE
0x00ff: 0x00a0, # NO-BREAK SPACE
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP850.TXT'.
""" Python Character Mapping Codec generated from 'CP850.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0080: 0x00c7, # LATIN CAPITAL LETTER C WITH CEDILLA
0x0081: 0x00fc, # LATIN SMALL LETTER U WITH DIAERESIS
0x0082: 0x00e9, # LATIN SMALL LETTER E WITH ACUTE
@@ -165,7 +165,7 @@ decoding_map = {
0x00fd: 0x00b2, # SUPERSCRIPT TWO
0x00fe: 0x25a0, # BLACK SQUARE
0x00ff: 0x00a0, # NO-BREAK SPACE
}
})
### Encoding Map

View File

@@ -1,9 +1,9 @@
""" Python Character Mapping Codec generated from 'CP852.TXT'.
""" Python Character Mapping Codec generated from 'CP852.TXT' with gencodec.py.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.
"""#"
@@ -35,8 +35,8 @@ def getregentry():
### Decoding Map
decoding_map = {
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0080: 0x00c7, # LATIN CAPITAL LETTER C WITH CEDILLA
0x0081: 0x00fc, # LATIN SMALL LETTER U WITH DIAERESIS
0x0082: 0x00e9, # LATIN SMALL LETTER E WITH ACUTE
@@ -165,7 +165,7 @@ decoding_map = {
0x00fd: 0x0159, # LATIN SMALL LETTER R WITH CARON
0x00fe: 0x25a0, # BLACK SQUARE
0x00ff: 0x00a0, # NO-BREAK SPACE
}
})
### Encoding Map

Some files were not shown because too many files have changed in this diff Show More