diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index 954ce2647f..584046d81e 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -15,8 +15,7 @@ import string
interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
-incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*'
- '|#([0-9]*|[xX][0-9a-fA-F]*))?')
+incomplete = re.compile('&[a-zA-Z#]')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
@@ -185,16 +184,18 @@ class HTMLParser:
k = self.parse_pi(i)
elif declopen.match(rawdata, i): # '
n = len(rawdata)
+ decltype = None
+ extrachars = ""
while j < n:
c = rawdata[j]
if c == ">":
# end of declaration syntax
- self.handle_decl(rawdata[i+2:j])
+ data = rawdata[i+2:j]
+ if decltype == "doctype":
+ self.handle_decl(data)
+ else:
+ self.unknown_decl(data)
return j + 1
if c in "\"'":
m = declstringlit.match(rawdata, j)
@@ -273,12 +291,242 @@ class HTMLParser:
if not m:
return -1 # incomplete
j = m.end()
+ if decltype is None:
+ decltype = m.group(0).rstrip().lower()
+ if decltype != "doctype":
+ extrachars = "="
+ elif c == "[" and decltype == "doctype":
+ j = self.parse_doctype_subset(j + 1, i)
+ if j < 0:
+ return j
+ elif c in extrachars:
+ j = j + 1
+ while j < n and rawdata[j] in string.whitespace:
+ j = j + 1
+ if j == n:
+ # end of buffer while in declaration
+ return -1
else:
raise HTMLParseError(
"unexpected char in declaration: %s" % `rawdata[j]`,
self.getpos())
+ decltype = decltype or ''
return -1 # incomplete
+ # Internal -- scan past the internal subset in a <!DOCTYPE declaration [...] n:
+ # end of buffer; incomplete
+ return -1
+ if rawdata[j:j+4] == "