2012-05-21 04:12:37 -07:00
|
|
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
2008-02-14 14:57:20 -08:00
|
|
|
|
|
|
|
import codecs
|
|
|
|
import encodings.idna
|
|
|
|
import re
|
|
|
|
import sys
|
|
|
|
|
|
|
|
"""
|
|
|
|
Processes a file containing effective TLD data. See the following URL for a
|
|
|
|
description of effective TLDs and of the file format that this script
|
|
|
|
processes (although for the latter you're better off just reading this file's
|
|
|
|
short source code).
|
|
|
|
|
|
|
|
http://wiki.mozilla.org/Gecko:Effective_TLD_Service
|
|
|
|
"""
|
|
|
|
|
|
|
|
def getEffectiveTLDs(path):
    """
    Generates an EffectiveTLDEntry for each rule found in the effective TLD
    file at the given path.

    The file is decoded as UTF-8.  Lines starting with "//" and lines that
    contain no "." are skipped.  For every other line, only the text before
    the first space or tab is used as the rule.

    Raises AssertionError if the same normalized domain occurs twice.
    """
    domains = set()
    # The with block guarantees the file is closed when the generator is
    # exhausted, closed early, or garbage collected.
    with codecs.open(path, "r", "UTF-8") as tldFile:
        # Iterating stops at end of file; falling off the end of the loop
        # finishes the generator without raising StopIteration by hand, which
        # is an error inside generators on modern Python (PEP 479).
        for line in tldFile:
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if line.startswith("//") or "." not in line:
                continue
            # keep only the rule itself, dropping anything after whitespace
            line = re.split(r"[ \t\n]", line, 1)[0]
            entry = EffectiveTLDEntry(line)
            domain = entry.domain()
            assert domain not in domains, \
                "repeating domain %s makes no sense" % domain
            domains.add(domain)
            yield entry
|
|
|
|
|
|
|
|
def _normalizeHostname(domain):
    """
    Normalizes the given domain, component by component.  ASCII components are
    lowercased, while non-ASCII components are processed using the ToASCII
    algorithm.

    Returns the normalized components joined back together with ".".
    """
    def convertLabel(label):
        if _isASCII(label):
            return label.lower()
        encoded = encodings.idna.ToASCII(label)
        # ToASCII hands back a byte string.  Decode it so that every label is
        # text and the join below cannot fail by mixing bytes with text.
        if isinstance(encoded, bytes):
            return encoded.decode("ascii")
        return encoded

    return ".".join(convertLabel(label) for label in domain.split("."))
|
|
|
|
|
|
|
|
def _isASCII(s):
|
|
|
|
"True if s consists entirely of ASCII characters, false otherwise."
|
|
|
|
for c in s:
|
|
|
|
if ord(c) > 127:
|
|
|
|
return False
|
|
|
|
return True
|
|
|
|
|
|
|
|
class EffectiveTLDEntry:
    """
    A single rule read from an effective-TLD name file.

    A rule is a plain domain, an exception rule (written with a leading "!"),
    or a wildcard rule (written with a leading "*.").
    """

    # Class-level defaults.  __init__ overrides one of them on the instance
    # only when the line carries the corresponding marker.
    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Builds an entry from one line of data.  The line must already have
        had its line ending removed.
        """
        # Markers are tried in this order; at most one of them applies.
        for marker, flag in (("!", "_exception"), ("*.", "_wild")):
            if line.startswith(marker):
                setattr(self, flag, True)
                line = line[len(marker):]
                break
        self._domain = _normalizeHostname(line)

    def domain(self):
        "The normalized domain of this entry, without any rule marker."
        return self._domain

    def exception(self):
        "True if this entry is an exception rule (its line started with '!')."
        return self._exception

    def wild(self):
        "True if this entry is a wildcard rule (its line started with '*.')."
        return self._wild
|
|
|
|
|
|
|
|
|
|
|
|
#################
|
|
|
|
# DO EVERYTHING #
|
|
|
|
#################
|
|
|
|
|
|
|
|
def main():
    """
    argv[1] is the effective TLD file to parse.

    A C++ array of { domain, exception, wild } entries representing the
    eTLD file is then printed to stdout, ending with a
    { nullptr, PR_FALSE, PR_FALSE } entry.
    """

    def boolStr(b):
        "Spells a truth value as the PR_TRUE / PR_FALSE literal."
        if b:
            return "PR_TRUE"
        return "PR_FALSE"

    # Each print is given exactly one parenthesized argument, so the output is
    # the same whether print is parsed as a statement or called as a function.
    print("{")
    for etld in getEffectiveTLDs(sys.argv[1]):
        exception = boolStr(etld.exception())
        wild = boolStr(etld.wild())
        print(' { "%s", %s, %s },' % (etld.domain(), exception, wild))
    print(" { nullptr, PR_FALSE, PR_FALSE }")
    print("}")


if __name__ == '__main__':
    main()
|