mirror of
https://github.com/AdaCore/cpython.git
synced 2026-02-12 12:57:15 -08:00
[2.7] bpo-33899: Make tokenize module mirror the "end-of-file is end-of-line" behavior (GH-7891) (#8133)
Most of the change involves fixing up the test suite, which previously
assumed that no NEWLINE token would be emitted if the input didn't end in
a newline.
Contributed by Ammar Askar.
(cherry picked from commit c4ef4896ea)
This commit is contained in:
@@ -1,32 +1,54 @@
|
|||||||
from test import test_support
|
from test import test_support
|
||||||
from tokenize import (untokenize, generate_tokens, NUMBER, NAME, OP,
|
from tokenize import (untokenize, generate_tokens, NUMBER, NAME, OP, NEWLINE,
|
||||||
STRING, ENDMARKER, tok_name, Untokenizer, tokenize)
|
STRING, ENDMARKER, tok_name, Untokenizer, tokenize)
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
import os
|
import os
|
||||||
from unittest import TestCase
|
from unittest import TestCase
|
||||||
|
|
||||||
|
|
||||||
|
# Converts a source string into a list of textual representations
|
||||||
|
# of the tokens such as:
|
||||||
|
# ` NAME 'if' (1, 0) (1, 2)`
|
||||||
|
# to make writing tests easier.
|
||||||
|
def stringify_tokens_from_source(token_generator, source_string):
    """Render tokens as fixed-width text rows for easy comparison in tests.

    token_generator -- iterable of 5-tuples (type, string, start, end, line),
                       e.g. the result of generate_tokens().
    source_string   -- the source text the tokens were produced from; used
                       only to decide whether the final NEWLINE is implicit.

    Returns a list with one string per token, each holding the token's name,
    the repr of its text, and its start and end positions.  Rendering stops
    at the first ENDMARKER, which is not included.  When source_string does
    not end in '\\r' or '\\n', a NEWLINE token ending on the last source line
    is left out as well.
    """
    result = []
    num_lines = len(source_string.splitlines())
    # endswith() instead of source_string[-1] so that an empty source string
    # is treated as "no trailing newline" rather than raising IndexError.
    missing_trailing_nl = not source_string.endswith(('\r', '\n'))

    for tok_type, token, start, end, _line in token_generator:
        if tok_type == ENDMARKER:
            break
        # Ignore the new line on the last line if the input lacks one
        if (missing_trailing_nl and tok_type == NEWLINE
                and end[0] == num_lines):
            continue
        # Explicit mapping instead of locals() so the row cannot pick up
        # unrelated local names.
        fields = {
            "type": tok_name[tok_type],
            "token": token,
            "start": start,
            "end": end,
        }
        result.append(" %(type)-10.10s %(token)-13.13r %(start)s %(end)s" %
                      fields)

    return result
|
||||||
|
|
||||||
class TokenizeTest(TestCase):
|
class TokenizeTest(TestCase):
|
||||||
# Tests for the tokenize module.
|
# Tests for the tokenize module.
|
||||||
|
|
||||||
# The tests can be really simple. Given a small fragment of source
|
# The tests can be really simple. Given a small fragment of source
|
||||||
# code, print out a table with tokens. The ENDMARKER is omitted for
|
# code, print out a table with tokens. The ENDMARKER and the implicit
|
||||||
# brevity.
|
# final NEWLINE are omitted for brevity.
|
||||||
|
|
||||||
def check_tokenize(self, s, expected):
|
def check_tokenize(self, s, expected):
|
||||||
# Format the tokens in s in a table format.
|
# Format the tokens in s in a table format.
|
||||||
# The ENDMARKER is omitted.
|
|
||||||
result = []
|
|
||||||
f = StringIO(s)
|
f = StringIO(s)
|
||||||
for type, token, start, end, line in generate_tokens(f.readline):
|
result = stringify_tokens_from_source(generate_tokens(f.readline), s)
|
||||||
if type == ENDMARKER:
|
|
||||||
break
|
|
||||||
type = tok_name[type]
|
|
||||||
result.append(" %(type)-10.10s %(token)-13.13r %(start)s %(end)s" %
|
|
||||||
locals())
|
|
||||||
self.assertEqual(result,
|
self.assertEqual(result,
|
||||||
expected.rstrip().splitlines())
|
expected.rstrip().splitlines())
|
||||||
|
|
||||||
|
def test_implicit_newline(self):
|
||||||
|
# Make sure that the tokenizer puts in an implicit NEWLINE
|
||||||
|
# when the input lacks a trailing new line.
|
||||||
|
f = StringIO("x")
|
||||||
|
tokens = list(generate_tokens(f.readline))
|
||||||
|
self.assertEqual(tokens[-2][0], NEWLINE)
|
||||||
|
self.assertEqual(tokens[-1][0], ENDMARKER)
|
||||||
|
|
||||||
def test_basic(self):
|
def test_basic(self):
|
||||||
self.check_tokenize("1 + 1", """\
|
self.check_tokenize("1 + 1", """\
|
||||||
@@ -616,7 +638,7 @@ class TestRoundtrip(TestCase):
|
|||||||
self.check_roundtrip("if x == 1:\n"
|
self.check_roundtrip("if x == 1:\n"
|
||||||
" print x\n")
|
" print x\n")
|
||||||
self.check_roundtrip("# This is a comment\n"
|
self.check_roundtrip("# This is a comment\n"
|
||||||
"# This also")
|
"# This also\n")
|
||||||
|
|
||||||
# Some people use different formatting conventions, which makes
|
# Some people use different formatting conventions, which makes
|
||||||
# untokenize a little trickier. Note that this test involves trailing
|
# untokenize a little trickier. Note that this test involves trailing
|
||||||
|
|||||||
@@ -306,8 +306,15 @@ def generate_tokens(readline):
|
|||||||
contline = None
|
contline = None
|
||||||
indents = [0]
|
indents = [0]
|
||||||
|
|
||||||
|
last_line = b''
|
||||||
|
line = b''
|
||||||
while 1: # loop over lines in stream
|
while 1: # loop over lines in stream
|
||||||
try:
|
try:
|
||||||
|
# We capture the value of the line variable here because
|
||||||
|
# readline uses the empty string '' to signal end of input,
|
||||||
|
# hence `line` itself will always be overwritten at the end
|
||||||
|
# of this loop.
|
||||||
|
last_line = line
|
||||||
line = readline()
|
line = readline()
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
line = ''
|
line = ''
|
||||||
@@ -437,6 +444,9 @@ def generate_tokens(readline):
|
|||||||
(lnum, pos), (lnum, pos+1), line)
|
(lnum, pos), (lnum, pos+1), line)
|
||||||
pos += 1
|
pos += 1
|
||||||
|
|
||||||
|
# Add an implicit NEWLINE if the input doesn't end in one
|
||||||
|
if last_line and last_line[-1] not in '\r\n':
|
||||||
|
yield (NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
|
||||||
for indent in indents[1:]: # pop remaining indent levels
|
for indent in indents[1:]: # pop remaining indent levels
|
||||||
yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
|
yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
|
||||||
yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
|
yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
|
||||||
|
|||||||
@@ -0,0 +1,3 @@
|
|||||||
|
The tokenize module now implicitly emits a NEWLINE token when provided with input that
|
||||||
|
does not have a trailing newline. This behavior now matches what the C
|
||||||
|
tokenizer does internally. Contributed by Ammar Askar.
|
||||||
Reference in New Issue
Block a user