From b37624fde25c546dd9ab12bfa7d74a31ad5b02ff Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 22 Dec 2024 20:18:02 +0530 Subject: [PATCH 01/24] Replace the blib2to3 tokenizer with pytokens --- pyproject.toml | 1 + src/blib2to3/pgen2/driver.py | 20 +- src/blib2to3/pgen2/pgen.py | 7 +- src/blib2to3/pgen2/tokenize.py | 1099 +++----------------------------- tests/test_tokenize.py | 16 +- 5 files changed, 90 insertions(+), 1053 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 30d2962248c..82e572bf35e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,7 @@ dependencies = [ "packaging>=22.0", "pathspec>=0.9.0", "platformdirs>=2", + "pytokens>=0.1.2", "tomli>=1.1.0; python_version < '3.11'", "typing_extensions>=4.0.1; python_version < '3.11'", ] diff --git a/src/blib2to3/pgen2/driver.py b/src/blib2to3/pgen2/driver.py index d17fd1d7bfb..7fc72764808 100644 --- a/src/blib2to3/pgen2/driver.py +++ b/src/blib2to3/pgen2/driver.py @@ -28,7 +28,7 @@ from typing import IO, Any, Optional, Union, cast from blib2to3.pgen2.grammar import Grammar -from blib2to3.pgen2.tokenize import GoodTokenInfo +from blib2to3.pgen2.tokenize import TokenInfo from blib2to3.pytree import NL # Pgen imports @@ -112,7 +112,7 @@ def __init__(self, grammar: Grammar, logger: Optional[Logger] = None) -> None: logger = logging.getLogger(__name__) self.logger = logger - def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) -> NL: + def parse_tokens(self, tokens: Iterable[TokenInfo], debug: bool = False) -> NL: """Parse a series of tokens and return the syntax tree.""" # XXX Move the prefix computation into a wrapper around tokenize. proxy = TokenProxy(tokens) @@ -180,26 +180,18 @@ def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) -> assert p.rootnode is not None return p.rootnode - def parse_stream_raw(self, stream: IO[str], debug: bool = False) -> NL: - """Parse a stream and return the syntax tree.""" - tokens = tokenize.generate_tokens(stream.readline, grammar=self.grammar) - return self.parse_tokens(tokens, debug) - - def parse_stream(self, stream: IO[str], debug: bool = False) -> NL: - """Parse a stream and return the syntax tree.""" - return self.parse_stream_raw(stream, debug) - def parse_file( self, filename: Path, encoding: Optional[str] = None, debug: bool = False ) -> NL: """Parse a file and return the syntax tree.""" with open(filename, encoding=encoding) as stream: - return self.parse_stream(stream, debug) + text = stream.read() + return self.parse_string(text, debug) def parse_string(self, text: str, debug: bool = False) -> NL: """Parse a string and return the syntax tree.""" - tokens = tokenize.generate_tokens( - io.StringIO(text).readline, grammar=self.grammar + tokens = tokenize.tokenize( + text, grammar=self.grammar ) return self.parse_tokens(tokens, debug) diff --git a/src/blib2to3/pgen2/pgen.py b/src/blib2to3/pgen2/pgen.py index 17f7533995f..82d36bc46a1 100644 --- a/src/blib2to3/pgen2/pgen.py +++ b/src/blib2to3/pgen2/pgen.py @@ -6,7 +6,7 @@ from typing import IO, Any, NoReturn, Optional, Union from blib2to3.pgen2 import grammar, token, tokenize -from blib2to3.pgen2.tokenize import GoodTokenInfo +from blib2to3.pgen2.tokenize import TokenInfo Path = Union[str, "os.PathLike[str]"] @@ -18,7 +18,7 @@ class PgenGrammar(grammar.Grammar): class ParserGenerator: filename: Path stream: IO[str] - generator: Iterator[GoodTokenInfo] + generator: Iterator[TokenInfo] first: dict[str, Optional[dict[str, int]]] def __init__(self, filename: Path, 
stream: Optional[IO[str]] = None) -> None: @@ -27,8 +27,7 @@ def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None: stream = open(filename, encoding="utf-8") close_stream = stream.close self.filename = filename - self.stream = stream - self.generator = tokenize.generate_tokens(stream.readline) + self.generator = tokenize.tokenize(stream.read()) self.gettoken() # Initialize lookahead self.dfas, self.startsymbol = self.parse() if close_stream is not None: diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 97dd92b06f0..45ff598c89e 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -45,13 +45,11 @@ FSTRING_MIDDLE, FSTRING_START, INDENT, - LBRACE, NAME, NEWLINE, NL, NUMBER, OP, - RBRACE, STRING, tok_name, ) @@ -61,6 +59,11 @@ import re from codecs import BOM_UTF8, lookup +import token + +import pytokens +from pytokens import TokenType + from . import token @@ -71,176 +74,79 @@ ] del token +Coord = tuple[int, int] +TokenInfo = tuple[int, str, Coord, Coord, str] + +TOKEN_TYPE_MAP = { + TokenType.indent: INDENT, + TokenType.dedent: DEDENT, + TokenType.newline: NEWLINE, + TokenType.nl: NL, + TokenType.comment: COMMENT, + TokenType.semicolon: OP, + TokenType.lparen: OP, + TokenType.rparen: OP, + TokenType.lbracket: OP, + TokenType.rbracket: OP, + TokenType.lbrace: OP, + TokenType.rbrace: OP, + TokenType.colon: OP, + TokenType.op: OP, + TokenType.identifier: NAME, + TokenType.number: NUMBER, + TokenType.string: STRING, + TokenType.fstring_start: FSTRING_START, + TokenType.fstring_middle: FSTRING_MIDDLE, + TokenType.fstring_end: FSTRING_END, + TokenType.endmarker: ENDMARKER -def group(*choices: str) -> str: - return "(" + "|".join(choices) + ")" - - -def any(*choices: str) -> str: - return group(*choices) + "*" - - -def maybe(*choices: str) -> str: - return group(*choices) + "?" - - -def _combinations(*l: str) -> set[str]: - return {x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()} - - -Whitespace = r"[ \f\t]*" -Comment = r"#[^\r\n]*" -Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment) -Name = ( # this is invalid but it's fine because Name comes after Number in all groups - r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+" -) - -Binnumber = r"0[bB]_?[01]+(?:_[01]+)*" -Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?" -Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?" -Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?") -Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber) -Exponent = r"[eE][-+]?\d+(?:_\d+)*" -Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe( - Exponent -) -Expfloat = r"\d+(?:_\d+)*" + Exponent -Floatnumber = group(Pointfloat, Expfloat) -Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]") -Number = group(Imagnumber, Floatnumber, Intnumber) - -# Tail end of ' string. -Single = r"(?:\\.|[^'\\])*'" -# Tail end of " string. -Double = r'(?:\\.|[^"\\])*"' -# Tail end of ''' string. -Single3 = r"(?:\\.|'(?!'')|[^'\\])*'''" -# Tail end of """ string. -Double3 = r'(?:\\.|"(?!"")|[^"\\])*"""' -_litprefix = r"(?:[uUrRbB]|[rR][bB]|[bBuU][rR])?" -_fstringlitprefix = r"(?:rF|FR|Fr|fr|RF|F|rf|f|Rf|fR)" -Triple = group( - _litprefix + "'''", - _litprefix + '"""', - _fstringlitprefix + '"""', - _fstringlitprefix + "'''", -) - -# beginning of a single quoted f-string. 
must not end with `{{` or `\N{` -SingleLbrace = r"(?:\\N{|{{|\\'|[^\n'{])*(?>=?", - r"<<=?", - r"<>", - r"!=", - r"//=?", - r"->", - r"[+\-*/%&@|^=<>:]=?", - r"~", -) - -Bracket = "[][(){}]" -Special = group(r"\r?\n", r"[:;.,`@]") -Funny = group(Operator, Bracket, Special) - -_string_middle_single = r"(?:[^\n'\\]|\\.)*" -_string_middle_double = r'(?:[^\n"\\]|\\.)*' - -# FSTRING_MIDDLE and LBRACE, must not end with a `{{` or `\N{` -_fstring_middle_single = SingleLbrace -_fstring_middle_double = DoubleLbrace - -# First (or only) line of ' or " string. -ContStr = group( - _litprefix + "'" + _string_middle_single + group("'", r"\\\r?\n"), - _litprefix + '"' + _string_middle_double + group('"', r"\\\r?\n"), - group(_fstringlitprefix + "'") + _fstring_middle_single, - group(_fstringlitprefix + '"') + _fstring_middle_double, - group(_fstringlitprefix + "'") + _string_middle_single + group("'", r"\\\r?\n"), - group(_fstringlitprefix + '"') + _string_middle_double + group('"', r"\\\r?\n"), -) -PseudoExtras = group(r"\\\r?\n", Comment, Triple) -PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) - -pseudoprog: Final = re.compile(PseudoToken, re.UNICODE) - -singleprog = re.compile(Single) -singleprog_plus_lbrace = re.compile(group(SingleLbrace, Single)) -doubleprog = re.compile(Double) -doubleprog_plus_lbrace = re.compile(group(DoubleLbrace, Double)) - -single3prog = re.compile(Single3) -single3prog_plus_lbrace = re.compile(group(Single3Lbrace, Single3)) -double3prog = re.compile(Double3) -double3prog_plus_lbrace = re.compile(group(Double3Lbrace, Double3)) - -_strprefixes = _combinations("r", "R", "b", "B") | {"u", "U", "ur", "uR", "Ur", "UR"} -_fstring_prefixes = _combinations("r", "R", "f", "F") - {"r", "R"} - -endprogs: Final = { - "'": singleprog, - '"': doubleprog, - "'''": single3prog, - '"""': double3prog, - **{f"{prefix}'": singleprog for prefix in _strprefixes}, - **{f'{prefix}"': doubleprog for prefix in _strprefixes}, - **{f"{prefix}'": singleprog_plus_lbrace for prefix in _fstring_prefixes}, - **{f'{prefix}"': doubleprog_plus_lbrace for prefix in _fstring_prefixes}, - **{f"{prefix}'''": single3prog for prefix in _strprefixes}, - **{f'{prefix}"""': double3prog for prefix in _strprefixes}, - **{f"{prefix}'''": single3prog_plus_lbrace for prefix in _fstring_prefixes}, - **{f'{prefix}"""': double3prog_plus_lbrace for prefix in _fstring_prefixes}, } -triple_quoted: Final = ( - {"'''", '"""'} - | {f"{prefix}'''" for prefix in _strprefixes | _fstring_prefixes} - | {f'{prefix}"""' for prefix in _strprefixes | _fstring_prefixes} -) -single_quoted: Final = ( - {"'", '"'} - | {f"{prefix}'" for prefix in _strprefixes | _fstring_prefixes} - | {f'{prefix}"' for prefix in _strprefixes | _fstring_prefixes} -) -fstring_prefix: Final = ( - {f"{prefix}'" for prefix in _fstring_prefixes} - | {f'{prefix}"' for prefix in _fstring_prefixes} - | {f"{prefix}'''" for prefix in _fstring_prefixes} - | {f'{prefix}"""' for prefix in _fstring_prefixes} -) - -tabsize = 8 - - -class TokenError(Exception): - pass - +class TokenError(Exception): ... 
+ +def token_type(token: pytokens.Token, source: str) -> int: + tok_type = TOKEN_TYPE_MAP[token.type] + if tok_type == NAME: + if source == "async": + return ASYNC + + if source == "await": + return AWAIT + + return tok_type + +def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo]: + lines = source.split("\n") + lines += [""] # For newline tokens in files that don't end in a newline + line, column = 1, 0 + try: + for token in pytokens.tokenize(source): + line, column = token.start_line, token.start_col + if token.type == TokenType.whitespace: + continue -class StopTokenizing(Exception): - pass + token_string = source[token.start_index:token.end_index] + + if token.type == TokenType.newline and token_string == '': + # Black doesn't yield empty newline tokens at the end of a file + # if there's no newline at the end of a file. + continue + source_line = lines[token.start_line - 1] -Coord = tuple[int, int] + if token.type == TokenType.op and token_string == "...": + # Black doesn't have an ellipsis token yet, yield 3 DOTs instead + assert token.start_line == token.end_line + assert token.end_col == token.start_col + 3 + token_string = "." + for start_col in range(token.start_col, token.start_col + 3): + end_col = start_col + 1 + yield (token_type(token, token_string), token_string, (token.start_line, start_col), (token.end_line, end_col), source_line) + else: + yield (token_type(token, token_string), token_string, (token.start_line, token.start_col), (token.end_line, token.end_col), source_line) + except Exception as exc: # TODO: + raise TokenError(repr(exc), (line, column)) def printtoken( type: int, token: str, srow_col: Coord, erow_col: Coord, line: str @@ -251,864 +157,11 @@ def printtoken( "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token)) ) - -TokenEater = Callable[[int, str, Coord, Coord, str], None] - - -def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None: - """ - The tokenize() function accepts two parameters: one representing the - input stream, and one providing an output mechanism for tokenize(). - - The first parameter, readline, must be a callable object which provides - the same interface as the readline() method of built-in file objects. - Each call to the function should return one line of input as a string. - - The second parameter, tokeneater, must also be a callable object. It is - called once for each token, with five arguments, corresponding to the - tuples generated by generate_tokens(). 
- """ - try: - tokenize_loop(readline, tokeneater) - except StopTokenizing: - pass - - -# backwards compatible interface -def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None: - for token_info in generate_tokens(readline): - tokeneater(*token_info) - - -GoodTokenInfo = tuple[int, str, Coord, Coord, str] -TokenInfo = Union[tuple[int, str], GoodTokenInfo] - - -class Untokenizer: - tokens: list[str] - prev_row: int - prev_col: int - - def __init__(self) -> None: - self.tokens = [] - self.prev_row = 1 - self.prev_col = 0 - - def add_whitespace(self, start: Coord) -> None: - row, col = start - assert row <= self.prev_row - col_offset = col - self.prev_col - if col_offset: - self.tokens.append(" " * col_offset) - - def untokenize(self, iterable: Iterable[TokenInfo]) -> str: - for t in iterable: - if len(t) == 2: - self.compat(t, iterable) - break - tok_type, token, start, end, line = t - self.add_whitespace(start) - self.tokens.append(token) - self.prev_row, self.prev_col = end - if tok_type in (NEWLINE, NL): - self.prev_row += 1 - self.prev_col = 0 - return "".join(self.tokens) - - def compat(self, token: tuple[int, str], iterable: Iterable[TokenInfo]) -> None: - startline = False - indents = [] - toks_append = self.tokens.append - toknum, tokval = token - if toknum in (NAME, NUMBER): - tokval += " " - if toknum in (NEWLINE, NL): - startline = True - for tok in iterable: - toknum, tokval = tok[:2] - - if toknum in (NAME, NUMBER, ASYNC, AWAIT): - tokval += " " - - if toknum == INDENT: - indents.append(tokval) - continue - elif toknum == DEDENT: - indents.pop() - continue - elif toknum in (NEWLINE, NL): - startline = True - elif startline and indents: - toks_append(indents[-1]) - startline = False - toks_append(tokval) - - -cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII) -blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII) - - -def _get_normal_name(orig_enc: str) -> str: - """Imitates get_normal_name in tokenizer.c.""" - # Only care about the first 12 characters. - enc = orig_enc[:12].lower().replace("_", "-") - if enc == "utf-8" or enc.startswith("utf-8-"): - return "utf-8" - if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith( - ("latin-1-", "iso-8859-1-", "iso-latin-1-") - ): - return "iso-8859-1" - return orig_enc - - -def detect_encoding(readline: Callable[[], bytes]) -> tuple[str, list[bytes]]: - """ - The detect_encoding() function is used to detect the encoding that should - be used to decode a Python source file. It requires one argument, readline, - in the same way as the tokenize() generator. - - It will call readline a maximum of twice, and return the encoding used - (as a string) and a list of any lines (left as bytes) it has read - in. - - It detects the encoding from the presence of a utf-8 bom or an encoding - cookie as specified in pep-0263. If both a bom and a cookie are present, but - disagree, a SyntaxError will be raised. If the encoding cookie is an invalid - charset, raise a SyntaxError. Note that if a utf-8 bom is found, - 'utf-8-sig' is returned. - - If no encoding is specified, then the default of 'utf-8' will be returned. 
- """ - bom_found = False - encoding = None - default = "utf-8" - - def read_or_stop() -> bytes: - try: - return readline() - except StopIteration: - return b"" - - def find_cookie(line: bytes) -> Optional[str]: - try: - line_string = line.decode("ascii") - except UnicodeDecodeError: - return None - match = cookie_re.match(line_string) - if not match: - return None - encoding = _get_normal_name(match.group(1)) - try: - codec = lookup(encoding) - except LookupError: - # This behaviour mimics the Python interpreter - raise SyntaxError("unknown encoding: " + encoding) - - if bom_found: - if codec.name != "utf-8": - # This behaviour mimics the Python interpreter - raise SyntaxError("encoding problem: utf-8") - encoding += "-sig" - return encoding - - first = read_or_stop() - if first.startswith(BOM_UTF8): - bom_found = True - first = first[3:] - default = "utf-8-sig" - if not first: - return default, [] - - encoding = find_cookie(first) - if encoding: - return encoding, [first] - if not blank_re.match(first): - return default, [first] - - second = read_or_stop() - if not second: - return default, [first] - - encoding = find_cookie(second) - if encoding: - return encoding, [first, second] - - return default, [first, second] - - -def untokenize(iterable: Iterable[TokenInfo]) -> str: - """Transform tokens back into Python source code. - - Each element returned by the iterable must be a token sequence - with at least two elements, a token number and token value. If - only two tokens are passed, the resulting output is poor. - - Round-trip invariant for full input: - Untokenized source will match input source exactly - - Round-trip invariant for limited input: - # Output text will tokenize the back to the input - t1 = [tok[:2] for tok in generate_tokens(f.readline)] - newcode = untokenize(t1) - readline = iter(newcode.splitlines(1)).next - t2 = [tok[:2] for tokin generate_tokens(readline)] - assert t1 == t2 - """ - ut = Untokenizer() - return ut.untokenize(iterable) - - -def is_fstring_start(token: str) -> bool: - return builtins.any(token.startswith(prefix) for prefix in fstring_prefix) - - -def _split_fstring_start_and_middle(token: str) -> tuple[str, str]: - for prefix in fstring_prefix: - _, prefix, rest = token.partition(prefix) - if prefix != "": - return prefix, rest - - raise ValueError(f"Token {token!r} is not a valid f-string start") - - -STATE_NOT_FSTRING: Final = 0 # not in an f-string -STATE_MIDDLE: Final = 1 # in the string portion of an f-string (outside braces) -STATE_IN_BRACES: Final = 2 # between braces in an f-string -# in the format specifier (between the colon and the closing brace) -STATE_IN_COLON: Final = 3 - - -class FStringState: - """Keeps track of state around f-strings. - - The tokenizer should call the appropriate method on this class when - it transitions to a different part of an f-string. This is needed - because the tokenization depends on knowing where exactly we are in - the f-string. 
- - For example, consider the following f-string: - - f"a{1:b{2}c}d" - - The following is the tokenization of this string and the states - tracked by this class: - - 1,0-1,2: FSTRING_START 'f"' # [STATE_NOT_FSTRING, STATE_MIDDLE] - 1,2-1,3: FSTRING_MIDDLE 'a' - 1,3-1,4: LBRACE '{' # [STATE_NOT_FSTRING, STATE_IN_BRACES] - 1,4-1,5: NUMBER '1' - 1,5-1,6: OP ':' # [STATE_NOT_FSTRING, STATE_IN_COLON] - 1,6-1,7: FSTRING_MIDDLE 'b' - 1,7-1,8: LBRACE '{' # [STATE_NOT_FSTRING, STATE_IN_COLON, STATE_IN_BRACES] - 1,8-1,9: NUMBER '2' - 1,9-1,10: RBRACE '}' # [STATE_NOT_FSTRING, STATE_IN_COLON] - 1,10-1,11: FSTRING_MIDDLE 'c' - 1,11-1,12: RBRACE '}' # [STATE_NOT_FSTRING, STATE_MIDDLE] - 1,12-1,13: FSTRING_MIDDLE 'd' - 1,13-1,14: FSTRING_END '"' # [STATE_NOT_FSTRING] - 1,14-1,15: NEWLINE '\n' - 2,0-2,0: ENDMARKER '' - - Notice that the nested braces in the format specifier are represented - by adding a STATE_IN_BRACES entry to the state stack. The stack is - also used if there are nested f-strings. - - """ - - def __init__(self) -> None: - self.stack: list[int] = [STATE_NOT_FSTRING] - - def is_in_fstring_expression(self) -> bool: - return self.stack[-1] not in (STATE_MIDDLE, STATE_NOT_FSTRING) - - def current(self) -> int: - return self.stack[-1] - - def enter_fstring(self) -> None: - self.stack.append(STATE_MIDDLE) - - def leave_fstring(self) -> None: - state = self.stack.pop() - assert state == STATE_MIDDLE - - def consume_lbrace(self) -> None: - current_state = self.stack[-1] - if current_state == STATE_MIDDLE: - self.stack[-1] = STATE_IN_BRACES - elif current_state == STATE_IN_COLON: - self.stack.append(STATE_IN_BRACES) - else: - assert False, current_state - - def consume_rbrace(self) -> None: - current_state = self.stack[-1] - assert current_state in (STATE_IN_BRACES, STATE_IN_COLON) - if len(self.stack) > 1 and self.stack[-2] == STATE_IN_COLON: - self.stack.pop() - else: - self.stack[-1] = STATE_MIDDLE - - def consume_colon(self) -> None: - assert self.stack[-1] == STATE_IN_BRACES, self.stack - self.stack[-1] = STATE_IN_COLON - - -def generate_tokens( - readline: Callable[[], str], grammar: Optional[Grammar] = None -) -> Iterator[GoodTokenInfo]: - """ - The generate_tokens() generator requires one argument, readline, which - must be a callable object which provides the same interface as the - readline() method of built-in file objects. Each call to the function - should return one line of input as a string. Alternately, readline - can be a callable function terminating with StopIteration: - readline = open(myfile).next # Example of alternate readline - - The generator produces 5-tuples with these members: the token type; the - token string; a 2-tuple (srow, scol) of ints specifying the row and - column where the token begins in the source; a 2-tuple (erow, ecol) of - ints specifying the row and column where the token ends in the source; - and the line on which the token was found. The line passed is the - logical line; continuation lines are included. - """ - lnum = parenlev = continued = 0 - parenlev_stack: list[int] = [] - fstring_state = FStringState() - formatspec = "" - numchars: Final[str] = "0123456789" - contstr, needcont = "", 0 - contline: Optional[str] = None - indents = [0] - - # If we know we're parsing 3.7+, we can unconditionally parse `async` and - # `await` as keywords. 
- async_keywords = False if grammar is None else grammar.async_keywords - # 'stashed' and 'async_*' are used for async/await parsing - stashed: Optional[GoodTokenInfo] = None - async_def = False - async_def_indent = 0 - async_def_nl = False - - strstart: tuple[int, int] - endprog_stack: list[Pattern[str]] = [] - formatspec_start: tuple[int, int] - - while 1: # loop over lines in stream - try: - line = readline() - except StopIteration: - line = "" - lnum += 1 - - # skip lines that are just indent characters ending with a slash - # to avoid storing that line's indent information. - if not contstr and line.rstrip("\n").strip(" \t\f") == "\\": - continue - - pos, max = 0, len(line) - - if contstr: # continued string - assert contline is not None - if not line: - raise TokenError("EOF in multi-line string", strstart) - endprog = endprog_stack[-1] - endmatch = endprog.match(line) - if endmatch: - end = endmatch.end(0) - token = contstr + line[:end] - spos = strstart - epos = (lnum, end) - tokenline = contline + line - if fstring_state.current() in ( - STATE_NOT_FSTRING, - STATE_IN_BRACES, - ) and not is_fstring_start(token): - yield (STRING, token, spos, epos, tokenline) - endprog_stack.pop() - parenlev = parenlev_stack.pop() - else: - if is_fstring_start(token): - fstring_start, token = _split_fstring_start_and_middle(token) - fstring_start_epos = (spos[0], spos[1] + len(fstring_start)) - yield ( - FSTRING_START, - fstring_start, - spos, - fstring_start_epos, - tokenline, - ) - fstring_state.enter_fstring() - # increase spos to the end of the fstring start - spos = fstring_start_epos - - if token.endswith("{"): - fstring_middle, lbrace = token[:-1], token[-1] - fstring_middle_epos = lbrace_spos = (lnum, end - 1) - yield ( - FSTRING_MIDDLE, - fstring_middle, - spos, - fstring_middle_epos, - line, - ) - yield (LBRACE, lbrace, lbrace_spos, epos, line) - fstring_state.consume_lbrace() - else: - if token.endswith(('"""', "'''")): - fstring_middle, fstring_end = token[:-3], token[-3:] - fstring_middle_epos = end_spos = (lnum, end - 3) - else: - fstring_middle, fstring_end = token[:-1], token[-1] - fstring_middle_epos = end_spos = (lnum, end - 1) - yield ( - FSTRING_MIDDLE, - fstring_middle, - spos, - fstring_middle_epos, - line, - ) - yield ( - FSTRING_END, - fstring_end, - end_spos, - epos, - line, - ) - fstring_state.leave_fstring() - endprog_stack.pop() - parenlev = parenlev_stack.pop() - pos = end - contstr, needcont = "", 0 - contline = None - elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n": - yield ( - ERRORTOKEN, - contstr + line, - strstart, - (lnum, len(line)), - contline, - ) - contstr = "" - contline = None - continue - else: - contstr = contstr + line - contline = contline + line - continue - - # new statement - elif ( - parenlev == 0 - and not continued - and not fstring_state.is_in_fstring_expression() - ): - if not line: - break - column = 0 - while pos < max: # measure leading whitespace - if line[pos] == " ": - column += 1 - elif line[pos] == "\t": - column = (column // tabsize + 1) * tabsize - elif line[pos] == "\f": - column = 0 - else: - break - pos += 1 - if pos == max: - break - - if stashed: - yield stashed - stashed = None - - if line[pos] in "\r\n": # skip blank lines - yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line) - continue - - if line[pos] == "#": # skip comments - comment_token = line[pos:].rstrip("\r\n") - nl_pos = pos + len(comment_token) - yield ( - COMMENT, - comment_token, - (lnum, pos), - (lnum, nl_pos), - line, - ) - yield (NL, 
line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line) - continue - - if column > indents[-1]: # count indents - indents.append(column) - yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) - - while column < indents[-1]: # count dedents - if column not in indents: - raise IndentationError( - "unindent does not match any outer indentation level", - ("", lnum, pos, line), - ) - indents = indents[:-1] - - if async_def and async_def_indent >= indents[-1]: - async_def = False - async_def_nl = False - async_def_indent = 0 - - yield (DEDENT, "", (lnum, pos), (lnum, pos), line) - - if async_def and async_def_nl and async_def_indent >= indents[-1]: - async_def = False - async_def_nl = False - async_def_indent = 0 - - else: # continued statement - if not line: - raise TokenError("EOF in multi-line statement", (lnum, 0)) - continued = 0 - - while pos < max: - if fstring_state.current() == STATE_MIDDLE: - endprog = endprog_stack[-1] - endmatch = endprog.match(line, pos) - if endmatch: # all on one line - start, end = endmatch.span(0) - token = line[start:end] - if token.endswith(('"""', "'''")): - middle_token, end_token = token[:-3], token[-3:] - middle_epos = end_spos = (lnum, end - 3) - else: - middle_token, end_token = token[:-1], token[-1] - middle_epos = end_spos = (lnum, end - 1) - # TODO: unsure if this can be safely removed - if stashed: - yield stashed - stashed = None - yield ( - FSTRING_MIDDLE, - middle_token, - (lnum, pos), - middle_epos, - line, - ) - if not token.endswith("{"): - yield ( - FSTRING_END, - end_token, - end_spos, - (lnum, end), - line, - ) - fstring_state.leave_fstring() - endprog_stack.pop() - parenlev = parenlev_stack.pop() - else: - yield (LBRACE, "{", (lnum, end - 1), (lnum, end), line) - fstring_state.consume_lbrace() - pos = end - continue - else: # multiple lines - strstart = (lnum, end) - contstr = line[end:] - contline = line - break - - if fstring_state.current() == STATE_IN_COLON: - match = fstring_middle_after_colon.match(line, pos) - if match is None: - formatspec += line[pos:] - pos = max - continue - - start, end = match.span(1) - token = line[start:end] - formatspec += token - - brace_start, brace_end = match.span(2) - brace_or_nl = line[brace_start:brace_end] - if brace_or_nl == "\n": - pos = brace_end - - yield (FSTRING_MIDDLE, formatspec, formatspec_start, (lnum, end), line) - formatspec = "" - - if brace_or_nl == "{": - yield (LBRACE, "{", (lnum, brace_start), (lnum, brace_end), line) - fstring_state.consume_lbrace() - end = brace_end - elif brace_or_nl == "}": - yield (RBRACE, "}", (lnum, brace_start), (lnum, brace_end), line) - fstring_state.consume_rbrace() - end = brace_end - formatspec_start = (lnum, brace_end) - - pos = end - continue - - if fstring_state.current() == STATE_IN_BRACES and parenlev == 0: - match = bang.match(line, pos) - if match: - start, end = match.span(1) - yield (OP, "!", (lnum, start), (lnum, end), line) - pos = end - continue - - match = colon.match(line, pos) - if match: - start, end = match.span(1) - yield (OP, ":", (lnum, start), (lnum, end), line) - fstring_state.consume_colon() - formatspec_start = (lnum, end) - pos = end - continue - - pseudomatch = pseudoprog.match(line, pos) - if pseudomatch: # scan for tokens - start, end = pseudomatch.span(1) - spos, epos, pos = (lnum, start), (lnum, end), end - token, initial = line[start:end], line[start] - - if initial in numchars or ( - initial == "." and token != "." 
- ): # ordinary number - yield (NUMBER, token, spos, epos, line) - elif initial in "\r\n": - newline = NEWLINE - if parenlev > 0 or fstring_state.is_in_fstring_expression(): - newline = NL - elif async_def: - async_def_nl = True - if stashed: - yield stashed - stashed = None - yield (newline, token, spos, epos, line) - - elif initial == "#": - assert not token.endswith("\n") - if stashed: - yield stashed - stashed = None - yield (COMMENT, token, spos, epos, line) - elif token in triple_quoted: - endprog = endprogs[token] - endprog_stack.append(endprog) - parenlev_stack.append(parenlev) - parenlev = 0 - if is_fstring_start(token): - yield (FSTRING_START, token, spos, epos, line) - fstring_state.enter_fstring() - - endmatch = endprog.match(line, pos) - if endmatch: # all on one line - if stashed: - yield stashed - stashed = None - if not is_fstring_start(token): - pos = endmatch.end(0) - token = line[start:pos] - epos = (lnum, pos) - yield (STRING, token, spos, epos, line) - endprog_stack.pop() - parenlev = parenlev_stack.pop() - else: - end = endmatch.end(0) - token = line[pos:end] - spos, epos = (lnum, pos), (lnum, end) - if not token.endswith("{"): - fstring_middle, fstring_end = token[:-3], token[-3:] - fstring_middle_epos = fstring_end_spos = (lnum, end - 3) - yield ( - FSTRING_MIDDLE, - fstring_middle, - spos, - fstring_middle_epos, - line, - ) - yield ( - FSTRING_END, - fstring_end, - fstring_end_spos, - epos, - line, - ) - fstring_state.leave_fstring() - endprog_stack.pop() - parenlev = parenlev_stack.pop() - else: - fstring_middle, lbrace = token[:-1], token[-1] - fstring_middle_epos = lbrace_spos = (lnum, end - 1) - yield ( - FSTRING_MIDDLE, - fstring_middle, - spos, - fstring_middle_epos, - line, - ) - yield (LBRACE, lbrace, lbrace_spos, epos, line) - fstring_state.consume_lbrace() - pos = end - else: - # multiple lines - if is_fstring_start(token): - strstart = (lnum, pos) - contstr = line[pos:] - else: - strstart = (lnum, start) - contstr = line[start:] - contline = line - break - elif ( - initial in single_quoted - or token[:2] in single_quoted - or token[:3] in single_quoted - ): - maybe_endprog = ( - endprogs.get(initial) - or endprogs.get(token[:2]) - or endprogs.get(token[:3]) - ) - assert maybe_endprog is not None, f"endprog not found for {token}" - endprog = maybe_endprog - if token[-1] == "\n": # continued string - endprog_stack.append(endprog) - parenlev_stack.append(parenlev) - parenlev = 0 - strstart = (lnum, start) - contstr, needcont = line[start:], 1 - contline = line - break - else: # ordinary string - if stashed: - yield stashed - stashed = None - - if not is_fstring_start(token): - yield (STRING, token, spos, epos, line) - else: - if pseudomatch[20] is not None: - fstring_start = pseudomatch[20] - offset = pseudomatch.end(20) - pseudomatch.start(1) - elif pseudomatch[22] is not None: - fstring_start = pseudomatch[22] - offset = pseudomatch.end(22) - pseudomatch.start(1) - elif pseudomatch[24] is not None: - fstring_start = pseudomatch[24] - offset = pseudomatch.end(24) - pseudomatch.start(1) - else: - fstring_start = pseudomatch[26] - offset = pseudomatch.end(26) - pseudomatch.start(1) - - start_epos = (lnum, start + offset) - yield (FSTRING_START, fstring_start, spos, start_epos, line) - fstring_state.enter_fstring() - endprog = endprogs[fstring_start] - endprog_stack.append(endprog) - parenlev_stack.append(parenlev) - parenlev = 0 - - end_offset = pseudomatch.end(1) - 1 - fstring_middle = line[start + offset : end_offset] - middle_spos = (lnum, start + offset) 
- middle_epos = (lnum, end_offset) - yield ( - FSTRING_MIDDLE, - fstring_middle, - middle_spos, - middle_epos, - line, - ) - if not token.endswith("{"): - end_spos = (lnum, end_offset) - end_epos = (lnum, end_offset + 1) - yield (FSTRING_END, token[-1], end_spos, end_epos, line) - fstring_state.leave_fstring() - endprog_stack.pop() - parenlev = parenlev_stack.pop() - else: - end_spos = (lnum, end_offset) - end_epos = (lnum, end_offset + 1) - yield (LBRACE, "{", end_spos, end_epos, line) - fstring_state.consume_lbrace() - - elif initial.isidentifier(): # ordinary name - if token in ("async", "await"): - if async_keywords or async_def: - yield ( - ASYNC if token == "async" else AWAIT, - token, - spos, - epos, - line, - ) - continue - - tok = (NAME, token, spos, epos, line) - if token == "async" and not stashed: - stashed = tok - continue - - if token in ("def", "for"): - if stashed and stashed[0] == NAME and stashed[1] == "async": - if token == "def": - async_def = True - async_def_indent = indents[-1] - - yield ( - ASYNC, - stashed[1], - stashed[2], - stashed[3], - stashed[4], - ) - stashed = None - - if stashed: - yield stashed - stashed = None - - yield tok - elif initial == "\\": # continued stmt - # This yield is new; needed for better idempotency: - if stashed: - yield stashed - stashed = None - yield (NL, token, spos, (lnum, pos), line) - continued = 1 - elif ( - initial == "}" - and parenlev == 0 - and fstring_state.is_in_fstring_expression() - ): - yield (RBRACE, token, spos, epos, line) - fstring_state.consume_rbrace() - formatspec_start = epos - else: - if initial in "([{": - parenlev += 1 - elif initial in ")]}": - parenlev -= 1 - if stashed: - yield stashed - stashed = None - yield (OP, token, spos, epos, line) - else: - yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line) - pos += 1 - - if stashed: - yield stashed - stashed = None - - for _indent in indents[1:]: # pop remaining indent levels - yield (DEDENT, "", (lnum, 0), (lnum, 0), "") - yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "") - assert len(endprog_stack) == 0 - assert len(parenlev_stack) == 0 - - if __name__ == "__main__": # testing if len(sys.argv) > 1: - tokenize(open(sys.argv[1]).readline) + token_iterator = tokenize(open(sys.argv[1]).read()) else: - tokenize(sys.stdin.readline) + token_iterator = tokenize(sys.stdin.read()) + + for tok in token_iterator: + printtoken(*tok) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 71773069546..253caf99d02 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -8,6 +8,7 @@ import black from blib2to3.pgen2 import token, tokenize +token._name @dataclass class Token: @@ -19,16 +20,7 @@ class Token: def get_tokens(text: str) -> list[Token]: """Return the tokens produced by the tokenizer.""" - readline = io.StringIO(text).readline - tokens: list[Token] = [] - - def tokeneater( - type: int, string: str, start: tokenize.Coord, end: tokenize.Coord, line: str - ) -> None: - tokens.append(Token(token.tok_name[type], string, start, end)) - - tokenize.tokenize(readline, tokeneater) - return tokens + return [Token(token.tok_name[tok_type], string, start, end) for tok_type, string, start, end, _ in tokenize.tokenize(text)] def assert_tokenizes(text: str, tokens: list[Token]) -> None: @@ -70,9 +62,9 @@ def test_fstring() -> None: [ Token("FSTRING_START", 'f"', (1, 0), (1, 2)), Token("FSTRING_MIDDLE", "", (1, 2), (1, 2)), - Token("LBRACE", "{", (1, 2), (1, 3)), + Token("OP", "{", (1, 2), (1, 3)), Token("NAME", "x", (1, 3), (1, 4)), - 
Token("RBRACE", "}", (1, 4), (1, 5)), + Token("OP", "}", (1, 4), (1, 5)), Token("FSTRING_MIDDLE", "", (1, 5), (1, 5)), Token("FSTRING_END", '"', (1, 5), (1, 6)), Token("ENDMARKER", "", (2, 0), (2, 0)), From 1174fbc7d6a2940544fbc08755349c8a50e9e987 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 22 Dec 2024 14:50:28 +0000 Subject: [PATCH 02/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/blib2to3/pgen2/tokenize.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 45ff598c89e..d4c4c149284 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -58,13 +58,12 @@ __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro" import re -from codecs import BOM_UTF8, lookup import token +from codecs import BOM_UTF8, lookup import pytokens from pytokens import TokenType - from . import token __all__ = [x for x in dir(token) if x[0] != "_"] + [ @@ -109,10 +108,10 @@ def token_type(token: pytokens.Token, source: str) -> int: if tok_type == NAME: if source == "async": return ASYNC - + if source == "await": return AWAIT - + return tok_type def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo]: @@ -126,7 +125,7 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] continue token_string = source[token.start_index:token.end_index] - + if token.type == TokenType.newline and token_string == '': # Black doesn't yield empty newline tokens at the end of a file # if there's no newline at the end of a file. @@ -145,7 +144,7 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] yield (token_type(token, token_string), token_string, (token.start_line, start_col), (token.end_line, end_col), source_line) else: yield (token_type(token, token_string), token_string, (token.start_line, token.start_col), (token.end_line, token.end_col), source_line) - except Exception as exc: # TODO: + except Exception as exc: # TODO: raise TokenError(repr(exc), (line, column)) def printtoken( @@ -162,6 +161,6 @@ def printtoken( token_iterator = tokenize(open(sys.argv[1]).read()) else: token_iterator = tokenize(sys.stdin.read()) - + for tok in token_iterator: - printtoken(*tok) + printtoken(*tok) From e5d412b8c89483386236f4cee85e6e1209c9f31c Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Sun, 22 Dec 2024 20:23:35 +0530 Subject: [PATCH 03/24] formatting --- src/blib2to3/pgen2/driver.py | 4 +--- src/blib2to3/pgen2/tokenize.py | 28 ++++++++++++++++++++++------ tests/test_tokenize.py | 6 ++++-- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/src/blib2to3/pgen2/driver.py b/src/blib2to3/pgen2/driver.py index 7fc72764808..056fab2127b 100644 --- a/src/blib2to3/pgen2/driver.py +++ b/src/blib2to3/pgen2/driver.py @@ -190,9 +190,7 @@ def parse_file( def parse_string(self, text: str, debug: bool = False) -> NL: """Parse a string and return the syntax tree.""" - tokens = tokenize.tokenize( - text, grammar=self.grammar - ) + tokens = tokenize.tokenize(text, grammar=self.grammar) return self.parse_tokens(tokens, debug) def _partially_consume_prefix(self, prefix: str, column: int) -> tuple[str, str]: diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index d4c4c149284..a1588fe045b 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ 
b/src/blib2to3/pgen2/tokenize.py @@ -97,12 +97,13 @@ TokenType.fstring_start: FSTRING_START, TokenType.fstring_middle: FSTRING_MIDDLE, TokenType.fstring_end: FSTRING_END, - TokenType.endmarker: ENDMARKER - + TokenType.endmarker: ENDMARKER, } + class TokenError(Exception): ... + def token_type(token: pytokens.Token, source: str) -> int: tok_type = TOKEN_TYPE_MAP[token.type] if tok_type == NAME: @@ -114,6 +115,7 @@ def token_type(token: pytokens.Token, source: str) -> int: return tok_type + def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo]: lines = source.split("\n") lines += [""] # For newline tokens in files that don't end in a newline @@ -124,9 +126,9 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] if token.type == TokenType.whitespace: continue - token_string = source[token.start_index:token.end_index] + token_string = source[token.start_index : token.end_index] - if token.type == TokenType.newline and token_string == '': + if token.type == TokenType.newline and token_string == "": # Black doesn't yield empty newline tokens at the end of a file # if there's no newline at the end of a file. continue @@ -141,12 +143,25 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] token_string = "." for start_col in range(token.start_col, token.start_col + 3): end_col = start_col + 1 - yield (token_type(token, token_string), token_string, (token.start_line, start_col), (token.end_line, end_col), source_line) + yield ( + token_type(token, token_string), + token_string, + (token.start_line, start_col), + (token.end_line, end_col), + source_line, + ) else: - yield (token_type(token, token_string), token_string, (token.start_line, token.start_col), (token.end_line, token.end_col), source_line) + yield ( + token_type(token, token_string), + token_string, + (token.start_line, token.start_col), + (token.end_line, token.end_col), + source_line, + ) except Exception as exc: # TODO: raise TokenError(repr(exc), (line, column)) + def printtoken( type: int, token: str, srow_col: Coord, erow_col: Coord, line: str ) -> None: # for testing @@ -156,6 +171,7 @@ def printtoken( "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token)) ) + if __name__ == "__main__": # testing if len(sys.argv) > 1: token_iterator = tokenize(open(sys.argv[1]).read()) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 253caf99d02..052cbb6c58d 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -8,7 +8,6 @@ import black from blib2to3.pgen2 import token, tokenize -token._name @dataclass class Token: @@ -20,7 +19,10 @@ class Token: def get_tokens(text: str) -> list[Token]: """Return the tokens produced by the tokenizer.""" - return [Token(token.tok_name[tok_type], string, start, end) for tok_type, string, start, end, _ in tokenize.tokenize(text)] + return [ + Token(token.tok_name[tok_type], string, start, end) + for tok_type, string, start, end, _ in tokenize.tokenize(text) + ] def assert_tokenizes(text: str, tokens: list[Token]) -> None: From 920445f4d62623e4221a023197883b4a7a5b5cab Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 23 Dec 2024 01:01:13 +0530 Subject: [PATCH 04/24] wip --- src/blib2to3/pgen2/tokenize.py | 85 +++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 31 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index a1588fe045b..b427b788941 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ 
b/src/blib2to3/pgen2/tokenize.py @@ -27,11 +27,8 @@ function to which the 5 fields described above are passed as 5 arguments, each time a new token is found.""" -import builtins import sys -from collections.abc import Callable, Iterable, Iterator -from re import Pattern -from typing import Final, Optional, Union +from collections.abc import Iterator from blib2to3.pgen2.grammar import Grammar from blib2to3.pgen2.token import ( @@ -104,60 +101,86 @@ class TokenError(Exception): ... -def token_type(token: pytokens.Token, source: str) -> int: - tok_type = TOKEN_TYPE_MAP[token.type] - if tok_type == NAME: - if source == "async": - return ASYNC - - if source == "await": - return AWAIT - - return tok_type - - def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo]: + async_keywords = False if grammar is None else grammar.async_keywords + lines = source.split("\n") lines += [""] # For newline tokens in files that don't end in a newline line, column = 1, 0 + + token_iterator = pytokens.tokenize(source) try: - for token in pytokens.tokenize(source): + for token in token_iterator: line, column = token.start_line, token.start_col if token.type == TokenType.whitespace: continue - token_string = source[token.start_index : token.end_index] + token_str = source[token.start_index : token.end_index] - if token.type == TokenType.newline and token_string == "": + if token.type == TokenType.newline and token_str == "": # Black doesn't yield empty newline tokens at the end of a file # if there's no newline at the end of a file. continue source_line = lines[token.start_line - 1] - if token.type == TokenType.op and token_string == "...": + if token.type == TokenType.identifier and token_str in ("async", "await"): + # Black uses `async` and `await` token types just for those two keywords + while True: + next_token = next(token_iterator) + if next_token.type == TokenType.whitespace: + continue + break + + next_token_type = TOKEN_TYPE_MAP[next_token.type] + next_str = source[next_token.start_index : next_token.end_index] + next_line = lines[next_token.start_line - 1] + + if next_token_type == NAME and next_str in ("def", "for"): + current_token_type= ASYNC if token_str == "async" else AWAIT + else: + current_token_type = TOKEN_TYPE_MAP[token.type] + + yield ( + current_token_type, + token_str, + (token.start_line, token.start_col), + (token.end_line, token.end_col), + source_line, + ) + yield ( + next_token_type, + next_str, + (next_token.start_line, next_token.start_col), + (next_token.end_line, next_token.end_col), + next_line, + ) + continue + + if token.type == TokenType.op and token_str == "...": # Black doesn't have an ellipsis token yet, yield 3 DOTs instead assert token.start_line == token.end_line assert token.end_col == token.start_col + 3 - token_string = "." + token_str = "." 
for start_col in range(token.start_col, token.start_col + 3): end_col = start_col + 1 yield ( - token_type(token, token_string), - token_string, + TOKEN_TYPE_MAP[token.type], + token_str, (token.start_line, start_col), (token.end_line, end_col), source_line, ) - else: - yield ( - token_type(token, token_string), - token_string, - (token.start_line, token.start_col), - (token.end_line, token.end_col), - source_line, - ) + continue + + yield ( + TOKEN_TYPE_MAP[token.type], + token_str, + (token.start_line, token.start_col), + (token.end_line, token.end_col), + source_line, + ) except Exception as exc: # TODO: raise TokenError(repr(exc), (line, column)) From 2fb18aa55e17510799dec87a1e18941e8e8b8416 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 23 Dec 2024 21:22:06 +0530 Subject: [PATCH 05/24] add async/await keyword support --- src/blib2to3/pgen2/tokenize.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index b427b788941..803eba10644 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -109,6 +109,9 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] line, column = 1, 0 token_iterator = pytokens.tokenize(source) + is_async = False + current_indent = 0 + async_indent = 0 try: for token in token_iterator: line, column = token.start_line, token.start_col @@ -122,6 +125,13 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] # if there's no newline at the end of a file. continue + if token.type == TokenType.indent: + current_indent += 1 + if token.type == TokenType.dedent: + current_indent -= 1 + if is_async and current_indent < async_indent: + is_async = False + source_line = lines[token.start_line - 1] if token.type == TokenType.identifier and token_str in ("async", "await"): @@ -131,13 +141,21 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] if next_token.type == TokenType.whitespace: continue break - + next_token_type = TOKEN_TYPE_MAP[next_token.type] next_str = source[next_token.start_index : next_token.end_index] next_line = lines[next_token.start_line - 1] - - if next_token_type == NAME and next_str in ("def", "for"): - current_token_type= ASYNC if token_str == "async" else AWAIT + + if ( + token_str == "async" + and next_token_type == NAME + and next_str in ("def", "for") + ): + is_async = True + async_indent = current_indent + 1 + current_token_type = ASYNC + elif token_str == "await" and is_async: + current_token_type = AWAIT else: current_token_type = TOKEN_TYPE_MAP[token.type] From 0fbdd2be873faa2c0866132ad6aeee9b685d42e1 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 23 Dec 2024 22:31:28 +0530 Subject: [PATCH 06/24] typo fix --- src/blib2to3/pgen2/tokenize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 803eba10644..402b2ad5908 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -190,7 +190,7 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] (token.end_line, end_col), source_line, ) - continue + continue yield ( TOKEN_TYPE_MAP[token.type], From bc785af9c8a2c8c1fbfe048505f27d1ecf07e0a0 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 23 Dec 2024 22:52:56 +0530 Subject: [PATCH 07/24] fix `\\\n` handling --- src/blib2to3/pgen2/tokenize.py | 15 +++++++++++++++ 1 
file changed, 15 insertions(+) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 402b2ad5908..50847824126 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -101,6 +101,18 @@ class TokenError(Exception): ... +def transform_whitespace(token: pytokens.Token, source: str) -> pytokens.Token: + r""" + Black treats `\\\n` at the end of a line as a 'NL' token, while it + is ignored as whitespace in the regular Python parser. + """ + if token.type == TokenType.whitespace: + token_str = source[token.start_index : token.end_index] + if token_str.startswith("\\\n"): + return pytokens.Token(TokenType.nl, token.start_index, token.start_index + 2, token.start_line, token.start_col, token.start_line, token.start_col + 2) + + return token + def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo]: async_keywords = False if grammar is None else grammar.async_keywords @@ -114,6 +126,8 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] async_indent = 0 try: for token in token_iterator: + token = transform_whitespace(token, source) + line, column = token.start_line, token.start_col if token.type == TokenType.whitespace: continue @@ -138,6 +152,7 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] # Black uses `async` and `await` token types just for those two keywords while True: next_token = next(token_iterator) + next_token = transform_whitespace(next_token) if next_token.type == TokenType.whitespace: continue break From 945678c1d1148dac33936d6aaf4f6e4410fc42d9 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 23 Dec 2024 22:54:16 +0530 Subject: [PATCH 08/24] typo fix --- src/blib2to3/pgen2/tokenize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 50847824126..bebe5323e3a 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -152,13 +152,13 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] # Black uses `async` and `await` token types just for those two keywords while True: next_token = next(token_iterator) - next_token = transform_whitespace(next_token) + next_str = source[next_token.start_index : next_token.end_index] + next_token = transform_whitespace(next_token, next_str) if next_token.type == TokenType.whitespace: continue break next_token_type = TOKEN_TYPE_MAP[next_token.type] - next_str = source[next_token.start_index : next_token.end_index] next_line = lines[next_token.start_line - 1] if ( From 90dc1c72d99b96a5b9844be64844a605c941cd90 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Mon, 23 Dec 2024 22:56:08 +0530 Subject: [PATCH 09/24] fix form feed test --- tests/data/cases/form_feeds.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/data/cases/form_feeds.py b/tests/data/cases/form_feeds.py index 957b4a1db95..48ffc98106b 100644 --- a/tests/data/cases/form_feeds.py +++ b/tests/data/cases/form_feeds.py @@ -156,7 +156,6 @@ def something(self): # - # pass From 134f7b9247dc097709424d29246bc8265a6ed9a2 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Tue, 24 Dec 2024 02:00:07 +0530 Subject: [PATCH 10/24] fix EOFError case --- src/blib2to3/pgen2/tokenize.py | 2 +- tests/test_black.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index bebe5323e3a..581c98a07d1 100644 --- 
a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -215,7 +215,7 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] source_line, ) except Exception as exc: # TODO: - raise TokenError(repr(exc), (line, column)) + raise TokenError(str(exc), (line, column)) def printtoken( diff --git a/tests/test_black.py b/tests/test_black.py index 84061f10cdf..9793e12ec4d 100644 --- a/tests/test_black.py +++ b/tests/test_black.py @@ -1973,7 +1973,7 @@ def test_for_handled_unexpected_eof_error(self) -> None: with pytest.raises(black.parsing.InvalidInput) as exc_info: black.lib2to3_parse("print(", {}) - exc_info.match("Cannot parse: 2:0: EOF in multi-line statement") + exc_info.match("Cannot parse: 1:6: Unexpected EOF in multi-line statement") def test_line_ranges_with_code_option(self) -> None: code = textwrap.dedent("""\ From 7002effcc7c1cb273feb1c66323417491e0775fd Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Tue, 24 Dec 2024 02:24:55 +0530 Subject: [PATCH 11/24] fix async discrepancies --- src/blib2to3/pgen2/tokenize.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 581c98a07d1..b119f4b754b 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -109,10 +109,19 @@ def transform_whitespace(token: pytokens.Token, source: str) -> pytokens.Token: if token.type == TokenType.whitespace: token_str = source[token.start_index : token.end_index] if token_str.startswith("\\\n"): - return pytokens.Token(TokenType.nl, token.start_index, token.start_index + 2, token.start_line, token.start_col, token.start_line, token.start_col + 2) + return pytokens.Token( + TokenType.nl, + token.start_index, + token.start_index + 2, + token.start_line, + token.start_col, + token.start_line, + token.start_col + 2, + ) return token + def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo]: async_keywords = False if grammar is None else grammar.async_keywords @@ -161,15 +170,14 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] next_token_type = TOKEN_TYPE_MAP[next_token.type] next_line = lines[next_token.start_line - 1] - if ( - token_str == "async" - and next_token_type == NAME - and next_str in ("def", "for") + if token_str == "async" and ( + async_keywords + or (next_token_type == NAME and next_str in ("def", "for")) ): is_async = True async_indent = current_indent + 1 current_token_type = ASYNC - elif token_str == "await" and is_async: + elif token_str == "await" and (async_keywords or is_async): current_token_type = AWAIT else: current_token_type = TOKEN_TYPE_MAP[token.type] From b3500c4d7737130533ae33d072e2f99cc2c1fa41 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Tue, 24 Dec 2024 02:33:05 +0530 Subject: [PATCH 12/24] bump pytokens version --- pyproject.toml | 2 +- src/blib2to3/pgen2/tokenize.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 82e572bf35e..f4273a602fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,7 @@ dependencies = [ "packaging>=22.0", "pathspec>=0.9.0", "platformdirs>=2", - "pytokens>=0.1.2", + "pytokens>=0.1.3", "tomli>=1.1.0; python_version < '3.11'", "typing_extensions>=4.0.1; python_version < '3.11'", ] diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index b119f4b754b..a5de81af9f0 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ 
b/src/blib2to3/pgen2/tokenize.py @@ -222,8 +222,10 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] (token.end_line, token.end_col), source_line, ) - except Exception as exc: # TODO: - raise TokenError(str(exc), (line, column)) + except pytokens.UnexpectedEOF: + raise TokenError("Unexpected EOF in multi-line statement", (line, column)) + except pytokens.TokenizeError: + raise TokenError("TODO", (line, column)) def printtoken( From 51e7ce1d097243c2b8c765fbca5e86f3cf092059 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Tue, 24 Dec 2024 03:59:13 +0530 Subject: [PATCH 13/24] remove empty fstring middle tokens from test --- tests/data/miscellaneous/debug_visitor.out | 20 -------------------- tests/test_tokenize.py | 13 ++++--------- 2 files changed, 4 insertions(+), 29 deletions(-) diff --git a/tests/data/miscellaneous/debug_visitor.out b/tests/data/miscellaneous/debug_visitor.out index 24d7ed82472..a243ab72734 100644 --- a/tests/data/miscellaneous/debug_visitor.out +++ b/tests/data/miscellaneous/debug_visitor.out @@ -232,8 +232,6 @@ file_input fstring FSTRING_START "f'" - FSTRING_MIDDLE - '' fstring_replacement_field LBRACE '{' @@ -242,8 +240,6 @@ file_input RBRACE '}' /fstring_replacement_field - FSTRING_MIDDLE - '' fstring_replacement_field LBRACE '{' @@ -252,8 +248,6 @@ file_input RBRACE '}' /fstring_replacement_field - FSTRING_MIDDLE - '' FSTRING_END "'" /fstring @@ -399,8 +393,6 @@ file_input fstring FSTRING_START "f'" - FSTRING_MIDDLE - '' fstring_replacement_field LBRACE '{' @@ -419,8 +411,6 @@ file_input RBRACE '}' /fstring_replacement_field - FSTRING_MIDDLE - '' FSTRING_END "'" /fstring @@ -549,8 +539,6 @@ file_input fstring FSTRING_START "f'" - FSTRING_MIDDLE - '' fstring_replacement_field LBRACE '{' @@ -559,8 +547,6 @@ file_input RBRACE '}' /fstring_replacement_field - FSTRING_MIDDLE - '' fstring_replacement_field LBRACE '{' @@ -569,8 +555,6 @@ file_input RBRACE '}' /fstring_replacement_field - FSTRING_MIDDLE - '' FSTRING_END "'" /fstring @@ -660,8 +644,6 @@ file_input RBRACE '}' /fstring_replacement_field - FSTRING_MIDDLE - '' FSTRING_END "'" /fstring @@ -744,8 +726,6 @@ file_input RBRACE '}' /fstring_replacement_field - FSTRING_MIDDLE - '' FSTRING_END "'" /fstring diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 052cbb6c58d..92f929118d0 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -63,11 +63,9 @@ def test_fstring() -> None: 'f"{x}"', [ Token("FSTRING_START", 'f"', (1, 0), (1, 2)), - Token("FSTRING_MIDDLE", "", (1, 2), (1, 2)), Token("OP", "{", (1, 2), (1, 3)), Token("NAME", "x", (1, 3), (1, 4)), Token("OP", "}", (1, 4), (1, 5)), - Token("FSTRING_MIDDLE", "", (1, 5), (1, 5)), Token("FSTRING_END", '"', (1, 5), (1, 6)), Token("ENDMARKER", "", (2, 0), (2, 0)), ], @@ -76,13 +74,11 @@ def test_fstring() -> None: 'f"{x:y}"\n', [ Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)), - Token(type="FSTRING_MIDDLE", string="", start=(1, 2), end=(1, 2)), - Token(type="LBRACE", string="{", start=(1, 2), end=(1, 3)), + Token(type="OP", string="{", start=(1, 2), end=(1, 3)), Token(type="NAME", string="x", start=(1, 3), end=(1, 4)), Token(type="OP", string=":", start=(1, 4), end=(1, 5)), Token(type="FSTRING_MIDDLE", string="y", start=(1, 5), end=(1, 6)), - Token(type="RBRACE", string="}", start=(1, 6), end=(1, 7)), - Token(type="FSTRING_MIDDLE", string="", start=(1, 7), end=(1, 7)), + Token(type="OP", string="}", start=(1, 6), end=(1, 7)), Token(type="FSTRING_END", string='"', start=(1, 7), end=(1, 
8)), Token(type="NEWLINE", string="\n", start=(1, 8), end=(1, 9)), Token(type="ENDMARKER", string="", start=(2, 0), end=(2, 0)), @@ -93,10 +89,9 @@ def test_fstring() -> None: [ Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)), Token(type="FSTRING_MIDDLE", string="x\\\n", start=(1, 2), end=(2, 0)), - Token(type="LBRACE", string="{", start=(2, 0), end=(2, 1)), + Token(type="OP", string="{", start=(2, 0), end=(2, 1)), Token(type="NAME", string="a", start=(2, 1), end=(2, 2)), - Token(type="RBRACE", string="}", start=(2, 2), end=(2, 3)), - Token(type="FSTRING_MIDDLE", string="", start=(2, 3), end=(2, 3)), + Token(type="OP", string="}", start=(2, 2), end=(2, 3)), Token(type="FSTRING_END", string='"', start=(2, 3), end=(2, 4)), Token(type="NEWLINE", string="\n", start=(2, 4), end=(2, 5)), Token(type="ENDMARKER", string="", start=(3, 0), end=(3, 0)), From 8cfe444d430dae131d2dbea473cdb113583013c1 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Tue, 24 Dec 2024 04:21:43 +0530 Subject: [PATCH 14/24] remove python2 test --- tests/test_black.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tests/test_black.py b/tests/test_black.py index 9793e12ec4d..4f8a8735618 100644 --- a/tests/test_black.py +++ b/tests/test_black.py @@ -458,17 +458,6 @@ def test_tab_comment_indentation(self) -> None: self.assertFormatEqual(contents_spc, fs(contents_spc)) self.assertFormatEqual(contents_spc, fs(contents_tab)) - # mixed tabs and spaces (valid Python 2 code) - contents_tab = "if 1:\n if 2:\n\t\tpass\n\t# comment\n pass\n" - contents_spc = "if 1:\n if 2:\n pass\n # comment\n pass\n" - self.assertFormatEqual(contents_spc, fs(contents_spc)) - self.assertFormatEqual(contents_spc, fs(contents_tab)) - - contents_tab = "if 1:\n if 2:\n\t\tpass\n\t\t# comment\n pass\n" - contents_spc = "if 1:\n if 2:\n pass\n # comment\n pass\n" - self.assertFormatEqual(contents_spc, fs(contents_spc)) - self.assertFormatEqual(contents_spc, fs(contents_tab)) - def test_false_positive_symlink_output_issue_3384(self) -> None: # Emulate the behavior when using the CLI (`black ./child --verbose`), which # involves patching some `pathlib.Path` methods. In particular, `is_dir` is From ec0b56895e3b9577a8a75dd837ce7a516eb05b17 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Tue, 24 Dec 2024 04:38:35 +0530 Subject: [PATCH 15/24] fix backslash edge case --- src/blib2to3/pgen2/tokenize.py | 28 ++++++++++++++++++++++------ tests/data/cases/form_feeds.py | 1 + 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index a5de81af9f0..6b12b914df1 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -29,6 +29,7 @@ import sys from collections.abc import Iterator +from typing import Optional from blib2to3.pgen2.grammar import Grammar from blib2to3.pgen2.token import ( @@ -101,12 +102,20 @@ class TokenError(Exception): ... -def transform_whitespace(token: pytokens.Token, source: str) -> pytokens.Token: +def transform_whitespace( + token: pytokens.Token, source: str, prev_token: Optional[pytokens.Token] +) -> pytokens.Token: r""" Black treats `\\\n` at the end of a line as a 'NL' token, while it is ignored as whitespace in the regular Python parser. + But, only the first one. If there's a `\\\n` following it + (as in, a \ just by itself on a line), that is not made into NL. 
""" - if token.type == TokenType.whitespace: + if ( + token.type == TokenType.whitespace + and prev_token is not None + and prev_token.type != TokenType.nl + ): token_str = source[token.start_index : token.end_index] if token_str.startswith("\\\n"): return pytokens.Token( @@ -133,9 +142,11 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] is_async = False current_indent = 0 async_indent = 0 + + prev_token: Optional[pytokens.Token] = None try: for token in token_iterator: - token = transform_whitespace(token, source) + token = transform_whitespace(token, source, prev_token) line, column = token.start_line, token.start_col if token.type == TokenType.whitespace: @@ -146,6 +157,7 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] if token.type == TokenType.newline and token_str == "": # Black doesn't yield empty newline tokens at the end of a file # if there's no newline at the end of a file. + prev_token = token continue if token.type == TokenType.indent: @@ -162,7 +174,7 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] while True: next_token = next(token_iterator) next_str = source[next_token.start_index : next_token.end_index] - next_token = transform_whitespace(next_token, next_str) + next_token = transform_whitespace(next_token, next_str, token) if next_token.type == TokenType.whitespace: continue break @@ -196,6 +208,7 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] (next_token.end_line, next_token.end_col), next_line, ) + prev_token = token continue if token.type == TokenType.op and token_str == "...": @@ -213,6 +226,7 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] (token.end_line, end_col), source_line, ) + prev_token = token continue yield ( @@ -222,10 +236,12 @@ def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo] (token.end_line, token.end_col), source_line, ) + prev_token = token + except pytokens.UnexpectedEOF: raise TokenError("Unexpected EOF in multi-line statement", (line, column)) - except pytokens.TokenizeError: - raise TokenError("TODO", (line, column)) + except pytokens.TokenizeError as exc: + raise TokenError(f"Failed to parse: {type(exc).__name__}", (line, column)) def printtoken( diff --git a/tests/data/cases/form_feeds.py b/tests/data/cases/form_feeds.py index 48ffc98106b..957b4a1db95 100644 --- a/tests/data/cases/form_feeds.py +++ b/tests/data/cases/form_feeds.py @@ -156,6 +156,7 @@ def something(self): # + # pass From ceab5059b00717f9a304753ed9e3a5d6d3060f6f Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Tue, 24 Dec 2024 04:40:46 +0530 Subject: [PATCH 16/24] fix another backslash edge case --- src/blib2to3/pgen2/tokenize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 6b12b914df1..cd1537ec24d 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -114,7 +114,7 @@ def transform_whitespace( if ( token.type == TokenType.whitespace and prev_token is not None - and prev_token.type != TokenType.nl + and prev_token.type not in (TokenType.nl, TokenType.newline) ): token_str = source[token.start_index : token.end_index] if token_str.startswith("\\\n"): From cb9d48a3eae917205bc136b438ad5d14670a67c0 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Tue, 24 Dec 2024 04:41:56 +0530 Subject: [PATCH 17/24] fix mypyc --- src/blib2to3/pgen2/tokenize.py | 10 
+++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index cd1537ec24d..6d679e743fc 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -55,21 +55,17 @@ __author__ = "Ka-Ping Yee " __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro" -import re -import token -from codecs import BOM_UTF8, lookup - import pytokens from pytokens import TokenType -from . import token +from . import token as _token -__all__ = [x for x in dir(token) if x[0] != "_"] + [ +__all__ = [x for x in dir(_token) if x[0] != "_"] + [ "tokenize", "generate_tokens", "untokenize", ] -del token +del _token Coord = tuple[int, int] TokenInfo = tuple[int, str, Coord, Coord, str] From 1e105f8c77ba50b0bc4abec0b2e4c9412480830a Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Tue, 24 Dec 2024 04:43:53 +0530 Subject: [PATCH 18/24] remove use of | --- src/blib2to3/pgen2/tokenize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 6d679e743fc..5cbfd5148d8 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -127,7 +127,7 @@ def transform_whitespace( return token -def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo]: +def tokenize(source: str, grammar: Optional[Grammar] = None) -> Iterator[TokenInfo]: async_keywords = False if grammar is None else grammar.async_keywords lines = source.split("\n") From 3420248d9a6478d7b642304c05c1aa84541be987 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Wed, 25 Dec 2024 10:08:57 +0530 Subject: [PATCH 19/24] bump pytokens version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f4273a602fe..c08327c19e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,7 @@ dependencies = [ "packaging>=22.0", "pathspec>=0.9.0", "platformdirs>=2", - "pytokens>=0.1.3", + "pytokens>=0.1.5", "tomli>=1.1.0; python_version < '3.11'", "typing_extensions>=4.0.1; python_version < '3.11'", ] From 59ddf0618ab96c6227baf46c576ff061ca108d1c Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Wed, 25 Dec 2024 10:32:06 +0530 Subject: [PATCH 20/24] lints --- .pre-commit-config.yaml | 1 + tests/test_tokenize.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3c38354437e..0f510f01ad2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -50,6 +50,7 @@ repos: - click >= 8.1.0, != 8.1.4, != 8.1.5 - packaging >= 22.0 - platformdirs >= 2.1.0 + - pytokens >= 0.1.5 - pytest - hypothesis - aiohttp >= 3.7.4 diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 92f929118d0..efa7ad5e80d 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -1,6 +1,5 @@ """Tests for the blib2to3 tokenizer.""" -import io import sys import textwrap from dataclasses import dataclass From b068867bdf651b0f6260968c94371485eb26df37 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Wed, 25 Dec 2024 10:57:58 +0530 Subject: [PATCH 21/24] Add changelog entry --- CHANGES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index d2955d2df0a..be0149dc372 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -40,6 +40,8 @@ +- Rewrite tokenizer to improve performance and compliance (#4536) + ### Performance From 74edc5ba23a729ef8debadc305ddec92b76499ec Mon Sep 17 
00:00:00 2001 From: Tushar Sadhwani Date: Wed, 8 Jan 2025 23:55:58 +0530 Subject: [PATCH 22/24] bump pytokens version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c08327c19e7..ceb23ee1250 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,7 @@ dependencies = [ "packaging>=22.0", "pathspec>=0.9.0", "platformdirs>=2", - "pytokens>=0.1.5", + "pytokens>=0.1.7", "tomli>=1.1.0; python_version < '3.11'", "typing_extensions>=4.0.1; python_version < '3.11'", ] From ad4684833afc5c6ff0b42a9212c35adf8d786332 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Thu, 9 Jan 2025 23:11:45 +0530 Subject: [PATCH 23/24] bump pytokens once again --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ceb23ee1250..fbff7de3a4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,7 @@ dependencies = [ "packaging>=22.0", "pathspec>=0.9.0", "platformdirs>=2", - "pytokens>=0.1.7", + "pytokens>=0.1.9", "tomli>=1.1.0; python_version < '3.11'", "typing_extensions>=4.0.1; python_version < '3.11'", ] From 97a730d4be8ba4a0855e68d87c94f473356f8d12 Mon Sep 17 00:00:00 2001 From: Tushar Sadhwani Date: Thu, 9 Jan 2025 23:14:34 +0530 Subject: [PATCH 24/24] bump upload-artifact --- .github/workflows/diff_shades.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/diff_shades.yml b/.github/workflows/diff_shades.yml index 51a448a12a5..038408b94c9 100644 --- a/.github/workflows/diff_shades.yml +++ b/.github/workflows/diff_shades.yml @@ -110,19 +110,19 @@ jobs: ${{ matrix.baseline-analysis }} ${{ matrix.target-analysis }} - name: Upload diff report - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.mode }}-diff.html path: diff.html - name: Upload baseline analysis - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.baseline-analysis }} path: ${{ matrix.baseline-analysis }} - name: Upload target analysis - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.target-analysis }} path: ${{ matrix.target-analysis }} @@ -137,7 +137,7 @@ jobs: - name: Upload summary file (PR only) if: github.event_name == 'pull_request' && matrix.mode == 'preview-changes' - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: .pr-comment.json path: .pr-comment.json
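
For reference, a minimal standalone sketch of the async/await rule that PATCH 11 implements in the tokenize() wrapper: `async` is reported as ASYNC when the grammar enables async keywords or when the next significant token is `def` or `for`, and `await` is reported as AWAIT when async keywords are enabled or when we are inside such an async block. The Tok class and the string token kinds below are illustrative stand-ins, not the real pytokens or blib2to3 types.

from dataclasses import dataclass

NAME, ASYNC, AWAIT = "NAME", "ASYNC", "AWAIT"


@dataclass
class Tok:
    kind: str    # e.g. "NAME", "OP"; stand-in for a real token type
    string: str


def classify(
    tok: Tok, next_tok: Tok, async_keywords: bool, inside_async_block: bool
) -> str:
    # Mirror of the branch added in PATCH 11, using stand-in types.
    if tok.string == "async" and (
        async_keywords
        or (next_tok.kind == NAME and next_tok.string in ("def", "for"))
    ):
        return ASYNC
    if tok.string == "await" and (async_keywords or inside_async_block):
        return AWAIT
    return tok.kind


# "async" directly before "def" becomes ASYNC even on old grammars:
assert classify(Tok(NAME, "async"), Tok(NAME, "def"), False, False) == ASYNC
# a plain identifier spelled "async" stays a NAME:
assert classify(Tok(NAME, "async"), Tok("OP", "="), False, False) == NAME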
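
Similarly, a standalone sketch of the backslash-continuation handling from PATCHES 15 and 16: a whitespace token that starts with a backslash followed by a newline is rewritten into a two-character NL token, but only when the previous token was not already NL or NEWLINE, so a backslash sitting alone on a line is left as plain whitespace. TokKind and Tok are stand-ins for pytokens.TokenType and pytokens.Token, carrying only the fields this rule needs.

from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional


class TokKind(Enum):
    whitespace = auto()
    nl = auto()
    newline = auto()
    op = auto()


@dataclass
class Tok:
    kind: TokKind
    start_index: int
    end_index: int


def transform_whitespace(tok: Tok, source: str, prev: Optional[Tok]) -> Tok:
    # Rewrite a leading backslash-newline into a two-character NL token,
    # unless the previous token already ended the line.
    if (
        tok.kind is TokKind.whitespace
        and prev is not None
        and prev.kind not in (TokKind.nl, TokKind.newline)
        and source[tok.start_index : tok.end_index].startswith("\\\n")
    ):
        return Tok(TokKind.nl, tok.start_index, tok.start_index + 2)
    return tok


src = "x = 1 + \\\n    2\n"
ws = Tok(TokKind.whitespace, 8, 14)   # the backslash-newline-indent run after "+"
plus = Tok(TokKind.op, 6, 7)          # the "+" operator token
assert transform_whitespace(ws, src, plus).kind is TokKind.nl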
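
The error handling introduced in PATCH 12 and refined in PATCH 15 amounts to translating the backend's exceptions into blib2to3-style TokenError values carrying a message and a (line, column) pair. Below is a sketch with stand-in exception classes; the real code catches pytokens.UnexpectedEOF and pytokens.TokenizeError.

class TokenizeError(Exception):
    """Stand-in for pytokens.TokenizeError."""


class UnexpectedEOF(TokenizeError):
    """Stand-in for pytokens.UnexpectedEOF."""


class TokenError(Exception):
    """Carries a message and a (line, column) pair, as in blib2to3."""


def translate(exc: TokenizeError, line: int, column: int) -> TokenError:
    # Unexpected end-of-file gets the message the test suite asserts on;
    # any other tokenizer failure is reported with its exception name.
    if isinstance(exc, UnexpectedEOF):
        return TokenError("Unexpected EOF in multi-line statement", (line, column))
    return TokenError(f"Failed to parse: {type(exc).__name__}", (line, column))


err = translate(UnexpectedEOF(), 1, 6)
assert err.args == ("Unexpected EOF in multi-line statement", (1, 6))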
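
Finally, PATCH 17 only says "fix mypyc", but the change itself is the re-export pattern sketched here: import the token module under a private alias, derive __all__ from its public names, and delete the alias afterwards so no module-level binding is left behind. The stdlib token module is used below purely as a stand-in for blib2to3.pgen2.token.

import token as tokenmod  # stand-in for `from . import token as _token`

__all__ = [name for name in dir(tokenmod) if not name.startswith("_")] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del tokenmod

assert "NAME" in __all__ and "tokenize" in __all__
assert "tokenmod" not in dir()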