From 93f35581d03b270b10ff3feed3f8afbf9233bddb Mon Sep 17 00:00:00 2001 From: Sam Weaver Date: Mon, 23 Sep 2024 14:10:22 -0400 Subject: [PATCH 01/14] Create an initial PoC for reversing parsed HCL back into text --- .gitignore | 3 +++ hcl2/__init__.py | 2 +- hcl2/api.py | 24 +++++++++++++++++- hcl2/parser.py | 65 ++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 90 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 5a2fc4a4..aea11ed0 100644 --- a/.gitignore +++ b/.gitignore @@ -121,3 +121,6 @@ node_modules/ # Don't commit the generated parser lark_parser.py .lark_cache.bin + +# ASDF tool-versions file +.tool-versions \ No newline at end of file diff --git a/hcl2/__init__.py b/hcl2/__init__.py index 8b41bd87..7ea2ac6b 100644 --- a/hcl2/__init__.py +++ b/hcl2/__init__.py @@ -5,4 +5,4 @@ except ImportError: __version__ = "unknown" -from .api import load, loads +from .api import load, loads, parse, parses, writes diff --git a/hcl2/api.py b/hcl2/api.py index 9079fce8..d06e36ab 100644 --- a/hcl2/api.py +++ b/hcl2/api.py @@ -1,7 +1,8 @@ """The API that will be exposed to users of this package""" from typing import TextIO -from hcl2.parser import hcl2 +from lark.tree import Tree +from hcl2.parser import hcl2, hcl2_reconstructor from hcl2.transformer import DictTransformer @@ -26,3 +27,24 @@ def loads(text: str, with_meta=False) -> dict: # Append a new line as a temporary fix tree = hcl2.parse(text + "\n") return DictTransformer(with_meta=with_meta).transform(tree) + + +def parse(file: TextIO) -> Tree: + """Load HCL2 syntax tree from a file. + :param file: File with hcl2 to be loaded as a dict. + """ + return parses(file.read()) + + +def parses(text: str) -> Tree: + """Load HCL2 syntax tree from a string. + :param text: Text with hcl2 to be loaded as a dict. + """ + return hcl2.parse(text + "\n") + + +def writes(ast: Tree) -> str: + """Convert an HCL2 syntax tree to a string. + :param ast: HCL2 syntax tree. + """ + return hcl2_reconstructor.reconstruct(ast) diff --git a/hcl2/parser.py b/hcl2/parser.py index e6d810bd..488cde56 100644 --- a/hcl2/parser.py +++ b/hcl2/parser.py @@ -1,7 +1,8 @@ """A parser for HCL2 implemented using the Lark parser""" from pathlib import Path -from lark import Lark +from lark import Lark, Token +from lark.reconstruct import Reconstructor PARSER_FILE = Path(__file__).absolute().resolve().parent / ".lark_cache.bin" @@ -10,7 +11,67 @@ hcl2 = Lark.open( "hcl2.lark", parser="lalr", - cache=str(PARSER_FILE), # Disable/Delete file to effect changes to the grammar + # cache=str(PARSER_FILE), # Disable/Delete file to effect changes to the grammar rel_to=__file__, propagate_positions=True, + maybe_placeholders=False, # Needed for reconstruction ) + +SPACE_AFTER = set(',+-*/~@<>="|:') +SPACE_BEFORE = (SPACE_AFTER - set(",:")) | set("'") + + +def _special_symbol(sym): + return Token("SPECIAL", sym.name) + + +def _postprocess_reconstruct(items): + stack = ["\n"] + actions = [] + last_was_whitespace = True + for item in items: + if isinstance(item, Token) and item.type == "SPECIAL": + actions.append(item.value) + else: + if actions: + assert ( + actions[0] == "_NEWLINE" and "_NEWLINE" not in actions[1:] + ), actions + + for a in actions[1:]: + if a == "_INDENT": + stack.append(stack[-1] + " " * 4) + else: + assert a == "_DEDENT" + stack.pop() + actions.clear() + yield stack[-1] + last_was_whitespace = True + if not last_was_whitespace: + if item[0] in SPACE_BEFORE: + yield " " + yield item + last_was_whitespace = item[-1].isspace() + if not last_was_whitespace: + if item[-1] in SPACE_AFTER: + yield " " + last_was_whitespace = True + yield "\n" + + +class HCLReconstructor: + def __init__(self, parser): + self._recons = Reconstructor( + parser, + { + "_NEWLINE": _special_symbol, + "_DEDENT": _special_symbol, + "_INDENT": _special_symbol, + }, + ) + + def reconstruct(self, tree): + return self._recons.reconstruct(tree, _postprocess_reconstruct) + + +hcl2_reconstructor = HCLReconstructor(hcl2) From e05bd9cfee42e4eb5852e529a3f68f087e6175f2 Mon Sep 17 00:00:00 2001 From: Sam Weaver Date: Mon, 23 Sep 2024 14:16:21 -0400 Subject: [PATCH 02/14] Reclassify multi-line comments as "comment" instead of ignored tokens --- hcl2/hcl2.lark | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hcl2/hcl2.lark b/hcl2/hcl2.lark index 0d6b40b5..15c381d0 100644 --- a/hcl2/hcl2.lark +++ b/hcl2/hcl2.lark @@ -3,7 +3,7 @@ body : (new_line_or_comment? (attribute | block))* new_line_or_comment? attribute : identifier "=" expression block : identifier (identifier | STRING_LIT)* new_line_or_comment? "{" body "}" new_line_and_or_comma: new_line_or_comment | "," | "," new_line_or_comment -new_line_or_comment: ( /\n/ | /#.*\n/ | /\/\/.*\n/ )+ +new_line_or_comment: ( /\n/ | /#.*\n/ | /\/\/.*\n/ | /\/\*(.|\n)*?(\*\/)/ )+ identifier : /[a-zA-Z_][a-zA-Z0-9_-]*/ | IN | FOR | IF | FOR_EACH IF : "if" @@ -78,4 +78,3 @@ full_splat : "[*]" (get_attr | index)* !for_cond : "if" new_line_or_comment? expression %ignore /[ \t]+/ -%ignore /\/\*(.|\n)*?(\*\/)/ From 44a280af15e8cbe55ec5d13a5e38a5a635674660 Mon Sep 17 00:00:00 2001 From: Sam Weaver Date: Tue, 24 Sep 2024 16:42:31 -0400 Subject: [PATCH 03/14] Keep track of indentation so it can be reconstructed --- hcl2/__init__.py | 2 +- hcl2/api.py | 8 ++++---- hcl2/hcl2.lark | 10 +++++++++- hcl2/indenter.py | 15 +++++++++++++++ hcl2/parser.py | 19 +++++++++++-------- 5 files changed, 40 insertions(+), 14 deletions(-) create mode 100644 hcl2/indenter.py diff --git a/hcl2/__init__.py b/hcl2/__init__.py index 7ea2ac6b..e6fff56e 100644 --- a/hcl2/__init__.py +++ b/hcl2/__init__.py @@ -5,4 +5,4 @@ except ImportError: __version__ = "unknown" -from .api import load, loads, parse, parses, writes +from .api import load, loads, parse, parses, writes, AST diff --git a/hcl2/api.py b/hcl2/api.py index d06e36ab..17c00aa4 100644 --- a/hcl2/api.py +++ b/hcl2/api.py @@ -1,7 +1,7 @@ """The API that will be exposed to users of this package""" from typing import TextIO -from lark.tree import Tree +from lark.tree import Tree as AST from hcl2.parser import hcl2, hcl2_reconstructor from hcl2.transformer import DictTransformer @@ -29,21 +29,21 @@ def loads(text: str, with_meta=False) -> dict: return DictTransformer(with_meta=with_meta).transform(tree) -def parse(file: TextIO) -> Tree: +def parse(file: TextIO) -> AST: """Load HCL2 syntax tree from a file. :param file: File with hcl2 to be loaded as a dict. """ return parses(file.read()) -def parses(text: str) -> Tree: +def parses(text: str) -> AST: """Load HCL2 syntax tree from a string. :param text: Text with hcl2 to be loaded as a dict. """ return hcl2.parse(text + "\n") -def writes(ast: Tree) -> str: +def writes(ast: AST) -> str: """Convert an HCL2 syntax tree to a string. :param ast: HCL2 syntax tree. """ diff --git a/hcl2/hcl2.lark b/hcl2/hcl2.lark index 15c381d0..473925bd 100644 --- a/hcl2/hcl2.lark +++ b/hcl2/hcl2.lark @@ -3,7 +3,14 @@ body : (new_line_or_comment? (attribute | block))* new_line_or_comment? attribute : identifier "=" expression block : identifier (identifier | STRING_LIT)* new_line_or_comment? "{" body "}" new_line_and_or_comma: new_line_or_comment | "," | "," new_line_or_comment -new_line_or_comment: ( /\n/ | /#.*\n/ | /\/\/.*\n/ | /\/\*(.|\n)*?(\*\/)/ )+ +new_line_or_comment: ( new_line | /#.*\n/ | /\/\/.*\n/ | /\/\*(.|\n)*?(\*\/)/ | indent | dedent )+ +// these rules are needed (rather than including the terminals in the above rule) +// because information about the terminals is discarded when the AST is built, +// making reconstruction impossible without using named rules +new_line: _NL +indent: _INDENT +dedent: _DEDENT +_NL: /\n[ \t]*/ identifier : /[a-zA-Z_][a-zA-Z0-9_-]*/ | IN | FOR | IF | FOR_EACH IF : "if" @@ -78,3 +85,4 @@ full_splat : "[*]" (get_attr | index)* !for_cond : "if" new_line_or_comment? expression %ignore /[ \t]+/ +%declare _INDENT _DEDENT \ No newline at end of file diff --git a/hcl2/indenter.py b/hcl2/indenter.py new file mode 100644 index 00000000..c31da696 --- /dev/null +++ b/hcl2/indenter.py @@ -0,0 +1,15 @@ +from lark.indenter import Indenter + + +class TerraformIndenter(Indenter): + """A postlexer that "injects" _INDENT/_DEDENT tokens based on indentation, according to Terraform syntax. + + See also: the ``postlex`` option in `Lark`. + """ + + NL_type = "_NL" + OPEN_PAREN_types = ["LPAR"] + CLOSE_PAREN_types = ["RPAR"] + INDENT_type = "_INDENT" + DEDENT_type = "_DEDENT" + tab_len = 2 diff --git a/hcl2/parser.py b/hcl2/parser.py index 488cde56..e0ea4980 100644 --- a/hcl2/parser.py +++ b/hcl2/parser.py @@ -4,6 +4,7 @@ from lark import Lark, Token from lark.reconstruct import Reconstructor +from .indenter import TerraformIndenter PARSER_FILE = Path(__file__).absolute().resolve().parent / ".lark_cache.bin" @@ -15,6 +16,7 @@ rel_to=__file__, propagate_positions=True, maybe_placeholders=False, # Needed for reconstruction + postlex=TerraformIndenter(), ) SPACE_AFTER = set(',+-*/~@<>="|:') @@ -34,16 +36,17 @@ def _postprocess_reconstruct(items): actions.append(item.value) else: if actions: - assert ( - actions[0] == "_NEWLINE" and "_NEWLINE" not in actions[1:] - ), actions + assert actions[0] == "_NL", actions for a in actions[1:]: - if a == "_INDENT": - stack.append(stack[-1] + " " * 4) - else: - assert a == "_DEDENT" + if a == "_NL": + yield "\n" + elif a == "_INDENT": + stack.append(stack[-1] + " " * 2) + elif a == "_DEDENT": stack.pop() + else: + assert False, a actions.clear() yield stack[-1] last_was_whitespace = True @@ -64,7 +67,7 @@ def __init__(self, parser): self._recons = Reconstructor( parser, { - "_NEWLINE": _special_symbol, + "_NL": _special_symbol, "_DEDENT": _special_symbol, "_INDENT": _special_symbol, }, From 51c0972c36efd67094f7ac0ca272286a33b737ac Mon Sep 17 00:00:00 2001 From: Sam Weaver Date: Tue, 1 Oct 2024 14:53:11 -0400 Subject: [PATCH 04/14] Simplify reconstruction to just output the spaces as is around newlines --- hcl2/hcl2.lark | 12 ++----- hcl2/indenter.py | 15 -------- hcl2/parser.py | 93 ++++++++++++++++++++++++------------------------ 3 files changed, 48 insertions(+), 72 deletions(-) delete mode 100644 hcl2/indenter.py diff --git a/hcl2/hcl2.lark b/hcl2/hcl2.lark index 473925bd..0523b87f 100644 --- a/hcl2/hcl2.lark +++ b/hcl2/hcl2.lark @@ -3,14 +3,7 @@ body : (new_line_or_comment? (attribute | block))* new_line_or_comment? attribute : identifier "=" expression block : identifier (identifier | STRING_LIT)* new_line_or_comment? "{" body "}" new_line_and_or_comma: new_line_or_comment | "," | "," new_line_or_comment -new_line_or_comment: ( new_line | /#.*\n/ | /\/\/.*\n/ | /\/\*(.|\n)*?(\*\/)/ | indent | dedent )+ -// these rules are needed (rather than including the terminals in the above rule) -// because information about the terminals is discarded when the AST is built, -// making reconstruction impossible without using named rules -new_line: _NL -indent: _INDENT -dedent: _DEDENT -_NL: /\n[ \t]*/ +new_line_or_comment: ( /\n[ \t]*/ | /#.*\n/ | /\/\/.*\n/ | /\/\*(.|\n)*?(\*\/)/ )+ identifier : /[a-zA-Z_][a-zA-Z0-9_-]*/ | IN | FOR | IF | FOR_EACH IF : "if" @@ -84,5 +77,4 @@ full_splat : "[*]" (get_attr | index)* !for_intro : "for" new_line_or_comment? identifier ("," identifier new_line_or_comment?)? new_line_or_comment? "in" new_line_or_comment? expression new_line_or_comment? ":" new_line_or_comment? !for_cond : "if" new_line_or_comment? expression -%ignore /[ \t]+/ -%declare _INDENT _DEDENT \ No newline at end of file +%ignore /[ \t]+/ \ No newline at end of file diff --git a/hcl2/indenter.py b/hcl2/indenter.py deleted file mode 100644 index c31da696..00000000 --- a/hcl2/indenter.py +++ /dev/null @@ -1,15 +0,0 @@ -from lark.indenter import Indenter - - -class TerraformIndenter(Indenter): - """A postlexer that "injects" _INDENT/_DEDENT tokens based on indentation, according to Terraform syntax. - - See also: the ``postlex`` option in `Lark`. - """ - - NL_type = "_NL" - OPEN_PAREN_types = ["LPAR"] - CLOSE_PAREN_types = ["RPAR"] - INDENT_type = "_INDENT" - DEDENT_type = "_DEDENT" - tab_len = 2 diff --git a/hcl2/parser.py b/hcl2/parser.py index e0ea4980..3b58f556 100644 --- a/hcl2/parser.py +++ b/hcl2/parser.py @@ -3,8 +3,7 @@ from lark import Lark, Token from lark.reconstruct import Reconstructor - -from .indenter import TerraformIndenter +from lark.utils import is_id_continue PARSER_FILE = Path(__file__).absolute().resolve().parent / ".lark_cache.bin" @@ -16,65 +15,65 @@ rel_to=__file__, propagate_positions=True, maybe_placeholders=False, # Needed for reconstruction - postlex=TerraformIndenter(), ) SPACE_AFTER = set(',+-*/~@<>="|:') SPACE_BEFORE = (SPACE_AFTER - set(",:")) | set("'") - - -def _special_symbol(sym): - return Token("SPECIAL", sym.name) - +DIGITS = set("0123456789") +IDENT_JOINERS = set(".") def _postprocess_reconstruct(items): - stack = ["\n"] - actions = [] - last_was_whitespace = True + """ + Postprocess the stream of tokens derived from the AST during reconstruction. + + For HCL2, this is used exclusively for adding whitespace in the right locations. + """ + prev_item = "" for item in items: - if isinstance(item, Token) and item.type == "SPECIAL": - actions.append(item.value) - else: - if actions: - assert actions[0] == "_NL", actions - - for a in actions[1:]: - if a == "_NL": - yield "\n" - elif a == "_INDENT": - stack.append(stack[-1] + " " * 2) - elif a == "_DEDENT": - stack.pop() - else: - assert False, a - actions.clear() - yield stack[-1] - last_was_whitespace = True - if not last_was_whitespace: - if item[0] in SPACE_BEFORE: - yield " " - yield item - last_was_whitespace = item[-1].isspace() - if not last_was_whitespace: - if item[-1] in SPACE_AFTER: - yield " " - last_was_whitespace = True + # see if we need to add space after the previous identifier + if ( + prev_item + and item + and prev_item[-1] not in SPACE_AFTER + and not prev_item[-1].isspace() + and is_id_continue(prev_item[-1]) + and not item[0] in IDENT_JOINERS + and not prev_item[-1] in DIGITS + ): + yield " " + prev_item = " " + + # if the next character expects us to add a space before it + if ( + prev_item + and not prev_item[-1].isspace() + and prev_item[-1] not in SPACE_AFTER + ): + if item[0] in SPACE_BEFORE: + yield " " + prev_item = " " + + # print the actual token + yield item + + if item and item[-1] in SPACE_AFTER: + yield " " + + # store the previous item as we continue + prev_item = item yield "\n" class HCLReconstructor: def __init__(self, parser): - self._recons = Reconstructor( - parser, - { - "_NL": _special_symbol, - "_DEDENT": _special_symbol, - "_INDENT": _special_symbol, - }, - ) + self._recons = Reconstructor(parser) def reconstruct(self, tree): - return self._recons.reconstruct(tree, _postprocess_reconstruct) + return self._recons.reconstruct( + tree, + _postprocess_reconstruct, + insert_spaces=False, + ) hcl2_reconstructor = HCLReconstructor(hcl2) From 3e5e4f47cf03322b9d80be1519f5fca897653225 Mon Sep 17 00:00:00 2001 From: Sam Weaver Date: Tue, 1 Oct 2024 16:14:19 -0400 Subject: [PATCH 05/14] Fix failing tests (disable caching) --- hcl2/parser.py | 4 ++++ test/unit/test_load.py | 20 ++++++++++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/hcl2/parser.py b/hcl2/parser.py index 3b58f556..cf2fd63e 100644 --- a/hcl2/parser.py +++ b/hcl2/parser.py @@ -11,6 +11,10 @@ hcl2 = Lark.open( "hcl2.lark", parser="lalr", + # Caching must be disabled to allow for reconstruction until lark-parser/lark#1472 is fixed: + # + # https://github.com/lark-parser/lark/issues/1472 + # # cache=str(PARSER_FILE), # Disable/Delete file to effect changes to the grammar rel_to=__file__, propagate_positions=True, diff --git a/test/unit/test_load.py b/test/unit/test_load.py index d21ca02f..762412b4 100644 --- a/test/unit/test_load.py +++ b/test/unit/test_load.py @@ -19,15 +19,23 @@ class TestLoad(TestCase): def test_load_terraform(self): """Test parsing a set of hcl2 files and force recreating the parser file""" - # delete the parser file to force it to be recreated - PARSER_FILE.unlink() + # Caching must be disabled to allow for reconstruction until lark-parser/lark#1472 is fixed: + # + # https://github.com/lark-parser/lark/issues/1472 + # + # # delete the parser file to force it to be recreated + # PARSER_FILE.unlink() for hcl_path in HCL2_FILES: yield self.check_terraform, hcl_path - def test_load_terraform_from_cache(self): - """Test parsing a set of hcl2 files from a cached parser file""" - for hcl_path in HCL2_FILES: - yield self.check_terraform, hcl_path + # Caching must be disabled to allow for reconstruction until lark-parser/lark#1472 is fixed: + # + # https://github.com/lark-parser/lark/issues/1472 + # + # def test_load_terraform_from_cache(self): + # """Test parsing a set of hcl2 files from a cached parser file""" + # for hcl_path in HCL2_FILES: + # yield self.check_terraform, hcl_path def check_terraform(self, hcl_path_str: str): """Loads a single hcl2 file, parses it and compares with the expected json""" From c1e68f621e43b409f19b2f4a623a96c24bee9737 Mon Sep 17 00:00:00 2001 From: Sam Weaver Date: Tue, 1 Oct 2024 17:27:14 -0400 Subject: [PATCH 06/14] Fix final spacing inconsistencies, fix failing tests --- hcl2/api.py | 2 +- hcl2/hcl2.lark | 6 ++++-- hcl2/parser.py | 51 +++++++++++++++++++++------------------------ hcl2/transformer.py | 8 +++++-- 4 files changed, 35 insertions(+), 32 deletions(-) diff --git a/hcl2/api.py b/hcl2/api.py index 17c00aa4..cb605e53 100644 --- a/hcl2/api.py +++ b/hcl2/api.py @@ -40,7 +40,7 @@ def parses(text: str) -> AST: """Load HCL2 syntax tree from a string. :param text: Text with hcl2 to be loaded as a dict. """ - return hcl2.parse(text + "\n") + return hcl2.parse(text) def writes(ast: AST) -> str: diff --git a/hcl2/hcl2.lark b/hcl2/hcl2.lark index 0523b87f..1f964563 100644 --- a/hcl2/hcl2.lark +++ b/hcl2/hcl2.lark @@ -1,6 +1,6 @@ start : body body : (new_line_or_comment? (attribute | block))* new_line_or_comment? -attribute : identifier "=" expression +attribute : identifier EQ expression block : identifier (identifier | STRING_LIT)* new_line_or_comment? "{" body "}" new_line_and_or_comma: new_line_or_comment | "," | "," new_line_or_comment new_line_or_comment: ( /\n[ \t]*/ | /#.*\n/ | /\/\/.*\n/ | /\/\*(.|\n)*?(\*\/)/ )+ @@ -50,10 +50,12 @@ int_lit : DECIMAL+ | DECIMAL+ ("." DECIMAL+)? EXP_MARK DECIMAL+ DECIMAL : "0".."9" EXP_MARK : ("e" | "E") ("+" | "-")? +EQ : /[ \t]*=(?!=|>)/ tuple : "[" (new_line_or_comment* expression new_line_or_comment* ",")* (new_line_or_comment* expression)? new_line_or_comment* "]" object : "{" new_line_or_comment? (object_elem (new_line_and_or_comma object_elem )* new_line_and_or_comma?)? "}" -object_elem : (identifier | expression) ("=" | ":") expression +object_elem : (identifier | expression) ( EQ | ":") expression + heredoc_template : /<<(?P[a-zA-Z][a-zA-Z0-9._-]+)\n(?:.|\n)*?(?P=heredoc)/ heredoc_template_trim : /<<-(?P[a-zA-Z][a-zA-Z0-9._-]+)\n(?:.|\n)*?(?P=heredoc_trim)/ diff --git a/hcl2/parser.py b/hcl2/parser.py index cf2fd63e..97801f98 100644 --- a/hcl2/parser.py +++ b/hcl2/parser.py @@ -1,7 +1,7 @@ """A parser for HCL2 implemented using the Lark parser""" from pathlib import Path -from lark import Lark, Token +from lark import Lark from lark.reconstruct import Reconstructor from lark.utils import is_id_continue @@ -22,7 +22,7 @@ ) SPACE_AFTER = set(',+-*/~@<>="|:') -SPACE_BEFORE = (SPACE_AFTER - set(",:")) | set("'") +SPACE_BEFORE = (SPACE_AFTER - set(",:=")) | set("'") DIGITS = set("0123456789") IDENT_JOINERS = set(".") @@ -34,39 +34,36 @@ def _postprocess_reconstruct(items): """ prev_item = "" for item in items: - # see if we need to add space after the previous identifier + # these rules apply if we need to add space if ( - prev_item - and item - and prev_item[-1] not in SPACE_AFTER + prev_item # make sure a previous item exists + and item # and that the current item isn't empty + # if the last character of the previous item is a space, we abort and not prev_item[-1].isspace() - and is_id_continue(prev_item[-1]) - and not item[0] in IDENT_JOINERS + # if the previous character was a number we don't want to break up + # a numeric literal, abort and not prev_item[-1] in DIGITS + # if the next item has a space at the beginning, we don't need to + # add one, abort + and not item[0].isspace() + # now the scenarios when we do not abort are + and ( + # scenario 1, the prev token ended with an identifier character + # and the next character is not an "IDENT_JOINER" character + (is_id_continue(prev_item[-1]) and not item[0] in IDENT_JOINERS) + # scenario 2, the prev token ended with a character that should + # be followed by a space + or (prev_item[-1] in SPACE_AFTER) + # scenario 3, the next token begins with a character that needs + # a space preceeding it + or (item[0] in SPACE_BEFORE) + ) ): yield " " - prev_item = " " - # if the next character expects us to add a space before it - if ( - prev_item - and not prev_item[-1].isspace() - and prev_item[-1] not in SPACE_AFTER - ): - if item[0] in SPACE_BEFORE: - yield " " - prev_item = " " - - # print the actual token + # print the actual token, and store it for the next iteraction yield item - - if item and item[-1] in SPACE_AFTER: - yield " " - - # store the previous item as we continue prev_item = item - yield "\n" - class HCLReconstructor: def __init__(self, parser): diff --git a/hcl2/transformer.py b/hcl2/transformer.py index ddc7269f..05e5cd07 100644 --- a/hcl2/transformer.py +++ b/hcl2/transformer.py @@ -90,10 +90,14 @@ def tuple(self, args: List) -> List: return [self.to_string_dollar(arg) for arg in self.strip_new_line_tokens(args)] def object_elem(self, args: List) -> Dict: + print(args) # This returns a dict with a single key/value pair to make it easier to merge these # into a bigger dict that is returned by the "object" function key = self.strip_quotes(args[0]) - value = self.to_string_dollar(args[1]) + if len(args) == 3: + value = self.to_string_dollar(args[2]) + else: + value = self.to_string_dollar(args[1]) return {key: value} @@ -148,7 +152,7 @@ def attribute(self, args: List) -> Attribute: key = str(args[0]) if key.startswith('"') and key.endswith('"'): key = key[1:-1] - value = self.to_string_dollar(args[1]) + value = self.to_string_dollar(args[2]) return Attribute(key, value) def conditional(self, args: List) -> str: From 42b55031c560096f7a58ca0b57c361fc96e5a7d5 Mon Sep 17 00:00:00 2001 From: Sam Weaver Date: Wed, 2 Oct 2024 14:53:02 -0400 Subject: [PATCH 07/14] Fix more whitespace handling issues, add extensive testing --- hcl2/parser.py | 93 +++++++++++++++++++++++++----- test/unit/test_reconstruct.py | 105 ++++++++++++++++++++++++++++++++++ 2 files changed, 185 insertions(+), 13 deletions(-) create mode 100644 test/unit/test_reconstruct.py diff --git a/hcl2/parser.py b/hcl2/parser.py index 97801f98..9194f5d6 100644 --- a/hcl2/parser.py +++ b/hcl2/parser.py @@ -21,10 +21,34 @@ maybe_placeholders=False, # Needed for reconstruction ) -SPACE_AFTER = set(',+-*/~@<>="|:') -SPACE_BEFORE = (SPACE_AFTER - set(",:=")) | set("'") +CHAR_SPACE_AFTER = set(',~@<>="|?)]:') +CHAR_SPACE_BEFORE = (CHAR_SPACE_AFTER - set(",=")) | set("'") +KEYWORDS_SPACE_AFTER = [ + "if", + "in", + "for", + "for_each", + "==", + "!=", + "<", + ">", + "<=", + ">=", + "-", + "*", + "/", + "%", + "&&", + "||", + "+", +] +KEYWORDS_SPACE_BEFORE = KEYWORDS_SPACE_AFTER DIGITS = set("0123456789") -IDENT_JOINERS = set(".") +NEVER_SPACE_AFTER = set("[(") +NEVER_SPACE_BEFORE = set("]),.") +NEVER_COMMA_BEFORE = set("])}") +# characters that are OK to come right after an identifier with no space between +IDENT_NO_SPACE = set("()[]") def _postprocess_reconstruct(items): """ @@ -34,37 +58,80 @@ def _postprocess_reconstruct(items): """ prev_item = "" for item in items: - # these rules apply if we need to add space + # first, handle any deferred tokens + if type(prev_item) == tuple and prev_item[0] == "_deferred": + prev_item = prev_item[1] + + # if the deferred token was a comma, see if we're ending a block + if prev_item == ",": + if item[0] not in NEVER_COMMA_BEFORE: + yield prev_item + else: + yield prev_item + + # these rules apply if we need to add space prior to the current token if ( prev_item # make sure a previous item exists and item # and that the current item isn't empty # if the last character of the previous item is a space, we abort and not prev_item[-1].isspace() - # if the previous character was a number we don't want to break up - # a numeric literal, abort - and not prev_item[-1] in DIGITS + # if the previous character was a number and the next character is + # a number. we don't want to break up a numeric literal, abort + and not (prev_item[-1] in DIGITS and item[0] in DIGITS) # if the next item has a space at the beginning, we don't need to # add one, abort and not item[0].isspace() + # if the previous character should not have a space after it, abort + and not prev_item[-1] in NEVER_SPACE_AFTER + # if the next character should not have a space before it, abort + and not item[0] in NEVER_SPACE_BEFORE + # double colons do not get space as they're part of a call to a + # namespaced function + and not (item == "::" or prev_item == "::") + # if both the last character and the next character are + # IDENT_NO_SPACE characters, we don't want a space between them either + and not (prev_item[-1] in IDENT_NO_SPACE and item[0] in IDENT_NO_SPACE) # now the scenarios when we do not abort are and ( # scenario 1, the prev token ended with an identifier character - # and the next character is not an "IDENT_JOINER" character - (is_id_continue(prev_item[-1]) and not item[0] in IDENT_JOINERS) + # and the next character is not an "IDENT_NO_SPACE" character + (is_id_continue(prev_item[-1]) and not item[0] in IDENT_NO_SPACE) # scenario 2, the prev token ended with a character that should # be followed by a space - or (prev_item[-1] in SPACE_AFTER) + or ( + prev_item[-1] in CHAR_SPACE_AFTER + or prev_item in KEYWORDS_SPACE_AFTER + ) # scenario 3, the next token begins with a character that needs # a space preceeding it - or (item[0] in SPACE_BEFORE) + or (item[0] in CHAR_SPACE_BEFORE or item in KEYWORDS_SPACE_BEFORE) + # scenario 4, the previous token was a block opening brace and + # the next token is not a closing brace (so the block is on one + # line and not empty) + or (prev_item[-1] == "{" and item[0] != "}") ) ): yield " " - # print the actual token, and store it for the next iteraction - yield item + # in some cases, we may want to defer printing the next token + defer_item = False + + # prevent the inclusion of extra commas if they are not intended + if item[0] == ",": + item = ("_deferred", item) + defer_item = True + + # print the actual token + if not defer_item: + yield item + + # store the previous item for the next token prev_item = item + # if the last token was deferred, print it before continuing + if type(prev_item) == tuple and prev_item[0] == "_deferred": + yield prev_item[1] + class HCLReconstructor: def __init__(self, parser): self._recons = Reconstructor(parser) diff --git a/test/unit/test_reconstruct.py b/test/unit/test_reconstruct.py new file mode 100644 index 00000000..f03f6aa1 --- /dev/null +++ b/test/unit/test_reconstruct.py @@ -0,0 +1,105 @@ +""" Test reconstructing hcl files""" + +import json +from pathlib import Path +from unittest import TestCase + +import hcl2 + + +HELPERS_DIR = Path(__file__).absolute().parent.parent / "helpers" +HCL2_DIR = HELPERS_DIR / "terraform-config" +HCL2_FILES = [str(file.relative_to(HCL2_DIR)) for file in HCL2_DIR.iterdir()] +JSON_DIR = HELPERS_DIR / "terraform-config-json" + + +class TestReconstruct(TestCase): + """Test reconstructing a variety of hcl files""" + + # print any differences fully to the console + maxDiff = None + + def test_write_terraform(self): + """Test reconstructing a set of hcl2 files, to make sure they parse to the same structure""" + for hcl_path in HCL2_FILES: + yield self.check_terraform, hcl_path + + def test_write_terraform_exact(self): + """Test reconstructing a set of hcl2 files, to make sure they reconstruct exactly the same, including whitespace""" + + # the reconstruction process is not precise, so some files do not + # reconstruct their whitespace exactly the same, but they are + # syntactically equivalent. This list is a target for further + # improvements to the whitespace handling of the reconstruction + # algorithm. + INEXACT_FILES = [ + # the reconstructor loses commas on the last element in an array, + # even if they're in the input file + "iam.tf", + "variables.tf", + # the reconstructor doesn't preserve indentation within comments + # perfectly + "multiline_expressions.tf", + # the reconstructor doesn't preserve the line that a ternary is + # broken on. + "route_table.tf", + ] + + for hcl_path in HCL2_FILES: + if hcl_path not in INEXACT_FILES: + yield self.check_whitespace, hcl_path + + def check_terraform(self, hcl_path_str: str): + """Loads a single hcl2 file, parses it, reconstructs it, parses the reconstructed file, and compares with the expected json""" + hcl_path = (HCL2_DIR / hcl_path_str).absolute() + json_path = JSON_DIR / hcl_path.relative_to(HCL2_DIR).with_suffix(".json") + with hcl_path.open("r") as hcl_file, json_path.open("r") as json_file: + hcl_file_content = hcl_file.read() + try: + hcl_ast = hcl2.parses(hcl_file_content) + except Exception as exc: + assert False, f"failed to tokenize terraform in `{hcl_path_str}`: {exc}" + + try: + hcl_reconstructed = hcl2.writes(hcl_ast) + except Exception as exc: + assert ( + False + ), f"failed to reconstruct terraform in `{hcl_path_str}`: {exc}" + + try: + hcl2_dict = hcl2.loads(hcl_reconstructed) + except Exception as exc: + assert ( + False + ), f"failed to tokenize terraform in file reconstructed from `{hcl_path_str}`: {exc}" + + json_dict = json.load(json_file) + self.assertDictEqual( + hcl2_dict, + json_dict, + f"failed comparing {hcl_path_str} with reconstructed version", + ) + + def check_whitespace(self, hcl_path_str: str): + """Tests that the reconstructed file matches the original file exactly.""" + hcl_path = (HCL2_DIR / hcl_path_str).absolute() + with hcl_path.open("r") as hcl_file: + hcl_file_content = hcl_file.read() + try: + hcl_ast = hcl2.parses(hcl_file_content) + except Exception as exc: + assert False, f"failed to tokenize terraform in `{hcl_path_str}`: {exc}" + + try: + hcl_reconstructed = hcl2.writes(hcl_ast) + except Exception as exc: + assert ( + False + ), f"failed to reconstruct terraform in `{hcl_path_str}`: {exc}" + + self.assertMultiLineEqual( + hcl_reconstructed, + hcl_file_content, + f"file {hcl_path_str} does not match its reconstructed version exactly. this is usually whitespace related.", + ) From f746a256957f0c973dea824aaca5ec04d4ad70a1 Mon Sep 17 00:00:00 2001 From: Sam Weaver Date: Wed, 2 Oct 2024 14:58:03 -0400 Subject: [PATCH 08/14] Clean up a few things, move reconstructor to its own file --- hcl2/api.py | 3 +- hcl2/parser.py | 128 +---------------------------------------- hcl2/reconstructor.py | 131 ++++++++++++++++++++++++++++++++++++++++++ hcl2/transformer.py | 1 - 4 files changed, 134 insertions(+), 129 deletions(-) create mode 100644 hcl2/reconstructor.py diff --git a/hcl2/api.py b/hcl2/api.py index cb605e53..e41f1681 100644 --- a/hcl2/api.py +++ b/hcl2/api.py @@ -2,7 +2,8 @@ from typing import TextIO from lark.tree import Tree as AST -from hcl2.parser import hcl2, hcl2_reconstructor +from hcl2.parser import hcl2 +from hcl2.reconstructor import hcl2_reconstructor from hcl2.transformer import DictTransformer diff --git a/hcl2/parser.py b/hcl2/parser.py index 9194f5d6..8da9f9b6 100644 --- a/hcl2/parser.py +++ b/hcl2/parser.py @@ -2,8 +2,7 @@ from pathlib import Path from lark import Lark -from lark.reconstruct import Reconstructor -from lark.utils import is_id_continue + PARSER_FILE = Path(__file__).absolute().resolve().parent / ".lark_cache.bin" @@ -20,128 +19,3 @@ propagate_positions=True, maybe_placeholders=False, # Needed for reconstruction ) - -CHAR_SPACE_AFTER = set(',~@<>="|?)]:') -CHAR_SPACE_BEFORE = (CHAR_SPACE_AFTER - set(",=")) | set("'") -KEYWORDS_SPACE_AFTER = [ - "if", - "in", - "for", - "for_each", - "==", - "!=", - "<", - ">", - "<=", - ">=", - "-", - "*", - "/", - "%", - "&&", - "||", - "+", -] -KEYWORDS_SPACE_BEFORE = KEYWORDS_SPACE_AFTER -DIGITS = set("0123456789") -NEVER_SPACE_AFTER = set("[(") -NEVER_SPACE_BEFORE = set("]),.") -NEVER_COMMA_BEFORE = set("])}") -# characters that are OK to come right after an identifier with no space between -IDENT_NO_SPACE = set("()[]") - -def _postprocess_reconstruct(items): - """ - Postprocess the stream of tokens derived from the AST during reconstruction. - - For HCL2, this is used exclusively for adding whitespace in the right locations. - """ - prev_item = "" - for item in items: - # first, handle any deferred tokens - if type(prev_item) == tuple and prev_item[0] == "_deferred": - prev_item = prev_item[1] - - # if the deferred token was a comma, see if we're ending a block - if prev_item == ",": - if item[0] not in NEVER_COMMA_BEFORE: - yield prev_item - else: - yield prev_item - - # these rules apply if we need to add space prior to the current token - if ( - prev_item # make sure a previous item exists - and item # and that the current item isn't empty - # if the last character of the previous item is a space, we abort - and not prev_item[-1].isspace() - # if the previous character was a number and the next character is - # a number. we don't want to break up a numeric literal, abort - and not (prev_item[-1] in DIGITS and item[0] in DIGITS) - # if the next item has a space at the beginning, we don't need to - # add one, abort - and not item[0].isspace() - # if the previous character should not have a space after it, abort - and not prev_item[-1] in NEVER_SPACE_AFTER - # if the next character should not have a space before it, abort - and not item[0] in NEVER_SPACE_BEFORE - # double colons do not get space as they're part of a call to a - # namespaced function - and not (item == "::" or prev_item == "::") - # if both the last character and the next character are - # IDENT_NO_SPACE characters, we don't want a space between them either - and not (prev_item[-1] in IDENT_NO_SPACE and item[0] in IDENT_NO_SPACE) - # now the scenarios when we do not abort are - and ( - # scenario 1, the prev token ended with an identifier character - # and the next character is not an "IDENT_NO_SPACE" character - (is_id_continue(prev_item[-1]) and not item[0] in IDENT_NO_SPACE) - # scenario 2, the prev token ended with a character that should - # be followed by a space - or ( - prev_item[-1] in CHAR_SPACE_AFTER - or prev_item in KEYWORDS_SPACE_AFTER - ) - # scenario 3, the next token begins with a character that needs - # a space preceeding it - or (item[0] in CHAR_SPACE_BEFORE or item in KEYWORDS_SPACE_BEFORE) - # scenario 4, the previous token was a block opening brace and - # the next token is not a closing brace (so the block is on one - # line and not empty) - or (prev_item[-1] == "{" and item[0] != "}") - ) - ): - yield " " - - # in some cases, we may want to defer printing the next token - defer_item = False - - # prevent the inclusion of extra commas if they are not intended - if item[0] == ",": - item = ("_deferred", item) - defer_item = True - - # print the actual token - if not defer_item: - yield item - - # store the previous item for the next token - prev_item = item - - # if the last token was deferred, print it before continuing - if type(prev_item) == tuple and prev_item[0] == "_deferred": - yield prev_item[1] - -class HCLReconstructor: - def __init__(self, parser): - self._recons = Reconstructor(parser) - - def reconstruct(self, tree): - return self._recons.reconstruct( - tree, - _postprocess_reconstruct, - insert_spaces=False, - ) - - -hcl2_reconstructor = HCLReconstructor(hcl2) diff --git a/hcl2/reconstructor.py b/hcl2/reconstructor.py new file mode 100644 index 00000000..aabcbb1a --- /dev/null +++ b/hcl2/reconstructor.py @@ -0,0 +1,131 @@ +from lark.reconstruct import Reconstructor +from lark.utils import is_id_continue + +from hcl2.parser import hcl2 + +CHAR_SPACE_AFTER = set(',~@<>="|?)]:') +CHAR_SPACE_BEFORE = (CHAR_SPACE_AFTER - set(",=")) | set("'") +KEYWORDS_SPACE_AFTER = [ + "if", + "in", + "for", + "for_each", + "==", + "!=", + "<", + ">", + "<=", + ">=", + "-", + "*", + "/", + "%", + "&&", + "||", + "+", +] +KEYWORDS_SPACE_BEFORE = KEYWORDS_SPACE_AFTER +DIGITS = set("0123456789") +NEVER_SPACE_AFTER = set("[(") +NEVER_SPACE_BEFORE = set("]),.") +NEVER_COMMA_BEFORE = set("])}") +# characters that are OK to come right after an identifier with no space between +IDENT_NO_SPACE = set("()[]") + + +def _postprocess_reconstruct(items): + """ + Postprocess the stream of tokens derived from the AST during reconstruction. + + For HCL2, this is used exclusively for adding whitespace in the right locations. + """ + prev_item = "" + for item in items: + # first, handle any deferred tokens + if type(prev_item) == tuple and prev_item[0] == "_deferred": + prev_item = prev_item[1] + + # if the deferred token was a comma, see if we're ending a block + if prev_item == ",": + if item[0] not in NEVER_COMMA_BEFORE: + yield prev_item + else: + yield prev_item + + # these rules apply if we need to add space prior to the current token + if ( + prev_item # make sure a previous item exists + and item # and that the current item isn't empty + # if the last character of the previous item is a space, we abort + and not prev_item[-1].isspace() + # if the previous character was a number and the next character is + # a number. we don't want to break up a numeric literal, abort + and not (prev_item[-1] in DIGITS and item[0] in DIGITS) + # if the next item has a space at the beginning, we don't need to + # add one, abort + and not item[0].isspace() + # if the previous character should not have a space after it, abort + and not prev_item[-1] in NEVER_SPACE_AFTER + # if the next character should not have a space before it, abort + and not item[0] in NEVER_SPACE_BEFORE + # double colons do not get space as they're part of a call to a + # namespaced function + and not (item == "::" or prev_item == "::") + # if both the last character and the next character are + # IDENT_NO_SPACE characters, we don't want a space between them either + and not (prev_item[-1] in IDENT_NO_SPACE and item[0] in IDENT_NO_SPACE) + # now the scenarios when we do not abort are + and ( + # scenario 1, the prev token ended with an identifier character + # and the next character is not an "IDENT_NO_SPACE" character + (is_id_continue(prev_item[-1]) and not item[0] in IDENT_NO_SPACE) + # scenario 2, the prev token ended with a character that should + # be followed by a space + or ( + prev_item[-1] in CHAR_SPACE_AFTER + or prev_item in KEYWORDS_SPACE_AFTER + ) + # scenario 3, the next token begins with a character that needs + # a space preceeding it + or (item[0] in CHAR_SPACE_BEFORE or item in KEYWORDS_SPACE_BEFORE) + # scenario 4, the previous token was a block opening brace and + # the next token is not a closing brace (so the block is on one + # line and not empty) + or (prev_item[-1] == "{" and item[0] != "}") + ) + ): + yield " " + + # in some cases, we may want to defer printing the next token + defer_item = False + + # prevent the inclusion of extra commas if they are not intended + if item[0] == ",": + item = ("_deferred", item) + defer_item = True + + # print the actual token + if not defer_item: + yield item + + # store the previous item for the next token + prev_item = item + + # if the last token was deferred, print it before continuing + if type(prev_item) == tuple and prev_item[0] == "_deferred": + yield prev_item[1] + + +class HCLReconstructor: + def __init__(self, parser): + self._recons = Reconstructor(parser) + + def reconstruct(self, tree): + return self._recons.reconstruct( + tree, + _postprocess_reconstruct, + insert_spaces=False, + ) + + +hcl2_reconstructor = HCLReconstructor(hcl2) diff --git a/hcl2/transformer.py b/hcl2/transformer.py index 05e5cd07..866cef24 100644 --- a/hcl2/transformer.py +++ b/hcl2/transformer.py @@ -90,7 +90,6 @@ def tuple(self, args: List) -> List: return [self.to_string_dollar(arg) for arg in self.strip_new_line_tokens(args)] def object_elem(self, args: List) -> Dict: - print(args) # This returns a dict with a single key/value pair to make it easier to merge these # into a bigger dict that is returned by the "object" function key = self.strip_quotes(args[0]) From e2cbb1b349a9de286ac9bc7d808c938dbfd86fc8 Mon Sep 17 00:00:00 2001 From: Sam Weaver Date: Wed, 2 Oct 2024 15:03:33 -0400 Subject: [PATCH 09/14] Add a method to transform an AST into a dictionary after any modification --- hcl2/api.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hcl2/api.py b/hcl2/api.py index e41f1681..dd891abf 100644 --- a/hcl2/api.py +++ b/hcl2/api.py @@ -44,8 +44,15 @@ def parses(text: str) -> AST: return hcl2.parse(text) +def transform(ast: AST, with_meta=False) -> dict: + """Convert an HCL2 AST to a dictionary. + :param ast: HCL2 syntax tree, output from `parse` or `parses` + """ + return DictTransformer(with_meta=with_meta).transform(ast) + + def writes(ast: AST) -> str: """Convert an HCL2 syntax tree to a string. - :param ast: HCL2 syntax tree. + :param ast: HCL2 syntax tree, output from `parse` or `parses` """ return hcl2_reconstructor.reconstruct(ast) From afa6dfc5e371656f5cc3dcde522a8959fcc03b1d Mon Sep 17 00:00:00 2001 From: Sam Weaver Date: Wed, 2 Oct 2024 15:29:15 -0400 Subject: [PATCH 10/14] Update reconstructor to build a new parser so regular parsing methods can retain caching benefits --- hcl2/__init__.py | 2 +- hcl2/api.py | 11 +++++++++-- hcl2/parser.py | 7 +------ hcl2/reconstructor.py | 17 ++++++++++++++++- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/hcl2/__init__.py b/hcl2/__init__.py index e6fff56e..f56787b7 100644 --- a/hcl2/__init__.py +++ b/hcl2/__init__.py @@ -5,4 +5,4 @@ except ImportError: __version__ = "unknown" -from .api import load, loads, parse, parses, writes, AST +from .api import load, loads, parse, parses, transform, writes, AST diff --git a/hcl2/api.py b/hcl2/api.py index dd891abf..6b717d63 100644 --- a/hcl2/api.py +++ b/hcl2/api.py @@ -3,7 +3,6 @@ from lark.tree import Tree as AST from hcl2.parser import hcl2 -from hcl2.reconstructor import hcl2_reconstructor from hcl2.transformer import DictTransformer @@ -41,7 +40,11 @@ def parses(text: str) -> AST: """Load HCL2 syntax tree from a string. :param text: Text with hcl2 to be loaded as a dict. """ - return hcl2.parse(text) + # defer this import until this method is called, due to the performance hit + # of rebuilding the grammar without cache + from hcl2.reconstructor import hcl2 as uncached_hcl2 + + return uncached_hcl2.parse(text) def transform(ast: AST, with_meta=False) -> dict: @@ -55,4 +58,8 @@ def writes(ast: AST) -> str: """Convert an HCL2 syntax tree to a string. :param ast: HCL2 syntax tree, output from `parse` or `parses` """ + # defer this import until this method is called, due to the performance hit + # of rebuilding the grammar without cache + from hcl2.reconstructor import hcl2_reconstructor + return hcl2_reconstructor.reconstruct(ast) diff --git a/hcl2/parser.py b/hcl2/parser.py index 8da9f9b6..e6d810bd 100644 --- a/hcl2/parser.py +++ b/hcl2/parser.py @@ -10,12 +10,7 @@ hcl2 = Lark.open( "hcl2.lark", parser="lalr", - # Caching must be disabled to allow for reconstruction until lark-parser/lark#1472 is fixed: - # - # https://github.com/lark-parser/lark/issues/1472 - # - # cache=str(PARSER_FILE), # Disable/Delete file to effect changes to the grammar + cache=str(PARSER_FILE), # Disable/Delete file to effect changes to the grammar rel_to=__file__, propagate_positions=True, - maybe_placeholders=False, # Needed for reconstruction ) diff --git a/hcl2/reconstructor.py b/hcl2/reconstructor.py index aabcbb1a..45d9f4d5 100644 --- a/hcl2/reconstructor.py +++ b/hcl2/reconstructor.py @@ -1,7 +1,22 @@ +from lark import Lark from lark.reconstruct import Reconstructor from lark.utils import is_id_continue -from hcl2.parser import hcl2 +# this is duplicated from `parser` because we need different options here for +# the reconstructor. please make sure changes are kept in sync between the two +# if necessary. +hcl2 = Lark.open( + "hcl2.lark", + parser="lalr", + # Caching must be disabled to allow for reconstruction until lark-parser/lark#1472 is fixed: + # + # https://github.com/lark-parser/lark/issues/1472 + # + # cache=str(PARSER_FILE), # Disable/Delete file to effect changes to the grammar + rel_to=__file__, + propagate_positions=True, + maybe_placeholders=False, # Needed for reconstruction +) CHAR_SPACE_AFTER = set(',~@<>="|?)]:') CHAR_SPACE_BEFORE = (CHAR_SPACE_AFTER - set(",=")) | set("'") From d8c5def14c175e967c6fa38d371afe8db99e9541 Mon Sep 17 00:00:00 2001 From: Sam Weaver Date: Wed, 2 Oct 2024 17:01:01 -0400 Subject: [PATCH 11/14] Fix pylint warnings --- hcl2/api.py | 8 ++- hcl2/reconstructor.py | 104 ++++++++++++++++++++-------------- test/unit/test_reconstruct.py | 17 ++++-- 3 files changed, 78 insertions(+), 51 deletions(-) diff --git a/hcl2/api.py b/hcl2/api.py index 6b717d63..4a3ec106 100644 --- a/hcl2/api.py +++ b/hcl2/api.py @@ -42,7 +42,9 @@ def parses(text: str) -> AST: """ # defer this import until this method is called, due to the performance hit # of rebuilding the grammar without cache - from hcl2.reconstructor import hcl2 as uncached_hcl2 + from hcl2.reconstructor import ( # pylint: disable=import-outside-toplevel + hcl2 as uncached_hcl2, + ) return uncached_hcl2.parse(text) @@ -60,6 +62,8 @@ def writes(ast: AST) -> str: """ # defer this import until this method is called, due to the performance hit # of rebuilding the grammar without cache - from hcl2.reconstructor import hcl2_reconstructor + from hcl2.reconstructor import ( # pylint: disable=import-outside-toplevel + hcl2_reconstructor, + ) return hcl2_reconstructor.reconstruct(ast) diff --git a/hcl2/reconstructor.py b/hcl2/reconstructor.py index 45d9f4d5..f6bbb9b7 100644 --- a/hcl2/reconstructor.py +++ b/hcl2/reconstructor.py @@ -1,3 +1,5 @@ +"""A reconstructor for HCL2 implemented using Lark's experimental reconstruction functionality""" + from lark import Lark from lark.reconstruct import Reconstructor from lark.utils import is_id_continue @@ -48,6 +50,57 @@ IDENT_NO_SPACE = set("()[]") +def _add_extra_space(prev_item, item): + # pylint: disable=too-many-boolean-expressions, too-many-return-statements + + ##### the scenarios where explicitly disallow spaces: ##### + + # if we already have a space, don't add another + if prev_item[-1].isspace() or item[0].isspace(): + return False + + # none of the following should be separated by spaces: + # - groups of digits + # - namespaced::function::calls + # - characters within an identifier like array[0]() + if ( + (prev_item[-1] in DIGITS and item[0] in DIGITS) + or item == "::" + or prev_item == "::" + or (prev_item[-1] in IDENT_NO_SPACE and item[0] in IDENT_NO_SPACE) + ): + return False + + # specific characters are also blocklisted from having spaces + if prev_item[-1] in NEVER_SPACE_AFTER or item[0] in NEVER_SPACE_BEFORE: + return False + + ##### the scenarios where we add spaces: ##### + + # scenario 1, the prev token ended with an identifier character + # and the next character is not an "IDENT_NO_SPACE" character + if is_id_continue(prev_item[-1]) and not item[0] in IDENT_NO_SPACE: + return True + + # scenario 2, the prev token or the next token should be followed by a space + if ( + prev_item[-1] in CHAR_SPACE_AFTER + or prev_item in KEYWORDS_SPACE_AFTER + or item[0] in CHAR_SPACE_BEFORE + or item in KEYWORDS_SPACE_BEFORE + ): + return True + + # scenario 3, the previous token was a block opening brace and + # the next token is not a closing brace (so the block is on one + # line and not empty) + if prev_item[-1] == "{" and item[0] != "}": + return True + + ##### otherwise, we don't add a space ##### + return False + + def _postprocess_reconstruct(items): """ Postprocess the stream of tokens derived from the AST during reconstruction. @@ -57,7 +110,7 @@ def _postprocess_reconstruct(items): prev_item = "" for item in items: # first, handle any deferred tokens - if type(prev_item) == tuple and prev_item[0] == "_deferred": + if isinstance(prev_item, tuple) and prev_item[0] == "_deferred": prev_item = prev_item[1] # if the deferred token was a comma, see if we're ending a block @@ -67,48 +120,9 @@ def _postprocess_reconstruct(items): else: yield prev_item - # these rules apply if we need to add space prior to the current token - if ( - prev_item # make sure a previous item exists - and item # and that the current item isn't empty - # if the last character of the previous item is a space, we abort - and not prev_item[-1].isspace() - # if the previous character was a number and the next character is - # a number. we don't want to break up a numeric literal, abort - and not (prev_item[-1] in DIGITS and item[0] in DIGITS) - # if the next item has a space at the beginning, we don't need to - # add one, abort - and not item[0].isspace() - # if the previous character should not have a space after it, abort - and not prev_item[-1] in NEVER_SPACE_AFTER - # if the next character should not have a space before it, abort - and not item[0] in NEVER_SPACE_BEFORE - # double colons do not get space as they're part of a call to a - # namespaced function - and not (item == "::" or prev_item == "::") - # if both the last character and the next character are - # IDENT_NO_SPACE characters, we don't want a space between them either - and not (prev_item[-1] in IDENT_NO_SPACE and item[0] in IDENT_NO_SPACE) - # now the scenarios when we do not abort are - and ( - # scenario 1, the prev token ended with an identifier character - # and the next character is not an "IDENT_NO_SPACE" character - (is_id_continue(prev_item[-1]) and not item[0] in IDENT_NO_SPACE) - # scenario 2, the prev token ended with a character that should - # be followed by a space - or ( - prev_item[-1] in CHAR_SPACE_AFTER - or prev_item in KEYWORDS_SPACE_AFTER - ) - # scenario 3, the next token begins with a character that needs - # a space preceeding it - or (item[0] in CHAR_SPACE_BEFORE or item in KEYWORDS_SPACE_BEFORE) - # scenario 4, the previous token was a block opening brace and - # the next token is not a closing brace (so the block is on one - # line and not empty) - or (prev_item[-1] == "{" and item[0] != "}") - ) - ): + # if we're between two tokens, determine if we need to add an extra space + # we need the previous item and the current item to exist to evaluate these rules + if prev_item and item and _add_extra_space(prev_item, item): yield " " # in some cases, we may want to defer printing the next token @@ -127,15 +141,17 @@ def _postprocess_reconstruct(items): prev_item = item # if the last token was deferred, print it before continuing - if type(prev_item) == tuple and prev_item[0] == "_deferred": + if isinstance(prev_item, tuple) and prev_item[0] == "_deferred": yield prev_item[1] class HCLReconstructor: + """This class converts a Lark.Tree AST back into a string representing the underlying HCL code.""" def __init__(self, parser): self._recons = Reconstructor(parser) def reconstruct(self, tree): + """Convert a Lark.Tree AST back into a string representation of HCL.""" return self._recons.reconstruct( tree, _postprocess_reconstruct, diff --git a/test/unit/test_reconstruct.py b/test/unit/test_reconstruct.py index f03f6aa1..b9545def 100644 --- a/test/unit/test_reconstruct.py +++ b/test/unit/test_reconstruct.py @@ -25,14 +25,17 @@ def test_write_terraform(self): yield self.check_terraform, hcl_path def test_write_terraform_exact(self): - """Test reconstructing a set of hcl2 files, to make sure they reconstruct exactly the same, including whitespace""" + """ + Test reconstructing a set of hcl2 files, to make sure they + reconstruct exactly the same, including whitespace. + """ # the reconstruction process is not precise, so some files do not # reconstruct their whitespace exactly the same, but they are # syntactically equivalent. This list is a target for further # improvements to the whitespace handling of the reconstruction # algorithm. - INEXACT_FILES = [ + inexact_files = [ # the reconstructor loses commas on the last element in an array, # even if they're in the input file "iam.tf", @@ -46,11 +49,14 @@ def test_write_terraform_exact(self): ] for hcl_path in HCL2_FILES: - if hcl_path not in INEXACT_FILES: + if hcl_path not in inexact_files: yield self.check_whitespace, hcl_path def check_terraform(self, hcl_path_str: str): - """Loads a single hcl2 file, parses it, reconstructs it, parses the reconstructed file, and compares with the expected json""" + """ + Loads a single hcl2 file, parses it, reconstructs it, + parses the reconstructed file, and compares with the expected json + """ hcl_path = (HCL2_DIR / hcl_path_str).absolute() json_path = JSON_DIR / hcl_path.relative_to(HCL2_DIR).with_suffix(".json") with hcl_path.open("r") as hcl_file, json_path.open("r") as json_file: @@ -101,5 +107,6 @@ def check_whitespace(self, hcl_path_str: str): self.assertMultiLineEqual( hcl_reconstructed, hcl_file_content, - f"file {hcl_path_str} does not match its reconstructed version exactly. this is usually whitespace related.", + f"file {hcl_path_str} does not match its reconstructed version \ + exactly. this is usually whitespace related.", ) From b4d59e4ad29435001bed3286f07b3ed5cea7b7cf Mon Sep 17 00:00:00 2001 From: Sam Weaver Date: Thu, 3 Oct 2024 15:09:10 -0400 Subject: [PATCH 12/14] Give terminals names to make the AST more readable, uncomment caching based tests --- hcl2/hcl2.lark | 9 ++++++--- test/unit/test_load.py | 20 ++++++-------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/hcl2/hcl2.lark b/hcl2/hcl2.lark index 1f964563..4a4489a7 100644 --- a/hcl2/hcl2.lark +++ b/hcl2/hcl2.lark @@ -3,9 +3,11 @@ body : (new_line_or_comment? (attribute | block))* new_line_or_comment? attribute : identifier EQ expression block : identifier (identifier | STRING_LIT)* new_line_or_comment? "{" body "}" new_line_and_or_comma: new_line_or_comment | "," | "," new_line_or_comment -new_line_or_comment: ( /\n[ \t]*/ | /#.*\n/ | /\/\/.*\n/ | /\/\*(.|\n)*?(\*\/)/ )+ +new_line_or_comment: ( NL_OR_COMMENT )+ +NL_OR_COMMENT: /\n[ \t]*/ | /#.*\n/ | /\/\/.*\n/ | /\/\*(.|\n)*?(\*\/)/ -identifier : /[a-zA-Z_][a-zA-Z0-9_-]*/ | IN | FOR | IF | FOR_EACH +identifier : NAME | IN | FOR | IF | FOR_EACH +NAME : /[a-zA-Z_][a-zA-Z0-9_-]*/ IF : "if" IN : "in" FOR : "for" @@ -18,8 +20,9 @@ conditional : expression "?" new_line_or_comment? expression new_line_or_comment ?operation : unary_op | binary_op !unary_op : ("-" | "!") expr_term binary_op : expression binary_term new_line_or_comment? -!binary_operator : "==" | "!=" | "<" | ">" | "<=" | ">=" | "-" | "*" | "/" | "%" | "&&" | "||" | "+" +!binary_operator : BINARY_OP binary_term : binary_operator new_line_or_comment? expression +BINARY_OP : "==" | "!=" | "<" | ">" | "<=" | ">=" | "-" | "*" | "/" | "%" | "&&" | "||" | "+" expr_term : "(" new_line_or_comment? expression new_line_or_comment? ")" | float_lit diff --git a/test/unit/test_load.py b/test/unit/test_load.py index 762412b4..d21ca02f 100644 --- a/test/unit/test_load.py +++ b/test/unit/test_load.py @@ -19,23 +19,15 @@ class TestLoad(TestCase): def test_load_terraform(self): """Test parsing a set of hcl2 files and force recreating the parser file""" - # Caching must be disabled to allow for reconstruction until lark-parser/lark#1472 is fixed: - # - # https://github.com/lark-parser/lark/issues/1472 - # - # # delete the parser file to force it to be recreated - # PARSER_FILE.unlink() + # delete the parser file to force it to be recreated + PARSER_FILE.unlink() for hcl_path in HCL2_FILES: yield self.check_terraform, hcl_path - # Caching must be disabled to allow for reconstruction until lark-parser/lark#1472 is fixed: - # - # https://github.com/lark-parser/lark/issues/1472 - # - # def test_load_terraform_from_cache(self): - # """Test parsing a set of hcl2 files from a cached parser file""" - # for hcl_path in HCL2_FILES: - # yield self.check_terraform, hcl_path + def test_load_terraform_from_cache(self): + """Test parsing a set of hcl2 files from a cached parser file""" + for hcl_path in HCL2_FILES: + yield self.check_terraform, hcl_path def check_terraform(self, hcl_path_str: str): """Loads a single hcl2 file, parses it and compares with the expected json""" From 6dfde1ea2b09946022dff84e2849ea5d791a7d86 Mon Sep 17 00:00:00 2001 From: Kamil Kozik <115631738+kkozik-amplify@users.noreply.github.com> Date: Mon, 7 Oct 2024 16:41:44 +0200 Subject: [PATCH 13/14] Add newline to hcl2.lark --- hcl2/hcl2.lark | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hcl2/hcl2.lark b/hcl2/hcl2.lark index 4a4489a7..d26acc1b 100644 --- a/hcl2/hcl2.lark +++ b/hcl2/hcl2.lark @@ -82,4 +82,4 @@ full_splat : "[*]" (get_attr | index)* !for_intro : "for" new_line_or_comment? identifier ("," identifier new_line_or_comment?)? new_line_or_comment? "in" new_line_or_comment? expression new_line_or_comment? ":" new_line_or_comment? !for_cond : "if" new_line_or_comment? expression -%ignore /[ \t]+/ \ No newline at end of file +%ignore /[ \t]+/ From d3f5e9537dab6adf149722a55b94991e3850fb8c Mon Sep 17 00:00:00 2001 From: Kamil Kozik <115631738+kkozik-amplify@users.noreply.github.com> Date: Mon, 7 Oct 2024 16:41:57 +0200 Subject: [PATCH 14/14] Add newline to .gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index aea11ed0..75af5e4b 100644 --- a/.gitignore +++ b/.gitignore @@ -123,4 +123,4 @@ lark_parser.py .lark_cache.bin # ASDF tool-versions file -.tool-versions \ No newline at end of file +.tool-versions