From ca8c74fcee1b669b339cab17ab8550e8a7f1cb0f Mon Sep 17 00:00:00 2001 From: Daniel Harding Date: Tue, 28 Mar 2023 16:07:23 +0300 Subject: [PATCH 1/3] Strip comments from a token list before sublists Avoid stripping T.Comment tokens contained within an sql.Comment before stripping the sql.Comment itself. Now an sql.Comment token will be stripped first along with any contained T.Comment tokens. --- sqlparse/filters/others.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqlparse/filters/others.py b/sqlparse/filters/others.py index 6c1680ec..63d8d76d 100644 --- a/sqlparse/filters/others.py +++ b/sqlparse/filters/others.py @@ -74,8 +74,8 @@ def _get_insert_token(token): tidx, token = get_next_comment(idx=tidx) def process(self, stmt): - [self.process(sgroup) for sgroup in stmt.get_sublists()] StripCommentsFilter._process(stmt) + [self.process(sgroup) for sgroup in stmt.get_sublists()] return stmt From 8d66db342bb57b9bb4b656d727146adb2152e5c2 Mon Sep 17 00:00:00 2001 From: Daniel Harding Date: Wed, 12 May 2021 14:42:11 +0300 Subject: [PATCH 2/3] Restructure token class hierarchy. Rename Token to TokenBase and make it a superclass for TokenList and a new Token class. Move some of the functionality of TokenBase into Token and TokenList. This will make it easier to maintain separate functionality for Token versus TokenList. --- sqlparse/sql.py | 93 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 32 deletions(-) diff --git a/sqlparse/sql.py b/sqlparse/sql.py index be74694c..81339f3a 100644 --- a/sqlparse/sql.py +++ b/sqlparse/sql.py @@ -36,30 +36,17 @@ def get_alias(self): return self._get_first_name(reverse=True) -class Token: - """Base class for all other classes in this module. +class TokenBase: + """Base class for ``Token`` and ``TokenList``. - It represents a single token and has two instance attributes: - ``value`` is the unchanged value of the token and ``ttype`` is - the type of the token. 
+ It has a single instance attribute, ``parent``, which if not ``None`` + represents the ``TokenList`` that contains this token. """ - __slots__ = ('value', 'ttype', 'parent', 'normalized', 'is_keyword', - 'is_group', 'is_whitespace', 'is_newline') + __slots__ = 'parent' - def __init__(self, ttype, value): - value = str(value) - self.value = value - self.ttype = ttype + def __init__(self): self.parent = None - self.is_group = False - self.is_keyword = ttype in T.Keyword - self.is_whitespace = self.ttype in T.Whitespace - self.is_newline = self.ttype in T.Newline - self.normalized = value.upper() if self.is_keyword else value - - def __str__(self): - return self.value # Pending tokenlist __len__ bug fix # def __len__(self): @@ -73,19 +60,12 @@ def __repr__(self): return "<{cls} {q}{value}{q} at 0x{id:2X}>".format( id=id(self), **locals()) - def _get_repr_name(self): - return str(self.ttype).split('.')[-1] - def _get_repr_value(self): raw = str(self) if len(raw) > 7: raw = raw[:6] + '...' return re.sub(r'\s+', ' ', raw) - def flatten(self): - """Resolve subgroups.""" - yield self - def match(self, ttype, values, regex=False): """Checks whether the token matches the given arguments. @@ -147,24 +127,73 @@ def has_ancestor(self, other): return False -class TokenList(Token): +class Token(TokenBase): + """A single token. 
+ + It has some additional instance attributes: + ``value`` is the unchanged value of the token + ``ttype`` is the type of the token + ``normalized`` is the value of the token, converted to uppercase if it + is a keyword + ``is_keyword`` is a boolean indicating if the token is a keyword + ``is_whitespace`` is a boolean indicating if the token is whitespace + ``is_newline`` is a boolean indicating if the token is a newline + character + """ + __slots__ = ('value', 'ttype', 'normalized', 'is_keyword', 'is_whitespace', + 'is_newline') + + is_group = False + + def __init__(self, ttype, value): + super().__init__() + value = str(value) + self.value = value + self.ttype = ttype + self.is_keyword = ttype in T.Keyword + self.is_whitespace = ttype in T.Whitespace + self.is_newline = self.ttype in T.Newline + self.normalized = value.upper() if self.is_keyword else value + + def __str__(self): + return self.value + + def _get_repr_name(self): + return str(self.ttype).split('.')[-1] + + def flatten(self): + """Resolve subgroups.""" + yield self + + +class TokenList(TokenBase): """A group of tokens. - It has an additional instance attribute ``tokens`` which holds a - list of child-tokens. + It has two additional instance attributes, ``value``, which is the value of + the token list, and ``tokens``, which holds a list of child-tokens. 
""" - __slots__ = 'tokens' + __slots__ = ('tokens', 'value') + + is_group = True + ttype = None + is_keyword = False + is_whitespace = False + is_newline = False def __init__(self, tokens=None): + super().__init__() self.tokens = tokens or [] + self.value = str(self) [setattr(token, 'parent', self) for token in self.tokens] - super().__init__(None, str(self)) - self.is_group = True def __str__(self): return ''.join(token.value for token in self.flatten()) + @property + def normalized(self): + return self.value + # weird bug # def __len__(self): # return len(self.tokens) From 2da3e63737577c264bbf633f9318f373101a6159 Mon Sep 17 00:00:00 2001 From: Daniel Harding Date: Wed, 12 May 2021 14:56:38 +0300 Subject: [PATCH 3/3] Make TokenList.value a property not an attribute. The fact that a new value was being computed each time TokenList.group_tokens() was called caused supra-linear runtime when token grouping was enabled. Address by making TokenList.value a dynamically-computed property rather than a static attribute. --- sqlparse/sql.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sqlparse/sql.py b/sqlparse/sql.py index 81339f3a..a0ce96d5 100644 --- a/sqlparse/sql.py +++ b/sqlparse/sql.py @@ -48,6 +48,9 @@ class TokenBase: def __init__(self): self.parent = None + def __str__(self): + return self.value + # Pending tokenlist __len__ bug fix # def __len__(self): # return len(self.value) @@ -155,9 +158,6 @@ def __init__(self, ttype, value): self.is_newline = self.ttype in T.Newline self.normalized = value.upper() if self.is_keyword else value - def __str__(self): - return self.value - def _get_repr_name(self): return str(self.ttype).split('.')[-1] @@ -169,11 +169,11 @@ def flatten(self): class TokenList(TokenBase): """A group of tokens. - It has two additional instance attributes, ``value``, which is the value of - the token list, and ``tokens``, which holds a list of child-tokens. 
+ It has an additional instance attribute ``tokens`` which holds a + list of child-tokens. """ - __slots__ = ('tokens', 'value') + __slots__ = 'tokens' is_group = True ttype = None @@ -184,10 +184,10 @@ class TokenList(TokenBase): def __init__(self, tokens=None): super().__init__() self.tokens = tokens or [] - self.value = str(self) [setattr(token, 'parent', self) for token in self.tokens] - def __str__(self): + @property + def value(self): return ''.join(token.value for token in self.flatten()) @property @@ -352,7 +352,6 @@ def group_tokens(self, grp_cls, start, end, include_end=True, grp = start grp.tokens.extend(subtokens) del self.tokens[start_idx + 1:end_idx] - grp.value = str(start) else: subtokens = self.tokens[start_idx:end_idx] grp = grp_cls(subtokens)