Skip to content

Commit e6995c9

Browse files
committed
Allow extracting deeply nested calls in Python
Currently the Python extractor does not support deeply nested gettext calls (i.e. calls nested deeper than a direct argument of the top-level gettext call), e.g.

```py
_("Hello %s", _("Person"))
_("Hello %s", random_function(", ".join([_("Person 1"), _("Person 2")])))
```

The extraction code was refactored quite a bit to simplify the flow and support this use case.

Fixes #1125 (and, in passing, also fixes #1123).
1 parent f91754b commit e6995c9

File tree

2 files changed

+148
-94
lines changed

2 files changed

+148
-94
lines changed

babel/messages/extract.py

+115-81
Original file line numberDiff line numberDiff line change
@@ -502,14 +502,6 @@ def extract_python(
502502
:param options: a dictionary of additional options (optional)
503503
:rtype: ``iterator``
504504
"""
505-
funcname = lineno = message_lineno = None
506-
call_stack = -1
507-
buf = []
508-
messages = []
509-
translator_comments = []
510-
in_def = in_translator_comments = False
511-
comment_tag = None
512-
513505
encoding = parse_encoding(fileobj) or options.get('encoding', 'UTF-8')
514506
future_flags = parse_future_flags(fileobj, encoding)
515507
next_line = lambda: fileobj.readline().decode(encoding)
@@ -520,103 +512,145 @@ def extract_python(
520512
# currently parsing one.
521513
current_fstring_start = None
522514

523-
for tok, value, (lineno, _), _, _ in tokens:
524-
if call_stack == -1 and tok == NAME and value in ('def', 'class'):
515+
# Keep the stack of all function calls and their related contextual variables,
516+
# so we can handle nested gettext calls.
517+
function_stack = []
518+
# Keep the last encountered function name for when we encounter
519+
# an opening parenthesis
520+
last_function_name = None
521+
# Keep track of whether we're in a class or function definition
522+
in_def = False
523+
# Keep track of whether we're in a block of translator comments
524+
in_translator_comments = False
525+
# Keep track of the last encountered translator comments
526+
translator_comments = []
527+
# Keep track of the (split) strings encountered
528+
message_buffer = []
529+
530+
for token, value, (line_no, _), _, _ in tokens:
531+
if not function_stack and token == NAME and value in ('def', 'class'):
532+
# We're entering a class or function definition
525533
in_def = True
526-
elif tok == OP and value == '(':
527-
if in_def:
528-
# Avoid false positives for declarations such as:
529-
# def gettext(arg='message'):
530-
in_def = False
531-
continue
532-
if funcname:
533-
message_lineno = lineno
534-
call_stack += 1
535-
elif in_def and tok == OP and value == ':':
536-
# End of a class definition without parens
534+
535+
elif in_def and token == OP and value in ('(', ':'):
536+
# We're in a class or function definition and should not do anything
537537
in_def = False
538538
continue
539-
elif call_stack == -1 and tok == COMMENT:
539+
540+
elif token == OP and value == '(' and last_function_name:
541+
# We're entering a function call
542+
cur_translator_comments = translator_comments
543+
if function_stack and function_stack[-1]['function_line_no'] == line_no:
544+
# If our current function call is on the same line as the previous one,
545+
# copy their translator comments, since they also apply to us.
546+
cur_translator_comments = function_stack[-1]['translator_comments']
547+
548+
# We add all information needed later for the current function call
549+
function_stack.append({
550+
'function_line_no': line_no,
551+
'function_name': last_function_name,
552+
'message_line_no': None,
553+
'messages': [],
554+
'translator_comments': cur_translator_comments,
555+
})
556+
translator_comments = []
557+
558+
elif token == COMMENT:
540559
# Strip the comment token from the line
541560
value = value[1:].strip()
542-
if in_translator_comments and \
543-
translator_comments[-1][0] == lineno - 1:
561+
if in_translator_comments and translator_comments[-1][0] == line_no - 1:
544562
# We're already inside a translator comment, continue appending
545-
translator_comments.append((lineno, value))
563+
translator_comments.append((line_no, value))
546564
continue
547-
# If execution reaches this point, let's see if comment line
548-
# starts with one of the comment tags
565+
549566
for comment_tag in comment_tags:
550567
if value.startswith(comment_tag):
568+
# Comment starts with one of the comment tags,
569+
# so let's start capturing it
551570
in_translator_comments = True
552-
translator_comments.append((lineno, value))
571+
translator_comments.append((line_no, value))
553572
break
554-
elif funcname and call_stack == 0:
555-
nested = (tok == NAME and value in keywords)
556-
if (tok == OP and value == ')') or nested:
557-
if buf:
558-
messages.append(''.join(buf))
559-
del buf[:]
573+
574+
elif function_stack and function_stack[-1]['function_name'] in keywords:
575+
# We're inside a translation function call
576+
if token == OP and value == ')':
577+
# The call has ended, so we yield the translatable term(s)
578+
messages = function_stack[-1]['messages']
579+
line_no = (
580+
function_stack[-1]['message_line_no']
581+
or function_stack[-1]['function_line_no']
582+
)
583+
cur_translator_comments = function_stack[-1]['translator_comments']
584+
585+
if message_buffer:
586+
messages.append(''.join(message_buffer))
587+
message_buffer.clear()
560588
else:
561589
messages.append(None)
562590

563591
messages = tuple(messages) if len(messages) > 1 else messages[0]
564-
# Comments don't apply unless they immediately
565-
# precede the message
566-
if translator_comments and \
567-
translator_comments[-1][0] < message_lineno - 1:
568-
translator_comments = []
569-
570-
yield (message_lineno, funcname, messages,
571-
[comment[1] for comment in translator_comments])
572-
573-
funcname = lineno = message_lineno = None
574-
call_stack = -1
575-
messages = []
576-
translator_comments = []
577-
in_translator_comments = False
578-
if nested:
579-
funcname = value
580-
elif tok == STRING:
581-
val = _parse_python_string(value, encoding, future_flags)
582-
if val is not None:
583-
buf.append(val)
592+
if (
593+
cur_translator_comments
594+
and cur_translator_comments[-1][0] < line_no - 1
595+
):
596+
# The translator comments are not immediately preceding the current
597+
# term, so we skip them.
598+
cur_translator_comments = []
599+
600+
yield (
601+
line_no,
602+
function_stack[-1]['function_name'],
603+
messages,
604+
[comment[1] for comment in cur_translator_comments],
605+
)
606+
607+
function_stack.pop()
608+
609+
elif token == STRING:
610+
# We've encountered a string inside a translation function call
611+
string_value = _parse_python_string(value, encoding, future_flags)
612+
if not function_stack[-1]['message_line_no']:
613+
function_stack[-1]['message_line_no'] = line_no
614+
if string_value is not None:
615+
message_buffer.append(string_value)
584616

585617
# Python 3.12+, see https://peps.python.org/pep-0701/#new-tokens
586-
elif tok == FSTRING_START:
618+
elif token == FSTRING_START:
587619
current_fstring_start = value
588-
elif tok == FSTRING_MIDDLE:
620+
elif token == FSTRING_MIDDLE:
589621
if current_fstring_start is not None:
590622
current_fstring_start += value
591-
elif tok == FSTRING_END:
623+
elif token == FSTRING_END:
592624
if current_fstring_start is not None:
593625
fstring = current_fstring_start + value
594-
val = _parse_python_string(fstring, encoding, future_flags)
595-
if val is not None:
596-
buf.append(val)
597-
598-
elif tok == OP and value == ',':
599-
if buf:
600-
messages.append(''.join(buf))
601-
del buf[:]
626+
string_value = _parse_python_string(fstring, encoding, future_flags)
627+
if string_value is not None:
628+
message_buffer.append(string_value)
629+
630+
elif token == OP and value == ',':
631+
# End of a function call argument
632+
if message_buffer:
633+
function_stack[-1]['messages'].append(''.join(message_buffer))
634+
message_buffer.clear()
602635
else:
603-
messages.append(None)
604-
if translator_comments:
605-
# We have translator comments, and since we're on a
606-
# comma(,) user is allowed to break into a new line
607-
# Let's increase the last comment's lineno in order
608-
# for the comment to still be a valid one
609-
old_lineno, old_comment = translator_comments.pop()
610-
translator_comments.append((old_lineno + 1, old_comment))
611-
elif call_stack > 0 and tok == OP and value == ')':
612-
call_stack -= 1
613-
elif funcname and call_stack == -1:
614-
funcname = None
615-
elif tok == NAME and value in keywords:
616-
funcname = value
636+
function_stack[-1]['messages'].append(None)
637+
638+
elif function_stack and token == OP and value == ')':
639+
function_stack.pop()
640+
641+
if in_translator_comments and translator_comments[-1][0] < line_no:
642+
# We have a newline in between the comments, so they don't belong
643+
# together anymore
644+
in_translator_comments = False
645+
646+
if token == NAME:
647+
last_function_name = value
648+
if function_stack and not function_stack[-1]['message_line_no']:
649+
function_stack[-1]['message_line_no'] = line_no
617650

618-
if (current_fstring_start is not None
619-
and tok not in {FSTRING_START, FSTRING_MIDDLE}
651+
if (
652+
current_fstring_start is not None
653+
and token not in {FSTRING_START, FSTRING_MIDDLE}
620654
):
621655
# In Python 3.12, tokens other than FSTRING_* mean the
622656
# f-string is dynamic, so we don't want to extract it.

tests/messages/test_extract.py

+33-13
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,10 @@ def test_comments_with_calls_that_spawn_multiple_lines(self):
9797
messages = list(extract.extract_python(buf, ('ngettext', '_'), ['NOTE:'],
9898

9999
{'strip_comment_tags': False}))
100-
assert messages[0] == (3, 'ngettext', ('Catalog deleted.', 'Catalogs deleted.', None), ['NOTE: This Comment SHOULD Be Extracted'])
100+
assert messages[0] == (2, 'ngettext', ('Catalog deleted.', 'Catalogs deleted.', None), ['NOTE: This Comment SHOULD Be Extracted'])
101101
assert messages[1] == (6, '_', 'Locale deleted.', ['NOTE: This Comment SHOULD Be Extracted'])
102102
assert messages[2] == (10, 'ngettext', ('Foo deleted.', 'Foos deleted.', None), ['NOTE: This Comment SHOULD Be Extracted'])
103-
assert messages[3] == (15, 'ngettext', ('Bar deleted.', 'Bars deleted.', None), ['NOTE: This Comment SHOULD Be Extracted', 'NOTE: And This One Too'])
103+
assert messages[3] == (14, 'ngettext', ('Bar deleted.', 'Bars deleted.', None), ['NOTE: This Comment SHOULD Be Extracted', 'NOTE: And This One Too'])
104104

105105
def test_declarations(self):
106106
buf = BytesIO(b"""\
@@ -422,24 +422,44 @@ def test_nested_messages(self):
422422
# NOTE: Third
423423
_(u'Hello, {0} and {1}!', _(u'Heungsub'),
424424
_(u'Armin'))
425+
426+
# NOTE: Fourth
427+
_("Hello %(person)", person=random_function(_("Person")))
428+
429+
# NOTE: Fifth
430+
_("Hello %(people)",
431+
person=random_function(
432+
", ".join([_("Person 1"), _("Person 2")])
433+
)
434+
)
425435
""")
426436
messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
427-
assert messages[0][2] == ('Hello, {name}!', None)
437+
assert messages[0][2] == 'Foo Bar'
428438
assert messages[0][3] == ['NOTE: First']
429-
assert messages[1][2] == 'Foo Bar'
430-
assert messages[1][3] == []
431-
assert messages[2][2] == ('Hello, {name1} and {name2}!', None)
439+
assert messages[1][2] == ('Hello, {name}!', None)
440+
assert messages[1][3] == ['NOTE: First']
441+
assert messages[2][2] == 'Heungsub'
432442
assert messages[2][3] == ['NOTE: Second']
433-
assert messages[3][2] == 'Heungsub'
443+
assert messages[3][2] == 'Armin'
434444
assert messages[3][3] == []
435-
assert messages[4][2] == 'Armin'
436-
assert messages[4][3] == []
437-
assert messages[5][2] == ('Hello, {0} and {1}!', None)
445+
assert messages[4][2] == ('Hello, {name1} and {name2}!', None, None)
446+
assert messages[4][3] == ['NOTE: Second']
447+
assert messages[5][2] == 'Heungsub'
438448
assert messages[5][3] == ['NOTE: Third']
439-
assert messages[6][2] == 'Heungsub'
449+
assert messages[6][2] == 'Armin'
440450
assert messages[6][3] == []
441-
assert messages[7][2] == 'Armin'
442-
assert messages[7][3] == []
451+
assert messages[7][2] == ('Hello, {0} and {1}!', None, None)
452+
assert messages[7][3] == ['NOTE: Third']
453+
assert messages[8][2] == 'Person'
454+
assert messages[8][3] == ['NOTE: Fourth']
455+
assert messages[9][2] == ('Hello %(person)', None)
456+
assert messages[9][3] == ['NOTE: Fourth']
457+
assert messages[10][2] == 'Person 1'
458+
assert messages[10][3] == []
459+
assert messages[11][2] == 'Person 2'
460+
assert messages[11][3] == []
461+
assert messages[12][2] == ('Hello %(people)', None)
462+
assert messages[12][3] == ['NOTE: Fifth']
443463

444464

445465
class ExtractTestCase(unittest.TestCase):

0 commit comments

Comments
 (0)