diff --git a/babel/messages/extract.py b/babel/messages/extract.py index 1c86e4ac8..3840be957 100644 --- a/babel/messages/extract.py +++ b/babel/messages/extract.py @@ -30,10 +30,11 @@ Mapping, MutableSequence, ) +from dataclasses import dataclass from functools import lru_cache from os.path import relpath from textwrap import dedent -from tokenize import COMMENT, NAME, NL, OP, STRING, generate_tokens +from tokenize import COMMENT, NAME, OP, STRING, generate_tokens from typing import TYPE_CHECKING, Any from babel.messages._compat import find_entrypoints @@ -99,6 +100,15 @@ def tell(self) -> int: ... FSTRING_END = getattr(tokenize, "FSTRING_END", None) +@dataclass +class FunctionStackItem: + function_lineno: int + function_name: str + message_lineno: int | None + messages: list[str | None] + translator_comments: list[tuple[int, str]] + + def _strip_comment_tags(comments: MutableSequence[str], tags: Iterable[str]): """Helper function for `extract` that strips comment tags from strings in a list of comment lines. This functions operates in-place. @@ -507,14 +517,6 @@ def extract_python( :param options: a dictionary of additional options (optional) :rtype: ``iterator`` """ - funcname = lineno = message_lineno = None - call_stack = -1 - buf = [] - messages = [] - translator_comments = [] - in_def = in_translator_comments = False - comment_tag = None - encoding = parse_encoding(fileobj) or options.get('encoding', 'UTF-8') future_flags = parse_future_flags(fileobj, encoding) next_line = lambda: fileobj.readline().decode(encoding) @@ -525,108 +527,148 @@ def extract_python( # currently parsing one. current_fstring_start = None + # Keep the stack of all function calls and its related contextual variables, + # so we can handle nested gettext calls. + function_stack: list[FunctionStackItem] = [] + # Keep the last encountered function/variable name for when we encounter + # an opening parenthesis + last_name = None + # Keep track of whether we're in a class or function definition + in_def = False + # Keep track of whether we're in a block of translator comments + in_translator_comments = False + # Keep track of the last encountered translator comments + translator_comments = [] + # Keep track of the (split) strings encountered + message_buffer = [] + for tok, value, (lineno, _), _, _ in tokens: - if call_stack == -1 and tok == NAME and value in ('def', 'class'): + if tok == NAME and value in ('def', 'class'): + # We're entering a class or function definition in_def = True - elif tok == OP and value == '(': - if in_def: - # Avoid false positives for declarations such as: - # def gettext(arg='message'): - in_def = False - continue - if funcname: - call_stack += 1 - elif in_def and tok == OP and value == ':': - # End of a class definition without parens + continue + + elif in_def and tok == OP and value in ('(', ':'): + # We're in a class or function definition and should not do anything in_def = False continue - elif call_stack == -1 and tok == COMMENT: + + elif tok == OP and value == '(' and last_name: + # We're entering a function call + cur_translator_comments = translator_comments + if function_stack and function_stack[-1].function_lineno == lineno: + # If our current function call is on the same line as the previous one, + # copy their translator comments, since they also apply to us. + cur_translator_comments = function_stack[-1].translator_comments + + # We add all information needed later for the current function call + function_stack.append(FunctionStackItem( + function_lineno=lineno, + function_name=last_name, + message_lineno=None, + messages=[], + translator_comments=cur_translator_comments, + )) + translator_comments = [] + message_buffer.clear() + + elif tok == COMMENT: # Strip the comment token from the line value = value[1:].strip() - if in_translator_comments and \ - translator_comments[-1][0] == lineno - 1: + if in_translator_comments and translator_comments[-1][0] == lineno - 1: # We're already inside a translator comment, continue appending translator_comments.append((lineno, value)) continue - # If execution reaches this point, let's see if comment line - # starts with one of the comment tags + for comment_tag in comment_tags: if value.startswith(comment_tag): + # Comment starts with one of the comment tags, + # so let's start capturing it in_translator_comments = True translator_comments.append((lineno, value)) break - elif funcname and call_stack == 0: - nested = (tok == NAME and value in keywords) - if (tok == OP and value == ')') or nested: - if buf: - messages.append(''.join(buf)) - del buf[:] + + elif function_stack and function_stack[-1].function_name in keywords: + # We're inside a translation function call + if tok == OP and value == ')': + # The call has ended, so we yield the translatable term(s) + messages = function_stack[-1].messages + lineno = ( + function_stack[-1].message_lineno + or function_stack[-1].function_lineno + ) + cur_translator_comments = function_stack[-1].translator_comments + + if message_buffer: + messages.append(''.join(message_buffer)) + message_buffer.clear() else: messages.append(None) messages = tuple(messages) if len(messages) > 1 else messages[0] - # Comments don't apply unless they immediately - # precede the message - if translator_comments and \ - translator_comments[-1][0] < message_lineno - 1: - translator_comments = [] - - yield (message_lineno, funcname, messages, - [comment[1] for comment in translator_comments]) - - funcname = lineno = message_lineno = None - call_stack = -1 - messages = [] - translator_comments = [] - in_translator_comments = False - if nested: - funcname = value + if ( + cur_translator_comments + and cur_translator_comments[-1][0] < lineno - 1 + ): + # The translator comments are not immediately preceding the current + # term, so we skip them. + cur_translator_comments = [] + + yield ( + lineno, + function_stack[-1].function_name, + messages, + [comment[1] for comment in cur_translator_comments], + ) + + function_stack.pop() + elif tok == STRING: - val = _parse_python_string(value, encoding, future_flags) - if val is not None: - if not message_lineno: - message_lineno = lineno - buf.append(val) + # We've encountered a string inside a translation function call + string_value = _parse_python_string(value, encoding, future_flags) + if not function_stack[-1].message_lineno: + function_stack[-1].message_lineno = lineno + if string_value is not None: + message_buffer.append(string_value) # Python 3.12+, see https://peps.python.org/pep-0701/#new-tokens elif tok == FSTRING_START: current_fstring_start = value - if not message_lineno: - message_lineno = lineno elif tok == FSTRING_MIDDLE: if current_fstring_start is not None: current_fstring_start += value elif tok == FSTRING_END: if current_fstring_start is not None: fstring = current_fstring_start + value - val = _parse_python_string(fstring, encoding, future_flags) - if val is not None: - buf.append(val) + string_value = _parse_python_string(fstring, encoding, future_flags) + if string_value is not None: + message_buffer.append(string_value) elif tok == OP and value == ',': - if buf: - messages.append(''.join(buf)) - del buf[:] + # End of a function call argument + if message_buffer: + function_stack[-1].messages.append(''.join(message_buffer)) + message_buffer.clear() else: - messages.append(None) - if translator_comments: - # We have translator comments, and since we're on a - # comma(,) user is allowed to break into a new line - # Let's increase the last comment's lineno in order - # for the comment to still be a valid one - old_lineno, old_comment = translator_comments.pop() - translator_comments.append((old_lineno + 1, old_comment)) - - elif tok != NL and not message_lineno: - message_lineno = lineno - elif call_stack > 0 and tok == OP and value == ')': - call_stack -= 1 - elif funcname and call_stack == -1: - funcname = None - elif tok == NAME and value in keywords: - funcname = value - - if current_fstring_start is not None and tok not in {FSTRING_START, FSTRING_MIDDLE}: + function_stack[-1].messages.append(None) + + elif function_stack and tok == OP and value == ')': + function_stack.pop() + + if in_translator_comments and translator_comments[-1][0] < lineno: + # We have a newline in between the comments, so they don't belong + # together anymore + in_translator_comments = False + + if tok == NAME: + last_name = value + if function_stack and not function_stack[-1].message_lineno: + function_stack[-1].message_lineno = lineno + + if ( + current_fstring_start is not None + and tok not in {FSTRING_START, FSTRING_MIDDLE} + ): # In Python 3.12, tokens other than FSTRING_* mean the # f-string is dynamic, so we don't wan't to extract it. # And if it's FSTRING_END, we've already handled it above. @@ -679,15 +721,22 @@ def extract_javascript( :param lineno: line number offset (for parsing embedded fragments) """ from babel.messages.jslexer import Token, tokenize, unquote_string - funcname = message_lineno = None - messages = [] - last_argument = None - translator_comments = [] - concatenate_next = False + encoding = options.get('encoding', 'utf-8') - last_token = None - call_stack = -1 dotted = any('.' in kw for kw in keywords) + last_token = None + # Keep the stack of all function calls and its related contextual variables, + # so we can handle nested gettext calls. + function_stack: list[FunctionStackItem] = [] + # Keep track of whether we're in a class or function definition + in_def = False + # Keep track of whether we're in a block of translator comments + in_translator_comments = False + # Keep track of the last encountered translator comments + translator_comments = [] + # Keep track of the (split) strings encountered + message_buffer = [] + for token in tokenize( fileobj.read().decode(encoding), jsx=options.get("jsx", True), @@ -695,38 +744,86 @@ def extract_javascript( dotted=dotted, lineno=lineno, ): - if ( # Turn keyword`foo` expressions into keyword("foo") calls: - funcname and # have a keyword... - (last_token and last_token.type == 'name') and # we've seen nothing after the keyword... - token.type == 'template_string' # this is a template string + if token.type == 'name' and token.value in ('class', 'function'): + # We're entering a class or function definition + in_def = True + + elif in_def and token.type == 'operator' and token.value in ('(', '{'): + # We're in a class or function definition and should not do anything + in_def = False + continue + + elif ( + last_token + and last_token.type == 'name' + and last_token.value in keywords + and token.type == 'template_string' ): - message_lineno = token.lineno - messages = [unquote_string(token.value)] - call_stack = 0 + # Turn keyword`foo` expressions into keyword("foo") function calls + string_value = unquote_string(token.value) + cur_translator_comments = translator_comments + if function_stack and function_stack[-1].function_lineno == last_token.lineno: + # If our current function call is on the same line as the previous one, + # copy their translator comments, since they also apply to us. + cur_translator_comments = function_stack[-1].translator_comments + + # We add all information needed later for the current function call + function_stack.append(FunctionStackItem( + function_lineno=last_token.lineno, + function_name=last_token.value, + message_lineno=token.lineno, + messages=[string_value], + translator_comments=cur_translator_comments, + )) + translator_comments = [] + + # We act as if we are closing the function call now token = Token('operator', ')', token.lineno) - if options.get('parse_template_string') and not funcname and token.type == 'template_string': + if ( + options.get('parse_template_string') + and (not last_token or last_token.type != 'name' or last_token.value not in keywords) + and token.type == 'template_string' + ): yield from parse_template_string(token.value, keywords, comment_tags, options, token.lineno) elif token.type == 'operator' and token.value == '(': - if funcname: - message_lineno = token.lineno - call_stack += 1 + if last_token.type == 'name': + # We're entering a function call + cur_translator_comments = translator_comments + if function_stack and function_stack[-1].function_lineno == token.lineno: + # If our current function call is on the same line as the previous one, + # copy their translator comments, since they also apply to us. + cur_translator_comments = function_stack[-1].translator_comments + + # We add all information needed later for the current function call + function_stack.append(FunctionStackItem( + function_lineno=token.lineno, + function_name=last_token.value, + message_lineno=None, + messages=[], + translator_comments=cur_translator_comments, + )) + translator_comments = [] - elif call_stack == -1 and token.type == 'linecomment': + elif token.type == 'linecomment': + # Strip the comment token from the line value = token.value[2:].strip() - if translator_comments and \ - translator_comments[-1][0] == token.lineno - 1: + if in_translator_comments and translator_comments[-1][0] == token.lineno - 1: + # We're already inside a translator comment, continue appending translator_comments.append((token.lineno, value)) continue for comment_tag in comment_tags: if value.startswith(comment_tag): - translator_comments.append((token.lineno, value.strip())) + # Comment starts with one of the comment tags, + # so let's start capturing it + in_translator_comments = True + translator_comments.append((token.lineno, value)) break elif token.type == 'multilinecomment': - # only one multi-line comment may precede a translation + # Only one multi-line comment may precede a translation translator_comments = [] value = token.value[2:-2].strip() for comment_tag in comment_tags: @@ -736,68 +833,67 @@ def extract_javascript( lines[0] = lines[0].strip() lines[1:] = dedent('\n'.join(lines[1:])).splitlines() for offset, line in enumerate(lines): - translator_comments.append((token.lineno + offset, - line)) + translator_comments.append((token.lineno + offset, line)) break - elif funcname and call_stack == 0: + elif function_stack and function_stack[-1].function_name in keywords: + # We're inside a translation function call if token.type == 'operator' and token.value == ')': - if last_argument is not None: - messages.append(last_argument) - if len(messages) > 1: - messages = tuple(messages) - elif messages: - messages = messages[0] + # The call has ended, so we yield the translatable term(s) + messages = function_stack[-1].messages + lineno = ( + function_stack[-1].message_lineno + or function_stack[-1].function_lineno + ) + cur_translator_comments = function_stack[-1].translator_comments + + if message_buffer: + messages.append(''.join(message_buffer)) + message_buffer.clear() else: - messages = None - - # Comments don't apply unless they immediately precede the - # message - if translator_comments and \ - translator_comments[-1][0] < message_lineno - 1: - translator_comments = [] - - if messages is not None: - yield (message_lineno, funcname, messages, - [comment[1] for comment in translator_comments]) + messages.append(None) - funcname = message_lineno = last_argument = None - concatenate_next = False - translator_comments = [] - messages = [] - call_stack = -1 + messages = tuple(messages) if len(messages) > 1 else messages[0] + if ( + cur_translator_comments + and cur_translator_comments[-1][0] < lineno - 1 + ): + # The translator comments are not immediately preceding the current + # term, so we skip them. + cur_translator_comments = [] + + yield ( + lineno, + function_stack[-1].function_name, + messages, + [comment[1] for comment in cur_translator_comments], + ) + + function_stack.pop() elif token.type in ('string', 'template_string'): - new_value = unquote_string(token.value) - if concatenate_next: - last_argument = (last_argument or '') + new_value - concatenate_next = False + # We've encountered a string inside a translation function call + string_value = unquote_string(token.value) + if not function_stack[-1].message_lineno: + function_stack[-1].message_lineno = token.lineno + if string_value is not None: + message_buffer.append(string_value) + + elif token.type == 'operator' and token.value == ',': + # End of a function call argument + if message_buffer: + function_stack[-1].messages.append(''.join(message_buffer)) + message_buffer.clear() else: - last_argument = new_value - - elif token.type == 'operator': - if token.value == ',': - if last_argument is not None: - messages.append(last_argument) - last_argument = None - else: - messages.append(None) - concatenate_next = False - elif token.value == '+': - concatenate_next = True - - elif call_stack > 0 and token.type == 'operator' \ - and token.value == ')': - call_stack -= 1 - - elif funcname and call_stack == -1: - funcname = None - - elif call_stack == -1 and token.type == 'name' and \ - token.value in keywords and \ - (last_token is None or last_token.type != 'name' or - last_token.value != 'function'): - funcname = token.value + function_stack[-1].messages.append(None) + + elif function_stack and token.type == 'operator' and token.value == ')': + function_stack.pop() + + if in_translator_comments and translator_comments[-1][0] < token.lineno: + # We have a newline in between the comments, so they don't belong + # together anymore + in_translator_comments = False last_token = token diff --git a/tests/messages/test_extract.py b/tests/messages/test_extract.py index a9b2d1178..613deda42 100644 --- a/tests/messages/test_extract.py +++ b/tests/messages/test_extract.py @@ -428,24 +428,34 @@ def test_nested_messages(self): # NOTE: Third _(u'Hello, {0} and {1}!', _(u'Heungsub'), _(u'Armin')) + +# NOTE: Fourth +_("Hello %(person)s and %(other_person)s", person=random_fn(_("Person 1")), other_person=random_obj["random_fn"](_("Person 2"))) + +# NOTE: Fifth +_("Hello %(people)s", + people=random_obj.random_fn( + ", ".join([_("Person 1"), _("Person 2")]) + ", and everyone else" + ) +) """) messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {})) - assert messages[0][2] == ('Hello, {name}!', None) - assert messages[0][3] == ['NOTE: First'] - assert messages[1][2] == 'Foo Bar' - assert messages[1][3] == [] - assert messages[2][2] == ('Hello, {name1} and {name2}!', None) - assert messages[2][3] == ['NOTE: Second'] - assert messages[3][2] == 'Heungsub' - assert messages[3][3] == [] - assert messages[4][2] == 'Armin' - assert messages[4][3] == [] - assert messages[5][2] == ('Hello, {0} and {1}!', None) - assert messages[5][3] == ['NOTE: Third'] - assert messages[6][2] == 'Heungsub' - assert messages[6][3] == [] - assert messages[7][2] == 'Armin' - assert messages[7][3] == [] + assert [(m[2], m[3]) for m in messages] == [ + ('Foo Bar', ['NOTE: First']), + (('Hello, {name}!', None), ['NOTE: First']), + ('Heungsub', ['NOTE: Second']), + ('Armin', []), + (('Hello, {name1} and {name2}!', None, None), ['NOTE: Second']), + ('Heungsub', ['NOTE: Third']), + ('Armin', []), + (('Hello, {0} and {1}!', None, None), ['NOTE: Third']), + ('Person 1', ['NOTE: Fourth']), + ('Person 2', ['NOTE: Fourth']), + (('Hello %(person)s and %(other_person)s', None, None), ['NOTE: Fourth']), + ('Person 1', []), + ('Person 2', []), + (('Hello %(people)s', None), ['NOTE: Fifth']), + ] class ExtractTestCase(unittest.TestCase): diff --git a/tests/messages/test_js_extract.py b/tests/messages/test_js_extract.py index fc643851e..90c619b94 100644 --- a/tests/messages/test_js_extract.py +++ b/tests/messages/test_js_extract.py @@ -191,3 +191,35 @@ def test_inside_nested_template_string(): ) assert messages == [(1, 'Greetings!', [], None), (1, 'This is a lovely evening.', [], None), (1, 'The day is really nice!', [], None)] + +def test_nested_gettext_calls(): + buf = BytesIO(b"""\ +gettext("Hello %s", gettext("User")); +// NOTE: Double +gettext("Hello %(user)s", { user: gettext("User") }); +// NOTE: Non Preceding + +gettext("Hello %s", dummy.dummyFunction(gettext("User"))); +gettext( + "Hello %(user)s", + { user: dummy.dummyFunction(gettext("User")) }, +); +// NOTE: Another Comment +gettext("Random %s", gettext`Shizzle`); +""") + messages = list( + extract.extract('javascript', buf, {"gettext": None}, ['NOTE:'], {}), + ) + + assert messages == [ + (1, 'User', [], None), + (1, 'Hello %s', [], None), + (3, 'User', ['NOTE: Double'], None), + (3, 'Hello %(user)s', ['NOTE: Double'], None), + (6, 'User', [], None), + (6, 'Hello %s', [], None), + (9, 'User', [], None), + (8, 'Hello %(user)s', [], None), + (12, 'Shizzle', ['NOTE: Another Comment'], None), + (12, 'Random %s', ['NOTE: Another Comment'], None), + ]