From 90bb63523aa1fa3785f712843ea78fc9a7d579d2 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Thu, 20 Mar 2025 13:38:33 +0100 Subject: [PATCH 01/13] cellmethods parser --- prototype/cellmethods/cellmethods_parser.py | 236 ++++++++++++++++++ .../cellmethods/test_cellmethods_parser.py | 122 +++++++++ .../cellmethods/test_xarray_translation.py | 50 ++++ setup.py | 1 + 4 files changed, 409 insertions(+) create mode 100644 prototype/cellmethods/cellmethods_parser.py create mode 100644 prototype/cellmethods/test_cellmethods_parser.py create mode 100644 prototype/cellmethods/test_xarray_translation.py diff --git a/prototype/cellmethods/cellmethods_parser.py b/prototype/cellmethods/cellmethods_parser.py new file mode 100644 index 00000000..e6df49b8 --- /dev/null +++ b/prototype/cellmethods/cellmethods_parser.py @@ -0,0 +1,236 @@ +from sly import Lexer, Parser + + +class CellMethodsLexer(Lexer): + # set of token names + tokens = {DIMENSION, ACTION, REGION, CONSTRAINT, SCOPE, SELECTION} + + # string containing ignored characters between token + ignore = " \t" + + # Regular expression rules for tokens + DIMENSION = r"area:|time:|grid_longitude:|longitude:|latitude:|depth:" + ACTION = r"mean|minimum|maximum|sum|point" + CONSTRAINT = r"within|over|where" # Must come before REGION/SELECTION + REGION = r"[a-z_]+(?![a-z_])" # Match words with underscores + SELECTION = r"[a-z_]+(?![a-z_])" # Match words with underscores + + def DIMENSION(self, t): + t.value = t.value[:-1] + return t + + def REGION(self, t): + if t.value not in ('land', 'sea', 'sea_ice', 'snow', 'ice_sheet', 'grounded_ice_sheet', 'crops', 'ice_free_sea'): + t.type = 'SELECTION' + return t + + @_(r"\(.*?\)") + def SCOPE(self, t): + t.value = t.value[1:-1] + return t + + @_(r"\n+") + def newline(self, t): + self.lineno += t.value.count("\n") + + +class CellMethodsParser(Parser): + tokens = CellMethodsLexer.tokens + + def __init__(self): + self.groups = [] + + @_('statements') + def program(self, p): + return self.groups + + @_('statement') + def statements(self, p): + return p.statement + + @_('statements statement') + def statements(self, p): + return p.statements + + @_('dimension action') + def statement(self, p): + current_group = [ + ('DIMENSION', p.dimension), + ('ACTION', p.action) + ] + self.groups.append(current_group) + return p + + @_('dimension action SCOPE') + def statement(self, p): + current_group = [ + ('DIMENSION', p.dimension), + ('ACTION', p.action), + ('SCOPE', p.SCOPE) + ] + self.groups.append(current_group) + return p + + @_('dimension action constraint region') + def statement(self, p): + current_group = [ + ('DIMENSION', p.dimension), + ('ACTION', p.action), + ('CONSTRAINT', p.constraint), + ('REGION', p.region) + ] + self.groups.append(current_group) + return p + + @_('dimension action constraint region SCOPE') + def statement(self, p): + current_group = [ + ('DIMENSION', p.dimension), + ('ACTION', p.action), + ('CONSTRAINT', p.constraint), + ('REGION', p.region), + ('SCOPE', p.SCOPE) + ] + self.groups.append(current_group) + return p + + @_('dimension action constraint SELECTION') + def statement(self, p): + current_group = [ + ('DIMENSION', p.dimension), + ('ACTION', p.action), + ('CONSTRAINT', p.constraint), + ('SELECTION', p.SELECTION) + ] + self.groups.append(current_group) + return p + + @_('dimensions ACTION') + def statement(self, p): + for dimension in p.dimensions: + current_group = [ + ('DIMENSION', dimension), + ('ACTION', p.ACTION) + ] + self.groups.append(current_group) + return p + + @_('dimension') + def dimensions(self, p): + return [p.dimension] + + @_('dimensions dimension') + def dimensions(self, p): + return p.dimensions + [p.dimension] + + @_('DIMENSION') + def dimension(self, p): + return p.DIMENSION + + @_('ACTION') + def action(self, p): + return p.ACTION + + @_('CONSTRAINT') + def constraint(self, p): + return p.CONSTRAINT + + @_('REGION') + def region(self, p): + return p.REGION + + +class XArrayTranslator: + """ + Represent parsed tree as human readable (pseudo code) xarray operations. + Produces strings and not xarray objects. + """ + def __init__(self, da_name='da'): + self.da_name = da_name + + def translate_group(self, group): + """Translate a single group of tokens into an xarray operation.""" + tokens_dict = dict(group) + + # Base operation + operation = f"{self.da_name}" + + # Handle the main action + if 'ACTION' in tokens_dict: + action = tokens_dict['ACTION'] + dim = tokens_dict.get('DIMENSION') + + if 'CONSTRAINT' in tokens_dict: + constraint = tokens_dict['CONSTRAINT'] + if constraint == 'within': + # For 'within', we first group by the selection + selection = tokens_dict.get('SELECTION') + if selection: + operation = f"{operation}.groupby('{selection}').{action}()" + elif constraint == 'over': + # For 'over', we apply the operation over the selection + selection = tokens_dict.get('SELECTION') + if selection: + operation = f"{operation}.{action}(dim='{selection}')" + elif constraint == 'where': + # For 'where', we apply a mask before the operation + region = tokens_dict.get('REGION') + if region: + operation = f"{operation}.where(mask=='{region}').{action}(dim='{dim}')" + else: + # Simple dimension reduction + operation = f"{operation}.{action}(dim='{dim}')" + + # Add any scope comments as a comment in the code + if 'SCOPE' in tokens_dict: + operation = f"{operation} # {tokens_dict['SCOPE']}" + + return operation + + def translate(self, groups): + """Translate all groups into a sequence of xarray operations.""" + operations = [] + intermediate = self.da_name + + if len(groups) == 1: + # For single operations, just return the operation directly + return self.translate_group(groups[0]) + + for i, group in enumerate(groups): + if i > 0: + # Use the result of the previous operation + self.da_name = f"result_{i}" + operations.append(f"{self.da_name} = {intermediate}") + + intermediate = self.translate_group(group) + + if i == len(groups) - 1: + # Last operation should be assigned to final result + operations.append(f"result = {intermediate}") + + return '\n'.join(operations) + + +def parse_cell_methods(text): + lexer = CellMethodsLexer() + parser = CellMethodsParser() + return parser.parse(lexer.tokenize(text)) + + +def translate_to_xarray(text): + """Convenience function to parse cell methods and translate to xarray operations.""" + parser = CellMethodsParser() + lexer = CellMethodsLexer() + translator = XArrayTranslator() + + parsed = parse_cell_methods(text) + return translator.translate(parsed) + + +if __name__ == "__main__": + text = "area: mean where sea depth: sum where sea (top 100m only) time: mean" + result = parse_cell_methods(text) + from pprint import pprint + pprint(result) + print("\nXArray translation (pseduo code, human readable strings. not xarray object):") + print(translate_to_xarray(text)) diff --git a/prototype/cellmethods/test_cellmethods_parser.py b/prototype/cellmethods/test_cellmethods_parser.py new file mode 100644 index 00000000..54aca352 --- /dev/null +++ b/prototype/cellmethods/test_cellmethods_parser.py @@ -0,0 +1,122 @@ +import pytest +from cellmethods_parser import CellMethodsLexer, parse_cell_methods + +test_cases = [ + ( + "area: depth: time: mean", + [ + [('DIMENSION', 'area'), ('ACTION', 'mean')], [('DIMENSION', 'depth'), ('ACTION', 'mean')], [('DIMENSION', 'time'), ('ACTION', 'mean')] + ] + ), + ( + "area: mean", + [ + [('DIMENSION', 'area'), ('ACTION', 'mean')] + ] + ), + ( + "area: mean (comment: over land and sea ice) time: point", + [ + [('DIMENSION', 'area'), ('ACTION', 'mean'), ('SCOPE', 'comment: over land and sea ice')], + [('DIMENSION', 'time'), ('ACTION', 'point')] + ] + ), + ( + "area: mean time: maximum", + [ + [('DIMENSION', 'area'), ('ACTION', 'mean')], + [('DIMENSION', 'time'), ('ACTION', 'maximum')] + ] + ), + ( + "area: mean time: maximum within days time: mean over days", + [ + [('DIMENSION', 'area'), ('ACTION', 'mean')], + [('DIMENSION', 'time'), ('ACTION', 'maximum'), ('CONSTRAINT', 'within'), ('SELECTION', 'days')], + [('DIMENSION', 'time'), ('ACTION', 'mean'), ('CONSTRAINT', 'over'), ('SELECTION', 'days')] + ] + ), + ( + "area: mean time: mean within days time: mean over days", + [ + [('DIMENSION', 'area'), ('ACTION', 'mean')], + [('DIMENSION', 'time'), ('ACTION', 'mean'), ('CONSTRAINT', 'within'), ('SELECTION', 'days')], + [('DIMENSION', 'time'), ('ACTION', 'mean'), ('CONSTRAINT', 'over'), ('SELECTION', 'days')] + ] + ), + ( + "area: mean time: mean within hours time: maximum over hours", + [ + [('DIMENSION', 'area'), ('ACTION', 'mean')], + [('DIMENSION', 'time'), ('ACTION', 'mean'), ('CONSTRAINT', 'within'), ('SELECTION', 'hours')], + [('DIMENSION', 'time'), ('ACTION', 'maximum'), ('CONSTRAINT', 'over'), ('SELECTION', 'hours')] + ] + ), + ( + "area: mean time: mean within years time: mean over years", + [ + [('DIMENSION', 'area'), ('ACTION', 'mean')], + [('DIMENSION', 'time'), ('ACTION', 'mean'), ('CONSTRAINT', 'within'), ('SELECTION', 'years')], + [('DIMENSION', 'time'), ('ACTION', 'mean'), ('CONSTRAINT', 'over'), ('SELECTION', 'years')] + ] + ), + ( + "area: mean time: minimum", + [ + [('DIMENSION', 'area'), ('ACTION', 'mean')], + [('DIMENSION', 'time'), ('ACTION', 'minimum')] + ] + ), + ( + "area: mean time: minimum within days time: mean over days", + [ + [('DIMENSION', 'area'), ('ACTION', 'mean')], + [('DIMENSION', 'time'), ('ACTION', 'minimum'), ('CONSTRAINT', 'within'), ('SELECTION', 'days')], + [('DIMENSION', 'time'), ('ACTION', 'mean'), ('CONSTRAINT', 'over'), ('SELECTION', 'days')] + ] + ), + ( + "area: mean time: point", + [ + [('DIMENSION', 'area'), ('ACTION', 'mean')], + [('DIMENSION', 'time'), ('ACTION', 'point')] + ] + ), + ( + "area: mean time: sum", + [ + [('DIMENSION', 'area'), ('ACTION', 'mean')], + [('DIMENSION', 'time'), ('ACTION', 'sum')] + ] + ) +] + +@pytest.mark.parametrize("input_text,expected_output", test_cases) +def test_cell_methods_parser(input_text, expected_output): + result = parse_cell_methods(input_text) + assert result == expected_output, f"\nInput: {input_text}\nExpected: {expected_output}\nGot: {result}" + +def test_lexer_tokens(): + lexer = CellMethodsLexer() + # Test each token type is recognized correctly + test_tokens = { + 'DIMENSION': ['area:', 'time:', 'depth:', 'grid_longitude:', 'longitude:', 'latitude:'], + 'ACTION': ['mean', 'minimum', 'maximum', 'sum', 'point'], + 'REGION': ['land', 'sea', 'sea_ice', 'snow', 'ice_sheet', 'grounded_ice_sheet', 'crops', 'ice_free_sea'], + 'SELECTION': ['all_area_types', 'days', 'years', 'hours'], + 'CONSTRAINT': ['within', 'over', 'where'], + 'SCOPE': ['(comment text)', '(top 100m only)'] + } + + for token_type, values in test_tokens.items(): + for value in values: + tokens = list(lexer.tokenize(value)) + assert len(tokens) == 1, f"Expected 1 token for {value}, got {len(tokens)}" + token = tokens[0] + assert token.type == token_type, f"Expected token type {token_type} for {value}, got {token.type}" + if token_type == 'DIMENSION': + assert token.value == value[:-1], f"Expected value {value[:-1]} for {value}, got {token.value}" + elif token_type == 'SCOPE': + assert token.value == value[1:-1], f"Expected value {value[1:-1]} for {value}, got {token.value}" + else: + assert token.value == value, f"Expected value {value}, got {token.value}" diff --git a/prototype/cellmethods/test_xarray_translation.py b/prototype/cellmethods/test_xarray_translation.py new file mode 100644 index 00000000..48c4f1f7 --- /dev/null +++ b/prototype/cellmethods/test_xarray_translation.py @@ -0,0 +1,50 @@ +from cellmethods_parser import translate_to_xarray + +test_cases = [ + ( + "area: mean", + "da.mean(dim='area')" + ), + ( + "area: mean where sea", + "da.where(mask=='sea').mean(dim='area')" + ), + ( + "area: mean where sea time: mean", + "result_1 = da.where(mask=='sea').mean(dim='area')\n" + "result = result_1.mean(dim='time')" + ), + ( + "area: mean time: maximum within days", + "result_1 = da.mean(dim='area')\n" + "result = result_1.groupby('days').maximum()" + ), + ( + "area: mean time: mean within days time: mean over days", + "result_1 = da.mean(dim='area')\n" + "result_2 = result_1.groupby('days').mean()\n" + "result = result_2.mean(dim='days')" + ), + ( + "area: mean (comment: over land and sea ice) time: point", + "result_1 = da.mean(dim='area') # comment: over land and sea ice\n" + "result = result_1.point(dim='time')" + ), + ( + "area: depth: time: mean", + "result_1 = da.mean(dim='area')\n" + "result_2 = result_1.mean(dim='depth')\n" + "result = result_2.mean(dim='time')" + ) +] + +def test_translations(): + for input_text, expected in test_cases: + result = translate_to_xarray(input_text) + assert result == expected, f"\nInput: {input_text}\nExpected:\n{expected}\nGot:\n{result}" + print(f"\nInput: {input_text}") + print("Generated xarray code:") + print(result) + +if __name__ == "__main__": + test_translations() diff --git a/setup.py b/setup.py index 2fb647a5..3056a233 100644 --- a/setup.py +++ b/setup.py @@ -70,6 +70,7 @@ def read(filename): "tqdm", "versioneer", "xarray", + "sly", ], extras_require={ "dev": [ From e6fe58f88b0b2eb5796a6e1d395eaf0c7b46b80c Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Thu, 20 Mar 2025 14:09:34 +0100 Subject: [PATCH 02/13] fix flake8 warnings --- prototype/cellmethods/cellmethods_parser.py | 149 ++++++++++---------- 1 file changed, 78 insertions(+), 71 deletions(-) diff --git a/prototype/cellmethods/cellmethods_parser.py b/prototype/cellmethods/cellmethods_parser.py index e6df49b8..7bb2d6ad 100644 --- a/prototype/cellmethods/cellmethods_parser.py +++ b/prototype/cellmethods/cellmethods_parser.py @@ -3,7 +3,7 @@ class CellMethodsLexer(Lexer): # set of token names - tokens = {DIMENSION, ACTION, REGION, CONSTRAINT, SCOPE, SELECTION} + tokens = {DIMENSION, ACTION, REGION, CONSTRAINT, SCOPE, SELECTION} # noqa: F821 # string containing ignored characters between token ignore = " \t" @@ -15,21 +15,30 @@ class CellMethodsLexer(Lexer): REGION = r"[a-z_]+(?![a-z_])" # Match words with underscores SELECTION = r"[a-z_]+(?![a-z_])" # Match words with underscores - def DIMENSION(self, t): + def DIMENSION(self, t): # noqa: F811 t.value = t.value[:-1] return t - def REGION(self, t): - if t.value not in ('land', 'sea', 'sea_ice', 'snow', 'ice_sheet', 'grounded_ice_sheet', 'crops', 'ice_free_sea'): - t.type = 'SELECTION' + def REGION(self, t): # noqa: F811 + if t.value not in ( + "land", + "sea", + "sea_ice", + "snow", + "ice_sheet", + "grounded_ice_sheet", + "crops", + "ice_free_sea", + ): + t.type = "SELECTION" return t - @_(r"\(.*?\)") + @_(r"\(.*?\)") # noqa: F821 def SCOPE(self, t): t.value = t.value[1:-1] return t - @_(r"\n+") + @_(r"\n+") # noqa: F821 def newline(self, t): self.lineno += t.value.count("\n") @@ -40,102 +49,96 @@ class CellMethodsParser(Parser): def __init__(self): self.groups = [] - @_('statements') + @_("statements") # noqa: F821 def program(self, p): return self.groups - @_('statement') + @_("statement") # noqa: F821 def statements(self, p): return p.statement - @_('statements statement') - def statements(self, p): + @_("statements statement") # noqa: F821 + def statements(self, p): # noqa: F811 return p.statements - @_('dimension action') + @_("dimension action") # noqa: F821 def statement(self, p): - current_group = [ - ('DIMENSION', p.dimension), - ('ACTION', p.action) - ] + current_group = [("DIMENSION", p.dimension), ("ACTION", p.action)] self.groups.append(current_group) return p - @_('dimension action SCOPE') - def statement(self, p): + @_("dimension action SCOPE") # noqa: F821 + def statement(self, p): # noqa: F811 current_group = [ - ('DIMENSION', p.dimension), - ('ACTION', p.action), - ('SCOPE', p.SCOPE) + ("DIMENSION", p.dimension), + ("ACTION", p.action), + ("SCOPE", p.SCOPE), ] self.groups.append(current_group) return p - @_('dimension action constraint region') - def statement(self, p): + @_("dimension action constraint region") # noqa: F821 + def statement(self, p): # noqa: F811 current_group = [ - ('DIMENSION', p.dimension), - ('ACTION', p.action), - ('CONSTRAINT', p.constraint), - ('REGION', p.region) + ("DIMENSION", p.dimension), + ("ACTION", p.action), + ("CONSTRAINT", p.constraint), + ("REGION", p.region), ] self.groups.append(current_group) return p - @_('dimension action constraint region SCOPE') - def statement(self, p): + @_("dimension action constraint region SCOPE") # noqa: F821 + def statement(self, p): # noqa: F811 current_group = [ - ('DIMENSION', p.dimension), - ('ACTION', p.action), - ('CONSTRAINT', p.constraint), - ('REGION', p.region), - ('SCOPE', p.SCOPE) + ("DIMENSION", p.dimension), + ("ACTION", p.action), + ("CONSTRAINT", p.constraint), + ("REGION", p.region), + ("SCOPE", p.SCOPE), ] self.groups.append(current_group) return p - @_('dimension action constraint SELECTION') - def statement(self, p): + @_("dimension action constraint SELECTION") # noqa: F821 + def statement(self, p): # noqa: F811 current_group = [ - ('DIMENSION', p.dimension), - ('ACTION', p.action), - ('CONSTRAINT', p.constraint), - ('SELECTION', p.SELECTION) + ("DIMENSION", p.dimension), + ("ACTION", p.action), + ("CONSTRAINT", p.constraint), + ("SELECTION", p.SELECTION), ] self.groups.append(current_group) return p - @_('dimensions ACTION') - def statement(self, p): + @_("dimensions ACTION") # noqa: F821 + def statement(self, p): # noqa: F811 for dimension in p.dimensions: - current_group = [ - ('DIMENSION', dimension), - ('ACTION', p.ACTION) - ] + current_group = [("DIMENSION", dimension), ("ACTION", p.ACTION)] self.groups.append(current_group) return p - @_('dimension') + @_("dimension") # noqa: F821 def dimensions(self, p): return [p.dimension] - @_('dimensions dimension') - def dimensions(self, p): + @_("dimensions dimension") # noqa: F821 + def dimensions(self, p): # noqa: F811 return p.dimensions + [p.dimension] - @_('DIMENSION') + @_("DIMENSION") # noqa: F821 def dimension(self, p): return p.DIMENSION - @_('ACTION') + @_("ACTION") # noqa: F821 def action(self, p): return p.ACTION - @_('CONSTRAINT') + @_("CONSTRAINT") # noqa: F821 def constraint(self, p): return p.CONSTRAINT - @_('REGION') + @_("REGION") # noqa: F821 def region(self, p): return p.REGION @@ -145,7 +148,8 @@ class XArrayTranslator: Represent parsed tree as human readable (pseudo code) xarray operations. Produces strings and not xarray objects. """ - def __init__(self, da_name='da'): + + def __init__(self, da_name="da"): self.da_name = da_name def translate_group(self, group): @@ -156,33 +160,35 @@ def translate_group(self, group): operation = f"{self.da_name}" # Handle the main action - if 'ACTION' in tokens_dict: - action = tokens_dict['ACTION'] - dim = tokens_dict.get('DIMENSION') + if "ACTION" in tokens_dict: + action = tokens_dict["ACTION"] + dim = tokens_dict.get("DIMENSION") - if 'CONSTRAINT' in tokens_dict: - constraint = tokens_dict['CONSTRAINT'] - if constraint == 'within': + if "CONSTRAINT" in tokens_dict: + constraint = tokens_dict["CONSTRAINT"] + if constraint == "within": # For 'within', we first group by the selection - selection = tokens_dict.get('SELECTION') + selection = tokens_dict.get("SELECTION") if selection: operation = f"{operation}.groupby('{selection}').{action}()" - elif constraint == 'over': + elif constraint == "over": # For 'over', we apply the operation over the selection - selection = tokens_dict.get('SELECTION') + selection = tokens_dict.get("SELECTION") if selection: operation = f"{operation}.{action}(dim='{selection}')" - elif constraint == 'where': + elif constraint == "where": # For 'where', we apply a mask before the operation - region = tokens_dict.get('REGION') + region = tokens_dict.get("REGION") if region: - operation = f"{operation}.where(mask=='{region}').{action}(dim='{dim}')" + operation = ( + f"{operation}.where(mask=='{region}').{action}(dim='{dim}')" + ) else: # Simple dimension reduction operation = f"{operation}.{action}(dim='{dim}')" # Add any scope comments as a comment in the code - if 'SCOPE' in tokens_dict: + if "SCOPE" in tokens_dict: operation = f"{operation} # {tokens_dict['SCOPE']}" return operation @@ -208,7 +214,7 @@ def translate(self, groups): # Last operation should be assigned to final result operations.append(f"result = {intermediate}") - return '\n'.join(operations) + return "\n".join(operations) def parse_cell_methods(text): @@ -219,8 +225,6 @@ def parse_cell_methods(text): def translate_to_xarray(text): """Convenience function to parse cell methods and translate to xarray operations.""" - parser = CellMethodsParser() - lexer = CellMethodsLexer() translator = XArrayTranslator() parsed = parse_cell_methods(text) @@ -231,6 +235,9 @@ def translate_to_xarray(text): text = "area: mean where sea depth: sum where sea (top 100m only) time: mean" result = parse_cell_methods(text) from pprint import pprint + pprint(result) - print("\nXArray translation (pseduo code, human readable strings. not xarray object):") + print( + "\nXArray translation (pseduo code, human readable strings. not xarray object):" + ) print(translate_to_xarray(text)) From 1205ffc73ec825a1fe601c8795a853d0ff4b89be Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Thu, 20 Mar 2025 15:27:47 +0100 Subject: [PATCH 03/13] fix flake8 warnings --- prototype/cellmethods/test_xarray_translation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/prototype/cellmethods/test_xarray_translation.py b/prototype/cellmethods/test_xarray_translation.py index 48c4f1f7..f2d8e518 100644 --- a/prototype/cellmethods/test_xarray_translation.py +++ b/prototype/cellmethods/test_xarray_translation.py @@ -38,6 +38,7 @@ ) ] + def test_translations(): for input_text, expected in test_cases: result = translate_to_xarray(input_text) @@ -46,5 +47,6 @@ def test_translations(): print("Generated xarray code:") print(result) + if __name__ == "__main__": test_translations() From aa70cebda396b38076de8905c8e28cb957d58874 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Thu, 20 Mar 2025 15:35:42 +0100 Subject: [PATCH 04/13] fix flake8 warnings --- prototype/cellmethods/test_cellmethods_parser.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/prototype/cellmethods/test_cellmethods_parser.py b/prototype/cellmethods/test_cellmethods_parser.py index 54aca352..addf9ccf 100644 --- a/prototype/cellmethods/test_cellmethods_parser.py +++ b/prototype/cellmethods/test_cellmethods_parser.py @@ -5,7 +5,9 @@ ( "area: depth: time: mean", [ - [('DIMENSION', 'area'), ('ACTION', 'mean')], [('DIMENSION', 'depth'), ('ACTION', 'mean')], [('DIMENSION', 'time'), ('ACTION', 'mean')] + [('DIMENSION', 'area'), ('ACTION', 'mean')], + [('DIMENSION', 'depth'), ('ACTION', 'mean')], + [('DIMENSION', 'time'), ('ACTION', 'mean')] ] ), ( @@ -91,11 +93,13 @@ ) ] + @pytest.mark.parametrize("input_text,expected_output", test_cases) def test_cell_methods_parser(input_text, expected_output): result = parse_cell_methods(input_text) assert result == expected_output, f"\nInput: {input_text}\nExpected: {expected_output}\nGot: {result}" + def test_lexer_tokens(): lexer = CellMethodsLexer() # Test each token type is recognized correctly @@ -107,7 +111,7 @@ def test_lexer_tokens(): 'CONSTRAINT': ['within', 'over', 'where'], 'SCOPE': ['(comment text)', '(top 100m only)'] } - + for token_type, values in test_tokens.items(): for value in values: tokens = list(lexer.tokenize(value)) From 07001861c1678be97b81bdd271b08c0d615eaf2d Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Thu, 20 Mar 2025 15:41:55 +0100 Subject: [PATCH 05/13] fix black formatting --- .../cellmethods/test_cellmethods_parser.py | 193 ++++++++++++------ .../cellmethods/test_xarray_translation.py | 26 +-- 2 files changed, 145 insertions(+), 74 deletions(-) diff --git a/prototype/cellmethods/test_cellmethods_parser.py b/prototype/cellmethods/test_cellmethods_parser.py index addf9ccf..461095c5 100644 --- a/prototype/cellmethods/test_cellmethods_parser.py +++ b/prototype/cellmethods/test_cellmethods_parser.py @@ -5,111 +5,178 @@ ( "area: depth: time: mean", [ - [('DIMENSION', 'area'), ('ACTION', 'mean')], - [('DIMENSION', 'depth'), ('ACTION', 'mean')], - [('DIMENSION', 'time'), ('ACTION', 'mean')] - ] - ), - ( - "area: mean", - [ - [('DIMENSION', 'area'), ('ACTION', 'mean')] - ] + [("DIMENSION", "area"), ("ACTION", "mean")], + [("DIMENSION", "depth"), ("ACTION", "mean")], + [("DIMENSION", "time"), ("ACTION", "mean")], + ], ), + ("area: mean", [[("DIMENSION", "area"), ("ACTION", "mean")]]), ( "area: mean (comment: over land and sea ice) time: point", [ - [('DIMENSION', 'area'), ('ACTION', 'mean'), ('SCOPE', 'comment: over land and sea ice')], - [('DIMENSION', 'time'), ('ACTION', 'point')] - ] + [ + ("DIMENSION", "area"), + ("ACTION", "mean"), + ("SCOPE", "comment: over land and sea ice"), + ], + [("DIMENSION", "time"), ("ACTION", "point")], + ], ), ( "area: mean time: maximum", [ - [('DIMENSION', 'area'), ('ACTION', 'mean')], - [('DIMENSION', 'time'), ('ACTION', 'maximum')] - ] + [("DIMENSION", "area"), ("ACTION", "mean")], + [("DIMENSION", "time"), ("ACTION", "maximum")], + ], ), ( "area: mean time: maximum within days time: mean over days", [ - [('DIMENSION', 'area'), ('ACTION', 'mean')], - [('DIMENSION', 'time'), ('ACTION', 'maximum'), ('CONSTRAINT', 'within'), ('SELECTION', 'days')], - [('DIMENSION', 'time'), ('ACTION', 'mean'), ('CONSTRAINT', 'over'), ('SELECTION', 'days')] - ] + [("DIMENSION", "area"), ("ACTION", "mean")], + [ + ("DIMENSION", "time"), + ("ACTION", "maximum"), + ("CONSTRAINT", "within"), + ("SELECTION", "days"), + ], + [ + ("DIMENSION", "time"), + ("ACTION", "mean"), + ("CONSTRAINT", "over"), + ("SELECTION", "days"), + ], + ], ), ( "area: mean time: mean within days time: mean over days", [ - [('DIMENSION', 'area'), ('ACTION', 'mean')], - [('DIMENSION', 'time'), ('ACTION', 'mean'), ('CONSTRAINT', 'within'), ('SELECTION', 'days')], - [('DIMENSION', 'time'), ('ACTION', 'mean'), ('CONSTRAINT', 'over'), ('SELECTION', 'days')] - ] + [("DIMENSION", "area"), ("ACTION", "mean")], + [ + ("DIMENSION", "time"), + ("ACTION", "mean"), + ("CONSTRAINT", "within"), + ("SELECTION", "days"), + ], + [ + ("DIMENSION", "time"), + ("ACTION", "mean"), + ("CONSTRAINT", "over"), + ("SELECTION", "days"), + ], + ], ), ( "area: mean time: mean within hours time: maximum over hours", [ - [('DIMENSION', 'area'), ('ACTION', 'mean')], - [('DIMENSION', 'time'), ('ACTION', 'mean'), ('CONSTRAINT', 'within'), ('SELECTION', 'hours')], - [('DIMENSION', 'time'), ('ACTION', 'maximum'), ('CONSTRAINT', 'over'), ('SELECTION', 'hours')] - ] + [("DIMENSION", "area"), ("ACTION", "mean")], + [ + ("DIMENSION", "time"), + ("ACTION", "mean"), + ("CONSTRAINT", "within"), + ("SELECTION", "hours"), + ], + [ + ("DIMENSION", "time"), + ("ACTION", "maximum"), + ("CONSTRAINT", "over"), + ("SELECTION", "hours"), + ], + ], ), ( "area: mean time: mean within years time: mean over years", [ - [('DIMENSION', 'area'), ('ACTION', 'mean')], - [('DIMENSION', 'time'), ('ACTION', 'mean'), ('CONSTRAINT', 'within'), ('SELECTION', 'years')], - [('DIMENSION', 'time'), ('ACTION', 'mean'), ('CONSTRAINT', 'over'), ('SELECTION', 'years')] - ] + [("DIMENSION", "area"), ("ACTION", "mean")], + [ + ("DIMENSION", "time"), + ("ACTION", "mean"), + ("CONSTRAINT", "within"), + ("SELECTION", "years"), + ], + [ + ("DIMENSION", "time"), + ("ACTION", "mean"), + ("CONSTRAINT", "over"), + ("SELECTION", "years"), + ], + ], ), ( "area: mean time: minimum", [ - [('DIMENSION', 'area'), ('ACTION', 'mean')], - [('DIMENSION', 'time'), ('ACTION', 'minimum')] - ] + [("DIMENSION", "area"), ("ACTION", "mean")], + [("DIMENSION", "time"), ("ACTION", "minimum")], + ], ), ( "area: mean time: minimum within days time: mean over days", [ - [('DIMENSION', 'area'), ('ACTION', 'mean')], - [('DIMENSION', 'time'), ('ACTION', 'minimum'), ('CONSTRAINT', 'within'), ('SELECTION', 'days')], - [('DIMENSION', 'time'), ('ACTION', 'mean'), ('CONSTRAINT', 'over'), ('SELECTION', 'days')] - ] + [("DIMENSION", "area"), ("ACTION", "mean")], + [ + ("DIMENSION", "time"), + ("ACTION", "minimum"), + ("CONSTRAINT", "within"), + ("SELECTION", "days"), + ], + [ + ("DIMENSION", "time"), + ("ACTION", "mean"), + ("CONSTRAINT", "over"), + ("SELECTION", "days"), + ], + ], ), ( "area: mean time: point", [ - [('DIMENSION', 'area'), ('ACTION', 'mean')], - [('DIMENSION', 'time'), ('ACTION', 'point')] - ] + [("DIMENSION", "area"), ("ACTION", "mean")], + [("DIMENSION", "time"), ("ACTION", "point")], + ], ), ( "area: mean time: sum", [ - [('DIMENSION', 'area'), ('ACTION', 'mean')], - [('DIMENSION', 'time'), ('ACTION', 'sum')] - ] - ) + [("DIMENSION", "area"), ("ACTION", "mean")], + [("DIMENSION", "time"), ("ACTION", "sum")], + ], + ), ] @pytest.mark.parametrize("input_text,expected_output", test_cases) def test_cell_methods_parser(input_text, expected_output): result = parse_cell_methods(input_text) - assert result == expected_output, f"\nInput: {input_text}\nExpected: {expected_output}\nGot: {result}" + assert ( + result == expected_output + ), f"\nInput: {input_text}\nExpected: {expected_output}\nGot: {result}" def test_lexer_tokens(): lexer = CellMethodsLexer() # Test each token type is recognized correctly test_tokens = { - 'DIMENSION': ['area:', 'time:', 'depth:', 'grid_longitude:', 'longitude:', 'latitude:'], - 'ACTION': ['mean', 'minimum', 'maximum', 'sum', 'point'], - 'REGION': ['land', 'sea', 'sea_ice', 'snow', 'ice_sheet', 'grounded_ice_sheet', 'crops', 'ice_free_sea'], - 'SELECTION': ['all_area_types', 'days', 'years', 'hours'], - 'CONSTRAINT': ['within', 'over', 'where'], - 'SCOPE': ['(comment text)', '(top 100m only)'] + "DIMENSION": [ + "area:", + "time:", + "depth:", + "grid_longitude:", + "longitude:", + "latitude:", + ], + "ACTION": ["mean", "minimum", "maximum", "sum", "point"], + "REGION": [ + "land", + "sea", + "sea_ice", + "snow", + "ice_sheet", + "grounded_ice_sheet", + "crops", + "ice_free_sea", + ], + "SELECTION": ["all_area_types", "days", "years", "hours"], + "CONSTRAINT": ["within", "over", "where"], + "SCOPE": ["(comment text)", "(top 100m only)"], } for token_type, values in test_tokens.items(): @@ -117,10 +184,18 @@ def test_lexer_tokens(): tokens = list(lexer.tokenize(value)) assert len(tokens) == 1, f"Expected 1 token for {value}, got {len(tokens)}" token = tokens[0] - assert token.type == token_type, f"Expected token type {token_type} for {value}, got {token.type}" - if token_type == 'DIMENSION': - assert token.value == value[:-1], f"Expected value {value[:-1]} for {value}, got {token.value}" - elif token_type == 'SCOPE': - assert token.value == value[1:-1], f"Expected value {value[1:-1]} for {value}, got {token.value}" + assert ( + token.type == token_type + ), f"Expected token type {token_type} for {value}, got {token.type}" + if token_type == "DIMENSION": + assert ( + token.value == value[:-1] + ), f"Expected value {value[:-1]} for {value}, got {token.value}" + elif token_type == "SCOPE": + assert ( + token.value == value[1:-1] + ), f"Expected value {value[1:-1]} for {value}, got {token.value}" else: - assert token.value == value, f"Expected value {value}, got {token.value}" + assert ( + token.value == value + ), f"Expected value {value}, got {token.value}" diff --git a/prototype/cellmethods/test_xarray_translation.py b/prototype/cellmethods/test_xarray_translation.py index f2d8e518..861ce964 100644 --- a/prototype/cellmethods/test_xarray_translation.py +++ b/prototype/cellmethods/test_xarray_translation.py @@ -1,48 +1,44 @@ from cellmethods_parser import translate_to_xarray test_cases = [ - ( - "area: mean", - "da.mean(dim='area')" - ), - ( - "area: mean where sea", - "da.where(mask=='sea').mean(dim='area')" - ), + ("area: mean", "da.mean(dim='area')"), + ("area: mean where sea", "da.where(mask=='sea').mean(dim='area')"), ( "area: mean where sea time: mean", "result_1 = da.where(mask=='sea').mean(dim='area')\n" - "result = result_1.mean(dim='time')" + "result = result_1.mean(dim='time')", ), ( "area: mean time: maximum within days", "result_1 = da.mean(dim='area')\n" - "result = result_1.groupby('days').maximum()" + "result = result_1.groupby('days').maximum()", ), ( "area: mean time: mean within days time: mean over days", "result_1 = da.mean(dim='area')\n" "result_2 = result_1.groupby('days').mean()\n" - "result = result_2.mean(dim='days')" + "result = result_2.mean(dim='days')", ), ( "area: mean (comment: over land and sea ice) time: point", "result_1 = da.mean(dim='area') # comment: over land and sea ice\n" - "result = result_1.point(dim='time')" + "result = result_1.point(dim='time')", ), ( "area: depth: time: mean", "result_1 = da.mean(dim='area')\n" "result_2 = result_1.mean(dim='depth')\n" - "result = result_2.mean(dim='time')" - ) + "result = result_2.mean(dim='time')", + ), ] def test_translations(): for input_text, expected in test_cases: result = translate_to_xarray(input_text) - assert result == expected, f"\nInput: {input_text}\nExpected:\n{expected}\nGot:\n{result}" + assert ( + result == expected + ), f"\nInput: {input_text}\nExpected:\n{expected}\nGot:\n{result}" print(f"\nInput: {input_text}") print("Generated xarray code:") print(result) From a8acf44f7c3c87a2d0f1c88e6e24f98666600914 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Tue, 1 Apr 2025 03:35:15 +0200 Subject: [PATCH 06/13] refined parser to handle wider usecases. Refactored test cases aswell --- prototype/cellmethods/cellmethods_parser.py | 330 +++++++++++------- .../cellmethods/test_cellmethods_parser.py | 323 ++++++++--------- 2 files changed, 343 insertions(+), 310 deletions(-) diff --git a/prototype/cellmethods/cellmethods_parser.py b/prototype/cellmethods/cellmethods_parser.py index 7bb2d6ad..4f54dfe1 100644 --- a/prototype/cellmethods/cellmethods_parser.py +++ b/prototype/cellmethods/cellmethods_parser.py @@ -3,39 +3,70 @@ class CellMethodsLexer(Lexer): # set of token names - tokens = {DIMENSION, ACTION, REGION, CONSTRAINT, SCOPE, SELECTION} # noqa: F821 + tokens = { + DIMENSION, # noqa: F821 + FUNCTION, # noqa: F821 + CONSTRAINT, # noqa: F821 + AREATYPE, # noqa: F821 + SELECTION, # noqa: F821 + COMMENT, # noqa: F821 + } # noqa: F821 # string containing ignored characters between token ignore = " \t" # Regular expression rules for tokens - DIMENSION = r"area:|time:|grid_longitude:|longitude:|latitude:|depth:" - ACTION = r"mean|minimum|maximum|sum|point" - CONSTRAINT = r"within|over|where" # Must come before REGION/SELECTION - REGION = r"[a-z_]+(?![a-z_])" # Match words with underscores - SELECTION = r"[a-z_]+(?![a-z_])" # Match words with underscores + # DIMENSION = r"area:|time:|grid_longitude:|longitude:|latitude:|depth:" + DIMENSION = r"[a-zA-Z_]+:" + FUNCTION = r"mean|minimum|maximum|sum|point" + CONSTRAINT = r"within|over|where" + AREATYPE = r"[a-zA-Z_]+" + SELECTION = r"[a-zA-Z_]+" def DIMENSION(self, t): # noqa: F811 t.value = t.value[:-1] return t - def REGION(self, t): # noqa: F811 - if t.value not in ( + _areatypes = set( + [ "land", + "shrubs", + "pastures", + "crops", + "trees", + "vegetation", + "unfrozen_soil", + "cloud", + "natural_grasses", + "floating_ice_shelf", + "grounded_ice_sheet", + "ice_free_sea", + "ice_sheet", "sea", "sea_ice", + "sea_ice_melt_pond", + "sea_ice_ridges", "snow", - "ice_sheet", - "grounded_ice_sheet", - "crops", - "ice_free_sea", - ): + "sector", + ] + ) + _selection = set(["hours", "days", "years", "months"]) + _selection.add("all_area_types") + + @_(r"[a-zA-Z_]+") # noqa: F821 + def AREATYPE(self, t): # noqa: F811 + if t.value in self._areatypes: + return t + if t.value in self._selection: t.type = "SELECTION" - return t + return t @_(r"\(.*?\)") # noqa: F821 - def SCOPE(self, t): - t.value = t.value[1:-1] + def COMMENT(self, t): + value = t.value[1:-1] + t.value = ( + value.replace("comment:", "").replace("[", "").replace("]", "").strip() + ) return t @_(r"\n+") # noqa: F821 @@ -45,13 +76,15 @@ def newline(self, t): class CellMethodsParser(Parser): tokens = CellMethodsLexer.tokens + debugfile = "parser.out" def __init__(self): - self.groups = [] + self.tmp = [] @_("statements") # noqa: F821 def program(self, p): - return self.groups + return corrections(p.statements) + # return p.statements @_("statement") # noqa: F821 def statements(self, p): @@ -59,68 +92,67 @@ def statements(self, p): @_("statements statement") # noqa: F821 def statements(self, p): # noqa: F811 - return p.statements + return p.statements + p.statement - @_("dimension action") # noqa: F821 + @_("dimension function") # noqa: F821 def statement(self, p): - current_group = [("DIMENSION", p.dimension), ("ACTION", p.action)] - self.groups.append(current_group) - return p + return [p.dimension + p.function] - @_("dimension action SCOPE") # noqa: F821 + @_("dimension function comment") # noqa: F821 def statement(self, p): # noqa: F811 - current_group = [ - ("DIMENSION", p.dimension), - ("ACTION", p.action), - ("SCOPE", p.SCOPE), - ] - self.groups.append(current_group) - return p + return [p.dimension + p.function + p.comment] - @_("dimension action constraint region") # noqa: F821 + @_("dimension function expr") # noqa: F821 def statement(self, p): # noqa: F811 - current_group = [ - ("DIMENSION", p.dimension), - ("ACTION", p.action), - ("CONSTRAINT", p.constraint), - ("REGION", p.region), - ] - self.groups.append(current_group) - return p + return [p.dimension + p.function + p.expr] - @_("dimension action constraint region SCOPE") # noqa: F821 + @_("dimension function exprs") # noqa: F821 def statement(self, p): # noqa: F811 - current_group = [ - ("DIMENSION", p.dimension), - ("ACTION", p.action), - ("CONSTRAINT", p.constraint), - ("REGION", p.region), - ("SCOPE", p.SCOPE), - ] - self.groups.append(current_group) - return p + return [p.dimension + p.function + p.exprs] - @_("dimension action constraint SELECTION") # noqa: F821 + @_("dimensions function") # noqa: F821 def statement(self, p): # noqa: F811 - current_group = [ - ("DIMENSION", p.dimension), - ("ACTION", p.action), - ("CONSTRAINT", p.constraint), - ("SELECTION", p.SELECTION), - ] - self.groups.append(current_group) - return p + return [dim + p.function for dim in p.dimensions] + + @_("dimensions function comment") # noqa: F821 + def statement(self, p): # noqa: F811 + return [dim + p.function + p.comment for dim in p.dimensions] + + @_("dimensions function expr") # noqa: F821 + def statement(self, p): # noqa: F811 + return [dim + p.function + p.expr for dim in p.dimensions] - @_("dimensions ACTION") # noqa: F821 + @_("dimensions function exprs") # noqa: F821 def statement(self, p): # noqa: F811 - for dimension in p.dimensions: - current_group = [("DIMENSION", dimension), ("ACTION", p.ACTION)] - self.groups.append(current_group) - return p + return [dim + p.function + expr for dim in p.dimensions for expr in p.exprs] + + @_("constraint areatype comment") # noqa: F821 + def expr(self, p): + return p.constraint + p.areatype + p.comment + + @_("constraint selection comment") # noqa: F821 + def expr(self, p): # noqa: F811 + return p.constraint + p.selection + p.comment + + @_("constraint areatype") # noqa: F821 + def expr(self, p): # noqa: F811 + return p.constraint + p.areatype + + @_("constraint selection") # noqa: F821 + def expr(self, p): # noqa: F811 + return p.constraint + p.selection + + @_("expr expr") # noqa: F821 + def exprs(self, p): + return p.expr0 + p.expr1 - @_("dimension") # noqa: F821 + @_("exprs expr") # noqa: F821 + def exprs(self, p): # noqa: F811 + return p.exprs + p.expr + + @_("dimension dimension") # noqa: F821 def dimensions(self, p): - return [p.dimension] + return [p.dimension0, p.dimension1] @_("dimensions dimension") # noqa: F821 def dimensions(self, p): # noqa: F811 @@ -128,19 +160,64 @@ def dimensions(self, p): # noqa: F811 @_("DIMENSION") # noqa: F821 def dimension(self, p): - return p.DIMENSION + return [("DIMENSION", p.DIMENSION)] - @_("ACTION") # noqa: F821 - def action(self, p): - return p.ACTION + @_("FUNCTION") # noqa: F821 + def function(self, p): + return [("FUNCTION", p.FUNCTION)] @_("CONSTRAINT") # noqa: F821 def constraint(self, p): - return p.CONSTRAINT - - @_("REGION") # noqa: F821 - def region(self, p): - return p.REGION + return [("CONSTRAINT", p.CONSTRAINT)] + + @_("AREATYPE") # noqa: F821 + def areatype(self, p): + return [("AREATYPE", p.AREATYPE)] + + @_("SELECTION") # noqa: F821 + def selection(self, p): + return [("SELECTION", p.SELECTION)] + + @_("COMMENT") # noqa: F821 + def comment(self, p): + return [("COMMENT", p.COMMENT)] + + +def corrections(groups): + result = [] + for group in groups: + grp = [] + tokens = iter(group) + tok = next(tokens) + tok_type, tok_value = tok + grp.append(tok) + if tok_type == "DIMENSION" and tok_value == "time": + while True: + try: + tok = next(tokens) + except StopIteration: + break + tok_type, tok_value = tok + # for `time` dimension, only SELECTION type is allowed as constraint + if tok_type == "AREATYPE": + grp.pop() + else: + grp.append(tok) + elif tok_type == "DIMENSION" and tok_value == "area": + while True: + try: + tok = next(tokens) + except StopIteration: + break + tok_type, tok_value = tok + if tok_type == "SELECTION" and tok_value != "all_area_types": + grp.pop() + else: + grp.append(tok) + else: + grp.extend(list(tokens)) + result.append(grp) + return result class XArrayTranslator: @@ -154,44 +231,49 @@ def __init__(self, da_name="da"): def translate_group(self, group): """Translate a single group of tokens into an xarray operation.""" - tokens_dict = dict(group) - - # Base operation - operation = f"{self.da_name}" - - # Handle the main action - if "ACTION" in tokens_dict: - action = tokens_dict["ACTION"] - dim = tokens_dict.get("DIMENSION") - - if "CONSTRAINT" in tokens_dict: - constraint = tokens_dict["CONSTRAINT"] - if constraint == "within": - # For 'within', we first group by the selection - selection = tokens_dict.get("SELECTION") - if selection: - operation = f"{operation}.groupby('{selection}').{action}()" - elif constraint == "over": - # For 'over', we apply the operation over the selection - selection = tokens_dict.get("SELECTION") - if selection: - operation = f"{operation}.{action}(dim='{selection}')" - elif constraint == "where": - # For 'where', we apply a mask before the operation - region = tokens_dict.get("REGION") - if region: - operation = ( - f"{operation}.where(mask=='{region}').{action}(dim='{dim}')" - ) - else: - # Simple dimension reduction - operation = f"{operation}.{action}(dim='{dim}')" - - # Add any scope comments as a comment in the code - if "SCOPE" in tokens_dict: - operation = f"{operation} # {tokens_dict['SCOPE']}" - - return operation + tokens = iter(group) + token_type, dim = next(tokens) + assert token_type == "DIMENSION" + token_type, function = next(tokens) + assert token_type == "FUNCTION" + texts = [] + try: + token_type, tok_value = next(tokens) + except StopIteration: + return f"{self.da_name}.{function}(dim={dim})" + else: + if token_type == "COMMENT": + if "mask=" in tok_value: + mask = tok_value.split("=")[1] + return f"{self.da_name}.where({mask}){function}(dim={dim} # comment: {tok_value})" + else: + return ( + f"{self.da_name}.{function}(dim={dim}) # comment: {tok_value}" + ) + elif token_type == "CONSTRAINT": + constraint = tok_value + token_type, tok_value = next(tokens) + text = f"{self.da_name}.{function}(dim={dim}).{constraint}({tok_value})" + texts.append(text) + if constraint == "over": + token_type, tok_value = next(tokens) + text = f"{self.da_name}.{function}(dim={dim}).{constraint}({tok_value})" + texts.append(text) + while True: + try: + token_type, tok_value = next(tokens) + except StopIteration: + break + if token_type == "COMMENT": + text = f" # comment: {tok_value}" + texts.append(text) + elif token_type == "CONSTRAINT": + constraint = tok_value + token_type, tok_value = next(tokens) + text = f".{constraint}({tok_value})" + texts.append(text) + text = "".join(texts) + return text def translate(self, groups): """Translate all groups into a sequence of xarray operations.""" @@ -217,10 +299,14 @@ def translate(self, groups): return "\n".join(operations) +lexer = CellMethodsLexer() +parser = CellMethodsParser() + + def parse_cell_methods(text): - lexer = CellMethodsLexer() - parser = CellMethodsParser() - return parser.parse(lexer.tokenize(text)) + tokens = lexer.tokenize(text) + group = parser.parse(tokens) + return group def translate_to_xarray(text): @@ -229,15 +315,3 @@ def translate_to_xarray(text): parsed = parse_cell_methods(text) return translator.translate(parsed) - - -if __name__ == "__main__": - text = "area: mean where sea depth: sum where sea (top 100m only) time: mean" - result = parse_cell_methods(text) - from pprint import pprint - - pprint(result) - print( - "\nXArray translation (pseduo code, human readable strings. not xarray object):" - ) - print(translate_to_xarray(text)) diff --git a/prototype/cellmethods/test_cellmethods_parser.py b/prototype/cellmethods/test_cellmethods_parser.py index 461095c5..9b3c576e 100644 --- a/prototype/cellmethods/test_cellmethods_parser.py +++ b/prototype/cellmethods/test_cellmethods_parser.py @@ -1,201 +1,160 @@ -import pytest -from cellmethods_parser import CellMethodsLexer, parse_cell_methods +from cellmethods_parser import parse_cell_methods -test_cases = [ - ( - "area: depth: time: mean", + +def test_single_statement_with_just_action(): + text = "area: mean" + result = parse_cell_methods(text) + expected = [[("DIMENSION", "area"), ("FUNCTION", "mean")]] + assert result == expected + + +def test_single_statement_with_action_and_constraint(): + text = "area: mean where land" + result = parse_cell_methods(text) + expected = [ + [ + ("DIMENSION", "area"), + ("FUNCTION", "mean"), + ("CONSTRAINT", "where"), + ("AREATYPE", "land"), + ] + ] + assert result == expected + + +def test_single_statement_with_action_and_constraint_and_comment(): + text = "area: mean where land (comment: mask=landFrac)" + result = parse_cell_methods(text) + expected = [ + [ + ("DIMENSION", "area"), + ("FUNCTION", "mean"), + ("CONSTRAINT", "where"), + ("AREATYPE", "land"), + ("COMMENT", "mask=landFrac"), + ] + ] + assert result == expected + + +def test_many_dimensions_map_to_single_function(): + text = "area: depth: time: mean" + result = parse_cell_methods(text) + expected = [ [ - [("DIMENSION", "area"), ("ACTION", "mean")], - [("DIMENSION", "depth"), ("ACTION", "mean")], - [("DIMENSION", "time"), ("ACTION", "mean")], + ("DIMENSION", "area"), + ("FUNCTION", "mean"), ], - ), - ("area: mean", [[("DIMENSION", "area"), ("ACTION", "mean")]]), - ( - "area: mean (comment: over land and sea ice) time: point", - [ - [ - ("DIMENSION", "area"), - ("ACTION", "mean"), - ("SCOPE", "comment: over land and sea ice"), - ], - [("DIMENSION", "time"), ("ACTION", "point")], + [ + ("DIMENSION", "depth"), + ("FUNCTION", "mean"), ], - ), - ( - "area: mean time: maximum", [ - [("DIMENSION", "area"), ("ACTION", "mean")], - [("DIMENSION", "time"), ("ACTION", "maximum")], + ("DIMENSION", "time"), + ("FUNCTION", "mean"), ], - ), - ( - "area: mean time: maximum within days time: mean over days", - [ - [("DIMENSION", "area"), ("ACTION", "mean")], - [ - ("DIMENSION", "time"), - ("ACTION", "maximum"), - ("CONSTRAINT", "within"), - ("SELECTION", "days"), - ], - [ - ("DIMENSION", "time"), - ("ACTION", "mean"), - ("CONSTRAINT", "over"), - ("SELECTION", "days"), - ], + ] + assert result == expected + + +def test_statements_with_comment_in_middle(): + text = "longitude: sum (comment: basin sum [along zig-zag grid path]) depth: sum time: mean" + result = parse_cell_methods(text) + expected = [ + [ + ("DIMENSION", "longitude"), + ("FUNCTION", "sum"), + ("COMMENT", "basin sum along zig-zag grid path"), ], - ), - ( - "area: mean time: mean within days time: mean over days", - [ - [("DIMENSION", "area"), ("ACTION", "mean")], - [ - ("DIMENSION", "time"), - ("ACTION", "mean"), - ("CONSTRAINT", "within"), - ("SELECTION", "days"), - ], - [ - ("DIMENSION", "time"), - ("ACTION", "mean"), - ("CONSTRAINT", "over"), - ("SELECTION", "days"), - ], + [ + ("DIMENSION", "depth"), + ("FUNCTION", "sum"), ], - ), - ( - "area: mean time: mean within hours time: maximum over hours", - [ - [("DIMENSION", "area"), ("ACTION", "mean")], - [ - ("DIMENSION", "time"), - ("ACTION", "mean"), - ("CONSTRAINT", "within"), - ("SELECTION", "hours"), - ], - [ - ("DIMENSION", "time"), - ("ACTION", "maximum"), - ("CONSTRAINT", "over"), - ("SELECTION", "hours"), - ], + [ + ("DIMENSION", "time"), + ("FUNCTION", "mean"), ], - ), - ( - "area: mean time: mean within years time: mean over years", - [ - [("DIMENSION", "area"), ("ACTION", "mean")], - [ - ("DIMENSION", "time"), - ("ACTION", "mean"), - ("CONSTRAINT", "within"), - ("SELECTION", "years"), - ], - [ - ("DIMENSION", "time"), - ("ACTION", "mean"), - ("CONSTRAINT", "over"), - ("SELECTION", "years"), - ], + ] + assert result == expected + + +def test_time_dimension_constraint_omits_areatpye(): + text = "area: time: mean where cloud" + result = parse_cell_methods(text) + expected = [ + [ + ("DIMENSION", "area"), + ("FUNCTION", "mean"), + ("CONSTRAINT", "where"), + ("AREATYPE", "cloud"), ], - ), - ( - "area: mean time: minimum", [ - [("DIMENSION", "area"), ("ACTION", "mean")], - [("DIMENSION", "time"), ("ACTION", "minimum")], + ("DIMENSION", "time"), + ("FUNCTION", "mean"), ], - ), - ( - "area: mean time: minimum within days time: mean over days", - [ - [("DIMENSION", "area"), ("ACTION", "mean")], - [ - ("DIMENSION", "time"), - ("ACTION", "minimum"), - ("CONSTRAINT", "within"), - ("SELECTION", "days"), - ], - [ - ("DIMENSION", "time"), - ("ACTION", "mean"), - ("CONSTRAINT", "over"), - ("SELECTION", "days"), - ], + ] + assert result == expected + + +def test_multiple_contraints(): + text = "area: mean where land over all_area_types time: mean" + result = parse_cell_methods(text) + expected = [ + [ + ("DIMENSION", "area"), + ("FUNCTION", "mean"), + ("CONSTRAINT", "where"), + ("AREATYPE", "land"), + ("CONSTRAINT", "over"), + ("SELECTION", "all_area_types"), ], - ), - ( - "area: mean time: point", [ - [("DIMENSION", "area"), ("ACTION", "mean")], - [("DIMENSION", "time"), ("ACTION", "point")], + ("DIMENSION", "time"), + ("FUNCTION", "mean"), ], - ), - ( - "area: mean time: sum", + ] + assert result == expected + + +def test_statements_with_repeated_dimensions(): + text = "area: mean where crops time: minimum within days time: mean over days" + result = parse_cell_methods(text) + expected = [ [ - [("DIMENSION", "area"), ("ACTION", "mean")], - [("DIMENSION", "time"), ("ACTION", "sum")], + ("DIMENSION", "area"), + ("FUNCTION", "mean"), + ("CONSTRAINT", "where"), + ("AREATYPE", "crops"), ], - ), -] - - -@pytest.mark.parametrize("input_text,expected_output", test_cases) -def test_cell_methods_parser(input_text, expected_output): - result = parse_cell_methods(input_text) - assert ( - result == expected_output - ), f"\nInput: {input_text}\nExpected: {expected_output}\nGot: {result}" - - -def test_lexer_tokens(): - lexer = CellMethodsLexer() - # Test each token type is recognized correctly - test_tokens = { - "DIMENSION": [ - "area:", - "time:", - "depth:", - "grid_longitude:", - "longitude:", - "latitude:", + [ + ("DIMENSION", "time"), + ("FUNCTION", "minimum"), + ("CONSTRAINT", "within"), + ("SELECTION", "days"), ], - "ACTION": ["mean", "minimum", "maximum", "sum", "point"], - "REGION": [ - "land", - "sea", - "sea_ice", - "snow", - "ice_sheet", - "grounded_ice_sheet", - "crops", - "ice_free_sea", + [ + ("DIMENSION", "time"), + ("FUNCTION", "mean"), + ("CONSTRAINT", "over"), + ("SELECTION", "days"), + ], + ] + assert result == expected + + +def test_area_dimension_contraint_omits_selection(): + text = "area: time: mean over days" + result = parse_cell_methods(text) + expected = [ + [ + ("DIMENSION", "area"), + ("FUNCTION", "mean"), + ], + [ + ("DIMENSION", "time"), + ("FUNCTION", "mean"), + ("CONSTRAINT", "over"), + ("SELECTION", "days"), ], - "SELECTION": ["all_area_types", "days", "years", "hours"], - "CONSTRAINT": ["within", "over", "where"], - "SCOPE": ["(comment text)", "(top 100m only)"], - } - - for token_type, values in test_tokens.items(): - for value in values: - tokens = list(lexer.tokenize(value)) - assert len(tokens) == 1, f"Expected 1 token for {value}, got {len(tokens)}" - token = tokens[0] - assert ( - token.type == token_type - ), f"Expected token type {token_type} for {value}, got {token.type}" - if token_type == "DIMENSION": - assert ( - token.value == value[:-1] - ), f"Expected value {value[:-1]} for {value}, got {token.value}" - elif token_type == "SCOPE": - assert ( - token.value == value[1:-1] - ), f"Expected value {value[1:-1]} for {value}, got {token.value}" - else: - assert ( - token.value == value - ), f"Expected value {value}, got {token.value}" + ] + assert result == expected From e92a101bdbb568f0267594d84ed83639abebd2d1 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Tue, 1 Apr 2025 03:41:23 +0200 Subject: [PATCH 07/13] cell methods string to xarray objects --- prototype/cellmethods/cell_methods_xarray.py | 163 +++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 prototype/cellmethods/cell_methods_xarray.py diff --git a/prototype/cellmethods/cell_methods_xarray.py b/prototype/cellmethods/cell_methods_xarray.py new file mode 100644 index 00000000..f4045fc4 --- /dev/null +++ b/prototype/cellmethods/cell_methods_xarray.py @@ -0,0 +1,163 @@ +import xarray as xr +import numpy as np +from typing import List, Tuple, Optional +from cellmethods_parser import parse_cell_methods + +""" +Prototype code only. Not sure if cellmethods are handled this way. Lot of ambiguity at many steps. +""" + + +class CellMethodsConverter: + def __init__(self): + self.function_map = { + "mean": xr.DataArray.mean, + "sum": xr.DataArray.sum, + "maximum": xr.DataArray.max, + "minimum": xr.DataArray.min, + "point": lambda x, dim: x.isel(**{dim: 0}), + } + + self.dimension_map = { + "area": "area", + "time": "time", + "depth": "depth", + "longitude": "lon", + "grid_longitude": "grid_lon", + } + + def apply_constraint( + self, da: xr.DataArray, constraint: str, value: str, scope: Optional[str] = None + ) -> xr.DataArray: + """Apply where/over constraints to the DataArray.""" + if constraint == "where": + # Handle special cases with mask variables + if scope and "(comment: mask=" in scope: + mask_var = scope.split("mask=")[1].rstrip(")") + # Assuming the mask variable is available in the same dataset + return da.where(da.coords[mask_var] > 0) + + # Handle basic area type constraints + area_types = [ + "land", + "sea", + "ice_sheet", + "sea_ice", + "crops", + "trees", + "vegetation", + "unfrozen_soil", + "cloud", + "natural_grasses", + "floating_ice_shelf", + "grounded_ice_sheet", + "ice_free_sea", + "sea_ice_melt_pond", + "sea_ice_ridges", + "snow", + "sector", + "shrubs", + "pastures", + ] + + if value in area_types: + # Use the mask from coordinates + mask_var = f"{value}_mask" + mask = da.coords[mask_var] + # Create a boolean mask array that matches the data dimensions + mask_data = mask.values > 0 + # Broadcast mask to match data dimensions + for _ in range(len(da.dims) - 1): + mask_data = mask_data[:, np.newaxis] + # Apply the mask + return da.where(mask_data) + + elif constraint == "over": + if value == "all_area_types": + # No filtering needed, already considering all areas + return da + elif value in ["days", "months", "years", "hours"]: + # This will be handled in the time aggregation + return da + + return da + + def process_cell_method( + self, da: xr.DataArray, method: List[Tuple[str, str]] + ) -> xr.DataArray: + """Process a single cell method (one group of operations).""" + result = da.copy() # Make a copy to preserve coordinates + dim = None + func = None + constraints = [] + scope = None + + for token_type, token_value in method: + if token_type == "DIMENSION": + dim = self.dimension_map.get(token_value, token_value) + elif token_type == "FUNCTION": + func = self.function_map[token_value] + elif token_type == "CONSTRAINT": + constraints.append(token_value) + elif token_type == "AREATYPE" or token_type == "SELECTION": + if constraints: + result = self.apply_constraint( + result, constraints[-1], token_value, scope + ) + elif token_type == "SCOPE": + scope = token_value + + if dim and func: + # Handle time-based selections before applying the function + if dim == "time" and constraints and constraints[-1] == "over": + # Get the appropriate time frequency + freq = {"hours": "h", "days": "D", "months": "M", "years": "Y"}.get( + token_value + ) + if freq: + result = result.resample(time=freq).mean() + else: + # Apply the main function + if func == self.function_map["point"]: + result = result.isel(**{dim: 0}) + else: + result = func(result, dim=dim) + + return result + + def apply_cell_methods( + self, da: xr.DataArray, cell_methods_str: str + ) -> xr.DataArray: + """Apply cell methods to a DataArray based on the cell_methods string.""" + parsed = parse_cell_methods(cell_methods_str) + if parsed is None: + raise ValueError(f"Failed to parse cell methods string: {cell_methods_str}") + + result = da + for method in parsed: + result = self.process_cell_method(result, method) + + return result + + +# Example usage: +def apply_cell_methods(da: xr.DataArray, cell_methods_str: str) -> xr.DataArray: + """ + Apply cell methods to a DataArray based on the cell_methods string. + + Args: + da: Input xarray DataArray + cell_methods_str: Cell methods string (e.g., "area: mean time: maximum") + + Returns: + Processed xarray DataArray + + Example: + >>> import xarray as xr + >>> import numpy as np + >>> data = np.random.rand(4, 3, 2) # time, area, depth + >>> da = xr.DataArray(data, dims=['time', 'area', 'depth']) + >>> result = apply_cell_methods(da, "area: mean time: maximum") + """ + converter = CellMethodsConverter() + return converter.apply_cell_methods(da, cell_methods_str) From bc7b1634b36e4b6238e0193372848cbe7a0462d3 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Wed, 2 Apr 2025 00:02:49 +0200 Subject: [PATCH 08/13] added cellmethods in cli validate --- src/pymorize/cli.py | 32 +++++++++++++++++++ src/pymorize/prototype/__init__.py | 0 .../prototype/cellmethods/__init__.py | 0 .../cellmethods/cell_methods_xarray.py | 0 .../cellmethods/cellmethods_parser.py | 0 .../cellmethods/test_cellmethods_parser.py | 0 .../cellmethods/test_xarray_translation.py | 0 7 files changed, 32 insertions(+) create mode 100644 src/pymorize/prototype/__init__.py create mode 100644 src/pymorize/prototype/cellmethods/__init__.py rename {prototype => src/pymorize/prototype}/cellmethods/cell_methods_xarray.py (100%) rename {prototype => src/pymorize/prototype}/cellmethods/cellmethods_parser.py (100%) rename {prototype => src/pymorize/prototype}/cellmethods/test_cellmethods_parser.py (100%) rename {prototype => src/pymorize/prototype}/cellmethods/test_xarray_translation.py (100%) diff --git a/src/pymorize/cli.py b/src/pymorize/cli.py index 84008023..d7c6d12c 100644 --- a/src/pymorize/cli.py +++ b/src/pymorize/cli.py @@ -19,6 +19,7 @@ from .logging import add_report_logger, logger from .ssh_tunnel import ssh_tunnel_cli from .validate import GENERAL_VALIDATOR, PIPELINES_VALIDATOR, RULES_VALIDATOR +from .prototype.cellmethods.cellmethods_parser import parse_cell_methods, translate_to_xarray MAX_FRAMES = int(os.environ.get("PYMORIZE_ERROR_MAX_FRAMES", 3)) """ @@ -248,6 +249,37 @@ def directory(config_file, output_dir, verbose, quiet, logfile, profile_mem): cmorizer.check_rules_for_output_dir(output_dir) +@validate.command() +@click_loguru.logging_options +@click_loguru.init_logger() +@click.argument("config_file", type=click.Path(exists=True)) +def cellmethods(config_file, verbose, quiet, logfile, profile_mem): + logger.info(f"Processing {config_file}") + with open(config_file, "r") as f: + cfg = yaml.safe_load(f) + cmorizer = CMORizer.from_dict(cfg) + seen_rules = set() + for rule in cmorizer.rules: + if rule.name in seen_rules: + continue + else: + seen_rules.add(rule.name) + cellmethod_text = rule.data_request_variable.cell_methods + if not cellmethod_text.strip(): + continue + else: + tokengroups = parse_cell_methods(cellmethod_text) + logger.info(f"Rule {rule.name!r}: Parsing cellmethods text...") + logger.info(f"{cellmethod_text}") + logger.info("Tokens:") + for tok in tokengroups: + logger.info(f" {tok}") + logger.info("xarray translation (Pseudo code)") + codelines = translate_to_xarray(cellmethod_text) + for line in codelines.splitlines(): + logger.info(f" {line}") + + ################################################################################ ################################################################################ ################################################################################ diff --git a/src/pymorize/prototype/__init__.py b/src/pymorize/prototype/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/pymorize/prototype/cellmethods/__init__.py b/src/pymorize/prototype/cellmethods/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/prototype/cellmethods/cell_methods_xarray.py b/src/pymorize/prototype/cellmethods/cell_methods_xarray.py similarity index 100% rename from prototype/cellmethods/cell_methods_xarray.py rename to src/pymorize/prototype/cellmethods/cell_methods_xarray.py diff --git a/prototype/cellmethods/cellmethods_parser.py b/src/pymorize/prototype/cellmethods/cellmethods_parser.py similarity index 100% rename from prototype/cellmethods/cellmethods_parser.py rename to src/pymorize/prototype/cellmethods/cellmethods_parser.py diff --git a/prototype/cellmethods/test_cellmethods_parser.py b/src/pymorize/prototype/cellmethods/test_cellmethods_parser.py similarity index 100% rename from prototype/cellmethods/test_cellmethods_parser.py rename to src/pymorize/prototype/cellmethods/test_cellmethods_parser.py diff --git a/prototype/cellmethods/test_xarray_translation.py b/src/pymorize/prototype/cellmethods/test_xarray_translation.py similarity index 100% rename from prototype/cellmethods/test_xarray_translation.py rename to src/pymorize/prototype/cellmethods/test_xarray_translation.py From 088b17b75764f7f751635a056ba7d4a02eecea76 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Wed, 2 Apr 2025 14:30:57 +0200 Subject: [PATCH 09/13] fixed import statements --- src/pymorize/prototype/cellmethods/cell_methods_xarray.py | 2 +- src/pymorize/prototype/cellmethods/test_cellmethods_parser.py | 2 +- src/pymorize/prototype/cellmethods/test_xarray_translation.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pymorize/prototype/cellmethods/cell_methods_xarray.py b/src/pymorize/prototype/cellmethods/cell_methods_xarray.py index f4045fc4..e54b7b71 100644 --- a/src/pymorize/prototype/cellmethods/cell_methods_xarray.py +++ b/src/pymorize/prototype/cellmethods/cell_methods_xarray.py @@ -1,7 +1,7 @@ import xarray as xr import numpy as np from typing import List, Tuple, Optional -from cellmethods_parser import parse_cell_methods +from .prototype.cellmethods.cellmethods_parser import parse_cell_methods """ Prototype code only. Not sure if cellmethods are handled this way. Lot of ambiguity at many steps. diff --git a/src/pymorize/prototype/cellmethods/test_cellmethods_parser.py b/src/pymorize/prototype/cellmethods/test_cellmethods_parser.py index 9b3c576e..75063296 100644 --- a/src/pymorize/prototype/cellmethods/test_cellmethods_parser.py +++ b/src/pymorize/prototype/cellmethods/test_cellmethods_parser.py @@ -1,4 +1,4 @@ -from cellmethods_parser import parse_cell_methods +from .prototype.cellmethods.cellmethods_parser import parse_cell_methods def test_single_statement_with_just_action(): diff --git a/src/pymorize/prototype/cellmethods/test_xarray_translation.py b/src/pymorize/prototype/cellmethods/test_xarray_translation.py index 861ce964..5f245962 100644 --- a/src/pymorize/prototype/cellmethods/test_xarray_translation.py +++ b/src/pymorize/prototype/cellmethods/test_xarray_translation.py @@ -1,4 +1,4 @@ -from cellmethods_parser import translate_to_xarray +from .prototype.cellmethods.cellmethods_parser import translate_to_xarray test_cases = [ ("area: mean", "da.mean(dim='area')"), From 71447ccdadd934ec106a536910cb484dc53be959 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Wed, 2 Apr 2025 14:47:09 +0200 Subject: [PATCH 10/13] fixed isort thing --- src/pymorize/cli.py | 5 ++++- src/pymorize/prototype/cellmethods/cell_methods_xarray.py | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/pymorize/cli.py b/src/pymorize/cli.py index d7c6d12c..130b8a71 100644 --- a/src/pymorize/cli.py +++ b/src/pymorize/cli.py @@ -17,9 +17,12 @@ from .cmorizer import CMORizer from .filecache import fc from .logging import add_report_logger, logger +from .prototype.cellmethods.cellmethods_parser import ( + parse_cell_methods, + translate_to_xarray, +) from .ssh_tunnel import ssh_tunnel_cli from .validate import GENERAL_VALIDATOR, PIPELINES_VALIDATOR, RULES_VALIDATOR -from .prototype.cellmethods.cellmethods_parser import parse_cell_methods, translate_to_xarray MAX_FRAMES = int(os.environ.get("PYMORIZE_ERROR_MAX_FRAMES", 3)) """ diff --git a/src/pymorize/prototype/cellmethods/cell_methods_xarray.py b/src/pymorize/prototype/cellmethods/cell_methods_xarray.py index e54b7b71..5a5f19c0 100644 --- a/src/pymorize/prototype/cellmethods/cell_methods_xarray.py +++ b/src/pymorize/prototype/cellmethods/cell_methods_xarray.py @@ -1,6 +1,8 @@ -import xarray as xr +from typing import List, Optional, Tuple + import numpy as np -from typing import List, Tuple, Optional +import xarray as xr + from .prototype.cellmethods.cellmethods_parser import parse_cell_methods """ From 0a677dc944796c07143684dc78d9f5a1ff5ed949 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Wed, 2 Apr 2025 17:26:33 +0200 Subject: [PATCH 11/13] fix relative imports --- src/pymorize/prototype/cellmethods/cell_methods_xarray.py | 2 +- src/pymorize/prototype/cellmethods/test_cellmethods_parser.py | 2 +- src/pymorize/prototype/cellmethods/test_xarray_translation.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pymorize/prototype/cellmethods/cell_methods_xarray.py b/src/pymorize/prototype/cellmethods/cell_methods_xarray.py index 5a5f19c0..883313df 100644 --- a/src/pymorize/prototype/cellmethods/cell_methods_xarray.py +++ b/src/pymorize/prototype/cellmethods/cell_methods_xarray.py @@ -3,7 +3,7 @@ import numpy as np import xarray as xr -from .prototype.cellmethods.cellmethods_parser import parse_cell_methods +from .cellmethods_parser import parse_cell_methods """ Prototype code only. Not sure if cellmethods are handled this way. Lot of ambiguity at many steps. diff --git a/src/pymorize/prototype/cellmethods/test_cellmethods_parser.py b/src/pymorize/prototype/cellmethods/test_cellmethods_parser.py index 75063296..72c07c38 100644 --- a/src/pymorize/prototype/cellmethods/test_cellmethods_parser.py +++ b/src/pymorize/prototype/cellmethods/test_cellmethods_parser.py @@ -1,4 +1,4 @@ -from .prototype.cellmethods.cellmethods_parser import parse_cell_methods +from .cellmethods_parser import parse_cell_methods def test_single_statement_with_just_action(): diff --git a/src/pymorize/prototype/cellmethods/test_xarray_translation.py b/src/pymorize/prototype/cellmethods/test_xarray_translation.py index 5f245962..f698e792 100644 --- a/src/pymorize/prototype/cellmethods/test_xarray_translation.py +++ b/src/pymorize/prototype/cellmethods/test_xarray_translation.py @@ -1,4 +1,4 @@ -from .prototype.cellmethods.cellmethods_parser import translate_to_xarray +from .cellmethods_parser import translate_to_xarray test_cases = [ ("area: mean", "da.mean(dim='area')"), From 511dc490992462b93bc2b50d20f167d81941ebc0 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Thu, 3 Apr 2025 02:04:22 +0200 Subject: [PATCH 12/13] fixed xarray translation test --- .../cellmethods/cellmethods_parser.py | 24 ++++++++++++----- .../cellmethods/test_xarray_translation.py | 26 +++++++++---------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/pymorize/prototype/cellmethods/cellmethods_parser.py b/src/pymorize/prototype/cellmethods/cellmethods_parser.py index 4f54dfe1..4ee193e1 100644 --- a/src/pymorize/prototype/cellmethods/cellmethods_parser.py +++ b/src/pymorize/prototype/cellmethods/cellmethods_parser.py @@ -228,6 +228,13 @@ class XArrayTranslator: def __init__(self, da_name="da"): self.da_name = da_name + self.function_map = { + "maximum": "max", + "minimum": "min", + "point": "isel", + "within": "groupby", + "over": "groupby", + } def translate_group(self, group): """Translate a single group of tokens into an xarray operation.""" @@ -235,6 +242,7 @@ def translate_group(self, group): token_type, dim = next(tokens) assert token_type == "DIMENSION" token_type, function = next(tokens) + function = self.function_map.get(function, function) assert token_type == "FUNCTION" texts = [] try: @@ -251,14 +259,18 @@ def translate_group(self, group): f"{self.da_name}.{function}(dim={dim}) # comment: {tok_value}" ) elif token_type == "CONSTRAINT": - constraint = tok_value + _constraint = tok_value + constraint = self.function_map.get(tok_value, tok_value) token_type, tok_value = next(tokens) - text = f"{self.da_name}.{function}(dim={dim}).{constraint}({tok_value})" - texts.append(text) - if constraint == "over": - token_type, tok_value = next(tokens) + if constraint == "groupby": + text = f"{self.da_name}.{constraint}({tok_value}).{function}(dim={dim}) # {_constraint}" + else: text = f"{self.da_name}.{function}(dim={dim}).{constraint}({tok_value})" - texts.append(text) + texts.append(text) + # if constraint == "over": + # token_type, tok_value = next(tokens) + # text = f"{self.da_name}.{function}(dim={dim}).{constraint}({tok_value})" + # texts.append(text) while True: try: token_type, tok_value = next(tokens) diff --git a/src/pymorize/prototype/cellmethods/test_xarray_translation.py b/src/pymorize/prototype/cellmethods/test_xarray_translation.py index f698e792..8f47eda4 100644 --- a/src/pymorize/prototype/cellmethods/test_xarray_translation.py +++ b/src/pymorize/prototype/cellmethods/test_xarray_translation.py @@ -1,34 +1,32 @@ from .cellmethods_parser import translate_to_xarray test_cases = [ - ("area: mean", "da.mean(dim='area')"), - ("area: mean where sea", "da.where(mask=='sea').mean(dim='area')"), + ("area: mean", "da.mean(dim=area)"), + ("area: mean where sea", "da.mean(dim=area).where(sea)"), ( "area: mean where sea time: mean", - "result_1 = da.where(mask=='sea').mean(dim='area')\n" - "result = result_1.mean(dim='time')", + "result_1 = da.mean(dim=area).where(sea)\n" "result = result_1.mean(dim=time)", ), ( "area: mean time: maximum within days", - "result_1 = da.mean(dim='area')\n" - "result = result_1.groupby('days').maximum()", + "result_1 = da.mean(dim=area)\nresult = result_1.groupby(days).max(dim=time) # within", ), ( "area: mean time: mean within days time: mean over days", - "result_1 = da.mean(dim='area')\n" - "result_2 = result_1.groupby('days').mean()\n" - "result = result_2.mean(dim='days')", + "result_1 = da.mean(dim=area)\n" + "result_2 = result_1.groupby(days).mean(dim=time) # within\n" + "result = result_2.groupby(days).mean(dim=time) # over", ), ( "area: mean (comment: over land and sea ice) time: point", - "result_1 = da.mean(dim='area') # comment: over land and sea ice\n" - "result = result_1.point(dim='time')", + "result_1 = da.mean(dim=area) # comment: over land and sea ice\n" + "result = result_1.isel(dim=time)", ), ( "area: depth: time: mean", - "result_1 = da.mean(dim='area')\n" - "result_2 = result_1.mean(dim='depth')\n" - "result = result_2.mean(dim='time')", + "result_1 = da.mean(dim=area)\n" + "result_2 = result_1.mean(dim=depth)\n" + "result = result_2.mean(dim=time)", ), ] From 98431f8353e2dd02c8a8e99854482aa391469910 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Thu, 3 Apr 2025 08:35:43 +0200 Subject: [PATCH 13/13] using cmor_variable name instead of rule.name --- src/pymorize/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pymorize/cli.py b/src/pymorize/cli.py index 130b8a71..6c4aa902 100644 --- a/src/pymorize/cli.py +++ b/src/pymorize/cli.py @@ -272,7 +272,7 @@ def cellmethods(config_file, verbose, quiet, logfile, profile_mem): continue else: tokengroups = parse_cell_methods(cellmethod_text) - logger.info(f"Rule {rule.name!r}: Parsing cellmethods text...") + logger.info(f"{rule.cmor_variable!r}: Parsing cellmethods text...") logger.info(f"{cellmethod_text}") logger.info("Tokens:") for tok in tokengroups: