From cb00206b1d9bf13388d1a11a43f5f94e80ec6271 Mon Sep 17 00:00:00 2001
From: rgraber
Date: Thu, 30 Jan 2025 13:32:31 -0500
Subject: [PATCH 1/4] fix: allow headers to be capitalized

---
 src/formpack/utils/expand_content.py | 2 +-
 tests/test_expand_content.py         | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/formpack/utils/expand_content.py b/src/formpack/utils/expand_content.py
index 852e50b..24d1f07 100644
--- a/src/formpack/utils/expand_content.py
+++ b/src/formpack/utils/expand_content.py
@@ -233,7 +233,7 @@ def _mark_special(**kwargs: str) -> None:
     _pluck_uniq_cols('choices')
 
     for column_name in uniq_cols.keys():
-        if column_name in ['label', 'hint']:
+        if column_name.lower() in ['label', 'hint']:
             _mark_special(
                 column_name=column_name,
                 column=column_name,
diff --git a/tests/test_expand_content.py b/tests/test_expand_content.py
index eb05746..cac6d66 100644
--- a/tests/test_expand_content.py
+++ b/tests/test_expand_content.py
@@ -604,5 +604,11 @@ def test_expand_translations_null_lang():
     assert s1 == s1_copy
 
 
+def test_expand_ignores_case():
+    s1 = {'survey': [{'type': 'text', 'Label': 'hi'}]}
+    expand_content(s1, in_place=True)
+    assert s1.get('translated') == ['Label']
+
+
 def _s(rows):
     return {'survey': [dict([[key, 'x']]) for key in rows]}

From 00be6b9494c3f3a28cb43bdfaa809c75ce6508c1 Mon Sep 17 00:00:00 2001
From: rgraber
Date: Mon, 10 Feb 2025 16:01:48 -0500
Subject: [PATCH 2/4] fixup: clean column names

---
 dev-requirements.txt                 |  1 +
 setup.py                             |  2 +-
 src/formpack/pack.py                 |  1 +
 src/formpack/utils/expand_content.py | 63 +++++++++++++++++++++++++++-
 tests/test_expand_content.py         | 22 ++++++++++
 5 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/dev-requirements.txt b/dev-requirements.txt
index 2cb364d..09427c8 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -7,6 +7,7 @@
 # Install testing / development requirements
 coverage[toml]==6.5.0
 coveralls==3.3.1
+ddt==1.7.2
 flake8==7.1.1
 funcsigs==1.0.2
 geojson-rewind==1.1.0
diff --git a/setup.py b/setup.py
index 8423bb5..a40b006 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@
 
 setup(
     name='formpack',
-    version='3.0.0',
+    version='3.0.1',
     description='Manipulation tools for KoBo forms',
     author='the formpack contributors (https://github.com/kobotoolbox/formpack/graphs/contributors)',
     url='https://github.com/kobotoolbox/formpack/',
diff --git a/src/formpack/pack.py b/src/formpack/pack.py
index f3f9163..7f5fd95 100644
--- a/src/formpack/pack.py
+++ b/src/formpack/pack.py
@@ -137,6 +137,7 @@ def load_version(self, schema):
         unique accross an entire FormPack. It can be None, but only for one
         version in the FormPack.
         """
+        breakpoint()
         replace_aliases(schema['content'], in_place=True)
         expand_content(schema['content'], in_place=True)
 
diff --git a/src/formpack/utils/expand_content.py b/src/formpack/utils/expand_content.py
index 24d1f07..dfae2cb 100644
--- a/src/formpack/utils/expand_content.py
+++ b/src/formpack/utils/expand_content.py
@@ -53,6 +53,7 @@ def _expand_translatable_content(
             row[_expandable_col][_nti] = _oldval
         if col_shortname != _expandable_col:
             row[_expandable_col][cur_translation_index] = row[col_shortname]
+            breakpoint()
             del row[col_shortname]
 
 
@@ -94,7 +95,67 @@ def _get_translations_from_special_cols(
     return translations, set(translated_cols)
 
 
+def clean_column_name(column_name: str) -> str:
+    RE_MEDIA_COLUMN_NAMES = '|'.join(MEDIA_COLUMN_NAMES)
+
+    # "LaBeL" -> "label", "HiNT" -> "hint"
+    if column_name.lower() in ['label', 'hint']:
+        return column_name.lower()
+
+    # "Bind:Some:Thing" -> "bind:Some:Thing", "BodY:" -> "body:"
+    match = re.match(r'^(bind|body):.*', column_name, flags=re.IGNORECASE)
+    if match:
+        lower_cased = match.group(0).lower()
+        return re.sub(r'^(bind|body)', lower_cased, column_name, flags=re.IGNORECASE)
+
+    # "Media:Audio::ES" -> "media:audio::ES", "ViDeO : ES" -> "video : ES"
+    match = re.match(
+        rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})\s*::?\s*([^:]+)$',
+        column_name,
+        flags=re.IGNORECASE
+    )
+    if match:
+        matched = match.groups()
+        lower_media_prefix = matched[0].lower()
+        lower_media_type = matched[1].lower()
+        return re.sub(rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})(\s*::?\s*)([^:]+)$',
+                      rf'{lower_media_prefix}{lower_media_type}\3\4',
+                      column_name, flags=re.IGNORECASE)
+
+    # "Media: AuDiO" -> "media: audio", "VIDEO" -> "video"
+    match = re.match(
+        rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})$', column_name
+    )
+    if match:
+        matched = match.groups()
+        lower_media_prefix = matched[0].lower()
+        lower_media_type = matched[1].lower()
+        return re.sub(rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})$',
+                      rf'{lower_media_prefix}{lower_media_type}',
+                      column_name, flags=re.IGNORECASE)
+
+    match = re.match(r'^([^:]+)(\s*::?\s*)([^:]+)$', column_name)
+    if match:
+        # example: label::x, constraint_message::x, hint::x
+        matched = match.groups()
+        lower_column_shortname = matched[0].lower()
+        return re.sub(r'^([^:]+)(\s*::?\s*)([^:]+)$', rf'{lower_column_shortname}\2\3', column_name,
+                      flags=re.IGNORECASE)
+    return column_name.lower()
+
+
+def preprocess_columns(content: Dict[str, List[Any]]) -> None:
+
+    for sheet, rows in content.items():
+        for row in rows:
+            for column_name, value in row.copy().items():
+                cleaned_name = clean_column_name(column_name)
+                del row[column_name]
+                row[cleaned_name] = value
+
 def expand_content_in_place(content: Dict[str, List[Any]]) -> None:
+    preprocess_columns(content)
+
     specials, translations, transl_cols = _get_special_survey_cols(content)
 
     if len(translations) > 0:
@@ -233,7 +294,7 @@ def _mark_special(**kwargs: str) -> None:
     _pluck_uniq_cols('choices')
 
     for column_name in uniq_cols.keys():
-        if column_name.lower() in ['label', 'hint']:
+        if column_name in ['label', 'hint']:
             _mark_special(
                 column_name=column_name,
                 column=column_name,
diff --git a/tests/test_expand_content.py b/tests/test_expand_content.py
index cac6d66..f6a618f 100644
--- a/tests/test_expand_content.py
+++ b/tests/test_expand_content.py
@@ -1,6 +1,8 @@
 # coding: utf-8
 import copy
 from collections import OrderedDict
+from ddt import data, ddt, unpack
+from unittest import TestCase
 
 from formpack import FormPack
 from formpack.constants import OR_OTHER_COLUMN as _OR_OTHER
@@ -12,6 +14,8 @@
 from formpack.utils.flatten_content import flatten_content
 from formpack.utils.string import orderable_with_none
 
+from formpack.src.formpack.utils.expand_content import clean_column_name
+
 
 def test_expand_selects_with_or_other():
     assert _expand_type_to_dict('select_one xx or other').get(_OR_OTHER) == True
@@ -612,3 +616,21 @@ def test_expand_ignores_case():
 
 def _s(rows):
     return {'survey': [dict([[key, 'x']]) for key in rows]}
+
+@ddt
+class ColumnTestCase(TestCase):
+    @data(
+        ('FOO', 'foo'),
+        ('LABEL', 'label'),
+        ('HINT', 'hint'),
+        ('BIND::FOO', 'bind::FOO'),
+        ('BODY : FOO', 'body : FOO'),
+        ('MEDIA:AUDIO:Spanish', 'media:audio:Spanish'),
+        ('VIDEO :: SPANISH', 'video :: SPANISH'),
+        ('MEDIA:AUDIO', 'media:audio'),
+        ('IMAGE', 'image'),
+        ('LABEL : SPANISH', 'label : Spanish')
+    )
+    @unpack
+    def test_clean_column_name(self, name, expected):
+        assert clean_column_name(name) == expected

From acef60f0dd920a3cdef2d6afdf38bc5a3f25d9f5 Mon Sep 17 00:00:00 2001
From: rgraber
Date: Tue, 11 Feb 2025 09:06:07 -0500
Subject: [PATCH 3/4] fixup!: stuff

---
Review notes (text in this section is not part of the commit message):

* The media-only branch ("Media: AuDiO", "VIDEO") stored its result in
  `already_seen` but no longer returned it, so control fell through to
  the generic "shortname::suffix" branch, which lower-cases only the
  first segment (e.g. "MEDIA:AUDIO" -> "media:AUDIO") and overwrote the
  cached value. `return cleaned` is restored there.
* `clean_column_name()` now requires the `already_seen` cache, so the
  ddt test passes a fresh dict instead of calling it with one argument.
* Line structure and indentation of this series were reconstructed from
  a whitespace-collapsed copy; the blob ids on the `index` lines of this
  patch predate the two changes above.

 src/formpack/utils/expand_content.py | 44 +++++++++++++++++++---------
 tests/test_expand_content.py         |  7 +++--
 2 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/src/formpack/utils/expand_content.py b/src/formpack/utils/expand_content.py
index dfae2cb..53e489e 100644
--- a/src/formpack/utils/expand_content.py
+++ b/src/formpack/utils/expand_content.py
@@ -95,18 +95,28 @@ def _get_translations_from_special_cols(
     return translations, set(translated_cols)
 
 
-def clean_column_name(column_name: str) -> str:
+def clean_column_name(column_name: str, already_seen: dict[str, str]) -> str:
+    """
+
+    Preserves ":" vs "::" and any spaces around the colons
+    """
     RE_MEDIA_COLUMN_NAMES = '|'.join(MEDIA_COLUMN_NAMES)
 
+    if column_name in already_seen:
+        return already_seen[column_name]
     # "LaBeL" -> "label", "HiNT" -> "hint"
     if column_name.lower() in ['label', 'hint']:
-        return column_name.lower()
+        cleaned = column_name.lower()
+        already_seen[column_name] = cleaned
+        return cleaned
 
     # "Bind:Some:Thing" -> "bind:Some:Thing", "BodY:" -> "body:"
     match = re.match(r'^(bind|body):.*', column_name, flags=re.IGNORECASE)
     if match:
-        lower_cased = match.group(0).lower()
-        return re.sub(r'^(bind|body)', lower_cased, column_name, flags=re.IGNORECASE)
+        lower_cased = match.groups()[0].lower()
+        cleaned = re.sub(r'^(bind|body)', lower_cased, column_name, flags=re.IGNORECASE)
+        already_seen[column_name] = cleaned
+        return cleaned
 
     # "Media:Audio::ES" -> "media:audio::ES", "ViDeO : ES" -> "video : ES"
     match = re.match(
@@ -116,40 +126,48 @@ def clean_column_name(column_name: str) -> str:
     )
     if match:
         matched = match.groups()
-        lower_media_prefix = matched[0].lower()
+        lower_media_prefix = matched[0].lower() if matched[0] else ''
         lower_media_type = matched[1].lower()
-        return re.sub(rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})(\s*::?\s*)([^:]+)$',
+        cleaned = re.sub(rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})(\s*::?\s*)([^:]+)$',
                       rf'{lower_media_prefix}{lower_media_type}\3\4',
                       column_name, flags=re.IGNORECASE)
+        already_seen[column_name] = cleaned
+        return cleaned
 
     # "Media: AuDiO" -> "media: audio", "VIDEO" -> "video"
     match = re.match(
-        rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})$', column_name
+        rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})$', column_name, flags=re.IGNORECASE
     )
     if match:
         matched = match.groups()
-        lower_media_prefix = matched[0].lower()
+        lower_media_prefix = matched[0].lower() if matched[0] else ''
         lower_media_type = matched[1].lower()
-        return re.sub(rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})$',
+        cleaned = re.sub(rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})$',
                       rf'{lower_media_prefix}{lower_media_type}',
                       column_name, flags=re.IGNORECASE)
+        already_seen[column_name] = cleaned
+        return cleaned
 
     match = re.match(r'^([^:]+)(\s*::?\s*)([^:]+)$', column_name)
     if match:
         # example: label::x, constraint_message::x, hint::x
         matched = match.groups()
         lower_column_shortname = matched[0].lower()
-        return re.sub(r'^([^:]+)(\s*::?\s*)([^:]+)$', rf'{lower_column_shortname}\2\3', column_name,
+        cleaned = re.sub(r'^([^:]+)(\s*::?\s*)([^:]+)$', rf'{lower_column_shortname}\2\3', column_name,
                       flags=re.IGNORECASE)
-    return column_name.lower()
+        already_seen[column_name] = cleaned
+        return cleaned
+    cleaned = column_name.lower()
+    already_seen[column_name] = cleaned
+    return cleaned
 
 
 def preprocess_columns(content: Dict[str, List[Any]]) -> None:
-
+    seen = {}
     for sheet, rows in content.items():
         for row in rows:
             for column_name, value in row.copy().items():
-                cleaned_name = clean_column_name(column_name)
+                cleaned_name = clean_column_name(column_name, seen)
                 del row[column_name]
                 row[cleaned_name] = value
 
diff --git a/tests/test_expand_content.py b/tests/test_expand_content.py
index f6a618f..03c4414 100644
--- a/tests/test_expand_content.py
+++ b/tests/test_expand_content.py
@@ -7,14 +7,13 @@
 from formpack import FormPack
 from formpack.constants import OR_OTHER_COLUMN as _OR_OTHER
 from formpack.constants import UNTRANSLATED
-from formpack.utils.expand_content import SCHEMA_VERSION
+from formpack.utils.expand_content import SCHEMA_VERSION, clean_column_name
 from formpack.utils.expand_content import _expand_tags
 from formpack.utils.expand_content import _get_special_survey_cols
 from formpack.utils.expand_content import expand_content, _expand_type_to_dict
 from formpack.utils.flatten_content import flatten_content
 from formpack.utils.string import orderable_with_none
 
-from formpack.src.formpack.utils.expand_content import clean_column_name
 
 
 def test_expand_selects_with_or_other():
@@ -629,8 +628,8 @@ class ColumnTestCase(TestCase):
         ('VIDEO :: SPANISH', 'video :: SPANISH'),
         ('MEDIA:AUDIO', 'media:audio'),
         ('IMAGE', 'image'),
-        ('LABEL : SPANISH', 'label : Spanish')
+        ('LABEL : SPANISH', 'label : SPANISH')
     )
     @unpack
     def test_clean_column_name(self, name, expected):
-        assert clean_column_name(name) == expected
+        assert clean_column_name(name, {}) == expected

From 8893cdf023cffedf43abb85fda9bf4841f663625 Mon Sep 17 00:00:00 2001
From: rgraber
Date: Wed, 12 Feb 2025 09:19:18 -0500
Subject: [PATCH 4/4] fixup: rm breakpoints

---
 src/formpack/pack.py                 | 1 -
 src/formpack/utils/expand_content.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/src/formpack/pack.py b/src/formpack/pack.py
index 7f5fd95..f3f9163 100644
--- a/src/formpack/pack.py
+++ b/src/formpack/pack.py
@@ -137,7 +137,6 @@ def load_version(self, schema):
         unique accross an entire FormPack. It can be None, but only for one
         version in the FormPack.
         """
-        breakpoint()
         replace_aliases(schema['content'], in_place=True)
         expand_content(schema['content'], in_place=True)
 
diff --git a/src/formpack/utils/expand_content.py b/src/formpack/utils/expand_content.py
index 53e489e..8e733f0 100644
--- a/src/formpack/utils/expand_content.py
+++ b/src/formpack/utils/expand_content.py
@@ -53,7 +53,6 @@ def _expand_translatable_content(
             row[_expandable_col][_nti] = _oldval
         if col_shortname != _expandable_col:
             row[_expandable_col][cur_translation_index] = row[col_shortname]
-            breakpoint()
             del row[col_shortname]
 
 