def _lowercase_column_prefix(column_name: str) -> str:
    """
    Return `column_name` with its recognised leading part lower-cased.

    Everything after that leading part (separators, surrounding spaces and
    the language/suffix) is copied through untouched. The result is built by
    slicing the original string rather than with `re.sub`, so backslashes in
    a column name are never interpreted as replacement-template escapes.
    """
    # "LaBeL" -> "label", "HiNT" -> "hint"
    lowered = column_name.lower()
    if lowered in ('label', 'hint'):
        return lowered

    # "Bind:Some:Thing" -> "bind:Some:Thing", "BodY:" -> "body:"
    match = re.match(r'(bind|body):', column_name, flags=re.IGNORECASE)
    if match:
        return match.group(1).lower() + column_name[match.end(1):]

    # With a language:    "Media:Audio::ES" -> "media:audio::ES",
    #                     "ViDeO : ES" -> "video : ES"
    # Without a language: "Media: AuDiO" -> "media: audio", "VIDEO" -> "video"
    media_names = '|'.join(re.escape(name) for name in MEDIA_COLUMN_NAMES)
    match = re.match(
        rf'^(media\s*::?\s*)?({media_names})(?:\s*::?\s*[^:]+)?$',
        column_name,
        flags=re.IGNORECASE,
    )
    if match:
        media_prefix = (match.group(1) or '').lower()
        media_type = match.group(2).lower()
        return media_prefix + media_type + column_name[match.end(2):]

    # Any other "<name>:<x>" / "<name>::<x>" column,
    # e.g. label::x, constraint_message::x, hint::x
    match = re.match(r'^([^:]+)(\s*::?\s*)([^:]+)$', column_name)
    if match:
        return match.group(1).lower() + column_name[match.end(1):]

    # No separator and nothing special: lower-case the whole name
    return lowered


def clean_column_name(
    column_name: str, already_seen: 'dict[str, str] | None' = None
) -> str:
    """
    Normalise the case of a column name so that lookups can ignore case.

    Only the leading, recognised part of the name is lower-cased ("label",
    "hint", "bind", "body", an optional "media" prefix plus a media type from
    `MEDIA_COLUMN_NAMES`, or the part before the first colon). The suffix is
    left as typed.

    Preserves ":" vs "::" and any spaces around the colons

    :param column_name: the column name as found in the form content
    :param already_seen: optional cache mapping raw names to cleaned names;
        it is consulted first and updated with every newly cleaned name.
        When omitted, no caching takes place.
    :return: the cleaned column name
    """
    if already_seen is None:
        return _lowercase_column_prefix(column_name)
    try:
        return already_seen[column_name]
    except KeyError:
        cleaned = _lowercase_column_prefix(column_name)
        already_seen[column_name] = cleaned
        return cleaned


def preprocess_columns(content: Dict[str, List[Any]]) -> None:
    """
    Rename, in place, the keys of every row of every sheet in `content` to
    their cleaned form (see `clean_column_name`).

    Items that are not dicts are left untouched instead of raising. Key order
    within a row is kept; if two keys of a row clean to the same name, the
    value of the later one wins.
    """
    seen = {}
    for rows in content.values():
        for row in rows:
            if not isinstance(row, dict):
                # Not a row of column -> value pairs; nothing to rename
                continue
            renamed = [
                (clean_column_name(column_name, seen), value)
                for column_name, value in row.items()
            ]
            # Mutate the existing object so that callers holding a reference
            # to the row (and its concrete dict type) see the new keys
            row.clear()
            row.update(renamed)
def test_expand_ignores_case():
    """A mixed-case 'Label' column is handled like the lower-case 'label'."""
    s1 = {'survey': [{'type': 'text', 'Label': 'hi'}]}
    expand_content(s1, in_place=True)
    # Column names are cleaned before expansion, so the original 'Label'
    # spelling no longer exists in the content by the time translated
    # columns are collected.
    assert s1.get('translated') == ['label']


def _s(rows):
    """Build a survey whose rows each hold a single column set to 'x'."""
    return {'survey': [{key: 'x'} for key in rows]}


@ddt
class ColumnTestCase(TestCase):
    """Checks the case normalisation performed by `clean_column_name`."""

    @data(
        ('FOO', 'foo'),
        ('LABEL', 'label'),
        ('HINT', 'hint'),
        ('BIND::FOO', 'bind::FOO'),
        ('BODY : FOO', 'body : FOO'),
        ('MEDIA:AUDIO:Spanish', 'media:audio:Spanish'),
        ('VIDEO :: SPANISH', 'video :: SPANISH'),
        ('MEDIA:AUDIO', 'media:audio'),
        ('IMAGE', 'image'),
        ('LABEL : SPANISH', 'label : SPANISH'),
    )
    @unpack
    def test_clean_column_name(self, name, expected):
        # A fresh cache per case keeps the cases independent of each other
        self.assertEqual(clean_column_name(name, {}), expected)