From e05704a5abe3d43d92362fcabafb915729bf738a Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 16 Mar 2023 12:09:16 +0100 Subject: [PATCH 1/3] use a visually more appealing encoding This commit uses Unidecode to translate unicode characters into the ASCII-range before employing any dataverse-specific character quotations. If unidecode returns an empty string, the name "__not_representable_<n>" is used, where "<n>" is the length of the original string. --- datalad_dataverse/tests/test_utils.py | 18 ++++++++++++++---- datalad_dataverse/utils.py | 9 ++++++--- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/datalad_dataverse/tests/test_utils.py b/datalad_dataverse/tests/test_utils.py index 4453004..51f68a3 100644 --- a/datalad_dataverse/tests/test_utils.py +++ b/datalad_dataverse/tests/test_utils.py @@ -1,5 +1,6 @@ from itertools import product from pathlib import Path +from unicodedata import lookup import pytest @@ -14,6 +15,7 @@ _test_paths = [ + lookup("dog face") + lookup("cat face"), ".x", "_x", "..x", @@ -49,18 +51,26 @@ def test_format_doi(): format_doi(123) +def _check_simplified_match(path, mangled_path): + result = [ + True + if mangled_part.startswith('__not_representable') + else str(unmangle_path(mangled_part)) == part + for mangled_part, part in zip(mangled_path.parts, path.parts) + ] + assert all(result) + + def test_path_mangling_identity(): for p in _test_paths + ['?;#:eee=2.txt']: - assert Path(p) == unmangle_path(mangle_path(p)) + _check_simplified_match(Path(p), mangle_path(p)) def test_path_mangling_sub_dirs(): for p, q, r in product(_test_paths, _test_paths, _test_paths): path = Path(p) / q / r mangled_path = mangle_path(path) - for part in mangled_path.parts[:-1]: - assert part[0] != "." 
- assert unmangle_path(mangled_path) == path + _check_simplified_match(path, mangled_path) def test_file_quoting_identity(): diff --git a/datalad_dataverse/utils.py b/datalad_dataverse/utils.py index 4c37b35..21fd0c5 100644 --- a/datalad_dataverse/utils.py +++ b/datalad_dataverse/utils.py @@ -4,6 +4,7 @@ from pathlib import Path from pyDataverse.api import NativeApi +from unidecode import unidecode from datalad_next.utils import update_specialremote_credential @@ -295,7 +296,8 @@ def _dataverse_dirname_quote(dirname: str) -> str: dataverse, it is encoded as well to prevent name collisions, for example, between ``.datalad`` and ``datalad``. """ - quoted_dirname = _dataverse_quote(dirname, DATAVERSE_DIRNAME_SAFE) + ascii_dirname = unidecode(dirname) or f"_not_representable_{len(dirname)}" + quoted_dirname = _dataverse_quote(ascii_dirname, DATAVERSE_DIRNAME_SAFE) return _encode_leading_dot(quoted_dirname) @@ -306,14 +308,15 @@ def _dataverse_filename_quote(filename: str) -> str: ``/``, ``:``, ``*``, ``?``, ``"``, ``<``, ``>``, ``|``, ``;``, and ``#``. - In order to be able to use the some decoding for file names and directory + In order to be able to use the same decoding for file names and directory names, we also encode leading dots in file names, although that is not strictly necessary with dataverse, because it would preserve the leading dots in file names. 
""" - quoted_filename = _dataverse_quote(filename, DATAVERSE_FILENAME_SAFE) + ascii_filename = unidecode(filename) or f"_not_representable_{len(filename)}" + quoted_filename = _dataverse_quote(ascii_filename, DATAVERSE_FILENAME_SAFE) return _encode_leading_dot(quoted_filename) From 344c8a396db12ee5ff4c5e025b95f5bfdee4d111 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 16 Mar 2023 12:14:56 +0100 Subject: [PATCH 2/3] add Unidecode to requirements --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index feb87e4..344b731 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,6 +17,7 @@ install_requires = datalad_next >= 1.0.0b1 datalad >= 0.18.0 pydataverse + Unidecode packages = find_namespace: include_package_data = True From ad77bbb94a1c6c6846f1b2c2e6d3e0c5c410a7fa Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 16 Mar 2023 12:24:28 +0100 Subject: [PATCH 3/3] improve mangle_path tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit ensures that mangle_path is tested with "printable" unicode characters, e.g. `ä`, which will be converted into ASCII characters by `unidecode()`. 
--- datalad_dataverse/tests/test_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datalad_dataverse/tests/test_utils.py b/datalad_dataverse/tests/test_utils.py index 51f68a3..2d7fdbb 100644 --- a/datalad_dataverse/tests/test_utils.py +++ b/datalad_dataverse/tests/test_utils.py @@ -3,6 +3,7 @@ from unicodedata import lookup import pytest +from unidecode import unidecode from ..utils import ( _dataverse_dirname_quote, @@ -34,6 +35,7 @@ "._dir/x", "_.dir/x", "__dir/x", + "ä", "%%;;,_,?-&=", ] @@ -55,7 +57,7 @@ def _check_simplified_match(path, mangled_path): result = [ True if mangled_part.startswith('__not_representable') - else str(unmangle_path(mangled_part)) == part + else str(unmangle_path(mangled_part)) == unidecode(part) for mangled_part, part in zip(mangled_path.parts, path.parts) ] assert all(result)