Transliterate path components to ASCII before Dataverse quoting.

utils.py: _dataverse_dirname_quote() and _dataverse_filename_quote() now pass
the name through unidecode() first; when that yields an empty string, the
placeholder "_not_representable_<len(name)>" is quoted instead. A docstring
typo ("some decoding" -> "same decoding") is corrected as well.

tests/test_utils.py: adds an emoji path (dog face + cat face) and a non-ASCII
path ("ä") to _test_paths. The exact mangle/unmangle round-trip assertions
are replaced by _check_simplified_match(), which compares each unmangled part
with unidecode(part) and accepts any part whose mangled form starts with
"__not_representable". The explicit assertion that mangled directory parts
do not start with "." is removed by this patch.

setup.cfg: adds Unidecode to install_requires.

diff --git a/datalad_dataverse/tests/test_utils.py b/datalad_dataverse/tests/test_utils.py index 4453004..2d7fdbb 100644 --- a/datalad_dataverse/tests/test_utils.py +++ b/datalad_dataverse/tests/test_utils.py @@ -1,7 +1,9 @@ from itertools import product from pathlib import Path +from unicodedata import lookup import pytest +from unidecode import unidecode from ..utils import ( _dataverse_dirname_quote, @@ -14,6 +16,7 @@ _test_paths = [ + lookup("dog face") + lookup("cat face"), ".x", "_x", "..x", @@ -32,6 +35,7 @@ "._dir/x", "_.dir/x", "__dir/x", + "ä", "%%;;,_,?-&=", ] @@ -49,18 +53,26 @@ def test_format_doi(): format_doi(123) +def _check_simplified_match(path, mangled_path): + result = [ + True + if mangled_part.startswith('__not_representable') + else str(unmangle_path(mangled_part)) == unidecode(part) + for mangled_part, part in zip(mangled_path.parts, path.parts) + ] + assert all(result) + + def test_path_mangling_identity(): for p in _test_paths + ['?;#:eee=2.txt']: - assert Path(p) == unmangle_path(mangle_path(p)) + _check_simplified_match(Path(p), mangle_path(p)) def test_path_mangling_sub_dirs(): for p, q, r in product(_test_paths, _test_paths, _test_paths): path = Path(p) / q / r mangled_path = mangle_path(path) - for part in mangled_path.parts[:-1]: - assert part[0] != "." - assert unmangle_path(mangled_path) == path + _check_simplified_match(path, mangled_path) def test_file_quoting_identity(): diff --git a/datalad_dataverse/utils.py b/datalad_dataverse/utils.py index 4c37b35..21fd0c5 100644 --- a/datalad_dataverse/utils.py +++ b/datalad_dataverse/utils.py @@ -4,6 +4,7 @@ from pathlib import Path from pyDataverse.api import NativeApi +from unidecode import unidecode from datalad_next.utils import update_specialremote_credential @@ -295,7 +296,8 @@ def _dataverse_dirname_quote(dirname: str) -> str: dataverse, it is encoded as well to prevent name collisions, for example, between ``.datalad`` and ``datalad``. 
""" - quoted_dirname = _dataverse_quote(dirname, DATAVERSE_DIRNAME_SAFE) + ascii_dirname = unidecode(dirname) or f"_not_representable_{len(dirname)}" + quoted_dirname = _dataverse_quote(ascii_dirname, DATAVERSE_DIRNAME_SAFE) return _encode_leading_dot(quoted_dirname) @@ -306,14 +308,15 @@ def _dataverse_filename_quote(filename: str) -> str: ``/``, ``:``, ``*``, ``?``, ``"``, ``<``, ``>``, ``|``, ``;``, and ``#``. - In order to be able to use the some decoding for file names and directory + In order to be able to use the same decoding for file names and directory names, we also encode leading dots in file names, although that is not strictly necessary with dataverse, because it would preserve the leading dots in file names. """ - quoted_filename = _dataverse_quote(filename, DATAVERSE_FILENAME_SAFE) + ascii_filename = unidecode(filename) or f"_not_representable_{len(filename)}" + quoted_filename = _dataverse_quote(ascii_filename, DATAVERSE_FILENAME_SAFE) return _encode_leading_dot(quoted_filename) diff --git a/setup.cfg b/setup.cfg index feb87e4..344b731 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,6 +17,7 @@ install_requires = datalad_next >= 1.0.0b1 datalad >= 0.18.0 pydataverse + Unidecode packages = find_namespace: include_package_data = True