From 02cbe0ceb7d9f9343911592e31abe6310d2bd38b Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 19 Dec 2024 10:20:19 +0100 Subject: [PATCH] ORM: Add `get_size_on_disk` method to `RemoteData` (#6584) By default, the new `get_size_on_disk` method of `RemoteData` calls the private `_get_size_on_disk_du` that uses `du` to obtain the total directory size in bytes. If the call to `du` fails for whatever reason, recursive `stat` is being used via `_get_size_on_disk_stat`. This route somewhat discouraged, and a warning is issued, as `stat` returns the apparent size of files, not the actual disk usage. In addition, the CLI endpoint `verdi data core.remote size` is added, and the tests for `RemoteData` are expanded and parametrized to use both, `LocalTransport` and `SshTransport`, testing the functionality added in this PR. --- .../cmdline/commands/cmd_data/cmd_remote.py | 41 ++++ src/aiida/common/utils.py | 31 +++ src/aiida/orm/nodes/data/remote/base.py | 163 ++++++++++++++- tests/orm/nodes/data/test_remote.py | 196 +++++++++++++++++- 4 files changed, 419 insertions(+), 12 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_data/cmd_remote.py b/src/aiida/cmdline/commands/cmd_data/cmd_remote.py index 82c43399f9..a3c923dc78 100644 --- a/src/aiida/cmdline/commands/cmd_data/cmd_remote.py +++ b/src/aiida/cmdline/commands/cmd_data/cmd_remote.py @@ -9,6 +9,7 @@ """`verdi data core.remote` command.""" import stat +from pathlib import Path import click @@ -87,3 +88,43 @@ def remote_show(datum): """Show information for a RemoteData object.""" echo.echo(f'- Remote computer name: {datum.computer.label}') echo.echo(f'- Remote folder full path: {datum.get_remote_path()}') + + +@remote.command('size') +@arguments.NODE() +@click.option( + '-m', + '--method', + type=click.STRING, + default='du', + help='The method that should be used to evaluate the size (either ``du`` or ``stat``.)', +) +@click.option( + '-p', + '--path', + type=click.Path(), + default=None, + help='Relative path of the object of the ``RemoteData`` node for which the size should be evaluated.', +) +@click.option( + '-b', + '--bytes', + 'return_bytes', + type=bool, + is_flag=True, + default=False, + help='Return the size in bytes or human-readable format?', +) +def remote_size(node, method, path, return_bytes): + """Obtain the total size of a file or directory at a given path that is stored via a ``RemoteData`` object.""" + try: + # `method` might change, if `du` fails, so assigning to new variable here + total_size, used_method = node.get_size_on_disk(relpath=path, method=method, return_bytes=return_bytes) + remote_path = Path(node.get_remote_path()) + full_path = remote_path / path if path is not None else remote_path + echo.echo_success( + f'Estimated total size of path `{full_path}` on the Computer ' + f'<{node.computer.label}> obtained via `{used_method}`: {total_size}' + ) + except (OSError, FileNotFoundError, NotImplementedError) as exc: + echo.echo_critical(str(exc)) diff --git a/src/aiida/common/utils.py b/src/aiida/common/utils.py index f41cbef636..1b2f2b14ce 100644 --- a/src/aiida/common/utils.py +++ b/src/aiida/common/utils.py @@ -572,3 +572,34 @@ def __init__(self, dtobj, precision): self.dtobj = dtobj self.precision = precision + + +def format_directory_size(size_in_bytes: int) -> str: + """Converts a size in bytes to a human-readable string with the appropriate prefix. + + :param size_in_bytes: Size in bytes. + :raises ValueError: If the size is negative. + :return: Human-readable size string with a prefix (e.g., "1.23 KB", "5.67 MB"). + + The function converts a given size in bytes to a more readable format by + adding the appropriate unit suffix (e.g., KB, MB, GB). It uses the binary + system (base-1024) for unit conversions. + + Example: + >>> format_directory_size(123456789) + '117.74 MB' + """ + if size_in_bytes < 0: + raise ValueError('Size cannot be negative.') + + # Define size prefixes + prefixes = ['B', 'KB', 'MB', 'GB', 'TB', 'PB'] + factor = 1024 # 1 KB = 1024 B + index = 0 + + while size_in_bytes >= factor and index < len(prefixes) - 1: + size_in_bytes /= factor + index += 1 + + # Format the size to two decimal places + return f'{size_in_bytes:.2f} {prefixes[index]}' diff --git a/src/aiida/orm/nodes/data/remote/base.py b/src/aiida/orm/nodes/data/remote/base.py index 1fc691d113..5b7fec86c2 100644 --- a/src/aiida/orm/nodes/data/remote/base.py +++ b/src/aiida/orm/nodes/data/remote/base.py @@ -8,13 +8,20 @@ ########################################################################### """Data plugin that models a folder on a remote computer.""" +from __future__ import annotations + +import logging import os +from pathlib import Path from aiida.orm import AuthInfo from aiida.orm.fields import add_field +from aiida.transports import Transport from ..data import Data +_logger = logging.getLogger(__name__) + __all__ = ('RemoteData',) @@ -96,14 +103,15 @@ def listdir(self, relpath='.'): full_path = os.path.join(self.get_remote_path(), relpath) if not transport.isdir(full_path): raise OSError( - f'The required remote folder {full_path} on {self.computer.label} does not exist, is not a ' + f'The required remote path {full_path} on {self.computer.label} does not exist, is not a ' 'directory or has been deleted.' ) try: return transport.listdir(full_path) except OSError as exception: - if exception.errno in (2, 20): # directory not existing or not a directory + if exception.errno in (2, 20): + # directory not existing or not a directory exc = OSError( f'The required remote folder {full_path} on {self.computer.label} does not exist, is not a ' 'directory or has been deleted.' @@ -132,7 +140,8 @@ def listdir_withattributes(self, path='.'): try: return transport.listdir_withattributes(full_path) except OSError as exception: - if exception.errno in (2, 20): # directory not existing or not a directory + if exception.errno in (2, 20): + # directory not existing or not a directory exc = OSError( f'The required remote folder {full_path} on {self.computer.label} does not exist, is not a ' 'directory or has been deleted.' @@ -185,3 +194,151 @@ def _validate(self): def get_authinfo(self): return AuthInfo.get_collection(self.backend).get(dbcomputer=self.computer, aiidauser=self.user) + + def get_size_on_disk( + self, + relpath: Path | None = None, + method: str = 'du', + return_bytes: bool = False, + ) -> int | str: + """Connects to the remote Computer of the `RemoteData` object and returns the total size of a file or a + directory at the given `relpath` in a human-readable format. + + :param relpath: File or directory path for which the total size should be returned, relative to + ``self.get_remote_path()``. + :param method: Method to be used to evaluate the directory/file size (either ``du`` or ``stat``). + :param return_bytes: Return the total byte size is int, or in human-readable format. + + :raises FileNotFoundError: If file or directory does not exist anymore on the remote ``Computer``. + :raises NotImplementedError: If a method other than ``du`` or ``stat`` is selected. + + :return: Total size of given file or directory. + """ + + from aiida.common.utils import format_directory_size + + total_size: int = -1 + + if relpath is None: + relpath = Path('.') + + authinfo = self.get_authinfo() + full_path = Path(self.get_remote_path()) / relpath + computer_label = self.computer.label if self.computer is not None else '' + + with authinfo.get_transport() as transport: + if not transport.path_exists(str(full_path)): + exc_message = f'The required remote path {full_path} on Computer <{computer_label}> ' 'does not exist.' + raise FileNotFoundError(exc_message) + + if method not in ('du', 'stat'): + exc_message = f'Specified method `{method}` is not an valid input. Please choose either `du` or `stat`.' + raise ValueError(exc_message) + + if method == 'du': + try: + total_size: int = self._get_size_on_disk_du(full_path, transport) + _logger.report('Obtained size on the remote using `du`.') + if return_bytes: + return total_size, method + else: + return format_directory_size(size_in_bytes=total_size), method + + except (RuntimeError, NotImplementedError): + # NotImplementedError captures the fact that, e.g., FirecREST does not allow for `exec_command_wait` + stat_warn = ( + 'Problem executing `du` command. Will return total file size based on `stat` as fallback. ' + ) + + _logger.warning(stat_warn) + + if method == 'stat' or total_size < 0: + try: + total_size: int = self._get_size_on_disk_stat(full_path, transport) + _logger.report('Obtained size on the remote using `stat`.') + _logger.warning( + 'Take the result with a grain of salt, as `stat` returns the apparent size of files, ' + 'not their actual disk usage.' + ) + if return_bytes: + return total_size, 'stat' + else: + return format_directory_size(size_in_bytes=total_size), 'stat' + + # This should typically not even be reached, as the OSError occours if the path is not a directory or + # does not exist. Though, we check for its existence already in the beginning of this method. + except OSError: + _logger.critical('Could not evaluate directory size using either `du` or `stat`.') + raise + + def _get_size_on_disk_du(self, full_path: Path, transport: Transport) -> int: + """Returns the total size of a file/directory at the given ``full_path`` on the remote Computer in bytes using + the ``du`` command. + + :param full_path: Full path of file or directory for which the size should be evaluated. + :param transport: Open transport instance. + + :raises NotImplementedError: When ``exec_command_wait`` is not implemented, e.g., for the FirecREST plugin. + :raises RuntimeError: When ``du`` command cannot be successfully executed. + + :return: Total size of the file/directory in bytes (including all its contents). + """ + + try: + retval, stdout, stderr = transport.exec_command_wait(f'du -s --bytes {full_path}') + except NotImplementedError as exc: + raise NotImplementedError('`exec_command_wait` not implemented for the current transport plugin.') from exc + + if stderr or retval != 0: + raise RuntimeError(f'Error executing `du` command: {stderr}') + else: + total_size: int = int(stdout.split('\t')[0]) + return total_size + + def _get_size_on_disk_stat(self, full_path: Path, transport: Transport) -> int: + """Returns the total size of a file/directory at the given ``full_path`` on the remote Computer in bytes using + the ``stat`` command. + + Connects to the remote folder and returns the total size of all files in the directory in bytes using ``stat``. + Note that `stat` returns the apparent file size, not actual disk usage. Thus, even if a file is only 1 byte, on + disk, it still occupies one full disk block size. As such, getting accurate measures of the total expected size + on disk when retrieving a ``RemoteData`` is not straightforward with ``stat``, as one would need to consider the + occupied block sizes for each file, as well as repository metadata. Therefore, this function only serves as a + fallback in the absence of the ``du`` command, and the returned estimate is expected to be smaller than the size + on disk that is actually occupied. Further note that the `Transport.get_attribute` method that is + eventually being called on each file, calls `lstat`, which is equivalent to ``os.stat(follow_symlinks=False)``. + + :param full_path: Full path of file or directory of which the size should be evaluated. + :param transport: Open transport instance. + + :raises OSError: When object at ``full_path`` doesn't exist. + + :return: Total size of the file/directory in bytes (including all its contents). + """ + + def _get_size_on_disk_stat_recursive(full_path: Path, transport: Transport): + """Helper function for recursive directory traversal.""" + + total_size = 0 + contents = self.listdir_withattributes(full_path) + + for item in contents: + item_path = full_path / item['name'] + # Add size of current item (file or directory metadata) + total_size += item['attributes']['st_size'] + + # If it's a directory, recursively get size of contents + if item['isdir']: + total_size += _get_size_on_disk_stat_recursive(item_path, transport) + + return total_size + + if transport.isfile(path=str(full_path)): + return transport.get_attribute(str(full_path))['st_size'] + + try: + return _get_size_on_disk_stat_recursive(full_path, transport) + + except OSError: + # Not a directory or not existing anymore. Exception is captured outside in `get_size_on_disk`. + raise diff --git a/tests/orm/nodes/data/test_remote.py b/tests/orm/nodes/data/test_remote.py index 333e59c547..ad12d97273 100644 --- a/tests/orm/nodes/data/test_remote.py +++ b/tests/orm/nodes/data/test_remote.py @@ -8,26 +8,204 @@ ########################################################################### """Tests for the :mod:`aiida.orm.nodes.data.remote.base.RemoteData` module.""" +from __future__ import annotations + +from pathlib import Path + import pytest from aiida.orm import RemoteData @pytest.fixture -def remote_data(tmp_path, aiida_localhost): - """Return a non-empty ``RemoteData`` instance.""" - node = RemoteData(computer=aiida_localhost) - node.set_remote_path(str(tmp_path)) - node.store() - (tmp_path / 'file.txt').write_bytes(b'some content') - return node +def remote_data_factory(tmp_path, aiida_computer_local, aiida_computer_ssh): + """Factory fixture to create RemoteData instances.""" + + def _create_remote_data(mode: str, content: str | bytes = b'some content'): + if mode == 'local': + computer = aiida_computer_local(label='localhost') + elif mode == 'ssh': + computer = aiida_computer_ssh(label='localhost-ssh') + else: + raise ValueError(f'Unknown mode: {mode}') + node = RemoteData(computer=computer) + node.set_remote_path(str(tmp_path)) + node.store() + (tmp_path / 'file.txt').write_bytes(content) + return node -def test_clean(remote_data): + return _create_remote_data + + +@pytest.mark.parametrize('mode', ('local', 'ssh')) +def test_clean(remote_data_factory, mode): """Test the :meth:`aiida.orm.nodes.data.remote.base.RemoteData.clean` method.""" + + remote_data = remote_data_factory(mode=mode) + assert not remote_data.is_empty assert not remote_data.is_cleaned - remote_data._clean() assert remote_data.is_empty assert remote_data.is_cleaned + + +@pytest.mark.parametrize('mode', ('local', 'ssh')) +@pytest.mark.parametrize( + 'setup, results', + ( + (('du', False), ('4.01 KB', 'du')), + (('du', True), (4108, 'du')), + (('stat', False), ('12.00 B', 'stat')), + (('stat', True), (12, 'stat')), + ), +) +def test_get_size_on_disk_params(remote_data_factory, mode, setup, results): + """Test the :meth:`aiida.orm.nodes.data.remote.base.RemoteData.get_size_on_disk` method.""" + + remote_data = remote_data_factory(mode=mode) + + size_on_disk, method = remote_data.get_size_on_disk(method=setup[0], return_bytes=setup[1]) + assert (size_on_disk, method) == results + + +@pytest.mark.parametrize('mode', ('local', 'ssh')) +@pytest.mark.parametrize( + 'content, sizes', + ( + (b'a', {'du': 4097, 'stat': 1, 'human': '4.00 KB'}), + (10 * b'a', {'du': 4106, 'stat': 10, 'human': '4.01 KB'}), + (100 * b'a', {'du': 4196, 'stat': 100, 'human': '4.10 KB'}), + (1000 * b'a', {'du': 5096, 'stat': 1000, 'human': '4.98 KB'}), + (1000000 * b'a', {'du': 1004096, 'stat': int(1e6), 'human': '980.56 KB'}), + ), + ids=['1-byte', '10-bytes', '100-bytes', '1000-bytes', '1e6-bytes'], +) +def test_get_size_on_disk_sizes(remote_data_factory, mode, content, sizes): + """Test the different implementations to obtain the size of a ``RemoteData`` on disk.""" + + remote_data = remote_data_factory(mode=mode, content=content) + + authinfo = remote_data.get_authinfo() + full_path = Path(remote_data.get_remote_path()) + + with authinfo.get_transport() as transport: + size_on_disk_du = remote_data._get_size_on_disk_du(transport=transport, full_path=full_path) + size_on_disk_stat = remote_data._get_size_on_disk_stat(transport=transport, full_path=full_path) + size_on_disk_human, _ = remote_data.get_size_on_disk() + + assert size_on_disk_du == sizes['du'] + assert size_on_disk_stat == sizes['stat'] + assert size_on_disk_human == sizes['human'] + + +@pytest.mark.parametrize( + 'num_char, relpath, sizes', + ( + (1, '.', {'du': 12291, 'stat': 8195, 'human': '12.00 KB'}), + (100, '.', {'du': 12588, 'stat': 8492, 'human': '12.29 KB'}), + (int(1e6), '.', {'du': 3012288, 'stat': 3008192, 'human': '2.87 MB'}), + (1, 'subdir1', {'du': 8194, 'stat': 4098, 'human': '8.00 KB'}), + (100, 'subdir1', {'du': 8392, 'stat': 4296, 'human': '8.20 KB'}), + (int(1e6), 'subdir1', {'du': 2008192, 'stat': 2004096, 'human': '1.92 MB'}), + ), +) +def test_get_size_on_disk_nested(aiida_localhost, tmp_path, num_char, relpath, sizes): + # TODO: Use create file hierarchy fixture from test_execmanager? + sub_dir1 = tmp_path / 'subdir1' + sub_dir1.mkdir() + + sub_dir2 = tmp_path / 'subdir1' / 'subdir2' + sub_dir2.mkdir() + + # Create some files with known sizes + file1 = sub_dir1 / 'file1.txt' + file1.write_text('a' * num_char) + + file2 = sub_dir2 / 'file2.bin' + file2.write_bytes(b'a' * num_char) + + file3 = tmp_path / 'file3.txt' + file3.write_text('a' * num_char) + + remote_data = RemoteData(computer=aiida_localhost, remote_path=tmp_path) + + authinfo = remote_data.get_authinfo() + full_path = Path(remote_data.get_remote_path()) / relpath + + with authinfo.get_transport() as transport: + size_on_disk_du = remote_data._get_size_on_disk_du(transport=transport, full_path=full_path) + size_on_disk_stat = remote_data._get_size_on_disk_stat(transport=transport, full_path=full_path) + + size_on_disk_human, _ = remote_data.get_size_on_disk(relpath=relpath) + + assert size_on_disk_du == sizes['du'] + assert size_on_disk_stat == sizes['stat'] + assert size_on_disk_human == sizes['human'] + + +@pytest.mark.parametrize('mode', ('local', 'ssh')) +def test_get_size_on_disk_excs(remote_data_factory, mode): + """Test the :meth:`aiida.orm.nodes.data.remote.base.RemoteData.get_size_on_disk` method.""" + # Extra function to avoid unnecessary parametrization here + remote_data = remote_data_factory(mode=mode) + + # Path/file non-existent + with pytest.raises(FileNotFoundError, match='.*does not exist.*'): + remote_data.get_size_on_disk(relpath=Path('non-existent')) + + # Non-valid method + with pytest.raises(ValueError, match='.*is not an valid input. Please choose either.*.'): + remote_data.get_size_on_disk(method='fake-du') + + +@pytest.mark.parametrize('mode', ('local', 'ssh')) +def test_get_size_on_disk_du(remote_data_factory, mode, monkeypatch): + """Test the :meth:`aiida.orm.nodes.data.remote.base.RemoteData._get_size_on_disk_du` private method.""" + # No additional parametrization here, as already done in `test_get_size_on_disk_sizes`. + + remote_data = remote_data_factory(mode=mode) + + # Normal call + authinfo = remote_data.get_authinfo() + full_path = Path(remote_data.get_remote_path()) + + with authinfo.get_transport() as transport: + size_on_disk = remote_data._get_size_on_disk_du(transport=transport, full_path=full_path) + assert size_on_disk == 4108 + + # Monkeypatch transport exec_command_wait command to simulate it not being implemented, e.g., for FirecREST plugin + def mock_exec_command_wait(command): + raise NotImplementedError('`exec_command_wait` not implemented for the current transport plugin.') + + monkeypatch.setattr(transport, 'exec_command_wait', mock_exec_command_wait) + with pytest.raises(NotImplementedError, match='`exec_command_wait` not implemented.*'): + remote_data._get_size_on_disk_du(full_path, transport) + + # Monkeypatch transport exec_command_wait command to simulate `du` failure + def mock_exec_command_wait(command): + return (1, '', 'Error executing `du` command') + + monkeypatch.setattr(transport, 'exec_command_wait', mock_exec_command_wait) + with pytest.raises(RuntimeError, match='Error executing `du`.*'): + remote_data._get_size_on_disk_du(full_path, transport) + + +@pytest.mark.parametrize('mode', ('local', 'ssh')) +def test_get_size_on_disk_stat(remote_data_factory, mode): + """Test the :meth:`aiida.orm.nodes.data.remote.base.RemoteData._get_size_on_disk_stat` private method.""" + # No additional parametrization here, as already done in `test_get_size_on_disk_sizes`. + + remote_data = remote_data_factory(mode=mode) + + authinfo = remote_data.get_authinfo() + full_path = Path(remote_data.get_remote_path()) + + with authinfo.get_transport() as transport: + size_on_disk = remote_data._get_size_on_disk_stat(transport=transport, full_path=full_path) + assert size_on_disk == 12 + + # Raises OSError for non-existent directory + with pytest.raises(OSError, match='The required remote folder.*'): + remote_data._get_size_on_disk_stat(transport=transport, full_path=full_path / 'non-existent')