Skip to content

Commit

Permalink
ORM: Add get_size_on_disk method to RemoteData (#6584)
Browse files Browse the repository at this point in the history
By default, the new `get_size_on_disk` method of `RemoteData` calls the private `_get_size_on_disk_du` that uses `du` to obtain the total directory size in bytes. If the call to `du` fails for whatever reason, recursive `stat` is used instead via `_get_size_on_disk_stat`. This route is somewhat discouraged, and a warning is issued, as `stat` returns the apparent size of files, not the actual disk usage.

In addition, the CLI endpoint `verdi data core.remote size` is added, and the tests for `RemoteData` are expanded and parametrized to use both `LocalTransport` and `SshTransport`, testing the functionality added in this PR.
  • Loading branch information
GeigerJ2 authored Dec 19, 2024
1 parent c532b34 commit 02cbe0c
Show file tree
Hide file tree
Showing 4 changed files with 419 additions and 12 deletions.
41 changes: 41 additions & 0 deletions src/aiida/cmdline/commands/cmd_data/cmd_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"""`verdi data core.remote` command."""

import stat
from pathlib import Path

import click

Expand Down Expand Up @@ -87,3 +88,43 @@ def remote_show(datum):
"""Show information for a RemoteData object."""
echo.echo(f'- Remote computer name: {datum.computer.label}')
echo.echo(f'- Remote folder full path: {datum.get_remote_path()}')


@remote.command('size')
@arguments.NODE()
@click.option(
    '-m',
    '--method',
    type=click.STRING,
    default='du',
    help='The method that should be used to evaluate the size (either ``du`` or ``stat``.)',
)
@click.option(
    '-p',
    '--path',
    type=click.Path(),
    default=None,
    help='Relative path of the object of the ``RemoteData`` node for which the size should be evaluated.',
)
@click.option(
    '-b',
    '--bytes',
    'return_bytes',
    type=bool,
    is_flag=True,
    default=False,
    help='Return the size in bytes or human-readable format?',
)
def remote_size(node, method, path, return_bytes):
    """Obtain the total size of a file or directory at a given path that is stored via a ``RemoteData`` object."""
    # NOTE: the docstring above doubles as the command's ``--help`` text, so implementation notes live in comments.
    #
    # ``node``: the node resolved by the ``NODE`` argument; it must provide ``get_size_on_disk``,
    #           ``get_remote_path`` and ``computer``.
    # ``method``: requested size evaluation method, forwarded as-is to ``get_size_on_disk``.
    # ``path``: optional path relative to the remote folder of the node; ``None`` means the folder itself.
    # ``return_bytes``: if set, print the raw byte count instead of a human-readable size.
    try:
        # The method that was actually used is returned separately, because ``get_size_on_disk`` may fall back
        # from ``du`` to ``stat``; report that one rather than the requested ``method``.
        total_size, used_method = node.get_size_on_disk(relpath=path, method=method, return_bytes=return_bytes)
        remote_path = Path(node.get_remote_path())
        full_path = remote_path / path if path is not None else remote_path
        echo.echo_success(
            f'Estimated total size of path `{full_path}` on the Computer '
            f'<{node.computer.label}> obtained via `{used_method}`: {total_size}'
        )
    except (OSError, NotImplementedError, ValueError) as exc:
        # ``OSError`` already covers ``FileNotFoundError`` (missing remote path). ``ValueError`` is what
        # ``get_size_on_disk`` raises for an unsupported ``--method`` value; without catching it the user
        # would be shown a raw traceback instead of a clean critical message.
        echo.echo_critical(str(exc))
31 changes: 31 additions & 0 deletions src/aiida/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -572,3 +572,34 @@ def __init__(self, dtobj, precision):

self.dtobj = dtobj
self.precision = precision


def format_directory_size(size_in_bytes: int) -> str:
    """Render a byte count as a human-readable string with a binary (base-1024) unit.

    The value is repeatedly divided by 1024 until it drops below 1024 or the largest known unit (``PB``) is
    reached, and is then formatted with two decimal places followed by the unit.

    :param size_in_bytes: Size in bytes.
    :raises ValueError: If the size is negative.
    :return: Human-readable size string with a unit suffix (e.g., "1.23 KB", "5.67 MB").

    Example:
        >>> format_directory_size(123456789)
        '117.74 MB'
    """
    if size_in_bytes < 0:
        raise ValueError('Size cannot be negative.')

    units = ('B', 'KB', 'MB', 'GB', 'TB', 'PB')
    step = 1024  # binary convention: 1 KB = 1024 B
    largest = len(units) - 1

    value = size_in_bytes
    exponent = 0
    # Stop at the largest unit even if the value is still >= 1024, so huge sizes are expressed in PB.
    while exponent < largest and value >= step:
        value = value / step
        exponent += 1

    unit = units[exponent]
    return f'{value:.2f} {unit}'
163 changes: 160 additions & 3 deletions src/aiida/orm/nodes/data/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,20 @@
###########################################################################
"""Data plugin that models a folder on a remote computer."""

from __future__ import annotations

import logging
import os
from pathlib import Path

from aiida.orm import AuthInfo
from aiida.orm.fields import add_field
from aiida.transports import Transport

from ..data import Data

_logger = logging.getLogger(__name__)

__all__ = ('RemoteData',)


Expand Down Expand Up @@ -96,14 +103,15 @@ def listdir(self, relpath='.'):
full_path = os.path.join(self.get_remote_path(), relpath)
if not transport.isdir(full_path):
raise OSError(
f'The required remote folder {full_path} on {self.computer.label} does not exist, is not a '
f'The required remote path {full_path} on {self.computer.label} does not exist, is not a '
'directory or has been deleted.'
)

try:
return transport.listdir(full_path)
except OSError as exception:
if exception.errno in (2, 20): # directory not existing or not a directory
if exception.errno in (2, 20):
# directory not existing or not a directory
exc = OSError(
f'The required remote folder {full_path} on {self.computer.label} does not exist, is not a '
'directory or has been deleted.'
Expand Down Expand Up @@ -132,7 +140,8 @@ def listdir_withattributes(self, path='.'):
try:
return transport.listdir_withattributes(full_path)
except OSError as exception:
if exception.errno in (2, 20): # directory not existing or not a directory
if exception.errno in (2, 20):
# directory not existing or not a directory
exc = OSError(
f'The required remote folder {full_path} on {self.computer.label} does not exist, is not a '
'directory or has been deleted.'
Expand Down Expand Up @@ -185,3 +194,151 @@ def _validate(self):

def get_authinfo(self):
return AuthInfo.get_collection(self.backend).get(dbcomputer=self.computer, aiidauser=self.user)

def get_size_on_disk(
    self,
    relpath: Path | None = None,
    method: str = 'du',
    return_bytes: bool = False,
) -> tuple[int | str, str]:
    """Connect to the remote Computer of the ``RemoteData`` and return the total size of a file or directory.

    With ``method='du'`` the size is obtained via ``_get_size_on_disk_du``. If that raises ``RuntimeError`` or
    ``NotImplementedError``, a warning is logged and the size is obtained via ``_get_size_on_disk_stat`` instead.
    With ``method='stat'`` the ``stat``-based evaluation is used directly.

    :param relpath: File or directory path for which the total size should be returned, relative to
        ``self.get_remote_path()``. Defaults to the remote folder itself.
    :param method: Method to be used to evaluate the directory/file size (either ``du`` or ``stat``).
    :param return_bytes: If ``True``, return the size as an integer number of bytes, otherwise as a
        human-readable string.
    :raises ValueError: If a method other than ``du`` or ``stat`` is selected.
    :raises FileNotFoundError: If the file or directory does not exist (anymore) on the remote ``Computer``.
    :raises OSError: If the ``stat``-based evaluation fails.
    :return: Tuple of the total size of the given file or directory and the name of the method that was
        actually used to obtain it (``'du'`` or ``'stat'``).
    """

    from aiida.common.utils import format_directory_size

    # Reject unsupported methods up front, before a connection to the remote computer is opened.
    if method not in ('du', 'stat'):
        exc_message = f'Specified method `{method}` is not an valid input. Please choose either `du` or `stat`.'
        raise ValueError(exc_message)

    if relpath is None:
        relpath = Path('.')

    authinfo = self.get_authinfo()
    full_path = Path(self.get_remote_path()) / relpath
    computer_label = self.computer.label if self.computer is not None else ''

    # ``None`` means "not determined yet"; it stays ``None`` if ``du`` was not requested or failed.
    total_size: int | None = None
    used_method = method

    with authinfo.get_transport() as transport:
        if not transport.path_exists(str(full_path)):
            exc_message = f'The required remote path {full_path} on Computer <{computer_label}> does not exist.'
            raise FileNotFoundError(exc_message)

        if method == 'du':
            try:
                total_size = self._get_size_on_disk_du(full_path, transport)
            except (RuntimeError, NotImplementedError):
                # NotImplementedError captures the fact that, e.g., FirecREST does not allow for `exec_command_wait`
                stat_warn = (
                    'Problem executing `du` command. Will return total file size based on `stat` as fallback. '
                )
                _logger.warning(stat_warn)
            else:
                _logger.report('Obtained size on the remote using `du`.')

        if total_size is None:
            # Either ``stat`` was requested explicitly, or ``du`` failed and this is the fallback.
            used_method = 'stat'
            try:
                total_size = self._get_size_on_disk_stat(full_path, transport)
            except OSError:
                # This should typically not even be reached, as the OSError occurs if the path is not a directory
                # or does not exist. Though, we check for its existence already in the beginning of this method.
                _logger.critical('Could not evaluate directory size using either `du` or `stat`.')
                raise

            _logger.report('Obtained size on the remote using `stat`.')
            _logger.warning(
                'Take the result with a grain of salt, as `stat` returns the apparent size of files, '
                'not their actual disk usage.'
            )

    if return_bytes:
        return total_size, used_method
    return format_directory_size(size_in_bytes=total_size), used_method

def _get_size_on_disk_du(self, full_path: Path, transport: Transport) -> int:
    """Return the total size in bytes of a file/directory on the remote Computer using the ``du`` command.

    The command executed is ``du -s --bytes <full_path>`` and the size is parsed from the first tab-separated
    field of its standard output.

    .. note:: In GNU coreutils, ``--bytes`` is shorthand for ``--apparent-size --block-size=1``, so the number
        reported is the summed apparent size, not the number of allocated disk blocks.

    :param full_path: Full path of file or directory for which the size should be evaluated.
    :param transport: Open transport instance.
    :raises NotImplementedError: When ``exec_command_wait`` is not implemented, e.g., for the FirecREST plugin.
    :raises RuntimeError: When the ``du`` command cannot be successfully executed (non-zero return value or
        output on stderr), or when its output cannot be parsed as an integer.
    :return: Total size of the file/directory in bytes (including all its contents).
    """
    import shlex

    # Quote the path so that spaces or shell metacharacters in it neither break the command nor get
    # interpreted by the shell (assumes the transport hands the command string to a shell -- TODO confirm
    # for third-party transport plugins).
    command = f'du -s --bytes {shlex.quote(str(full_path))}'

    try:
        retval, stdout, stderr = transport.exec_command_wait(command)
    except NotImplementedError as exc:
        raise NotImplementedError('`exec_command_wait` not implemented for the current transport plugin.') from exc

    if stderr or retval != 0:
        raise RuntimeError(f'Error executing `du` command: {stderr}')

    try:
        return int(stdout.split('\t', 1)[0])
    except ValueError as exc:
        # Surface unparsable output as the documented ``RuntimeError`` so that callers relying on it
        # (e.g. to fall back to ``stat``) are not bypassed by a stray ``ValueError``.
        raise RuntimeError(f'Could not parse output of `du` command: {stdout!r}') from exc

def _get_size_on_disk_stat(self, full_path: Path, transport: Transport) -> int:
    """Return the total size in bytes of a file/directory on the remote Computer using ``stat`` information.

    For a file, the ``st_size`` attribute of the file itself is returned. For a directory, the ``st_size`` of
    every entry (files as well as the directory entries themselves) is summed over the whole tree below
    ``full_path``.

    Note that ``stat`` returns the apparent file size, not actual disk usage. Thus, even if a file is only
    1 byte, on disk, it still occupies one full disk block size. As such, getting accurate measures of the total
    expected size on disk when retrieving a ``RemoteData`` is not straightforward with ``stat``, as one would
    need to consider the occupied block sizes for each file, as well as repository metadata. Therefore, this
    function only serves as a fallback in the absence of the ``du`` command, and the returned estimate is
    expected to be smaller than the size on disk that is actually occupied.

    :param full_path: Full path of file or directory of which the size should be evaluated.
    :param transport: Open transport instance; all remote calls are issued through it.
    :raises OSError: When the object at ``full_path`` doesn't exist or cannot be listed (propagated from the
        transport).
    :return: Total size of the file/directory in bytes (including all its contents).
    """

    if transport.isfile(path=str(full_path)):
        return transport.get_attribute(str(full_path))['st_size']

    total_size = 0
    # Explicit stack instead of recursion; the sum does not depend on traversal order.
    pending = [full_path]

    while pending:
        current = pending.pop()
        # Use the transport that was handed in, so the already open connection is reused for every
        # directory instead of acquiring a new one per listing.
        for item in transport.listdir_withattributes(str(current)):
            # Add size of current item (file or directory metadata)
            total_size += item['attributes']['st_size']

            # If it's a directory, its contents still have to be visited
            if item['isdir']:
                pending.append(current / item['name'])

    return total_size
Loading

0 comments on commit 02cbe0c

Please sign in to comment.