add support for different file metadata encodings (#125)

Changaco · web-flow · commit ed0e591eba97 · 2023-07-04T10:03:17.000+02:00
diff --git a/README.rst b/README.rst
@@ -114,6 +114,21 @@ and the optional third argument is the compression format (called “filter” i
 libarchive). The acceptable values are listed in ``libarchive.ffi.WRITE_FORMATS``
 and ``libarchive.ffi.WRITE_FILTERS``.
 
+File metadata codecs
+--------------------
+
+By default, UTF-8 is used to read and write file attributes from and to archives.
+A different codec can be specified through the ``header_codec`` argument of the
+``*_reader`` and ``*_writer`` functions. Example::
+
+    with libarchive.file_writer('test.tar', 'ustar', header_codec='cp037') as archive:
+        ...
+    with libarchive.file_reader('test.tar', header_codec='cp037') as archive:
+        ...
+
+In addition to file paths (``pathname`` and ``linkpath``), the specified codec is
+used to encode and decode user and group names (``uname`` and ``gname``).
+
 License
 =======
 
diff --git a/libarchive/entry.py b/libarchive/entry.py
@@ -1,5 +1,5 @@
 from contextlib import contextmanager
-from ctypes import c_char_p, create_string_buffer
+from ctypes import create_string_buffer
 from enum import IntEnum
 import math
 
@@ -34,15 +34,19 @@ def format_time(seconds, nanos):
 
 class ArchiveEntry:
 
-    __slots__ = ('_archive_p', '_entry_p')
+    __slots__ = ('_archive_p', '_entry_p', 'header_codec')
 
-    def __init__(self, archive_p=None, **attributes):
+    def __init__(self, archive_p=None, header_codec='utf-8', **attributes):
         """Allocate memory for an `archive_entry` struct.
 
-        The attributes are passed to the `modify` method.
+        The `header_codec` is used to decode and encode file paths and other
+        attributes.
+
+        The `**attributes` are passed to the `modify` method.
         """
         self._archive_p = archive_p
         self._entry_p = ffi.entry_new()
+        self.header_codec = header_codec
         if attributes:
             self.modify(**attributes)
 
@@ -54,7 +58,7 @@ def __str__(self):
         """Returns the file's path"""
         return self.pathname
 
-    def modify(self, **attributes):
+    def modify(self, header_codec=None, **attributes):
         """Convenience method to modify the entry's attributes.
 
         Args:
@@ -83,6 +87,8 @@ def modify(self, **attributes):
             rdevmajor (int): major part of the device number
             rdevminor (int): minor part of the device number
         """
+        if header_codec:
+            self.header_codec = header_codec
         for name, value in attributes.items():
             setattr(self, name, value)
 
@@ -112,23 +118,45 @@ def gid(self, gid):
 
     @property
     def uname(self):
-        return ffi.entry_uname_w(self._entry_p)
+        uname = ffi.entry_uname_w(self._entry_p)
+        if not uname:
+            uname = ffi.entry_uname(self._entry_p)
+            if uname is not None:
+                try:
+                    uname = uname.decode(self.header_codec)
+                except UnicodeError:
+                    pass
+        return uname
 
     @uname.setter
     def uname(self, value):
         if not isinstance(value, bytes):
-            value = value.encode('utf8')
-        ffi.entry_update_uname_utf8(self._entry_p, value)
+            value = value.encode(self.header_codec)
+        if self.header_codec == 'utf-8':
+            ffi.entry_update_uname_utf8(self._entry_p, value)
+        else:
+            ffi.entry_copy_uname(self._entry_p, value)
 
     @property
     def gname(self):
-        return ffi.entry_gname_w(self._entry_p)
+        gname = ffi.entry_gname_w(self._entry_p)
+        if not gname:
+            gname = ffi.entry_gname(self._entry_p)
+            if gname is not None:
+                try:
+                    gname = gname.decode(self.header_codec)
+                except UnicodeError:
+                    pass
+        return gname
 
     @gname.setter
     def gname(self, value):
         if not isinstance(value, bytes):
-            value = value.encode('utf8')
-        ffi.entry_update_gname_utf8(self._entry_p, value)
+            value = value.encode(self.header_codec)
+        if self.header_codec == 'utf-8':
+            ffi.entry_update_gname_utf8(self._entry_p, value)
+        else:
+            ffi.entry_copy_gname(self._entry_p, value)
 
     def get_blocks(self, block_size=ffi.page_size):
         """Read the file's content, keeping only one chunk in memory at a time.
@@ -294,28 +322,48 @@ def pathname(self):
         path = ffi.entry_pathname_w(self._entry_p)
         if not path:
             path = ffi.entry_pathname(self._entry_p)
-            try:
-                path = path.decode()
-            except UnicodeError:
-                pass
+            if path is not None:
+                try:
+                    path = path.decode(self.header_codec)
+                except UnicodeError:
+                    pass
         return path
 
     @pathname.setter
     def pathname(self, value):
         if not isinstance(value, bytes):
-            value = value.encode('utf8')
-        ffi.entry_update_pathname_utf8(self._entry_p, c_char_p(value))
+            value = value.encode(self.header_codec)
+        if self.header_codec == 'utf-8':
+            ffi.entry_update_pathname_utf8(self._entry_p, value)
+        else:
+            ffi.entry_copy_pathname(self._entry_p, value)
 
     @property
     def linkpath(self):
-        return (ffi.entry_symlink_w(self._entry_p) or
+        path = (
+            (
+                ffi.entry_symlink_w(self._entry_p) or
+                ffi.entry_symlink(self._entry_p)
+            ) if self.issym else (
                 ffi.entry_hardlink_w(self._entry_p) or
-                ffi.entry_symlink(self._entry_p) or
-                ffi.entry_hardlink(self._entry_p))
+                ffi.entry_hardlink(self._entry_p)
+            )
+        )
+        if isinstance(path, bytes):
+            try:
+                path = path.decode(self.header_codec)
+            except UnicodeError:
+                pass
+        return path
 
     @linkpath.setter
     def linkpath(self, value):
-        ffi.entry_update_link_utf8(self._entry_p, value)
+        if not isinstance(value, bytes):
+            value = value.encode(self.header_codec)
+        if self.header_codec == 'utf-8':
+            ffi.entry_update_link_utf8(self._entry_p, value)
+        else:
+            ffi.entry_copy_link(self._entry_p, value)
 
     # aliases for compatibility with the standard `tarfile` module
     path = property(pathname.fget, pathname.fset, doc="alias of pathname")
diff --git a/libarchive/ffi.py b/libarchive/ffi.py
@@ -200,6 +200,8 @@ def get_write_filter_function(filter_name):
 ffi('entry_rdevminor', [c_archive_entry_p], c_uint)
 ffi('entry_uid', [c_archive_entry_p], c_longlong)
 ffi('entry_gid', [c_archive_entry_p], c_longlong)
+ffi('entry_uname', [c_archive_entry_p], c_char_p)
+ffi('entry_gname', [c_archive_entry_p], c_char_p)
 ffi('entry_uname_w', [c_archive_entry_p], c_wchar_p)
 ffi('entry_gname_w', [c_archive_entry_p], c_wchar_p)
 
@@ -222,9 +224,13 @@ def get_write_filter_function(filter_name):
 ffi('entry_unset_ctime', [c_archive_entry_p], None)
 ffi('entry_unset_birthtime', [c_archive_entry_p], None)
 
+ffi('entry_copy_pathname', [c_archive_entry_p, c_char_p], None)
 ffi('entry_update_pathname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int)
+ffi('entry_copy_link', [c_archive_entry_p, c_char_p], None)
 ffi('entry_update_link_utf8', [c_archive_entry_p, c_char_p], c_int, check_int)
+ffi('entry_copy_uname', [c_archive_entry_p, c_char_p], None)
 ffi('entry_update_uname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int)
+ffi('entry_copy_gname', [c_archive_entry_p, c_char_p], None)
 ffi('entry_update_gname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int)
 
 ffi('entry_clear', [c_archive_entry_p], c_archive_entry_p)
diff --git a/libarchive/read.py b/libarchive/read.py
@@ -12,16 +12,18 @@
 
 class ArchiveRead:
 
-    def __init__(self, archive_p):
+    def __init__(self, archive_p, header_codec='utf-8'):
         self._pointer = archive_p
+        self.header_codec = header_codec
 
     def __iter__(self):
         """Iterates through an archive's entries.
         """
         archive_p = self._pointer
+        header_codec = self.header_codec
         read_next_header2 = ffi.read_next_header2
         while 1:
-            entry = ArchiveEntry(archive_p)
+            entry = ArchiveEntry(archive_p, header_codec)
             r = read_next_header2(archive_p, entry._entry_p)
             if r == ARCHIVE_EOF:
                 return
@@ -68,6 +70,7 @@ def custom_reader(
     read_func, format_name='all', filter_name='all',
     open_func=None, seek_func=None, close_func=None,
     block_size=page_size, archive_read_class=ArchiveRead, passphrase=None,
+    header_codec='utf-8',
 ):
     """Read an archive using a custom function.
     """
@@ -79,12 +82,13 @@ def custom_reader(
         if seek_func:
             ffi.read_set_seek_callback(archive_p, seek_cb)
         ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
-        yield archive_read_class(archive_p)
+        yield archive_read_class(archive_p, header_codec)
 
 
 @contextmanager
 def fd_reader(
     fd, format_name='all', filter_name='all', block_size=4096, passphrase=None,
+    header_codec='utf-8',
 ):
     """Read an archive from a file descriptor.
     """
@@ -94,12 +98,13 @@ def fd_reader(
         except (OSError, AttributeError):  # pragma: no cover
             pass
         ffi.read_open_fd(archive_p, fd, block_size)
-        yield ArchiveRead(archive_p)
+        yield ArchiveRead(archive_p, header_codec)
 
 
 @contextmanager
 def file_reader(
     path, format_name='all', filter_name='all', block_size=4096, passphrase=None,
+    header_codec='utf-8',
 ):
     """Read an archive from a file.
     """
@@ -109,22 +114,25 @@ def file_reader(
         except (OSError, AttributeError):  # pragma: no cover
             pass
         ffi.read_open_filename_w(archive_p, path, block_size)
-        yield ArchiveRead(archive_p)
+        yield ArchiveRead(archive_p, header_codec)
 
 
 @contextmanager
-def memory_reader(buf, format_name='all', filter_name='all', passphrase=None):
+def memory_reader(
+    buf, format_name='all', filter_name='all', passphrase=None,
+    header_codec='utf-8',
+):
     """Read an archive from memory.
     """
     with new_archive_read(format_name, filter_name, passphrase) as archive_p:
         ffi.read_open_memory(archive_p, cast(buf, c_void_p), len(buf))
-        yield ArchiveRead(archive_p)
+        yield ArchiveRead(archive_p, header_codec)
 
 
 @contextmanager
 def stream_reader(
     stream, format_name='all', filter_name='all', block_size=page_size,
-    passphrase=None,
+    passphrase=None, header_codec='utf-8',
 ):
     """Read an archive from a stream.
 
@@ -158,7 +166,7 @@ def seek_func(archive_p, context, offset, whence):
         if stream.seekable():
             ffi.read_set_seek_callback(archive_p, seek_cb)
         ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
-        yield ArchiveRead(archive_p)
+        yield ArchiveRead(archive_p, header_codec)
 
 
 seekable_stream_reader = stream_reader
diff --git a/libarchive/write.py b/libarchive/write.py
diff --git a/tests/test_entry.py b/tests/test_entry.py