Skip to content

Commit ed0e591

Browse files
authored
add support for different file metadata encodings (#125)
1 parent fbf0362 commit ed0e591

File tree

6 files changed

+135
-38
lines changed

6 files changed

+135
-38
lines changed

README.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,21 @@ and the optional third argument is the compression format (called “filter” i
114114
libarchive). The acceptable values are listed in ``libarchive.ffi.WRITE_FORMATS``
115115
and ``libarchive.ffi.WRITE_FILTERS``.
116116

117+
File metadata codecs
118+
--------------------
119+
120+
By default, UTF-8 is used to read and write file attributes from and to archives.
121+
A different codec can be specified through the ``header_codec`` arguments of the
122+
``*_reader`` and ``*_writer`` functions. Example::
123+
124+
with libarchive.file_writer('test.tar', 'ustar', header_codec='cp037') as archive:
125+
...
126+
with libarchive.file_reader('test.tar', header_codec='cp037') as archive:
127+
...
128+
129+
In addition to file paths (``pathname`` and ``linkpath``), the specified codec is
130+
used to encode and decode user and group names (``uname`` and ``gname``).
131+
117132
License
118133
=======
119134

libarchive/entry.py

Lines changed: 69 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from contextlib import contextmanager
2-
from ctypes import c_char_p, create_string_buffer
2+
from ctypes import create_string_buffer
33
from enum import IntEnum
44
import math
55

@@ -34,15 +34,19 @@ def format_time(seconds, nanos):
3434

3535
class ArchiveEntry:
3636

37-
__slots__ = ('_archive_p', '_entry_p')
37+
__slots__ = ('_archive_p', '_entry_p', 'header_codec')
3838

39-
def __init__(self, archive_p=None, **attributes):
39+
def __init__(self, archive_p=None, header_codec='utf-8', **attributes):
4040
"""Allocate memory for an `archive_entry` struct.
4141
42-
The attributes are passed to the `modify` method.
42+
The `header_codec` is used to decode and encode file paths and other
43+
attributes.
44+
45+
The `**attributes` are passed to the `modify` method.
4346
"""
4447
self._archive_p = archive_p
4548
self._entry_p = ffi.entry_new()
49+
self.header_codec = header_codec
4650
if attributes:
4751
self.modify(**attributes)
4852

@@ -54,7 +58,7 @@ def __str__(self):
5458
"""Returns the file's path"""
5559
return self.pathname
5660

57-
def modify(self, **attributes):
61+
def modify(self, header_codec=None, **attributes):
5862
"""Convenience method to modify the entry's attributes.
5963
6064
Args:
@@ -83,6 +87,8 @@ def modify(self, **attributes):
8387
rdevmajor (int): major part of the device number
8488
rdevminor (int): minor part of the device number
8589
"""
90+
if header_codec:
91+
self.header_codec = header_codec
8692
for name, value in attributes.items():
8793
setattr(self, name, value)
8894

@@ -112,23 +118,45 @@ def gid(self, gid):
112118

113119
@property
114120
def uname(self):
115-
return ffi.entry_uname_w(self._entry_p)
121+
uname = ffi.entry_uname_w(self._entry_p)
122+
if not uname:
123+
uname = ffi.entry_uname(self._entry_p)
124+
if uname is not None:
125+
try:
126+
uname = uname.decode(self.header_codec)
127+
except UnicodeError:
128+
pass
129+
return uname
116130

117131
@uname.setter
118132
def uname(self, value):
119133
if not isinstance(value, bytes):
120-
value = value.encode('utf8')
121-
ffi.entry_update_uname_utf8(self._entry_p, value)
134+
value = value.encode(self.header_codec)
135+
if self.header_codec == 'utf-8':
136+
ffi.entry_update_uname_utf8(self._entry_p, value)
137+
else:
138+
ffi.entry_copy_uname(self._entry_p, value)
122139

123140
@property
124141
def gname(self):
125-
return ffi.entry_gname_w(self._entry_p)
142+
gname = ffi.entry_gname_w(self._entry_p)
143+
if not gname:
144+
gname = ffi.entry_gname(self._entry_p)
145+
if gname is not None:
146+
try:
147+
gname = gname.decode(self.header_codec)
148+
except UnicodeError:
149+
pass
150+
return gname
126151

127152
@gname.setter
128153
def gname(self, value):
129154
if not isinstance(value, bytes):
130-
value = value.encode('utf8')
131-
ffi.entry_update_gname_utf8(self._entry_p, value)
155+
value = value.encode(self.header_codec)
156+
if self.header_codec == 'utf-8':
157+
ffi.entry_update_gname_utf8(self._entry_p, value)
158+
else:
159+
ffi.entry_copy_gname(self._entry_p, value)
132160

133161
def get_blocks(self, block_size=ffi.page_size):
134162
"""Read the file's content, keeping only one chunk in memory at a time.
@@ -294,28 +322,48 @@ def pathname(self):
294322
path = ffi.entry_pathname_w(self._entry_p)
295323
if not path:
296324
path = ffi.entry_pathname(self._entry_p)
297-
try:
298-
path = path.decode()
299-
except UnicodeError:
300-
pass
325+
if path is not None:
326+
try:
327+
path = path.decode(self.header_codec)
328+
except UnicodeError:
329+
pass
301330
return path
302331

303332
@pathname.setter
304333
def pathname(self, value):
305334
if not isinstance(value, bytes):
306-
value = value.encode('utf8')
307-
ffi.entry_update_pathname_utf8(self._entry_p, c_char_p(value))
335+
value = value.encode(self.header_codec)
336+
if self.header_codec == 'utf-8':
337+
ffi.entry_update_pathname_utf8(self._entry_p, value)
338+
else:
339+
ffi.entry_copy_pathname(self._entry_p, value)
308340

309341
@property
310342
def linkpath(self):
311-
return (ffi.entry_symlink_w(self._entry_p) or
343+
path = (
344+
(
345+
ffi.entry_symlink_w(self._entry_p) or
346+
ffi.entry_symlink(self._entry_p)
347+
) if self.issym else (
312348
ffi.entry_hardlink_w(self._entry_p) or
313-
ffi.entry_symlink(self._entry_p) or
314-
ffi.entry_hardlink(self._entry_p))
349+
ffi.entry_hardlink(self._entry_p)
350+
)
351+
)
352+
if isinstance(path, bytes):
353+
try:
354+
path = path.decode(self.header_codec)
355+
except UnicodeError:
356+
pass
357+
return path
315358

316359
@linkpath.setter
317360
def linkpath(self, value):
318-
ffi.entry_update_link_utf8(self._entry_p, value)
361+
if not isinstance(value, bytes):
362+
value = value.encode(self.header_codec)
363+
if self.header_codec == 'utf-8':
364+
ffi.entry_update_link_utf8(self._entry_p, value)
365+
else:
366+
ffi.entry_copy_link(self._entry_p, value)
319367

320368
# aliases for compatibility with the standard `tarfile` module
321369
path = property(pathname.fget, pathname.fset, doc="alias of pathname")

libarchive/ffi.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,8 @@ def get_write_filter_function(filter_name):
200200
ffi('entry_rdevminor', [c_archive_entry_p], c_uint)
201201
ffi('entry_uid', [c_archive_entry_p], c_longlong)
202202
ffi('entry_gid', [c_archive_entry_p], c_longlong)
203+
ffi('entry_uname', [c_archive_entry_p], c_char_p)
204+
ffi('entry_gname', [c_archive_entry_p], c_char_p)
203205
ffi('entry_uname_w', [c_archive_entry_p], c_wchar_p)
204206
ffi('entry_gname_w', [c_archive_entry_p], c_wchar_p)
205207

@@ -222,9 +224,13 @@ def get_write_filter_function(filter_name):
222224
ffi('entry_unset_ctime', [c_archive_entry_p], None)
223225
ffi('entry_unset_birthtime', [c_archive_entry_p], None)
224226

227+
ffi('entry_copy_pathname', [c_archive_entry_p, c_char_p], None)
225228
ffi('entry_update_pathname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int)
229+
ffi('entry_copy_link', [c_archive_entry_p, c_char_p], None)
226230
ffi('entry_update_link_utf8', [c_archive_entry_p, c_char_p], c_int, check_int)
231+
ffi('entry_copy_uname', [c_archive_entry_p, c_char_p], None)
227232
ffi('entry_update_uname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int)
233+
ffi('entry_copy_gname', [c_archive_entry_p, c_char_p], None)
228234
ffi('entry_update_gname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int)
229235

230236
ffi('entry_clear', [c_archive_entry_p], c_archive_entry_p)

libarchive/read.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,18 @@
1212

1313
class ArchiveRead:
1414

15-
def __init__(self, archive_p):
15+
def __init__(self, archive_p, header_codec='utf-8'):
1616
self._pointer = archive_p
17+
self.header_codec = header_codec
1718

1819
def __iter__(self):
1920
"""Iterates through an archive's entries.
2021
"""
2122
archive_p = self._pointer
23+
header_codec = self.header_codec
2224
read_next_header2 = ffi.read_next_header2
2325
while 1:
24-
entry = ArchiveEntry(archive_p)
26+
entry = ArchiveEntry(archive_p, header_codec)
2527
r = read_next_header2(archive_p, entry._entry_p)
2628
if r == ARCHIVE_EOF:
2729
return
@@ -68,6 +70,7 @@ def custom_reader(
6870
read_func, format_name='all', filter_name='all',
6971
open_func=None, seek_func=None, close_func=None,
7072
block_size=page_size, archive_read_class=ArchiveRead, passphrase=None,
73+
header_codec='utf-8',
7174
):
7275
"""Read an archive using a custom function.
7376
"""
@@ -79,12 +82,13 @@ def custom_reader(
7982
if seek_func:
8083
ffi.read_set_seek_callback(archive_p, seek_cb)
8184
ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
82-
yield archive_read_class(archive_p)
85+
yield archive_read_class(archive_p, header_codec)
8386

8487

8588
@contextmanager
8689
def fd_reader(
8790
fd, format_name='all', filter_name='all', block_size=4096, passphrase=None,
91+
header_codec='utf-8',
8892
):
8993
"""Read an archive from a file descriptor.
9094
"""
@@ -94,12 +98,13 @@ def fd_reader(
9498
except (OSError, AttributeError): # pragma: no cover
9599
pass
96100
ffi.read_open_fd(archive_p, fd, block_size)
97-
yield ArchiveRead(archive_p)
101+
yield ArchiveRead(archive_p, header_codec)
98102

99103

100104
@contextmanager
101105
def file_reader(
102106
path, format_name='all', filter_name='all', block_size=4096, passphrase=None,
107+
header_codec='utf-8',
103108
):
104109
"""Read an archive from a file.
105110
"""
@@ -109,22 +114,25 @@ def file_reader(
109114
except (OSError, AttributeError): # pragma: no cover
110115
pass
111116
ffi.read_open_filename_w(archive_p, path, block_size)
112-
yield ArchiveRead(archive_p)
117+
yield ArchiveRead(archive_p, header_codec)
113118

114119

115120
@contextmanager
116-
def memory_reader(buf, format_name='all', filter_name='all', passphrase=None):
121+
def memory_reader(
122+
buf, format_name='all', filter_name='all', passphrase=None,
123+
header_codec='utf-8',
124+
):
117125
"""Read an archive from memory.
118126
"""
119127
with new_archive_read(format_name, filter_name, passphrase) as archive_p:
120128
ffi.read_open_memory(archive_p, cast(buf, c_void_p), len(buf))
121-
yield ArchiveRead(archive_p)
129+
yield ArchiveRead(archive_p, header_codec)
122130

123131

124132
@contextmanager
125133
def stream_reader(
126134
stream, format_name='all', filter_name='all', block_size=page_size,
127-
passphrase=None,
135+
passphrase=None, header_codec='utf-8',
128136
):
129137
"""Read an archive from a stream.
130138
@@ -158,7 +166,7 @@ def seek_func(archive_p, context, offset, whence):
158166
if stream.seekable():
159167
ffi.read_set_seek_callback(archive_p, seek_cb)
160168
ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
161-
yield ArchiveRead(archive_p)
169+
yield ArchiveRead(archive_p, header_codec)
162170

163171

164172
seekable_stream_reader = stream_reader

0 commit comments

Comments
 (0)