Skip to content

Commit a89e035

Browse files
committedJun 2, 2021
Add new extract_archive() API function #16
- This is to extract a single archive file of any supported format non recursively. - Also apply minor formatting and refactoring for readability - Improve docstrings - Add tests Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 37a9d5f commit a89e035

File tree

12 files changed

+211
-32
lines changed

12 files changed

+211
-32
lines changed
 

Diff for: ‎CHANGELOG.rst

+2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ v21.6.2
99
-------
1010

1111
- Add new --list-formats command line option to list supported archive formats
12+
- Add new extractcode.api.extract_archive() API function to extract a single
13+
archive file of any supported format, non recursively.
1214

1315

1416
v21.6.1

Diff for: ‎src/extractcode/api.py

+33-5
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,20 @@ def extract_archives(
2121
):
2222
"""
2323
Yield ExtractEvent while extracting archive(s) and compressed files at
24-
`location`.
24+
``location``.
2525
26-
If `recurse` is True, extract nested archives-in-archives recursively.
27-
If `all_formats` is True, extract all supported archives formats.
26+
If ``recurse`` is True, extract nested archives-in-archives recursively.
27+
If ``all_formats`` is True, extract all supported archives formats. The
28+
default is to only extract the common "extractcode.default_kinds"
2829
29-
Archives and compressed files are extracted in a directory named
30-
"<file_name>-extract" created in the same directory as the archive.
30+
Archives and compressed files are extracted in a new directory named
31+
"<file_name>-extract" created in the same directory as each extracted
32+
archive.
33+
34+
If ``replace_originals`` is True, the extracted archives are replaced by the
35+
extracted content and removed when extraction is complete
36+
37+
``ignore_pattern`` is a list of glob patterns to ignore.
3138
3239
Note: this API is returning an iterable and NOT a sequence.
3340
"""
@@ -46,3 +53,24 @@ def extract_archives(
4653
ignore_pattern=ignore_pattern,
4754
):
4855
yield xevent
56+
57+
58+
def extract_archive(location, target, verbose=False):
    """
    Extract the single archive or compressed file at ``location`` to the
    ``target`` directory and return an iterable of ExtractEvent. The file is
    extracted if it is of any supported archive format: the
    ``extractcode.all_kinds`` kinds are used. Nested archives are not
    extracted recursively.

    Note: the returned value is an iterable and NOT a sequence.

    If ``verbose`` is True, ExtractEvent.errors will contain a full error
    traceback if any.
    """
    # These imports are done at call time, inside the function.
    from extractcode import all_kinds
    from extractcode.extract import extract_file

    events = extract_file(
        location=location,
        target=target,
        kinds=all_kinds,
        verbose=verbose,
    )
    return events

Diff for: ‎src/extractcode/archive.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10-
from collections import namedtuple
1110
import logging
1211
import os
12+
from collections import namedtuple
1313

1414
from commoncode import fileutils
1515
from commoncode import filetype
@@ -97,11 +97,14 @@ def can_extract(location):
9797

9898
def should_extract(location, kinds, ignore_pattern=()):
9999
"""
100-
Return True if this location should be extracted based on the provided kinds
100+
Return True if this ``location`` should be extracted based on the provided
101+
``kinds`` tuple and an ``ignore_pattern`` list of glob patterns.
101102
"""
102103
location = os.path.abspath(os.path.expanduser(location))
103-
ignore_pattern = {extension : 'User ignore: Supplied by --ignore'
104-
for extension in ignore_pattern}
104+
ignore_pattern = {
105+
extension : 'User ignore: Supplied by --ignore'
106+
for extension in ignore_pattern
107+
}
105108
should_ignore = is_ignored(location, ignore_pattern)
106109
extractor = get_extractor(location, kinds=kinds)
107110

Diff for: ‎src/extractcode/cli.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -292,8 +292,11 @@ def display_extract_summary():
292292
if not quiet:
293293
echo_stderr('Extracting archives...', fg='green')
294294

295-
with cliutils.progressmanager(extractibles,
296-
item_show_func=extract_event, verbose=verbose) as extraction_events:
295+
with cliutils.progressmanager(
296+
extractibles,
297+
item_show_func=extract_event,
298+
verbose=verbose
299+
) as extraction_events:
297300

298301
for xev in extraction_events:
299302
if xev.done and (xev.warnings or xev.errors):

Diff for: ‎src/extractcode/extract.py

+34-19
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,11 @@ def extract(
108108
If `replace_originals` is True, the extracted archives are replaced by the
109109
extracted content.
110110
111-
Note that while the original file system is walked top-down, breadth-first,
112-
if recurse and a nested archive is found, it is extracted to full depth
113-
first before resuming the file system walk.
111+
``ignore_pattern`` is a list of glob patterns to ignore.
112+
113+
Note that while the original filesystem is walked top-down, breadth-first,
114+
if ``recurse`` and a nested archive is found, it is extracted at full depth
115+
first before resuming the filesystem walk.
114116
"""
115117

116118
extract_events = extract_files(
@@ -159,16 +161,18 @@ def extract_files(
159161
160162
If `recurse` is false, then do not extract further an already
161163
extracted archive identified by the corresponding extract suffix location.
164+
165+
``ignore_pattern`` is a list of glob patterns to ignore.
162166
"""
163167
ignored = partial(ignore.is_ignored, ignores=ignore.default_ignores, unignores={})
164168
if TRACE:
165-
logger.debug('extract:start: %(location)r recurse: %(recurse)r\n' % locals())
169+
logger.debug('extract:start: %(location)r recurse: %(recurse)r\n' % locals())
166170

167171
abs_location = abspath(expanduser(location))
168172
for top, dirs, files in fileutils.walk(abs_location, ignored):
169173
if TRACE:
170174
logger.debug(
171-
'extract:walk: top: %(top)r dirs: %(dirs)r files: r(files)r' % locals())
175+
'extract:walk: top: %(top)r dirs: %(dirs)r files: r(files)r' % locals())
172176

173177
if not recurse:
174178
if TRACE:
@@ -177,21 +181,25 @@ def extract_files(
177181
if extractcode.is_extraction_path(d):
178182
dirs.remove(d)
179183
if TRACE:
180-
logger.debug(
181-
'extract:walk: not recurse: removed dirs:'
182-
+repr(drs.symmetric_difference(set(dirs)))
183-
)
184+
rd = repr(drs.symmetric_difference(set(dirs)))
185+
logger.debug(f'extract:walk: not recurse: removed dirs: {rd}')
184186

185187
for f in files:
186188
loc = join(top, f)
187189
if not recurse and extractcode.is_extraction_path(loc):
188190
if TRACE:
189-
logger.debug('extract:walk not recurse: skipped file: %(loc)r' % locals())
191+
logger.debug(
192+
'extract:walk not recurse: skipped file: %(loc)r' % locals())
190193
continue
191194

192-
if not extractcode.archive.should_extract(loc, kinds, ignore_pattern):
195+
if not extractcode.archive.should_extract(
196+
location=loc,
197+
kinds=kinds,
198+
ignore_pattern=ignore_pattern
199+
):
193200
if TRACE:
194-
logger.debug('extract:walk: skipped file: not should_extract: %(loc)r' % locals())
201+
logger.debug(
202+
'extract:walk: skipped file: not should_extract: %(loc)r' % locals())
195203
continue
196204

197205
target = join(abspath(top), extractcode.get_extraction_path(loc))
@@ -227,15 +235,21 @@ def extract_file(
227235
target,
228236
kinds=extractcode.default_kinds,
229237
verbose=False,
230-
all_formats=False,
238+
*args,
239+
**kwargs,
231240
):
232241
"""
233-
Extract a single archive at `location` in the `target` directory if it is of
234-
a kind supported in the `kinds` kind tuple.
242+
Extract a single archive file at ``location`` to the ``target`` directory if
243+
this file is of a kind supported in the ``kinds`` kind tuple. Yield
244+
ExtractEvents. Does not extract recursively.
235245
"""
236246
warnings = []
237247
errors = []
238-
extractor = extractcode.archive.get_extractor(location, kinds)
248+
extractor = extractcode.archive.get_extractor(
249+
location=location,
250+
kinds=kinds,
251+
)
252+
239253
if TRACE:
240254
emodule = getattr(extractor, '__module__', '')
241255
ename = getattr(extractor, '__name__', '')
@@ -254,22 +268,23 @@ def extract_file(
254268
)
255269

256270
try:
257-
# extract first to a temp directory: if there is an error, the
258-
# extracted files will not be moved to target
271+
# Extract first to a temp directory: if there is an error, the
272+
# extracted files will not be moved to the target.
259273
tmp_tgt = fileutils.get_temp_dir(prefix='extractcode-extract-')
260274
abs_location = abspath(expanduser(location))
261275
warns = extractor(abs_location, tmp_tgt) or []
262276
warnings.extend(warns)
263277
fileutils.copytree(tmp_tgt, target)
264278
fileutils.delete(tmp_tgt)
279+
265280
except Exception as e:
266281
errors = [str(e).strip(' \'"')]
267282
if verbose:
268283
errors.append(traceback.format_exc())
269284
if TRACE:
270285
tb = traceback.format_exc()
271286
logger.debug(
272-
'extract_file: ERROR: %(location)r: %(errors)r\n%(e)r\n%(tb)s' % locals())
287+
f'extract_file: ERROR: {location}: {errors}\n{e}\n{tb}')
273288

274289
finally:
275290
yield ExtractEvent(
File renamed without changes.

Diff for: ‎tests/data/api/doc.docx

890 Bytes
Binary file not shown.

Diff for: ‎tests/data/extract/all_formats/c.zip

890 Bytes
Binary file not shown.

Diff for: ‎tests/data/extract/all_formats/doc.docx

890 Bytes
Binary file not shown.

Diff for: ‎tests/extractcode_assert_utils.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ def check_files(test_dir, expected, regen=False):
6262
result.append(path)
6363

6464
expected_is_json_file = False
65+
6566
if not isinstance(expected, (list, tuple)) and expected.endswith('.json'):
6667
expected_is_json_file = True
6768
# this is a path to a JSON file
@@ -79,13 +80,13 @@ def check_files(test_dir, expected, regen=False):
7980
result = sorted(result)
8081

8182
try:
82-
assert expected_content == result
83+
assert result == expected_content
8384
except AssertionError:
8485
files = [
8586
'test_dir: file://{}'.format(test_dir),
8687
'expected: file://{}'.format(expected if expected_is_json_file else ''),
8788
]
88-
assert files + expected_content == result
89+
assert result == files + expected_content
8990

9091
for location in locs:
9192
assert filetype.is_file(location)

Diff for: ‎tests/test_extract.py

+77
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from extractcode_assert_utils import check_files
2525
from extractcode_assert_utils import check_no_error
2626
from extractcode_assert_utils import BaseArchiveTestCase
27+
from extractcode import all_kinds
2728

2829
project_root = os.path.dirname(os.path.dirname(__file__))
2930

@@ -891,3 +892,79 @@ def test_extract_ignore_pattern(self):
891892
result = list(extract.extract(test_dir, recurse=True, ignore_pattern=('b*.zip',)))
892893
check_no_error(result)
893894
check_files(test_dir, expected)
895+
896+
def test_extract_file_ignores_archives_not_of_default_kinds(self):
    # With the default kinds, this test expects no extract event for a
    # .docx file and an empty list of extracted files.
    archive_loc = self.get_test_loc('extract/all_formats/doc.docx', copy=True)
    parent_dir = fileutils.parent_directory(archive_loc)
    expected = []
    relative_name = archive_loc.replace(parent_dir, '')
    expected_events = []

    target = extractcode.get_extraction_path(archive_loc)
    events = list(extract.extract_file(archive_loc, target))

    # Replace the test-run-specific paths by paths relative to the parent
    # directory so that events can be compared to the expected events.
    normalized = []
    for event in events:
        normalized.append(
            event._replace(
                source=relative_name,
                target=extractcode.get_extraction_path(relative_name),
            )
        )

    assert normalized == expected_events
    check_files(target, expected)
913+
914+
def test_extract_file_handles_archives_of_default_kinds(self):
    # With the default kinds, this test expects a .zip file to be extracted
    # with two events: a first one not done, then a second one done.
    archive_loc = self.get_test_loc('extract/all_formats/c.zip', copy=True)
    parent_dir = fileutils.parent_directory(archive_loc)
    expected = [
        'c/a/a.txt',
        'c/b/a.txt',
        'c/c/a.txt',
    ]
    relative_name = archive_loc.replace(parent_dir, '')
    expected_events = [
        extract.ExtractEvent(
            source=relative_name,
            target=extractcode.get_extraction_path(relative_name),
            done=is_done,
            warnings=[],
            errors=[],
        )
        for is_done in (False, True)
    ]

    target = extractcode.get_extraction_path(archive_loc)
    events = list(extract.extract_file(archive_loc, target))

    # Replace the test-run-specific paths by paths relative to the parent
    # directory so that events can be compared to the expected events.
    normalized = []
    for event in events:
        normalized.append(
            event._replace(
                source=relative_name,
                target=extractcode.get_extraction_path(relative_name),
            )
        )

    assert normalized == expected_events
    check_files(target, expected)
946+
947+
def test_extract_file_works_with_all_kinds(self):
    # When ``all_kinds`` is passed as kinds, this test expects a .docx file
    # to be extracted with two events: a first one not done, then a second
    # one done.
    archive_loc = self.get_test_loc('extract/all_formats/doc.docx', copy=True)
    parent_dir = fileutils.parent_directory(archive_loc)
    expected = [
        'c/a/a.txt',
        'c/b/a.txt',
        'c/c/a.txt',
    ]
    relative_name = archive_loc.replace(parent_dir, '')
    expected_events = [
        extract.ExtractEvent(
            source='doc.docx',
            target='doc.docx-extract',
            done=is_done,
            warnings=[],
            errors=[],
        )
        for is_done in (False, True)
    ]

    target = extractcode.get_extraction_path(archive_loc)
    events = list(extract.extract_file(archive_loc, target, kinds=all_kinds))

    # Replace the test-run-specific paths by paths relative to the parent
    # directory so that events can be compared to the expected events.
    normalized = []
    for event in events:
        normalized.append(
            event._replace(
                source=relative_name,
                target=extractcode.get_extraction_path(relative_name),
            )
        )

    assert normalized == expected_events
    check_files(target, expected)

Diff for: ‎tests/test_extractcode_api.py

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Copyright (c) nexB Inc. and others. All rights reserved.
4+
# ScanCode is a trademark of nexB Inc.
5+
# SPDX-License-Identifier: Apache-2.0
6+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
7+
# See https://github.com/nexB/extractcode for support or download.
8+
# See https://aboutcode.org for more information about nexB OSS projects.
9+
#
10+
11+
import os
12+
13+
from commoncode import fileutils
14+
15+
import extractcode
16+
from extractcode import extract
17+
from extractcode import api
18+
from extractcode_assert_utils import check_files
19+
from extractcode_assert_utils import BaseArchiveTestCase
20+
21+
project_root = os.path.dirname(os.path.dirname(__file__))
22+
23+
24+
class TestExtractApi(BaseArchiveTestCase):
    """
    Tests for the functions of the ``extractcode.api`` module.
    """
    test_data_dir = os.path.join(os.path.dirname(__file__), 'data')

    def test_extract_archive(self):
        # This test expects api.extract_archive() to extract a .docx file
        # with two events: a first one not done, then a second one done.
        archive_loc = self.get_test_loc('api/doc.docx', copy=True)
        parent_dir = fileutils.parent_directory(archive_loc)
        expected = [
            'c/a/a.txt',
            'c/b/a.txt',
            'c/c/a.txt',
        ]

        relative_name = archive_loc.replace(parent_dir, '')
        expected_event = [
            extract.ExtractEvent(
                source='doc.docx',
                target='doc.docx-extract',
                done=is_done,
                warnings=[],
                errors=[],
            )
            for is_done in (False, True)
        ]

        target = extractcode.get_extraction_path(archive_loc)
        events = list(api.extract_archive(archive_loc, target))

        # Replace the test-run-specific paths by paths relative to the parent
        # directory so that events can be compared to the expected events.
        normalized = []
        for event in events:
            normalized.append(
                event._replace(
                    source=relative_name,
                    target=extractcode.get_extraction_path(relative_name),
                )
            )

        assert expected_event == normalized
        check_files(target, expected)

0 commit comments

Comments
 (0)
Please sign in to comment.