Add new extract_archive() API function #16

pombredanne · pombredanne · commit a89e035ce814 · 2021-06-02T14:01:46.000+02:00
- This is to extract a single archive file of any supported format
  non-recursively.
- Also apply minor formatting and refactoring for readability
- Improve docstrings
- Add tests

Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -9,6 +9,8 @@ v21.6.2
 -------
 
 - Add new --list-formats command line option to list supported archive formats
+- Add new extractcode.api.extract_archive() API function to extract a single
+  archive file of any supported format, non-recursively.
 
 
 v21.6.1
diff --git a/src/extractcode/api.py b/src/extractcode/api.py
@@ -21,13 +21,20 @@ def extract_archives(
 ):
     """
     Yield ExtractEvent while extracting archive(s) and compressed files at
-    `location`.
+    ``location``.
 
-    If `recurse` is True, extract nested archives-in-archives recursively.
-    If `all_formats` is True, extract all supported archives formats.
+    If ``recurse`` is True, extract nested archives-in-archives recursively.
+    If ``all_formats`` is True, extract all supported archive formats. The
+    default is to only extract the common "extractcode.default_kinds".
 
-    Archives and compressed files are extracted in a directory named
-    "<file_name>-extract" created in the same directory as the archive.
+    Archives and compressed files are extracted in a new directory named
+    "<file_name>-extract" created in the same directory as each extracted
+    archive.
+
+    If ``replace_originals`` is True, the extracted archives are replaced by the
+    extracted content and removed when extraction is complete.
+
+    ``ignore_pattern`` is a list of glob patterns to ignore.
 
     Note: this API is returning an iterable and NOT a sequence.
     """
@@ -46,3 +53,24 @@ def extract_archives(
         ignore_pattern=ignore_pattern,
     ):
         yield xevent
+
+
+def extract_archive(location, target, verbose=False):
+    """
+    Yield ExtractEvent while extracting a single archive or compressed file at
+    ``location`` to the ``target`` directory if the file is of any supported
+    archive format. Note: this API is returning an iterable and NOT a sequence
+    and does not extract recursively.
+
+    If ``verbose`` is True, ExtractEvent.errors will contain a full error
+    traceback if any.
+    """
+
+    from extractcode.extract import extract_file
+    from extractcode import all_kinds
+    return extract_file(
+        location=location,
+        target=target,
+        kinds=all_kinds,
+        verbose=verbose,
+    )
diff --git a/src/extractcode/archive.py b/src/extractcode/archive.py
@@ -7,9 +7,9 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
-from collections import namedtuple
 import logging
 import os
+from collections import namedtuple
 
 from commoncode import fileutils
 from commoncode import filetype
@@ -97,11 +97,14 @@ def can_extract(location):
 
 def should_extract(location, kinds, ignore_pattern=()):
     """
-    Return True if this location should be extracted based on the provided kinds
+    Return True if this ``location`` should be extracted based on the provided
+    ``kinds`` tuple and an ``ignore_pattern`` list of glob patterns.
     """
     location = os.path.abspath(os.path.expanduser(location))
-    ignore_pattern = {extension : 'User ignore: Supplied by --ignore'
-        for extension in ignore_pattern}
+    ignore_pattern = {
+        extension : 'User ignore: Supplied by --ignore'
+        for extension in ignore_pattern
+    }
     should_ignore = is_ignored(location, ignore_pattern)
     extractor = get_extractor(location, kinds=kinds)
 
diff --git a/src/extractcode/cli.py b/src/extractcode/cli.py
@@ -292,8 +292,11 @@ def display_extract_summary():
     if not quiet:
         echo_stderr('Extracting archives...', fg='green')
 
-        with cliutils.progressmanager(extractibles,
-            item_show_func=extract_event, verbose=verbose) as extraction_events:
+        with cliutils.progressmanager(
+            extractibles,
+            item_show_func=extract_event, 
+            verbose=verbose
+        ) as extraction_events:
 
             for xev in extraction_events:
                 if xev.done and (xev.warnings or xev.errors):
diff --git a/src/extractcode/extract.py b/src/extractcode/extract.py
@@ -108,9 +108,11 @@ def extract(
     If `replace_originals` is True, the extracted archives are replaced by the
     extracted content.
 
-    Note that while the original file system is walked top-down, breadth-first,
-    if recurse and a nested archive is found, it is extracted to full depth
-    first before resuming the file system walk.
+    ``ignore_pattern`` is a list of glob patterns to ignore.
+
+    Note that while the original filesystem is walked top-down, breadth-first,
+    if ``recurse`` and a nested archive is found, it is extracted at full depth
+    first before resuming the filesystem walk.
     """
 
     extract_events = extract_files(
@@ -159,16 +161,18 @@ def extract_files(
 
     If `recurse` is false, then do not extract further an already
     extracted archive identified by the corresponding extract suffix location.
+
+    ``ignore_pattern`` is a list of glob patterns to ignore.
     """
     ignored = partial(ignore.is_ignored, ignores=ignore.default_ignores, unignores={})
     if TRACE:
-        logger.debug('extract:start: %(location)r  recurse: %(recurse)r\n' % locals())
+        logger.debug('extract:start: %(location)r recurse: %(recurse)r\n' % locals())
 
     abs_location = abspath(expanduser(location))
     for top, dirs, files in fileutils.walk(abs_location, ignored):
         if TRACE:
             logger.debug(
-                'extract:walk: top:  %(top)r dirs: %(dirs)r files: r(files)r' % locals())
+                'extract:walk: top: %(top)r dirs: %(dirs)r files: r(files)r' % locals())
 
         if not recurse:
             if TRACE:
@@ -177,21 +181,25 @@ def extract_files(
                 if extractcode.is_extraction_path(d):
                     dirs.remove(d)
             if TRACE:
-                logger.debug(
-                    'extract:walk: not recurse: removed dirs:'
-                    +repr(drs.symmetric_difference(set(dirs)))
-                )
+                rd = repr(drs.symmetric_difference(set(dirs)))
+                logger.debug(f'extract:walk: not recurse: removed dirs: {rd}')
 
         for f in files:
             loc = join(top, f)
             if not recurse and extractcode.is_extraction_path(loc):
                 if TRACE:
-                    logger.debug('extract:walk not recurse: skipped  file: %(loc)r' % locals())
+                    logger.debug(
+                        'extract:walk not recurse: skipped  file: %(loc)r' % locals())
                 continue
 
-            if not extractcode.archive.should_extract(loc, kinds, ignore_pattern):
+            if not extractcode.archive.should_extract(
+                location=loc,
+                kinds=kinds,
+                ignore_pattern=ignore_pattern
+            ):
                 if TRACE:
-                    logger.debug('extract:walk: skipped file: not should_extract: %(loc)r' % locals())
+                    logger.debug(
+                        'extract:walk: skipped file: not should_extract: %(loc)r' % locals())
                 continue
 
             target = join(abspath(top), extractcode.get_extraction_path(loc))
@@ -227,15 +235,21 @@ def extract_file(
     target,
     kinds=extractcode.default_kinds,
     verbose=False,
-    all_formats=False,
+    *args,
+    **kwargs,
 ):
     """
-    Extract a single archive at `location` in the `target` directory if it is of
-    a kind supported in the `kinds` kind tuple.
+    Extract a single archive file at ``location`` to the ``target`` directory if
+    this file is of a kind supported in the ``kinds`` kind tuple. Yield
+    ExtractEvents. Does not extract recursively.
     """
     warnings = []
     errors = []
-    extractor = extractcode.archive.get_extractor(location, kinds)
+    extractor = extractcode.archive.get_extractor(
+        location=location,
+        kinds=kinds,
+    )
+
     if TRACE:
         emodule = getattr(extractor, '__module__', '')
         ename = getattr(extractor, '__name__', '')
@@ -254,22 +268,23 @@ def extract_file(
         )
 
         try:
-            # extract first to a temp directory: if there is an error,  the
-            # extracted files will not be moved to target
+            # Extract first to a temp directory: if there is an error, the
+            # extracted files will not be moved to the target.
             tmp_tgt = fileutils.get_temp_dir(prefix='extractcode-extract-')
             abs_location = abspath(expanduser(location))
             warns = extractor(abs_location, tmp_tgt) or []
             warnings.extend(warns)
             fileutils.copytree(tmp_tgt, target)
             fileutils.delete(tmp_tgt)
+
         except Exception as e:
             errors = [str(e).strip(' \'"')]
             if verbose:
                 errors.append(traceback.format_exc())
             if TRACE:
                 tb = traceback.format_exc()
                 logger.debug(
-                    'extract_file: ERROR: %(location)r: %(errors)r\n%(e)r\n%(tb)s' % locals())
+                    f'extract_file: ERROR: {location}: {errors}\n{e}\n{tb}')
 
         finally:
             yield ExtractEvent(
diff --git a/tests/data/api/c.zip b/tests/data/api/c.zip
diff --git a/tests/data/api/doc.docx b/tests/data/api/doc.docx
diff --git a/tests/data/extract/all_formats/c.zip b/tests/data/extract/all_formats/c.zip
diff --git a/tests/data/extract/all_formats/doc.docx b/tests/data/extract/all_formats/doc.docx
diff --git a/tests/extractcode_assert_utils.py b/tests/extractcode_assert_utils.py
@@ -62,6 +62,7 @@ def check_files(test_dir, expected, regen=False):
             result.append(path)
 
     expected_is_json_file = False
+
     if not isinstance(expected, (list, tuple)) and expected.endswith('.json'):
         expected_is_json_file = True
         # this is a path to a JSON file
@@ -79,13 +80,13 @@ def check_files(test_dir, expected, regen=False):
     result = sorted(result)
 
     try:
-        assert expected_content == result
+        assert result == expected_content
     except AssertionError:
         files = [
             'test_dir: file://{}'.format(test_dir),
             'expected: file://{}'.format(expected if expected_is_json_file else ''),
         ]
-        assert files + expected_content == result
+        assert result == files + expected_content
 
     for location in locs:
         assert filetype.is_file(location)
diff --git a/tests/test_extract.py b/tests/test_extract.py
@@ -24,6 +24,7 @@
 from extractcode_assert_utils import check_files
 from extractcode_assert_utils import check_no_error
 from extractcode_assert_utils import BaseArchiveTestCase
+from extractcode import all_kinds
 
 project_root = os.path.dirname(os.path.dirname(__file__))
 
@@ -891,3 +892,79 @@ def test_extract_ignore_pattern(self):
         result = list(extract.extract(test_dir, recurse=True, ignore_pattern=('b*.zip',)))
         check_no_error(result)
         check_files(test_dir, expected)
+
+    def test_extract_file_ignores_archives_not_of_default_kinds(self):
+        test_dir = self.get_test_loc('extract/all_formats/doc.docx', copy=True)
+        base = fileutils.parent_directory(test_dir)
+        expected = []
+        cleaned_test_file = test_dir.replace(base, '')
+        expected_events = []
+
+        target = extractcode.get_extraction_path(test_dir)
+        result = list(extract.extract_file(test_dir, target))
+        result = [
+            r._replace(
+                source=cleaned_test_file,
+                target=extractcode.get_extraction_path(cleaned_test_file))
+            for r in result
+        ]
+        assert result == expected_events
+        check_files(target, expected)
+
+    def test_extract_file_handles_archives_of_default_kinds(self):
+        test_dir = self.get_test_loc('extract/all_formats/c.zip', copy=True)
+        base = fileutils.parent_directory(test_dir)
+        expected = [
+            'c/a/a.txt',
+            'c/b/a.txt',
+            'c/c/a.txt',
+        ]
+        cleaned_test_file = test_dir.replace(base, '')
+        expected_events = [
+            extract.ExtractEvent(
+                source=cleaned_test_file,
+                target=extractcode.get_extraction_path(cleaned_test_file),
+                done=False, warnings=[], errors=[]
+            ),
+            extract.ExtractEvent(
+                source=cleaned_test_file,
+                target=extractcode.get_extraction_path(cleaned_test_file),
+                done=True, warnings=[], errors=[]
+            )
+        ]
+
+        target = extractcode.get_extraction_path(test_dir)
+        result = list(extract.extract_file(test_dir, target))
+        result = [
+            r._replace(
+                source=cleaned_test_file,
+                target=extractcode.get_extraction_path(cleaned_test_file))
+            for r in result
+        ]
+        assert result == expected_events
+        check_files(target, expected)
+
+    def test_extract_file_works_with_all_kinds(self):
+        test_dir = self.get_test_loc('extract/all_formats/doc.docx', copy=True)
+        base = fileutils.parent_directory(test_dir)
+        expected = [
+            'c/a/a.txt',
+            'c/b/a.txt',
+            'c/c/a.txt',
+        ]
+        cleaned_test_file = test_dir.replace(base, '')
+        expected_events = [
+            extract.ExtractEvent(source='doc.docx', target='doc.docx-extract', done=False, warnings=[], errors=[]),
+            extract.ExtractEvent(source='doc.docx', target='doc.docx-extract', done=True, warnings=[], errors=[]),
+        ]
+
+        target = extractcode.get_extraction_path(test_dir)
+        result = list(extract.extract_file(test_dir, target, kinds=all_kinds))
+        result = [
+            r._replace(
+                source=cleaned_test_file,
+                target=extractcode.get_extraction_path(cleaned_test_file))
+            for r in result
+        ]
+        assert result == expected_events
+        check_files(target, expected)
diff --git a/tests/test_extractcode_api.py b/tests/test_extractcode_api.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# ScanCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/extractcode for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import os
+
+from commoncode import fileutils
+
+import extractcode
+from extractcode import extract
+from extractcode import api
+from extractcode_assert_utils import check_files
+from extractcode_assert_utils import BaseArchiveTestCase
+
+project_root = os.path.dirname(os.path.dirname(__file__))
+
+
+class TestExtractApi(BaseArchiveTestCase):
+    test_data_dir = os.path.join(os.path.dirname(__file__), 'data')
+
+    def test_extract_archive(self):
+        test_dir = self.get_test_loc('api/doc.docx', copy=True)
+        base = fileutils.parent_directory(test_dir)
+        expected = [
+            'c/a/a.txt',
+            'c/b/a.txt',
+            'c/c/a.txt',
+        ]
+
+        cleaned_test_file = test_dir.replace(base, '')
+        expected_event = [
+            extract.ExtractEvent(source='doc.docx', target='doc.docx-extract', done=False, warnings=[], errors=[]),
+            extract.ExtractEvent(source='doc.docx', target='doc.docx-extract', done=True, warnings=[], errors=[]),
+        ]
+        target = extractcode.get_extraction_path(test_dir)
+        result = list(api.extract_archive(test_dir, target))
+        result = [
+            r._replace(
+                source=cleaned_test_file,
+                target=extractcode.get_extraction_path(cleaned_test_file))
+            for r in result
+        ]
+        assert expected_event == result
+        check_files(target, expected)