diff --git a/gdown/download_folder.py b/gdown/download_folder.py index 7814162b..81d34bd7 100644 --- a/gdown/download_folder.py +++ b/gdown/download_folder.py @@ -1,3 +1,4 @@ +import collections import itertools import json import os @@ -5,6 +6,8 @@ import re import sys import warnings +from typing import List +from typing import Union import bs4 @@ -188,6 +191,11 @@ def _get_directory_structure(gdrive_file, previous_path): return directory_structure +GoogleDriveFileToDownload = collections.namedtuple( + "GoogleDriveFileToDownload", ("id", "path", "local_path") +) + + def download_folder( url=None, id=None, @@ -199,7 +207,8 @@ def download_folder( remaining_ok=False, verify=True, user_agent=None, -): + skip_download: bool = False, +) -> Union[List[str], List[GoogleDriveFileToDownload], None]: """Downloads entire folder from URL. Parameters @@ -226,11 +235,16 @@ def download_folder( to a CA bundle to use. Default is True. user_agent: str, optional User-agent to use in the HTTP request. + skip_download: bool, optional + If True, return the list of files to download without downloading them. + Defaults to False. Returns ------- - filenames: list of str - List of files downloaded, or None if failed. + files: List[str] or List[GoogleDriveFileToDownload] or None + If skip_download is False, list of local file paths downloaded or None if failed. + If skip_download is True, list of GoogleDriveFileToDownload that contains + id, path, and local_path. 
Example ------- @@ -251,53 +265,61 @@ def download_folder( if not quiet: print("Retrieving folder contents", file=sys.stderr) - return_code, gdrive_file = _download_and_parse_google_drive_link( + is_success, gdrive_file = _download_and_parse_google_drive_link( sess, url, quiet=quiet, remaining_ok=remaining_ok, verify=verify, ) + if not is_success: + print("Failed to retrieve folder contents", file=sys.stderr) + return None - if not return_code: - return return_code if not quiet: print("Retrieving folder contents completed", file=sys.stderr) print("Building directory structure", file=sys.stderr) + directory_structure = _get_directory_structure(gdrive_file, previous_path="") + if not quiet: + print("Building directory structure completed", file=sys.stderr) + if output is None: output = os.getcwd() + osp.sep if output.endswith(osp.sep): - root_folder = osp.join(output, gdrive_file.name) + root_dir = osp.join(output, gdrive_file.name) else: - root_folder = output - directory_structure = _get_directory_structure(gdrive_file, root_folder) - if not osp.exists(root_folder): - os.makedirs(root_folder) + root_dir = output + if not osp.exists(root_dir): + os.makedirs(root_dir) - if not quiet: - print("Building directory structure completed") - filenames = [] - for file_id, file_path in directory_structure: - if file_id is None: # folder - if not osp.exists(file_path): - os.makedirs(file_path) - continue + files = [] + for id, path in directory_structure: + local_path = osp.join(root_dir, path) - filename = download( - url="https://drive.google.com/uc?id=" + file_id, - output=file_path, - quiet=quiet, - proxy=proxy, - speed=speed, - use_cookies=use_cookies, - verify=verify, - ) + if id is None: # folder + if not skip_download and not osp.exists(local_path): + os.makedirs(local_path) + continue - if filename is None: - if not quiet: - print("Download ended unsuccessfully", file=sys.stderr) - return - filenames.append(filename) + if skip_download: + files.append( + 
GoogleDriveFileToDownload(id=id, path=path, local_path=local_path) + ) + else: + local_path = download( + url="https://drive.google.com/uc?id=" + id, + output=local_path, + quiet=quiet, + proxy=proxy, + speed=speed, + use_cookies=use_cookies, + verify=verify, + ) + if local_path is None: + if not quiet: + print("Download ended unsuccessfully", file=sys.stderr) + return None + files.append(local_path) if not quiet: print("Download completed", file=sys.stderr) - return filenames + return files diff --git a/tests/test_download_folder.py b/tests/test_download_folder.py index 7fc0e6d5..f1ce332f 100644 --- a/tests/test_download_folder.py +++ b/tests/test_download_folder.py @@ -1,6 +1,8 @@ import os.path as osp +import tempfile from gdown.download_folder import _parse_google_drive_file +from gdown.download_folder import download_folder here = osp.dirname(osp.abspath(__file__)) @@ -58,3 +60,14 @@ def test_valid_page(): assert actual_children_ids == expected_children_ids assert actual_children_names == expected_children_names assert actual_children_types == expected_children_types + + +def test_download_folder_dry_run(): + url = "https://drive.google.com/drive/folders/1KpLl_1tcK0eeehzN980zbG-3M2nhbVks" + tmp_dir = tempfile.mkdtemp() + files = download_folder(url=url, output=tmp_dir, skip_download=True) + assert len(files) == 6 + for file in files: + assert hasattr(file, "id") + assert hasattr(file, "path") + assert hasattr(file, "local_path")