Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add skip_download option to download_folder to return the list of files without download #317

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 56 additions & 34 deletions gdown/download_folder.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import collections
import itertools
import json
import os
import os.path as osp
import re
import sys
import warnings
from typing import List
from typing import Union

import bs4

Expand Down Expand Up @@ -188,6 +191,11 @@ def _get_directory_structure(gdrive_file, previous_path):
return directory_structure


# Lightweight record describing one remote file: its Google Drive id, its
# path relative to the folder root, and the local path it maps to on disk.
GoogleDriveFileToDownload = collections.namedtuple(
    typename="GoogleDriveFileToDownload",
    field_names=["id", "path", "local_path"],
)


def download_folder(
url=None,
id=None,
Expand All @@ -199,7 +207,8 @@ def download_folder(
remaining_ok=False,
verify=True,
user_agent=None,
):
skip_download: bool = False,
) -> Union[List[str], List[GoogleDriveFileToDownload], None]:
"""Downloads entire folder from URL.

Parameters
Expand All @@ -226,11 +235,16 @@ def download_folder(
to a CA bundle to use. Default is True.
user_agent: str, optional
User-agent to use in the HTTP request.
skip_download: bool, optional
If True, return the list of files to download without downloading them.
Defaults to False.

Returns
-------
filenames: list of str
List of files downloaded, or None if failed.
files: List[str] or List[GoogleDriveFileToDownload] or None
If skip_download is False, list of local file paths downloaded or None if failed.
If skip_download is True, list of GoogleDriveFileToDownload that contains
id, path, and local_path.

Example
-------
Expand All @@ -251,53 +265,61 @@ def download_folder(

if not quiet:
print("Retrieving folder contents", file=sys.stderr)
return_code, gdrive_file = _download_and_parse_google_drive_link(
is_success, gdrive_file = _download_and_parse_google_drive_link(
sess,
url,
quiet=quiet,
remaining_ok=remaining_ok,
verify=verify,
)
if not is_success:
print("Failed to retrieve folder contents", file=sys.stderr)
return None

if not return_code:
return return_code
if not quiet:
print("Retrieving folder contents completed", file=sys.stderr)
print("Building directory structure", file=sys.stderr)
directory_structure = _get_directory_structure(gdrive_file, previous_path="")
if not quiet:
print("Building directory structure completed", file=sys.stderr)

if output is None:
output = os.getcwd() + osp.sep
if output.endswith(osp.sep):
root_folder = osp.join(output, gdrive_file.name)
root_dir = osp.join(output, gdrive_file.name)
else:
root_folder = output
directory_structure = _get_directory_structure(gdrive_file, root_folder)
if not osp.exists(root_folder):
os.makedirs(root_folder)
root_dir = output
if not osp.exists(root_dir):
os.makedirs(root_dir)

if not quiet:
print("Building directory structure completed")
filenames = []
for file_id, file_path in directory_structure:
if file_id is None: # folder
if not osp.exists(file_path):
os.makedirs(file_path)
continue
files = []
for id, path in directory_structure:
local_path = osp.join(root_dir, path)

filename = download(
url="https://drive.google.com/uc?id=" + file_id,
output=file_path,
quiet=quiet,
proxy=proxy,
speed=speed,
use_cookies=use_cookies,
verify=verify,
)
if id is None: # folder
if not skip_download and not osp.exists(local_path):
os.makedirs(local_path)
continue

if filename is None:
if not quiet:
print("Download ended unsuccessfully", file=sys.stderr)
return
filenames.append(filename)
if skip_download:
files.append(
GoogleDriveFileToDownload(id=id, path=path, local_path=local_path)
)
else:
local_path = download(
url="https://drive.google.com/uc?id=" + id,
output=local_path,
quiet=quiet,
proxy=proxy,
speed=speed,
use_cookies=use_cookies,
verify=verify,
)
if local_path is None:
if not quiet:
print("Download ended unsuccessfully", file=sys.stderr)
return None
files.append(local_path)
if not quiet:
print("Download completed", file=sys.stderr)
return filenames
return files
13 changes: 13 additions & 0 deletions tests/test_download_folder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os.path as osp
import tempfile

from gdown.download_folder import _parse_google_drive_file
from gdown.download_folder import download_folder

here = osp.dirname(osp.abspath(__file__))

Expand Down Expand Up @@ -58,3 +60,14 @@ def test_valid_page():
assert actual_children_ids == expected_children_ids
assert actual_children_names == expected_children_names
assert actual_children_types == expected_children_types


def test_download_folder_dry_run():
    """Check that ``skip_download=True`` lists the folder's files.

    Calls ``download_folder`` on a public Google Drive folder and verifies
    that six entries come back, each exposing the ``id``, ``path`` and
    ``local_path`` attributes.
    """
    url = "https://drive.google.com/drive/folders/1KpLl_1tcK0eeehzN980zbG-3M2nhbVks"
    # Use a context-managed temporary directory so the scratch directory is
    # removed when the test finishes instead of being left behind by mkdtemp().
    with tempfile.TemporaryDirectory() as tmp_dir:
        files = download_folder(url=url, output=tmp_dir, skip_download=True)

    # download_folder returns None on failure; fail with a clear assertion
    # rather than a TypeError from len(None).
    assert files is not None
    assert len(files) == 6
    for entry in files:
        assert hasattr(entry, "id")
        assert hasattr(entry, "path")
        assert hasattr(entry, "local_path")
Loading