Skip to content

Commit

Permalink
Add skip_download option to download_folder
Browse files Browse the repository at this point in the history
  • Loading branch information
wkentaro committed Feb 3, 2024
1 parent 71ea616 commit f3bee0c
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 34 deletions.
90 changes: 56 additions & 34 deletions gdown/download_folder.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import collections
import itertools
import json
import os
import os.path as osp
import re
import sys
import warnings
from typing import List
from typing import Union

import bs4

Expand Down Expand Up @@ -188,6 +191,11 @@ def _get_directory_structure(gdrive_file, previous_path):
return directory_structure


# Lightweight record for one file listed in a Google Drive folder.
# `id` is the Drive file id, `path` is the entry's path taken from the
# directory structure, and `local_path` is that path joined onto the local
# output directory.
GoogleDriveFileToDownload = collections.namedtuple(
    typename="GoogleDriveFileToDownload",
    field_names=["id", "path", "local_path"],
)


def download_folder(
url=None,
id=None,
Expand All @@ -199,7 +207,8 @@ def download_folder(
remaining_ok=False,
verify=True,
user_agent=None,
):
skip_download: bool = False,
) -> Union[List[str], List[GoogleDriveFileToDownload], None]:
"""Downloads entire folder from URL.
Parameters
Expand All @@ -226,11 +235,16 @@ def download_folder(
to a CA bundle to use. Default is True.
user_agent: str, optional
User-agent to use in the HTTP request.
skip_download: bool, optional
If True, return the list of files to download without downloading them.
Defaults to False.
Returns
-------
filenames: list of str
List of files downloaded, or None if failed.
files: List[str] or List[GoogleDriveFileToDownload] or None
If skip_download is False, list of local file paths downloaded or None if failed.
If skip_download is True, list of GoogleDriveFileToDownload that contains
id, path, and local_path.
Example
-------
Expand All @@ -251,53 +265,61 @@ def download_folder(

if not quiet:
print("Retrieving folder contents", file=sys.stderr)
return_code, gdrive_file = _download_and_parse_google_drive_link(
is_success, gdrive_file = _download_and_parse_google_drive_link(
sess,
url,
quiet=quiet,
remaining_ok=remaining_ok,
verify=verify,
)
if not is_success:
print("Failed to retrieve folder contents", file=sys.stderr)
return None

if not return_code:
return return_code
if not quiet:
print("Retrieving folder contents completed", file=sys.stderr)
print("Building directory structure", file=sys.stderr)
directory_structure = _get_directory_structure(gdrive_file, previous_path="")
if not quiet:
print("Building directory structure completed", file=sys.stderr)

if output is None:
output = os.getcwd() + osp.sep
if output.endswith(osp.sep):
root_folder = osp.join(output, gdrive_file.name)
root_dir = osp.join(output, gdrive_file.name)
else:
root_folder = output
directory_structure = _get_directory_structure(gdrive_file, root_folder)
if not osp.exists(root_folder):
os.makedirs(root_folder)
root_dir = output
if not osp.exists(root_dir):
os.makedirs(root_dir)

if not quiet:
print("Building directory structure completed", file=sys.stderr)
filenames = []
for file_id, file_path in directory_structure:
if file_id is None: # folder
if not osp.exists(file_path):
os.makedirs(file_path)
continue
files = []
for id, path in directory_structure:
local_path = osp.join(root_dir, path)

filename = download(
url="https://drive.google.com/uc?id=" + file_id,
output=file_path,
quiet=quiet,
proxy=proxy,
speed=speed,
use_cookies=use_cookies,
verify=verify,
)
if id is None: # folder
if not skip_download and not osp.exists(local_path):
os.makedirs(local_path)
continue

if filename is None:
if not quiet:
print("Download ended unsuccessfully", file=sys.stderr)
return
filenames.append(filename)
if skip_download:
files.append(
GoogleDriveFileToDownload(id=id, path=path, local_path=local_path)
)
else:
local_path = download(
url="https://drive.google.com/uc?id=" + id,
output=local_path,
quiet=quiet,
proxy=proxy,
speed=speed,
use_cookies=use_cookies,
verify=verify,
)
if local_path is None:
if not quiet:
print("Download ended unsuccessfully", file=sys.stderr)
return None
files.append(local_path)
if not quiet:
print("Download completed", file=sys.stderr)
return filenames
return files
13 changes: 13 additions & 0 deletions tests/test_download_folder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os.path as osp
import tempfile

from gdown.download_folder import _parse_google_drive_file
from gdown.download_folder import download_folder

here = osp.dirname(osp.abspath(__file__))

Expand Down Expand Up @@ -58,3 +60,14 @@ def test_valid_page():
assert actual_children_ids == expected_children_ids
assert actual_children_names == expected_children_names
assert actual_children_types == expected_children_types


def test_download_folder_dry_run():
    """Check that ``skip_download=True`` lists folder files without downloading.

    Calls ``download_folder`` on a public Google Drive folder with
    ``skip_download=True`` and verifies that it returns six entries, each
    exposing ``id``, ``path`` and ``local_path`` attributes.
    """
    url = "https://drive.google.com/drive/folders/1KpLl_1tcK0eeehzN980zbG-3M2nhbVks"
    # TemporaryDirectory removes the directory when the block exits, so the
    # test no longer leaves a stray temp directory behind on every run.
    with tempfile.TemporaryDirectory() as tmp_dir:
        files = download_folder(url=url, output=tmp_dir, skip_download=True)

    # download_folder returns None when the folder listing cannot be retrieved;
    # fail with a clear message instead of a TypeError from len(None).
    assert files is not None, "download_folder failed to retrieve folder contents"
    assert len(files) == 6
    for file in files:
        assert hasattr(file, "id")
        assert hasattr(file, "path")
        assert hasattr(file, "local_path")

0 comments on commit f3bee0c

Please sign in to comment.