Skip to content

Commit ae0cafa

Browse files
committed
feat(run-task): implement efficient git clones
1 parent dfbad3b commit ae0cafa

File tree

3 files changed

+391
-26
lines changed

3 files changed

+391
-26
lines changed

src/taskgraph/run-task/run-task

Lines changed: 75 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,8 @@ def git_checkout(
600600
commit: Optional[str],
601601
ssh_key_file: Optional[Path],
602602
ssh_known_hosts_file: Optional[Path],
603+
efficient_clone: bool = False,
604+
sparse_dirs: Optional[str] = None,
603605
):
604606
env = {
605607
# abort if transfer speed is lower than 1kB/s for 1 minute
@@ -636,22 +638,43 @@ def git_checkout(
636638
args = [
637639
"git",
638640
"clone",
641+
]
642+
643+
if efficient_clone:
644+
# Use blobless clone for faster initial clone
645+
# This fetches commit and tree objects but not file contents
646+
args.extend(["--filter=blob:none"])
647+
# Use shallow clone with depth 1 for minimal history
648+
args.extend(["--depth=1"])
649+
# Skip checkout initially, we'll do sparse checkout later
650+
args.extend(["--no-checkout"])
651+
elif sparse_dirs:
652+
# For sparse checkout without efficient clone, still skip initial checkout
653+
# so we can set up sparse checkout before checking out files
654+
args.extend(["--no-checkout"])
655+
656+
args.extend([
639657
base_repo if base_repo else head_repo,
640658
destination_path,
641-
]
659+
])
642660

643661
retry_required_command(b"vcs", args, extra_env=env)
644662

645663
if base_ref:
646-
args = ["git", "fetch", "origin", base_ref]
664+
args = ["git", "fetch"]
665+
if efficient_clone:
666+
# For shallow clones, we need to deepen to fetch more history
667+
args.extend(["--depth=100"])
668+
args.extend(["origin", base_ref])
647669

648670
retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env)
649671

650672
# Create local branch so that taskgraph is able to compute differences
651673
# between the head branch and the base one, if needed
652-
args = ["git", "checkout", base_ref]
653-
654-
retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env)
674+
if not efficient_clone and not sparse_dirs:
675+
# Only checkout if we didn't use --no-checkout initially
676+
args = ["git", "checkout", base_ref]
677+
retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env)
655678

656679
# When commits are force-pushed (like on a testing branch), base_rev doesn't
657680
# exist on base_ref. Fetching it allows taskgraph to compute differences
@@ -660,7 +683,11 @@ def git_checkout(
660683
# Unlike base_ref just above, there is no need to checkout the revision:
661684
# it's immediately available after the fetch.
662685
if base_rev and base_rev != NULL_REVISION:
663-
args = ["git", "fetch", "origin", base_rev]
686+
args = ["git", "fetch"]
687+
if efficient_clone:
688+
# For shallow clones, we need to deepen to fetch more history
689+
args.extend(["--depth=100"])
690+
args.extend(["origin", base_rev])
664691

665692
retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env)
666693

@@ -671,28 +698,44 @@ def git_checkout(
671698
# in not having a tag, or worse: having an outdated version of one.
672699
# `--force` is needed to be able to update an existing tag.
673700
if ref and base_repo == head_repo:
674-
args = [
675-
"git",
676-
"fetch",
677-
"--tags",
678-
"--force",
679-
base_repo,
680-
ref,
681-
]
701+
args = ["git", "fetch"]
702+
if efficient_clone:
703+
# For shallow clones, we need to deepen to fetch more history
704+
args.extend(["--depth=100"])
705+
args.extend(["--tags", "--force", base_repo, ref])
682706

683707
retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env)
684708

685709
# If a ref isn't provided, we fetch all refs from head_repo, which may be slow
686-
args = [
687-
"git",
688-
"fetch",
689-
"--no-tags",
690-
head_repo,
691-
ref if ref else "+refs/heads/*:refs/remotes/work/*",
692-
]
710+
args = ["git", "fetch"]
711+
if efficient_clone:
712+
# For shallow clones, we need to deepen to fetch more history
713+
args.extend(["--depth=100"])
714+
# With blobless clones, we only fetch the blobs we need
715+
args.extend(["--filter=blob:none"])
716+
args.extend(["--no-tags", head_repo, ref if ref else "+refs/heads/*:refs/remotes/work/*"])
693717

694718
retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env)
695719

720+
if sparse_dirs:
721+
# When sparse directories/files are specified, set up sparse checkout
722+
# The sparse_dirs should be a colon-separated list of directories or files
723+
#
724+
# Note: Git's sparse-checkout behavior in cone mode (default since Git 2.37):
725+
# - Root-level files: Always checked out in cone mode, whether or not they are listed
726+
# - Files in subdirectories: Files directly inside each parent directory are included (sibling subdirectories are not)
727+
# - Directories: All contents included
728+
729+
# Enable sparse checkout (cone mode is default since Git 2.37)
730+
args = ["git", "sparse-checkout", "init"]
731+
run_required_command(b"vcs", args, cwd=destination_path)
732+
733+
# Set the sparse entries
734+
entries = sparse_dirs.split(":")
735+
args = ["git", "sparse-checkout", "set"] + entries
736+
run_required_command(b"vcs", args, cwd=destination_path)
737+
738+
# Now do the actual checkout
696739
args = [
697740
"git",
698741
"checkout",
@@ -879,11 +922,17 @@ def add_vcs_arguments(parser, project, name):
879922
"--%s-sparse-profile" % project,
880923
help="Path to sparse profile for %s checkout" % name,
881924
)
925+
parser.add_argument(
926+
"--%s-efficient-clone" % project,
927+
action="store_true",
928+
help="Use efficient cloning strategies (blobless, shallow, no-checkout) for %s" % name,
929+
)
882930

883931

884932
def collect_vcs_options(args, project, name):
885933
checkout = getattr(args, "%s_checkout" % project)
886934
sparse_profile = getattr(args, "%s_sparse_profile" % project)
935+
efficient_clone = getattr(args, "%s_efficient_clone" % project)
887936

888937
env_prefix = project.upper()
889938

@@ -896,6 +945,7 @@ def collect_vcs_options(args, project, name):
896945
ref = os.environ.get("%s_HEAD_REF" % env_prefix)
897946
pip_requirements = os.environ.get("%s_PIP_REQUIREMENTS" % env_prefix)
898947
private_key_secret = os.environ.get("%s_SSH_SECRET_NAME" % env_prefix)
948+
sparse_dirs = os.environ.get("%s_SPARSE_DIRS" % env_prefix)
899949

900950
store_path = os.environ.get("HG_STORE_PATH")
901951

@@ -930,6 +980,8 @@ def collect_vcs_options(args, project, name):
930980
"repo-type": repo_type,
931981
"ssh-secret-name": private_key_secret,
932982
"pip-requirements": pip_requirements,
983+
"efficient-clone": efficient_clone,
984+
"sparse-dirs": sparse_dirs,
933985
}
934986

935987

@@ -978,6 +1030,8 @@ def vcs_checkout_from_args(options):
9781030
revision,
9791031
ssh_key_file,
9801032
ssh_known_hosts_file,
1033+
options.get("efficient-clone", False),
1034+
options.get("sparse-dirs"),
9811035
)
9821036
elif options["repo-type"] == "hg":
9831037
if not revision and not ref:

src/taskgraph/util/vcs.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,10 @@ def base_rev(self):
388388
def branch(self):
389389
return self.run("branch", "--show-current").strip() or None
390390

391+
@property
392+
def is_shallow(self):
393+
return self.run("rev-parse", "--is-shallow-repository").strip() == "true"
394+
391395
@property
392396
def all_remote_names(self):
393397
remotes = self.run("remote").splitlines()
@@ -546,10 +550,25 @@ def update(self, ref):
546550
self.run("checkout", ref)
547551

548552
def find_latest_common_revision(self, base_ref_or_rev, head_rev):
549-
try:
550-
return self.run("merge-base", base_ref_or_rev, head_rev).strip()
551-
except subprocess.CalledProcessError:
552-
return self.NULL_REVISION
553+
554+
def run_merge_base():
555+
try:
556+
return self.run("merge-base", base_ref_or_rev, head_rev).strip()
557+
except subprocess.CalledProcessError:
558+
return None
559+
560+
if not self.is_shallow:
561+
return run_merge_base() or self.NULL_REVISION
562+
563+
deepen = 10
564+
rev = run_merge_base()
565+
while not rev:
566+
self.run("fetch", "--deepen", str(deepen), self.remote_name)
567+
rev = run_merge_base()
568+
deepen = deepen * 10
569+
570+
return rev
571+
553572

554573
def does_revision_exist_locally(self, revision):
555574
try:

0 commit comments

Comments
 (0)