Skip to content

Commit 35b1abd

Browse files
authored
Add Job Cancelling and More Unit Tests (#79)
* Job cancel + first unit test * Rest of unit tests * Review feedback
1 parent 8266307 commit 35b1abd

File tree

2 files changed

+131
-1
lines changed

2 files changed

+131
-1
lines changed

src/codeflare_sdk/job/jobs.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ def submit(self, cluster: "Cluster" = None) -> "Job":
153153

154154

155155
class DDPJob(Job):
156-
def __init__(self, job_definition: "DDPJobDefinition", cluster: "Cluster"):
156+
def __init__(self, job_definition: "DDPJobDefinition", cluster: "Cluster" = None):
157157
self.job_definition = job_definition
158158
self.cluster = cluster
159159
if self.cluster:
@@ -169,3 +169,6 @@ def status(self) -> str:
169169

170170
def logs(self) -> str:
171171
return "".join(torchx_runner.log_lines(self._app_handle, None))
172+
173+
def cancel(self):
174+
torchx_runner.cancel(self._app_handle)

tests/unit_test.py

+127
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
from torchx.specs import AppDryRunInfo, AppDef
6262
from torchx.runner import get_runner, Runner
6363
from torchx.schedulers.ray_scheduler import RayJob
64+
from torchx.schedulers.kubernetes_mcad_scheduler import KubernetesMCADJob
6465
import pytest
6566

6667

@@ -1686,6 +1687,40 @@ def test_DDPJobDefinition_dry_run():
16861687
assert ddp_job._scheduler == "ray"
16871688

16881689

1690+
def test_DDPJobDefinition_dry_run_no_cluster():
1691+
"""
1692+
Test that the dry run method returns the correct type: AppDryRunInfo,
1693+
that the attributes of the returned object are of the correct type,
1694+
and that the values from cluster and job definition are correctly passed.
1695+
"""
1696+
ddp = test_DDPJobDefinition_creation()
1697+
ddp.image = "fake-image"
1698+
ddp_job = ddp._dry_run_no_cluster()
1699+
assert type(ddp_job) == AppDryRunInfo
1700+
assert ddp_job._fmt is not None
1701+
assert type(ddp_job.request) == KubernetesMCADJob
1702+
assert type(ddp_job._app) == AppDef
1703+
assert type(ddp_job._cfg) == type(dict())
1704+
assert type(ddp_job._scheduler) == type(str())
1705+
1706+
assert (
1707+
ddp_job.request.resource["spec"]["resources"]["GenericItems"][0][
1708+
"generictemplate"
1709+
]
1710+
.spec.containers[0]
1711+
.image
1712+
== "fake-image"
1713+
)
1714+
1715+
assert ddp_job._app.roles[0].resource.cpu == 1
1716+
assert ddp_job._app.roles[0].resource.gpu == 0
1717+
assert ddp_job._app.roles[0].resource.memMB == 1024
1718+
1719+
assert ddp_job._cfg["requirements"] == "test"
1720+
1721+
assert ddp_job._scheduler == "kubernetes_mcad"
1722+
1723+
16891724
def test_DDPJobDefinition_dry_run_no_resource_args():
16901725
"""
16911726
Test that the dry run correctly gets resources from the cluster object
@@ -1715,6 +1750,55 @@ def test_DDPJobDefinition_dry_run_no_resource_args():
17151750
)
17161751

17171752

1753+
def test_DDPJobDefinition_dry_run_no_cluster_no_resource_args():
1754+
"""
1755+
Test that the dry run method returns the correct type: AppDryRunInfo,
1756+
that the attributes of the returned object are of the correct type,
1757+
and that the values from cluster and job definition are correctly passed.
1758+
"""
1759+
ddp = test_DDPJobDefinition_creation()
1760+
try:
1761+
ddp._dry_run_no_cluster()
1762+
assert 0 == 1
1763+
except ValueError as e:
1764+
assert str(e) == "Job definition missing arg: image"
1765+
ddp.image = "fake-image"
1766+
ddp.name = None
1767+
try:
1768+
ddp._dry_run_no_cluster()
1769+
assert 0 == 1
1770+
except ValueError as e:
1771+
assert str(e) == "Job definition missing arg: name"
1772+
ddp.name = "fake"
1773+
ddp.cpu = None
1774+
try:
1775+
ddp._dry_run_no_cluster()
1776+
assert 0 == 1
1777+
except ValueError as e:
1778+
assert str(e) == "Job definition missing arg: cpu (# cpus per worker)"
1779+
ddp.cpu = 1
1780+
ddp.gpu = None
1781+
try:
1782+
ddp._dry_run_no_cluster()
1783+
assert 0 == 1
1784+
except ValueError as e:
1785+
assert str(e) == "Job definition missing arg: gpu (# gpus per worker)"
1786+
ddp.gpu = 1
1787+
ddp.memMB = None
1788+
try:
1789+
ddp._dry_run_no_cluster()
1790+
assert 0 == 1
1791+
except ValueError as e:
1792+
assert str(e) == "Job definition missing arg: memMB (memory in MB)"
1793+
ddp.memMB = 1
1794+
ddp.j = None
1795+
try:
1796+
ddp._dry_run_no_cluster()
1797+
assert 0 == 1
1798+
except ValueError as e:
1799+
assert str(e) == "Job definition missing arg: j (`workers`x`procs`)"
1800+
1801+
17181802
def test_DDPJobDefinition_submit(mocker):
17191803
"""
17201804
Tests that the submit method returns the correct type: DDPJob
@@ -1733,6 +1817,14 @@ def test_DDPJobDefinition_submit(mocker):
17331817
assert type(ddp_job._app_handle) == str
17341818
assert ddp_job._app_handle == "fake-dashboard-url"
17351819

1820+
ddp_def.image = "fake-image"
1821+
ddp_job = ddp_def.submit()
1822+
assert type(ddp_job) == DDPJob
1823+
assert type(ddp_job.job_definition) == DDPJobDefinition
1824+
assert ddp_job.cluster == None
1825+
assert type(ddp_job._app_handle) == str
1826+
assert ddp_job._app_handle == "fake-dashboard-url"
1827+
17361828

17371829
def test_DDPJob_creation(mocker):
17381830
ddp_def = test_DDPJobDefinition_creation()
@@ -1757,6 +1849,29 @@ def test_DDPJob_creation(mocker):
17571849
return ddp_job
17581850

17591851

1852+
def test_DDPJob_creation_no_cluster(mocker):
1853+
ddp_def = test_DDPJobDefinition_creation()
1854+
ddp_def.image = "fake-image"
1855+
mocker.patch(
1856+
"codeflare_sdk.job.jobs.torchx_runner.schedule",
1857+
return_value="fake-app-handle",
1858+
) # a fake app_handle
1859+
ddp_job = DDPJob(ddp_def, None)
1860+
assert type(ddp_job) == DDPJob
1861+
assert type(ddp_job.job_definition) == DDPJobDefinition
1862+
assert ddp_job.cluster == None
1863+
assert type(ddp_job._app_handle) == str
1864+
assert ddp_job._app_handle == "fake-app-handle"
1865+
_, args, kwargs = torchx_runner.schedule.mock_calls[0]
1866+
assert type(args[0]) == AppDryRunInfo
1867+
job_info = args[0]
1868+
assert type(job_info.request) == KubernetesMCADJob
1869+
assert type(job_info._app) == AppDef
1870+
assert type(job_info._cfg) == type(dict())
1871+
assert type(job_info._scheduler) == type(str())
1872+
return ddp_job
1873+
1874+
17601875
def test_DDPJob_status(mocker):
17611876
ddp_job = test_DDPJob_creation(mocker)
17621877
mocker.patch(
@@ -1777,6 +1892,18 @@ def test_DDPJob_logs(mocker):
17771892
assert args[0] == "fake-dashboard-url"
17781893

17791894

1895+
def arg_check_side_effect(*args):
1896+
assert args[0] == "fake-app-handle"
1897+
1898+
1899+
def test_DDPJob_cancel(mocker):
1900+
ddp_job = test_DDPJob_creation_no_cluster(mocker)
1901+
mocker.patch(
1902+
"codeflare_sdk.job.jobs.torchx_runner.cancel", side_effect=arg_check_side_effect
1903+
)
1904+
ddp_job.cancel()
1905+
1906+
17801907
def parse_j(cmd):
17811908

17821909
pattern = r"--nnodes\s+\d+\s+--nproc_per_node\s+\d+"

0 commit comments

Comments
 (0)