61
61
from torchx .specs import AppDryRunInfo , AppDef
62
62
from torchx .runner import get_runner , Runner
63
63
from torchx .schedulers .ray_scheduler import RayJob
64
+ from torchx .schedulers .kubernetes_mcad_scheduler import KubernetesMCADJob
64
65
import pytest
65
66
66
67
@@ -1686,6 +1687,40 @@ def test_DDPJobDefinition_dry_run():
1686
1687
assert ddp_job ._scheduler == "ray"
1687
1688
1688
1689
1690
def test_DDPJobDefinition_dry_run_no_cluster():
    """
    Dry-run a DDP job definition without a cluster and inspect the result.

    Verifies that `_dry_run_no_cluster` returns an AppDryRunInfo, that its
    request, app, cfg and scheduler attributes have the expected types, and
    that the image, resource and config values asserted below appear in it.
    """
    definition = test_DDPJobDefinition_creation()
    definition.image = "fake-image"
    dry_run_info = definition._dry_run_no_cluster()

    # Exact-type checks on the returned object and its attributes.
    assert type(dry_run_info) is AppDryRunInfo
    assert dry_run_info._fmt is not None
    assert type(dry_run_info.request) is KubernetesMCADJob
    assert type(dry_run_info._app) is AppDef
    assert type(dry_run_info._cfg) is dict
    assert type(dry_run_info._scheduler) is str

    # The image set on the definition must reach the first container of the
    # first generic item in the generated request resource.
    generic_items = dry_run_info.request.resource["spec"]["resources"]["GenericItems"]
    pod_template = generic_items[0]["generictemplate"]
    first_container = pod_template.spec.containers[0]
    assert first_container.image == "fake-image"

    # Resource values carried by the first role of the app definition.
    role_resource = dry_run_info._app.roles[0].resource
    assert role_resource.cpu == 1
    assert role_resource.gpu == 0
    assert role_resource.memMB == 1024

    assert dry_run_info._cfg["requirements"] == "test"

    # Without a cluster the dry run is expected to target the MCAD scheduler.
    assert dry_run_info._scheduler == "kubernetes_mcad"
1722
+
1723
+
1689
1724
def test_DDPJobDefinition_dry_run_no_resource_args ():
1690
1725
"""
1691
1726
Test that the dry run correctly gets resources from the cluster object
@@ -1715,6 +1750,55 @@ def test_DDPJobDefinition_dry_run_no_resource_args():
1715
1750
)
1716
1751
1717
1752
1753
def test_DDPJobDefinition_dry_run_no_cluster_no_resource_args():
    """
    Test that a dry run without a cluster raises a ValueError naming the
    missing argument whenever a required job definition field is unset.

    The fields are checked one after another on the same job definition:
    image, name, cpu, gpu, memMB and j. The first call is made on the
    definition exactly as returned by test_DDPJobDefinition_creation and is
    expected to complain about the image.
    """
    ddp = test_DDPJobDefinition_creation()

    # Each step applies its attribute overrides (restoring the field blanked
    # by the previous step and blanking the next one) and then expects the
    # dry run to report the given missing argument.
    steps = [
        ({}, "image"),
        ({"image": "fake-image", "name": None}, "name"),
        ({"name": "fake", "cpu": None}, "cpu (# cpus per worker)"),
        ({"cpu": 1, "gpu": None}, "gpu (# gpus per worker)"),
        ({"gpu": 1, "memMB": None}, "memMB (memory in MB)"),
        ({"memMB": 1, "j": None}, "j (`workers`x`procs`)"),
    ]

    for overrides, missing_arg in steps:
        for attr_name, value in overrides.items():
            setattr(ddp, attr_name, value)
        # pytest.raises fails the test if no ValueError is raised, replacing
        # the manual `assert 0 == 1` sentinel inside a try block.
        with pytest.raises(ValueError) as exc_info:
            ddp._dry_run_no_cluster()
        assert str(exc_info.value) == f"Job definition missing arg: {missing_arg}"
1800
+
1801
+
1718
1802
def test_DDPJobDefinition_submit (mocker ):
1719
1803
"""
1720
1804
Tests that the submit method returns the correct type: DDPJob
@@ -1733,6 +1817,14 @@ def test_DDPJobDefinition_submit(mocker):
1733
1817
assert type (ddp_job ._app_handle ) == str
1734
1818
assert ddp_job ._app_handle == "fake-dashboard-url"
1735
1819
1820
+ ddp_def .image = "fake-image"
1821
+ ddp_job = ddp_def .submit ()
1822
+ assert type (ddp_job ) == DDPJob
1823
+ assert type (ddp_job .job_definition ) == DDPJobDefinition
1824
+ assert ddp_job .cluster == None
1825
+ assert type (ddp_job ._app_handle ) == str
1826
+ assert ddp_job ._app_handle == "fake-dashboard-url"
1827
+
1736
1828
1737
1829
def test_DDPJob_creation (mocker ):
1738
1830
ddp_def = test_DDPJobDefinition_creation ()
@@ -1757,6 +1849,29 @@ def test_DDPJob_creation(mocker):
1757
1849
return ddp_job
1758
1850
1759
1851
1852
def test_DDPJob_creation_no_cluster(mocker):
    """
    Test that a DDPJob can be created from a job definition without a cluster.

    The torchx runner's `schedule` call is patched to return a fake app
    handle. The test then checks the attributes of the created job and the
    dry-run info object that was passed to `schedule`.

    Returns:
        The created DDPJob, so that other tests can reuse it as a fixture.
    """
    ddp_def = test_DDPJobDefinition_creation()
    ddp_def.image = "fake-image"
    mocker.patch(
        "codeflare_sdk.job.jobs.torchx_runner.schedule",
        return_value="fake-app-handle",
    )  # a fake app_handle
    ddp_job = DDPJob(ddp_def, None)

    assert type(ddp_job) == DDPJob
    assert type(ddp_job.job_definition) == DDPJobDefinition
    # Identity comparison is the correct way to check for None.
    assert ddp_job.cluster is None
    assert type(ddp_job._app_handle) == str
    assert ddp_job._app_handle == "fake-app-handle"

    # The first recorded call to the patched schedule() carries the dry-run
    # info as its first positional argument.
    _, args, _ = torchx_runner.schedule.mock_calls[0]
    assert type(args[0]) == AppDryRunInfo
    job_info = args[0]
    assert type(job_info.request) == KubernetesMCADJob
    assert type(job_info._app) == AppDef
    assert type(job_info._cfg) == type(dict())
    assert type(job_info._scheduler) == type(str())
    return ddp_job
1873
+
1874
+
1760
1875
def test_DDPJob_status (mocker ):
1761
1876
ddp_job = test_DDPJob_creation (mocker )
1762
1877
mocker .patch (
@@ -1777,6 +1892,18 @@ def test_DDPJob_logs(mocker):
1777
1892
assert args [0 ] == "fake-dashboard-url"
1778
1893
1779
1894
1895
def arg_check_side_effect(*args):
    """
    Mock side effect asserting that the first positional argument it is
    called with equals the fake app handle.
    """
    first_arg = args[0]
    assert first_arg == "fake-app-handle"
1897
+
1898
+
1899
def test_DDPJob_cancel(mocker):
    """
    Test cancelling a DDPJob that was created without a cluster.

    The torchx runner's `cancel` is patched with arg_check_side_effect,
    which asserts that its first argument is the fake app handle.
    """
    job = test_DDPJob_creation_no_cluster(mocker)
    cancel_target = "codeflare_sdk.job.jobs.torchx_runner.cancel"
    mocker.patch(cancel_target, side_effect=arg_check_side_effect)
    job.cancel()
1905
+
1906
+
1780
1907
def parse_j (cmd ):
1781
1908
1782
1909
pattern = r"--nnodes\s+\d+\s+--nproc_per_node\s+\d+"
0 commit comments