|
16 | 16 | import sys
|
17 | 17 | import filecmp
|
18 | 18 | import os
|
| 19 | +import re |
19 | 20 |
|
20 | 21 | parent = Path(__file__).resolve().parents[1]
|
21 | 22 | sys.path.append(str(parent) + "/src")
|
|
46 | 47 | RayClusterStatus,
|
47 | 48 | CodeFlareClusterStatus,
|
48 | 49 | )
|
| 50 | +from codeflare_sdk.job.jobs import ( |
| 51 | + JobDefinition, |
| 52 | + Job, |
| 53 | + DDPJobDefinition, |
| 54 | + DDPJob, |
| 55 | + torchx_runner, |
| 56 | +) |
49 | 57 | import openshift
|
50 | 58 | from openshift import OpenShiftPythonException
|
51 | 59 | from openshift.selector import Selector
|
52 | 60 | import ray
|
| 61 | +from torchx.specs import AppDryRunInfo, AppDef |
| 62 | +from torchx.runner import get_runner, Runner |
| 63 | +from torchx.schedulers.ray_scheduler import RayJob |
53 | 64 | import pytest
|
54 | 65 |
|
55 | 66 |
|
@@ -1535,6 +1546,7 @@ def test_cluster_status(mocker):
|
1535 | 1546 | mocker.patch(
|
1536 | 1547 | "codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=fake_ray
|
1537 | 1548 | )
|
| 1549 | + |
1538 | 1550 | status, ready = cf.status()
|
1539 | 1551 | assert status == CodeFlareClusterStatus.STARTING
|
1540 | 1552 | assert ready == False
|
@@ -1594,3 +1606,186 @@ def test_cmd_line_generation():
|
1594 | 1606 | def test_cleanup():
|
1595 | 1607 | os.remove("test.yaml")
|
1596 | 1608 | os.remove("raytest2.yaml")
|
| 1609 | + |
| 1610 | + |
def test_jobdefinition_coverage():
    """Invoke the abstract JobDefinition hooks so coverage sees them."""
    base_definition = JobDefinition()
    target_cluster = Cluster(test_config_creation())
    base_definition._dry_run(target_cluster)
    base_definition.submit(target_cluster)
| 1616 | + |
| 1617 | + |
def test_job_coverage():
    """Invoke the abstract Job hooks so coverage sees them."""
    base_job = Job()
    base_job.status()
    base_job.logs()
| 1622 | + |
| 1623 | + |
def test_DDPJobDefinition_creation():
    """Build a fully-specified DDPJobDefinition and verify every field is stored.

    Returns the definition so the dry-run/submit tests can reuse it as a
    fixture.
    """
    ddp = DDPJobDefinition(
        script="test.py",
        m=None,
        script_args=["test"],
        name="test",
        cpu=1,
        gpu=0,
        memMB=1024,
        h=None,
        j="2x1",
        env={"test": "test"},
        max_retries=0,
        mounts=[],
        rdzv_port=29500,
        scheduler_args={"requirements": "test"},
    )
    assert ddp.script == "test.py"
    assert ddp.m is None  # PEP 8: compare to None with `is`, not `==`
    assert ddp.script_args == ["test"]
    assert ddp.name == "test"
    assert ddp.cpu == 1
    assert ddp.gpu == 0
    assert ddp.memMB == 1024
    assert ddp.h is None  # PEP 8: compare to None with `is`, not `==`
    assert ddp.j == "2x1"
    assert ddp.env == {"test": "test"}
    assert ddp.max_retries == 0
    assert ddp.mounts == []
    assert ddp.rdzv_port == 29500
    assert ddp.scheduler_args == {"requirements": "test"}
    return ddp
| 1656 | + |
| 1657 | + |
def test_DDPJobDefinition_dry_run():
    """
    Verify that _dry_run returns an AppDryRunInfo whose request, app,
    config, and scheduler attributes have the right types and carry the
    values supplied by the job definition and the target cluster.
    """
    definition = test_DDPJobDefinition_creation()
    target_cluster = Cluster(test_config_creation())
    info = definition._dry_run(target_cluster)

    # Returned object and its attributes have the expected types.
    assert type(info) is AppDryRunInfo
    assert info._fmt is not None
    assert type(info.request) is RayJob
    assert type(info._app) is AppDef
    assert type(info._cfg) is dict
    assert type(info._scheduler) is str

    # Cluster / job-definition values are threaded into the Ray request.
    assert info.request.app_id.startswith("test")
    assert info.request.working_dir.startswith("/tmp/torchx_workspace")
    assert info.request.cluster_name == "unit-test-cluster"
    assert info.request.requirements == "test"

    # Resources come from the job definition (explicitly set above).
    role_resource = info._app.roles[0].resource
    assert role_resource.cpu == 1
    assert role_resource.gpu == 0
    assert role_resource.memMB == 1024

    assert info._cfg["cluster_name"] == "unit-test-cluster"
    assert info._cfg["requirements"] == "test"

    assert info._scheduler == "ray"
| 1687 | + |
| 1688 | + |
def test_DDPJobDefinition_dry_run_no_resource_args():
    """
    When the job definition omits cpu/gpu/memMB/j, the dry run should
    fall back to the resource settings of the target cluster.
    """
    target_cluster = Cluster(test_config_creation())
    definition = DDPJobDefinition(
        script="test.py",
        m=None,
        script_args=["test"],
        name="test",
        h=None,
        env={"test": "test"},
        max_retries=0,
        mounts=[],
        rdzv_port=29500,
        scheduler_args={"requirements": "test"},
    )
    info = definition._dry_run(target_cluster)

    cluster_cfg = target_cluster.config
    role_resource = info._app.roles[0].resource
    assert role_resource.cpu == cluster_cfg.max_cpus
    assert role_resource.gpu == cluster_cfg.gpu
    assert role_resource.memMB == cluster_cfg.max_memory * 1024

    # The -j spec should be derived from the cluster's worker/GPU counts.
    expected_j = f"{cluster_cfg.max_worker}x{cluster_cfg.gpu}"
    assert parse_j(info._app.roles[0].args[1]) == expected_j
| 1716 | + |
| 1717 | + |
def test_DDPJobDefinition_submit(mocker):
    """
    submit() should wrap the scheduled app handle in a DDPJob whose
    attributes have the expected types and values.
    """
    definition = test_DDPJobDefinition_creation()
    target_cluster = Cluster(test_config_creation())
    mocker.patch(
        "codeflare_sdk.job.jobs.torchx_runner.schedule",
        return_value="fake-dashboard-url",  # stand-in app_handle
    )
    submitted = definition.submit(target_cluster)

    assert type(submitted) is DDPJob
    assert type(submitted.job_definition) is DDPJobDefinition
    assert type(submitted.cluster) is Cluster
    assert type(submitted._app_handle) is str
    assert submitted._app_handle == "fake-dashboard-url"
| 1735 | + |
| 1736 | + |
def test_DDPJob_creation(mocker):
    """
    Constructing a DDPJob directly should schedule through torchx_runner,
    record the resulting app handle, and hand the runner a fully populated
    dry-run info object. Returns the job for reuse by later tests.
    """
    definition = test_DDPJobDefinition_creation()
    target_cluster = Cluster(test_config_creation())
    mocker.patch(
        "codeflare_sdk.job.jobs.torchx_runner.schedule",
        return_value="fake-dashboard-url",  # stand-in app_handle
    )
    job = DDPJob(definition, target_cluster)

    assert type(job) is DDPJob
    assert type(job.job_definition) is DDPJobDefinition
    assert type(job.cluster) is Cluster
    assert type(job._app_handle) is str
    assert job._app_handle == "fake-dashboard-url"

    # Inspect what was handed to torchx_runner.schedule().
    _, call_args, _ = torchx_runner.schedule.mock_calls[0]
    dry_run_info = call_args[0]
    assert type(dry_run_info) is AppDryRunInfo
    assert type(dry_run_info.request) is RayJob
    assert type(dry_run_info._app) is AppDef
    assert type(dry_run_info._cfg) is dict
    assert type(dry_run_info._scheduler) is str
    return job
| 1758 | + |
| 1759 | + |
def test_DDPJob_status(mocker):
    """status() should delegate to torchx_runner.status with the app handle."""
    job = test_DDPJob_creation(mocker)
    mocker.patch(
        "codeflare_sdk.job.jobs.torchx_runner.status", return_value="fake-status"
    )
    assert job.status() == "fake-status"
    _, call_args, _ = torchx_runner.status.mock_calls[0]
    assert call_args[0] == "fake-dashboard-url"
| 1768 | + |
| 1769 | + |
def test_DDPJob_logs(mocker):
    """logs() should delegate to torchx_runner.log_lines with the app handle."""
    job = test_DDPJob_creation(mocker)
    mocker.patch(
        "codeflare_sdk.job.jobs.torchx_runner.log_lines", return_value="fake-logs"
    )
    assert job.logs() == "fake-logs"
    _, call_args, _ = torchx_runner.log_lines.mock_calls[0]
    assert call_args[0] == "fake-dashboard-url"
| 1778 | + |
| 1779 | + |
def parse_j(cmd):
    """Extract the torchrun node/proc spec from a command string.

    Looks for ``--nnodes <N> --nproc_per_node <M>`` in *cmd* and returns
    the pair formatted as ``"NxM"`` (matching the DDPJobDefinition ``j``
    argument format), or ``None`` if the flags are not present.
    """
    # Capture the two numeric arguments directly instead of re-splitting
    # the matched substring afterwards.
    match = re.search(r"--nnodes\s+(\d+)\s+--nproc_per_node\s+(\d+)", cmd)
    if match is None:
        return None
    nnodes, nproc_per_node = match.groups()
    return f"{nnodes}x{nproc_per_node}"
0 commit comments