@@ -1462,6 +1462,322 @@ def test_run_job_with_managed_cluster_all_status_fields(mocker):
1462
1462
assert result ["job_submission_id" ] == "all-fields-job-456"
1463
1463
1464
1464
1465
+ def test_run_job_with_managed_cluster_status_polling_exception (mocker ):
1466
+ """Test RayJob with exception during status polling."""
1467
+ from codeflare_sdk .ray .job .job import RayJobSpec
1468
+ from kubernetes .client .rest import ApiException
1469
+
1470
+ # Mock dependencies
1471
+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1472
+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1473
+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1474
+
1475
+ mock_api_client = mocker .Mock ()
1476
+ mocker .patch (
1477
+ "codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" ,
1478
+ return_value = mock_api_client ,
1479
+ )
1480
+
1481
+ mock_co_api = mocker .Mock ()
1482
+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1483
+
1484
+ # Mock Cluster creation
1485
+ mock_cluster_resource = {
1486
+ "apiVersion" : "ray.io/v1" ,
1487
+ "kind" : "RayCluster" ,
1488
+ "spec" : {},
1489
+ }
1490
+ mocker .patch (
1491
+ "codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None
1492
+ )
1493
+ mock_cluster_instance = mocker .Mock ()
1494
+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1495
+ mocker .patch (
1496
+ "codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance
1497
+ )
1498
+
1499
+ # Mock RayJob creation
1500
+ mock_co_api .create_namespaced_custom_object .return_value = {
1501
+ "metadata" : {"name" : "test-status-exception" }
1502
+ }
1503
+
1504
+ # Mock status polling to raise exception after first success
1505
+ success_response = {
1506
+ "status" : {
1507
+ "jobDeploymentStatus" : "Running" ,
1508
+ "jobStatus" : "RUNNING" ,
1509
+ "jobId" : "exception-job-123" ,
1510
+ }
1511
+ }
1512
+ exception = ApiException (status = 500 , reason = "Internal Server Error" )
1513
+
1514
+ mock_co_api .get_namespaced_custom_object_status .side_effect = [
1515
+ success_response ,
1516
+ exception ,
1517
+ ]
1518
+ mocker .patch ("time.sleep" )
1519
+
1520
+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1521
+ job_config = RayJobSpec (entrypoint = "python script.py" )
1522
+
1523
+ try :
1524
+ Cluster .run_job_with_managed_cluster (
1525
+ cluster_config = cluster_config ,
1526
+ job_config = job_config ,
1527
+ wait_for_completion = True ,
1528
+ job_timeout_seconds = 10 ,
1529
+ )
1530
+ assert False , "Expected ApiException"
1531
+ except ApiException as e :
1532
+ assert e .status == 500
1533
+
1534
+
1535
+ def test_run_job_with_managed_cluster_empty_status (mocker ):
1536
+ """Test RayJob with completely empty status."""
1537
+ from codeflare_sdk .ray .job .job import RayJobSpec
1538
+
1539
+ # Mock dependencies
1540
+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1541
+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1542
+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1543
+
1544
+ mock_api_client = mocker .Mock ()
1545
+ mocker .patch (
1546
+ "codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" ,
1547
+ return_value = mock_api_client ,
1548
+ )
1549
+
1550
+ mock_co_api = mocker .Mock ()
1551
+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1552
+
1553
+ # Mock Cluster creation
1554
+ mock_cluster_resource = {
1555
+ "apiVersion" : "ray.io/v1" ,
1556
+ "kind" : "RayCluster" ,
1557
+ "spec" : {},
1558
+ }
1559
+ mocker .patch (
1560
+ "codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None
1561
+ )
1562
+ mock_cluster_instance = mocker .Mock ()
1563
+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1564
+ mocker .patch (
1565
+ "codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance
1566
+ )
1567
+
1568
+ # Mock RayJob creation
1569
+ mock_co_api .create_namespaced_custom_object .return_value = {
1570
+ "metadata" : {"name" : "test-empty-status" }
1571
+ }
1572
+
1573
+ # Mock completely empty status response
1574
+ empty_status_response = {}
1575
+ mock_co_api .get_namespaced_custom_object_status .return_value = empty_status_response
1576
+ mocker .patch ("time.sleep" )
1577
+
1578
+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1579
+ job_config = RayJobSpec (entrypoint = "python script.py" )
1580
+
1581
+ # Should handle empty status gracefully with timeout
1582
+ try :
1583
+ Cluster .run_job_with_managed_cluster (
1584
+ cluster_config = cluster_config ,
1585
+ job_config = job_config ,
1586
+ wait_for_completion = True ,
1587
+ job_timeout_seconds = 2 ,
1588
+ job_polling_interval_seconds = 1 ,
1589
+ )
1590
+ assert False , "Expected TimeoutError"
1591
+ except TimeoutError as e :
1592
+ assert "timed out after 2 seconds" in str (e )
1593
+
1594
+
1595
+ def test_run_job_with_managed_cluster_no_metadata_name (mocker ):
1596
+ """Test RayJob creation with missing metadata name."""
1597
+ from codeflare_sdk .ray .job .job import RayJobSpec
1598
+
1599
+ # Mock dependencies
1600
+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1601
+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1602
+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1603
+
1604
+ mock_api_client = mocker .Mock ()
1605
+ mocker .patch (
1606
+ "codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" ,
1607
+ return_value = mock_api_client ,
1608
+ )
1609
+
1610
+ mock_co_api = mocker .Mock ()
1611
+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1612
+
1613
+ # Mock Cluster creation
1614
+ mock_cluster_resource = {
1615
+ "apiVersion" : "ray.io/v1" ,
1616
+ "kind" : "RayCluster" ,
1617
+ "spec" : {},
1618
+ }
1619
+ mocker .patch (
1620
+ "codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None
1621
+ )
1622
+ mock_cluster_instance = mocker .Mock ()
1623
+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1624
+ mocker .patch (
1625
+ "codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance
1626
+ )
1627
+
1628
+ # Mock RayJob creation with missing name in metadata - method generates its own name
1629
+ mock_co_api .create_namespaced_custom_object .return_value = {"metadata" : {}}
1630
+
1631
+ # Mock status API response even for no-wait case
1632
+ from kubernetes .client .rest import ApiException
1633
+ mock_co_api .get_namespaced_custom_object_status .side_effect = ApiException (
1634
+ status = 404 , reason = "Not Found"
1635
+ )
1636
+
1637
+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1638
+ job_config = RayJobSpec (entrypoint = "python script.py" )
1639
+
1640
+ # Should still work and generate a job name
1641
+ result = Cluster .run_job_with_managed_cluster (
1642
+ cluster_config = cluster_config ,
1643
+ job_config = job_config ,
1644
+ wait_for_completion = False ,
1645
+ )
1646
+
1647
+ # Should have a generated job name (starts with 'rayjob-')
1648
+ assert "job_cr_name" in result
1649
+ assert result ["job_cr_name" ].startswith ("rayjob-" )
1650
+ assert result ["job_status" ] == "SUBMITTED_NOT_FOUND"
1651
+
1652
+
1653
+ def test_run_job_with_managed_cluster_default_namespace (mocker ):
1654
+ """Test RayJob with default namespace."""
1655
+ from codeflare_sdk .ray .job .job import RayJobSpec
1656
+
1657
+ # Mock dependencies
1658
+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1659
+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1660
+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1661
+
1662
+ mock_api_client = mocker .Mock ()
1663
+ mocker .patch (
1664
+ "codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" ,
1665
+ return_value = mock_api_client ,
1666
+ )
1667
+
1668
+ mock_co_api = mocker .Mock ()
1669
+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1670
+
1671
+ # Mock Cluster creation
1672
+ mock_cluster_resource = {
1673
+ "apiVersion" : "ray.io/v1" ,
1674
+ "kind" : "RayCluster" ,
1675
+ "spec" : {},
1676
+ }
1677
+ mocker .patch (
1678
+ "codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None
1679
+ )
1680
+ mock_cluster_instance = mocker .Mock ()
1681
+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1682
+ mocker .patch (
1683
+ "codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance
1684
+ )
1685
+
1686
+ # Mock RayJob creation
1687
+ mock_co_api .create_namespaced_custom_object .return_value = {
1688
+ "metadata" : {"name" : "test-default-ns" }
1689
+ }
1690
+
1691
+ # Mock status API response even for no-wait case
1692
+ from kubernetes .client .rest import ApiException
1693
+ mock_co_api .get_namespaced_custom_object_status .side_effect = ApiException (
1694
+ status = 404 , reason = "Not Found"
1695
+ )
1696
+
1697
+ # Test with ClusterConfiguration that has namespace=None (should use default)
1698
+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = None )
1699
+ job_config = RayJobSpec (entrypoint = "python script.py" )
1700
+
1701
+ result = Cluster .run_job_with_managed_cluster (
1702
+ cluster_config = cluster_config ,
1703
+ job_config = job_config ,
1704
+ wait_for_completion = False ,
1705
+ )
1706
+
1707
+ # Method generates its own job name regardless of API response
1708
+ assert "job_cr_name" in result
1709
+ assert result ["job_cr_name" ].startswith ("rayjob-" )
1710
+ assert result ["job_status" ] == "SUBMITTED_NOT_FOUND"
1711
+
1712
+
1713
+ def test_run_job_with_managed_cluster_job_spec_with_runtime_env (mocker ):
1714
+ """Test RayJob with runtime environment in job spec."""
1715
+ from codeflare_sdk .ray .job .job import RayJobSpec
1716
+
1717
+ # Mock dependencies
1718
+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1719
+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1720
+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1721
+
1722
+ mock_api_client = mocker .Mock ()
1723
+ mocker .patch (
1724
+ "codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" ,
1725
+ return_value = mock_api_client ,
1726
+ )
1727
+
1728
+ mock_co_api = mocker .Mock ()
1729
+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1730
+
1731
+ # Mock Cluster creation
1732
+ mock_cluster_resource = {
1733
+ "apiVersion" : "ray.io/v1" ,
1734
+ "kind" : "RayCluster" ,
1735
+ "spec" : {},
1736
+ }
1737
+ mocker .patch (
1738
+ "codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None
1739
+ )
1740
+ mock_cluster_instance = mocker .Mock ()
1741
+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1742
+ mocker .patch (
1743
+ "codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance
1744
+ )
1745
+
1746
+ # Mock RayJob creation
1747
+ mock_co_api .create_namespaced_custom_object .return_value = {
1748
+ "metadata" : {"name" : "test-runtime-env" }
1749
+ }
1750
+
1751
+ # Mock status API response even for no-wait case
1752
+ from kubernetes .client .rest import ApiException
1753
+ mock_co_api .get_namespaced_custom_object_status .side_effect = ApiException (
1754
+ status = 404 , reason = "Not Found"
1755
+ )
1756
+
1757
+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1758
+
1759
+ # Create job spec with runtime environment
1760
+ job_config = RayJobSpec (
1761
+ entrypoint = "python script.py" ,
1762
+ runtime_env = {"pip" : ["numpy" , "pandas" ]},
1763
+ metadata = {"job_timeout_s" : 3600 },
1764
+ )
1765
+
1766
+ result = Cluster .run_job_with_managed_cluster (
1767
+ cluster_config = cluster_config ,
1768
+ job_config = job_config ,
1769
+ wait_for_completion = False ,
1770
+ )
1771
+
1772
+ # Method generates its own job name regardless of API response
1773
+ assert "job_cr_name" in result
1774
+ assert result ["job_cr_name" ].startswith ("rayjob-" )
1775
+ assert result ["job_status" ] == "SUBMITTED_NOT_FOUND"
1776
+
1777
+ # Verify the CustomObjectsApi was called with proper parameters
1778
+ mock_co_api .create_namespaced_custom_object .assert_called_once ()
1779
+
1780
+
1465
1781
# Make sure to always keep this function last
1466
1782
def test_cleanup ():
1467
1783
os .remove (f"{ aw_dir } test-all-params.yaml" )
0 commit comments