Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Core] Fix check failure RAY_CHECK(it != current_tasks_.end()); #47659

Merged
merged 30 commits into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .buildkite/core.rayci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ steps:

- label: ":ray: core: python {{matrix.python}} tests ({{matrix.worker_id}})"
if: build.pull_request.labels includes "continuous-build" || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89" || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa"
tags:
tags:
- python
- dashboard
instance_type: large
Expand Down
120 changes: 119 additions & 1 deletion python/ray/tests/test_network_failure_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from time import sleep
import pytest
import threading
from ray._private.test_utils import wait_for_condition
from ray.tests.conftest_docker import * # noqa
from ray.tests.conftest_docker import gen_head_node, gen_worker_node
Expand Down Expand Up @@ -157,7 +158,6 @@ def test_transient_network_error(head2, worker2, gcs_network):
network = gcs_network

check_two_nodes = """
import sys
import ray
from ray._private.test_utils import wait_for_condition

Expand Down Expand Up @@ -196,6 +196,124 @@ def ping(self):
assert result.exit_code == 0, result.output.decode("utf-8")


# Head node passed to test_async_actor_task_retry as its `head3` argument.
# The environment sets the gRPC keepalive time/timeout values to 1000 ms and
# configures health checks with a 1000 ms initial delay and period, a
# 100000 ms timeout and a failure threshold of 20.
# NOTE(review): presumably the lenient health-check settings keep the node
# from being declared dead during the short network disconnect injected by
# the test — confirm.
head3 = gen_head_node(
{
"RAY_grpc_keepalive_time_ms": "1000",
"RAY_grpc_client_keepalive_time_ms": "1000",
"RAY_grpc_client_keepalive_timeout_ms": "1000",
"RAY_health_check_initial_delay_ms": "1000",
"RAY_health_check_period_ms": "1000",
"RAY_health_check_timeout_ms": "100000",
"RAY_health_check_failure_threshold": "20",
}
)

# Worker node passed to test_async_actor_task_retry as its `worker3` argument.
# It uses the same keepalive and health-check environment values as `head3`
# and is created with num_cpus=2.
worker3 = gen_worker_node(
envs={
"RAY_grpc_keepalive_time_ms": "1000",
"RAY_grpc_client_keepalive_time_ms": "1000",
"RAY_grpc_client_keepalive_timeout_ms": "1000",
"RAY_health_check_initial_delay_ms": "1000",
"RAY_health_check_period_ms": "1000",
"RAY_health_check_timeout_ms": "100000",
"RAY_health_check_failure_threshold": "20",
},
num_cpus=2,
)


def test_async_actor_task_retry(head3, worker3, gcs_network):
# Test that if a transient network error happens
# after an async actor task is submitted and is being executed,
# a second attempt will be submitted and executed after the
# first attempt finishes.
network = gcs_network

# Script run on the head node with `python -c` (see the exec_run call at the
# end of this test). It creates a detached "counter" actor and an AsyncActor,
# then asserts that the value returned by AsyncActor.run is "second".
driver = """
import asyncio
import ray
from ray.util.state import list_tasks

ray.init(namespace="test")

@ray.remote(num_cpus=0.1, name="counter", lifetime="detached")
class Counter:
def __init__(self):
self.count = 0

def inc(self):
self.count = self.count + 1
return self.count

@ray.method(max_task_retries=-1)
def get(self):
return self.count

@ray.remote(num_cpus=0.1, max_task_retries=-1)
class AsyncActor:
def __init__(self, counter):
self.counter = counter

async def run(self):
count = await self.counter.get.remote()
if count == 0:
# first attempt
await self.counter.inc.remote()
while len(list_tasks(
filters=[("name", "=", "AsyncActor.run")])) != 2:
# wait for second attempt to be made
await asyncio.sleep(1)
# wait until the second attempt reaches the actor
await asyncio.sleep(2)
await self.counter.inc.remote()
return "first"
else:
# second attempt
# make sure second attempt only runs
# after first attempt finishes
assert count == 2
return "second"

counter = Counter.remote()
async_actor = AsyncActor.remote(counter)
assert ray.get(async_actor.run.remote()) == "second"
"""

# Script run on the head node before the network failure is injected. It
# blocks until the "counter" actor can be looked up and its count equals 1,
# i.e. until the first AsyncActor.run attempt has incremented it once.
check_async_actor_run_is_called = """
import ray
from ray._private.test_utils import wait_for_condition
ray.init(namespace="test")

wait_for_condition(lambda: ray.get_actor("counter") is not None)
counter = ray.get_actor("counter")
wait_for_condition(lambda: ray.get(counter.get.remote()) == 1)
"""

# Runs in a background thread: waits for the first attempt to start, then
# disconnects the worker container from the network for 2 seconds and
# reconnects it with the same IP address.
def inject_transient_network_failure():
try:
result = head3.exec_run(
cmd=f"python -c '{check_async_actor_run_is_called}'"
)
assert result.exit_code == 0, result.output.decode("utf-8")

# Remember the worker's IP so it can be reattached with the same address.
worker_ip = worker3._container.attrs["NetworkSettings"]["Networks"][
network.name
]["IPAddress"]
network.disconnect(worker3.name, force=True)
sleep(2)
network.connect(worker3.name, ipv4_address=worker_ip)
except Exception as e:
# Failures here (including the assert above) are only printed, not
# re-raised; the test outcome is decided by the driver's exit code below.
print(f"Network failure injection failed {e}")

# Start the injector as a daemon thread so it runs concurrently with the
# driver below.
t = threading.Thread(target=inject_transient_network_failure, daemon=True)
t.start()

# Run the driver on the head node; a non-zero exit code fails the test and
# reports the driver's output.
result = head3.exec_run(
cmd=f"python -c '{driver}'",
)
assert result.exit_code == 0, result.output.decode("utf-8")


if __name__ == "__main__":
import os

Expand Down
7 changes: 4 additions & 3 deletions src/ray/common/ray_syncer/ray_syncer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -233,8 +233,8 @@ void RaySyncer::Connect(const std::string &node_id,
execute_after(
io_context_,
[this, node_id, channel]() {
RAY_LOG(INFO) << "Connection is broken. Reconnect to node: "
<< NodeID::FromBinary(node_id);
RAY_LOG(INFO).WithField(NodeID::FromBinary(node_id))
<< "Connection is broken. Reconnect to node.";
Connect(node_id, channel);
},
/* delay_microseconds = */ std::chrono::milliseconds(2000));
Expand Down Expand Up @@ -370,10 +370,11 @@ ServerBidiReactor *RaySyncerService::StartSync(grpc::CallbackServerContext *cont
// 4. OnDone method of the old reactor is called which calls this cleanup_cb_
return;
}
RAY_LOG(INFO).WithField(NodeID::FromBinary(node_id)) << "Connection is broken.";
syncer_.sync_reactors_.erase(node_id);
syncer_.node_state_->RemoveNode(node_id);
});
RAY_LOG(INFO).WithField(kLogKeyNodeID, NodeID::FromBinary(reactor->GetRemoteNodeID()))
RAY_LOG(INFO).WithField(NodeID::FromBinary(reactor->GetRemoteNodeID()))
<< "Get connection";
// Disconnect existing connection if there is any.
// This can happen when there is transient network error
Expand Down
Loading