Skip to content

Commit f38ba02

Browse files
authored
Vine: Fix Crash After Idle Disconnect (#4065)
* - Commentary on poll_active_workers to clarify purpose. - Modify idle-disconnect to send "exit" message but not disconnect immediately. (let the worker do it) * format
1 parent a5a5af9 commit f38ba02

File tree

1 file changed

+35
-10
lines changed

1 file changed

+35
-10
lines changed

taskvine/src/manager/vine_manager.c

+35-10
Original file line numberDiff line numberDiff line change
@@ -269,30 +269,41 @@ static vine_msg_code_t handle_name(struct vine_manager *q, struct vine_worker_in
269269
}
270270

271271
/*
272-
Handle a timeout request from a worker. Check if the worker has any important data before letting it go.
272+
Handle a request from a worker to shutdown because it has been idle for a while.
273+
However, do not accept the request if the manager has some further information
274+
indicating that the worker should be kept around.
273275
*/
274276

275-
static void handle_worker_timeout(struct vine_manager *q, struct vine_worker_info *w)
277+
static void handle_idle_disconnect_request(struct vine_manager *q, struct vine_worker_info *w)
276278
{
277-
// Look at the files and check if any are endangered temps.
278279
char *cachename;
279280
struct vine_file_replica *replica;
280-
// debug(D_VINE, "Handling timeout request");
281+
282+
/* First check to see if this worker has any unique files that should not be lost. */
283+
281284
HASH_TABLE_ITERATE(w->current_files, cachename, replica)
282285
{
283286
if (replica->type == VINE_TEMP) {
284287
int c = vine_file_replica_table_count_replicas(q, cachename, VINE_FILE_REPLICA_STATE_READY);
285288
if (c == 1) {
286-
debug(D_VINE, "Rejecting timeout request from worker %s (%s). Has unique file %s", w->hostname, w->addrport, cachename);
289+
debug(D_VINE, "Rejecting disconnect request from worker %s (%s). Has unique file %s", w->hostname, w->addrport, cachename);
287290
return;
288291
}
289292
}
290293
}
291294

295+
/*
296+
Then, if it is not running any tasks, tell it to exit and shut down cleanly.
297+
We do not disconnect just yet because there may be more messages pending,
298+
or other information the worker wants to send as it shuts down.
299+
The worker will disconnect on its own.
300+
Also, we don't want to invalidate the worker object w in an unexpected location.
301+
*/
302+
292303
if (itable_size(w->current_tasks) == 0) {
293-
debug(D_VINE, "Accepting timeout request from worker %s (%s).", w->hostname, w->addrport);
304+
debug(D_VINE, "Accepting disconnect request from worker %s (%s).", w->hostname, w->addrport);
294305
q->stats->workers_idled_out++;
295-
vine_manager_shut_down_worker(q, w);
306+
vine_manager_send(q, w, "exit\n");
296307
}
297308

298309
return;
@@ -313,7 +324,7 @@ static vine_msg_code_t handle_info(struct vine_manager *q, struct vine_worker_in
313324
if (string_prefix_is(field, "tasks_running")) {
314325
w->dynamic_tasks_running = atoi(value);
315326
} else if (string_prefix_is(field, "idle-disconnect-request")) {
316-
handle_worker_timeout(q, w);
327+
handle_idle_disconnect_request(q, w);
317328
} else if (string_prefix_is(field, "worker-id")) {
318329
free(w->workerid);
319330
w->workerid = xxstrdup(value);
@@ -2507,6 +2518,10 @@ static vine_result_code_t handle_worker(struct vine_manager *q, struct link *l)
25072518
w = hash_table_lookup(q->worker_table, key);
25082519
free(key);
25092520

2521+
/* This should not happen, but just in case: */
2522+
if (!w)
2523+
return VINE_WORKER_FAILURE;
2524+
25102525
vine_msg_code_t mcode;
25112526
mcode = vine_manager_recv_no_retry(q, w, line, sizeof(line));
25122527

@@ -4961,7 +4976,12 @@ struct vine_task *vine_wait_for_task_id(struct vine_manager *q, int task_id, int
49614976
return vine_wait_internal(q, timeout, NULL, task_id);
49624977
}
49634978

4964-
/* return number of workers that failed */
4979+
/*
4980+
Consider all of the connected workers, and act open each connection
4981+
that has pending input data, until the input buffer is empty.
4982+
Return the number of workers that *failed* and disconnected.
4983+
*/
4984+
49654985
static int poll_active_workers(struct vine_manager *q, int stoptime)
49664986
{
49674987
BEGIN_ACCUM_TIME(q, time_polling);
@@ -4993,9 +5013,14 @@ static int poll_active_workers(struct vine_manager *q, int stoptime)
49935013

49945014
int i;
49955015
int workers_failed = 0;
4996-
// Then consider all existing active workers
5016+
5017+
/* Consider all active connections of any kind. */
49975018
for (i = 1; i < n; i++) {
5019+
5020+
/* If there is pending input data on that connection. */
49985021
if (q->poll_table[i].revents) {
5022+
5023+
/* Act on the next input message, until there is no more buffered data. */
49995024
do {
50005025
if (handle_worker(q, q->poll_table[i].link) == VINE_WORKER_FAILURE) {
50015026
workers_failed++;

0 commit comments

Comments
 (0)