Skip to content

Commit d764b79

Browse files
dthainbtovar
authored andcommitted
Vine: Fix Crash After Idle Disconnect (#4065)
* - Commentary on poll_active_workers to clarify purpose. - Modify idle-disconnect to send "exit" message but not disconnect immediately. (let the worker do it) * format
1 parent 9311195 commit d764b79

File tree

1 file changed

+35
-10
lines changed

1 file changed

+35
-10
lines changed

taskvine/src/manager/vine_manager.c

+35-10
Original file line numberDiff line numberDiff line change
@@ -267,30 +267,41 @@ static vine_msg_code_t handle_name(struct vine_manager *q, struct vine_worker_in
267267
}
268268

269269
/*
270-
Handle a timeout request from a worker. Check if the worker has any important data before letting it go.
270+
Handle a request from a worker to shutdown because it has been idle for a while.
271+
However, do not accept the request if the manager has some further information
272+
indicating that the worker should be kept around.
271273
*/
272274

273-
static void handle_worker_timeout(struct vine_manager *q, struct vine_worker_info *w)
275+
static void handle_idle_disconnect_request(struct vine_manager *q, struct vine_worker_info *w)
274276
{
275-
// Look at the files and check if any are endangered temps.
276277
char *cachename;
277278
struct vine_file_replica *replica;
278-
// debug(D_VINE, "Handling timeout request");
279+
280+
/* First check to see if this worker has any unique files that should not be lost. */
281+
279282
HASH_TABLE_ITERATE(w->current_files, cachename, replica)
280283
{
281284
if (replica->type == VINE_TEMP) {
282285
int c = vine_file_replica_table_count_replicas(q, cachename, VINE_FILE_REPLICA_STATE_READY);
283286
if (c == 1) {
284-
debug(D_VINE, "Rejecting timeout request from worker %s (%s). Has unique file %s", w->hostname, w->addrport, cachename);
287+
debug(D_VINE, "Rejecting disconnect request from worker %s (%s). Has unique file %s", w->hostname, w->addrport, cachename);
285288
return;
286289
}
287290
}
288291
}
289292

293+
/*
294+
Then, if it is not running any tasks, tell it to exit and shut down cleanly.
295+
We do not disconnect just yet because there may be more messages pending,
296+
or other information the worker wants to send as it shuts down.
297+
The worker will disconnect on its own.
298+
Also, we don't want to invalidate the worker object w in an unexpected location.
299+
*/
300+
290301
if (itable_size(w->current_tasks) == 0) {
291-
debug(D_VINE, "Accepting timeout request from worker %s (%s).", w->hostname, w->addrport);
302+
debug(D_VINE, "Accepting disconnect request from worker %s (%s).", w->hostname, w->addrport);
292303
q->stats->workers_idled_out++;
293-
vine_manager_shut_down_worker(q, w);
304+
vine_manager_send(q, w, "exit\n");
294305
}
295306

296307
return;
@@ -311,7 +322,7 @@ static vine_msg_code_t handle_info(struct vine_manager *q, struct vine_worker_in
311322
if (string_prefix_is(field, "tasks_running")) {
312323
w->dynamic_tasks_running = atoi(value);
313324
} else if (string_prefix_is(field, "idle-disconnect-request")) {
314-
handle_worker_timeout(q, w);
325+
handle_idle_disconnect_request(q, w);
315326
} else if (string_prefix_is(field, "worker-id")) {
316327
free(w->workerid);
317328
w->workerid = xxstrdup(value);
@@ -2467,6 +2478,10 @@ static vine_result_code_t handle_worker(struct vine_manager *q, struct link *l)
24672478
w = hash_table_lookup(q->worker_table, key);
24682479
free(key);
24692480

2481+
/* This should not happen, but just in case: */
2482+
if (!w)
2483+
return VINE_WORKER_FAILURE;
2484+
24702485
vine_msg_code_t mcode;
24712486
mcode = vine_manager_recv_no_retry(q, w, line, sizeof(line));
24722487

@@ -4776,7 +4791,12 @@ struct vine_task *vine_wait_for_task_id(struct vine_manager *q, int task_id, int
47764791
return vine_wait_internal(q, timeout, NULL, task_id);
47774792
}
47784793

4779-
/* return number of workers that failed */
4794+
/*
4795+
Consider all of the connected workers, and act open each connection
4796+
that has pending input data, until the input buffer is empty.
4797+
Return the number of workers that *failed* and disconnected.
4798+
*/
4799+
47804800
static int poll_active_workers(struct vine_manager *q, int stoptime)
47814801
{
47824802
BEGIN_ACCUM_TIME(q, time_polling);
@@ -4808,9 +4828,14 @@ static int poll_active_workers(struct vine_manager *q, int stoptime)
48084828

48094829
int i;
48104830
int workers_failed = 0;
4811-
// Then consider all existing active workers
4831+
4832+
/* Consider all active connections of any kind. */
48124833
for (i = 1; i < n; i++) {
4834+
4835+
/* If there is pending input data on that connection. */
48134836
if (q->poll_table[i].revents) {
4837+
4838+
/* Act on the next input message, until there is no more buffered data. */
48144839
do {
48154840
if (handle_worker(q, q->poll_table[i].link) == VINE_WORKER_FAILURE) {
48164841
workers_failed++;

0 commit comments

Comments
 (0)