-
Notifications
You must be signed in to change notification settings - Fork 350
Fix(CDAP-21219): Handle CancelJob on DONE Dataproc jobs gracefully #16066
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
cjac
wants to merge
1
commit into
cdapio:develop
Choose a base branch
from
LLC-Technologies-Collier:fix/cdf-deprovision-race
base: develop
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+184
−8
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
102 changes: 102 additions & 0 deletions
102
...cdap/cdap/internal/app/runtime/distributed/remote/RemoteExecutionTwillControllerTest.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,102 @@ | ||
| package io.cdap.cdap.internal.app.runtime.distributed.remote; | ||
|
|
||
| import io.cdap.cdap.common.conf.CConfiguration; | ||
| import io.cdap.cdap.common.conf.Constants; | ||
| import io.cdap.cdap.common.service.RetryStrategies; | ||
| import io.cdap.cdap.proto.id.NamespaceId; | ||
| import io.cdap.cdap.proto.id.ProgramId; | ||
| import io.cdap.cdap.proto.id.ProgramRunId; | ||
| import io.cdap.cdap.proto.ProgramType; | ||
| import io.cdap.cdap.runtime.spi.runtimejob.RuntimeJobStatus; | ||
| import org.apache.twill.api.TwillController; | ||
| import org.junit.Before; | ||
| import org.junit.Test; | ||
| import org.mockito.Mockito; | ||
|
|
||
| import java.util.concurrent.CompletableFuture; | ||
| import java.util.concurrent.Executors; | ||
| import java.util.concurrent.ScheduledExecutorService; | ||
| import java.util.concurrent.TimeUnit; | ||
|
|
||
| import static org.mockito.ArgumentMatchers.any; | ||
| import static org.mockito.Mockito.never; | ||
| import static org.mockito.Mockito.verify; | ||
| import static org.mockito.Mockito.when; | ||
|
|
||
| /** | ||
| * Tests for {@link RemoteExecutionTwillController}. | ||
| */ | ||
| public class RemoteExecutionTwillControllerTest { | ||
|
|
||
| private CConfiguration cConf; | ||
| private ProgramRunId programRunId; | ||
| private RemoteProcessController remoteProcessController; | ||
| private ScheduledExecutorService scheduler; | ||
| private RemoteExecutionService remoteExecutionService; | ||
| private CompletableFuture<Void> startupCompletionFuture; | ||
|
|
||
| @Before | ||
| public void setUp() { | ||
| cConf = CConfiguration.create(); | ||
| cConf.setLong(Constants.RuntimeMonitor.POLL_TIME_MS, 100); | ||
| programRunId = new ProgramRunId(NamespaceId.DEFAULT.getNamespace(), "testapp", ProgramType.WORKFLOW, "testworkflow", "testrun"); | ||
| remoteProcessController = Mockito.mock(RemoteProcessController.class); | ||
| scheduler = Executors.newSingleThreadScheduledExecutor(); | ||
| remoteExecutionService = Mockito.mock(RemoteExecutionService.class); | ||
| startupCompletionFuture = new CompletableFuture<>(); | ||
| } | ||
|
|
||
| private RemoteExecutionTwillController createController(boolean terminateWithController) { | ||
| return new RemoteExecutionTwillController(cConf, programRunId, startupCompletionFuture, | ||
| remoteProcessController, scheduler, remoteExecutionService, | ||
| terminateWithController); | ||
| } | ||
|
|
||
| @Test | ||
| public void testComplete_KillSkippedForTerminalState() throws Exception { | ||
| RemoteExecutionTwillController controller = createController(true); | ||
| startupCompletionFuture.complete(null); | ||
|
|
||
| // Simulate getStatus throwing an exception to enter the catch block | ||
| when(remoteProcessController.getStatus()).thenThrow(new RuntimeException("Simulated poll failure")); | ||
|
|
||
| // In the catch block, simulate the job being COMPLETED | ||
| when(remoteProcessController.getStatus()).thenReturn(RuntimeJobStatus.COMPLETED); | ||
|
|
||
| controller.complete(); | ||
|
|
||
| // Verify kill was NOT called because the status was terminal | ||
| verify(remoteProcessController, never()).kill(any()); | ||
| } | ||
|
|
||
| @Test | ||
| public void testComplete_KillCalledForRunningState() throws Exception { | ||
| RemoteExecutionTwillController controller = createController(true); | ||
| startupCompletionFuture.complete(null); | ||
|
|
||
| // Simulate getStatus throwing an exception to enter the catch block | ||
| when(remoteProcessController.getStatus()).thenThrow(new RuntimeException("Simulated poll failure")); | ||
| // In the catch block, simulate the job being RUNNING | ||
| when(remoteProcessController.getStatus()).thenReturn(RuntimeJobStatus.RUNNING); | ||
|
|
||
| controller.complete(); | ||
|
|
||
| // Verify kill WAS called | ||
| verify(remoteProcessController).kill(RuntimeJobStatus.RUNNING); | ||
| } | ||
|
|
||
| @Test | ||
| public void testComplete_StatusCheckFailsInCatch() throws Exception { | ||
| RemoteExecutionTwillController controller = createController(true); | ||
| startupCompletionFuture.complete(null); | ||
|
|
||
| // Simulate getStatus throwing an exception to enter the catch block | ||
| when(remoteProcessController.getStatus()).thenThrow(new RuntimeException("Simulated poll failure")) | ||
| .thenThrow(new RuntimeException("Simulated getStatus failure in catch")); | ||
|
|
||
| controller.complete(); | ||
|
|
||
| // Verify kill was NOT called because getStatus failed in the catch | ||
| verify(remoteProcessController, never()).kill(any()); | ||
| } | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -127,10 +127,19 @@ public final ClusterStatus deleteClusterWithStatus(ProvisionerContext context, C | |
| // Status details is specific to dataproc jobs, so it was not added to RuntimeJobDetail spi. | ||
| String statusDetails = ((DataprocRuntimeJobDetail) jobDetail).getJobStatusDetails(); | ||
| if (statusDetails != null) { | ||
| ProgramRunFailureException e = new ProgramRunFailureException( | ||
| String.format("Dataproc job '%s' status details: %s", | ||
| ((DataprocRuntimeJobDetail) jobDetail).getJobId(), statusDetails)); | ||
| LOG.error("Dataproc Job {}", jobDetail.getStatus(), e); | ||
| // Check if the failure is due to attempting to cancel a job already DONE | ||
| if (jobDetail.getStatus() == RuntimeJobStatus.FAILED && statusDetails.contains("is not supported in the current state: DONE")) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This exception of dataproc seems to be covered under and we are already handling it DataprocRuntimeJobManager.java#L923 So, this check might not work. We are assuming Failure for all conditions of |
||
| LOG.warn("Attempted to cancel Dataproc job {} which was already DONE. This is not a pipeline failure. Continuing with deprovisioning.", ((DataprocRuntimeJobDetail) jobDetail).getJobId()); | ||
| } else { | ||
| ProgramRunFailureException e = new ProgramRunFailureException( | ||
| String.format("Dataproc job '%s' status details: %s", | ||
| ((DataprocRuntimeJobDetail) jobDetail).getJobId(), statusDetails)); | ||
| LOG.error("Dataproc Job {} failed with error: {}", jobDetail.getStatus(), statusDetails, e); | ||
| // Rethrow only if it's not the specific Cancel-on-DONE issue | ||
| // We need to be careful here, as this method is expected to return ClusterStatus.DELETING | ||
| // and an exception here might halt the deprovisioning process in the caller. | ||
| // For now, let's just log and not throw for the specific case. | ||
| } | ||
| } | ||
| } | ||
| } finally { | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The Prior Logic :
IllegalStateExceptiongetStatusis called, the 5 second check is done without any gap and immediately goes to catch block for force termination.I agree between this few MS the dataproc job status could be
DONEBut in the new extra check, the GAP for error still exists and this intermittent Wrong killing of pipeline would still happen.
My point is there is not much time gap between the existing
getStatus == runningand remoteProcessController.kill() , and similar to extra check..