-
Notifications
You must be signed in to change notification settings - Fork 13.8k
[FLINK-38404][core] Fix JobMasterTest by making it scheduler-type aware #27091
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
583d9ce
a7b8b52
73915e7
7824f3f
1d640a5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -184,6 +184,7 @@ | |
| import java.util.stream.Collectors; | ||
| import java.util.stream.IntStream; | ||
|
|
||
| import static org.apache.flink.configuration.JobManagerOptions.SchedulerType; | ||
| import static org.apache.flink.configuration.RestartStrategyOptions.RestartStrategyType.FIXED_DELAY; | ||
| import static org.apache.flink.core.testutils.FlinkAssertions.assertThatFuture; | ||
| import static org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.createExecutionAttemptId; | ||
|
|
@@ -1691,40 +1692,17 @@ private TestingResourceManagerGateway createAndRegisterTestingResourceManagerGat | |
| @Test | ||
| void testJobFailureWhenGracefulTaskExecutorTermination() throws Exception { | ||
| final List<Event> jobEvents = new ArrayList<>(); | ||
| runJobFailureWhenTaskExecutorTerminatesTest( | ||
| heartbeatServices, | ||
| (localTaskManagerLocation, jobMasterGateway) -> | ||
| jobMasterGateway.disconnectTaskManager( | ||
| localTaskManagerLocation.getResourceID(), | ||
| new FlinkException("Test disconnectTaskManager exception.")), | ||
| jobEvents); | ||
| assertThat( | ||
| jobEvents.stream() | ||
| .filter( | ||
| event -> | ||
| Events.JobStatusChangeEvent.name() | ||
| .equals(event.getName())) | ||
| .map(Event::getAttributes) | ||
| .map(x -> x.get("newJobStatus"))) | ||
| .containsExactly( | ||
| JobStatus.RUNNING.toString(), | ||
| JobStatus.FAILING.toString(), | ||
| JobStatus.FAILED.toString()); | ||
| final SchedulerType schedulerType = | ||
| runJobFailureWhenTaskExecutorTerminatesTest( | ||
| heartbeatServices, | ||
| (localTaskManagerLocation, jobMasterGateway) -> | ||
| jobMasterGateway.disconnectTaskManager( | ||
| localTaskManagerLocation.getResourceID(), | ||
| new FlinkException( | ||
| "Test disconnectTaskManager exception.")), | ||
| jobEvents); | ||
|
|
||
| assertThat( | ||
| jobEvents.stream() | ||
| .filter( | ||
| event -> | ||
| Events.AllSubtasksStatusChangeEvent.name() | ||
| .equals(event.getName())) | ||
| .map(Event::getAttributes) | ||
| .map( | ||
| x -> | ||
| x.get( | ||
| AllSubTasksRunningOrFinishedStateTimeMetrics | ||
| .STATUS_ATTRIBUTE))) | ||
| .containsExactly( | ||
| ALL_RUNNING_OR_FINISHED.toString(), NOT_ALL_RUNNING_OR_FINISHED.toString()); | ||
| assertJobStatusTransitions(schedulerType, jobEvents); | ||
| } | ||
|
|
||
| @Test | ||
|
|
@@ -1733,24 +1711,47 @@ void testJobFailureWhenTaskExecutorHeartbeatTimeout() throws Exception { | |
| final TestingHeartbeatServices testingHeartbeatService = | ||
| new TestingHeartbeatServices(heartbeatInterval, heartbeatTimeout); | ||
|
|
||
| runJobFailureWhenTaskExecutorTerminatesTest( | ||
| testingHeartbeatService, | ||
| (localTaskManagerLocation, jobMasterGateway) -> | ||
| testingHeartbeatService.triggerHeartbeatTimeout( | ||
| jmResourceId, localTaskManagerLocation.getResourceID()), | ||
| jobEvents); | ||
| assertThat( | ||
| jobEvents.stream() | ||
| .filter( | ||
| event -> | ||
| Events.JobStatusChangeEvent.name() | ||
| .equals(event.getName())) | ||
| .map(Event::getAttributes) | ||
| .map(x -> x.get("newJobStatus"))) | ||
| .containsExactly( | ||
| JobStatus.RUNNING.toString(), | ||
| JobStatus.FAILING.toString(), | ||
| JobStatus.FAILED.toString()); | ||
| final SchedulerType schedulerType = | ||
| runJobFailureWhenTaskExecutorTerminatesTest( | ||
| testingHeartbeatService, | ||
| (localTaskManagerLocation, jobMasterGateway) -> | ||
| testingHeartbeatService.triggerHeartbeatTimeout( | ||
| jmResourceId, localTaskManagerLocation.getResourceID()), | ||
| jobEvents); | ||
|
|
||
| assertJobStatusTransitions(schedulerType, jobEvents); | ||
| } | ||
|
|
||
| /** | ||
| * Asserts that job status transitions are as expected based on the scheduler type. | ||
| * DefaultScheduler does not emit CREATED state, while AdaptiveScheduler and | ||
| * AdaptiveBatchScheduler do. | ||
| */ | ||
| private static void assertJobStatusTransitions( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry for the delay in responding. It looks like we have got to a the root cause of this, in that we have 2 different types of scheduler that contain different job status's. I see that this method checks for the schedulerType and performs different asserts. I cannot see from the test that this method is driven for each scheduler type. If the 2 callers of this method cover the 2 scheduler types - it would be good if this was more explicit in the code.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @davidradl the maven profile 'enable-adaptive-scheduler' was introduced so that lots of existing tests can run against the AdaptiveScheduler which was added later.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I agree with @noorall and @zhuzhurk that he scheduler used in tests are not static, but can be set when running it. Specifically it's via the By default, when this test is running, the DEFAULT scheduler is used so the I guess the purpose of adding the dynamic feature through the Maven profile 'enable-adaptive-scheduler' is likely to eliminate the need for existing tests to explicitly enumerate various scheduler types, as long as they are either agnostic or aware of scheduler type.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @davidradl Please help to review again. Thanks. |
||
| SchedulerType schedulerType, List<Event> jobEvents) { | ||
| final List<String> jobStatusTransitions = | ||
| jobEvents.stream() | ||
| .filter(event -> Events.JobStatusChangeEvent.name().equals(event.getName())) | ||
| .map(Event::getAttributes) | ||
| .map(x -> (String) x.get("newJobStatus")) | ||
| .collect(Collectors.toList()); | ||
|
|
||
| if (schedulerType == SchedulerType.Adaptive) { | ||
| // Adaptive schedulers emit CREATED: CREATED → RUNNING → FAILING → FAILED | ||
| assertThat(jobStatusTransitions) | ||
| .containsExactly( | ||
| JobStatus.CREATED.toString(), | ||
| JobStatus.RUNNING.toString(), | ||
| JobStatus.FAILING.toString(), | ||
| JobStatus.FAILED.toString()); | ||
| } else { | ||
| // Default scheduler does not emit CREATED: RUNNING → FAILING → FAILED | ||
| assertThat(jobStatusTransitions) | ||
| .containsExactly( | ||
| JobStatus.RUNNING.toString(), | ||
| JobStatus.FAILING.toString(), | ||
| JobStatus.FAILED.toString()); | ||
| } | ||
|
|
||
| assertThat( | ||
| jobEvents.stream() | ||
|
|
@@ -2555,7 +2556,7 @@ private TestingResourceManagerGateway createResourceManagerGateway( | |
| return resourceManagerGateway; | ||
| } | ||
|
|
||
| private void runJobFailureWhenTaskExecutorTerminatesTest( | ||
| private SchedulerType runJobFailureWhenTaskExecutorTerminatesTest( | ||
| HeartbeatServices heartbeatServices, | ||
| BiConsumer<LocalUnresolvedTaskManagerLocation, JobMasterGateway> jobReachedRunningState, | ||
| List<Event> jobEventsOut) | ||
|
|
@@ -2564,12 +2565,17 @@ private void runJobFailureWhenTaskExecutorTerminatesTest( | |
| final JobMasterBuilder.TestingOnCompletionActions onCompletionActions = | ||
| new JobMasterBuilder.TestingOnCompletionActions(); | ||
|
|
||
| final SlotPoolServiceSchedulerFactory slotPoolServiceSchedulerFactory = | ||
| DefaultSlotPoolServiceSchedulerFactory.fromConfiguration( | ||
| configuration, jobGraph.getJobType(), jobGraph.isDynamic()); | ||
|
|
||
| try (final JobMaster jobMaster = | ||
| new JobMasterBuilder(jobGraph, rpcService) | ||
| .withResourceId(jmResourceId) | ||
| .withHighAvailabilityServices(haServices) | ||
| .withHeartbeatServices(heartbeatServices) | ||
| .withOnCompletionActions(onCompletionActions) | ||
| .withSlotPoolServiceSchedulerFactory(slotPoolServiceSchedulerFactory) | ||
| .withMetricsGroupFactory( | ||
| new JobManagerJobMetricGroupFactory() { | ||
| @Override | ||
|
|
@@ -2636,6 +2642,8 @@ public void addEvent(EventBuilder eventBuilder) { | |
|
|
||
| assertThat(archivedExecutionGraph.getState()).isEqualTo(JobStatus.FAILED); | ||
| } | ||
|
|
||
| return slotPoolServiceSchedulerFactory.getSchedulerType(); | ||
| } | ||
|
|
||
| private Collection<SlotOffer> registerSlotsAtJobMaster( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I missed something—this part seems to have been omitted? @liuml07
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, this is part of the new helper function
assertJobStatusTransitions(schedulerType, jobEvents)as this assertion is also about status change and it's duplicate between the two tests, so just reused it.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got it, LGTM! 😊