Skip to content

Commit b123027

Browse files
authored
Merge pull request #15737 from cdapio/CDAP-21046
[CDAP-21046][CDAP-21048] Fixing flow control metrics on startup.
2 parents e74b369 + 902acf4 commit b123027

File tree

4 files changed

+157
-63
lines changed

4 files changed

+157
-63
lines changed

cdap-app-fabric/src/main/java/io/cdap/cdap/internal/app/services/ProgramNotificationSubscriberService.java

Lines changed: 59 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@
8989
import java.util.Queue;
9090
import java.util.Set;
9191
import java.util.concurrent.TimeUnit;
92-
import java.util.concurrent.atomic.AtomicBoolean;
9392
import java.util.stream.IntStream;
9493
import javax.annotation.Nullable;
9594
import org.apache.twill.internal.CompositeService;
@@ -101,6 +100,9 @@
101100
* events topic
102101
*/
103102
public class ProgramNotificationSubscriberService extends AbstractIdleService {
103+
104+
private static final Logger LOG =
105+
LoggerFactory.getLogger(ProgramNotificationSubscriberService.class);
104106
private final MessagingService messagingService;
105107
private final CConfiguration cConf;
106108
private final MetricsCollectionService metricsCollectionService;
@@ -145,6 +147,8 @@ protected void startUp() throws Exception {
145147
List<Service> children = new ArrayList<>();
146148
String topicPrefix = cConf.get(Constants.AppFabric.PROGRAM_STATUS_EVENT_TOPIC);
147149
int numPartitions = cConf.getInt(Constants.AppFabric.PROGRAM_STATUS_EVENT_NUM_PARTITIONS);
150+
// Active runs should be restored only once not for every shard that is created.
151+
restoreActiveRuns();
148152
// Add bare one - we always listen to it
149153
children.add(createChildService("program.status", topicPrefix));
150154
// If number of partitions is more than 1 - create partitioned services
@@ -153,8 +157,60 @@ protected void startUp() throws Exception {
153157
.forEach(i -> children.add(createChildService("program.status." + i, topicPrefix + i)));
154158
}
155159
delegate = new CompositeService(children);
156-
157160
delegate.startAndWait();
161+
// Explicitly emit both launching and running counts on startup.
162+
emitFlowControlMetrics();
163+
}
164+
165+
private void emitFlowControlMetrics() {
166+
runRecordMonitorService.emitLaunchingMetrics();
167+
runRecordMonitorService.emitRunningMetrics();
168+
}
169+
170+
private void restoreActiveRuns() {
171+
LOG.info("Restoring active runs");
172+
int batchSize = cConf.getInt(Constants.RuntimeMonitor.INIT_BATCH_SIZE);
173+
RetryStrategy retryStrategy =
174+
RetryStrategies.fromConfiguration(cConf, Constants.Service.RUNTIME_MONITOR_RETRY_PREFIX);
175+
long startTs = System.currentTimeMillis();
176+
177+
Retries.runWithRetries(
178+
() ->
179+
store.scanActiveRuns(
180+
batchSize,
181+
(runRecordDetail) -> {
182+
if (runRecordDetail.getStartTs() > startTs) {
183+
return;
184+
}
185+
try {
186+
LOG.info("Found active run: {}", runRecordDetail.getProgramRunId());
187+
if (runRecordDetail.getStatus() == ProgramRunStatus.PENDING) {
188+
runRecordMonitorService.addRequest(runRecordDetail.getProgramRunId());
189+
} else if (runRecordDetail.getStatus() == ProgramRunStatus.STARTING) {
190+
runRecordMonitorService.addRequest(runRecordDetail.getProgramRunId());
191+
// It is unknown what is the state of program runs in STARTING state.
192+
// A STARTING message is published again to retry STARTING logic.
193+
ProgramOptions programOptions =
194+
new SimpleProgramOptions(
195+
runRecordDetail.getProgramRunId().getParent(),
196+
new BasicArguments(runRecordDetail.getSystemArgs()),
197+
new BasicArguments(runRecordDetail.getUserArgs()));
198+
LOG.debug("Retrying to start run {}.", runRecordDetail.getProgramRunId());
199+
programStateWriter.start(
200+
runRecordDetail.getProgramRunId(),
201+
programOptions,
202+
null,
203+
this.store.loadProgram(runRecordDetail.getProgramRunId().getParent()));
204+
}
205+
} catch (Exception e) {
206+
ProgramRunId programRunId = runRecordDetail.getProgramRunId();
207+
LOG.warn(
208+
"Retrying to start run {} failed. Marking it as failed.", programRunId, e);
209+
programStateWriter.error(programRunId, e);
210+
}
211+
}),
212+
retryStrategy,
213+
e -> true);
158214
}
159215

160216
@Override
@@ -178,7 +234,6 @@ private ProgramNotificationSingleTopicSubscriberService createChildService(
178234
provisioningService,
179235
programStateWriter,
180236
transactionRunner,
181-
store,
182237
runRecordMonitorService,
183238
name,
184239
topicName,
@@ -195,7 +250,7 @@ class ProgramNotificationSingleTopicSubscriberService
195250
extends AbstractNotificationSubscriberService {
196251

197252
private static final Logger LOG =
198-
LoggerFactory.getLogger(ProgramNotificationSubscriberService.class);
253+
LoggerFactory.getLogger(ProgramNotificationSingleTopicSubscriberService.class);
199254

200255
private static final Gson GSON =
201256
ApplicationSpecificationAdapter.addTypeAdapters(new GsonBuilder()).create();
@@ -220,8 +275,6 @@ class ProgramNotificationSingleTopicSubscriberService
220275
private final Queue<Runnable> tasks;
221276
private final MetricsCollectionService metricsCollectionService;
222277
private Set<ProgramCompletionNotifier> programCompletionNotifiers;
223-
private final CConfiguration cConf;
224-
private final Store store;
225278
private final RunRecordMonitorService runRecordMonitorService;
226279
private final boolean checkTxSeparation;
227280

@@ -234,7 +287,6 @@ class ProgramNotificationSingleTopicSubscriberService
234287
ProvisioningService provisioningService,
235288
ProgramStateWriter programStateWriter,
236289
TransactionRunner transactionRunner,
237-
Store store,
238290
RunRecordMonitorService runRecordMonitorService,
239291
String name,
240292
String topicName,
@@ -259,8 +311,6 @@ class ProgramNotificationSingleTopicSubscriberService
259311
this.metricsCollectionService = metricsCollectionService;
260312
this.programCompletionNotifiers = programCompletionNotifiers;
261313
this.runRecordMonitorService = runRecordMonitorService;
262-
this.cConf = cConf;
263-
this.store = store;
264314

265315
// If number of partitions equals 1, DB deadlock cannot happen as a result of concurrent
266316
// modifications to
@@ -273,55 +323,6 @@ class ProgramNotificationSingleTopicSubscriberService
273323
@Override
274324
protected void doStartUp() throws Exception {
275325
super.doStartUp();
276-
277-
int batchSize = cConf.getInt(Constants.RuntimeMonitor.INIT_BATCH_SIZE);
278-
RetryStrategy retryStrategy =
279-
RetryStrategies.fromConfiguration(cConf, Constants.Service.RUNTIME_MONITOR_RETRY_PREFIX);
280-
long startTs = System.currentTimeMillis();
281-
282-
AtomicBoolean launching = new AtomicBoolean(false);
283-
Retries.runWithRetries(
284-
() ->
285-
store.scanActiveRuns(
286-
batchSize,
287-
(runRecordDetail) -> {
288-
if (runRecordDetail.getStartTs() > startTs) {
289-
return;
290-
}
291-
try {
292-
if (runRecordDetail.getStatus() == ProgramRunStatus.PENDING) {
293-
launching.set(true);
294-
runRecordMonitorService.addRequest(runRecordDetail.getProgramRunId());
295-
} else if (runRecordDetail.getStatus() == ProgramRunStatus.STARTING) {
296-
launching.set(true);
297-
runRecordMonitorService.addRequest(runRecordDetail.getProgramRunId());
298-
// It is unknown what is the state of program runs in STARTING state.
299-
// A STARTING message is published again to retry STARTING logic.
300-
ProgramOptions programOptions =
301-
new SimpleProgramOptions(
302-
runRecordDetail.getProgramRunId().getParent(),
303-
new BasicArguments(runRecordDetail.getSystemArgs()),
304-
new BasicArguments(runRecordDetail.getUserArgs()));
305-
LOG.debug("Retrying to start run {}.", runRecordDetail.getProgramRunId());
306-
programStateWriter.start(
307-
runRecordDetail.getProgramRunId(),
308-
programOptions,
309-
null,
310-
this.store.loadProgram(runRecordDetail.getProgramRunId().getParent()));
311-
}
312-
} catch (Exception e) {
313-
ProgramRunId programRunId = runRecordDetail.getProgramRunId();
314-
LOG.warn(
315-
"Retrying to start run {} failed. Marking it as failed.", programRunId, e);
316-
programStateWriter.error(programRunId, e);
317-
}
318-
}),
319-
retryStrategy,
320-
e -> true);
321-
if (!launching.get()) {
322-
// there is no launching pipeline
323-
runRecordMonitorService.emitLaunchingMetrics(0);
324-
}
325326
}
326327

327328
@Nullable

cdap-app-fabric/src/main/java/io/cdap/cdap/internal/app/services/RunRecordMonitorService.java

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import io.cdap.cdap.common.app.RunIds;
2424
import io.cdap.cdap.common.conf.CConfiguration;
2525
import io.cdap.cdap.common.conf.Constants;
26+
import io.cdap.cdap.common.conf.Constants.Metrics.FlowControl;
2627
import io.cdap.cdap.proto.ProgramRunStatus;
2728
import io.cdap.cdap.proto.ProgramType;
2829
import io.cdap.cdap.proto.id.ProgramRunId;
@@ -184,15 +185,31 @@ public void removeRequest(ProgramRunId programRunId, boolean emitRunningChange)
184185
}
185186

186187
if (emitRunningChange) {
187-
emitMetrics(Constants.Metrics.FlowControl.RUNNING_COUNT, getProgramsRunningCount());
188+
emitRunningMetrics();
188189
}
189190
}
190191

191192
public void emitLaunchingMetrics(long value) {
192193
emitMetrics(Constants.Metrics.FlowControl.LAUNCHING_COUNT, value);
193194
}
194195

196+
/**
197+
* Emit the {@link Constants.Metrics.FlowControl#LAUNCHING_COUNT} metric for runs.
198+
*/
199+
public void emitLaunchingMetrics() {
200+
emitMetrics(Constants.Metrics.FlowControl.LAUNCHING_COUNT, launchingQueue.size());
201+
}
202+
203+
204+
/**
205+
* Emit the {@link Constants.Metrics.FlowControl#RUNNING_COUNT} metric for runs.
206+
*/
207+
public void emitRunningMetrics() {
208+
emitMetrics(FlowControl.RUNNING_COUNT, getProgramsRunningCount());
209+
}
210+
195211
private void emitMetrics(String metricName, long value) {
212+
LOG.debug("Setting metric {} to value {}", metricName, value);
196213
metricsCollectionService.getContext(Collections.emptyMap()).gauge(metricName, value);
197214
}
198215

@@ -208,11 +225,11 @@ private void cleanupQueue() {
208225
// Queue head might have already been removed. So instead of calling poll, we call remove.
209226
if (launchingQueue.remove(programRunId)) {
210227
LOG.info("Removing request with runId {} due to expired retention time.", programRunId);
211-
emitMetrics(Constants.Metrics.FlowControl.LAUNCHING_COUNT, launchingQueue.size());
212228
}
213229
}
214-
215-
emitMetrics(Constants.Metrics.FlowControl.RUNNING_COUNT, getProgramsRunningCount());
230+
// Always emit both metrics after cleanup.
231+
emitLaunchingMetrics();
232+
emitRunningMetrics();
216233
}
217234

218235
/**

cdap-app-fabric/src/test/java/io/cdap/cdap/internal/AppFabricTestHelper.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,15 @@ protected void configure() {
131131
});
132132
}
133133

134+
public static <T extends Service> T getService(Class<T> clazz) {
135+
for (Service service : services) {
136+
if (clazz.isAssignableFrom(service.getClass())) {
137+
return (T) service;
138+
}
139+
}
140+
return null;
141+
}
142+
134143
public static Injector getInjector(CConfiguration cConf, Module overrides) {
135144
return getInjector(cConf, null, overrides);
136145
}

cdap-app-fabric/src/test/java/io/cdap/cdap/internal/app/services/ProgramNotificationSubscriberServiceTest.java

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import io.cdap.cdap.common.app.RunIds;
3434
import io.cdap.cdap.common.conf.CConfiguration;
3535
import io.cdap.cdap.common.conf.Constants;
36+
import io.cdap.cdap.common.conf.Constants.Metrics.FlowControl;
3637
import io.cdap.cdap.common.id.Id;
3738
import io.cdap.cdap.common.utils.ProjectInfo;
3839
import io.cdap.cdap.common.utils.Tasks;
@@ -308,6 +309,67 @@ public void testMetricsEmit() throws Exception {
308309
metricStore.deleteAll();
309310
}
310311

312+
@Test
313+
public void testLaunchingCountMetricsOnRestart() throws Exception {
314+
AppFabricTestHelper.deployApplication(Id.Namespace.DEFAULT, ProgramStateWorkflowApp.class, null,
315+
cConf);
316+
ApplicationDetail appDetail = AppFabricTestHelper.getAppInfo(Id.Namespace.DEFAULT,
317+
ProgramStateWorkflowApp.class.getSimpleName(), cConf);
318+
319+
ProgramRunId workflowRunId = NamespaceId.DEFAULT
320+
.app(ProgramStateWorkflowApp.class.getSimpleName(), appDetail.getAppVersion())
321+
.workflow(ProgramStateWorkflowApp.ProgramStateWorkflow.class.getSimpleName())
322+
.run(RunIds.generate());
323+
324+
ApplicationSpecification appSpec = TransactionRunners.run(transactionRunner, context -> {
325+
return AppMetadataStore.create(context).getApplication(workflowRunId.getParent().getParent())
326+
.getSpec();
327+
});
328+
329+
ProgramDescriptor programDescriptor = new ProgramDescriptor(workflowRunId.getParent(), appSpec);
330+
331+
// Start and run the workflow
332+
Map<String, String> systemArgs = new HashMap<>();
333+
systemArgs.put(ProgramOptionConstants.SKIP_PROVISIONING, Boolean.TRUE.toString());
334+
systemArgs.put(SystemArguments.PROFILE_NAME, ProfileId.NATIVE.getScopedName());
335+
TransactionRunners.run(transactionRunner, context -> {
336+
programStateWriter.start(workflowRunId, new SimpleProgramOptions(workflowRunId.getParent(),
337+
new BasicArguments(systemArgs),
338+
new BasicArguments()), null, programDescriptor);
339+
});
340+
checkProgramStatus(appSpec.getArtifactId(), workflowRunId, ProgramRunStatus.STARTING);
341+
342+
ProgramNotificationSubscriberService notificationService = AppFabricTestHelper.getService(
343+
ProgramNotificationSubscriberService.class);
344+
// Restart the Notification service. We are not using the stopAndWait() because we don't want to
345+
// terminate the main service.
346+
notificationService.shutDown();
347+
notificationService.startUp();
348+
349+
MetricStore metricStore = injector.getInstance(MetricStore.class);
350+
// Wait for metrics to be written.
351+
Tasks.waitFor(1L, () -> queryMetrics(metricStore,
352+
SYSTEM_METRIC_PREFIX + FlowControl.LAUNCHING_COUNT, new HashMap<>()), 10, TimeUnit.SECONDS);
353+
Assert.assertEquals(0L, queryMetrics(metricStore,
354+
SYSTEM_METRIC_PREFIX + FlowControl.RUNNING_COUNT, new HashMap<>()));
355+
356+
TransactionRunners.run(transactionRunner, context -> {
357+
programStateWriter.running(workflowRunId, null);
358+
});
359+
checkProgramStatus(appSpec.getArtifactId(), workflowRunId, ProgramRunStatus.RUNNING);
360+
// Restart the Notification service. We are not using the stopAndWait() because we don't want to
361+
// terminate the main service.
362+
notificationService.shutDown();
363+
notificationService.startUp();
364+
// Running counts are not based on metadata store in RunRecordMonitorService so not asserting it
365+
// here.
366+
Tasks.waitFor(0L, () -> queryMetrics(metricStore,
367+
SYSTEM_METRIC_PREFIX + FlowControl.LAUNCHING_COUNT, new HashMap<>()), 10, TimeUnit.SECONDS);
368+
369+
// Cleanup metrics.
370+
metricStore.deleteAll();
371+
}
372+
311373
private Map<String, String> getAdditionalTagsForProgramMetrics(ProgramRunStatus existingStatus, String provisioner,
312374
ProgramRunClusterStatus clusterStatus) {
313375
Map<String, String> additionalTags = new HashMap<>();
@@ -460,8 +522,13 @@ private long getMetric(MetricStore metricStore, ProgramRunId programRunId, Profi
460522
.put(Constants.Metrics.Tag.PROGRAM, programRunId.getProgram())
461523
.putAll(additionalTags)
462524
.build();
525+
return queryMetrics(metricStore, metricName, tags);
526+
}
527+
528+
private long queryMetrics(MetricStore metricStore, String metricName,
529+
Map<String, String> tags) {
463530
MetricDataQuery query = new MetricDataQuery(0, 0, Integer.MAX_VALUE, metricName, AggregationFunction.SUM,
464-
tags, new ArrayList<>());
531+
tags, new ArrayList<>());
465532
Collection<MetricTimeSeries> result = metricStore.query(query);
466533
if (result.isEmpty()) {
467534
return 0;

0 commit comments

Comments
 (0)