use of org.apache.hadoop.yarn.util.SystemClock in project apex-core by apache.
the class CheckpointTest method testUpdateRecoveryCheckpoint.
/**
 * Checks how recovery checkpoints are computed for a linear plan o1 -> o2 -> o3SL, where o3SL
 * is added as a {@code StatelessOperator}.
 *
 * <p>Stages covered by the assertions:
 * <ul>
 * <li>a fresh plan reports {@code Checkpoint.INITIAL_CHECKPOINT} and empty checkpoint lists;</li>
 * <li>while the operators are still {@code PENDING_DEPLOY}, the recovery checkpoint stays at the
 * initial value regardless of which checkpoints have been recorded;</li>
 * <li>once all operators are {@code ACTIVE}, the recovery checkpoint advances and checkpoints
 * older than it are dropped from the operator's list;</li>
 * <li>{@code addCheckpoint} keeps the checkpoint list ordered by window id and ignores
 * duplicates.</li>
 * </ul>
 */
@Test
public void testUpdateRecoveryCheckpoint() throws Exception {
  Clock clock = new SystemClock();
  dag.setAttribute(com.datatorrent.api.Context.OperatorContext.STORAGE_AGENT, new MemoryStorageAgent());

  GenericTestOperator o1 = dag.addOperator("o1", GenericTestOperator.class);
  GenericTestOperator o2 = dag.addOperator("o2", GenericTestOperator.class);
  GenericTestOperator o3SL = dag.addOperator("o3SL", StatelessOperator.class);
  dag.addStream("o1.output1", o1.outport1, o2.inport1);
  dag.addStream("o2.output1", o2.outport1, o3SL.inport1);

  StreamingContainerManager dnm = new StreamingContainerManager(dag);
  PhysicalPlan plan = dnm.getPhysicalPlan();

  // a freshly built plan has no checkpoints and starts from the initial checkpoint
  for (PTOperator oper : plan.getAllOperators().values()) {
    Assert.assertEquals("activation windowId " + oper, Checkpoint.INITIAL_CHECKPOINT, oper.getRecoveryCheckpoint());
    Assert.assertEquals("checkpoints " + oper, Collections.emptyList(), oper.checkpoints);
  }

  List<PTOperator> nodes1 = plan.getOperators(dag.getMeta(o1));
  Assert.assertNotNull(nodes1);
  Assert.assertEquals(1, nodes1.size());
  PTOperator o1p1 = nodes1.get(0);
  PTOperator o2p1 = plan.getOperators(dag.getMeta(o2)).get(0);
  PTOperator o3SLp1 = plan.getOperators(dag.getMeta(o3SL)).get(0);

  // recovery checkpoint won't update in deploy state
  for (PTOperator oper : plan.getAllOperators().values()) {
    Assert.assertEquals("", PTOperator.State.PENDING_DEPLOY, oper.getState());
  }

  dnm.updateRecoveryCheckpoints(o2p1, new UpdateCheckpointsContext(clock), false);
  Assert.assertEquals("no checkpoints " + o2p1, Checkpoint.INITIAL_CHECKPOINT, o2p1.getRecoveryCheckpoint());

  // starting from o1 the update is expected to visit all three operators
  UpdateCheckpointsContext ctx = new UpdateCheckpointsContext(clock);
  dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
  Assert.assertEquals("no checkpoints " + o1p1, Checkpoint.INITIAL_CHECKPOINT, o1p1.getRecoveryCheckpoint());
  Assert.assertEquals("number dependencies " + ctx.visited, 3, ctx.visited.size());

  // adding checkpoints to upstream only does not move recovery checkpoint
  Checkpoint cp3 = new Checkpoint(3L, 0, 0);
  Checkpoint cp5 = new Checkpoint(5L, 0, 0);
  Checkpoint cp4 = new Checkpoint(4L, 0, 0);

  o1p1.checkpoints.add(cp3);
  o1p1.checkpoints.add(cp5);
  dnm.updateRecoveryCheckpoints(o1p1, new UpdateCheckpointsContext(clock), false);
  Assert.assertEquals("checkpoint " + o1p1, Checkpoint.INITIAL_CHECKPOINT, o1p1.getRecoveryCheckpoint());

  o2p1.checkpoints.add(new Checkpoint(3L, 0, 0));
  dnm.updateRecoveryCheckpoints(o1p1, new UpdateCheckpointsContext(clock), false);
  Assert.assertEquals("checkpoint " + o1p1, Checkpoint.INITIAL_CHECKPOINT, o1p1.getRecoveryCheckpoint());
  Assert.assertEquals("checkpoint " + o2p1, Checkpoint.INITIAL_CHECKPOINT, o2p1.getRecoveryCheckpoint());

  // set leaf operator checkpoint
  dnm.addCheckpoint(o3SLp1, cp5);
  dnm.updateRecoveryCheckpoints(o1p1, new UpdateCheckpointsContext(clock), false);
  Assert.assertEquals("checkpoint " + o1p1, Checkpoint.INITIAL_CHECKPOINT, o1p1.getRecoveryCheckpoint());
  Assert.assertEquals("checkpoint " + o2p1, Checkpoint.INITIAL_CHECKPOINT, o2p1.getRecoveryCheckpoint());

  // set all operators as active to enable recovery window id update
  for (PTOperator oper : plan.getAllOperators().values()) {
    oper.setState(PTOperator.State.ACTIVE);
  }
  dnm.updateRecoveryCheckpoints(o1p1, new UpdateCheckpointsContext(clock), false);
  Assert.assertEquals("checkpoint " + o1p1, cp3, o1p1.getRecoveryCheckpoint());
  // verify o2 itself here; previously this line re-checked o1 under an o2 label
  Assert.assertEquals("checkpoint " + o2p1, cp3, o2p1.getRecoveryCheckpoint());
  Assert.assertEquals("checkpoint " + o3SLp1, cp5, o3SLp1.getRecoveryCheckpoint());
  Assert.assertNull("checkpoint null for stateless operator " + o3SLp1, o3SLp1.stats.checkpointStats);

  // a newer checkpoint on o2 alone moves o2 forward while o1 stays at window 3
  o2p1.checkpoints.add(cp4);
  dnm.updateRecoveryCheckpoints(o1p1, new UpdateCheckpointsContext(clock), false);
  Assert.assertEquals("checkpoint " + o1p1, cp3, o1p1.getRecoveryCheckpoint());
  Assert.assertEquals("checkpoint " + o2p1, cp4, o2p1.getRecoveryCheckpoint());

  // once o1 also has window 4, it advances and the older checkpoint 3 is dropped from its list
  o1p1.checkpoints.add(1, cp4);
  Assert.assertEquals(getCheckpoints(3L, 4L, 5L), o1p1.checkpoints);
  dnm.updateRecoveryCheckpoints(o1p1, new UpdateCheckpointsContext(clock), false);
  Assert.assertEquals("checkpoint " + o1p1, cp4, o1p1.getRecoveryCheckpoint());
  Assert.assertEquals(getCheckpoints(4L, 5L), o1p1.checkpoints);

  // out of sequence windowIds should be sorted
  dnm.addCheckpoint(o2p1, new Checkpoint(2L, 0, 0));
  Assert.assertEquals("add first", getCheckpoints(2L, 4L), o2p1.checkpoints);

  dnm.addCheckpoint(o2p1, new Checkpoint(3L, 0, 0));
  Assert.assertEquals("add middle", getCheckpoints(2L, 3L, 4L), o2p1.checkpoints);

  dnm.addCheckpoint(o2p1, new Checkpoint(4L, 0, 0));
  Assert.assertEquals("ignore duplicate", getCheckpoints(2L, 3L, 4L), o2p1.checkpoints);

  dnm.addCheckpoint(o2p1, new Checkpoint(5L, 0, 0));
  Assert.assertEquals("add latest", getCheckpoints(2L, 3L, 4L, 5L), o2p1.checkpoints);
}
use of org.apache.hadoop.yarn.util.SystemClock in project apex-core by apache.
the class CheckpointTest method testUpdateRecoveryCheckpointWithCycle.
/**
 * Checks recovery checkpoint computation on a DAG that contains a loop:
 * o1 -> o2 -> o3 -> o4 -> d -> o2, with o3 configured with a {@code StatelessPartitioner} of 2.
 *
 * <p>Every physical operator is activated and given checkpoint 1. The input operator o1 then
 * receives checkpoint 2, and the test asserts that o1's recovery checkpoint is still
 * checkpoint 1 after the update.
 */
@Test
public void testUpdateRecoveryCheckpointWithCycle() throws Exception {
  Clock clock = new SystemClock();
  dag.setAttribute(com.datatorrent.api.Context.OperatorContext.STORAGE_AGENT, new MemoryStorageAgent());

  // Simulate a DAG with a loop which has a unifier operator
  TestGeneratorInputOperator o1 = dag.addOperator("o1", TestGeneratorInputOperator.class);
  GenericTestOperator o2 = dag.addOperator("o2", GenericTestOperator.class);
  GenericTestOperator o3 = dag.addOperator("o3", GenericTestOperator.class);
  GenericTestOperator o4 = dag.addOperator("o4", GenericTestOperator.class);
  DefaultDelayOperator d = dag.addOperator("d", DefaultDelayOperator.class);
  dag.addStream("o1.output1", o1.outport, o2.inport1);
  dag.addStream("o2.output1", o2.outport1, o3.inport1);
  dag.addStream("o3.output1", o3.outport1, o4.inport1);
  dag.addStream("o4.output1", o4.outport1, d.input);
  // the delay operator feeds back into o2, closing the loop
  dag.addStream("d.output", d.output, o2.inport2);
  dag.setOperatorAttribute(o3, Context.OperatorContext.PARTITIONER, new StatelessPartitioner<Operator>(2));
  dag.validate();

  StreamingContainerManager dnm = new StreamingContainerManager(dag);
  PhysicalPlan plan = dnm.getPhysicalPlan();
  for (PTOperator oper : plan.getAllOperators().values()) {
    Assert.assertEquals("Initial activation windowId" + oper, Checkpoint.INITIAL_CHECKPOINT, oper.getRecoveryCheckpoint());
    Assert.assertEquals("Checkpoints empty" + oper, Collections.emptyList(), oper.checkpoints);
  }

  Checkpoint cp1 = new Checkpoint(1L, 0, 0);
  Checkpoint cp2 = new Checkpoint(2L, 0, 0);

  Map<OperatorMeta, Set<OperatorMeta>> checkpointGroups = dnm.getCheckpointGroups();

  // activate every physical operator and give each one checkpoint 1
  Map<Integer, PTOperator> allOperators = plan.getAllOperators();
  for (PTOperator operator : allOperators.values()) {
    operator.setState(PTOperator.State.ACTIVE);
    operator.checkpoints.add(cp1);
    dnm.updateRecoveryCheckpoints(operator, new UpdateCheckpointsContext(clock, false, checkpointGroups), false);
  }

  // only o1 gets checkpoint 2; its recovery checkpoint must remain at checkpoint 1
  List<PTOperator> physicalO1 = plan.getOperators(dag.getOperatorMeta("o1"));
  physicalO1.get(0).checkpoints.add(cp2);
  dnm.updateRecoveryCheckpoints(physicalO1.get(0), new UpdateCheckpointsContext(clock, false, checkpointGroups), false);
  // expected value first so a failure message labels the two values correctly
  Assert.assertEquals("Recovery checkpoint updated ", cp1, physicalO1.get(0).getRecoveryCheckpoint());
}
use of org.apache.hadoop.yarn.util.SystemClock in project hive by apache.
the class TestTaskExecutorService method runPreemptionGraceTest.
/**
 * Drives a scenario in which two running requests occupy the executor and a third request is
 * scheduled afterwards, with the executor's clock under test control.
 *
 * <p>Both victims are configured via {@code setSleepAfterKill()} before being scheduled. After
 * the preemptor is scheduled, the controlled clock is moved to {@code time}, the victims' kills
 * are unblocked, and the method waits for the preemptor to finish.
 *
 * @param victim1 first request scheduled ahead of the preemptor
 * @param victim2 second request scheduled ahead of the preemptor
 * @param time value the controlled clock is set to after the preemptor has been scheduled
 * @throws InterruptedException if one of the waits is interrupted
 */
private void runPreemptionGraceTest(MockRequest victim1, MockRequest victim2, int time) throws InterruptedException {
  MockRequest preemptor = createMockRequest(3, 1, 100, 100, true, 20000L, false);
  victim1.setSleepAfterKill();
  victim2.setSleepAfterKill();

  // start from a known time so the later jump to 'time' is the only clock movement
  ControlledClock testClock = new ControlledClock(new SystemClock());
  testClock.setTime(0);

  TaskExecutorServiceForTest executor =
      new TaskExecutorServiceForTest(2, 3, ShortestJobFirstComparator.class.getName(), true, testClock);
  executor.init(new Configuration());
  executor.start();

  try {
    // get both victims running before the preemptor shows up
    executor.schedule(victim1);
    awaitStartAndSchedulerRun(victim1, executor);
    executor.schedule(victim2);
    awaitStartAndSchedulerRun(victim2, executor);

    executor.schedule(preemptor);
    // Let the scheduler make a few passes before moving the clock.
    executor.waitForScheduleRuns(5);

    testClock.setTime(time);
    // Let the scheduler make a few passes with the new time.
    executor.waitForScheduleRuns(5);

    victim1.unblockKill();
    victim2.unblockKill();

    preemptor.complete();
    preemptor.awaitEnd();
    executor.getInternalCompletionListenerForTest(preemptor.getRequestId()).awaitCompletion();
  } finally {
    executor.shutDown(false);
  }
}
use of org.apache.hadoop.yarn.util.SystemClock in project hive by apache.
the class LlapSliderUtils method getAppReport.
/**
 * Looks up the application instance named {@code appName}, polling until it is found or the
 * timeout runs out.
 *
 * <p>Between unsuccessful lookups the method sleeps for the remaining time, capped at 500 ms.
 *
 * @param appName name of the application instance to look for
 * @param sliderClient client whose YARN application list is queried
 * @param timeoutMs negative to keep polling until an instance is found, {@code 0} to perform a
 *     single lookup, otherwise the maximum time to keep polling, in milliseconds
 * @return the report that was found, or {@code null} if none was found within the timeout
 * @throws LlapStatusServiceDriver.LlapStatusCliException with {@code YARN_ERROR} if the lookup
 *     fails or the wait is interrupted; on interruption the thread's interrupt flag is restored
 */
public static ApplicationReport getAppReport(String appName, SliderClient sliderClient, long timeoutMs) throws LlapStatusServiceDriver.LlapStatusCliException {
  Clock clock = new SystemClock();
  long startTime = clock.getTime();
  // Saturate instead of overflowing: a very large timeout must behave like "no deadline",
  // not wrap around to a deadline in the past.
  long timeoutTime = (timeoutMs < 0 || timeoutMs > Long.MAX_VALUE - startTime)
      ? Long.MAX_VALUE : (startTime + timeoutMs);
  ApplicationReport appReport = null;
  while (appReport == null) {
    try {
      appReport = sliderClient.getYarnAppListClient().findInstance(appName);
      if (timeoutMs == 0) {
        // break immediately if timeout is 0
        break;
      }
      // Otherwise sleep, and try again.
      if (appReport == null) {
        long remainingTime = Math.min(timeoutTime - clock.getTime(), 500L);
        if (remainingTime > 0) {
          Thread.sleep(remainingTime);
        } else {
          // deadline reached without finding the instance
          break;
        }
      }
    } catch (Exception e) {
      if (e instanceof InterruptedException) {
        // The exception is wrapped below, so restore the flag for code further up the stack.
        Thread.currentThread().interrupt();
      }
      // No point separating IOException vs YarnException vs others
      throw new LlapStatusServiceDriver.LlapStatusCliException(LlapStatusServiceDriver.ExitCode.YARN_ERROR, "Failed to get Yarn AppReport", e);
    }
  }
  return appReport;
}
use of org.apache.hadoop.yarn.util.SystemClock in project hive by apache.
the class LlapStatusServiceDriver method main.
/**
 * Command-line entry point for the LLAP status check.
 *
 * <p>Parses the options, then runs the status check once or, in watch mode, repeatedly
 * (sleeping for the refresh interval between attempts) until the application reaches a running
 * state that satisfies the running-nodes threshold, a non-retryable result is returned, or the
 * attempts derived from the watch timeout are used up. The final status is written as JSON to
 * the configured output file, or to standard output if none is configured. The process always
 * terminates through {@code System.exit} with the resulting exit code.
 *
 * @param args command-line arguments, passed to the option parser
 */
public static void main(String[] args) {
  LOG.info("LLAP status invoked with arguments = {}", Arrays.toString(args));
  int ret = ExitCode.SUCCESS.getInt();
  Clock clock = new SystemClock();
  long lastSummaryLogTime = -1;
  LlapStatusServiceDriver statusServiceDriver = null;
  LlapStatusOptions options = null;
  try {
    statusServiceDriver = new LlapStatusServiceDriver();
    options = statusServiceDriver.parseOptions(args);
  } catch (Throwable t) {
    // Do not close the driver here: it is still null if the constructor threw, and the
    // failure branch right below closes it exactly once behind a null check.
    logError(t);
    if (t instanceof LlapStatusCliException) {
      LlapStatusCliException ce = (LlapStatusCliException) t;
      ret = ce.getExitCode().getInt();
    } else {
      ret = ExitCode.INTERNAL_ERROR.getInt();
    }
  }
  if (ret != 0 || options == null) {
    // Failure / help
    if (statusServiceDriver != null) {
      statusServiceDriver.close();
    }
    System.exit(ret);
  }
  boolean firstAttempt = true;
  final long refreshInterval = options.getRefreshIntervalMs();
  final boolean watchMode = options.isWatchMode();
  final long watchTimeout = options.getWatchTimeoutMs();
  // NOTE(review): a refresh interval of 0 would make this division throw; presumably option
  // parsing guarantees a positive value - confirm.
  long numAttempts = watchTimeout / refreshInterval;
  // Break out of the loop fast if watchMode is disabled.
  numAttempts = watchMode ? numAttempts : 1;
  LlapStatusHelpers.State launchingState = null;
  LlapStatusHelpers.State currentState = null;
  boolean desiredStateAttained = false;
  final float runningNodesThreshold = options.getRunningNodesThreshold();
  try (OutputStream os = options.getOutputFile() == null ? System.out : new BufferedOutputStream(new FileOutputStream(options.getOutputFile()));
      PrintWriter pw = new PrintWriter(os)) {
    LOG.info("Configured refresh interval: {}s. Watch timeout: {}s. Attempts remaining: {}." + " Watch mode: {}. Running nodes threshold: {}.", TimeUnit.SECONDS.convert(refreshInterval, TimeUnit.MILLISECONDS), TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS), numAttempts, watchMode, new DecimalFormat("#.###").format(runningNodesThreshold));
    while (numAttempts > 0) {
      if (!firstAttempt) {
        if (watchMode) {
          try {
            Thread.sleep(refreshInterval);
          } catch (InterruptedException e) {
            // ignore
          }
        } else {
          // reported once, so break
          break;
        }
      } else {
        firstAttempt = false;
      }
      ret = statusServiceDriver.run(options, watchMode ? watchTimeout : 0);
      currentState = statusServiceDriver.appStatusBuilder.getState();
      try {
        lastSummaryLogTime = LlapStatusServiceDriver.maybeLogSummary(clock, lastSummaryLogTime, statusServiceDriver, watchMode, watchTimeout, launchingState);
      } catch (Exception e) {
        // summary logging is best effort; a failure here must not abort the status check
        LOG.warn("Failed to log summary", e);
      }
      if (ret == ExitCode.SUCCESS.getInt()) {
        if (watchMode) {
          // slider has started llap application, now if for some reason state changes to COMPLETE then fail fast
          if (launchingState == null && LAUNCHING_STATES.contains(currentState)) {
            launchingState = currentState;
          }
          if (currentState.equals(State.COMPLETE)) {
            if (launchingState != null || options.isLaunched()) {
              LOG.warn("COMPLETE state reached while waiting for RUNNING state. Failing.");
              System.err.println("Final diagnostics: " + statusServiceDriver.appStatusBuilder.getDiagnostics());
              break;
            } else {
              LOG.info("Found a stopped application; assuming it was a previous attempt " + "and waiting for the next one. Omit the -l flag to avoid this.");
            }
          }
          if (!(currentState.equals(State.RUNNING_PARTIAL) || currentState.equals(State.RUNNING_ALL))) {
            if (LOG.isDebugEnabled()) {
              LOG.debug("Current state: {}. Desired state: {}. {}/{} instances.", currentState, runningNodesThreshold == 1.0f ? State.RUNNING_ALL : State.RUNNING_PARTIAL, statusServiceDriver.appStatusBuilder.getLiveInstances(), statusServiceDriver.appStatusBuilder.getDesiredInstances());
            }
            numAttempts--;
            continue;
          }
          // we have reached RUNNING state, now check if running nodes threshold is met
          final int liveInstances = statusServiceDriver.appStatusBuilder.getLiveInstances();
          final int desiredInstances = statusServiceDriver.appStatusBuilder.getDesiredInstances();
          if (desiredInstances > 0) {
            final float ratio = (float) liveInstances / (float) desiredInstances;
            if (ratio < runningNodesThreshold) {
              if (LOG.isDebugEnabled()) {
                LOG.debug("Waiting until running nodes threshold is reached. Current: {} Desired: {}." + " {}/{} instances.", new DecimalFormat("#.###").format(ratio), new DecimalFormat("#.###").format(runningNodesThreshold), statusServiceDriver.appStatusBuilder.getLiveInstances(), statusServiceDriver.appStatusBuilder.getDesiredInstances());
              }
              numAttempts--;
              continue;
            } else {
              desiredStateAttained = true;
              statusServiceDriver.appStatusBuilder.setRunningThresholdAchieved(true);
            }
          } else {
            // no desired instances reported yet, so there is nothing to compare against
            numAttempts--;
            continue;
          }
        }
      } else if (ret == ExitCode.YARN_ERROR.getInt() && watchMode) {
        LOG.warn("Watch mode enabled and got YARN error. Retrying..");
        numAttempts--;
        continue;
      } else if (ret == ExitCode.SLIDER_CLIENT_ERROR_CREATE_FAILED.getInt() && watchMode) {
        LOG.warn("Watch mode enabled and slider client creation failed. Retrying..");
        numAttempts--;
        continue;
      } else if (ret == ExitCode.SLIDER_CLIENT_ERROR_OTHER.getInt() && watchMode) {
        LOG.warn("Watch mode enabled and got slider client error. Retrying..");
        numAttempts--;
        continue;
      } else if (ret == ExitCode.LLAP_REGISTRY_ERROR.getInt() && watchMode) {
        LOG.warn("Watch mode enabled and got LLAP registry error. Retrying..");
        numAttempts--;
        continue;
      }
      // success outside watch mode, desired state attained, or an error that is not retried
      break;
    }
    // Log final state to CONSOLE_LOGGER
    LlapStatusServiceDriver.maybeLogSummary(clock, 0L, statusServiceDriver, watchMode, watchTimeout, launchingState);
    CONSOLE_LOGGER.info("\n\n\n");
    // print current state before exiting
    statusServiceDriver.outputJson(pw);
    os.flush();
    pw.flush();
    if (numAttempts == 0 && watchMode && !desiredStateAttained) {
      LOG.warn("Watch timeout {}s exhausted before desired state RUNNING is attained.", TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS));
    }
  } catch (Throwable t) {
    logError(t);
    if (t instanceof LlapStatusCliException) {
      LlapStatusCliException ce = (LlapStatusCliException) t;
      ret = ce.getExitCode().getInt();
    } else {
      ret = ExitCode.INTERNAL_ERROR.getInt();
    }
  } finally {
    LOG.info("LLAP status finished");
    statusServiceDriver.close();
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Completed processing - exiting with " + ret);
  }
  System.exit(ret);
}
Aggregations