Use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
The class CheckpointCoordinatorTest, method testCheckpointStatsTrackerRestoreCallback.
/**
 * Tests that the restore callbacks are called if registered.
 */
@Test
public void testCheckpointStatsTrackerRestoreCallback() throws Exception {
    ExecutionVertex vertex1 = mockExecutionVertex(new ExecutionAttemptID());

    StandaloneCompletedCheckpointStore store = new StandaloneCompletedCheckpointStore(1);
    store.addCheckpoint(new CompletedCheckpoint(
        new JobID(), 0, 0, 0,
        Collections.<JobVertexID, TaskState>emptyMap()));

    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinator(
        new JobID(),
        600000,
        600000,
        0,
        Integer.MAX_VALUE,
        ExternalizedCheckpointSettings.none(),
        new ExecutionVertex[] { vertex1 },
        new ExecutionVertex[] { vertex1 },
        new ExecutionVertex[] { vertex1 },
        new StandaloneCheckpointIDCounter(),
        store,
        null,
        Executors.directExecutor());

    CheckpointStatsTracker tracker = mock(CheckpointStatsTracker.class);
    coord.setCheckpointStatsTracker(tracker);

    assertTrue(coord.restoreLatestCheckpointedState(
        Collections.<JobVertexID, ExecutionJobVertex>emptyMap(), false, true));

    verify(tracker, times(1)).reportRestoredCheckpoint(any(RestoredCheckpointStats.class));
}
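The mockExecutionVertex helper used throughout these tests is defined elsewhere in CheckpointCoordinatorTest. As a rough orientation, a minimal Mockito sketch of such a helper could look like the following (assuming static imports of Mockito's mock/when); the stubbing in the real test class is richer, so treat the details here as assumptions:

    // Hypothetical stand-in for the test class's mockExecutionVertex helper:
    // builds an ExecutionVertex whose current execution attempt carries the
    // given attempt ID and reports itself as RUNNING, which is what the
    // coordinator's pre-checks in triggerCheckpoint require.
    private static ExecutionVertex mockExecutionVertex(ExecutionAttemptID attemptID) {
        Execution execution = mock(Execution.class);
        when(execution.getAttemptId()).thenReturn(attemptID);
        when(execution.getState()).thenReturn(ExecutionState.RUNNING);

        ExecutionVertex vertex = mock(ExecutionVertex.class);
        when(vertex.getCurrentExecutionAttempt()).thenReturn(execution);
        return vertex;
    }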
Use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
The class CheckpointCoordinatorTest, method testMaxConcurrentAttempsWithSubsumption.
@Test
public void testMaxConcurrentAttempsWithSubsumption() {
    try {
        final int maxConcurrentAttempts = 2;
        final JobID jid = new JobID();

        // create some mock execution vertices and trigger some checkpoints
        final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
        final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
        final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();
        ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
        ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
        ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

        CheckpointCoordinator coord = new CheckpointCoordinator(
            jid,
            10,                    // periodic interval is 10 ms
            200000,                // timeout is very long (200 s)
            0L,                    // no extra delay
            maxConcurrentAttempts, // max two concurrent checkpoints
            ExternalizedCheckpointSettings.none(),
            new ExecutionVertex[] { triggerVertex },
            new ExecutionVertex[] { ackVertex },
            new ExecutionVertex[] { commitVertex },
            new StandaloneCheckpointIDCounter(),
            new StandaloneCompletedCheckpointStore(2),
            null,
            Executors.directExecutor());

        coord.startCheckpointScheduler();

        // after a while, there should be exactly as many checkpoints
        // as concurrently permitted
        long now = System.currentTimeMillis();
        long timeout = now + 60000;
        long minDuration = now + 100;
        do {
            Thread.sleep(20);
        } while ((now = System.currentTimeMillis()) < minDuration ||
                (coord.getNumberOfPendingCheckpoints() < maxConcurrentAttempts && now < timeout));

        // validate that the pending checkpoints are there
        assertEquals(maxConcurrentAttempts, coord.getNumberOfPendingCheckpoints());
        assertNotNull(coord.getPendingCheckpoints().get(1L));
        assertNotNull(coord.getPendingCheckpoints().get(2L));

        // now we acknowledge the second checkpoint, which should subsume the
        // first checkpoint and allow two more checkpoints to be triggered
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID, 2L));

        // after a while, the new checkpoints should be there
        final long newTimeout = System.currentTimeMillis() + 60000;
        do {
            Thread.sleep(20);
        } while (coord.getPendingCheckpoints().get(4L) == null && System.currentTimeMillis() < newTimeout);

        // do the final check
        assertEquals(maxConcurrentAttempts, coord.getNumberOfPendingCheckpoints());
        assertNotNull(coord.getPendingCheckpoints().get(3L));
        assertNotNull(coord.getPendingCheckpoints().get(4L));

        coord.shutdown(JobStatus.FINISHED);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
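Subsumption here means that completing checkpoint 2 makes the still-pending checkpoint 1 obsolete, so the coordinator drops it rather than waiting for its acknowledgements. A small, hedged extension of the test could pin that down directly, assuming PendingCheckpoint.isDiscarded() (which the canceller in triggerCheckpoint, shown later on this page, also consults) reports the dropped state:

    // capture the first pending checkpoint before acknowledging the second one
    PendingCheckpoint first = coord.getPendingCheckpoints().get(1L);
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID, 2L));
    // checkpoint 2 completes and subsumes checkpoint 1, which is discarded
    assertTrue(first.isDiscarded());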
Use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
The class CheckpointCoordinatorTest, method testStopPeriodicScheduler.
@Test
public void testStopPeriodicScheduler() throws Exception {
    // create some mock execution vertices that receive the checkpoint trigger messages
    final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
    ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);

    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinator(
        new JobID(),
        600000,
        600000,
        0,
        Integer.MAX_VALUE,
        ExternalizedCheckpointSettings.none(),
        new ExecutionVertex[] { vertex1 },
        new ExecutionVertex[] { vertex1 },
        new ExecutionVertex[] { vertex1 },
        new StandaloneCheckpointIDCounter(),
        new StandaloneCompletedCheckpointStore(1),
        null,
        Executors.directExecutor());

    // a periodic trigger request is declined, because the periodic scheduler was never started
    CheckpointTriggerResult triggerResult = coord.triggerCheckpoint(
        System.currentTimeMillis(), CheckpointProperties.forStandardCheckpoint(), null, true);
    assertTrue(triggerResult.isFailure());
    assertEquals(CheckpointDeclineReason.PERIODIC_SCHEDULER_SHUTDOWN, triggerResult.getFailureReason());

    // a non-periodic trigger request still succeeds
    triggerResult = coord.triggerCheckpoint(
        System.currentTimeMillis(), CheckpointProperties.forStandardCheckpoint(), null, false);
    assertFalse(triggerResult.isFailure());
}
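For contrast, once startCheckpointScheduler() has been called, the periodicScheduling flag is set and a periodic trigger request passes the pre-check in triggerCheckpoint (shown later on this page). A hedged sketch of that positive case; it assumes the state configured above (no other pending checkpoints, minimum pause of 0, mocked vertices in RUNNING state):

    coord.startCheckpointScheduler();
    CheckpointTriggerResult periodicResult = coord.triggerCheckpoint(
        System.currentTimeMillis(), CheckpointProperties.forStandardCheckpoint(), null, true);
    assertFalse(periodicResult.isFailure());
    coord.stopCheckpointScheduler();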
Use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
The class CheckpointCoordinatorTest, method testPeriodicTriggering.
@Test
public void testPeriodicTriggering() {
    try {
        final JobID jid = new JobID();
        final long start = System.currentTimeMillis();

        // create some mock execution vertices and trigger some checkpoints
        final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
        final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
        final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();
        ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
        ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
        ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

        final AtomicInteger numCalls = new AtomicInteger();
        final Execution execution = triggerVertex.getCurrentExecutionAttempt();

        // verify on every trigger call that checkpoint IDs are strictly
        // increasing and timestamps are monotonic
        doAnswer(new Answer<Void>() {

            private long lastId = -1;
            private long lastTs = -1;

            @Override
            public Void answer(InvocationOnMock invocation) throws Throwable {
                long id = (Long) invocation.getArguments()[0];
                long ts = (Long) invocation.getArguments()[1];

                assertTrue(id > lastId);
                assertTrue(ts >= lastTs);
                assertTrue(ts >= start);

                lastId = id;
                lastTs = ts;
                numCalls.incrementAndGet();
                return null;
            }
        }).when(execution).triggerCheckpoint(anyLong(), anyLong(), any(CheckpointOptions.class));

        CheckpointCoordinator coord = new CheckpointCoordinator(
            jid,
            10,     // periodic interval is 10 ms
            200000, // timeout is very long (200 s)
            0,
            Integer.MAX_VALUE,
            ExternalizedCheckpointSettings.none(),
            new ExecutionVertex[] { triggerVertex },
            new ExecutionVertex[] { ackVertex },
            new ExecutionVertex[] { commitVertex },
            new StandaloneCheckpointIDCounter(),
            new StandaloneCompletedCheckpointStore(2),
            null,
            Executors.directExecutor());

        coord.startCheckpointScheduler();

        long timeout = System.currentTimeMillis() + 60000;
        do {
            Thread.sleep(20);
        } while (timeout > System.currentTimeMillis() && numCalls.get() < 5);
        assertTrue(numCalls.get() >= 5);

        coord.stopCheckpointScheduler();

        // for 400 ms, no further calls may come.
        // there may be the case that one trigger was fired and about to
        // acquire the lock, such that after cancelling it will still do
        // the remainder of its work
        int numCallsSoFar = numCalls.get();
        Thread.sleep(400);
        assertTrue(numCallsSoFar == numCalls.get() || numCallsSoFar + 1 == numCalls.get());

        // start another sequence of periodic scheduling
        numCalls.set(0);
        coord.startCheckpointScheduler();

        timeout = System.currentTimeMillis() + 60000;
        do {
            Thread.sleep(20);
        } while (timeout > System.currentTimeMillis() && numCalls.get() < 5);
        assertTrue(numCalls.get() >= 5);

        coord.stopCheckpointScheduler();

        // for 400 ms, no further calls may come
        // there may be the case that one trigger was fired and about to
        // acquire the lock, such that after cancelling it will still do
        // the remainder of its work
        numCallsSoFar = numCalls.get();
        Thread.sleep(400);
        assertTrue(numCallsSoFar == numCalls.get() || numCallsSoFar + 1 == numCalls.get());

        coord.shutdown(JobStatus.FINISHED);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
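The two poll-until loops above are identical; in a Java 8 codebase they could be factored into a small helper. This is a hypothetical refactoring sketch, not part of the Flink test class:

    // polls every 20 ms until the condition holds or the timeout elapses
    private static void waitUntil(java.util.function.BooleanSupplier condition, long timeoutMillis)
            throws InterruptedException {
        long deadline = System.currentTimeMillis() + timeoutMillis;
        do {
            Thread.sleep(20);
        } while (!condition.getAsBoolean() && System.currentTimeMillis() < deadline);
    }

Each loop then shrinks to a single call such as waitUntil(() -> numCalls.get() >= 5, 60000);.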
Use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
The class CheckpointCoordinator, method triggerCheckpoint.
@VisibleForTesting
CheckpointTriggerResult triggerCheckpoint(long timestamp, CheckpointProperties props, String targetDirectory, boolean isPeriodic) {

    // sanity check
    if (props.externalizeCheckpoint() && targetDirectory == null) {
        throw new IllegalStateException("No target directory specified to persist checkpoint to.");
    }

    // make some eager pre-checks
    synchronized (lock) {
        // abort if the coordinator has been shutdown in the meantime
        if (shutdown) {
            return new CheckpointTriggerResult(CheckpointDeclineReason.COORDINATOR_SHUTDOWN);
        }

        // don't allow periodic checkpoints if scheduling has been disabled
        if (isPeriodic && !periodicScheduling) {
            return new CheckpointTriggerResult(CheckpointDeclineReason.PERIODIC_SCHEDULER_SHUTDOWN);
        }

        // these checks are not relevant for savepoints
        if (!props.forceCheckpoint()) {
            // sanity check: there should never be more than one trigger request queued
            if (triggerRequestQueued) {
                LOG.warn("Trying to trigger another checkpoint while one was queued already");
                return new CheckpointTriggerResult(CheckpointDeclineReason.ALREADY_QUEUED);
            }

            // if too many checkpoints are currently in progress, we need to mark that a request is queued
            if (pendingCheckpoints.size() >= maxConcurrentCheckpointAttempts) {
                triggerRequestQueued = true;
                if (currentPeriodicTrigger != null) {
                    currentPeriodicTrigger.cancel(false);
                    currentPeriodicTrigger = null;
                }
                return new CheckpointTriggerResult(CheckpointDeclineReason.TOO_MANY_CONCURRENT_CHECKPOINTS);
            }

            // make sure the minimum interval between checkpoints has passed
            final long earliestNext = lastCheckpointCompletionNanos + minPauseBetweenCheckpointsNanos;
            final long durationTillNextMillis = (earliestNext - System.nanoTime()) / 1_000_000;
            if (durationTillNextMillis > 0) {
                if (currentPeriodicTrigger != null) {
                    currentPeriodicTrigger.cancel(false);
                    currentPeriodicTrigger = null;
                }
                // reassign the new trigger to the currentPeriodicTrigger
                currentPeriodicTrigger = timer.scheduleAtFixedRate(
                    new ScheduledTrigger(), durationTillNextMillis, baseInterval, TimeUnit.MILLISECONDS);
                return new CheckpointTriggerResult(CheckpointDeclineReason.MINIMUM_TIME_BETWEEN_CHECKPOINTS);
            }
        }
    }

    // check if all tasks that we need to trigger are running.
    // if not, abort the checkpoint
    Execution[] executions = new Execution[tasksToTrigger.length];
    for (int i = 0; i < tasksToTrigger.length; i++) {
        Execution ee = tasksToTrigger[i].getCurrentExecutionAttempt();
        if (ee != null && ee.getState() == ExecutionState.RUNNING) {
            executions[i] = ee;
        } else {
            LOG.info("Checkpoint triggering task {} is not being executed at the moment. Aborting checkpoint.",
                tasksToTrigger[i].getSimpleName());
            return new CheckpointTriggerResult(CheckpointDeclineReason.NOT_ALL_REQUIRED_TASKS_RUNNING);
        }
    }

    // next, check if all tasks that need to acknowledge the checkpoint are running.
    // if not, abort the checkpoint
    Map<ExecutionAttemptID, ExecutionVertex> ackTasks = new HashMap<>(tasksToWaitFor.length);
    for (ExecutionVertex ev : tasksToWaitFor) {
        Execution ee = ev.getCurrentExecutionAttempt();
        if (ee != null) {
            ackTasks.put(ee.getAttemptId(), ev);
        } else {
            LOG.info("Checkpoint acknowledging task {} is not being executed at the moment. Aborting checkpoint.",
                ev.getSimpleName());
            return new CheckpointTriggerResult(CheckpointDeclineReason.NOT_ALL_REQUIRED_TASKS_RUNNING);
        }
    }
    // take a special lock for the actual triggering, rather than the coordinator-wide
    // lock: assigning the checkpoint ID below may block (e.g. in HA mode), and this way
    // we avoid blocking the processing of 'acknowledge/decline' messages during that time.
    synchronized (triggerLock) {
        final long checkpointID;
        try {
            // this must happen outside the coordinator-wide lock, because it communicates
            // with external services (in HA mode) and may block for a while.
            checkpointID = checkpointIdCounter.getAndIncrement();
        } catch (Throwable t) {
            int numUnsuccessful = numUnsuccessfulCheckpointsTriggers.incrementAndGet();
            LOG.warn("Failed to trigger checkpoint (" + numUnsuccessful + " consecutive failed attempts so far)", t);
            return new CheckpointTriggerResult(CheckpointDeclineReason.EXCEPTION);
        }

        final PendingCheckpoint checkpoint = new PendingCheckpoint(
            job, checkpointID, timestamp, ackTasks, props, targetDirectory, executor);

        if (statsTracker != null) {
            PendingCheckpointStats callback = statsTracker.reportPendingCheckpoint(checkpointID, timestamp, props);
            checkpoint.setStatsCallback(callback);
        }

        // schedule the timer that will clean up the expired checkpoints
        final Runnable canceller = new Runnable() {
            @Override
            public void run() {
                synchronized (lock) {
                    // note that checkpoint completion discards the pending checkpoint object
                    if (!checkpoint.isDiscarded()) {
                        LOG.info("Checkpoint " + checkpointID + " expired before completing.");
                        checkpoint.abortExpired();
                        pendingCheckpoints.remove(checkpointID);
                        rememberRecentCheckpointId(checkpointID);
                        triggerQueuedRequests();
                    }
                }
            }
        };

        try {
            // re-acquire the coordinator-wide lock
            synchronized (lock) {
                // since we released the coordinator-wide lock in the meantime,
                // we need to re-check that the conditions still hold.
                if (shutdown) {
                    return new CheckpointTriggerResult(CheckpointDeclineReason.COORDINATOR_SHUTDOWN);
                } else if (!props.forceCheckpoint()) {
                    if (triggerRequestQueued) {
                        LOG.warn("Trying to trigger another checkpoint while one was queued already");
                        return new CheckpointTriggerResult(CheckpointDeclineReason.ALREADY_QUEUED);
                    }

                    if (pendingCheckpoints.size() >= maxConcurrentCheckpointAttempts) {
                        triggerRequestQueued = true;
                        if (currentPeriodicTrigger != null) {
                            currentPeriodicTrigger.cancel(false);
                            currentPeriodicTrigger = null;
                        }
                        return new CheckpointTriggerResult(CheckpointDeclineReason.TOO_MANY_CONCURRENT_CHECKPOINTS);
                    }

                    // make sure the minimum interval between checkpoints has passed
                    final long earliestNext = lastCheckpointCompletionNanos + minPauseBetweenCheckpointsNanos;
                    final long durationTillNextMillis = (earliestNext - System.nanoTime()) / 1_000_000;
                    if (durationTillNextMillis > 0) {
                        if (currentPeriodicTrigger != null) {
                            currentPeriodicTrigger.cancel(false);
                            currentPeriodicTrigger = null;
                        }
                        // reassign the new trigger to the currentPeriodicTrigger
                        currentPeriodicTrigger = timer.scheduleAtFixedRate(
                            new ScheduledTrigger(), durationTillNextMillis, baseInterval, TimeUnit.MILLISECONDS);
                        return new CheckpointTriggerResult(CheckpointDeclineReason.MINIMUM_TIME_BETWEEN_CHECKPOINTS);
                    }
                }

                LOG.info("Triggering checkpoint " + checkpointID + " @ " + timestamp);

                pendingCheckpoints.put(checkpointID, checkpoint);

                ScheduledFuture<?> cancellerHandle = timer.schedule(canceller, checkpointTimeout, TimeUnit.MILLISECONDS);
                if (!checkpoint.setCancellerHandle(cancellerHandle)) {
                    // checkpoint is already disposed!
                    cancellerHandle.cancel(false);
                }
            }
            // end of lock scope

            CheckpointOptions checkpointOptions;
            if (!props.isSavepoint()) {
                checkpointOptions = CheckpointOptions.forFullCheckpoint();
            } else {
                checkpointOptions = CheckpointOptions.forSavepoint(targetDirectory);
            }

            // send the messages to the tasks that trigger their checkpoint
            for (Execution execution : executions) {
                execution.triggerCheckpoint(checkpointID, timestamp, checkpointOptions);
            }

            numUnsuccessfulCheckpointsTriggers.set(0);
            return new CheckpointTriggerResult(checkpoint);
        } catch (Throwable t) {
            // guard the map against concurrent modifications
            synchronized (lock) {
                pendingCheckpoints.remove(checkpointID);
            }

            int numUnsuccessful = numUnsuccessfulCheckpointsTriggers.incrementAndGet();
            LOG.warn("Failed to trigger checkpoint (" + numUnsuccessful + " consecutive failed attempts so far)", t);

            if (!checkpoint.isDiscarded()) {
                checkpoint.abortError(new Exception("Failed to trigger checkpoint"));
            }
            return new CheckpointTriggerResult(CheckpointDeclineReason.EXCEPTION);
        }
    }
    // end trigger lock
}
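The ScheduledTrigger scheduled via timer.scheduleAtFixedRate above is what feeds periodic requests back into this method. Its actual implementation lives elsewhere in CheckpointCoordinator; a plausible sketch, assuming a convenience overload triggerCheckpoint(timestamp, isPeriodic) that fills in standard checkpoint properties:

    private final class ScheduledTrigger implements Runnable {
        @Override
        public void run() {
            try {
                // periodic triggers run through the same pre-checks shown above,
                // including the periodicScheduling and max-concurrency gates
                triggerCheckpoint(System.currentTimeMillis(), true);
            } catch (Exception e) {
                LOG.error("Exception while triggering checkpoint", e);
            }
        }
    }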