Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.
The class CheckpointCoordinator, method triggerCheckpoint.
@VisibleForTesting
CheckpointTriggerResult triggerCheckpoint(long timestamp, CheckpointProperties props, String targetDirectory, boolean isPeriodic) {
// Sanity check
if (props.externalizeCheckpoint() && targetDirectory == null) {
throw new IllegalStateException("No target directory specified to persist checkpoint to.");
}
// make some eager pre-checks
synchronized (lock) {
// abort if the coordinator has been shutdown in the meantime
if (shutdown) {
return new CheckpointTriggerResult(CheckpointDeclineReason.COORDINATOR_SHUTDOWN);
}
// Don't allow periodic checkpoint if scheduling has been disabled
if (isPeriodic && !periodicScheduling) {
return new CheckpointTriggerResult(CheckpointDeclineReason.PERIODIC_SCHEDULER_SHUTDOWN);
}
// these checks are not relevant for savepoints
if (!props.forceCheckpoint()) {
// sanity check: there should never be more than one trigger request queued
if (triggerRequestQueued) {
LOG.warn("Trying to trigger another checkpoint while one was queued already");
return new CheckpointTriggerResult(CheckpointDeclineReason.ALREADY_QUEUED);
}
// if too many checkpoints are currently in progress, we need to mark that a request is queued
if (pendingCheckpoints.size() >= maxConcurrentCheckpointAttempts) {
triggerRequestQueued = true;
if (currentPeriodicTrigger != null) {
currentPeriodicTrigger.cancel(false);
currentPeriodicTrigger = null;
}
return new CheckpointTriggerResult(CheckpointDeclineReason.TOO_MANY_CONCURRENT_CHECKPOINTS);
}
// make sure the minimum interval between checkpoints has passed
final long earliestNext = lastCheckpointCompletionNanos + minPauseBetweenCheckpointsNanos;
final long durationTillNextMillis = (earliestNext - System.nanoTime()) / 1_000_000;
if (durationTillNextMillis > 0) {
if (currentPeriodicTrigger != null) {
currentPeriodicTrigger.cancel(false);
currentPeriodicTrigger = null;
}
// reschedule the periodic trigger so that the next attempt starts only after the minimum pause
currentPeriodicTrigger = timer.scheduleAtFixedRate(new ScheduledTrigger(), durationTillNextMillis, baseInterval, TimeUnit.MILLISECONDS);
return new CheckpointTriggerResult(CheckpointDeclineReason.MINIMUM_TIME_BETWEEN_CHECKPOINTS);
}
}
}
// check if all tasks that we need to trigger are running.
// if not, abort the checkpoint
Execution[] executions = new Execution[tasksToTrigger.length];
for (int i = 0; i < tasksToTrigger.length; i++) {
Execution ee = tasksToTrigger[i].getCurrentExecutionAttempt();
if (ee != null && ee.getState() == ExecutionState.RUNNING) {
executions[i] = ee;
} else {
LOG.info("Checkpoint triggering task {} is not being executed at the moment. Aborting checkpoint.", tasksToTrigger[i].getSimpleName());
return new CheckpointTriggerResult(CheckpointDeclineReason.NOT_ALL_REQUIRED_TASKS_RUNNING);
}
}
// next, check if all tasks that need to acknowledge the checkpoint are running.
// if not, abort the checkpoint
Map<ExecutionAttemptID, ExecutionVertex> ackTasks = new HashMap<>(tasksToWaitFor.length);
for (ExecutionVertex ev : tasksToWaitFor) {
Execution ee = ev.getCurrentExecutionAttempt();
if (ee != null) {
ackTasks.put(ee.getAttemptId(), ev);
} else {
LOG.info("Checkpoint acknowledging task {} is not being executed at the moment. Aborting checkpoint.", ev.getSimpleName());
return new CheckpointTriggerResult(CheckpointDeclineReason.NOT_ALL_REQUIRED_TASKS_RUNNING);
}
}
// we acquire a dedicated trigger lock (not the coordinator-wide lock), because obtaining the
// checkpoint ID below may block; this way we avoid blocking the processing of
// 'acknowledge/decline' messages during that time.
synchronized (triggerLock) {
final long checkpointID;
try {
// this must happen outside the coordinator-wide lock, because it communicates
// with external services (in HA mode) and may block for a while.
checkpointID = checkpointIdCounter.getAndIncrement();
} catch (Throwable t) {
int numUnsuccessful = numUnsuccessfulCheckpointsTriggers.incrementAndGet();
LOG.warn("Failed to trigger checkpoint (" + numUnsuccessful + " consecutive failed attempts so far)", t);
return new CheckpointTriggerResult(CheckpointDeclineReason.EXCEPTION);
}
final PendingCheckpoint checkpoint = new PendingCheckpoint(job, checkpointID, timestamp, ackTasks, props, targetDirectory, executor);
if (statsTracker != null) {
PendingCheckpointStats callback = statsTracker.reportPendingCheckpoint(checkpointID, timestamp, props);
checkpoint.setStatsCallback(callback);
}
// define the canceller that cleans up the checkpoint if it expires; it is scheduled on the timer further below
final Runnable canceller = new Runnable() {
@Override
public void run() {
synchronized (lock) {
// note that checkpoint completion discards the pending checkpoint object
if (!checkpoint.isDiscarded()) {
LOG.info("Checkpoint " + checkpointID + " expired before completing.");
checkpoint.abortExpired();
pendingCheckpoints.remove(checkpointID);
rememberRecentCheckpointId(checkpointID);
triggerQueuedRequests();
}
}
}
};
try {
// re-acquire the coordinator-wide lock
synchronized (lock) {
// since we released the coordinator-wide lock in the meantime, we need to re-check
// that the conditions still hold.
if (shutdown) {
return new CheckpointTriggerResult(CheckpointDeclineReason.COORDINATOR_SHUTDOWN);
} else if (!props.forceCheckpoint()) {
if (triggerRequestQueued) {
LOG.warn("Trying to trigger another checkpoint while one was queued already");
return new CheckpointTriggerResult(CheckpointDeclineReason.ALREADY_QUEUED);
}
if (pendingCheckpoints.size() >= maxConcurrentCheckpointAttempts) {
triggerRequestQueued = true;
if (currentPeriodicTrigger != null) {
currentPeriodicTrigger.cancel(false);
currentPeriodicTrigger = null;
}
return new CheckpointTriggerResult(CheckpointDeclineReason.TOO_MANY_CONCURRENT_CHECKPOINTS);
}
// make sure the minimum interval between checkpoints has passed
final long earliestNext = lastCheckpointCompletionNanos + minPauseBetweenCheckpointsNanos;
final long durationTillNextMillis = (earliestNext - System.nanoTime()) / 1_000_000;
if (durationTillNextMillis > 0) {
if (currentPeriodicTrigger != null) {
currentPeriodicTrigger.cancel(false);
currentPeriodicTrigger = null;
}
// reschedule the periodic trigger so that the next attempt starts only after the minimum pause
currentPeriodicTrigger = timer.scheduleAtFixedRate(new ScheduledTrigger(), durationTillNextMillis, baseInterval, TimeUnit.MILLISECONDS);
return new CheckpointTriggerResult(CheckpointDeclineReason.MINIMUM_TIME_BETWEEN_CHECKPOINTS);
}
}
LOG.info("Triggering checkpoint " + checkpointID + " @ " + timestamp);
pendingCheckpoints.put(checkpointID, checkpoint);
ScheduledFuture<?> cancellerHandle = timer.schedule(canceller, checkpointTimeout, TimeUnit.MILLISECONDS);
if (!checkpoint.setCancellerHandle(cancellerHandle)) {
// checkpoint is already disposed!
cancellerHandle.cancel(false);
}
}
// end of lock scope
CheckpointOptions checkpointOptions;
if (!props.isSavepoint()) {
checkpointOptions = CheckpointOptions.forFullCheckpoint();
} else {
checkpointOptions = CheckpointOptions.forSavepoint(targetDirectory);
}
// send the messages to the tasks that trigger their checkpoint
for (Execution execution : executions) {
execution.triggerCheckpoint(checkpointID, timestamp, checkpointOptions);
}
numUnsuccessfulCheckpointsTriggers.set(0);
return new CheckpointTriggerResult(checkpoint);
} catch (Throwable t) {
// guard the map against concurrent modifications
synchronized (lock) {
pendingCheckpoints.remove(checkpointID);
}
int numUnsuccessful = numUnsuccessfulCheckpointsTriggers.incrementAndGet();
LOG.warn("Failed to trigger checkpoint (" + numUnsuccessful + " consecutive failed attempts so far)", t);
if (!checkpoint.isDiscarded()) {
checkpoint.abortError(new Exception("Failed to trigger checkpoint"));
}
return new CheckpointTriggerResult(CheckpointDeclineReason.EXCEPTION);
}
}
// end trigger lock
}
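The method reschedules a ScheduledTrigger in two places, but the listing does not include that class. A minimal sketch of such a periodic trigger, assuming it simply re-enters triggerCheckpoint with the current wall-clock time and the periodic flag set (the test overload triggerCheckpoint(long, boolean) is visible in the tests below; the error handling here is illustrative):

private final class ScheduledTrigger implements Runnable {

    @Override
    public void run() {
        try {
            // periodic attempts pass isPeriodic = true so that the
            // "scheduling disabled" pre-check above applies to them
            triggerCheckpoint(System.currentTimeMillis(), true);
        } catch (Exception e) {
            LOG.error("Exception while triggering checkpoint", e);
        }
    }
}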
Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.
The class PendingCheckpoint, method acknowledgeTask.
/**
* Acknowledges the task with the given execution attempt id and the given subtask state.
*
* @param executionAttemptId execution attempt id of the acknowledged task
* @param subtaskState subtask state of the acknowledged task
* @param metrics Checkpoint metrics for the stats
* @return TaskAcknowledgeResult of the operation
*/
public TaskAcknowledgeResult acknowledgeTask(ExecutionAttemptID executionAttemptId, SubtaskState subtaskState, CheckpointMetrics metrics) {
synchronized (lock) {
if (discarded) {
return TaskAcknowledgeResult.DISCARDED;
}
final ExecutionVertex vertex = notYetAcknowledgedTasks.remove(executionAttemptId);
if (vertex == null) {
if (acknowledgedTasks.contains(executionAttemptId)) {
return TaskAcknowledgeResult.DUPLICATE;
} else {
return TaskAcknowledgeResult.UNKNOWN;
}
} else {
acknowledgedTasks.add(executionAttemptId);
}
JobVertexID jobVertexID = vertex.getJobvertexId();
int subtaskIndex = vertex.getParallelSubtaskIndex();
long ackTimestamp = System.currentTimeMillis();
long stateSize = 0;
if (null != subtaskState) {
TaskState taskState = taskStates.get(jobVertexID);
if (null == taskState) {
@SuppressWarnings("deprecation") ChainedStateHandle<StreamStateHandle> nonPartitionedState = subtaskState.getLegacyOperatorState();
ChainedStateHandle<OperatorStateHandle> partitionableState = subtaskState.getManagedOperatorState();
//TODO this should go away when we remove chained state, assigning state to operators directly instead
int chainLength;
if (nonPartitionedState != null) {
chainLength = nonPartitionedState.getLength();
} else if (partitionableState != null) {
chainLength = partitionableState.getLength();
} else {
chainLength = 1;
}
taskState = new TaskState(jobVertexID, vertex.getTotalNumberOfParallelSubtasks(), vertex.getMaxParallelism(), chainLength);
taskStates.put(jobVertexID, taskState);
}
taskState.putState(subtaskIndex, subtaskState);
stateSize = subtaskState.getStateSize();
}
++numAcknowledgedTasks;
// publish the checkpoint statistics
// to prevent null-pointers from concurrent modification, copy reference onto stack
final PendingCheckpointStats statsCallback = this.statsCallback;
if (statsCallback != null) {
// Do this in millis because the web frontend works with them
long alignmentDurationMillis = metrics.getAlignmentDurationNanos() / 1_000_000;
SubtaskStateStats subtaskStateStats = new SubtaskStateStats(subtaskIndex, ackTimestamp, stateSize, metrics.getSyncDurationMillis(), metrics.getAsyncDurationMillis(), metrics.getBytesBufferedInAlignment(), alignmentDurationMillis);
statsCallback.reportSubtaskStats(jobVertexID, subtaskStateStats);
}
return TaskAcknowledgeResult.SUCCESS;
}
}
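For context, the coordinator branches on the returned TaskAcknowledgeResult when an acknowledge message arrives. A condensed, illustrative sketch of that caller (simplified from CheckpointCoordinator.receiveAcknowledgeMessage of the same era; the discardState helper stands in for the actual state cleanup):

switch (checkpoint.acknowledgeTask(message.getTaskExecutionId(), message.getSubtaskState(), message.getCheckpointMetrics())) {
    case SUCCESS:
        // once every task in notYetAcknowledgedTasks has reported, complete the checkpoint
        if (checkpoint.isFullyAcknowledged()) {
            completePendingCheckpoint(checkpoint);
        }
        break;
    case DUPLICATE:
        LOG.debug("Received a duplicate acknowledge message for checkpoint {}", message.getCheckpointId());
        break;
    case UNKNOWN:
    case DISCARDED:
        // state attached to an unknown or already-discarded checkpoint must not leak
        discardState(message.getSubtaskState());
        break;
}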
Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.
The class CheckpointCoordinatorExternalizedCheckpointsTest, method testTriggerAndConfirmSimpleExternalizedCheckpoint.
/**
* Triggers multiple externalized checkpoints and verifies that the metadata
* files have been created.
*/
@Test
public void testTriggerAndConfirmSimpleExternalizedCheckpoint() throws Exception {
final JobID jid = new JobID();
final ExternalizedCheckpointSettings externalizedCheckpointSettings = ExternalizedCheckpointSettings.externalizeCheckpoints(false);
final File checkpointDir = tmp.newFolder();
// create some mock Execution vertices that receive the checkpoint trigger messages
final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
ExecutionVertex vertex1 = CheckpointCoordinatorTest.mockExecutionVertex(attemptID1);
ExecutionVertex vertex2 = CheckpointCoordinatorTest.mockExecutionVertex(attemptID2);
Map<JobVertexID, ExecutionJobVertex> jobVertices = new HashMap<>();
jobVertices.put(vertex1.getJobvertexId(), vertex1.getJobVertex());
jobVertices.put(vertex2.getJobvertexId(), vertex2.getJobVertex());
// set up the coordinator and validate the initial state
CheckpointCoordinator coord = new CheckpointCoordinator(jid, 600000, 600000, 0, Integer.MAX_VALUE, externalizedCheckpointSettings, new ExecutionVertex[] { vertex1, vertex2 }, new ExecutionVertex[] { vertex1, vertex2 }, new ExecutionVertex[] { vertex1, vertex2 }, new StandaloneCheckpointIDCounter(), new StandaloneCompletedCheckpointStore(1), checkpointDir.getAbsolutePath(), Executors.directExecutor());
assertEquals(0, coord.getNumberOfPendingCheckpoints());
assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
// ---------------
// trigger checkpoint 1
// ---------------
{
final long timestamp1 = System.currentTimeMillis();
coord.triggerCheckpoint(timestamp1, false);
long checkpointId1 = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId1));
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId1));
CompletedCheckpoint latest = coord.getCheckpointStore().getLatestCheckpoint();
verifyExternalizedCheckpoint(latest, jid, checkpointId1, timestamp1);
verifyExternalizedCheckpointRestore(latest, jobVertices, vertex1, vertex2);
}
// ---------------
// trigger checkpoint 2
// ---------------
{
final long timestamp2 = System.currentTimeMillis() + 7;
coord.triggerCheckpoint(timestamp2, false);
long checkpointId2 = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId2));
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId2));
CompletedCheckpoint latest = coord.getCheckpointStore().getLatestCheckpoint();
verifyExternalizedCheckpoint(latest, jid, checkpointId2, timestamp2);
verifyExternalizedCheckpointRestore(latest, jobVertices, vertex1, vertex2);
}
// ---------------
// trigger checkpoint 3
// ---------------
{
final long timestamp3 = System.currentTimeMillis() + 146;
coord.triggerCheckpoint(timestamp3, false);
long checkpointId3 = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId3));
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId3));
CompletedCheckpoint latest = coord.getCheckpointStore().getLatestCheckpoint();
verifyExternalizedCheckpoint(latest, jid, checkpointId3, timestamp3);
verifyExternalizedCheckpointRestore(latest, jobVertices, vertex1, vertex2);
}
coord.shutdown(JobStatus.FINISHED);
}
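The verifyExternalizedCheckpoint helper is not shown in this listing. A hedged sketch of what such a check could look like, using only CompletedCheckpoint accessors that exist in this version (the real helper's exact assertions may differ):

private static void verifyExternalizedCheckpoint(CompletedCheckpoint checkpoint, JobID jid, long checkpointId, long timestamp) {
    assertEquals(jid, checkpoint.getJobId());
    assertEquals(checkpointId, checkpoint.getCheckpointID());
    assertEquals(timestamp, checkpoint.getTimestamp());
    // an externalized checkpoint must carry a pointer to its metadata file on disk
    assertNotNull(checkpoint.getExternalPointer());
    assertTrue(new File(checkpoint.getExternalPointer()).exists());
}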
Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.
The class CheckpointCoordinatorTest, method testRestoreLatestCheckpointedStateWithChangingParallelism.
/**
 * Tests checkpoint restoration when the parallelism of a job vertex with
 * partitioned state changes between checkpoint and restore.
 */
private void testRestoreLatestCheckpointedStateWithChangingParallelism(boolean scaleOut) throws Exception {
final JobID jid = new JobID();
final long timestamp = System.currentTimeMillis();
final JobVertexID jobVertexID1 = new JobVertexID();
final JobVertexID jobVertexID2 = new JobVertexID();
int parallelism1 = 3;
int parallelism2 = scaleOut ? 2 : 13;
int maxParallelism1 = 42;
int maxParallelism2 = 13;
int newParallelism2 = scaleOut ? 13 : 2;
final ExecutionJobVertex jobVertex1 = mockExecutionJobVertex(jobVertexID1, parallelism1, maxParallelism1);
final ExecutionJobVertex jobVertex2 = mockExecutionJobVertex(jobVertexID2, parallelism2, maxParallelism2);
List<ExecutionVertex> allExecutionVertices = new ArrayList<>(parallelism1 + parallelism2);
allExecutionVertices.addAll(Arrays.asList(jobVertex1.getTaskVertices()));
allExecutionVertices.addAll(Arrays.asList(jobVertex2.getTaskVertices()));
ExecutionVertex[] arrayExecutionVertices = allExecutionVertices.toArray(new ExecutionVertex[allExecutionVertices.size()]);
// set up the coordinator and validate the initial state
CheckpointCoordinator coord = new CheckpointCoordinator(jid, 600000, 600000, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), arrayExecutionVertices, arrayExecutionVertices, arrayExecutionVertices, new StandaloneCheckpointIDCounter(), new StandaloneCompletedCheckpointStore(1), null, Executors.directExecutor());
// trigger the checkpoint
coord.triggerCheckpoint(timestamp, false);
assertEquals(1, coord.getPendingCheckpoints().size());
long checkpointId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());
CheckpointMetaData checkpointMetaData = new CheckpointMetaData(checkpointId, 0L);
List<KeyGroupRange> keyGroupPartitions1 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism1, parallelism1);
List<KeyGroupRange> keyGroupPartitions2 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, parallelism2);
//vertex 1
for (int index = 0; index < jobVertex1.getParallelism(); index++) {
ChainedStateHandle<StreamStateHandle> valueSizeTuple = generateStateForVertex(jobVertexID1, index);
ChainedStateHandle<OperatorStateHandle> opStateBackend = generateChainedPartitionableStateHandle(jobVertexID1, index, 2, 8, false);
KeyGroupsStateHandle keyedStateBackend = generateKeyGroupState(jobVertexID1, keyGroupPartitions1.get(index), false);
KeyGroupsStateHandle keyedStateRaw = generateKeyGroupState(jobVertexID1, keyGroupPartitions1.get(index), true);
SubtaskState checkpointStateHandles = new SubtaskState(valueSizeTuple, opStateBackend, null, keyedStateBackend, keyedStateRaw);
AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(jid, jobVertex1.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), checkpointStateHandles);
coord.receiveAcknowledgeMessage(acknowledgeCheckpoint);
}
//vertex 2
final List<ChainedStateHandle<OperatorStateHandle>> expectedOpStatesBackend = new ArrayList<>(jobVertex2.getParallelism());
final List<ChainedStateHandle<OperatorStateHandle>> expectedOpStatesRaw = new ArrayList<>(jobVertex2.getParallelism());
for (int index = 0; index < jobVertex2.getParallelism(); index++) {
KeyGroupsStateHandle keyedStateBackend = generateKeyGroupState(jobVertexID2, keyGroupPartitions2.get(index), false);
KeyGroupsStateHandle keyedStateRaw = generateKeyGroupState(jobVertexID2, keyGroupPartitions2.get(index), true);
ChainedStateHandle<OperatorStateHandle> opStateBackend = generateChainedPartitionableStateHandle(jobVertexID2, index, 2, 8, false);
ChainedStateHandle<OperatorStateHandle> opStateRaw = generateChainedPartitionableStateHandle(jobVertexID2, index, 2, 8, true);
expectedOpStatesBackend.add(opStateBackend);
expectedOpStatesRaw.add(opStateRaw);
SubtaskState checkpointStateHandles = new SubtaskState(new ChainedStateHandle<>(Collections.<StreamStateHandle>singletonList(null)), opStateBackend, opStateRaw, keyedStateBackend, keyedStateRaw);
AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(jid, jobVertex2.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), checkpointStateHandles);
coord.receiveAcknowledgeMessage(acknowledgeCheckpoint);
}
List<CompletedCheckpoint> completedCheckpoints = coord.getSuccessfulCheckpoints();
assertEquals(1, completedCheckpoints.size());
Map<JobVertexID, ExecutionJobVertex> tasks = new HashMap<>();
List<KeyGroupRange> newKeyGroupPartitions2 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, newParallelism2);
final ExecutionJobVertex newJobVertex1 = mockExecutionJobVertex(jobVertexID1, parallelism1, maxParallelism1);
// rescale vertex 2
final ExecutionJobVertex newJobVertex2 = mockExecutionJobVertex(jobVertexID2, newParallelism2, maxParallelism2);
tasks.put(jobVertexID1, newJobVertex1);
tasks.put(jobVertexID2, newJobVertex2);
coord.restoreLatestCheckpointedState(tasks, true, false);
// verify the restored state
verifyStateRestore(jobVertexID1, newJobVertex1, keyGroupPartitions1);
List<List<Collection<OperatorStateHandle>>> actualOpStatesBackend = new ArrayList<>(newJobVertex2.getParallelism());
List<List<Collection<OperatorStateHandle>>> actualOpStatesRaw = new ArrayList<>(newJobVertex2.getParallelism());
for (int i = 0; i < newJobVertex2.getParallelism(); i++) {
KeyGroupsStateHandle originalKeyedStateBackend = generateKeyGroupState(jobVertexID2, newKeyGroupPartitions2.get(i), false);
KeyGroupsStateHandle originalKeyedStateRaw = generateKeyGroupState(jobVertexID2, newKeyGroupPartitions2.get(i), true);
TaskStateHandles taskStateHandles = newJobVertex2.getTaskVertices()[i].getCurrentExecutionAttempt().getTaskStateHandles();
ChainedStateHandle<StreamStateHandle> operatorState = taskStateHandles.getLegacyOperatorState();
List<Collection<OperatorStateHandle>> opStateBackend = taskStateHandles.getManagedOperatorState();
List<Collection<OperatorStateHandle>> opStateRaw = taskStateHandles.getRawOperatorState();
Collection<KeyGroupsStateHandle> keyGroupStateBackend = taskStateHandles.getManagedKeyedState();
Collection<KeyGroupsStateHandle> keyGroupStateRaw = taskStateHandles.getRawKeyedState();
actualOpStatesBackend.add(opStateBackend);
actualOpStatesRaw.add(opStateRaw);
assertNull(operatorState);
compareKeyedState(Collections.singletonList(originalKeyedStateBackend), keyGroupStateBackend);
compareKeyedState(Collections.singletonList(originalKeyedStateRaw), keyGroupStateRaw);
}
comparePartitionableState(expectedOpStatesBackend, actualOpStatesBackend);
comparePartitionableState(expectedOpStatesRaw, actualOpStatesRaw);
}
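The key-group partitions used throughout this test come from StateAssignmentOperation.createKeyGroupPartitions, which splits the maxParallelism key groups into contiguous ranges, one per subtask. A standalone sketch of that assignment arithmetic (the range formula follows Flink's KeyGroupRangeAssignment; the helper itself is illustrative):

// compute the inclusive key-group range [start, end] owned by one subtask
// when maxParallelism key groups are split across `parallelism` subtasks
static int[] keyGroupRange(int maxParallelism, int parallelism, int operatorIndex) {
    int start = (operatorIndex * maxParallelism + parallelism - 1) / parallelism;
    int end = ((operatorIndex + 1) * maxParallelism - 1) / parallelism;
    return new int[] { start, end };
}

With maxParallelism2 = 13 and newParallelism2 = 2 as in the scale-in case above, this yields the ranges [0, 6] and [7, 12], which is what newKeyGroupPartitions2 contains.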
Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.
The class CheckpointCoordinatorTest, method testSavepointsAreNotSubsumed.
/**
 * Triggers a savepoint and two checkpoints. The second checkpoint completes
 * and subsumes the first checkpoint, but not the savepoint. Then we trigger
 * another checkpoint and a second savepoint. The second savepoint completes
 * and subsumes the latest checkpoint, but still not the first savepoint.
 */
@Test
public void testSavepointsAreNotSubsumed() throws Exception {
final JobID jid = new JobID();
final long timestamp = System.currentTimeMillis();
// create some mock Execution vertices that receive the checkpoint trigger messages
final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);
ExecutionVertex vertex2 = mockExecutionVertex(attemptID2);
StandaloneCheckpointIDCounter counter = new StandaloneCheckpointIDCounter();
// set up the coordinator and validate the initial state
CheckpointCoordinator coord = new CheckpointCoordinator(jid, 600000, 600000, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), new ExecutionVertex[] { vertex1, vertex2 }, new ExecutionVertex[] { vertex1, vertex2 }, new ExecutionVertex[] { vertex1, vertex2 }, counter, new StandaloneCompletedCheckpointStore(10), null, Executors.directExecutor());
String savepointDir = tmpFolder.newFolder().getAbsolutePath();
// Trigger savepoint and checkpoint
Future<CompletedCheckpoint> savepointFuture1 = coord.triggerSavepoint(timestamp, savepointDir);
long savepointId1 = counter.getLast();
CheckpointMetaData checkpointMetaDataS1 = new CheckpointMetaData(savepointId1, 0L);
assertEquals(1, coord.getNumberOfPendingCheckpoints());
assertTrue(coord.triggerCheckpoint(timestamp + 1, false));
assertEquals(2, coord.getNumberOfPendingCheckpoints());
assertTrue(coord.triggerCheckpoint(timestamp + 2, false));
long checkpointId2 = counter.getLast();
assertEquals(3, coord.getNumberOfPendingCheckpoints());
CheckpointMetaData checkpointMetaData2 = new CheckpointMetaData(checkpointId2, 0L);
// 2nd checkpoint should subsume the 1st checkpoint, but not the savepoint
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId2));
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId2));
assertEquals(1, coord.getNumberOfPendingCheckpoints());
assertEquals(1, coord.getNumberOfRetainedSuccessfulCheckpoints());
assertFalse(coord.getPendingCheckpoints().get(savepointId1).isDiscarded());
assertFalse(savepointFuture1.isDone());
assertTrue(coord.triggerCheckpoint(timestamp + 3, false));
assertEquals(2, coord.getNumberOfPendingCheckpoints());
Future<CompletedCheckpoint> savepointFuture2 = coord.triggerSavepoint(timestamp + 4, savepointDir);
long savepointId2 = counter.getLast();
CheckpointMetaData checkpointMetaDataS2 = new CheckpointMetaData(savepointId2, 0L);
assertEquals(3, coord.getNumberOfPendingCheckpoints());
// 2nd savepoint should subsume the last checkpoint, but not the 1st savepoint
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, savepointId2));
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, savepointId2));
assertEquals(1, coord.getNumberOfPendingCheckpoints());
assertEquals(2, coord.getNumberOfRetainedSuccessfulCheckpoints());
assertFalse(coord.getPendingCheckpoints().get(savepointId1).isDiscarded());
assertFalse(savepointFuture1.isDone());
assertTrue(savepointFuture2.isDone());
// Ack first savepoint
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, savepointId1));
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, savepointId1));
assertEquals(0, coord.getNumberOfPendingCheckpoints());
assertEquals(3, coord.getNumberOfRetainedSuccessfulCheckpoints());
assertTrue(savepointFuture1.isDone());
}
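The reason the savepoints survive subsumption lies in the checkpoint properties: when the completed-checkpoint store evicts an older checkpoint, the actual discard is gated on the checkpoint's properties, and savepoint properties opt out. A hedged sketch of that gate on CompletedCheckpoint (discardOnSubsumed() exists on CheckpointProperties in this version; treat the body as condensed):

// invoked by the CompletedCheckpointStore when a newer checkpoint subsumes this one;
// for savepoints, props.discardOnSubsumed() is false, so their state is retained
public boolean subsume() throws Exception {
    if (props.discardOnSubsumed()) {
        discard();
        return true;
    }
    return false;
}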