use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.
the class CheckpointCoordinatorTest method testStateCleanupForLateOrUnknownMessages.
/**
 * Tests that late acknowledge checkpoint messages are properly cleaned up. Furthermore, it tests
 * that unknown checkpoint messages for the same job are cleaned up as well. In contrast,
 * checkpointing messages from other jobs should not be touched. A late acknowledge
 * message is an acknowledge message which arrives after the checkpoint has been declined.
 *
 * @throws Exception if triggering the checkpoint or handling one of the messages fails unexpectedly
 */
@Test
public void testStateCleanupForLateOrUnknownMessages() throws Exception {
    final JobID jobId = new JobID();
    final ExecutionAttemptID triggerAttemptId = new ExecutionAttemptID();
    final ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptId);
    final ExecutionAttemptID ackAttemptId1 = new ExecutionAttemptID();
    final ExecutionVertex ackVertex1 = mockExecutionVertex(ackAttemptId1);
    final ExecutionAttemptID ackAttemptId2 = new ExecutionAttemptID();
    final ExecutionVertex ackVertex2 = mockExecutionVertex(ackAttemptId2);
    final long timestamp = 1L;

    // the trigger vertex is also one of the three vertices that have to acknowledge
    CheckpointCoordinator coord = new CheckpointCoordinator(
            jobId,
            20000L,
            20000L,
            0L,
            1,
            ExternalizedCheckpointSettings.none(),
            new ExecutionVertex[] { triggerVertex },
            new ExecutionVertex[] { triggerVertex, ackVertex1, ackVertex2 },
            new ExecutionVertex[0],
            new StandaloneCheckpointIDCounter(),
            new StandaloneCompletedCheckpointStore(1),
            null,
            Executors.directExecutor());

    assertTrue(coord.triggerCheckpoint(timestamp, false));
    assertEquals(1, coord.getNumberOfPendingCheckpoints());

    PendingCheckpoint pendingCheckpoint = coord.getPendingCheckpoints().values().iterator().next();
    long checkpointId = pendingCheckpoint.getCheckpointId();

    SubtaskState triggerSubtaskState = mock(SubtaskState.class);
    // acknowledge the first trigger vertex
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, triggerAttemptId, checkpointId, new CheckpointMetrics(), triggerSubtaskState));

    SubtaskState unknownSubtaskState = mock(SubtaskState.class);
    // receive an acknowledge message for an unknown vertex
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), unknownSubtaskState));
    // we should discard acknowledge messages from an unknown vertex belonging to our job
    verify(unknownSubtaskState, times(1)).discardState();

    SubtaskState differentJobSubtaskState = mock(SubtaskState.class);
    // receive an acknowledge message from an unknown job
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(new JobID(), new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), differentJobSubtaskState));
    // we should not interfere with different jobs
    verify(differentJobSubtaskState, never()).discardState();

    // duplicate acknowledge message for the trigger vertex
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, triggerAttemptId, checkpointId, new CheckpointMetrics(), triggerSubtaskState));
    // duplicate acknowledge messages for a known vertex should not trigger discarding the state
    verify(triggerSubtaskState, never()).discardState();

    // let the checkpoint fail at the first ack vertex
    coord.receiveDeclineMessage(new DeclineCheckpoint(jobId, ackAttemptId1, checkpointId));
    assertTrue(pendingCheckpoint.isDiscarded());
    // check that we've cleaned up the already acknowledged state
    verify(triggerSubtaskState, times(1)).discardState();

    SubtaskState ackSubtaskState = mock(SubtaskState.class);
    // late acknowledge message from the second ack vertex
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, ackAttemptId2, checkpointId, new CheckpointMetrics(), ackSubtaskState));
    // check that we also cleaned up this state
    verify(ackSubtaskState, times(1)).discardState();

    // receive an acknowledge message from an unknown job, now that the checkpoint is gone
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(new JobID(), new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), differentJobSubtaskState));
    // we should not interfere with different jobs
    verify(differentJobSubtaskState, never()).discardState();

    SubtaskState unknownSubtaskState2 = mock(SubtaskState.class);
    // receive an acknowledge message for an unknown vertex, now that the checkpoint is gone
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), unknownSubtaskState2));
    // we should discard acknowledge messages from an unknown vertex belonging to our job
    verify(unknownSubtaskState2, times(1)).discardState();
}
use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.
the class CheckpointCoordinatorTest method testSuccessfulCheckpointSubsumesUnsuccessful.
/**
 * Triggers two overlapping checkpoints, acknowledges the second one completely while the first
 * is still missing an acknowledgement, and checks that the second checkpoint completes while the
 * first pending checkpoint is discarded.
 */
@Test
public void testSuccessfulCheckpointSubsumesUnsuccessful() {
    try {
        final JobID jid = new JobID();
        final long timestamp1 = System.currentTimeMillis();
        final long timestamp2 = timestamp1 + 1552;

        // create some mock execution vertices
        final ExecutionAttemptID triggerAttemptID1 = new ExecutionAttemptID();
        final ExecutionAttemptID triggerAttemptID2 = new ExecutionAttemptID();
        final ExecutionAttemptID ackAttemptID1 = new ExecutionAttemptID();
        final ExecutionAttemptID ackAttemptID2 = new ExecutionAttemptID();
        final ExecutionAttemptID ackAttemptID3 = new ExecutionAttemptID();
        final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();

        ExecutionVertex triggerVertex1 = mockExecutionVertex(triggerAttemptID1);
        ExecutionVertex triggerVertex2 = mockExecutionVertex(triggerAttemptID2);
        ExecutionVertex ackVertex1 = mockExecutionVertex(ackAttemptID1);
        ExecutionVertex ackVertex2 = mockExecutionVertex(ackAttemptID2);
        ExecutionVertex ackVertex3 = mockExecutionVertex(ackAttemptID3);
        ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

        // set up the coordinator and validate the initial state
        CheckpointCoordinator coord = new CheckpointCoordinator(
                jid,
                600000,
                600000,
                0,
                Integer.MAX_VALUE,
                ExternalizedCheckpointSettings.none(),
                new ExecutionVertex[] { triggerVertex1, triggerVertex2 },
                new ExecutionVertex[] { ackVertex1, ackVertex2, ackVertex3 },
                new ExecutionVertex[] { commitVertex },
                new StandaloneCheckpointIDCounter(),
                new StandaloneCompletedCheckpointStore(10),
                null,
                Executors.directExecutor());

        assertEquals(0, coord.getNumberOfPendingCheckpoints());
        assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

        // trigger the first checkpoint. this should succeed
        assertTrue(coord.triggerCheckpoint(timestamp1, false));
        assertEquals(1, coord.getNumberOfPendingCheckpoints());
        assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

        PendingCheckpoint pending1 = coord.getPendingCheckpoints().values().iterator().next();
        long checkpointId1 = pending1.getCheckpointId();

        // trigger messages should have been sent
        verify(triggerVertex1.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpointId1), eq(timestamp1), any(CheckpointOptions.class));
        verify(triggerVertex2.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpointId1), eq(timestamp1), any(CheckpointOptions.class));

        // acknowledge one of the three tasks
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID2, checkpointId1));

        // trigger the second checkpoint while the first one is still pending. this should succeed
        assertTrue(coord.triggerCheckpoint(timestamp2, false));
        assertEquals(2, coord.getNumberOfPendingCheckpoints());
        assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

        // of the two pending checkpoints, the second one is whichever is not pending1
        PendingCheckpoint pending2;
        {
            Iterator<PendingCheckpoint> all = coord.getPendingCheckpoints().values().iterator();
            PendingCheckpoint cc1 = all.next();
            PendingCheckpoint cc2 = all.next();
            pending2 = pending1 == cc1 ? cc2 : cc1;
        }
        long checkpointId2 = pending2.getCheckpointId();

        // trigger messages should have been sent
        verify(triggerVertex1.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpointId2), eq(timestamp2), any(CheckpointOptions.class));
        verify(triggerVertex2.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpointId2), eq(timestamp2), any(CheckpointOptions.class));

        // we acknowledge one more task from the first checkpoint and the second
        // checkpoint completely. The second checkpoint should then subsume the first checkpoint
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID3, checkpointId2));
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID1, checkpointId2));
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID1, checkpointId1));
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID2, checkpointId2));

        // now, the second checkpoint should be confirmed, and the first discarded
        // actually both pending checkpoints are discarded, and the second has been transformed
        // into a successful checkpoint
        assertTrue(pending1.isDiscarded());
        assertTrue(pending2.isDiscarded());
        assertEquals(0, coord.getNumberOfPendingCheckpoints());
        assertEquals(1, coord.getNumberOfRetainedSuccessfulCheckpoints());

        // validate the committed checkpoints
        List<CompletedCheckpoint> scs = coord.getSuccessfulCheckpoints();
        CompletedCheckpoint success = scs.get(0);
        assertEquals(checkpointId2, success.getCheckpointID());
        assertEquals(timestamp2, success.getTimestamp());
        assertEquals(jid, success.getJobId());
        assertTrue(success.getTaskStates().isEmpty());

        // the first confirm message should be out
        verify(commitVertex.getCurrentExecutionAttempt(), times(1)).notifyCheckpointComplete(eq(checkpointId2), eq(timestamp2));

        // send the last remaining ack for the first checkpoint. This should not do anything
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID3, checkpointId1));

        coord.shutdown(JobStatus.FINISHED);
    } catch (Exception e) {
        // still reported as a test failure, but with the original exception attached as the
        // cause instead of being reduced to its (possibly null) message
        throw new AssertionError("Unexpected exception: " + e, e);
    }
}
use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.
the class CheckpointCoordinatorTest method testCheckpointAbortsIfAckTasksAreNotExecuted.
/**
 * Checks that triggering a checkpoint is rejected, and leaves no pending or completed
 * checkpoint behind, when the vertices that have to acknowledge are bare mocks.
 */
@Test
public void testCheckpointAbortsIfAckTasksAreNotExecuted() {
    try {
        final JobID jid = new JobID();
        final long timestamp = System.currentTimeMillis();

        // create some mock execution vertices that receive the checkpoint trigger messages
        final ExecutionAttemptID triggerAttemptID1 = new ExecutionAttemptID();
        final ExecutionAttemptID triggerAttemptID2 = new ExecutionAttemptID();
        ExecutionVertex triggerVertex1 = mockExecutionVertex(triggerAttemptID1);
        ExecutionVertex triggerVertex2 = mockExecutionVertex(triggerAttemptID2);

        // create some mock execution vertices that need to ack the checkpoint; these are plain
        // Mockito mocks rather than mockExecutionVertex(..) results, so presumably they report
        // no current execution attempt (confirm against the mockExecutionVertex helper)
        ExecutionVertex ackVertex1 = mock(ExecutionVertex.class);
        ExecutionVertex ackVertex2 = mock(ExecutionVertex.class);

        // set up the coordinator and validate the initial state
        CheckpointCoordinator coord = new CheckpointCoordinator(
                jid,
                600000,
                600000,
                0,
                Integer.MAX_VALUE,
                ExternalizedCheckpointSettings.none(),
                new ExecutionVertex[] { triggerVertex1, triggerVertex2 },
                new ExecutionVertex[] { ackVertex1, ackVertex2 },
                new ExecutionVertex[] {},
                new StandaloneCheckpointIDCounter(),
                new StandaloneCompletedCheckpointStore(1),
                null,
                Executors.directExecutor());

        // nothing should be happening
        assertEquals(0, coord.getNumberOfPendingCheckpoints());
        assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

        // trigger the first checkpoint. this should not succeed
        assertFalse(coord.triggerCheckpoint(timestamp, false));

        // still, nothing should be happening
        assertEquals(0, coord.getNumberOfPendingCheckpoints());
        assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

        coord.shutdown(JobStatus.FINISHED);
    } catch (Exception e) {
        // still reported as a test failure, but with the original exception attached as the
        // cause instead of being reduced to its (possibly null) message
        throw new AssertionError("Unexpected exception: " + e, e);
    }
}
use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.
the class CheckpointCoordinatorTest method testRestoreLatestCheckpointFailureWhenParallelismChanges.
/**
 * Tests that the checkpoint restoration fails if the parallelism of a job vertex with
 * non-partitioned state has changed.
 *
 * @throws Exception the {@link IllegalStateException} this test expects, or any unexpected
 *         failure while setting up and completing the checkpoint
 */
@Test(expected = IllegalStateException.class)
public void testRestoreLatestCheckpointFailureWhenParallelismChanges() throws Exception {
    final JobID jid = new JobID();
    final long timestamp = System.currentTimeMillis();

    final JobVertexID jobVertexID1 = new JobVertexID();
    final JobVertexID jobVertexID2 = new JobVertexID();
    int parallelism1 = 3;
    int parallelism2 = 2;
    int maxParallelism1 = 42;
    int maxParallelism2 = 13;

    final ExecutionJobVertex jobVertex1 = mockExecutionJobVertex(jobVertexID1, parallelism1, maxParallelism1);
    final ExecutionJobVertex jobVertex2 = mockExecutionJobVertex(jobVertexID2, parallelism2, maxParallelism2);

    List<ExecutionVertex> allExecutionVertices = new ArrayList<>(parallelism1 + parallelism2);
    allExecutionVertices.addAll(Arrays.asList(jobVertex1.getTaskVertices()));
    allExecutionVertices.addAll(Arrays.asList(jobVertex2.getTaskVertices()));
    ExecutionVertex[] arrayExecutionVertices = allExecutionVertices.toArray(new ExecutionVertex[allExecutionVertices.size()]);

    // set up the coordinator; every vertex triggers, acknowledges and gets the commit
    CheckpointCoordinator coord = new CheckpointCoordinator(
            jid,
            600000,
            600000,
            0,
            Integer.MAX_VALUE,
            ExternalizedCheckpointSettings.none(),
            arrayExecutionVertices,
            arrayExecutionVertices,
            arrayExecutionVertices,
            new StandaloneCheckpointIDCounter(),
            new StandaloneCompletedCheckpointStore(1),
            null,
            Executors.directExecutor());

    // trigger the checkpoint
    coord.triggerCheckpoint(timestamp, false);
    // assertEquals reports the actual number of pending checkpoints if this fails
    assertEquals(1, coord.getPendingCheckpoints().keySet().size());
    long checkpointId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());

    List<KeyGroupRange> keyGroupPartitions1 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism1, parallelism1);
    List<KeyGroupRange> keyGroupPartitions2 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, parallelism2);

    // acknowledge the checkpoint for every subtask of the first job vertex
    for (int index = 0; index < jobVertex1.getParallelism(); index++) {
        ChainedStateHandle<StreamStateHandle> valueSizeTuple = generateStateForVertex(jobVertexID1, index);
        KeyGroupsStateHandle keyGroupState = generateKeyGroupState(jobVertexID1, keyGroupPartitions1.get(index), false);
        SubtaskState checkpointStateHandles = new SubtaskState(valueSizeTuple, null, null, keyGroupState, null);
        AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(jid, jobVertex1.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), checkpointStateHandles);
        coord.receiveAcknowledgeMessage(acknowledgeCheckpoint);
    }

    // acknowledge the checkpoint for every subtask of the second job vertex
    for (int index = 0; index < jobVertex2.getParallelism(); index++) {
        ChainedStateHandle<StreamStateHandle> state = generateStateForVertex(jobVertexID2, index);
        KeyGroupsStateHandle keyGroupState = generateKeyGroupState(jobVertexID2, keyGroupPartitions2.get(index), false);
        SubtaskState checkpointStateHandles = new SubtaskState(state, null, null, keyGroupState, null);
        AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(jid, jobVertex2.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), checkpointStateHandles);
        coord.receiveAcknowledgeMessage(acknowledgeCheckpoint);
    }

    List<CompletedCheckpoint> completedCheckpoints = coord.getSuccessfulCheckpoints();
    assertEquals(1, completedCheckpoints.size());

    // rebuild both job vertices with a different parallelism (3 -> 4 and 2 -> 3)
    Map<JobVertexID, ExecutionJobVertex> tasks = new HashMap<>();
    int newParallelism1 = 4;
    int newParallelism2 = 3;
    final ExecutionJobVertex newJobVertex1 = mockExecutionJobVertex(jobVertexID1, newParallelism1, maxParallelism1);
    final ExecutionJobVertex newJobVertex2 = mockExecutionJobVertex(jobVertexID2, newParallelism2, maxParallelism2);
    tasks.put(jobVertexID1, newJobVertex1);
    tasks.put(jobVertexID2, newJobVertex2);

    coord.restoreLatestCheckpointedState(tasks, true, false);

    // only reached if no exception was thrown; the resulting AssertionError is not the
    // expected IllegalStateException, so the test fails
    fail("The restoration should have failed because the parallelism of an vertex with " + "non-partitioned state changed.");
}
use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.
the class CheckpointCoordinatorTest method testConcurrentSavepoints.
/**
 * Verifies that several savepoints can be in flight at the same time: all of them stay
 * incomplete after being triggered and all of them are done once acknowledged.
 */
@Test
public void testConcurrentSavepoints() throws Exception {
    final int savepointCount = 5;

    JobID jid = new JobID();
    final ExecutionAttemptID attemptId = new ExecutionAttemptID();
    ExecutionVertex vertex = mockExecutionVertex(attemptId);
    StandaloneCheckpointIDCounter idCounter = new StandaloneCheckpointIDCounter();

    CheckpointCoordinator coordinator = new CheckpointCoordinator(
            jid,
            100000,
            200000,
            0L,
            // max one checkpoint at a time => should not affect savepoints
            1,
            ExternalizedCheckpointSettings.none(),
            new ExecutionVertex[] { vertex },
            new ExecutionVertex[] { vertex },
            new ExecutionVertex[] { vertex },
            idCounter,
            new StandaloneCompletedCheckpointStore(2),
            null,
            Executors.directExecutor());

    List<Future<CompletedCheckpoint>> pendingSavepoints = new ArrayList<>();
    String targetDirectory = tmpFolder.newFolder().getAbsolutePath();

    // kick off every savepoint, using the loop counter as its timestamp
    int triggered = 0;
    while (triggered < savepointCount) {
        pendingSavepoints.add(coordinator.triggerSavepoint(triggered, targetDirectory));
        triggered++;
    }

    // none of the savepoints may have finished before any acknowledgement arrived
    for (int i = 0; i < pendingSavepoints.size(); i++) {
        assertFalse(pendingSavepoints.get(i).isDone());
    }

    // acknowledge each savepoint, walking downwards from the counter's last value
    final long lastCheckpointId = idCounter.getLast();
    for (int offset = 0; offset < savepointCount; offset++) {
        coordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptId, lastCheckpointId - offset));
    }

    // with all acknowledgements in, every savepoint future has to be done
    for (int i = 0; i < pendingSavepoints.size(); i++) {
        assertTrue(pendingSavepoints.get(i).isDone());
    }
}
Aggregations