use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorTest method testRestoreLatestCheckpointedState.
/**
* Tests that the checkpointed partitioned and non-partitioned state is assigned properly to
* the {@link Execution} upon recovery.
*
* @throws Exception
*/
@Test
public void testRestoreLatestCheckpointedState() throws Exception {
final JobID jid = new JobID();
final long timestamp = System.currentTimeMillis();
final JobVertexID jobVertexID1 = new JobVertexID();
final JobVertexID jobVertexID2 = new JobVertexID();
int parallelism1 = 3;
int parallelism2 = 2;
int maxParallelism1 = 42;
int maxParallelism2 = 13;
final ExecutionJobVertex jobVertex1 = mockExecutionJobVertex(jobVertexID1, parallelism1, maxParallelism1);
final ExecutionJobVertex jobVertex2 = mockExecutionJobVertex(jobVertexID2, parallelism2, maxParallelism2);
List<ExecutionVertex> allExecutionVertices = new ArrayList<>(parallelism1 + parallelism2);
allExecutionVertices.addAll(Arrays.asList(jobVertex1.getTaskVertices()));
allExecutionVertices.addAll(Arrays.asList(jobVertex2.getTaskVertices()));
ExecutionVertex[] arrayExecutionVertices = allExecutionVertices.toArray(new ExecutionVertex[allExecutionVertices.size()]);
// set up the coordinator and validate the initial state
CheckpointCoordinator coord = new CheckpointCoordinator(jid, 600000, 600000, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), arrayExecutionVertices, arrayExecutionVertices, arrayExecutionVertices, new StandaloneCheckpointIDCounter(), new StandaloneCompletedCheckpointStore(1), null, Executors.directExecutor());
// trigger the checkpoint
coord.triggerCheckpoint(timestamp, false);
assertTrue(coord.getPendingCheckpoints().keySet().size() == 1);
long checkpointId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());
CheckpointMetaData checkpointMetaData = new CheckpointMetaData(checkpointId, 0L);
List<KeyGroupRange> keyGroupPartitions1 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism1, parallelism1);
List<KeyGroupRange> keyGroupPartitions2 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, parallelism2);
for (int index = 0; index < jobVertex1.getParallelism(); index++) {
ChainedStateHandle<StreamStateHandle> nonPartitionedState = generateStateForVertex(jobVertexID1, index);
ChainedStateHandle<OperatorStateHandle> partitionableState = generateChainedPartitionableStateHandle(jobVertexID1, index, 2, 8, false);
KeyGroupsStateHandle partitionedKeyGroupState = generateKeyGroupState(jobVertexID1, keyGroupPartitions1.get(index), false);
SubtaskState checkpointStateHandles = new SubtaskState(nonPartitionedState, partitionableState, null, partitionedKeyGroupState, null);
AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(jid, jobVertex1.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), checkpointStateHandles);
coord.receiveAcknowledgeMessage(acknowledgeCheckpoint);
}
for (int index = 0; index < jobVertex2.getParallelism(); index++) {
ChainedStateHandle<StreamStateHandle> nonPartitionedState = generateStateForVertex(jobVertexID2, index);
ChainedStateHandle<OperatorStateHandle> partitionableState = generateChainedPartitionableStateHandle(jobVertexID2, index, 2, 8, false);
KeyGroupsStateHandle partitionedKeyGroupState = generateKeyGroupState(jobVertexID2, keyGroupPartitions2.get(index), false);
SubtaskState checkpointStateHandles = new SubtaskState(nonPartitionedState, partitionableState, null, partitionedKeyGroupState, null);
AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(jid, jobVertex2.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), checkpointStateHandles);
coord.receiveAcknowledgeMessage(acknowledgeCheckpoint);
}
List<CompletedCheckpoint> completedCheckpoints = coord.getSuccessfulCheckpoints();
assertEquals(1, completedCheckpoints.size());
Map<JobVertexID, ExecutionJobVertex> tasks = new HashMap<>();
tasks.put(jobVertexID1, jobVertex1);
tasks.put(jobVertexID2, jobVertex2);
coord.restoreLatestCheckpointedState(tasks, true, false);
// verify the restored state
verifyStateRestore(jobVertexID1, jobVertex1, keyGroupPartitions1);
verifyStateRestore(jobVertexID2, jobVertex2, keyGroupPartitions2);
}
use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorTest method testCheckpointTimeoutIsolated.
@Test
public void testCheckpointTimeoutIsolated() {
try {
final JobID jid = new JobID();
final long timestamp = System.currentTimeMillis();
// create some mock execution vertices
final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
final ExecutionAttemptID ackAttemptID1 = new ExecutionAttemptID();
final ExecutionAttemptID ackAttemptID2 = new ExecutionAttemptID();
final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();
ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
ExecutionVertex ackVertex1 = mockExecutionVertex(ackAttemptID1);
ExecutionVertex ackVertex2 = mockExecutionVertex(ackAttemptID2);
ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);
// set up the coordinator
// the timeout for the checkpoint is a 200 milliseconds
CheckpointCoordinator coord = new CheckpointCoordinator(jid, 600000, 200, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), new ExecutionVertex[] { triggerVertex }, new ExecutionVertex[] { ackVertex1, ackVertex2 }, new ExecutionVertex[] { commitVertex }, new StandaloneCheckpointIDCounter(), new StandaloneCompletedCheckpointStore(2), null, Executors.directExecutor());
// trigger a checkpoint, partially acknowledged
assertTrue(coord.triggerCheckpoint(timestamp, false));
assertEquals(1, coord.getNumberOfPendingCheckpoints());
PendingCheckpoint checkpoint = coord.getPendingCheckpoints().values().iterator().next();
assertFalse(checkpoint.isDiscarded());
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID1, checkpoint.getCheckpointId()));
// wait until the checkpoint must have expired.
// we check every 250 msecs conservatively for 5 seconds
// to give even slow build servers a very good chance of completing this
long deadline = System.currentTimeMillis() + 5000;
do {
Thread.sleep(250);
} while (!checkpoint.isDiscarded() && coord.getNumberOfPendingCheckpoints() > 0 && System.currentTimeMillis() < deadline);
assertTrue("Checkpoint was not canceled by the timeout", checkpoint.isDiscarded());
assertEquals(0, coord.getNumberOfPendingCheckpoints());
assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
// no confirm message must have been sent
verify(commitVertex.getCurrentExecutionAttempt(), times(0)).notifyCheckpointComplete(anyLong(), anyLong());
coord.shutdown(JobStatus.FINISHED);
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorTest method testMultipleConcurrentCheckpoints.
@Test
public void testMultipleConcurrentCheckpoints() {
try {
final JobID jid = new JobID();
final long timestamp1 = System.currentTimeMillis();
final long timestamp2 = timestamp1 + 8617;
// create some mock execution vertices
final ExecutionAttemptID triggerAttemptID1 = new ExecutionAttemptID();
final ExecutionAttemptID triggerAttemptID2 = new ExecutionAttemptID();
final ExecutionAttemptID ackAttemptID1 = new ExecutionAttemptID();
final ExecutionAttemptID ackAttemptID2 = new ExecutionAttemptID();
final ExecutionAttemptID ackAttemptID3 = new ExecutionAttemptID();
final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();
ExecutionVertex triggerVertex1 = mockExecutionVertex(triggerAttemptID1);
ExecutionVertex triggerVertex2 = mockExecutionVertex(triggerAttemptID2);
ExecutionVertex ackVertex1 = mockExecutionVertex(ackAttemptID1);
ExecutionVertex ackVertex2 = mockExecutionVertex(ackAttemptID2);
ExecutionVertex ackVertex3 = mockExecutionVertex(ackAttemptID3);
ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);
// set up the coordinator and validate the initial state
CheckpointCoordinator coord = new CheckpointCoordinator(jid, 600000, 600000, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), new ExecutionVertex[] { triggerVertex1, triggerVertex2 }, new ExecutionVertex[] { ackVertex1, ackVertex2, ackVertex3 }, new ExecutionVertex[] { commitVertex }, new StandaloneCheckpointIDCounter(), new StandaloneCompletedCheckpointStore(2), null, Executors.directExecutor());
assertEquals(0, coord.getNumberOfPendingCheckpoints());
assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
// trigger the first checkpoint. this should succeed
assertTrue(coord.triggerCheckpoint(timestamp1, false));
assertEquals(1, coord.getNumberOfPendingCheckpoints());
assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
PendingCheckpoint pending1 = coord.getPendingCheckpoints().values().iterator().next();
long checkpointId1 = pending1.getCheckpointId();
// trigger messages should have been sent
verify(triggerVertex1.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpointId1), eq(timestamp1), any(CheckpointOptions.class));
verify(triggerVertex2.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpointId1), eq(timestamp1), any(CheckpointOptions.class));
CheckpointMetaData checkpointMetaData1 = new CheckpointMetaData(checkpointId1, 0L);
// acknowledge one of the three tasks
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID2, checkpointId1));
// start the second checkpoint
// trigger the first checkpoint. this should succeed
assertTrue(coord.triggerCheckpoint(timestamp2, false));
assertEquals(2, coord.getNumberOfPendingCheckpoints());
assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
PendingCheckpoint pending2;
{
Iterator<PendingCheckpoint> all = coord.getPendingCheckpoints().values().iterator();
PendingCheckpoint cc1 = all.next();
PendingCheckpoint cc2 = all.next();
pending2 = pending1 == cc1 ? cc2 : cc1;
}
long checkpointId2 = pending2.getCheckpointId();
CheckpointMetaData checkpointMetaData2 = new CheckpointMetaData(checkpointId2, 0L);
// trigger messages should have been sent
verify(triggerVertex1.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpointId2), eq(timestamp2), any(CheckpointOptions.class));
verify(triggerVertex2.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpointId2), eq(timestamp2), any(CheckpointOptions.class));
// we acknowledge the remaining two tasks from the first
// checkpoint and two tasks from the second checkpoint
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID3, checkpointId1));
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID1, checkpointId2));
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID1, checkpointId1));
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID2, checkpointId2));
// now, the first checkpoint should be confirmed
assertEquals(1, coord.getNumberOfPendingCheckpoints());
assertEquals(1, coord.getNumberOfRetainedSuccessfulCheckpoints());
assertTrue(pending1.isDiscarded());
// the first confirm message should be out
verify(commitVertex.getCurrentExecutionAttempt(), times(1)).notifyCheckpointComplete(eq(checkpointId1), eq(timestamp1));
// send the last remaining ack for the second checkpoint
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID3, checkpointId2));
// now, the second checkpoint should be confirmed
assertEquals(0, coord.getNumberOfPendingCheckpoints());
assertEquals(2, coord.getNumberOfRetainedSuccessfulCheckpoints());
assertTrue(pending2.isDiscarded());
// the second commit message should be out
verify(commitVertex.getCurrentExecutionAttempt(), times(1)).notifyCheckpointComplete(eq(checkpointId2), eq(timestamp2));
// validate the committed checkpoints
List<CompletedCheckpoint> scs = coord.getSuccessfulCheckpoints();
CompletedCheckpoint sc1 = scs.get(0);
assertEquals(checkpointId1, sc1.getCheckpointID());
assertEquals(timestamp1, sc1.getTimestamp());
assertEquals(jid, sc1.getJobId());
assertTrue(sc1.getTaskStates().isEmpty());
CompletedCheckpoint sc2 = scs.get(1);
assertEquals(checkpointId2, sc2.getCheckpointID());
assertEquals(timestamp2, sc2.getTimestamp());
assertEquals(jid, sc2.getJobId());
assertTrue(sc2.getTaskStates().isEmpty());
coord.shutdown(JobStatus.FINISHED);
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorTest method testHandleMessagesForNonExistingCheckpoints.
@Test
public void testHandleMessagesForNonExistingCheckpoints() {
try {
final JobID jid = new JobID();
final long timestamp = System.currentTimeMillis();
// create some mock execution vertices and trigger some checkpoint
final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
final ExecutionAttemptID ackAttemptID1 = new ExecutionAttemptID();
final ExecutionAttemptID ackAttemptID2 = new ExecutionAttemptID();
final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();
ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
ExecutionVertex ackVertex1 = mockExecutionVertex(ackAttemptID1);
ExecutionVertex ackVertex2 = mockExecutionVertex(ackAttemptID2);
ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);
CheckpointCoordinator coord = new CheckpointCoordinator(jid, 200000, 200000, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), new ExecutionVertex[] { triggerVertex }, new ExecutionVertex[] { ackVertex1, ackVertex2 }, new ExecutionVertex[] { commitVertex }, new StandaloneCheckpointIDCounter(), new StandaloneCompletedCheckpointStore(2), null, Executors.directExecutor());
assertTrue(coord.triggerCheckpoint(timestamp, false));
long checkpointId = coord.getPendingCheckpoints().keySet().iterator().next();
// send some messages that do not belong to either the job or the any
// of the vertices that need to be acknowledged.
// non of the messages should throw an exception
CheckpointMetaData checkpointMetaData = new CheckpointMetaData(checkpointId, 0L);
// wrong job id
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(new JobID(), ackAttemptID1, checkpointId));
// unknown checkpoint
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID1, 1L));
// unknown ack vertex
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, new ExecutionAttemptID(), checkpointId));
coord.shutdown(JobStatus.FINISHED);
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorTest method testMaxConcurrentAttempsWithSubsumption.
@Test
public void testMaxConcurrentAttempsWithSubsumption() {
try {
final int maxConcurrentAttempts = 2;
final JobID jid = new JobID();
// create some mock execution vertices and trigger some checkpoint
final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();
ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);
CheckpointCoordinator coord = new CheckpointCoordinator(jid, // periodic interval is 10 ms
10, // timeout is very long (200 s)
200000, // no extra delay
0L, // max two concurrent checkpoints
maxConcurrentAttempts, ExternalizedCheckpointSettings.none(), new ExecutionVertex[] { triggerVertex }, new ExecutionVertex[] { ackVertex }, new ExecutionVertex[] { commitVertex }, new StandaloneCheckpointIDCounter(), new StandaloneCompletedCheckpointStore(2), null, Executors.directExecutor());
coord.startCheckpointScheduler();
// after a while, there should be exactly as many checkpoints
// as concurrently permitted
long now = System.currentTimeMillis();
long timeout = now + 60000;
long minDuration = now + 100;
do {
Thread.sleep(20);
} while ((now = System.currentTimeMillis()) < minDuration || (coord.getNumberOfPendingCheckpoints() < maxConcurrentAttempts && now < timeout));
// validate that the pending checkpoints are there
assertEquals(maxConcurrentAttempts, coord.getNumberOfPendingCheckpoints());
assertNotNull(coord.getPendingCheckpoints().get(1L));
assertNotNull(coord.getPendingCheckpoints().get(2L));
// now we acknowledge the second checkpoint, which should subsume the first checkpoint
// and allow two more checkpoints to be triggered
// now, once we acknowledge one checkpoint, it should trigger the next one
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID, 2L));
// after a while, there should be the new checkpoints
final long newTimeout = System.currentTimeMillis() + 60000;
do {
Thread.sleep(20);
} while (coord.getPendingCheckpoints().get(4L) == null && System.currentTimeMillis() < newTimeout);
// do the final check
assertEquals(maxConcurrentAttempts, coord.getNumberOfPendingCheckpoints());
assertNotNull(coord.getPendingCheckpoints().get(3L));
assertNotNull(coord.getPendingCheckpoints().get(4L));
coord.shutdown(JobStatus.FINISHED);
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
Aggregations