Example 11 with ExecutionVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.

In class CheckpointCoordinator, method triggerCheckpoint:

@VisibleForTesting
CheckpointTriggerResult triggerCheckpoint(long timestamp, CheckpointProperties props, String targetDirectory, boolean isPeriodic) {
    // Sanity check
    if (props.externalizeCheckpoint() && targetDirectory == null) {
        throw new IllegalStateException("No target directory specified to persist checkpoint to.");
    }
    // make some eager pre-checks
    synchronized (lock) {
        // abort if the coordinator has been shutdown in the meantime
        if (shutdown) {
            return new CheckpointTriggerResult(CheckpointDeclineReason.COORDINATOR_SHUTDOWN);
        }
        // Don't allow periodic checkpoint if scheduling has been disabled
        if (isPeriodic && !periodicScheduling) {
            return new CheckpointTriggerResult(CheckpointDeclineReason.PERIODIC_SCHEDULER_SHUTDOWN);
        }
        // these checks are not relevant for savepoints
        if (!props.forceCheckpoint()) {
            // sanity check: there should never be more than one trigger request queued
            if (triggerRequestQueued) {
                LOG.warn("Trying to trigger another checkpoint while one was queued already");
                return new CheckpointTriggerResult(CheckpointDeclineReason.ALREADY_QUEUED);
            }
            // if too many checkpoints are currently in progress, we need to mark that a request is queued
            if (pendingCheckpoints.size() >= maxConcurrentCheckpointAttempts) {
                triggerRequestQueued = true;
                if (currentPeriodicTrigger != null) {
                    currentPeriodicTrigger.cancel(false);
                    currentPeriodicTrigger = null;
                }
                return new CheckpointTriggerResult(CheckpointDeclineReason.TOO_MANY_CONCURRENT_CHECKPOINTS);
            }
            // make sure the minimum interval between checkpoints has passed
            final long earliestNext = lastCheckpointCompletionNanos + minPauseBetweenCheckpointsNanos;
            final long durationTillNextMillis = (earliestNext - System.nanoTime()) / 1_000_000;
            if (durationTillNextMillis > 0) {
                if (currentPeriodicTrigger != null) {
                    currentPeriodicTrigger.cancel(false);
                    currentPeriodicTrigger = null;
                }
                // reschedule the periodic trigger so that it first fires once the minimum pause has passed
                currentPeriodicTrigger = timer.scheduleAtFixedRate(new ScheduledTrigger(), durationTillNextMillis, baseInterval, TimeUnit.MILLISECONDS);
                return new CheckpointTriggerResult(CheckpointDeclineReason.MINIMUM_TIME_BETWEEN_CHECKPOINTS);
            }
        }
    }
    // check if all tasks that we need to trigger are running.
    // if not, abort the checkpoint
    Execution[] executions = new Execution[tasksToTrigger.length];
    for (int i = 0; i < tasksToTrigger.length; i++) {
        Execution ee = tasksToTrigger[i].getCurrentExecutionAttempt();
        if (ee != null && ee.getState() == ExecutionState.RUNNING) {
            executions[i] = ee;
        } else {
            LOG.info("Checkpoint triggering task {} is not being executed at the moment. Aborting checkpoint.", tasksToTrigger[i].getSimpleName());
            return new CheckpointTriggerResult(CheckpointDeclineReason.NOT_ALL_REQUIRED_TASKS_RUNNING);
        }
    }
    // next, check if all tasks that need to acknowledge the checkpoint are running.
    // if not, abort the checkpoint
    Map<ExecutionAttemptID, ExecutionVertex> ackTasks = new HashMap<>(tasksToWaitFor.length);
    for (ExecutionVertex ev : tasksToWaitFor) {
        Execution ee = ev.getCurrentExecutionAttempt();
        if (ee != null) {
            ackTasks.put(ee.getAttemptId(), ev);
        } else {
            LOG.info("Checkpoint acknowledging task {} is not being executed at the moment. Aborting checkpoint.", ev.getSimpleName());
            return new CheckpointTriggerResult(CheckpointDeclineReason.NOT_ALL_REQUIRED_TASKS_RUNNING);
        }
    }
    // we will actually trigger this checkpoint!
    // we lock with a special lock here (not the coordinator-wide lock), because the
    // checkpoint ID counter below may issue blocking operations; this way we avoid
    // blocking the processing of 'acknowledge/decline' messages during that time.
    synchronized (triggerLock) {
        final long checkpointID;
        try {
            // this must happen outside the coordinator-wide lock, because it communicates
            // with external services (in HA mode) and may block for a while.
            checkpointID = checkpointIdCounter.getAndIncrement();
        } catch (Throwable t) {
            int numUnsuccessful = numUnsuccessfulCheckpointsTriggers.incrementAndGet();
            LOG.warn("Failed to trigger checkpoint (" + numUnsuccessful + " consecutive failed attempts so far)", t);
            return new CheckpointTriggerResult(CheckpointDeclineReason.EXCEPTION);
        }
        final PendingCheckpoint checkpoint = new PendingCheckpoint(job, checkpointID, timestamp, ackTasks, props, targetDirectory, executor);
        if (statsTracker != null) {
            PendingCheckpointStats callback = statsTracker.reportPendingCheckpoint(checkpointID, timestamp, props);
            checkpoint.setStatsCallback(callback);
        }
        // schedule the timer that will clean up the expired checkpoints
        final Runnable canceller = new Runnable() {

            @Override
            public void run() {
                synchronized (lock) {
                    // note that checkpoint completion discards the pending checkpoint object
                    if (!checkpoint.isDiscarded()) {
                        LOG.info("Checkpoint " + checkpointID + " expired before completing.");
                        checkpoint.abortExpired();
                        pendingCheckpoints.remove(checkpointID);
                        rememberRecentCheckpointId(checkpointID);
                        triggerQueuedRequests();
                    }
                }
            }
        };
        try {
            // re-acquire the coordinator-wide lock
            synchronized (lock) {
                // since we released the lock in the meantime, we need to re-check
                // that the conditions still hold.
                if (shutdown) {
                    return new CheckpointTriggerResult(CheckpointDeclineReason.COORDINATOR_SHUTDOWN);
                } else if (!props.forceCheckpoint()) {
                    if (triggerRequestQueued) {
                        LOG.warn("Trying to trigger another checkpoint while one was queued already");
                        return new CheckpointTriggerResult(CheckpointDeclineReason.ALREADY_QUEUED);
                    }
                    if (pendingCheckpoints.size() >= maxConcurrentCheckpointAttempts) {
                        triggerRequestQueued = true;
                        if (currentPeriodicTrigger != null) {
                            currentPeriodicTrigger.cancel(false);
                            currentPeriodicTrigger = null;
                        }
                        return new CheckpointTriggerResult(CheckpointDeclineReason.TOO_MANY_CONCURRENT_CHECKPOINTS);
                    }
                    // make sure the minimum interval between checkpoints has passed
                    final long earliestNext = lastCheckpointCompletionNanos + minPauseBetweenCheckpointsNanos;
                    final long durationTillNextMillis = (earliestNext - System.nanoTime()) / 1_000_000;
                    if (durationTillNextMillis > 0) {
                        if (currentPeriodicTrigger != null) {
                            currentPeriodicTrigger.cancel(false);
                            currentPeriodicTrigger = null;
                        }
                        // reschedule the periodic trigger so that it first fires once the minimum pause has passed
                        currentPeriodicTrigger = timer.scheduleAtFixedRate(new ScheduledTrigger(), durationTillNextMillis, baseInterval, TimeUnit.MILLISECONDS);
                        return new CheckpointTriggerResult(CheckpointDeclineReason.MINIMUM_TIME_BETWEEN_CHECKPOINTS);
                    }
                }
                LOG.info("Triggering checkpoint " + checkpointID + " @ " + timestamp);
                pendingCheckpoints.put(checkpointID, checkpoint);
                ScheduledFuture<?> cancellerHandle = timer.schedule(canceller, checkpointTimeout, TimeUnit.MILLISECONDS);
                if (!checkpoint.setCancellerHandle(cancellerHandle)) {
                    // checkpoint is already disposed!
                    cancellerHandle.cancel(false);
                }
            }
            // end of lock scope
            CheckpointOptions checkpointOptions;
            if (!props.isSavepoint()) {
                checkpointOptions = CheckpointOptions.forFullCheckpoint();
            } else {
                checkpointOptions = CheckpointOptions.forSavepoint(targetDirectory);
            }
            // send the messages to the tasks that trigger their checkpoint
            for (Execution execution : executions) {
                execution.triggerCheckpoint(checkpointID, timestamp, checkpointOptions);
            }
            numUnsuccessfulCheckpointsTriggers.set(0);
            return new CheckpointTriggerResult(checkpoint);
        } catch (Throwable t) {
            // guard the map against concurrent modifications
            synchronized (lock) {
                pendingCheckpoints.remove(checkpointID);
            }
            int numUnsuccessful = numUnsuccessfulCheckpointsTriggers.incrementAndGet();
            LOG.warn("Failed to trigger checkpoint (" + numUnsuccessful + " consecutive failed attempts so far)", t);
            if (!checkpoint.isDiscarded()) {
                checkpoint.abortError(new Exception("Failed to trigger checkpoint"));
            }
            return new CheckpointTriggerResult(CheckpointDeclineReason.EXCEPTION);
        }
    } // end trigger lock
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) Execution(org.apache.flink.runtime.executiongraph.Execution) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting)
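
The notable part of triggerCheckpoint is its locking discipline: the eager pre-checks run under the coordinator-wide lock, the lock is released for the potentially blocking checkpoint ID allocation, and the same conditions are re-checked once the lock is re-acquired. The standalone sketch below condenses that pattern; all names in it (DoubleCheckedTrigger, MAX_PENDING, pending) are illustrative, not Flink API.

import java.util.concurrent.atomic.AtomicLong;

// Condensed sketch of the "check, unlock, do blocking work, re-check" pattern above.
public class DoubleCheckedTrigger {

    private final Object lock = new Object();
    private final AtomicLong idCounter = new AtomicLong();
    private static final int MAX_PENDING = 1;
    private int pending;

    long trigger() {
        // eager pre-check under the coordinator-wide lock
        synchronized (lock) {
            if (pending >= MAX_PENDING) {
                throw new IllegalStateException("declined: too many concurrent attempts");
            }
        }

        // expensive, potentially blocking work happens outside the lock
        // (in the real code: fetching the checkpoint ID from an external HA counter)
        long checkpointId = idCounter.incrementAndGet();

        synchronized (lock) {
            // the lock was released in the meantime, so the condition must be re-checked
            if (pending >= MAX_PENDING) {
                throw new IllegalStateException("declined: too many concurrent attempts");
            }
            pending++;
            return checkpointId;
        }
    }
}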

Example 12 with ExecutionVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.

In class PendingCheckpoint, method acknowledgeTask:

/**
	 * Acknowledges the task with the given execution attempt id and the given subtask state.
	 *
	 * @param executionAttemptId of the acknowledged task
	 * @param subtaskState of the acknowledged task
	 * @param metrics Checkpoint metrics for the stats
	 * @return TaskAcknowledgeResult of the operation
	 */
public TaskAcknowledgeResult acknowledgeTask(ExecutionAttemptID executionAttemptId, SubtaskState subtaskState, CheckpointMetrics metrics) {
    synchronized (lock) {
        if (discarded) {
            return TaskAcknowledgeResult.DISCARDED;
        }
        final ExecutionVertex vertex = notYetAcknowledgedTasks.remove(executionAttemptId);
        if (vertex == null) {
            if (acknowledgedTasks.contains(executionAttemptId)) {
                return TaskAcknowledgeResult.DUPLICATE;
            } else {
                return TaskAcknowledgeResult.UNKNOWN;
            }
        } else {
            acknowledgedTasks.add(executionAttemptId);
        }
        JobVertexID jobVertexID = vertex.getJobvertexId();
        int subtaskIndex = vertex.getParallelSubtaskIndex();
        long ackTimestamp = System.currentTimeMillis();
        long stateSize = 0;
        if (null != subtaskState) {
            TaskState taskState = taskStates.get(jobVertexID);
            if (null == taskState) {
                @SuppressWarnings("deprecation") ChainedStateHandle<StreamStateHandle> nonPartitionedState = subtaskState.getLegacyOperatorState();
                ChainedStateHandle<OperatorStateHandle> partitionableState = subtaskState.getManagedOperatorState();
                //TODO this should go away when we remove chained state, assigning state to operators directly instead
                int chainLength;
                if (nonPartitionedState != null) {
                    chainLength = nonPartitionedState.getLength();
                } else if (partitionableState != null) {
                    chainLength = partitionableState.getLength();
                } else {
                    chainLength = 1;
                }
                taskState = new TaskState(jobVertexID, vertex.getTotalNumberOfParallelSubtasks(), vertex.getMaxParallelism(), chainLength);
                taskStates.put(jobVertexID, taskState);
            }
            taskState.putState(subtaskIndex, subtaskState);
            stateSize = subtaskState.getStateSize();
        }
        ++numAcknowledgedTasks;
        // publish the checkpoint statistics
        // to prevent null-pointers from concurrent modification, copy reference onto stack
        final PendingCheckpointStats statsCallback = this.statsCallback;
        if (statsCallback != null) {
            // Do this in millis because the web frontend works with them
            long alignmentDurationMillis = metrics.getAlignmentDurationNanos() / 1_000_000;
            SubtaskStateStats subtaskStateStats = new SubtaskStateStats(subtaskIndex, ackTimestamp, stateSize, metrics.getSyncDurationMillis(), metrics.getAsyncDurationMillis(), metrics.getBytesBufferedInAlignment(), alignmentDurationMillis);
            statsCallback.reportSubtaskStats(jobVertexID, subtaskStateStats);
        }
        return TaskAcknowledgeResult.SUCCESS;
    }
}
Also used : StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) Savepoint(org.apache.flink.runtime.checkpoint.savepoint.Savepoint)
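
The deduplication in acknowledgeTask rests on two collections: notYetAcknowledgedTasks, from which an attempt ID is removed on its first acknowledgement, and acknowledgedTasks, which distinguishes a duplicate ack from an ack for an unknown attempt. A condensed, hypothetical sketch of that scheme follows; String stands in for ExecutionAttemptID, and the map value for the ExecutionVertex.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Illustrative sketch of the first-ack/duplicate/unknown classification above.
public class AckTracker {

    enum Result { SUCCESS, DUPLICATE, UNKNOWN }

    // attempt id -> some payload (the real code maps to the ExecutionVertex)
    private final Map<String, String> notYetAcknowledged = new HashMap<>();
    private final Set<String> acknowledged = new HashSet<>();

    AckTracker(Iterable<String> expectedAttemptIds) {
        for (String id : expectedAttemptIds) {
            notYetAcknowledged.put(id, id);
        }
    }

    synchronized Result acknowledge(String attemptId) {
        if (notYetAcknowledged.remove(attemptId) != null) {
            acknowledged.add(attemptId); // first acknowledgement for this attempt
            return Result.SUCCESS;
        }
        // not pending: either it acked before (duplicate) or it was never expected
        return acknowledged.contains(attemptId) ? Result.DUPLICATE : Result.UNKNOWN;
    }
}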

Example 13 with ExecutionVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.

In class CheckpointCoordinatorExternalizedCheckpointsTest, method testTriggerAndConfirmSimpleExternalizedCheckpoint:

/**
	 * Triggers multiple externalized checkpoints and verifies that the metadata
	 * files have been created.
	 */
@Test
public void testTriggerAndConfirmSimpleExternalizedCheckpoint() throws Exception {
    final JobID jid = new JobID();
    final ExternalizedCheckpointSettings externalizedCheckpointSettings = ExternalizedCheckpointSettings.externalizeCheckpoints(false);
    final File checkpointDir = tmp.newFolder();
    // create some mock Execution vertices that receive the checkpoint trigger messages
    final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
    final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
    ExecutionVertex vertex1 = CheckpointCoordinatorTest.mockExecutionVertex(attemptID1);
    ExecutionVertex vertex2 = CheckpointCoordinatorTest.mockExecutionVertex(attemptID2);
    Map<JobVertexID, ExecutionJobVertex> jobVertices = new HashMap<>();
    jobVertices.put(vertex1.getJobvertexId(), vertex1.getJobVertex());
    jobVertices.put(vertex2.getJobvertexId(), vertex2.getJobVertex());
    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinator(
        jid,
        600000,
        600000,
        0,
        Integer.MAX_VALUE,
        externalizedCheckpointSettings,
        new ExecutionVertex[] { vertex1, vertex2 },
        new ExecutionVertex[] { vertex1, vertex2 },
        new ExecutionVertex[] { vertex1, vertex2 },
        new StandaloneCheckpointIDCounter(),
        new StandaloneCompletedCheckpointStore(1),
        checkpointDir.getAbsolutePath(),
        Executors.directExecutor());
    assertEquals(0, coord.getNumberOfPendingCheckpoints());
    assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
    // ---------------
    // trigger checkpoint 1
    // ---------------
    {
        final long timestamp1 = System.currentTimeMillis();
        coord.triggerCheckpoint(timestamp1, false);
        long checkpointId1 = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId1));
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId1));
        CompletedCheckpoint latest = coord.getCheckpointStore().getLatestCheckpoint();
        verifyExternalizedCheckpoint(latest, jid, checkpointId1, timestamp1);
        verifyExternalizedCheckpointRestore(latest, jobVertices, vertex1, vertex2);
    }
    // ---------------
    // trigger checkpoint 2
    // ---------------
    {
        final long timestamp2 = System.currentTimeMillis() + 7;
        coord.triggerCheckpoint(timestamp2, false);
        long checkpointId2 = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId2));
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId2));
        CompletedCheckpoint latest = coord.getCheckpointStore().getLatestCheckpoint();
        verifyExternalizedCheckpoint(latest, jid, checkpointId2, timestamp2);
        verifyExternalizedCheckpointRestore(latest, jobVertices, vertex1, vertex2);
    }
    // ---------------
    // trigger checkpoint 3
    // ---------------
    {
        final long timestamp3 = System.currentTimeMillis() + 146;
        coord.triggerCheckpoint(timestamp3, false);
        long checkpointId3 = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId3));
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId3));
        CompletedCheckpoint latest = coord.getCheckpointStore().getLatestCheckpoint();
        verifyExternalizedCheckpoint(latest, jid, checkpointId3, timestamp3);
        verifyExternalizedCheckpointRestore(latest, jobVertices, vertex1, vertex2);
    }
    coord.shutdown(JobStatus.FINISHED);
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) HashMap(java.util.HashMap) ExternalizedCheckpointSettings(org.apache.flink.runtime.jobgraph.tasks.ExternalizedCheckpointSettings) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) File(java.io.File) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)
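
The test depends on CheckpointCoordinatorTest.mockExecutionVertex, which is not shown in this excerpt. One plausible shape for such a helper, assuming Mockito, is sketched below; the real helper may stub more methods. The coordinator only triggers tasks whose current execution attempt is RUNNING, so the mock must at least provide an attempt in that state.

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.executiongraph.Execution;
import org.apache.flink.runtime.executiongraph.ExecutionAttemptID;
import org.apache.flink.runtime.executiongraph.ExecutionVertex;
import org.apache.flink.runtime.jobgraph.JobVertexID;

// Hypothetical sketch of a mock helper like the one referenced above.
class MockVertices {

    static ExecutionVertex mockExecutionVertex(ExecutionAttemptID attemptID) {
        // a RUNNING execution attempt that reports the given attempt id
        Execution execution = mock(Execution.class);
        when(execution.getAttemptId()).thenReturn(attemptID);
        when(execution.getState()).thenReturn(ExecutionState.RUNNING);

        // a vertex whose current attempt is the mock above
        ExecutionVertex vertex = mock(ExecutionVertex.class);
        when(vertex.getJobvertexId()).thenReturn(new JobVertexID());
        when(vertex.getCurrentExecutionAttempt()).thenReturn(execution);
        return vertex;
    }
}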

Example 14 with ExecutionVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.

In class CheckpointCoordinatorTest, method testRestoreLatestCheckpointedStateWithChangingParallelism:

/**
	 * Tests checkpoint restoration when the parallelism of a job vertex with
	 * partitioned state changes.
	 *
	 * @throws Exception
	 */
private void testRestoreLatestCheckpointedStateWithChangingParallelism(boolean scaleOut) throws Exception {
    final JobID jid = new JobID();
    final long timestamp = System.currentTimeMillis();
    final JobVertexID jobVertexID1 = new JobVertexID();
    final JobVertexID jobVertexID2 = new JobVertexID();
    int parallelism1 = 3;
    int parallelism2 = scaleOut ? 2 : 13;
    int maxParallelism1 = 42;
    int maxParallelism2 = 13;
    int newParallelism2 = scaleOut ? 13 : 2;
    final ExecutionJobVertex jobVertex1 = mockExecutionJobVertex(jobVertexID1, parallelism1, maxParallelism1);
    final ExecutionJobVertex jobVertex2 = mockExecutionJobVertex(jobVertexID2, parallelism2, maxParallelism2);
    List<ExecutionVertex> allExecutionVertices = new ArrayList<>(parallelism1 + parallelism2);
    allExecutionVertices.addAll(Arrays.asList(jobVertex1.getTaskVertices()));
    allExecutionVertices.addAll(Arrays.asList(jobVertex2.getTaskVertices()));
    ExecutionVertex[] arrayExecutionVertices = allExecutionVertices.toArray(new ExecutionVertex[allExecutionVertices.size()]);
    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinator(
        jid,
        600000,
        600000,
        0,
        Integer.MAX_VALUE,
        ExternalizedCheckpointSettings.none(),
        arrayExecutionVertices,
        arrayExecutionVertices,
        arrayExecutionVertices,
        new StandaloneCheckpointIDCounter(),
        new StandaloneCompletedCheckpointStore(1),
        null,
        Executors.directExecutor());
    // trigger the checkpoint
    coord.triggerCheckpoint(timestamp, false);
    assertTrue(coord.getPendingCheckpoints().keySet().size() == 1);
    long checkpointId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());
    CheckpointMetaData checkpointMetaData = new CheckpointMetaData(checkpointId, 0L);
    List<KeyGroupRange> keyGroupPartitions1 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism1, parallelism1);
    List<KeyGroupRange> keyGroupPartitions2 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, parallelism2);
    //vertex 1
    for (int index = 0; index < jobVertex1.getParallelism(); index++) {
        ChainedStateHandle<StreamStateHandle> valueSizeTuple = generateStateForVertex(jobVertexID1, index);
        ChainedStateHandle<OperatorStateHandle> opStateBackend = generateChainedPartitionableStateHandle(jobVertexID1, index, 2, 8, false);
        KeyGroupsStateHandle keyedStateBackend = generateKeyGroupState(jobVertexID1, keyGroupPartitions1.get(index), false);
        KeyGroupsStateHandle keyedStateRaw = generateKeyGroupState(jobVertexID1, keyGroupPartitions1.get(index), true);
        SubtaskState checkpointStateHandles = new SubtaskState(valueSizeTuple, opStateBackend, null, keyedStateBackend, keyedStateRaw);
        AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(jid, jobVertex1.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), checkpointStateHandles);
        coord.receiveAcknowledgeMessage(acknowledgeCheckpoint);
    }
    //vertex 2
    final List<ChainedStateHandle<OperatorStateHandle>> expectedOpStatesBackend = new ArrayList<>(jobVertex2.getParallelism());
    final List<ChainedStateHandle<OperatorStateHandle>> expectedOpStatesRaw = new ArrayList<>(jobVertex2.getParallelism());
    for (int index = 0; index < jobVertex2.getParallelism(); index++) {
        KeyGroupsStateHandle keyedStateBackend = generateKeyGroupState(jobVertexID2, keyGroupPartitions2.get(index), false);
        KeyGroupsStateHandle keyedStateRaw = generateKeyGroupState(jobVertexID2, keyGroupPartitions2.get(index), true);
        ChainedStateHandle<OperatorStateHandle> opStateBackend = generateChainedPartitionableStateHandle(jobVertexID2, index, 2, 8, false);
        ChainedStateHandle<OperatorStateHandle> opStateRaw = generateChainedPartitionableStateHandle(jobVertexID2, index, 2, 8, true);
        expectedOpStatesBackend.add(opStateBackend);
        expectedOpStatesRaw.add(opStateRaw);
        SubtaskState checkpointStateHandles = new SubtaskState(new ChainedStateHandle<>(Collections.<StreamStateHandle>singletonList(null)), opStateBackend, opStateRaw, keyedStateBackend, keyedStateRaw);
        AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(jid, jobVertex2.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), checkpointStateHandles);
        coord.receiveAcknowledgeMessage(acknowledgeCheckpoint);
    }
    List<CompletedCheckpoint> completedCheckpoints = coord.getSuccessfulCheckpoints();
    assertEquals(1, completedCheckpoints.size());
    Map<JobVertexID, ExecutionJobVertex> tasks = new HashMap<>();
    List<KeyGroupRange> newKeyGroupPartitions2 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, newParallelism2);
    final ExecutionJobVertex newJobVertex1 = mockExecutionJobVertex(jobVertexID1, parallelism1, maxParallelism1);
    // rescale vertex 2
    final ExecutionJobVertex newJobVertex2 = mockExecutionJobVertex(jobVertexID2, newParallelism2, maxParallelism2);
    tasks.put(jobVertexID1, newJobVertex1);
    tasks.put(jobVertexID2, newJobVertex2);
    coord.restoreLatestCheckpointedState(tasks, true, false);
    // verify the restored state
    verifyStateRestore(jobVertexID1, newJobVertex1, keyGroupPartitions1);
    List<List<Collection<OperatorStateHandle>>> actualOpStatesBackend = new ArrayList<>(newJobVertex2.getParallelism());
    List<List<Collection<OperatorStateHandle>>> actualOpStatesRaw = new ArrayList<>(newJobVertex2.getParallelism());
    for (int i = 0; i < newJobVertex2.getParallelism(); i++) {
        KeyGroupsStateHandle originalKeyedStateBackend = generateKeyGroupState(jobVertexID2, newKeyGroupPartitions2.get(i), false);
        KeyGroupsStateHandle originalKeyedStateRaw = generateKeyGroupState(jobVertexID2, newKeyGroupPartitions2.get(i), true);
        TaskStateHandles taskStateHandles = newJobVertex2.getTaskVertices()[i].getCurrentExecutionAttempt().getTaskStateHandles();
        ChainedStateHandle<StreamStateHandle> operatorState = taskStateHandles.getLegacyOperatorState();
        List<Collection<OperatorStateHandle>> opStateBackend = taskStateHandles.getManagedOperatorState();
        List<Collection<OperatorStateHandle>> opStateRaw = taskStateHandles.getRawOperatorState();
        Collection<KeyGroupsStateHandle> keyGroupStateBackend = taskStateHandles.getManagedKeyedState();
        Collection<KeyGroupsStateHandle> keyGroupStateRaw = taskStateHandles.getRawKeyedState();
        actualOpStatesBackend.add(opStateBackend);
        actualOpStatesRaw.add(opStateRaw);
        assertNull(operatorState);
        compareKeyedState(Collections.singletonList(originalKeyedStateBackend), keyGroupStateBackend);
        compareKeyedState(Collections.singletonList(originalKeyedStateRaw), keyGroupStateRaw);
    }
    comparePartitionableState(expectedOpStatesBackend, actualOpStatesBackend);
    comparePartitionableState(expectedOpStatesRaw, actualOpStatesRaw);
}
Also used : HashMap(java.util.HashMap) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ArrayList(java.util.ArrayList) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) KeyGroupsStateHandle(org.apache.flink.runtime.state.KeyGroupsStateHandle) ChainedStateHandle(org.apache.flink.runtime.state.ChainedStateHandle) StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) ByteStreamStateHandle(org.apache.flink.runtime.state.memory.ByteStreamStateHandle) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) List(java.util.List) ArrayList(java.util.ArrayList) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) TaskStateHandles(org.apache.flink.runtime.state.TaskStateHandles) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) Collection(java.util.Collection) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) JobID(org.apache.flink.api.common.JobID)
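
The rescaling step above hinges on StateAssignmentOperation.createKeyGroupPartitions splitting the maxParallelism key groups into one contiguous range per subtask. The standalone sketch below illustrates that contiguous-range idea with the test's numbers (13 key groups redistributed over 2 subtasks); it mirrors the scheme, not Flink's actual implementation.

import java.util.ArrayList;
import java.util.List;

// Illustrative key-group partitioning: maxParallelism key groups, split into
// contiguous, inclusive [start, end] ranges, one per subtask.
class KeyGroupPartitioning {

    static int[] rangeForSubtask(int maxParallelism, int parallelism, int subtaskIndex) {
        int start = (subtaskIndex * maxParallelism + parallelism - 1) / parallelism;
        int end = ((subtaskIndex + 1) * maxParallelism - 1) / parallelism;
        return new int[] { start, end };
    }

    static List<int[]> createKeyGroupPartitions(int maxParallelism, int parallelism) {
        List<int[]> partitions = new ArrayList<>(parallelism);
        for (int i = 0; i < parallelism; i++) {
            partitions.add(rangeForSubtask(maxParallelism, parallelism, i));
        }
        return partitions;
    }

    public static void main(String[] args) {
        // 13 key groups over 2 subtasks -> [0, 6] and [7, 12]
        for (int[] r : createKeyGroupPartitions(13, 2)) {
            System.out.println("[" + r[0] + ", " + r[1] + "]");
        }
    }
}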

Example 15 with ExecutionVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.

In class CheckpointCoordinatorTest, method testSavepointsAreNotSubsumed:

/**
	 * Triggers a savepoint and two checkpoints. The second checkpoint completes
	 * and subsumes the first checkpoint, but not the first savepoint. Then we
	 * trigger another checkpoint and savepoint. The second savepoint completes and
	 * subsumes the last checkpoint, but not the first savepoint.
	 */
@Test
public void testSavepointsAreNotSubsumed() throws Exception {
    final JobID jid = new JobID();
    final long timestamp = System.currentTimeMillis();
    // create some mock Execution vertices that receive the checkpoint trigger messages
    final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
    final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
    ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);
    ExecutionVertex vertex2 = mockExecutionVertex(attemptID2);
    StandaloneCheckpointIDCounter counter = new StandaloneCheckpointIDCounter();
    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinator(
        jid,
        600000,
        600000,
        0,
        Integer.MAX_VALUE,
        ExternalizedCheckpointSettings.none(),
        new ExecutionVertex[] { vertex1, vertex2 },
        new ExecutionVertex[] { vertex1, vertex2 },
        new ExecutionVertex[] { vertex1, vertex2 },
        counter,
        new StandaloneCompletedCheckpointStore(10),
        null,
        Executors.directExecutor());
    String savepointDir = tmpFolder.newFolder().getAbsolutePath();
    // Trigger savepoint and checkpoint
    Future<CompletedCheckpoint> savepointFuture1 = coord.triggerSavepoint(timestamp, savepointDir);
    long savepointId1 = counter.getLast();
    CheckpointMetaData checkpointMetaDataS1 = new CheckpointMetaData(savepointId1, 0L);
    assertEquals(1, coord.getNumberOfPendingCheckpoints());
    assertTrue(coord.triggerCheckpoint(timestamp + 1, false));
    assertEquals(2, coord.getNumberOfPendingCheckpoints());
    assertTrue(coord.triggerCheckpoint(timestamp + 2, false));
    long checkpointId2 = counter.getLast();
    assertEquals(3, coord.getNumberOfPendingCheckpoints());
    CheckpointMetaData checkpointMetaData2 = new CheckpointMetaData(checkpointId2, 0L);
    // 2nd checkpoint should subsume the 1st checkpoint, but not the savepoint
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId2));
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId2));
    assertEquals(1, coord.getNumberOfPendingCheckpoints());
    assertEquals(1, coord.getNumberOfRetainedSuccessfulCheckpoints());
    assertFalse(coord.getPendingCheckpoints().get(savepointId1).isDiscarded());
    assertFalse(savepointFuture1.isDone());
    assertTrue(coord.triggerCheckpoint(timestamp + 3, false));
    assertEquals(2, coord.getNumberOfPendingCheckpoints());
    Future<CompletedCheckpoint> savepointFuture2 = coord.triggerSavepoint(timestamp + 4, savepointDir);
    long savepointId2 = counter.getLast();
    CheckpointMetaData checkpointMetaDataS2 = new CheckpointMetaData(savepointId2, 0L);
    assertEquals(3, coord.getNumberOfPendingCheckpoints());
    // 2nd savepoint should subsume the last checkpoint, but not the 1st savepoint
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, savepointId2));
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, savepointId2));
    assertEquals(1, coord.getNumberOfPendingCheckpoints());
    assertEquals(2, coord.getNumberOfRetainedSuccessfulCheckpoints());
    assertFalse(coord.getPendingCheckpoints().get(savepointId1).isDiscarded());
    assertFalse(savepointFuture1.isDone());
    assertTrue(savepointFuture2.isDone());
    // Ack first savepoint
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, savepointId1));
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, savepointId1));
    assertEquals(0, coord.getNumberOfPendingCheckpoints());
    assertEquals(3, coord.getNumberOfRetainedSuccessfulCheckpoints());
    assertTrue(savepointFuture1.isDone());
}
Also used : AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobID(org.apache.flink.api.common.JobID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) Test(org.junit.Test)
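
The behavior under test is the subsumption rule: when a newer checkpoint completes, older pending checkpoints are dropped, while pending savepoints survive until they are acknowledged themselves. A condensed, hypothetical sketch of that rule follows; PendingEntry and dropSubsumed are illustrative names, not Flink API.

import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

// Illustrative subsumption: completing a checkpoint drops older pending
// checkpoints but never pending savepoints.
class SubsumptionSketch {

    static class PendingEntry {
        final boolean isSavepoint;
        PendingEntry(boolean isSavepoint) { this.isSavepoint = isSavepoint; }
    }

    // pending checkpoints keyed by checkpoint id, in ascending order
    final TreeMap<Long, PendingEntry> pending = new TreeMap<>();

    // called when the checkpoint with id completedId completes
    // (the completed entry itself leaves the pending map separately)
    void dropSubsumed(long completedId) {
        Iterator<Map.Entry<Long, PendingEntry>> it =
                pending.headMap(completedId, false).entrySet().iterator();
        while (it.hasNext()) {
            if (!it.next().getValue().isSavepoint) {
                it.remove(); // older plain checkpoints are subsumed by the newer one
            }
            // pending savepoints stay until their own acknowledgements arrive
        }
    }
}

With pending entries {1: savepoint, 2: checkpoint, 3: checkpoint}, completing id 3 removes id 2 but leaves the savepoint with id 1 pending, which is exactly what the assertions above verify.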

Aggregations

ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex): 65 usages
Test (org.junit.Test): 47 usages
JobID (org.apache.flink.api.common.JobID): 42 usages
ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID): 41 usages
AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint): 23 usages
IOException (java.io.IOException): 15 usages
Execution (org.apache.flink.runtime.executiongraph.Execution): 15 usages
JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID): 15 usages
ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex): 12 usages
DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint): 12 usages
HashMap (java.util.HashMap): 10 usages
ArrayList (java.util.ArrayList): 8 usages
TriggerStackTraceSample (org.apache.flink.runtime.messages.StackTraceSampleMessages.TriggerStackTraceSample): 8 usages
StreamStateHandle (org.apache.flink.runtime.state.StreamStateHandle): 7 usages
ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph): 5 usages
IntermediateResultPartition (org.apache.flink.runtime.executiongraph.IntermediateResultPartition): 5 usages
SimpleSlot (org.apache.flink.runtime.instance.SimpleSlot): 5 usages
ResultPartitionID (org.apache.flink.runtime.io.network.partition.ResultPartitionID): 5 usages
KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange): 5 usages
KeyGroupsStateHandle (org.apache.flink.runtime.state.KeyGroupsStateHandle): 5 usages