Search in sources :

Example 6 with OperatorIDPair

use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.

the class TestJobDataFlowValidator method getTrackedOperatorID.

/**
 * Looks for the first tracked operator in the operator chain of the given vertex.
 *
 * <p>The list returned by {@link JobVertex#getOperatorIDs()} is scanned from index 0 forwards
 * when {@code upstream} is {@code true}, and from the end backwards otherwise. Each operator is
 * identified by its user-defined ID if one is present, else by its generated ID; it counts as
 * tracked when the string form of that ID is contained in {@code
 * testJob.operatorsWithDataFlowTracking}.
 *
 * @param vertex vertex whose chained operators are inspected
 * @param upstream {@code true} to scan forwards from the start of the list, {@code false} to
 *     scan backwards from its end
 * @param testJob job description providing the collection of tracked operator IDs
 * @return the ID of the first tracked operator encountered, or an empty optional if the chain
 *     contains no tracked operator
 */
private static Optional<String> getTrackedOperatorID(JobVertex vertex, boolean upstream, TestJobWithDescription testJob) {
    // A list iterator positioned at size() has nothing after it, so previous() walks the chain backwards.
    final int startIndex = upstream ? 0 : vertex.getOperatorIDs().size();
    final ListIterator<OperatorIDPair> cursor = vertex.getOperatorIDs().listIterator(startIndex);
    while (true) {
        final boolean exhausted = upstream ? !cursor.hasNext() : !cursor.hasPrevious();
        if (exhausted) {
            return Optional.empty();
        }
        final OperatorIDPair pair = upstream ? cursor.next() : cursor.previous();
        final String operatorId = pair.getUserDefinedOperatorID()
                .map(Object::toString)
                .orElseGet(() -> pair.getGeneratedOperatorID().toString());
        if (testJob.operatorsWithDataFlowTracking.contains(operatorId)) {
            return Optional.of(operatorId);
        }
    }
}
Also used : OperatorIDPair(org.apache.flink.runtime.OperatorIDPair)

Example 7 with OperatorIDPair

use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.

the class CheckpointRestoreWithUidHashITCase method testRestoreFromSavepointBySetUidHash.

/**
 * Runs a first job on a mini cluster and triggers a savepoint for it, then runs a second job
 * that is built with the hex string of an operator's generated ID taken from the first job plus
 * the savepoint path. Finally checks the collected result against the integers from 0
 * (inclusive) to the upper bound (exclusive).
 */
@Test
public void testRestoreFromSavepointBySetUidHash() throws Exception {
    final int upperBound = 100;
    try (MiniCluster cluster = new MiniCluster(createMiniClusterConfig())) {
        cluster.start();
        StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();

        JobGraph initialJob = createJobGraph(environment, StatefulSourceBehavior.HOLD_AFTER_CHECKPOINT_ON_FIRST_RUN, upperBound, "test-uid", null, null);
        JobID initialJobId = cluster.submitJob(initialJob).get().getJobID();
        waitForAllTaskRunning(cluster, initialJobId, false);

        // The source emits some records and then waits for a checkpoint. Awaiting this latch
        // ensures the savepoint is taken at a fixed position and that no further records are
        // emitted once the savepoint has been triggered.
        startWaitingForCheckpointLatch.get().await();
        String savepointDirectory = TMP_FOLDER.newFolder().getAbsolutePath();
        String savepointPath = cluster.triggerSavepoint(initialJobId, savepointDirectory, true, SavepointFormatType.CANONICAL).get();

        // Take the last operator ID pair of the first topologically sorted vertex; the second
        // job is created with the hex form of its generated operator ID.
        List<OperatorIDPair> firstVertexOperators = initialJob.getVerticesSortedTopologicallyFromSources().get(0).getOperatorIDs();
        int lastIndex = firstVertexOperators.size() - 1;
        String generatedIdHex = firstVertexOperators.get(lastIndex).getGeneratedOperatorID().toHexString();

        JobGraph restoredJob = createJobGraph(environment, StatefulSourceBehavior.PROCESS_ONLY, upperBound, null, generatedIdHex, savepointPath);
        cluster.executeJobBlocking(restoredJob);
    }
    Object[] expected = IntStream.range(0, upperBound).boxed().toArray();
    assertThat(result.get(), contains(expected));
}
Also used : JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) MiniCluster(org.apache.flink.runtime.minicluster.MiniCluster) JobID(org.apache.flink.api.common.JobID) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair) Test(org.junit.Test)

Example 8 with OperatorIDPair

use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.

the class Checkpoints method loadAndValidateCheckpoint.

/**
 * Loads the checkpoint/savepoint metadata stored at the given location and validates its
 * operator states against the operators of the given tasks.
 *
 * <p>Every operator state in the metadata is matched, by operator ID, against the generated and
 * user-defined operator IDs of {@code tasks}:
 *
 * <ul>
 *   <li>If a task is found and its max parallelism equals the state's max parallelism, or the
 *       task reports that it can rescale to it, the state is kept.
 *   <li>If a task is found but the max parallelism is incompatible, an {@link
 *       IllegalStateException} is thrown.
 *   <li>If no task is found and {@code allowNonRestoredState} is set, the state is skipped.
 *   <li>If no task is found and {@code allowNonRestoredState} is not set, {@code
 *       throwNonRestoredStateException} is invoked when the state has coordinator state or any
 *       subtask state that reports having state; otherwise the empty state is skipped.
 * </ul>
 *
 * @param jobId ID of the job the checkpoint is loaded for; must not be null
 * @param tasks job vertices of the new program, keyed by vertex ID; must not be null
 * @param location storage location providing the metadata handle and external pointer; must
 *     not be null
 * @param classLoader class loader passed on to the metadata loader; must not be null
 * @param allowNonRestoredState whether state that maps to no operator may be skipped
 * @param checkpointProperties properties passed on to the resulting checkpoint
 * @param restoreMode restore mode; for {@code RestoreMode.CLAIM} the location is wrapped in a
 *     {@code ClaimModeCompletedStorageLocation}
 * @return a completed checkpoint holding the operator states that could be mapped
 * @throws IOException declared for opening and reading the metadata stream
 * @throws IllegalStateException if a mapped operator's max parallelism is incompatible
 */
public static CompletedCheckpoint loadAndValidateCheckpoint(JobID jobId, Map<JobVertexID, ExecutionJobVertex> tasks, CompletedCheckpointStorageLocation location, ClassLoader classLoader, boolean allowNonRestoredState, CheckpointProperties checkpointProperties, RestoreMode restoreMode) throws IOException {
    checkNotNull(jobId, "jobId");
    checkNotNull(tasks, "tasks");
    checkNotNull(location, "location");
    checkNotNull(classLoader, "classLoader");
    final StreamStateHandle metadataHandle = location.getMetadataHandle();
    final String checkpointPointer = location.getExternalPointer();
    // (1) load the savepoint
    final CheckpointMetadata checkpointMetadata;
    try (InputStream in = metadataHandle.openInputStream()) {
        DataInputStream dis = new DataInputStream(in);
        checkpointMetadata = loadCheckpointMetadata(dis, classLoader, checkpointPointer);
    }
    // generate mapping from operator to task; both the generated and the user-defined ID
    // (when present) resolve to the same task
    Map<OperatorID, ExecutionJobVertex> operatorToJobVertexMapping = new HashMap<>();
    for (ExecutionJobVertex task : tasks.values()) {
        for (OperatorIDPair operatorIDPair : task.getOperatorIDs()) {
            operatorToJobVertexMapping.put(operatorIDPair.getGeneratedOperatorID(), task);
            operatorIDPair.getUserDefinedOperatorID().ifPresent(id -> operatorToJobVertexMapping.put(id, task));
        }
    }
    // (2) validate it (parallelism, etc)
    HashMap<OperatorID, OperatorState> operatorStates = new HashMap<>(checkpointMetadata.getOperatorStates().size());
    for (OperatorState operatorState : checkpointMetadata.getOperatorStates()) {
        ExecutionJobVertex executionJobVertex = operatorToJobVertexMapping.get(operatorState.getOperatorID());
        if (executionJobVertex != null) {
            if (executionJobVertex.getMaxParallelism() == operatorState.getMaxParallelism() || executionJobVertex.canRescaleMaxParallelism(operatorState.getMaxParallelism())) {
                operatorStates.put(operatorState.getOperatorID(), operatorState);
            } else {
                // Identify the checkpoint by its external pointer, as the non-restored-state
                // error path below does, rather than by the metadata object's toString().
                String msg = String.format("Failed to rollback to checkpoint/savepoint %s. " + "Max parallelism mismatch between checkpoint/savepoint state and new program. " + "Cannot map operator %s with max parallelism %d to new program with " + "max parallelism %d. This indicates that the program has been changed " + "in a non-compatible way after the checkpoint/savepoint.", checkpointPointer, operatorState.getOperatorID(), operatorState.getMaxParallelism(), executionJobVertex.getMaxParallelism());
                throw new IllegalStateException(msg);
            }
        } else if (allowNonRestoredState) {
            LOG.info("Skipping savepoint state for operator {}.", operatorState.getOperatorID());
        } else {
            // Unmapped state is only tolerated when it is empty.
            if (operatorState.getCoordinatorState() != null) {
                throwNonRestoredStateException(checkpointPointer, operatorState.getOperatorID());
            }
            for (OperatorSubtaskState operatorSubtaskState : operatorState.getStates()) {
                if (operatorSubtaskState.hasState()) {
                    throwNonRestoredStateException(checkpointPointer, operatorState.getOperatorID());
                }
            }
            LOG.info("Skipping empty savepoint state for operator {}.", operatorState.getOperatorID());
        }
    }
    return new CompletedCheckpoint(jobId, checkpointMetadata.getCheckpointId(), 0L, 0L, operatorStates, checkpointMetadata.getMasterStates(), checkpointProperties, restoreMode == RestoreMode.CLAIM ? new ClaimModeCompletedStorageLocation(location) : location);
}
Also used : HashMap(java.util.HashMap) DataInputStream(java.io.DataInputStream) InputStream(java.io.InputStream) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) CheckpointMetadata(org.apache.flink.runtime.checkpoint.metadata.CheckpointMetadata) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair)

Example 9 with OperatorIDPair

use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.

the class PendingCheckpoint method acknowledgeTask.

/**
 * Records the acknowledgement of this checkpoint by the task with the given execution attempt
 * id, together with the subtask state it reported.
 *
 * @param executionAttemptId of the acknowledged task
 * @param operatorSubtaskStates of the acknowledged task; may be {@code null}
 * @param metrics Checkpoint metrics for the stats; only read when {@code statsCallback} is
 *     non-null
 * @param statsCallback receiver of the per-subtask stats, or {@code null} to skip stats
 *     reporting
 * @return {@code DISCARDED} if this checkpoint is already disposed, {@code DUPLICATE} if the
 *     attempt has already acknowledged, {@code UNKNOWN} if the attempt is not among the tasks
 *     awaited or acknowledged, {@code SUCCESS} otherwise
 */
public TaskAcknowledgeResult acknowledgeTask(ExecutionAttemptID executionAttemptId, TaskStateSnapshot operatorSubtaskStates, CheckpointMetrics metrics, @Nullable PendingCheckpointStats statsCallback) {
    synchronized (lock) {
        if (disposed) {
            return TaskAcknowledgeResult.DISCARDED;
        }

        final ExecutionVertex ackedVertex = notYetAcknowledgedTasks.remove(executionAttemptId);
        if (ackedVertex == null) {
            // Not pending: either it acknowledged before, or it was never expected.
            return acknowledgedTasks.contains(executionAttemptId)
                    ? TaskAcknowledgeResult.DUPLICATE
                    : TaskAcknowledgeResult.UNKNOWN;
        }
        acknowledgedTasks.add(executionAttemptId);

        final long ackTime = System.currentTimeMillis();

        final boolean hasSnapshot = operatorSubtaskStates != null;
        if (hasSnapshot && operatorSubtaskStates.isTaskDeployedAsFinished()) {
            checkpointPlan.reportTaskFinishedOnRestore(ackedVertex);
        } else {
            for (OperatorIDPair operatorIdPair : ackedVertex.getJobVertex().getOperatorIDs()) {
                updateOperatorState(ackedVertex, operatorSubtaskStates, operatorIdPair);
            }
            if (hasSnapshot && operatorSubtaskStates.isTaskFinished()) {
                checkpointPlan.reportTaskHasFinishedOperators(ackedVertex);
            }
        }
        numAcknowledgedTasks++;

        if (statsCallback == null) {
            return TaskAcknowledgeResult.SUCCESS;
        }

        // Durations are converted to millis because the web frontend works with them.
        final long alignmentMillis = metrics.getAlignmentDurationNanos() / 1_000_000;
        final long startDelayMillis = metrics.getCheckpointStartDelayNanos() / 1_000_000;
        final SubtaskStateStats stats = new SubtaskStateStats(
                ackedVertex.getParallelSubtaskIndex(),
                ackTime,
                metrics.getBytesPersistedOfThisCheckpoint(),
                metrics.getTotalBytesPersisted(),
                metrics.getSyncDurationMillis(),
                metrics.getAsyncDurationMillis(),
                metrics.getBytesProcessedDuringAlignment(),
                metrics.getBytesPersistedDuringAlignment(),
                alignmentMillis,
                startDelayMillis,
                metrics.getUnalignedCheckpoint(),
                true);
        final long stateSize = stats.getStateSize();
        LOG.trace(
                "Checkpoint {} stats for {}: size={}Kb, duration={}ms, sync part={}ms, async part={}ms",
                checkpointId,
                ackedVertex.getTaskNameWithSubtaskIndex(),
                stateSize == 0 ? 0 : stateSize / 1024,
                stats.getEndToEndDuration(statsCallback.getTriggerTimestamp()),
                stats.getSyncCheckpointDuration(),
                stats.getAsyncCheckpointDuration());
        statsCallback.reportSubtaskStats(ackedVertex.getJobvertexId(), stats);
        return TaskAcknowledgeResult.SUCCESS;
    }
}
Also used : ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair)

Example 10 with OperatorIDPair

use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.

the class DefaultCheckpointPlan method fulfillSubtaskStateForPartiallyFinishedOperators.

/**
 * For every finished task, records {@code FinishedOperatorSubtaskState.INSTANCE} at the task's
 * subtask index in the state of each operator of its job vertex.
 *
 * <p>Operators are looked up by their generated operator ID. An operator whose state already
 * reports being fully finished is left untouched; an operator with no entry in the map gets a
 * new {@link OperatorState}, created with the job vertex's parallelism and max parallelism and
 * added to the map.
 *
 * @param operatorStates operator states keyed by operator ID; updated in place
 */
private void fulfillSubtaskStateForPartiallyFinishedOperators(Map<OperatorID, OperatorState> operatorStates) {
    for (Execution finished : finishedTasks) {
        final ExecutionJobVertex owningVertex = finished.getVertex().getJobVertex();
        for (OperatorIDPair pair : owningVertex.getOperatorIDs()) {
            final OperatorID operatorId = pair.getGeneratedOperatorID();
            OperatorState state = operatorStates.get(operatorId);
            if (state == null) {
                state = new OperatorState(operatorId, owningVertex.getParallelism(), owningVertex.getMaxParallelism());
                operatorStates.put(operatorId, state);
            } else if (state.isFullyFinished()) {
                // Nothing to fill in for an operator that is already fully finished.
                continue;
            }
            state.putState(finished.getParallelSubtaskIndex(), FinishedOperatorSubtaskState.INSTANCE);
        }
    }
}
Also used : Execution(org.apache.flink.runtime.executiongraph.Execution) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair)

Aggregations

OperatorIDPair (org.apache.flink.runtime.OperatorIDPair)18 ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex)12 OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)11 HashMap (java.util.HashMap)7 HashSet (java.util.HashSet)5 ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)5 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)5 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)4 ArrayList (java.util.ArrayList)3 CheckpointCoordinatorBuilder (org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder)3 Collection (java.util.Collection)2 Collections.emptyList (java.util.Collections.emptyList)2 Collections.singletonList (java.util.Collections.singletonList)2 List (java.util.List)2 Execution (org.apache.flink.runtime.executiongraph.Execution)2 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)2 AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)2 ChainedStateHandle (org.apache.flink.runtime.state.ChainedStateHandle)2 KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange)2 KeyGroupsStateHandle (org.apache.flink.runtime.state.KeyGroupsStateHandle)2