Use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.
The class TestJobDataFlowValidator, method getTrackedOperatorID.
/**
 * Traverses the operators chained in the vertex and returns the first tracked operator ID. For
 * upstream, starts with the head; for downstream, with the tail (see {@link
 * JobVertex#getOperatorIDs()}). If the chain doesn't contain any tracked operators, an empty
 * Optional is returned.
 */
private static Optional<String> getTrackedOperatorID(
        JobVertex vertex, boolean upstream, TestJobWithDescription testJob) {
    ListIterator<OperatorIDPair> iterator =
            vertex.getOperatorIDs().listIterator(upstream ? 0 : vertex.getOperatorIDs().size());
    while (upstream ? iterator.hasNext() : iterator.hasPrevious()) {
        OperatorIDPair idPair = upstream ? iterator.next() : iterator.previous();
        String id =
                idPair.getUserDefinedOperatorID()
                        .orElse(idPair.getGeneratedOperatorID())
                        .toString();
        if (testJob.operatorsWithDataFlowTracking.contains(id)) {
            return Optional.of(id);
        }
    }
    return Optional.empty();
}
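The lookup above compresses the core OperatorIDPair convention into a single expression: the user-defined ID (set via uid() or setUidHash()) takes precedence, and the generated ID is the fallback. A standalone sketch of that resolution, using only the two accessors from the snippet (the class name is hypothetical):

import org.apache.flink.runtime.OperatorIDPair;

public final class EffectiveOperatorId {
    // Resolve the single "effective" ID of a chained operator: prefer the
    // user-defined ID, fall back to the deterministically generated one.
    static String effectiveId(OperatorIDPair pair) {
        return pair.getUserDefinedOperatorID()
                .orElse(pair.getGeneratedOperatorID())
                .toString();
    }
}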
Use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.
The class CheckpointRestoreWithUidHashITCase, method testRestoreFromSavepointBySetUidHash.
@Test
public void testRestoreFromSavepointBySetUidHash() throws Exception {
    final int maxNumber = 100;
    try (MiniCluster miniCluster = new MiniCluster(createMiniClusterConfig())) {
        miniCluster.start();
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        JobGraph firstJob =
                createJobGraph(
                        env, StatefulSourceBehavior.HOLD_AFTER_CHECKPOINT_ON_FIRST_RUN,
                        maxNumber, "test-uid", null, null);
        JobID jobId = miniCluster.submitJob(firstJob).get().getJobID();
        waitForAllTaskRunning(miniCluster, jobId, false);
        // The source emits some records and then waits for the checkpoint to happen.
        // This latch ensures the savepoint is taken at a fixed position and that no
        // further records are emitted after the savepoint is triggered.
        startWaitingForCheckpointLatch.get().await();
        String savepointPath =
                miniCluster
                        .triggerSavepoint(
                                jobId,
                                TMP_FOLDER.newFolder().getAbsolutePath(),
                                true,
                                SavepointFormatType.CANONICAL)
                        .get();
        // Get the operator ID of the source (stored last in the vertex's operator ID list).
        List<OperatorIDPair> operatorIds =
                firstJob.getVerticesSortedTopologicallyFromSources().get(0).getOperatorIDs();
        OperatorIDPair sourceOperatorIds = operatorIds.get(operatorIds.size() - 1);
        JobGraph secondJob =
                createJobGraph(
                        env, StatefulSourceBehavior.PROCESS_ONLY, maxNumber, null,
                        sourceOperatorIds.getGeneratedOperatorID().toHexString(), savepointPath);
        miniCluster.executeJobBlocking(secondJob);
    }
    assertThat(result.get(), contains(IntStream.range(0, maxNumber).boxed().toArray()));
}
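The two createJobGraph calls differ only in how the stateful source is identified for the restore: the first run names it with a uid, while the second pins it to the hex form of the generated OperatorID recovered from the first JobGraph. A minimal DataStream API sketch of those two calls; the hash value here is made up, since the real one would come from getGeneratedOperatorID().toHexString():

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public final class UidVersusUidHashSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // First run: pin the source with a user-defined ID; this becomes the
        // user-defined half of its OperatorIDPair.
        env.fromSequence(0, 99).uid("test-uid").print();

        // Second run (alternative): no uid(); bind the operator directly to the
        // 32-hex-digit hash of a previously generated OperatorID.
        String generatedIdHex = "0123456789abcdef0123456789abcdef"; // hypothetical value
        env.fromSequence(0, 99).setUidHash(generatedIdHex).print();

        env.execute("uid-vs-uidHash-sketch");
    }
}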
Use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.
The class Checkpoints, method loadAndValidateCheckpoint.
public static CompletedCheckpoint loadAndValidateCheckpoint(
        JobID jobId,
        Map<JobVertexID, ExecutionJobVertex> tasks,
        CompletedCheckpointStorageLocation location,
        ClassLoader classLoader,
        boolean allowNonRestoredState,
        CheckpointProperties checkpointProperties,
        RestoreMode restoreMode)
        throws IOException {
    checkNotNull(jobId, "jobId");
    checkNotNull(tasks, "tasks");
    checkNotNull(location, "location");
    checkNotNull(classLoader, "classLoader");
    final StreamStateHandle metadataHandle = location.getMetadataHandle();
    final String checkpointPointer = location.getExternalPointer();
    // (1) load the savepoint
    final CheckpointMetadata checkpointMetadata;
    try (InputStream in = metadataHandle.openInputStream()) {
        DataInputStream dis = new DataInputStream(in);
        checkpointMetadata = loadCheckpointMetadata(dis, classLoader, checkpointPointer);
    }
    // generate mapping from operator to task
    Map<OperatorID, ExecutionJobVertex> operatorToJobVertexMapping = new HashMap<>();
    for (ExecutionJobVertex task : tasks.values()) {
        for (OperatorIDPair operatorIDPair : task.getOperatorIDs()) {
            operatorToJobVertexMapping.put(operatorIDPair.getGeneratedOperatorID(), task);
            operatorIDPair
                    .getUserDefinedOperatorID()
                    .ifPresent(id -> operatorToJobVertexMapping.put(id, task));
        }
    }
    // (2) validate it (parallelism, etc)
    HashMap<OperatorID, OperatorState> operatorStates =
            new HashMap<>(checkpointMetadata.getOperatorStates().size());
    for (OperatorState operatorState : checkpointMetadata.getOperatorStates()) {
        ExecutionJobVertex executionJobVertex =
                operatorToJobVertexMapping.get(operatorState.getOperatorID());
        if (executionJobVertex != null) {
            if (executionJobVertex.getMaxParallelism() == operatorState.getMaxParallelism()
                    || executionJobVertex.canRescaleMaxParallelism(
                            operatorState.getMaxParallelism())) {
                operatorStates.put(operatorState.getOperatorID(), operatorState);
            } else {
                String msg =
                        String.format(
                                "Failed to rollback to checkpoint/savepoint %s. "
                                        + "Max parallelism mismatch between checkpoint/savepoint "
                                        + "state and new program. Cannot map operator %s with max "
                                        + "parallelism %d to new program with max parallelism %d. "
                                        + "This indicates that the program has been changed in a "
                                        + "non-compatible way after the checkpoint/savepoint.",
                                checkpointMetadata,
                                operatorState.getOperatorID(),
                                operatorState.getMaxParallelism(),
                                executionJobVertex.getMaxParallelism());
                throw new IllegalStateException(msg);
            }
        } else if (allowNonRestoredState) {
            LOG.info("Skipping savepoint state for operator {}.", operatorState.getOperatorID());
        } else {
            if (operatorState.getCoordinatorState() != null) {
                throwNonRestoredStateException(checkpointPointer, operatorState.getOperatorID());
            }
            for (OperatorSubtaskState operatorSubtaskState : operatorState.getStates()) {
                if (operatorSubtaskState.hasState()) {
                    throwNonRestoredStateException(
                            checkpointPointer, operatorState.getOperatorID());
                }
            }
            LOG.info(
                    "Skipping empty savepoint state for operator {}.",
                    operatorState.getOperatorID());
        }
    }
    return new CompletedCheckpoint(
            jobId,
            checkpointMetadata.getCheckpointId(),
            0L,
            0L,
            operatorStates,
            checkpointMetadata.getMasterStates(),
            checkpointProperties,
            restoreMode == RestoreMode.CLAIM
                    ? new ClaimModeCompletedStorageLocation(location)
                    : location);
}
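The mapping step is where OperatorIDPair matters: savepoint state may be recorded under either the generated or the user-defined ID, so the index must answer for both. A standalone sketch of that indexing, with a type parameter standing in for ExecutionJobVertex:

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.flink.runtime.OperatorIDPair;
import org.apache.flink.runtime.jobgraph.OperatorID;

final class OperatorIndexSketch {
    // Index every operator under its generated ID and, when present, its
    // user-defined ID, so state keyed by either ID finds the owning vertex.
    static <V> Map<OperatorID, V> indexByBothIds(Map<V, List<OperatorIDPair>> vertexOperators) {
        Map<OperatorID, V> index = new HashMap<>();
        vertexOperators.forEach(
                (vertex, pairs) -> {
                    for (OperatorIDPair pair : pairs) {
                        index.put(pair.getGeneratedOperatorID(), vertex);
                        pair.getUserDefinedOperatorID().ifPresent(id -> index.put(id, vertex));
                    }
                });
        return index;
    }
}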
Use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.
The class PendingCheckpoint, method acknowledgeTask.
/**
 * Acknowledges the task with the given execution attempt id and the given subtask state.
 *
 * @param executionAttemptId of the acknowledged task
 * @param operatorSubtaskStates of the acknowledged task
 * @param metrics Checkpoint metrics for the stats
 * @param statsCallback the pending checkpoint stats to report to, or null if stats are not
 *     tracked
 * @return TaskAcknowledgeResult of the operation
 */
public TaskAcknowledgeResult acknowledgeTask(
        ExecutionAttemptID executionAttemptId,
        TaskStateSnapshot operatorSubtaskStates,
        CheckpointMetrics metrics,
        @Nullable PendingCheckpointStats statsCallback) {
    synchronized (lock) {
        if (disposed) {
            return TaskAcknowledgeResult.DISCARDED;
        }
        final ExecutionVertex vertex = notYetAcknowledgedTasks.remove(executionAttemptId);
        if (vertex == null) {
            if (acknowledgedTasks.contains(executionAttemptId)) {
                return TaskAcknowledgeResult.DUPLICATE;
            } else {
                return TaskAcknowledgeResult.UNKNOWN;
            }
        } else {
            acknowledgedTasks.add(executionAttemptId);
        }
        long ackTimestamp = System.currentTimeMillis();
        if (operatorSubtaskStates != null && operatorSubtaskStates.isTaskDeployedAsFinished()) {
            checkpointPlan.reportTaskFinishedOnRestore(vertex);
        } else {
            List<OperatorIDPair> operatorIDs = vertex.getJobVertex().getOperatorIDs();
            for (OperatorIDPair operatorID : operatorIDs) {
                updateOperatorState(vertex, operatorSubtaskStates, operatorID);
            }
            if (operatorSubtaskStates != null && operatorSubtaskStates.isTaskFinished()) {
                checkpointPlan.reportTaskHasFinishedOperators(vertex);
            }
        }
        ++numAcknowledgedTasks;
        // to prevent null-pointers from concurrent modification, copy reference onto stack
        if (statsCallback != null) {
            // Do this in millis because the web frontend works with them
            long alignmentDurationMillis = metrics.getAlignmentDurationNanos() / 1_000_000;
            long checkpointStartDelayMillis = metrics.getCheckpointStartDelayNanos() / 1_000_000;
            SubtaskStateStats subtaskStateStats =
                    new SubtaskStateStats(
                            vertex.getParallelSubtaskIndex(),
                            ackTimestamp,
                            metrics.getBytesPersistedOfThisCheckpoint(),
                            metrics.getTotalBytesPersisted(),
                            metrics.getSyncDurationMillis(),
                            metrics.getAsyncDurationMillis(),
                            metrics.getBytesProcessedDuringAlignment(),
                            metrics.getBytesPersistedDuringAlignment(),
                            alignmentDurationMillis,
                            checkpointStartDelayMillis,
                            metrics.getUnalignedCheckpoint(),
                            true);
            LOG.trace(
                    "Checkpoint {} stats for {}: size={}Kb, duration={}ms, "
                            + "sync part={}ms, async part={}ms",
                    checkpointId,
                    vertex.getTaskNameWithSubtaskIndex(),
                    subtaskStateStats.getStateSize() == 0
                            ? 0
                            : subtaskStateStats.getStateSize() / 1024,
                    subtaskStateStats.getEndToEndDuration(statsCallback.getTriggerTimestamp()),
                    subtaskStateStats.getSyncCheckpointDuration(),
                    subtaskStateStats.getAsyncCheckpointDuration());
            statsCallback.reportSubtaskStats(vertex.getJobvertexId(), subtaskStateStats);
        }
        return TaskAcknowledgeResult.SUCCESS;
    }
}
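The OperatorIDPair loop is where one task-level acknowledgement is fanned out to per-operator state for every operator chained into the vertex. A standalone sketch of that fan-out, under the assumption that the snapshot is keyed by generated operator ID (as TaskStateSnapshot#getSubtaskStateByOperatorID is); the map stands in for the snapshot:

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.flink.runtime.OperatorIDPair;
import org.apache.flink.runtime.jobgraph.OperatorID;

final class SnapshotFanOutSketch {
    // Collect the snapshot entry of each chained operator; operators without
    // state in this snapshot are simply skipped.
    static <S> Map<OperatorID, S> fanOut(List<OperatorIDPair> chain, Map<OperatorID, S> snapshot) {
        Map<OperatorID, S> perOperator = new HashMap<>();
        for (OperatorIDPair pair : chain) {
            S state = snapshot.get(pair.getGeneratedOperatorID());
            if (state != null) {
                perOperator.put(pair.getGeneratedOperatorID(), state);
            }
        }
        return perOperator;
    }
}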
Use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.
The class DefaultCheckpointPlan, method fulfillSubtaskStateForPartiallyFinishedOperators.
private void fulfillSubtaskStateForPartiallyFinishedOperators(
        Map<OperatorID, OperatorState> operatorStates) {
    for (Execution finishedTask : finishedTasks) {
        ExecutionJobVertex jobVertex = finishedTask.getVertex().getJobVertex();
        for (OperatorIDPair operatorIDPair : jobVertex.getOperatorIDs()) {
            OperatorState operatorState =
                    operatorStates.get(operatorIDPair.getGeneratedOperatorID());
            if (operatorState != null && operatorState.isFullyFinished()) {
                continue;
            }
            if (operatorState == null) {
                operatorState =
                        new OperatorState(
                                operatorIDPair.getGeneratedOperatorID(),
                                jobVertex.getParallelism(),
                                jobVertex.getMaxParallelism());
                operatorStates.put(operatorIDPair.getGeneratedOperatorID(), operatorState);
            }
            operatorState.putState(
                    finishedTask.getParallelSubtaskIndex(),
                    FinishedOperatorSubtaskState.INSTANCE);
        }
    }
}
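When a task has finished but its operators are not fully finished across all subtasks, each finished subtask's slot is filled with the FinishedOperatorSubtaskState placeholder rather than left empty. A standalone sketch of that fill-in step, using the same types as the snippet but with made-up parallelism values:

import org.apache.flink.runtime.checkpoint.FinishedOperatorSubtaskState;
import org.apache.flink.runtime.checkpoint.OperatorState;
import org.apache.flink.runtime.jobgraph.OperatorID;

final class FinishedFillSketch {
    static OperatorState markSubtaskFinished(OperatorID generatedId, int subtaskIndex) {
        // Parallelism 4, max parallelism 128 -- hypothetical values.
        OperatorState state = new OperatorState(generatedId, 4, 128);
        // Record that this subtask produced no snapshot because it had already finished.
        state.putState(subtaskIndex, FinishedOperatorSubtaskState.INSTANCE);
        return state;
    }
}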