Search in sources :

Example 1 with OperatorIDPair

use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.

the class CheckpointCoordinatorRestoringTest method testStateRecoveryWithTopologyChange.

/**
 * Restores a checkpoint taken on an old topology into a changed topology and verifies the
 * state each operator receives.
 *
 * <p>Old topology: CHAIN(operator1 -> operator2) * parallelism1 -> CHAIN(operator3 ->
 * operator4) * parallelism2
 *
 * <p>New topology: CHAIN(operator5 -> operator1 -> operator2) * newParallelism1 ->
 * CHAIN(operator3 -> operator6) * newParallelism2
 *
 * <p>operator5 and operator6 exist only in the new topology and are asserted to receive no
 * operator state; operator4 exists only in the checkpoint and has no counterpart in the new
 * graph.
 *
 * @param scaleType selects the parallelism of the second new vertex: 20 for
 *     INCREASE_PARALLELISM, 8 for DECREASE_PARALLELISM, otherwise unchanged (parallelism2)
 * @throws Exception if any step of the setup, restore or verification fails
 */
public void testStateRecoveryWithTopologyChange(TestScaleType scaleType) throws Exception {
    /*
     * Old topology
     * CHAIN(op1 -> op2) * parallelism1 -> CHAIN(op3 -> op4) * parallelism2
     */
    Tuple2<JobVertexID, OperatorID> id1 = generateIDPair();
    Tuple2<JobVertexID, OperatorID> id2 = generateIDPair();
    int parallelism1 = 10;
    int maxParallelism1 = 64;
    Tuple2<JobVertexID, OperatorID> id3 = generateIDPair();
    Tuple2<JobVertexID, OperatorID> id4 = generateIDPair();
    int parallelism2 = 10;
    int maxParallelism2 = 64;
    List<KeyGroupRange> keyGroupPartitions2 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, parallelism2);
    // Checkpointed state of the old topology, keyed by operator ID.
    Map<OperatorID, OperatorState> operatorStates = new HashMap<>();
    // prepare vertex1 state: managed and raw operator state for every subtask of op1 and op2
    for (Tuple2<JobVertexID, OperatorID> id : Arrays.asList(id1, id2)) {
        OperatorState taskState = new OperatorState(id.f1, parallelism1, maxParallelism1);
        operatorStates.put(id.f1, taskState);
        for (int index = 0; index < taskState.getParallelism(); index++) {
            // Handles are generated from the pair's JobVertexID (f0); the verification loop
            // below regenerates them with the same arguments to obtain the expected content.
            OperatorSubtaskState subtaskState = OperatorSubtaskState.builder().setManagedOperatorState(generatePartitionableStateHandle(id.f0, index, 2, 8, false)).setRawOperatorState(generatePartitionableStateHandle(id.f0, index, 2, 8, true)).build();
            taskState.putState(index, subtaskState);
        }
    }
    // Expected operator state per old operator of vertex2: index 0 is op3, index 1 is op4.
    List<List<ChainedStateHandle<OperatorStateHandle>>> expectedManagedOperatorStates = new ArrayList<>();
    List<List<ChainedStateHandle<OperatorStateHandle>>> expectedRawOperatorStates = new ArrayList<>();
    // prepare vertex2 state
    for (Tuple2<JobVertexID, OperatorID> id : Arrays.asList(id3, id4)) {
        OperatorState operatorState = new OperatorState(id.f1, parallelism2, maxParallelism2);
        operatorStates.put(id.f1, operatorState);
        List<ChainedStateHandle<OperatorStateHandle>> expectedManagedOperatorState = new ArrayList<>();
        List<ChainedStateHandle<OperatorStateHandle>> expectedRawOperatorState = new ArrayList<>();
        expectedManagedOperatorStates.add(expectedManagedOperatorState);
        expectedRawOperatorStates.add(expectedRawOperatorState);
        for (int index = 0; index < operatorState.getParallelism(); index++) {
            final OperatorSubtaskState.Builder stateBuilder = OperatorSubtaskState.builder();
            OperatorStateHandle subManagedOperatorState = generateChainedPartitionableStateHandle(id.f0, index, 2, 8, false).get(0);
            OperatorStateHandle subRawOperatorState = generateChainedPartitionableStateHandle(id.f0, index, 2, 8, true).get(0);
            // Only op3 gets keyed state (managed and raw); op4 carries operator state only.
            if (id.f0.equals(id3.f0)) {
                stateBuilder.setManagedKeyedState(generateKeyGroupState(id.f0, keyGroupPartitions2.get(index), false));
            }
            if (id.f0.equals(id3.f0)) {
                stateBuilder.setRawKeyedState(generateKeyGroupState(id.f0, keyGroupPartitions2.get(index), true));
            }
            expectedManagedOperatorState.add(ChainedStateHandle.wrapSingleHandle(subManagedOperatorState));
            expectedRawOperatorState.add(ChainedStateHandle.wrapSingleHandle(subRawOperatorState));
            OperatorSubtaskState subtaskState = stateBuilder.setManagedOperatorState(subManagedOperatorState).setRawOperatorState(subRawOperatorState).build();
            operatorState.putState(index, subtaskState);
        }
    }
    /*
     * New topology
     * CHAIN(op5 -> op1 -> op2) * newParallelism1 -> CHAIN(op3 -> op6) * newParallelism2
     */
    Tuple2<JobVertexID, OperatorID> id5 = generateIDPair();
    int newParallelism1 = 10;
    Tuple2<JobVertexID, OperatorID> id6 = generateIDPair();
    // Only the second vertex is rescaled; the first keeps a parallelism of 10.
    int newParallelism2 = parallelism2;
    if (scaleType == TestScaleType.INCREASE_PARALLELISM) {
        newParallelism2 = 20;
    } else if (scaleType == TestScaleType.DECREASE_PARALLELISM) {
        newParallelism2 = 8;
    }
    List<KeyGroupRange> newKeyGroupPartitions2 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, newParallelism2);
    // Operator lists are passed tail-first: [op2, op1, op5] and [op6, op3]. The assertions
    // below depend on this order (the head operator is the last list element).
    ExecutionGraph newGraph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(id5.f0, newParallelism1, maxParallelism1, Stream.of(id2.f1, id1.f1, id5.f1).map(OperatorIDPair::generatedIDOnly).collect(Collectors.toList()), true).addJobVertex(id3.f0, newParallelism2, maxParallelism2, Stream.of(id6.f1, id3.f1).map(OperatorIDPair::generatedIDOnly).collect(Collectors.toList()), true).build();
    ExecutionJobVertex newJobVertex1 = newGraph.getJobVertex(id5.f0);
    ExecutionJobVertex newJobVertex2 = newGraph.getJobVertex(id3.f0);
    Set<ExecutionJobVertex> tasks = new HashSet<>();
    tasks.add(newJobVertex1);
    tasks.add(newJobVertex2);
    // The checkpoint is registered with ID 2, which the restore assertions below check for.
    CompletedCheckpoint completedCheckpoint = new CompletedCheckpoint(newGraph.getJobID(), 2, System.currentTimeMillis(), System.currentTimeMillis() + 3000, operatorStates, Collections.<MasterState>emptyList(), CheckpointProperties.forCheckpoint(CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION), new TestCompletedCheckpointStorageLocation());
    // set up the coordinator and validate the initial state
    SharedStateRegistry sharedStateRegistry = SharedStateRegistry.DEFAULT_FACTORY.create(Executors.directExecutor(), emptyList());
    CheckpointCoordinator coord = new CheckpointCoordinatorBuilder().setExecutionGraph(newGraph).setCompletedCheckpointStore(storeFor(sharedStateRegistry, () -> {
    }, completedCheckpoint)).setTimer(manuallyTriggeredScheduledExecutor).build();
    // NOTE(review): the second argument is presumably "allowNonRestoredState"; op4's state has
    // no operator in the new graph, so the restore would need it to be true - confirm.
    coord.restoreLatestCheckpointedStateToAll(tasks, true);
    // Verify the first new vertex: CHAIN(op5 -> op1 -> op2).
    for (int i = 0; i < newJobVertex1.getParallelism(); i++) {
        final List<OperatorIDPair> operatorIDs = newJobVertex1.getOperatorIDs();
        JobManagerTaskRestore taskRestore = newJobVertex1.getTaskVertices()[i].getCurrentExecutionAttempt().getTaskRestore();
        Assert.assertEquals(2L, taskRestore.getRestoreCheckpointId());
        TaskStateSnapshot stateSnapshot = taskRestore.getTaskStateSnapshot();
        // The head operator (op5, last list element) must not have been given any keyed state.
        OperatorSubtaskState headOpState = stateSnapshot.getSubtaskStateByOperatorID(operatorIDs.get(operatorIDs.size() - 1).getGeneratedOperatorID());
        assertTrue(headOpState.getManagedKeyedState().isEmpty());
        assertTrue(headOpState.getRawKeyedState().isEmpty());
        // operator5: new operator, so no operator state is expected
        {
            int operatorIndexInChain = 2;
            OperatorSubtaskState opState = stateSnapshot.getSubtaskStateByOperatorID(operatorIDs.get(operatorIndexInChain).getGeneratedOperatorID());
            assertTrue(opState.getManagedOperatorState().isEmpty());
            assertTrue(opState.getRawOperatorState().isEmpty());
        }
        // operator1: exactly one managed and one raw handle whose content matches the original
        {
            int operatorIndexInChain = 1;
            OperatorSubtaskState opState = stateSnapshot.getSubtaskStateByOperatorID(operatorIDs.get(operatorIndexInChain).getGeneratedOperatorID());
            OperatorStateHandle expectedManagedOpState = generatePartitionableStateHandle(id1.f0, i, 2, 8, false);
            OperatorStateHandle expectedRawOpState = generatePartitionableStateHandle(id1.f0, i, 2, 8, true);
            Collection<OperatorStateHandle> managedOperatorState = opState.getManagedOperatorState();
            assertEquals(1, managedOperatorState.size());
            assertTrue(CommonTestUtils.isStreamContentEqual(expectedManagedOpState.openInputStream(), managedOperatorState.iterator().next().openInputStream()));
            Collection<OperatorStateHandle> rawOperatorState = opState.getRawOperatorState();
            assertEquals(1, rawOperatorState.size());
            assertTrue(CommonTestUtils.isStreamContentEqual(expectedRawOpState.openInputStream(), rawOperatorState.iterator().next().openInputStream()));
        }
        // operator2: same checks as operator1, against the handles generated for id2
        {
            int operatorIndexInChain = 0;
            OperatorSubtaskState opState = stateSnapshot.getSubtaskStateByOperatorID(operatorIDs.get(operatorIndexInChain).getGeneratedOperatorID());
            OperatorStateHandle expectedManagedOpState = generatePartitionableStateHandle(id2.f0, i, 2, 8, false);
            OperatorStateHandle expectedRawOpState = generatePartitionableStateHandle(id2.f0, i, 2, 8, true);
            Collection<OperatorStateHandle> managedOperatorState = opState.getManagedOperatorState();
            assertEquals(1, managedOperatorState.size());
            assertTrue(CommonTestUtils.isStreamContentEqual(expectedManagedOpState.openInputStream(), managedOperatorState.iterator().next().openInputStream()));
            Collection<OperatorStateHandle> rawOperatorState = opState.getRawOperatorState();
            assertEquals(1, rawOperatorState.size());
            assertTrue(CommonTestUtils.isStreamContentEqual(expectedRawOpState.openInputStream(), rawOperatorState.iterator().next().openInputStream()));
        }
    }
    // Verify the second new vertex: CHAIN(op3 -> op6). op3's operator state is collected per
    // subtask and compared as a whole after the loop.
    List<List<Collection<OperatorStateHandle>>> actualManagedOperatorStates = new ArrayList<>(newJobVertex2.getParallelism());
    List<List<Collection<OperatorStateHandle>>> actualRawOperatorStates = new ArrayList<>(newJobVertex2.getParallelism());
    for (int i = 0; i < newJobVertex2.getParallelism(); i++) {
        final List<OperatorIDPair> operatorIDs = newJobVertex2.getOperatorIDs();
        JobManagerTaskRestore taskRestore = newJobVertex2.getTaskVertices()[i].getCurrentExecutionAttempt().getTaskRestore();
        Assert.assertEquals(2L, taskRestore.getRestoreCheckpointId());
        TaskStateSnapshot stateSnapshot = taskRestore.getTaskStateSnapshot();
        // operator 3: record the restored operator state for the comparison after the loop
        {
            int operatorIndexInChain = 1;
            OperatorSubtaskState opState = stateSnapshot.getSubtaskStateByOperatorID(operatorIDs.get(operatorIndexInChain).getGeneratedOperatorID());
            List<Collection<OperatorStateHandle>> actualSubManagedOperatorState = new ArrayList<>(1);
            actualSubManagedOperatorState.add(opState.getManagedOperatorState());
            List<Collection<OperatorStateHandle>> actualSubRawOperatorState = new ArrayList<>(1);
            actualSubRawOperatorState.add(opState.getRawOperatorState());
            actualManagedOperatorStates.add(actualSubManagedOperatorState);
            actualRawOperatorStates.add(actualSubRawOperatorState);
        }
        // operator 6: new operator, so no operator state is expected
        {
            int operatorIndexInChain = 0;
            OperatorSubtaskState opState = stateSnapshot.getSubtaskStateByOperatorID(operatorIDs.get(operatorIndexInChain).getGeneratedOperatorID());
            assertTrue(opState.getManagedOperatorState().isEmpty());
            assertTrue(opState.getRawOperatorState().isEmpty());
        }
        // Keyed state of the head operator (op3) is compared against key-group state generated
        // for the key-group range of subtask i under the NEW parallelism.
        KeyGroupsStateHandle originalKeyedStateBackend = generateKeyGroupState(id3.f0, newKeyGroupPartitions2.get(i), false);
        KeyGroupsStateHandle originalKeyedStateRaw = generateKeyGroupState(id3.f0, newKeyGroupPartitions2.get(i), true);
        OperatorSubtaskState headOpState = stateSnapshot.getSubtaskStateByOperatorID(operatorIDs.get(operatorIDs.size() - 1).getGeneratedOperatorID());
        Collection<KeyedStateHandle> keyedStateBackend = headOpState.getManagedKeyedState();
        Collection<KeyedStateHandle> keyGroupStateRaw = headOpState.getRawKeyedState();
        compareKeyedState(singletonList(originalKeyedStateBackend), keyedStateBackend);
        compareKeyedState(singletonList(originalKeyedStateRaw), keyGroupStateRaw);
    }
    // Index 0 of the expected lists holds op3's original state (op3 was added first above).
    comparePartitionableState(expectedManagedOperatorStates.get(0), actualManagedOperatorStates);
    comparePartitionableState(expectedRawOperatorStates.get(0), actualRawOperatorStates);
}
Also used : HashMap(java.util.HashMap) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) ArrayList(java.util.ArrayList) TestCompletedCheckpointStorageLocation(org.apache.flink.runtime.state.testutils.TestCompletedCheckpointStorageLocation) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) KeyedStateHandle(org.apache.flink.runtime.state.KeyedStateHandle) ChainedStateHandle(org.apache.flink.runtime.state.ChainedStateHandle) KeyGroupsStateHandle(org.apache.flink.runtime.state.KeyGroupsStateHandle) SharedStateRegistry(org.apache.flink.runtime.state.SharedStateRegistry) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) Collections.singletonList(java.util.Collections.singletonList) Collections.emptyList(java.util.Collections.emptyList) List(java.util.List) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Collection(java.util.Collection) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair)

Example 2 with OperatorIDPair

use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.

the class CheckpointCoordinatorRestoringTest method testRestoreFinishedStateWithoutInFlightData.

/**
 * Restores a checkpoint (ID 2) whose only operator state is a {@code FullyFinishedOperatorState}
 * while the coordinator is configured with 2 as the checkpoint ID of ignored in-flight data,
 * and asserts that the restored task snapshot reports the task as deployed as finished.
 */
@Test
public void testRestoreFinishedStateWithoutInFlightData() throws Exception {
    // given: a graph with one vertex (parallelism 1, max parallelism 1) holding one operator.
    final OperatorIDPair operatorPair = OperatorIDPair.generatedIDOnly(new OperatorID());
    final OperatorID generatedId = operatorPair.getGeneratedOperatorID();
    final JobVertexID vertexId = new JobVertexID();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId, 1, 1, singletonList(operatorPair), true)
                    .build();

    // given: a completed checkpoint in which that operator is recorded as fully finished.
    final Map<OperatorID, OperatorState> finishedStates = new HashMap<>();
    finishedStates.put(generatedId, new FullyFinishedOperatorState(generatedId, 1, 1));
    final CheckpointProperties checkpointProperties =
            CheckpointProperties.forCheckpoint(
                    CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION);
    final CompletedCheckpoint finishedCheckpoint =
            new CompletedCheckpoint(
                    executionGraph.getJobID(),
                    2,
                    System.currentTimeMillis(),
                    System.currentTimeMillis() + 3000,
                    finishedStates,
                    Collections.emptyList(),
                    checkpointProperties,
                    new TestCompletedCheckpointStorageLocation());
    final CompletedCheckpointStore checkpointStore = new EmbeddedCompletedCheckpointStore();
    checkpointStore.addCheckpointAndSubsumeOldestOne(
            finishedCheckpoint, new CheckpointsCleaner(), () -> {});

    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointCoordinatorConfiguration(
                            new CheckpointCoordinatorConfigurationBuilder()
                                    .setCheckpointIdOfIgnoredInFlightData(2)
                                    .build())
                    .setCompletedCheckpointStore(checkpointStore)
                    .build();

    // when: the initial checkpoint is restored for the single vertex.
    final ExecutionJobVertex jobVertex = executionGraph.getJobVertex(vertexId);
    coordinator.restoreInitialCheckpointIfPresent(Collections.singleton(jobVertex));

    // then: the only subtask is restored as "deployed as finished".
    final TaskStateSnapshot restoredSnapshot =
            jobVertex.getTaskVertices()[0]
                    .getCurrentExecutionAttempt()
                    .getTaskRestore()
                    .getTaskStateSnapshot();
    assertTrue(restoredSnapshot.isTaskDeployedAsFinished());
}
Also used : HashMap(java.util.HashMap) CheckpointCoordinatorConfigurationBuilder(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) TestCompletedCheckpointStorageLocation(org.apache.flink.runtime.state.testutils.TestCompletedCheckpointStorageLocation) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair) Test(org.junit.Test)

Example 3 with OperatorIDPair

use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.

the class StateAssignmentOperation method assignNonFinishedStateToTask.

/**
 * Collects the subtask state of every operator in {@code operatorIDs} for one subtask and sets
 * the result as the initial state of the given execution attempt.
 *
 * @param assignment source of the per-operator-instance subtask states
 * @param operatorIDs operators of the task; states are looked up and stored by their generated
 *     operator ID
 * @param subTaskIndex index of the subtask whose state is assembled
 * @param currentExecutionAttempt execution attempt that receives the assembled restore
 */
private void assignNonFinishedStateToTask(TaskStateAssignment assignment, List<OperatorIDPair> operatorIDs, int subTaskIndex, Execution currentExecutionAttempt) {
    final TaskStateSnapshot snapshot = new TaskStateSnapshot(operatorIDs.size(), false);
    operatorIDs.forEach(
            idPair -> {
                final OperatorInstanceID instance =
                        OperatorInstanceID.of(subTaskIndex, idPair.getGeneratedOperatorID());
                final OperatorSubtaskState assignedState = assignment.getSubtaskState(instance);
                snapshot.putSubtaskStateByOperatorID(
                        idPair.getGeneratedOperatorID(), assignedState);
            });
    // restoreCheckpointId is declared outside this method.
    currentExecutionAttempt.setInitialState(
            new JobManagerTaskRestore(restoreCheckpointId, snapshot));
}
Also used : OperatorInstanceID(org.apache.flink.runtime.jobgraph.OperatorInstanceID) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair)

Example 4 with OperatorIDPair

use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.

the class DefaultCheckpointPlan method fulfillFullyFinishedOrFinishedOnRestoreOperatorStates.

/**
 * Registers a {@code FullyFinishedOperatorState} for every operator of every vertex in
 * {@code fullyFinishedOrFinishedOnRestoreVertices}, replacing any existing entry under the same
 * generated operator ID.
 *
 * @param operatorStates map of operator states, modified in place
 * @throws IllegalStateException presumably, via {@code checkState}, if an existing entry for
 *     such an operator already has subtask states - confirm against the checkState helper
 */
private void fulfillFullyFinishedOrFinishedOnRestoreOperatorStates(Map<OperatorID, OperatorState> operatorStates) {
    for (ExecutionJobVertex finishedVertex : fullyFinishedOrFinishedOnRestoreVertices.values()) {
        for (OperatorIDPair idPair : finishedVertex.getOperatorIDs()) {
            final OperatorID generatedId = idPair.getGeneratedOperatorID();

            // An entry may already exist, but it must not carry any subtask states.
            final OperatorState reported = operatorStates.get(generatedId);
            final boolean withoutSubtaskStates = reported == null || !reported.hasSubtaskStates();
            checkState(withoutSubtaskStates, "There should be no states or only coordinator state reported for fully finished operators");

            operatorStates.put(
                    generatedId,
                    new FullyFinishedOperatorState(
                            generatedId,
                            finishedVertex.getParallelism(),
                            finishedVertex.getMaxParallelism()));
        }
    }
}
Also used : ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair)

Example 5 with OperatorIDPair

use of org.apache.flink.runtime.OperatorIDPair in project flink by apache.

the class StateAssignmentOperation method checkStateMappingCompleteness.

/**
 * Checks that every operator state has a matching operator in the given tasks, where a match
 * is either the generated or the user-defined operator ID of one of the tasks' operators.
 *
 * @param allowNonRestoredState if true, unmatched states are only logged; if false, the first
 *     unmatched state causes an exception
 * @param operatorStates operator states to map
 * @param tasks tasks whose operators the states are matched against
 * @throws IllegalStateException if a state cannot be matched and {@code allowNonRestoredState}
 *     is false
 */
private static void checkStateMappingCompleteness(boolean allowNonRestoredState, Map<OperatorID, OperatorState> operatorStates, Set<ExecutionJobVertex> tasks) {
    // Gather every operator ID the tasks can be addressed by.
    final Set<OperatorID> knownOperatorIds = new HashSet<>();
    for (ExecutionJobVertex vertex : tasks) {
        vertex.getOperatorIDs()
                .forEach(
                        idPair -> {
                            knownOperatorIds.add(idPair.getGeneratedOperatorID());
                            idPair.getUserDefinedOperatorID().ifPresent(knownOperatorIds::add);
                        });
    }

    for (Map.Entry<OperatorID, OperatorState> stateEntry : operatorStates.entrySet()) {
        if (knownOperatorIds.contains(stateEntry.getKey())) {
            continue;
        }
        final OperatorState unmappedState = stateEntry.getValue();
        if (!allowNonRestoredState) {
            throw new IllegalStateException("There is no operator for the state " + unmappedState.getOperatorID());
        }
        LOG.info("Skipped checkpoint state for operator {}.", unmappedState.getOperatorID());
    }
}
Also used : ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) HashMap(java.util.HashMap) Map(java.util.Map) HashSet(java.util.HashSet) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair)

Aggregations

OperatorIDPair (org.apache.flink.runtime.OperatorIDPair)18 ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex)12 OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)11 HashMap (java.util.HashMap)7 HashSet (java.util.HashSet)5 ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)5 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)5 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)4 ArrayList (java.util.ArrayList)3 CheckpointCoordinatorBuilder (org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder)3 Collection (java.util.Collection)2 Collections.emptyList (java.util.Collections.emptyList)2 Collections.singletonList (java.util.Collections.singletonList)2 List (java.util.List)2 Execution (org.apache.flink.runtime.executiongraph.Execution)2 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)2 AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)2 ChainedStateHandle (org.apache.flink.runtime.state.ChainedStateHandle)2 KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange)2 KeyGroupsStateHandle (org.apache.flink.runtime.state.KeyGroupsStateHandle)2