Search in sources :

Example 61 with OperatorStateHandle

use of org.apache.flink.runtime.state.OperatorStateHandle in project flink by apache.

the class CheckpointCoordinatorRestoringTest method testRestoreLatestCheckpointedStateWithChangingParallelism.

/**
 * Tests the checkpoint restoration with changing parallelism of job vertex with partitioned
 * state.
 *
 * <p>The scenario uses two job vertices. Vertex 1 keeps parallelism 3 (max parallelism 42) in
 * both the original and the rescaled graph. Vertex 2 (max parallelism 13) is rescaled between
 * parallelism 2 and 13. A checkpoint is triggered and acknowledged by every subtask of both
 * vertices, then a second coordinator sharing the same completed checkpoint store restores that
 * checkpoint onto a graph in which vertex 2 has the new parallelism. Finally the restored keyed
 * and operator state is compared against the state that was acknowledged.
 *
 * @param scaleOut if {@code true}, vertex 2 is rescaled from parallelism 2 to 13; otherwise
 *     from 13 to 2
 * @throws Exception if any step of the checkpoint/restore cycle fails
 */
private void testRestoreLatestCheckpointedStateWithChangingParallelism(boolean scaleOut) throws Exception {
    final JobVertexID jobVertexID1 = new JobVertexID();
    final JobVertexID jobVertexID2 = new JobVertexID();
    int parallelism1 = 3;
    // old and new parallelism of vertex 2 are mirrored: 2 -> 13 when scaling out, 13 -> 2 otherwise
    int parallelism2 = scaleOut ? 2 : 13;
    int maxParallelism1 = 42;
    int maxParallelism2 = 13;
    int newParallelism2 = scaleOut ? 13 : 2;
    // the store is shared between the checkpointing and the restoring coordinator below
    CompletedCheckpointStore completedCheckpointStore = new EmbeddedCompletedCheckpointStore();
    final ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1, parallelism1, maxParallelism1).addJobVertex(jobVertexID2, parallelism2, maxParallelism2).build();
    final ExecutionJobVertex jobVertex1 = graph.getJobVertex(jobVertexID1);
    final ExecutionJobVertex jobVertex2 = graph.getJobVertex(jobVertexID2);
    // set up the coordinator for the original graph
    CheckpointCoordinator coord = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCompletedCheckpointStore(completedCheckpointStore).setTimer(manuallyTriggeredScheduledExecutor).build();
    // trigger the checkpoint; the manually triggered executor has to be run explicitly
    coord.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    // exactly one checkpoint must be pending; its id is used for all acknowledgements
    assertEquals(1, coord.getPendingCheckpoints().size());
    long checkpointId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());
    // key group ranges per subtask under the ORIGINAL parallelism of each vertex
    List<KeyGroupRange> keyGroupPartitions1 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism1, parallelism1);
    List<KeyGroupRange> keyGroupPartitions2 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, parallelism2);
    // vertex 1: every subtask acknowledges managed operator state, managed and raw keyed
    // state, and one input channel state handle
    for (int index = 0; index < jobVertex1.getParallelism(); index++) {
        OperatorStateHandle opStateBackend = generatePartitionableStateHandle(jobVertexID1, index, 2, 8, false);
        KeyGroupsStateHandle keyedStateBackend = generateKeyGroupState(jobVertexID1, keyGroupPartitions1.get(index), false);
        KeyGroupsStateHandle keyedStateRaw = generateKeyGroupState(jobVertexID1, keyGroupPartitions1.get(index), true);
        OperatorSubtaskState operatorSubtaskState = OperatorSubtaskState.builder().setManagedOperatorState(opStateBackend).setManagedKeyedState(keyedStateBackend).setRawKeyedState(keyedStateRaw).setInputChannelState(StateObjectCollection.singleton(createNewInputChannelStateHandle(3, new Random()))).build();
        TaskStateSnapshot taskOperatorSubtaskStates = new TaskStateSnapshot();
        taskOperatorSubtaskStates.putSubtaskStateByOperatorID(OperatorID.fromJobVertexID(jobVertexID1), operatorSubtaskState);
        AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(graph.getJobID(), jobVertex1.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates);
        coord.receiveAcknowledgeMessage(acknowledgeCheckpoint, TASK_MANAGER_LOCATION_INFO);
    }
    // vertex 2: every subtask acknowledges managed and raw operator state plus managed and raw
    // keyed state; the operator state handles are recorded as the expected values for the
    // comparison after the restore
    final List<ChainedStateHandle<OperatorStateHandle>> expectedOpStatesBackend = new ArrayList<>(jobVertex2.getParallelism());
    final List<ChainedStateHandle<OperatorStateHandle>> expectedOpStatesRaw = new ArrayList<>(jobVertex2.getParallelism());
    for (int index = 0; index < jobVertex2.getParallelism(); index++) {
        KeyGroupsStateHandle keyedStateBackend = generateKeyGroupState(jobVertexID2, keyGroupPartitions2.get(index), false);
        KeyGroupsStateHandle keyedStateRaw = generateKeyGroupState(jobVertexID2, keyGroupPartitions2.get(index), true);
        OperatorStateHandle opStateBackend = generatePartitionableStateHandle(jobVertexID2, index, 2, 8, false);
        OperatorStateHandle opStateRaw = generatePartitionableStateHandle(jobVertexID2, index, 2, 8, true);
        expectedOpStatesBackend.add(new ChainedStateHandle<>(singletonList(opStateBackend)));
        expectedOpStatesRaw.add(new ChainedStateHandle<>(singletonList(opStateRaw)));
        OperatorSubtaskState operatorSubtaskState = OperatorSubtaskState.builder().setManagedOperatorState(opStateBackend).setRawOperatorState(opStateRaw).setManagedKeyedState(keyedStateBackend).setRawKeyedState(keyedStateRaw).build();
        TaskStateSnapshot taskOperatorSubtaskStates = new TaskStateSnapshot();
        taskOperatorSubtaskStates.putSubtaskStateByOperatorID(OperatorID.fromJobVertexID(jobVertexID2), operatorSubtaskState);
        AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(graph.getJobID(), jobVertex2.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates);
        coord.receiveAcknowledgeMessage(acknowledgeCheckpoint, TASK_MANAGER_LOCATION_INFO);
    }
    // after all acknowledgements exactly one successful checkpoint is expected
    List<CompletedCheckpoint> completedCheckpoints = coord.getSuccessfulCheckpoints();
    assertEquals(1, completedCheckpoints.size());
    // key group ranges per subtask of vertex 2 under the NEW parallelism
    List<KeyGroupRange> newKeyGroupPartitions2 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, newParallelism2);
    // rescale vertex 2: same vertex ids, vertex 1 unchanged, vertex 2 with the new parallelism
    final ExecutionGraph newGraph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1, parallelism1, maxParallelism1).addJobVertex(jobVertexID2, newParallelism2, maxParallelism2).build();
    final ExecutionJobVertex newJobVertex1 = newGraph.getJobVertex(jobVertexID1);
    final ExecutionJobVertex newJobVertex2 = newGraph.getJobVertex(jobVertexID2);
    // set up a second coordinator over the rescaled graph, reusing the checkpoint store
    CheckpointCoordinator newCoord = new CheckpointCoordinatorBuilder().setExecutionGraph(newGraph).setCompletedCheckpointStore(completedCheckpointStore).setTimer(manuallyTriggeredScheduledExecutor).build();
    Set<ExecutionJobVertex> tasks = new HashSet<>();
    tasks.add(newJobVertex1);
    tasks.add(newJobVertex2);
    assertTrue(newCoord.restoreLatestCheckpointedStateToAll(tasks, false));
    // verify the restored state; vertex 1 was not rescaled, so its original partitions apply
    verifyStateRestore(jobVertexID1, newJobVertex1, keyGroupPartitions1);
    // per new subtask of vertex 2: the operator state collections of every operator
    List<List<Collection<OperatorStateHandle>>> actualOpStatesBackend = new ArrayList<>(newJobVertex2.getParallelism());
    List<List<Collection<OperatorStateHandle>>> actualOpStatesRaw = new ArrayList<>(newJobVertex2.getParallelism());
    for (int i = 0; i < newJobVertex2.getParallelism(); i++) {
        List<OperatorIDPair> operatorIDs = newJobVertex2.getOperatorIDs();
        // keyed state expected for subtask i, regenerated for its key group range under the
        // new parallelism
        KeyGroupsStateHandle originalKeyedStateBackend = generateKeyGroupState(jobVertexID2, newKeyGroupPartitions2.get(i), false);
        KeyGroupsStateHandle originalKeyedStateRaw = generateKeyGroupState(jobVertexID2, newKeyGroupPartitions2.get(i), true);
        JobManagerTaskRestore taskRestore = newJobVertex2.getTaskVertices()[i].getCurrentExecutionAttempt().getTaskRestore();
        // the restore must reference checkpoint id 1 (presumably the id of the single
        // checkpoint triggered above -- the literal is not derived from checkpointId)
        Assert.assertEquals(1L, taskRestore.getRestoreCheckpointId());
        TaskStateSnapshot taskStateHandles = taskRestore.getTaskStateSnapshot();
        // keyed state is only checked for the operator at the last index of the id list
        final int headOpIndex = operatorIDs.size() - 1;
        List<Collection<OperatorStateHandle>> allParallelManagedOpStates = new ArrayList<>(operatorIDs.size());
        List<Collection<OperatorStateHandle>> allParallelRawOpStates = new ArrayList<>(operatorIDs.size());
        for (int idx = 0; idx < operatorIDs.size(); ++idx) {
            OperatorID operatorID = operatorIDs.get(idx).getGeneratedOperatorID();
            OperatorSubtaskState opState = taskStateHandles.getSubtaskStateByOperatorID(operatorID);
            Collection<OperatorStateHandle> opStateBackend = opState.getManagedOperatorState();
            Collection<OperatorStateHandle> opStateRaw = opState.getRawOperatorState();
            allParallelManagedOpStates.add(opStateBackend);
            allParallelRawOpStates.add(opStateRaw);
            if (idx == headOpIndex) {
                Collection<KeyedStateHandle> keyedStateBackend = opState.getManagedKeyedState();
                Collection<KeyedStateHandle> keyGroupStateRaw = opState.getRawKeyedState();
                compareKeyedState(singletonList(originalKeyedStateBackend), keyedStateBackend);
                compareKeyedState(singletonList(originalKeyedStateRaw), keyGroupStateRaw);
            }
        }
        actualOpStatesBackend.add(allParallelManagedOpStates);
        actualOpStatesRaw.add(allParallelRawOpStates);
    }
    // compare the operator state acknowledged under the old parallelism with the operator
    // state handed out under the new parallelism
    comparePartitionableState(expectedOpStatesBackend, actualOpStatesBackend);
    comparePartitionableState(expectedOpStatesRaw, actualOpStatesRaw);
}
Also used : JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) ArrayList(java.util.ArrayList) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) KeyedStateHandle(org.apache.flink.runtime.state.KeyedStateHandle) KeyGroupsStateHandle(org.apache.flink.runtime.state.KeyGroupsStateHandle) ChainedStateHandle(org.apache.flink.runtime.state.ChainedStateHandle) Random(java.util.Random) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) Collections.singletonList(java.util.Collections.singletonList) Collections.emptyList(java.util.Collections.emptyList) List(java.util.List) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Collection(java.util.Collection) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair)

Example 62 with OperatorStateHandle

use of org.apache.flink.runtime.state.OperatorStateHandle in project flink by apache.

the class StreamTaskStateInitializerImplTest method testWithRestore.

/**
 * Builds a task restore carrying managed/raw operator state, managed/raw keyed state and
 * channel state for a single operator, creates the operator state context from it and checks
 * that the context reports the restore, exposes the backends and raw state inputs, and
 * registers the created closeables.
 */
@SuppressWarnings("unchecked")
@Test
public void testWithRestore() throws Exception {
    // spied backend whose factory methods hand out plain mocks
    final StateBackend spiedBackend = spy(new StateBackend() {

        @Override
        public <K> AbstractKeyedStateBackend<K> createKeyedStateBackend(Environment env, JobID jobID, String operatorIdentifier, TypeSerializer<K> keySerializer, int numberOfKeyGroups, KeyGroupRange keyGroupRange, TaskKvStateRegistry kvStateRegistry, TtlTimeProvider ttlTimeProvider, MetricGroup metricGroup, @Nonnull Collection<KeyedStateHandle> stateHandles, CloseableRegistry cancelStreamRegistry) throws Exception {
            return mock(AbstractKeyedStateBackend.class);
        }

        @Override
        public OperatorStateBackend createOperatorStateBackend(Environment env, String operatorIdentifier, @Nonnull Collection<OperatorStateHandle> stateHandles, CloseableRegistry cancelStreamRegistry) throws Exception {
            return mock(OperatorStateBackend.class);
        }
    });

    final OperatorID opId = new OperatorID(47L, 11L);
    final TaskStateSnapshot snapshot = new TaskStateSnapshot();
    final Random rnd = new Random(0x42);

    // the handles are created in the same order as they are passed to the builder, so the
    // seeded random is consumed in the same sequence
    final OperatorStreamStateHandle managedOperatorState =
            new OperatorStreamStateHandle(
                    Collections.singletonMap("a", new OperatorStateHandle.StateMetaInfo(new long[] { 0, 10 }, SPLIT_DISTRIBUTE)),
                    CheckpointTestUtils.createDummyStreamStateHandle(rnd, null));
    final OperatorStreamStateHandle rawOperatorState =
            new OperatorStreamStateHandle(
                    Collections.singletonMap("_default_", new OperatorStateHandle.StateMetaInfo(new long[] { 0, 20, 30 }, SPLIT_DISTRIBUTE)),
                    CheckpointTestUtils.createDummyStreamStateHandle(rnd, null));
    final OperatorSubtaskState subtaskState =
            OperatorSubtaskState.builder()
                    .setManagedOperatorState(managedOperatorState)
                    .setRawOperatorState(rawOperatorState)
                    .setManagedKeyedState(CheckpointTestUtils.createDummyKeyGroupStateHandle(rnd, null))
                    .setRawKeyedState(CheckpointTestUtils.createDummyKeyGroupStateHandle(rnd, null))
                    .setInputChannelState(singleton(createNewInputChannelStateHandle(10, rnd)))
                    .setResultSubpartitionState(singleton(createNewResultSubpartitionStateHandle(10, rnd)))
                    .build();
    snapshot.putSubtaskStateByOperatorID(opId, subtaskState);

    final JobManagerTaskRestore restore = new JobManagerTaskRestore(42L, snapshot);
    final StreamTaskStateInitializer initializer = streamTaskStateManager(spiedBackend, restore, false);

    final AbstractStreamOperator<?> operatorMock = mock(AbstractStreamOperator.class);
    when(operatorMock.getOperatorID()).thenReturn(opId);

    final TypeSerializer<?> keySerializer = new IntSerializer();
    final CloseableRegistry registry = new CloseableRegistry();

    final StreamOperatorStateContext context =
            initializer.streamOperatorStateContext(
                    operatorMock.getOperatorID(),
                    operatorMock.getClass().getSimpleName(),
                    new TestProcessingTimeService(),
                    operatorMock,
                    keySerializer,
                    registry,
                    new UnregisteredMetricsGroup(),
                    1.0,
                    false);

    final OperatorStateBackend restoredOperatorBackend = context.operatorStateBackend();
    final CheckpointableKeyedStateBackend<?> restoredKeyedBackend = context.keyedStateBackend();
    final InternalTimeServiceManager<?> timerManager = context.internalTimerServiceManager();
    final CloseableIterable<KeyGroupStatePartitionStreamProvider> rawKeyedInputs = context.rawKeyedStateInputs();
    final CloseableIterable<StatePartitionStreamProvider> rawOperatorInputs = context.rawOperatorStateInputs();

    Assert.assertTrue("Expected the context to be restored", context.isRestored());
    Assert.assertEquals(OptionalLong.of(42L), context.getRestoredCheckpointId());
    Assert.assertNotNull(restoredOperatorBackend);
    Assert.assertNotNull(restoredKeyedBackend);
    // the timer service manager is deactivated on purpose so that it does not attempt to
    // consume the raw keyed state.
    Assert.assertNull(timerManager);
    Assert.assertNotNull(rawKeyedInputs);
    Assert.assertNotNull(rawOperatorInputs);

    // one raw keyed state partition is expected
    int keyedInputCount = 0;
    for (KeyGroupStatePartitionStreamProvider ignored : rawKeyedInputs) {
        keyedInputCount++;
    }
    Assert.assertEquals(1, keyedInputCount);

    // three raw operator state partitions are expected
    int operatorInputCount = 0;
    for (StatePartitionStreamProvider ignored : rawOperatorInputs) {
        operatorInputCount++;
    }
    Assert.assertEquals(3, operatorInputCount);

    checkCloseablesRegistered(registry, restoredOperatorBackend, restoredKeyedBackend, rawKeyedInputs, rawOperatorInputs);
}
Also used : IntSerializer(org.apache.flink.api.common.typeutils.base.IntSerializer) UnregisteredMetricsGroup(org.apache.flink.metrics.groups.UnregisteredMetricsGroup) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) MetricGroup(org.apache.flink.metrics.MetricGroup) JobManagerTaskRestore(org.apache.flink.runtime.checkpoint.JobManagerTaskRestore) TaskKvStateRegistry(org.apache.flink.runtime.query.TaskKvStateRegistry) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) KeyedStateHandle(org.apache.flink.runtime.state.KeyedStateHandle) CloseableRegistry(org.apache.flink.core.fs.CloseableRegistry) StateBackend(org.apache.flink.runtime.state.StateBackend) OperatorStateBackend(org.apache.flink.runtime.state.OperatorStateBackend) MemoryStateBackend(org.apache.flink.runtime.state.memory.MemoryStateBackend) AbstractKeyedStateBackend(org.apache.flink.runtime.state.AbstractKeyedStateBackend) CheckpointableKeyedStateBackend(org.apache.flink.runtime.state.CheckpointableKeyedStateBackend) OperatorSubtaskState(org.apache.flink.runtime.checkpoint.OperatorSubtaskState) KeyGroupStatePartitionStreamProvider(org.apache.flink.runtime.state.KeyGroupStatePartitionStreamProvider) KeyGroupStatePartitionStreamProvider(org.apache.flink.runtime.state.KeyGroupStatePartitionStreamProvider) StatePartitionStreamProvider(org.apache.flink.runtime.state.StatePartitionStreamProvider) TaskStateSnapshot(org.apache.flink.runtime.checkpoint.TaskStateSnapshot) Random(java.util.Random) TtlTimeProvider(org.apache.flink.runtime.state.ttl.TtlTimeProvider) AbstractKeyedStateBackend(org.apache.flink.runtime.state.AbstractKeyedStateBackend) OperatorStreamStateHandle(org.apache.flink.runtime.state.OperatorStreamStateHandle) OperatorStateBackend(org.apache.flink.runtime.state.OperatorStateBackend) DummyEnvironment(org.apache.flink.runtime.operators.testutils.DummyEnvironment) Environment(org.apache.flink.runtime.execution.Environment) 
TestProcessingTimeService(org.apache.flink.streaming.runtime.tasks.TestProcessingTimeService) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) JobID(org.apache.flink.api.common.JobID) TaskStateManagerImplTest(org.apache.flink.runtime.state.TaskStateManagerImplTest) Test(org.junit.Test)

Example 63 with OperatorStateHandle

use of org.apache.flink.runtime.state.OperatorStateHandle in project flink by apache.

the class BackendRestorerProcedureTest method testExceptionThrownIfAllRestoresFailed.

/**
 * Verifies that the restore procedure throws once every offered restore alternative has been
 * tried and failed, and that each alternative's handle was actually opened.
 */
@Test
public void testExceptionThrownIfAllRestoresFailed() throws Exception {
    final CloseableRegistry registry = new CloseableRegistry();

    // three plain mocks, one per restore alternative
    final OperatorStateHandle failingHandleOne = mock(OperatorStateHandle.class);
    final OperatorStateHandle failingHandleTwo = mock(OperatorStateHandle.class);
    final OperatorStateHandle failingHandleThree = mock(OperatorStateHandle.class);

    final List<StateObjectCollection<OperatorStateHandle>> restoreAlternatives =
            Arrays.asList(
                    new StateObjectCollection<>(Collections.singletonList(failingHandleOne)),
                    new StateObjectCollection<>(Collections.singletonList(failingHandleTwo)),
                    new StateObjectCollection<>(Collections.singletonList(failingHandleThree)));

    final BackendRestorerProcedure<OperatorStateBackend, OperatorStateHandle> procedure =
            new BackendRestorerProcedure<>(backendSupplier, registry, "test op state backend");

    try {
        procedure.createAndRestore(restoreAlternatives);
        Assert.fail();
    } catch (Exception ignored) {
        // expected: all alternatives failed
    }

    // every alternative must have been attempted
    verify(failingHandleOne).openInputStream();
    verify(failingHandleTwo).openInputStream();
    verify(failingHandleThree).openInputStream();
}
Also used : StateObjectCollection(org.apache.flink.runtime.checkpoint.StateObjectCollection) OperatorStateBackend(org.apache.flink.runtime.state.OperatorStateBackend) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) CloseableRegistry(org.apache.flink.core.fs.CloseableRegistry) FlinkException(org.apache.flink.util.FlinkException) FunctionWithException(org.apache.flink.util.function.FunctionWithException) Test(org.junit.Test)

Example 64 with OperatorStateHandle

use of org.apache.flink.runtime.state.OperatorStateHandle in project flink by apache.

the class BackendRestorerProcedureTest method testCanBeCanceledViaRegistry.

/**
 * Verifies that a restore which is blocked while reading its state handle can be aborted by
 * closing the closeable registry handed to the restore procedure.
 */
@Test
public void testCanBeCanceledViaRegistry() throws Exception {
    final CloseableRegistry registry = new CloseableRegistry();
    final OneShotLatch blockedLatch = new OneShotLatch();
    final OneShotLatch releaseLatch = new OneShotLatch();

    // the handle hands out a stream wired to the two latches
    final OperatorStateHandle blockingHandle = mock(OperatorStateHandle.class);
    when(blockingHandle.openInputStream()).thenReturn(new BlockingFSDataInputStream(blockedLatch, releaseLatch));

    final List<StateObjectCollection<OperatorStateHandle>> restoreAlternatives =
            Collections.singletonList(new StateObjectCollection<>(Collections.singletonList(blockingHandle)));

    final BackendRestorerProcedure<OperatorStateBackend, OperatorStateHandle> procedure =
            new BackendRestorerProcedure<>(backendSupplier, registry, "test op state backend");

    // captures whatever the restore attempt throws on the worker thread
    final AtomicReference<Exception> failure = new AtomicReference<>(null);
    final Runnable restoreTask = () -> {
        try {
            procedure.createAndRestore(restoreAlternatives);
        } catch (Exception e) {
            failure.set(e);
        }
    };
    final Thread worker = new Thread(restoreTask);
    worker.start();

    // wait on the first latch, close the registry, then trigger the second latch
    blockedLatch.await();
    registry.close();
    releaseLatch.trigger();
    worker.join();

    Assert.assertTrue(failure.get() instanceof FlinkException);
}
Also used : BlockingFSDataInputStream(org.apache.flink.runtime.util.BlockingFSDataInputStream) AtomicReference(java.util.concurrent.atomic.AtomicReference) CloseableRegistry(org.apache.flink.core.fs.CloseableRegistry) FlinkException(org.apache.flink.util.FlinkException) FunctionWithException(org.apache.flink.util.function.FunctionWithException) FlinkException(org.apache.flink.util.FlinkException) StateObjectCollection(org.apache.flink.runtime.checkpoint.StateObjectCollection) OperatorStateBackend(org.apache.flink.runtime.state.OperatorStateBackend) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) Test(org.junit.Test)

Aggregations

OperatorStateHandle (org.apache.flink.runtime.state.OperatorStateHandle)64 HashMap (java.util.HashMap)26 ArrayList (java.util.ArrayList)25 KeyGroupsStateHandle (org.apache.flink.runtime.state.KeyGroupsStateHandle)22 Test (org.junit.Test)21 StreamStateHandle (org.apache.flink.runtime.state.StreamStateHandle)20 OperatorStreamStateHandle (org.apache.flink.runtime.state.OperatorStreamStateHandle)19 ByteStreamStateHandle (org.apache.flink.runtime.state.memory.ByteStreamStateHandle)17 List (java.util.List)15 OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)15 KeyedStateHandle (org.apache.flink.runtime.state.KeyedStateHandle)15 KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange)14 Map (java.util.Map)13 Collection (java.util.Collection)10 StateObjectCollection (org.apache.flink.runtime.checkpoint.StateObjectCollection)10 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)10 AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)9 EnumMap (java.util.EnumMap)8 JobID (org.apache.flink.api.common.JobID)8 CloseableRegistry (org.apache.flink.core.fs.CloseableRegistry)8