Search in sources :

Example 11 with JobManagerTaskRestore

use of org.apache.flink.runtime.checkpoint.JobManagerTaskRestore in project flink by apache.

the class ExecutionTest method testTaskRestoreStateIsNulledAfterDeployment.

/**
 * Verifies that an {@link Execution} no longer holds its task restore state once scheduling has
 * been started and the execution has been deployed. See FLINK-9693.
 */
@Test
public void testTaskRestoreStateIsNulledAfterDeployment() throws Exception {
    final JobVertex vertex = createNoOpJobVertex();
    final JobVertexID vertexId = vertex.getID();

    // Scheduler running on the main-thread executor, limited to one physical slot.
    final SchedulerBase scheduler =
            SchedulerTestingUtils.newSchedulerBuilder(
                            JobGraphTestUtils.streamingJobGraph(vertex),
                            ComponentMainThreadExecutorServiceAdapter.forMainThread())
                    .setExecutionSlotAllocatorFactory(
                            SchedulerTestingUtils.newSlotSharingExecutionSlotAllocatorFactory(
                                    TestingPhysicalSlotProvider.createWithLimitedAmountOfPhysicalSlots(1)))
                    .build();

    // Current attempt of the first (index 0) subtask of the job vertex.
    final Execution attempt =
            scheduler.getExecutionJobVertex(vertexId)
                    .getTaskVertices()[0]
                    .getCurrentExecutionAttempt();

    // Seed the attempt with restore state and make sure it is visible before deployment.
    attempt.setInitialState(new JobManagerTaskRestore(1L, new TaskStateSnapshot()));
    assertThat(attempt.getTaskRestore(), is(notNullValue()));

    // Kick off scheduling; the restore state must be gone by the time this call returns.
    scheduler.startScheduling();
    assertThat(attempt.getTaskRestore(), is(nullValue()));
}
Also used : JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) TaskStateSnapshot(org.apache.flink.runtime.checkpoint.TaskStateSnapshot) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) JobManagerTaskRestore(org.apache.flink.runtime.checkpoint.JobManagerTaskRestore) SchedulerBase(org.apache.flink.runtime.scheduler.SchedulerBase) Test(org.junit.Test)

Example 12 with JobManagerTaskRestore

use of org.apache.flink.runtime.checkpoint.JobManagerTaskRestore in project flink by apache.

the class RestoreStreamTaskTest method testRestoreTailWithNewId.

/**
 * Runs and checkpoints a two-operator chain, then runs the chain again from that checkpoint with
 * a different {@link OperatorID} for the tail operator. Expects {@code RESTORED_OPERATORS} to
 * contain only the head operator, mapped to the restore checkpoint id.
 */
@Test
public void testRestoreTailWithNewId() throws Exception {
    final OperatorID headId = new OperatorID(42L, 42L);

    // First run: head (42) + tail (44), no prior state to restore from.
    final JobManagerTaskRestore firstRunRestore =
            createRunAndCheckpointOperatorChain(
                    headId,
                    new CounterOperator(),
                    new OperatorID(44L, 44L),
                    new CounterOperator(),
                    Optional.empty());

    // Both operators of the chain are expected to have produced a subtask state mapping.
    final TaskStateSnapshot snapshot = firstRunRestore.getTaskStateSnapshot();
    assertEquals(2, snapshot.getSubtaskStateMappings().size());

    // Second run: same head, but the tail now carries a new id (4444).
    createRunAndCheckpointOperatorChain(
            headId,
            new CounterOperator(),
            new OperatorID(4444L, 4444L),
            new CounterOperator(),
            Optional.of(firstRunRestore));

    assertEquals(Collections.singleton(headId), RESTORED_OPERATORS.keySet());
    assertThat(
            new HashSet<>(RESTORED_OPERATORS.values()),
            contains(firstRunRestore.getRestoreCheckpointId()));
}
Also used : TaskStateSnapshot(org.apache.flink.runtime.checkpoint.TaskStateSnapshot) JobManagerTaskRestore(org.apache.flink.runtime.checkpoint.JobManagerTaskRestore) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) Test(org.junit.Test)

Example 13 with JobManagerTaskRestore

use of org.apache.flink.runtime.checkpoint.JobManagerTaskRestore in project flink by apache.

the class TaskStateManagerImplTest method testStateReportingAndRetrieving.

/**
 * Test reporting and retrieving prioritized local and remote state.
 *
 * <p>The first half reports a job-manager (remote) and a task-manager (local) snapshot through
 * the {@link TaskStateManager} and checks what the checkpoint responder and the local state
 * store received. The second half builds a new manager from a {@link JobManagerTaskRestore}
 * wrapping the acknowledged state and checks the order of the prioritized alternatives.
 */
@Test
public void testStateReportingAndRetrieving() {
    JobID jobID = new JobID();
    ExecutionAttemptID executionAttemptID = new ExecutionAttemptID();
    TestCheckpointResponder testCheckpointResponder = new TestCheckpointResponder();
    TestTaskLocalStateStore testTaskLocalStateStore = new TestTaskLocalStateStore();
    InMemoryStateChangelogStorage changelogStorage = new InMemoryStateChangelogStorage();
    // No restore is passed here (null), so nothing may be flagged as restored yet.
    TaskStateManager taskStateManager = taskStateManager(jobID, executionAttemptID, testCheckpointResponder, null, testTaskLocalStateStore, changelogStorage);
    // ---------------------------------------- test reporting
    // -----------------------------------------
    CheckpointMetaData checkpointMetaData = new CheckpointMetaData(74L, 11L);
    CheckpointMetrics checkpointMetrics = new CheckpointMetrics();
    TaskStateSnapshot jmTaskStateSnapshot = new TaskStateSnapshot();
    OperatorID operatorID_1 = new OperatorID(1L, 1L);
    OperatorID operatorID_2 = new OperatorID(2L, 2L);
    OperatorID operatorID_3 = new OperatorID(3L, 3L);
    Assert.assertFalse(taskStateManager.prioritizedOperatorState(operatorID_1).isRestored());
    Assert.assertFalse(taskStateManager.prioritizedOperatorState(operatorID_2).isRestored());
    Assert.assertFalse(taskStateManager.prioritizedOperatorState(operatorID_3).isRestored());
    KeyGroupRange keyGroupRange = new KeyGroupRange(0, 1);
    // Remote state of operator 1 has only managed keyed state.
    OperatorSubtaskState jmOperatorSubtaskState_1 = OperatorSubtaskState.builder().setManagedKeyedState(StateHandleDummyUtil.createNewKeyedStateHandle(keyGroupRange)).build();
    // Remote state of operator 2 has only raw keyed state.
    OperatorSubtaskState jmOperatorSubtaskState_2 = OperatorSubtaskState.builder().setRawKeyedState(StateHandleDummyUtil.createNewKeyedStateHandle(keyGroupRange)).build();
    jmTaskStateSnapshot.putSubtaskStateByOperatorID(operatorID_1, jmOperatorSubtaskState_1);
    jmTaskStateSnapshot.putSubtaskStateByOperatorID(operatorID_2, jmOperatorSubtaskState_2);
    TaskStateSnapshot tmTaskStateSnapshot = new TaskStateSnapshot();
    // Only operator 1 has a local alternative for the managed keyed state.
    OperatorSubtaskState tmOperatorSubtaskState_1 = OperatorSubtaskState.builder().setManagedKeyedState(StateHandleDummyUtil.createNewKeyedStateHandle(keyGroupRange)).build();
    tmTaskStateSnapshot.putSubtaskStateByOperatorID(operatorID_1, tmOperatorSubtaskState_1);
    taskStateManager.reportTaskStateSnapshots(checkpointMetaData, checkpointMetrics, jmTaskStateSnapshot, tmTaskStateSnapshot);
    TestCheckpointResponder.AcknowledgeReport acknowledgeReport = testCheckpointResponder.getAcknowledgeReports().get(0);
    // checks that the checkpoint responder and the local state store received state as
    // expected.
    Assert.assertEquals(checkpointMetaData.getCheckpointId(), acknowledgeReport.getCheckpointId());
    Assert.assertEquals(checkpointMetrics, acknowledgeReport.getCheckpointMetrics());
    Assert.assertEquals(executionAttemptID, acknowledgeReport.getExecutionAttemptID());
    Assert.assertEquals(jobID, acknowledgeReport.getJobID());
    Assert.assertEquals(jmTaskStateSnapshot, acknowledgeReport.getSubtaskState());
    Assert.assertEquals(tmTaskStateSnapshot, testTaskLocalStateStore.retrieveLocalState(checkpointMetaData.getCheckpointId()));
    // -------------------------------------- test prio retrieving
    // ---------------------------------------
    JobManagerTaskRestore taskRestore = new JobManagerTaskRestore(checkpointMetaData.getCheckpointId(), acknowledgeReport.getSubtaskState());
    taskStateManager = taskStateManager(jobID, executionAttemptID, testCheckpointResponder, taskRestore, testTaskLocalStateStore, changelogStorage);
    // this has remote AND local managed keyed state.
    PrioritizedOperatorSubtaskState prioritized_1 = taskStateManager.prioritizedOperatorState(operatorID_1);
    // this has only remote raw keyed state.
    PrioritizedOperatorSubtaskState prioritized_2 = taskStateManager.prioritizedOperatorState(operatorID_2);
    // no state was reported for operator 3.
    PrioritizedOperatorSubtaskState prioritized_3 = taskStateManager.prioritizedOperatorState(operatorID_3);
    // With a task restore supplied, every operator is expected to be flagged as restored,
    // including operator 3 and a freshly created operator id that never reported state.
    Assert.assertTrue(prioritized_1.isRestored());
    Assert.assertTrue(prioritized_2.isRestored());
    Assert.assertTrue(prioritized_3.isRestored());
    Assert.assertTrue(taskStateManager.prioritizedOperatorState(new OperatorID()).isRestored());
    // checks for operator 1: the local (tm) handle must come first, then the remote (jm) one.
    Iterator<StateObjectCollection<KeyedStateHandle>> prioritizedManagedKeyedState_1 = prioritized_1.getPrioritizedManagedKeyedState().iterator();
    Assert.assertTrue(prioritizedManagedKeyedState_1.hasNext());
    StateObjectCollection<KeyedStateHandle> current = prioritizedManagedKeyedState_1.next();
    KeyedStateHandle keyedStateHandleExp = tmOperatorSubtaskState_1.getManagedKeyedState().iterator().next();
    KeyedStateHandle keyedStateHandleAct = current.iterator().next();
    // assertSame checks reference identity and names both objects if the check fails.
    Assert.assertSame(keyedStateHandleExp, keyedStateHandleAct);
    Assert.assertTrue(prioritizedManagedKeyedState_1.hasNext());
    current = prioritizedManagedKeyedState_1.next();
    keyedStateHandleExp = jmOperatorSubtaskState_1.getManagedKeyedState().iterator().next();
    keyedStateHandleAct = current.iterator().next();
    Assert.assertSame(keyedStateHandleExp, keyedStateHandleAct);
    Assert.assertFalse(prioritizedManagedKeyedState_1.hasNext());
    // checks for operator 2: only the remote (jm) raw keyed state handle is expected.
    Iterator<StateObjectCollection<KeyedStateHandle>> prioritizedRawKeyedState_2 = prioritized_2.getPrioritizedRawKeyedState().iterator();
    Assert.assertTrue(prioritizedRawKeyedState_2.hasNext());
    current = prioritizedRawKeyedState_2.next();
    keyedStateHandleExp = jmOperatorSubtaskState_2.getRawKeyedState().iterator().next();
    keyedStateHandleAct = current.iterator().next();
    Assert.assertSame(keyedStateHandleExp, keyedStateHandleAct);
    Assert.assertFalse(prioritizedRawKeyedState_2.hasNext());
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) PrioritizedOperatorSubtaskState(org.apache.flink.runtime.checkpoint.PrioritizedOperatorSubtaskState) CheckpointMetrics(org.apache.flink.runtime.checkpoint.CheckpointMetrics) JobManagerTaskRestore(org.apache.flink.runtime.checkpoint.JobManagerTaskRestore) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) CheckpointMetaData(org.apache.flink.runtime.checkpoint.CheckpointMetaData) PrioritizedOperatorSubtaskState(org.apache.flink.runtime.checkpoint.PrioritizedOperatorSubtaskState) OperatorSubtaskState(org.apache.flink.runtime.checkpoint.OperatorSubtaskState) StateObjectCollection(org.apache.flink.runtime.checkpoint.StateObjectCollection) InMemoryStateChangelogStorage(org.apache.flink.runtime.state.changelog.inmemory.InMemoryStateChangelogStorage) TaskStateSnapshot(org.apache.flink.runtime.checkpoint.TaskStateSnapshot) TestCheckpointResponder(org.apache.flink.runtime.taskmanager.TestCheckpointResponder) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 14 with JobManagerTaskRestore

use of org.apache.flink.runtime.checkpoint.JobManagerTaskRestore in project flink by apache.

the class TaskStateManagerImplTest method testStateRetrievingWithFinishedOperator.

/**
 * Checks that a {@link TaskStateManagerImpl} created from a restore that carries
 * {@link TaskStateSnapshot#FINISHED_ON_RESTORE} reports the task as deployed as finished.
 */
@Test
public void testStateRetrievingWithFinishedOperator() {
    final JobManagerTaskRestore finishedRestore =
            new JobManagerTaskRestore(2, TaskStateSnapshot.FINISHED_ON_RESTORE);

    final TaskStateManagerImpl manager =
            new TaskStateManagerImpl(
                    new JobID(),
                    new ExecutionAttemptID(),
                    new TestTaskLocalStateStore(),
                    null,
                    finishedRestore,
                    new TestCheckpointResponder());

    Assert.assertTrue(manager.isTaskDeployedAsFinished());
}
Also used : TaskStateSnapshot(org.apache.flink.runtime.checkpoint.TaskStateSnapshot) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobManagerTaskRestore(org.apache.flink.runtime.checkpoint.JobManagerTaskRestore) TestCheckpointResponder(org.apache.flink.runtime.taskmanager.TestCheckpointResponder) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 15 with JobManagerTaskRestore

use of org.apache.flink.runtime.checkpoint.JobManagerTaskRestore in project flink by apache.

the class StreamTaskStateInitializerImplTest method testWithRestore.

/**
 * Builds a {@link StreamOperatorStateContext} from a {@link JobManagerTaskRestore} holding one
 * operator's subtask state and checks that the context is flagged as restored, exposes the
 * restore checkpoint id, provides both backends and the raw state inputs, and that these were
 * registered with the closeable registry.
 */
@SuppressWarnings("unchecked")
@Test
public void testWithRestore() throws Exception {
    // Spied backend whose factory methods hand out mocks instead of real state backends.
    StateBackend spiedBackend = spy(new StateBackend() {

        @Override
        public <K> AbstractKeyedStateBackend<K> createKeyedStateBackend(Environment env, JobID jobID, String operatorIdentifier, TypeSerializer<K> keySerializer, int numberOfKeyGroups, KeyGroupRange keyGroupRange, TaskKvStateRegistry kvStateRegistry, TtlTimeProvider ttlTimeProvider, MetricGroup metricGroup, @Nonnull Collection<KeyedStateHandle> stateHandles, CloseableRegistry cancelStreamRegistry) throws Exception {
            return mock(AbstractKeyedStateBackend.class);
        }

        @Override
        public OperatorStateBackend createOperatorStateBackend(Environment env, String operatorIdentifier, @Nonnull Collection<OperatorStateHandle> stateHandles, CloseableRegistry cancelStreamRegistry) throws Exception {
            return mock(OperatorStateBackend.class);
        }
    });

    OperatorID restoredOperatorId = new OperatorID(47L, 11L);
    TaskStateSnapshot snapshot = new TaskStateSnapshot();
    Random rnd = new Random(0x42);

    // Subtask state populated with every kind of state; the builder calls stay in the original
    // order so the seeded random generator is consumed identically.
    OperatorSubtaskState subtaskState =
            OperatorSubtaskState.builder()
                    .setManagedOperatorState(
                            new OperatorStreamStateHandle(
                                    Collections.singletonMap(
                                            "a",
                                            new OperatorStateHandle.StateMetaInfo(
                                                    new long[] { 0, 10 }, SPLIT_DISTRIBUTE)),
                                    CheckpointTestUtils.createDummyStreamStateHandle(rnd, null)))
                    .setRawOperatorState(
                            new OperatorStreamStateHandle(
                                    Collections.singletonMap(
                                            "_default_",
                                            new OperatorStateHandle.StateMetaInfo(
                                                    new long[] { 0, 20, 30 }, SPLIT_DISTRIBUTE)),
                                    CheckpointTestUtils.createDummyStreamStateHandle(rnd, null)))
                    .setManagedKeyedState(CheckpointTestUtils.createDummyKeyGroupStateHandle(rnd, null))
                    .setRawKeyedState(CheckpointTestUtils.createDummyKeyGroupStateHandle(rnd, null))
                    .setInputChannelState(singleton(createNewInputChannelStateHandle(10, rnd)))
                    .setResultSubpartitionState(singleton(createNewResultSubpartitionStateHandle(10, rnd)))
                    .build();
    snapshot.putSubtaskStateByOperatorID(restoredOperatorId, subtaskState);

    JobManagerTaskRestore restore = new JobManagerTaskRestore(42L, snapshot);
    StreamTaskStateInitializer initializer = streamTaskStateManager(spiedBackend, restore, false);

    // Mocked operator that only needs to answer with the restored operator id.
    AbstractStreamOperator<?> operator = mock(AbstractStreamOperator.class);
    when(operator.getOperatorID()).thenReturn(restoredOperatorId);

    TypeSerializer<?> keySerializer = new IntSerializer();
    CloseableRegistry registry = new CloseableRegistry();

    StreamOperatorStateContext context =
            initializer.streamOperatorStateContext(
                    operator.getOperatorID(),
                    operator.getClass().getSimpleName(),
                    new TestProcessingTimeService(),
                    operator,
                    keySerializer,
                    registry,
                    new UnregisteredMetricsGroup(),
                    1.0,
                    false);

    OperatorStateBackend operatorBackend = context.operatorStateBackend();
    CheckpointableKeyedStateBackend<?> keyedBackend = context.keyedStateBackend();
    InternalTimeServiceManager<?> timerManager = context.internalTimerServiceManager();
    CloseableIterable<KeyGroupStatePartitionStreamProvider> rawKeyedInputs = context.rawKeyedStateInputs();
    CloseableIterable<StatePartitionStreamProvider> rawOperatorInputs = context.rawOperatorStateInputs();

    Assert.assertTrue("Expected the context to be restored", context.isRestored());
    Assert.assertEquals(OptionalLong.of(42L), context.getRestoredCheckpointId());
    Assert.assertNotNull(operatorBackend);
    Assert.assertNotNull(keyedBackend);
    // this is deactivated on purpose so that it does not attempt to consume the raw keyed
    // state.
    Assert.assertNull(timerManager);
    Assert.assertNotNull(rawKeyedInputs);
    Assert.assertNotNull(rawOperatorInputs);

    // Exactly one raw keyed state stream is expected.
    int keyedStreams = 0;
    for (KeyGroupStatePartitionStreamProvider ignoredKeyedInput : rawKeyedInputs) {
        keyedStreams++;
    }
    Assert.assertEquals(1, keyedStreams);

    // Three raw operator state streams are expected (offsets { 0, 20, 30 } above).
    int operatorStreams = 0;
    for (StatePartitionStreamProvider ignoredOperatorInput : rawOperatorInputs) {
        operatorStreams++;
    }
    Assert.assertEquals(3, operatorStreams);

    checkCloseablesRegistered(registry, operatorBackend, keyedBackend, rawKeyedInputs, rawOperatorInputs);
}
Also used : IntSerializer(org.apache.flink.api.common.typeutils.base.IntSerializer) UnregisteredMetricsGroup(org.apache.flink.metrics.groups.UnregisteredMetricsGroup) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) MetricGroup(org.apache.flink.metrics.MetricGroup) JobManagerTaskRestore(org.apache.flink.runtime.checkpoint.JobManagerTaskRestore) TaskKvStateRegistry(org.apache.flink.runtime.query.TaskKvStateRegistry) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) KeyedStateHandle(org.apache.flink.runtime.state.KeyedStateHandle) CloseableRegistry(org.apache.flink.core.fs.CloseableRegistry) StateBackend(org.apache.flink.runtime.state.StateBackend) OperatorStateBackend(org.apache.flink.runtime.state.OperatorStateBackend) MemoryStateBackend(org.apache.flink.runtime.state.memory.MemoryStateBackend) AbstractKeyedStateBackend(org.apache.flink.runtime.state.AbstractKeyedStateBackend) CheckpointableKeyedStateBackend(org.apache.flink.runtime.state.CheckpointableKeyedStateBackend) OperatorSubtaskState(org.apache.flink.runtime.checkpoint.OperatorSubtaskState) KeyGroupStatePartitionStreamProvider(org.apache.flink.runtime.state.KeyGroupStatePartitionStreamProvider) KeyGroupStatePartitionStreamProvider(org.apache.flink.runtime.state.KeyGroupStatePartitionStreamProvider) StatePartitionStreamProvider(org.apache.flink.runtime.state.StatePartitionStreamProvider) TaskStateSnapshot(org.apache.flink.runtime.checkpoint.TaskStateSnapshot) Random(java.util.Random) TtlTimeProvider(org.apache.flink.runtime.state.ttl.TtlTimeProvider) AbstractKeyedStateBackend(org.apache.flink.runtime.state.AbstractKeyedStateBackend) OperatorStreamStateHandle(org.apache.flink.runtime.state.OperatorStreamStateHandle) OperatorStateBackend(org.apache.flink.runtime.state.OperatorStateBackend) DummyEnvironment(org.apache.flink.runtime.operators.testutils.DummyEnvironment) Environment(org.apache.flink.runtime.execution.Environment) 
TestProcessingTimeService(org.apache.flink.streaming.runtime.tasks.TestProcessingTimeService) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) JobID(org.apache.flink.api.common.JobID) TaskStateManagerImplTest(org.apache.flink.runtime.state.TaskStateManagerImplTest) Test(org.junit.Test)

Aggregations

JobManagerTaskRestore (org.apache.flink.runtime.checkpoint.JobManagerTaskRestore)16 TaskStateSnapshot (org.apache.flink.runtime.checkpoint.TaskStateSnapshot)13 OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)10 Test (org.junit.Test)10 JobID (org.apache.flink.api.common.JobID)7 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)6 OperatorSubtaskState (org.apache.flink.runtime.checkpoint.OperatorSubtaskState)5 StateObjectCollection (org.apache.flink.runtime.checkpoint.StateObjectCollection)3 KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange)3 KeyedStateHandle (org.apache.flink.runtime.state.KeyedStateHandle)3 OperatorStateHandle (org.apache.flink.runtime.state.OperatorStateHandle)3 OperatorStreamStateHandle (org.apache.flink.runtime.state.OperatorStreamStateHandle)3 TestTaskStateManager (org.apache.flink.runtime.state.TestTaskStateManager)3 CheckpointResponder (org.apache.flink.runtime.taskmanager.CheckpointResponder)3 IOException (java.io.IOException)2 HashMap (java.util.HashMap)2 Configuration (org.apache.flink.configuration.Configuration)2 CloseableRegistry (org.apache.flink.core.fs.CloseableRegistry)2 UnregisteredMetricsGroup (org.apache.flink.metrics.groups.UnregisteredMetricsGroup)2 JobInformation (org.apache.flink.runtime.executiongraph.JobInformation)2