Usage of org.apache.flink.runtime.checkpoint.JobManagerTaskRestore in the Apache Flink project.
Class TaskExecutor, method submitTask.
// ----------------------------------------------------------------------
// Task lifecycle RPCs
// ----------------------------------------------------------------------
/**
 * Handles a task submission: validates the request, wires up the services the task needs,
 * registers the task with its slot and starts the task thread.
 *
 * <p>Every {@link TaskSubmissionException} raised inside this method is caught at the bottom
 * and returned as an exceptionally completed future instead of being thrown to the caller.
 *
 * @param tdd descriptor of the task to deploy
 * @param jobMasterId leader id sent with the submission; it must equal the id stored in the
 *     job's registered connection
 * @param timeout RPC timeout (not read inside this method)
 * @return a future completed with {@link Acknowledge} once the task thread has been started,
 *     or completed exceptionally with the {@link TaskSubmissionException} describing why the
 *     submission was rejected
 */
@Override
public CompletableFuture<Acknowledge> submitTask(TaskDeploymentDescriptor tdd, JobMasterId jobMasterId, Time timeout) {
    try {
        final JobID jobId = tdd.getJobId();
        final ExecutionAttemptID executionAttemptID = tdd.getExecutionAttemptId();

        // ---- 1. Validate: known connection, matching leader id, allocated slot ----
        final JobTable.Connection connection =
                jobTable.getConnection(jobId)
                        .orElseThrow(
                                () -> {
                                    final String reason =
                                            "Could not submit task because there is no JobManager associated for the job "
                                                    + jobId
                                                    + '.';
                                    log.debug(reason);
                                    return new TaskSubmissionException(reason);
                                });

        final JobMasterId expectedJobMasterId = connection.getJobMasterId();
        if (!Objects.equals(expectedJobMasterId, jobMasterId)) {
            final String reason =
                    "Rejecting the task submission because the job manager leader id "
                            + jobMasterId
                            + " does not match the expected job manager leader id "
                            + expectedJobMasterId
                            + '.';
            log.debug(reason);
            throw new TaskSubmissionException(reason);
        }

        final boolean slotActivated = taskSlotTable.tryMarkSlotActive(jobId, tdd.getAllocationId());
        if (!slotActivated) {
            final String reason =
                    "No task slot allocated for job ID "
                            + jobId
                            + " and allocation ID "
                            + tdd.getAllocationId()
                            + '.';
            log.debug(reason);
            throw new TaskSubmissionException(reason);
        }

        // ---- 2. Pull offloaded data back into the descriptor ----
        try {
            tdd.loadBigData(taskExecutorBlobService.getPermanentBlobService());
        } catch (IOException | ClassNotFoundException e) {
            throw new TaskSubmissionException("Could not re-integrate offloaded TaskDeploymentDescriptor data.", e);
        }

        // ---- 3. Deserialize the pre-serialized job and task information ----
        final ClassLoader ownClassLoader = getClass().getClassLoader();
        final JobInformation jobInformation;
        final TaskInformation taskInformation;
        try {
            jobInformation = tdd.getSerializedJobInformation().deserializeValue(ownClassLoader);
            taskInformation = tdd.getSerializedTaskInformation().deserializeValue(ownClassLoader);
        } catch (IOException | ClassNotFoundException e) {
            throw new TaskSubmissionException("Could not deserialize the job or task information.", e);
        }

        if (!jobId.equals(jobInformation.getJobId())) {
            throw new TaskSubmissionException(
                    "Inconsistent job ID information inside TaskDeploymentDescriptor ("
                            + jobId
                            + " vs. "
                            + jobInformation.getJobId()
                            + ")");
        }

        // ---- 4. Metric groups ----
        final TaskManagerJobMetricGroup jobMetrics =
                taskManagerMetricGroup.addJob(jobInformation.getJobId(), jobInformation.getJobName());
        // note that a pre-existing job group can NOT be closed concurrently - this is done by
        // the same TM thread in removeJobMetricsGroup
        final TaskMetricGroup taskMetrics =
                jobMetrics.addTask(
                        taskInformation.getJobVertexId(),
                        executionAttemptID,
                        taskInformation.getTaskName(),
                        tdd.getSubtaskIndex(),
                        tdd.getAttemptNumber());

        // ---- 5. Collaborators backed by the job manager connection ----
        final InputSplitProvider splitProvider =
                new RpcInputSplitProvider(
                        connection.getJobManagerGateway(),
                        taskInformation.getJobVertexId(),
                        executionAttemptID,
                        taskManagerConfiguration.getRpcTimeout());
        // A failure reported through the operator event gateway fails the task via runAsync.
        final TaskOperatorEventGateway operatorEventGateway =
                new RpcTaskOperatorEventGateway(
                        connection.getJobManagerGateway(),
                        executionAttemptID,
                        cause -> runAsync(() -> failTask(executionAttemptID, cause)));
        final TaskManagerActions actions = connection.getTaskManagerActions();
        final CheckpointResponder responder = connection.getCheckpointResponder();
        final GlobalAggregateManager globalAggregates = connection.getGlobalAggregateManager();
        final LibraryCacheManager.ClassLoaderHandle userCodeHandle = connection.getClassLoaderHandle();
        final ResultPartitionConsumableNotifier consumableNotifier = connection.getResultPartitionConsumableNotifier();
        final PartitionProducerStateChecker producerStateChecker = connection.getPartitionStateChecker();

        // ---- 6. State handling ----
        final TaskLocalStateStore subtaskLocalState =
                localStateStoresManager.localStateStoreForSubtask(
                        jobId,
                        tdd.getAllocationId(),
                        taskInformation.getJobVertexId(),
                        tdd.getSubtaskIndex());

        // TODO: Pass config value from user program and do overriding here.
        final StateChangelogStorage<?> changelog;
        try {
            changelog =
                    changelogStoragesManager.stateChangelogStorageForJob(
                            jobId, taskManagerConfiguration.getConfiguration(), jobMetrics);
        } catch (IOException e) {
            throw new TaskSubmissionException(e);
        }

        final JobManagerTaskRestore restore = tdd.getTaskRestore();
        final TaskStateManager stateManager =
                new TaskStateManagerImpl(
                        jobId, executionAttemptID, subtaskLocalState, changelog, restore, responder);

        // ---- 7. Memory manager of the slot ----
        final MemoryManager slotMemory;
        try {
            slotMemory = taskSlotTable.getTaskMemoryManager(tdd.getAllocationId());
        } catch (SlotNotFoundException e) {
            throw new TaskSubmissionException("Could not submit task.", e);
        }

        // ---- 8. Build the task ----
        final Task task =
                new Task(
                        jobInformation,
                        taskInformation,
                        executionAttemptID,
                        tdd.getAllocationId(),
                        tdd.getSubtaskIndex(),
                        tdd.getAttemptNumber(),
                        tdd.getProducedPartitions(),
                        tdd.getInputGates(),
                        slotMemory,
                        taskExecutorServices.getIOManager(),
                        taskExecutorServices.getShuffleEnvironment(),
                        taskExecutorServices.getKvStateService(),
                        taskExecutorServices.getBroadcastVariableManager(),
                        taskExecutorServices.getTaskEventDispatcher(),
                        externalResourceInfoProvider,
                        stateManager,
                        actions,
                        splitProvider,
                        responder,
                        operatorEventGateway,
                        globalAggregates,
                        userCodeHandle,
                        fileCache,
                        taskManagerConfiguration,
                        taskMetrics,
                        consumableNotifier,
                        producerStateChecker,
                        getRpcService().getScheduledExecutor());

        taskMetrics.gauge(MetricNames.IS_BACK_PRESSURED, task::isBackPressured);

        log.info(
                "Received task {} ({}), deploy into slot with allocation id {}.",
                task.getTaskInfo().getTaskNameWithSubtasks(),
                executionAttemptID,
                tdd.getAllocationId());

        // ---- 9. Register with the slot table and launch ----
        final boolean registered;
        try {
            registered = taskSlotTable.addTask(task);
        } catch (SlotNotFoundException | SlotNotActiveException e) {
            throw new TaskSubmissionException("Could not submit task.", e);
        }

        if (!registered) {
            final String reason = "TaskManager already contains a task for id " + task.getExecutionId() + '.';
            log.debug(reason);
            throw new TaskSubmissionException(reason);
        }

        task.startTaskThread();
        setupResultPartitionBookkeeping(jobId, tdd.getProducedPartitions(), task.getTerminationFuture());
        return CompletableFuture.completedFuture(Acknowledge.get());
    } catch (TaskSubmissionException e) {
        // Rejections are reported through the returned future, never thrown.
        return FutureUtils.completedExceptionally(e);
    }
}
Usage of org.apache.flink.runtime.checkpoint.JobManagerTaskRestore in the Apache Flink project.
Class StateInitializationContextImplTest, method setUp.
/**
 * Builds the test fixture: writes NUM_HANDLES raw keyed-state handles and NUM_HANDLES raw
 * operator-state handles into in-memory byte handles, wraps them into a
 * JobManagerTaskRestore with checkpoint id 0, and creates the
 * StateInitializationContextImpl under test from the resulting operator state context.
 *
 * <p>Along the way it records how many key groups were written ({@code writtenKeyGroups}) and
 * which operator state values were written ({@code writtenOperatorStates}).
 */
@Before
public void setUp() throws Exception {
    this.writtenKeyGroups = 0;
    this.writtenOperatorStates = new HashSet<>();
    this.closableRegistry = new CloseableRegistry();

    // One buffer is reused (and reset) for every handle that gets written.
    final ByteArrayOutputStreamWithPos buffer = new ByteArrayOutputStreamWithPos(64);

    // ---- raw keyed state: handle i covers (i % 4) + 1 consecutive key groups ----
    final List<KeyedStateHandle> rawKeyedHandles = new ArrayList<>(NUM_HANDLES);
    int prev = 0;
    for (int handleIdx = 0; handleIdx < NUM_HANDLES; ++handleIdx) {
        buffer.reset();
        final int lastKeyGroup = prev + (handleIdx % 4);
        final DataOutputView view = new DataOutputViewStreamWrapper(buffer);

        // Handle number 9 deliberately carries an empty key group range.
        final KeyGroupRange range;
        if (handleIdx == 9) {
            range = KeyGroupRange.EMPTY_KEY_GROUP_RANGE;
        } else {
            range = new KeyGroupRange(prev, lastKeyGroup);
        }
        final KeyGroupRangeOffsets groupOffsets = new KeyGroupRangeOffsets(range);
        prev = lastKeyGroup + 1;

        // Each key group stores its own id at the offset recorded for it.
        for (int keyGroup : groupOffsets.getKeyGroupRange()) {
            groupOffsets.setKeyGroupOffset(keyGroup, buffer.getPosition());
            view.writeInt(keyGroup);
            ++writtenKeyGroups;
        }

        rawKeyedHandles.add(
                new KeyGroupsStateHandle(
                        groupOffsets,
                        new ByteStateHandleCloseChecking("kg-" + handleIdx, buffer.toByteArray())));
    }

    // ---- raw operator state: handle i holds (i % 4) int values ----
    final List<OperatorStateHandle> rawOperatorHandles = new ArrayList<>(NUM_HANDLES);
    for (int handleIdx = 0; handleIdx < NUM_HANDLES; ++handleIdx) {
        final int valueCount = handleIdx % 4;
        buffer.reset();
        final DataOutputView view = new DataOutputViewStreamWrapper(buffer);
        final LongArrayList valueOffsets = new LongArrayList(valueCount);
        for (int slot = 0; slot < valueCount; ++slot) {
            valueOffsets.add(buffer.getPosition());
            final int value = handleIdx * NUM_HANDLES + slot;
            view.writeInt(value);
            writtenOperatorStates.add(value);
        }

        final Map<String, OperatorStateHandle.StateMetaInfo> metaByStateName = new HashMap<>();
        metaByStateName.put(
                DefaultOperatorStateBackend.DEFAULT_OPERATOR_STATE_NAME,
                new OperatorStateHandle.StateMetaInfo(
                        valueOffsets.toArray(), OperatorStateHandle.Mode.SPLIT_DISTRIBUTE));

        rawOperatorHandles.add(
                new OperatorStreamStateHandle(
                        metaByStateName,
                        new ByteStateHandleCloseChecking("os-" + handleIdx, buffer.toByteArray())));
    }

    // ---- wrap everything into the restore information of a single operator ----
    final OperatorSubtaskState subtaskState =
            OperatorSubtaskState.builder()
                    .setRawOperatorState(new StateObjectCollection<>(rawOperatorHandles))
                    .setRawKeyedState(new StateObjectCollection<>(rawKeyedHandles))
                    .build();

    final OperatorID operatorID = new OperatorID();
    final TaskStateSnapshot snapshot = new TaskStateSnapshot();
    snapshot.putSubtaskStateByOperatorID(operatorID, subtaskState);

    final JobManagerTaskRestore restore = new JobManagerTaskRestore(0L, snapshot);
    final TaskStateManager stateManager =
            new TaskStateManagerImpl(
                    new JobID(),
                    new ExecutionAttemptID(),
                    new TestTaskLocalStateStore(),
                    new InMemoryStateChangelogStorage(),
                    restore,
                    mock(CheckpointResponder.class));

    final DummyEnvironment environment = new DummyEnvironment("test", 1, 0, prev);
    environment.setTaskStateManager(stateManager);

    final StateBackend backend = new MemoryStateBackend(1024);
    final StreamTaskStateInitializer stateInitializer =
            new StreamTaskStateInitializerImpl(
                    environment,
                    backend,
                    TtlTimeProvider.DEFAULT,
                    new InternalTimeServiceManager.Provider() {
                        @Override
                        public <K> InternalTimeServiceManager<K> create(
                                CheckpointableKeyedStateBackend<K> keyedStatedBackend,
                                ClassLoader userClassloader,
                                KeyContext keyContext,
                                ProcessingTimeService processingTimeService,
                                Iterable<KeyGroupStatePartitionStreamProvider> rawKeyedStates)
                                throws Exception {
                            // No timer service manager is created here.
                            // NOTE(review): the original comment is truncated ("stream."); presumably
                            // this keeps the raw keyed state stream unconsumed for the test - confirm.
                            return null;
                        }
                    });

    final AbstractStreamOperator<?> operatorMock = mock(AbstractStreamOperator.class);
    when(operatorMock.getOperatorID()).thenReturn(operatorID);

    // NOTE(review): the original carried a truncated remark here ("consumed by the timer
    // service."); its full meaning cannot be recovered from this excerpt.
    final StreamOperatorStateContext operatorStateContext =
            stateInitializer.streamOperatorStateContext(
                    operatorID,
                    "TestOperatorClass",
                    mock(ProcessingTimeService.class),
                    operatorMock,
                    IntSerializer.INSTANCE,
                    closableRegistry,
                    new UnregisteredMetricsGroup(),
                    1.0,
                    false);

    // An absent restored checkpoint id is handed over as null.
    final OptionalLong maybeCheckpointId = operatorStateContext.getRestoredCheckpointId();
    final Long restoredCheckpointId;
    if (maybeCheckpointId.isPresent()) {
        restoredCheckpointId = maybeCheckpointId.getAsLong();
    } else {
        restoredCheckpointId = null;
    }

    this.initializationContext =
            new StateInitializationContextImpl(
                    restoredCheckpointId,
                    operatorStateContext.operatorStateBackend(),
                    mock(KeyedStateStore.class),
                    operatorStateContext.rawKeyedStateInputs(),
                    operatorStateContext.rawOperatorStateInputs());
}
Usage of org.apache.flink.runtime.checkpoint.JobManagerTaskRestore in the Apache Flink project.
Class StatefulOperatorChainedTaskTest, method createRunAndCheckpointOperatorChain.
/**
 * Runs a two-operator chain (head + tail) in a test harness, optionally starting from the
 * given restore, processes records, triggers one checkpoint and returns the restore
 * information built from what the task state manager reported for that checkpoint.
 *
 * @param headId operator id of the chain head
 * @param headOperator operator at the head of the chain
 * @param tailId operator id of the chained tail
 * @param tailOperator operator chained after the head
 * @param restore state to start from; when empty the harness is not given any snapshot
 * @return restore built from the reported checkpoint id and the last job manager snapshot
 */
private JobManagerTaskRestore createRunAndCheckpointOperatorChain(OperatorID headId, OneInputStreamOperator<String, String> headOperator, OperatorID tailId, OneInputStreamOperator<String, String> tailOperator, Optional<JobManagerTaskRestore> restore) throws Exception {
    final File localStateRoot = temporaryFolder.newFolder();
    final OneInputStreamTaskTestHarness<String, String> harness =
            new OneInputStreamTaskTestHarness<>(
                    OneInputStreamTask::new,
                    1,
                    1,
                    BasicTypeInfo.STRING_TYPE_INFO,
                    BasicTypeInfo.STRING_TYPE_INFO,
                    localStateRoot);

    harness.setupOperatorChain(headId, headOperator)
            .chain(tailId, tailOperator, StringSerializer.INSTANCE, true)
            .finish();

    // Seed the harness with the previous checkpoint, if one was supplied.
    if (restore.isPresent()) {
        final JobManagerTaskRestore previous = restore.get();
        harness.setTaskStateSnapshot(previous.getRestoreCheckpointId(), previous.getTaskStateSnapshot());
    }

    final StreamMockEnvironment mockEnvironment =
            new StreamMockEnvironment(
                    harness.jobConfig,
                    harness.taskConfig,
                    harness.getExecutionConfig(),
                    harness.memorySize,
                    new MockInputSplitProvider(),
                    harness.bufferSize,
                    harness.getTaskStateManager());

    // Task manager configuration: RocksDB backend with incremental checkpoints into a fresh folder.
    final Configuration tmConfig = new Configuration();
    tmConfig.setString(STATE_BACKEND.key(), "rocksdb");
    final File checkpointDir = temporaryFolder.newFolder();
    tmConfig.setString(CHECKPOINTS_DIRECTORY.key(), checkpointDir.toURI().toString());
    tmConfig.setString(INCREMENTAL_CHECKPOINTS.key(), "true");

    final String tmpDirProperty = System.getProperty("java.io.tmpdir");
    final String[] tmpDirs = tmpDirProperty.split(",|" + File.pathSeparator);
    mockEnvironment.setTaskManagerInfo(new TestingTaskManagerRuntimeInfo(tmConfig, tmpDirs));

    harness.invoke(mockEnvironment);
    harness.waitForTaskRunning();

    final OneInputStreamTask<String, String> runningTask = harness.getTask();
    processRecords(harness);
    triggerCheckpoint(harness, runningTask);

    // Capture what was reported for the checkpoint before the task is shut down.
    final TestTaskStateManager reportedState = harness.getTaskStateManager();
    final JobManagerTaskRestore result =
            new JobManagerTaskRestore(
                    reportedState.getReportedCheckpointId(),
                    reportedState.getLastJobManagerTaskStateSnapshot());

    harness.endInput();
    harness.waitForTaskCompletion();
    return result;
}
Usage of org.apache.flink.runtime.checkpoint.JobManagerTaskRestore in the Apache Flink project.
Class TaskStateManagerImplTest, method testAcquringRestoreCheckpointId.
/**
 * Checks getRestoreCheckpointId(): it is empty when the manager is built without a
 * JobManagerTaskRestore, and holds the restore's checkpoint id (2) otherwise.
 */
// NOTE(review): no @Test annotation is visible on this method in this excerpt - confirm that
// the test runner actually picks it up.
public void testAcquringRestoreCheckpointId() {
    // Case 1: no restore information at all.
    final TaskStateManagerImpl withoutRestore =
            new TaskStateManagerImpl(
                    new JobID(),
                    new ExecutionAttemptID(),
                    new TestTaskLocalStateStore(),
                    null,
                    null,
                    new TestCheckpointResponder());
    Assert.assertFalse(withoutRestore.getRestoreCheckpointId().isPresent());

    // Case 2: restore information pointing at checkpoint 2.
    final JobManagerTaskRestore restoreFromCheckpointTwo = new JobManagerTaskRestore(2, new TaskStateSnapshot());
    final TaskStateManagerImpl withRestore =
            new TaskStateManagerImpl(
                    new JobID(),
                    new ExecutionAttemptID(),
                    new TestTaskLocalStateStore(),
                    null,
                    restoreFromCheckpointTwo,
                    new TestCheckpointResponder());
    final long actualCheckpointId = (long) withRestore.getRestoreCheckpointId().get();
    Assert.assertEquals(2L, actualCheckpointId);
}
Usage of org.apache.flink.runtime.checkpoint.JobManagerTaskRestore in the Apache Flink project.
Class InterruptSensitiveRestoreTest, method createTask.
// ------------------------------------------------------------------------
// Utilities
// ------------------------------------------------------------------------
/**
 * Creates a Task whose restored state contains the given handle in exactly one of the four
 * state categories (managed/raw operator state, managed/raw keyed state), chosen by
 * {@code mode}; the other three categories stay empty.
 *
 * @param streamConfig stream configuration; its operator id is overwritten with an id derived
 *     from a freshly created job vertex id
 * @param taskConfig configuration placed into the task information
 * @param state handle backing the single state entry
 * @param mode one of OPERATOR_MANAGED, OPERATOR_RAW, KEYED_MANAGED, KEYED_RAW
 * @return the assembled task
 * @throws IllegalArgumentException if {@code mode} is none of the four constants
 * @throws IOException declared by the signature; which of the calls made here can raise it is
 *     not visible in this excerpt
 */
private static Task createTask(StreamConfig streamConfig, Configuration taskConfig, StreamStateHandle state, int mode) throws IOException {
    final ShuffleEnvironment<?, ?> shuffleEnv = new NettyShuffleEnvironmentBuilder().build();

    // All four categories start out empty; exactly one is replaced below.
    Collection<KeyedStateHandle> managedKeyed = Collections.emptyList();
    Collection<KeyedStateHandle> rawKeyed = Collections.emptyList();
    Collection<OperatorStateHandle> managedOperator = Collections.emptyList();
    Collection<OperatorStateHandle> rawOperator = Collections.emptyList();

    // Single-entry operator state and single-key-group keyed state, both backed by 'state'.
    final Map<String, OperatorStateHandle.StateMetaInfo> metaByStateName = new HashMap<>(1);
    final OperatorStateHandle.StateMetaInfo splitDistributeMeta =
            new OperatorStateHandle.StateMetaInfo(new long[] { 0 }, OperatorStateHandle.Mode.SPLIT_DISTRIBUTE);
    metaByStateName.put(DefaultOperatorStateBackend.DEFAULT_OPERATOR_STATE_NAME, splitDistributeMeta);
    final KeyGroupRangeOffsets singleGroupOffsets = new KeyGroupRangeOffsets(new KeyGroupRange(0, 0));
    final Collection<OperatorStateHandle> populatedOperatorState =
            Collections.singletonList(new OperatorStreamStateHandle(metaByStateName, state));
    final List<KeyedStateHandle> populatedKeyedState =
            Collections.singletonList(new KeyGroupsStateHandle(singleGroupOffsets, state));

    if (mode == OPERATOR_MANAGED) {
        managedOperator = populatedOperatorState;
    } else if (mode == OPERATOR_RAW) {
        rawOperator = populatedOperatorState;
    } else if (mode == KEYED_MANAGED) {
        managedKeyed = populatedKeyedState;
    } else if (mode == KEYED_RAW) {
        rawKeyed = populatedKeyedState;
    } else {
        throw new IllegalArgumentException();
    }

    final OperatorSubtaskState subtaskState =
            OperatorSubtaskState.builder()
                    .setManagedOperatorState(new StateObjectCollection<>(managedOperator))
                    .setRawOperatorState(new StateObjectCollection<>(rawOperator))
                    .setManagedKeyedState(new StateObjectCollection<>(managedKeyed))
                    .setRawKeyedState(new StateObjectCollection<>(rawKeyed))
                    .build();

    // The operator id is derived from the vertex id and pushed into the stream config.
    final JobVertexID vertexId = new JobVertexID();
    final OperatorID operatorID = OperatorID.fromJobVertexID(vertexId);
    streamConfig.setOperatorID(operatorID);

    final TaskStateSnapshot snapshot = new TaskStateSnapshot();
    snapshot.putSubtaskStateByOperatorID(operatorID, subtaskState);
    final JobManagerTaskRestore restore = new JobManagerTaskRestore(1L, snapshot);

    final JobInformation jobInfo =
            new JobInformation(
                    new JobID(),
                    "test job name",
                    new SerializedValue<>(new ExecutionConfig()),
                    new Configuration(),
                    Collections.emptyList(),
                    Collections.emptyList());
    final TaskInformation taskInfo =
            new TaskInformation(vertexId, "test task name", 1, 1, SourceStreamTask.class.getName(), taskConfig);

    // The test state manager serves the snapshot under the restore's checkpoint id.
    final TestTaskStateManager stateManager =
            TestTaskStateManager.builder()
                    .setReportedCheckpointId(restore.getRestoreCheckpointId())
                    .setJobManagerTaskStateSnapshotsByCheckpointId(
                            Collections.singletonMap(restore.getRestoreCheckpointId(), restore.getTaskStateSnapshot()))
                    .build();

    return new Task(
            jobInfo,
            taskInfo,
            new ExecutionAttemptID(),
            new AllocationID(),
            0,
            0,
            Collections.<ResultPartitionDeploymentDescriptor>emptyList(),
            Collections.<InputGateDeploymentDescriptor>emptyList(),
            mock(MemoryManager.class),
            mock(IOManager.class),
            shuffleEnv,
            new KvStateService(new KvStateRegistry(), null, null),
            mock(BroadcastVariableManager.class),
            new TaskEventDispatcher(),
            ExternalResourceInfoProvider.NO_EXTERNAL_RESOURCES,
            stateManager,
            mock(TaskManagerActions.class),
            mock(InputSplitProvider.class),
            mock(CheckpointResponder.class),
            new NoOpTaskOperatorEventGateway(),
            new TestGlobalAggregateManager(),
            TestingClassLoaderLease.newBuilder().build(),
            new FileCache(new String[] { EnvironmentInformation.getTemporaryFileDirectory() }, VoidPermanentBlobService.INSTANCE),
            new TestingTaskManagerRuntimeInfo(),
            UnregisteredMetricGroups.createUnregisteredTaskMetricGroup(),
            new NoOpResultPartitionConsumableNotifier(),
            mock(PartitionProducerStateChecker.class),
            mock(Executor.class));
}
Aggregations