Use of org.apache.flink.runtime.taskmanager.CheckpointResponder in project flink by apache.
Class TaskExecutor, method submitTask.
// ----------------------------------------------------------------------
// Task lifecycle RPCs
// ----------------------------------------------------------------------
@Override
public CompletableFuture<Acknowledge> submitTask(
        TaskDeploymentDescriptor tdd, JobMasterId jobMasterId, Time timeout) {
    try {
        final JobID jobId = tdd.getJobId();
        final ExecutionAttemptID executionAttemptID = tdd.getExecutionAttemptId();
        final JobTable.Connection jobManagerConnection =
                jobTable.getConnection(jobId).orElseThrow(() -> {
                    final String message = "Could not submit task because there is no JobManager "
                            + "associated for the job " + jobId + '.';
                    log.debug(message);
                    return new TaskSubmissionException(message);
                });
        if (!Objects.equals(jobManagerConnection.getJobMasterId(), jobMasterId)) {
            final String message = "Rejecting the task submission because the job manager leader id "
                    + jobMasterId + " does not match the expected job manager leader id "
                    + jobManagerConnection.getJobMasterId() + '.';
            log.debug(message);
            throw new TaskSubmissionException(message);
        }
        if (!taskSlotTable.tryMarkSlotActive(jobId, tdd.getAllocationId())) {
            final String message = "No task slot allocated for job ID " + jobId
                    + " and allocation ID " + tdd.getAllocationId() + '.';
            log.debug(message);
            throw new TaskSubmissionException(message);
        }
        // re-integrate offloaded data:
        try {
            tdd.loadBigData(taskExecutorBlobService.getPermanentBlobService());
        } catch (IOException | ClassNotFoundException e) {
            throw new TaskSubmissionException(
                    "Could not re-integrate offloaded TaskDeploymentDescriptor data.", e);
        }
        // deserialize the pre-serialized information
        final JobInformation jobInformation;
        final TaskInformation taskInformation;
        try {
            jobInformation =
                    tdd.getSerializedJobInformation().deserializeValue(getClass().getClassLoader());
            taskInformation =
                    tdd.getSerializedTaskInformation().deserializeValue(getClass().getClassLoader());
        } catch (IOException | ClassNotFoundException e) {
            throw new TaskSubmissionException("Could not deserialize the job or task information.", e);
        }
        if (!jobId.equals(jobInformation.getJobId())) {
            throw new TaskSubmissionException("Inconsistent job ID information inside TaskDeploymentDescriptor ("
                    + tdd.getJobId() + " vs. " + jobInformation.getJobId() + ")");
        }
        TaskManagerJobMetricGroup jobGroup =
                taskManagerMetricGroup.addJob(jobInformation.getJobId(), jobInformation.getJobName());
        // note that a pre-existing job group can NOT be closed concurrently - this is done by
        // the same TM thread in removeJobMetricsGroup
        TaskMetricGroup taskMetricGroup = jobGroup.addTask(
                taskInformation.getJobVertexId(),
                tdd.getExecutionAttemptId(),
                taskInformation.getTaskName(),
                tdd.getSubtaskIndex(),
                tdd.getAttemptNumber());
        InputSplitProvider inputSplitProvider = new RpcInputSplitProvider(
                jobManagerConnection.getJobManagerGateway(),
                taskInformation.getJobVertexId(),
                tdd.getExecutionAttemptId(),
                taskManagerConfiguration.getRpcTimeout());
        final TaskOperatorEventGateway taskOperatorEventGateway = new RpcTaskOperatorEventGateway(
                jobManagerConnection.getJobManagerGateway(),
                executionAttemptID,
                (t) -> runAsync(() -> failTask(executionAttemptID, t)));
        TaskManagerActions taskManagerActions = jobManagerConnection.getTaskManagerActions();
        CheckpointResponder checkpointResponder = jobManagerConnection.getCheckpointResponder();
        GlobalAggregateManager aggregateManager = jobManagerConnection.getGlobalAggregateManager();
        LibraryCacheManager.ClassLoaderHandle classLoaderHandle = jobManagerConnection.getClassLoaderHandle();
        ResultPartitionConsumableNotifier resultPartitionConsumableNotifier =
                jobManagerConnection.getResultPartitionConsumableNotifier();
        PartitionProducerStateChecker partitionStateChecker = jobManagerConnection.getPartitionStateChecker();
        final TaskLocalStateStore localStateStore = localStateStoresManager.localStateStoreForSubtask(
                jobId, tdd.getAllocationId(), taskInformation.getJobVertexId(), tdd.getSubtaskIndex());
        // TODO: Pass config value from user program and do overriding here.
        final StateChangelogStorage<?> changelogStorage;
        try {
            changelogStorage = changelogStoragesManager.stateChangelogStorageForJob(
                    jobId, taskManagerConfiguration.getConfiguration(), jobGroup);
        } catch (IOException e) {
            throw new TaskSubmissionException(e);
        }
        final JobManagerTaskRestore taskRestore = tdd.getTaskRestore();
        final TaskStateManager taskStateManager = new TaskStateManagerImpl(
                jobId,
                tdd.getExecutionAttemptId(),
                localStateStore,
                changelogStorage,
                taskRestore,
                checkpointResponder);
        MemoryManager memoryManager;
        try {
            memoryManager = taskSlotTable.getTaskMemoryManager(tdd.getAllocationId());
        } catch (SlotNotFoundException e) {
            throw new TaskSubmissionException("Could not submit task.", e);
        }
        Task task = new Task(
                jobInformation,
                taskInformation,
                tdd.getExecutionAttemptId(),
                tdd.getAllocationId(),
                tdd.getSubtaskIndex(),
                tdd.getAttemptNumber(),
                tdd.getProducedPartitions(),
                tdd.getInputGates(),
                memoryManager,
                taskExecutorServices.getIOManager(),
                taskExecutorServices.getShuffleEnvironment(),
                taskExecutorServices.getKvStateService(),
                taskExecutorServices.getBroadcastVariableManager(),
                taskExecutorServices.getTaskEventDispatcher(),
                externalResourceInfoProvider,
                taskStateManager,
                taskManagerActions,
                inputSplitProvider,
                checkpointResponder,
                taskOperatorEventGateway,
                aggregateManager,
                classLoaderHandle,
                fileCache,
                taskManagerConfiguration,
                taskMetricGroup,
                resultPartitionConsumableNotifier,
                partitionStateChecker,
                getRpcService().getScheduledExecutor());
        taskMetricGroup.gauge(MetricNames.IS_BACK_PRESSURED, task::isBackPressured);
        log.info("Received task {} ({}), deploy into slot with allocation id {}.",
                task.getTaskInfo().getTaskNameWithSubtasks(),
                tdd.getExecutionAttemptId(),
                tdd.getAllocationId());
        boolean taskAdded;
        try {
            taskAdded = taskSlotTable.addTask(task);
        } catch (SlotNotFoundException | SlotNotActiveException e) {
            throw new TaskSubmissionException("Could not submit task.", e);
        }
        if (taskAdded) {
            task.startTaskThread();
            setupResultPartitionBookkeeping(
                    tdd.getJobId(), tdd.getProducedPartitions(), task.getTerminationFuture());
            return CompletableFuture.completedFuture(Acknowledge.get());
        } else {
            final String message = "TaskManager already contains a task for id " + task.getExecutionId() + '.';
            log.debug(message);
            throw new TaskSubmissionException(message);
        }
    } catch (TaskSubmissionException e) {
        return FutureUtils.completedExceptionally(e);
    }
}
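Note the control flow at the end of this method: submission failures are not thrown to the caller but delivered through the returned CompletableFuture (see the final catch block). The following is a caller-side sketch of handling that future, assuming a TaskExecutorGateway reference plus hypothetical tdd, jobMasterId, and rpcTimeout variables that are not shown on this page.

// Hedged sketch: taskExecutorGateway, tdd, jobMasterId, rpcTimeout and an SLF4J "log"
// are assumed to exist in the caller's scope; they are not part of the snippet above.
CompletableFuture<Acknowledge> submission =
        taskExecutorGateway.submitTask(tdd, jobMasterId, rpcTimeout);

submission.whenComplete(
        (acknowledge, failure) -> {
            if (failure != null) {
                // TaskSubmissionExceptions from the method above arrive here, because the
                // TaskExecutor completes the future exceptionally instead of throwing.
                log.warn("Task submission failed.", failure);
            } else {
                log.debug("Task submission acknowledged.");
            }
        });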
Use of org.apache.flink.runtime.taskmanager.CheckpointResponder in project flink by apache.
Class TaskExecutor, method associateWithJobManager.
private JobTable.Connection associateWithJobManager(
        JobTable.Job job, ResourceID resourceID, JobMasterGateway jobMasterGateway) {
    checkNotNull(resourceID);
    checkNotNull(jobMasterGateway);
    TaskManagerActions taskManagerActions = new TaskManagerActionsImpl(jobMasterGateway);
    CheckpointResponder checkpointResponder = new RpcCheckpointResponder(jobMasterGateway);
    GlobalAggregateManager aggregateManager = new RpcGlobalAggregateManager(jobMasterGateway);
    ResultPartitionConsumableNotifier resultPartitionConsumableNotifier =
            new RpcResultPartitionConsumableNotifier(
                    jobMasterGateway,
                    getRpcService().getScheduledExecutor(),
                    taskManagerConfiguration.getRpcTimeout());
    PartitionProducerStateChecker partitionStateChecker = new RpcPartitionStateChecker(jobMasterGateway);
    registerQueryableState(job.getJobId(), jobMasterGateway);
    return job.connect(
            resourceID,
            jobMasterGateway,
            taskManagerActions,
            checkpointResponder,
            aggregateManager,
            resultPartitionConsumableNotifier,
            partitionStateChecker);
}
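The RpcCheckpointResponder created here is what submitTask above later hands to the TaskStateManager and the Task. For tests or experiments it can be useful to supply a responder of one's own. Below is a minimal sketch of such an implementation, assuming the interface exposes exactly the three callbacks visible in the RocksDBAsyncSnapshotTest example further down; LoggingCheckpointResponder is a hypothetical name and the import paths are assumed to match the Flink version these snippets come from.

// Hypothetical logging responder; interface shape assumed from the examples on this page.
import org.apache.flink.api.common.JobID;
import org.apache.flink.runtime.checkpoint.CheckpointException;
import org.apache.flink.runtime.checkpoint.CheckpointMetrics;
import org.apache.flink.runtime.checkpoint.TaskStateSnapshot;
import org.apache.flink.runtime.executiongraph.ExecutionAttemptID;
import org.apache.flink.runtime.taskmanager.CheckpointResponder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LoggingCheckpointResponder implements CheckpointResponder {

    private static final Logger LOG = LoggerFactory.getLogger(LoggingCheckpointResponder.class);

    @Override
    public void acknowledgeCheckpoint(
            JobID jobID,
            ExecutionAttemptID executionAttemptID,
            long checkpointId,
            CheckpointMetrics checkpointMetrics,
            TaskStateSnapshot subtaskState) {
        // Called by the task once its local snapshot has completed.
        LOG.info("Acknowledged checkpoint {} for execution {}.", checkpointId, executionAttemptID);
    }

    @Override
    public void reportCheckpointMetrics(
            JobID jobID,
            ExecutionAttemptID executionAttemptID,
            long checkpointId,
            CheckpointMetrics checkpointMetrics) {
        // Metrics-only report; nothing to do in this sketch.
    }

    @Override
    public void declineCheckpoint(
            JobID jobID,
            ExecutionAttemptID executionAttemptID,
            long checkpointId,
            CheckpointException checkpointException) {
        LOG.warn("Checkpoint {} declined: {}", checkpointId, checkpointException.getMessage());
    }
}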
Use of org.apache.flink.runtime.taskmanager.CheckpointResponder in project flink by apache.
Class MultipleInputStreamTaskTest, method testTriggeringStopWithSavepointWithDrain.
@Test
public void testTriggeringStopWithSavepointWithDrain() throws Exception {
    SourceOperatorFactory<Integer> sourceOperatorFactory = new SourceOperatorFactory<>(
            new MockSource(Boundedness.CONTINUOUS_UNBOUNDED, 2), WatermarkStrategy.noWatermarks());
    CompletableFuture<Boolean> checkpointCompleted = new CompletableFuture<>();
    CheckpointResponder checkpointResponder = new TestCheckpointResponder() {
        @Override
        public void acknowledgeCheckpoint(
                JobID jobID,
                ExecutionAttemptID executionAttemptID,
                long checkpointId,
                CheckpointMetrics checkpointMetrics,
                TaskStateSnapshot subtaskState) {
            super.acknowledgeCheckpoint(
                    jobID, executionAttemptID, checkpointId, checkpointMetrics, subtaskState);
            checkpointCompleted.complete(null);
        }
    };
    try (StreamTaskMailboxTestHarness<String> testHarness =
            new StreamTaskMailboxTestHarnessBuilder<>(
                            MultipleInputStreamTask::new, BasicTypeInfo.STRING_TYPE_INFO)
                    .setCollectNetworkEvents()
                    .modifyStreamConfig(config -> config.setCheckpointingEnabled(true))
                    .modifyExecutionConfig(applyObjectReuse(objectReuse))
                    .addInput(BasicTypeInfo.INT_TYPE_INFO)
                    .addInput(BasicTypeInfo.INT_TYPE_INFO)
                    .addInput(BasicTypeInfo.INT_TYPE_INFO)
                    .setTaskStateSnapshot(1, TaskStateSnapshot.FINISHED_ON_RESTORE)
                    .setupOperatorChain(new LifeCycleMonitorMultipleInputOperatorFactory())
                    .finishForSingletonOperatorChain(StringSerializer.INSTANCE)
                    .setCheckpointResponder(checkpointResponder)
                    .build()) {
        CompletableFuture<Boolean> triggerResult = testHarness.streamTask.triggerCheckpointAsync(
                new CheckpointMetaData(2, 2),
                CheckpointOptions.alignedNoTimeout(
                        SavepointType.terminate(SavepointFormatType.CANONICAL),
                        CheckpointStorageLocationReference.getDefault()));
        checkpointCompleted.whenComplete(
                (ignored, exception) -> testHarness.streamTask.notifyCheckpointCompleteAsync(2));
        testHarness.waitForTaskCompletion();
        testHarness.finishProcessing();
        assertTrue(triggerResult.isDone());
        assertTrue(triggerResult.get());
        assertTrue(checkpointCompleted.isDone());
    }
}
Use of org.apache.flink.runtime.taskmanager.CheckpointResponder in project flink by apache.
Class SourceStreamTaskTest, method testTriggeringStopWithSavepointWithDrain.
@Test
public void testTriggeringStopWithSavepointWithDrain() throws Exception {
    SourceFunction<String> testSource = new EmptySource();
    CompletableFuture<Boolean> checkpointCompleted = new CompletableFuture<>();
    CheckpointResponder checkpointResponder = new TestCheckpointResponder() {
        @Override
        public void acknowledgeCheckpoint(
                JobID jobID,
                ExecutionAttemptID executionAttemptID,
                long checkpointId,
                CheckpointMetrics checkpointMetrics,
                TaskStateSnapshot subtaskState) {
            super.acknowledgeCheckpoint(
                    jobID, executionAttemptID, checkpointId, checkpointMetrics, subtaskState);
            checkpointCompleted.complete(null);
        }
    };
    try (StreamTaskMailboxTestHarness<String> harness =
            new StreamTaskMailboxTestHarnessBuilder<>(SourceStreamTask::new, STRING_TYPE_INFO)
                    .setTaskStateSnapshot(1, TaskStateSnapshot.FINISHED_ON_RESTORE)
                    .setCheckpointResponder(checkpointResponder)
                    .setupOutputForSingletonOperatorChain(new StreamSource<>(testSource))
                    .build()) {
        CompletableFuture<Boolean> triggerResult = harness.streamTask.triggerCheckpointAsync(
                new CheckpointMetaData(2, 2),
                CheckpointOptions.alignedNoTimeout(
                        SavepointType.terminate(SavepointFormatType.CANONICAL),
                        CheckpointStorageLocationReference.getDefault()));
        checkpointCompleted.whenComplete(
                (ignored, exception) -> harness.streamTask.notifyCheckpointCompleteAsync(2));
        // Run mailbox till the source thread finished and suspend the mailbox
        harness.streamTask.runMailboxLoop();
        harness.finishProcessing();
        assertTrue(triggerResult.isDone());
        assertTrue(triggerResult.get());
        assertTrue(checkpointCompleted.isDone());
    }
}
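Both tests above build the same anonymous TestCheckpointResponder that completes a future once a checkpoint is acknowledged. If more tests need that behaviour, it could be factored into a small helper. The following is a sketch only, assuming TestCheckpointResponder's acknowledgeCheckpoint has the signature shown above; AcknowledgeFutureResponder is a hypothetical name and the import paths are assumed to match the test utilities used in these examples.

import java.util.concurrent.CompletableFuture;

import org.apache.flink.api.common.JobID;
import org.apache.flink.runtime.checkpoint.CheckpointMetrics;
import org.apache.flink.runtime.checkpoint.TaskStateSnapshot;
import org.apache.flink.runtime.executiongraph.ExecutionAttemptID;
import org.apache.flink.runtime.taskmanager.TestCheckpointResponder;

/** Test responder that completes a future once a given checkpoint is acknowledged. */
public class AcknowledgeFutureResponder extends TestCheckpointResponder {

    private final long expectedCheckpointId;
    private final CompletableFuture<Void> acknowledged = new CompletableFuture<>();

    public AcknowledgeFutureResponder(long expectedCheckpointId) {
        this.expectedCheckpointId = expectedCheckpointId;
    }

    public CompletableFuture<Void> getAcknowledgedFuture() {
        return acknowledged;
    }

    @Override
    public void acknowledgeCheckpoint(
            JobID jobID,
            ExecutionAttemptID executionAttemptID,
            long checkpointId,
            CheckpointMetrics checkpointMetrics,
            TaskStateSnapshot subtaskState) {
        // keep the recording behaviour of TestCheckpointResponder
        super.acknowledgeCheckpoint(
                jobID, executionAttemptID, checkpointId, checkpointMetrics, subtaskState);
        if (checkpointId == expectedCheckpointId) {
            acknowledged.complete(null);
        }
    }
}

A harness would then be built with .setCheckpointResponder(responder) exactly as in the examples above, and the test could wait on responder.getAcknowledgedFuture() instead of its own CompletableFuture.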
Use of org.apache.flink.runtime.taskmanager.CheckpointResponder in project flink by apache.
Class RocksDBAsyncSnapshotTest, method testFullyAsyncSnapshot.
/**
* This ensures that asynchronous state handles are actually materialized asynchronously.
*
* <p>We use latches to block at various stages and see if the code still continues through the
* parts that are not asynchronous. If the checkpoint is not done asynchronously the test will
* simply lock forever.
*/
@Test
public void testFullyAsyncSnapshot() throws Exception {
    final OneInputStreamTaskTestHarness<String, String> testHarness = new OneInputStreamTaskTestHarness<>(
            OneInputStreamTask::new, BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO);
    testHarness.setupOutputForSingletonOperatorChain();
    testHarness.configureForKeyedStream(new KeySelector<String, String>() {
        @Override
        public String getKey(String value) throws Exception {
            return value;
        }
    }, BasicTypeInfo.STRING_TYPE_INFO);
    StreamConfig streamConfig = testHarness.getStreamConfig();
    File dbDir = temporaryFolder.newFolder();
    RocksDBStateBackend backend = new RocksDBStateBackend(new MemoryStateBackend());
    backend.setDbStoragePath(dbDir.getAbsolutePath());
    streamConfig.setStateBackend(backend);
    streamConfig.setStreamOperator(new AsyncCheckpointOperator());
    streamConfig.setOperatorID(new OperatorID());
    final OneShotLatch delayCheckpointLatch = new OneShotLatch();
    final OneShotLatch ensureCheckpointLatch = new OneShotLatch();
    CheckpointResponder checkpointResponderMock = new CheckpointResponder() {
        @Override
        public void acknowledgeCheckpoint(
                JobID jobID,
                ExecutionAttemptID executionAttemptID,
                long checkpointId,
                CheckpointMetrics checkpointMetrics,
                TaskStateSnapshot subtaskState) {
            // block on the latch to verify that triggerCheckpoint returns below,
            // even though the async checkpoint would not finish
            try {
                delayCheckpointLatch.await();
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
            boolean hasManagedKeyedState = false;
            for (Map.Entry<OperatorID, OperatorSubtaskState> entry : subtaskState.getSubtaskStateMappings()) {
                OperatorSubtaskState state = entry.getValue();
                if (state != null) {
                    hasManagedKeyedState |= state.getManagedKeyedState() != null;
                }
            }
            // should be one k/v state
            assertTrue(hasManagedKeyedState);
            // we now know that the checkpoint went through
            ensureCheckpointLatch.trigger();
        }

        @Override
        public void reportCheckpointMetrics(
                JobID jobID,
                ExecutionAttemptID executionAttemptID,
                long checkpointId,
                CheckpointMetrics checkpointMetrics) {}

        @Override
        public void declineCheckpoint(
                JobID jobID,
                ExecutionAttemptID executionAttemptID,
                long checkpointId,
                CheckpointException checkpointException) {}
    };
    JobID jobID = new JobID();
    ExecutionAttemptID executionAttemptID = new ExecutionAttemptID();
    TestTaskStateManager taskStateManagerTestMock = new TestTaskStateManager(
            jobID,
            executionAttemptID,
            checkpointResponderMock,
            TestLocalRecoveryConfig.disabled(),
            new InMemoryStateChangelogStorage(),
            new HashMap<>(),
            -1L,
            new OneShotLatch());
    StreamMockEnvironment mockEnv = new StreamMockEnvironment(
            testHarness.jobConfig,
            testHarness.taskConfig,
            testHarness.memorySize,
            new MockInputSplitProvider(),
            testHarness.bufferSize,
            taskStateManagerTestMock);
    AtomicReference<Throwable> errorRef = new AtomicReference<>();
    mockEnv.setExternalExceptionHandler(errorRef::set);
    testHarness.invoke(mockEnv);
    testHarness.waitForTaskRunning();
    final OneInputStreamTask<String, String> task = testHarness.getTask();
    task.triggerCheckpointAsync(
                    new CheckpointMetaData(42, 17),
                    CheckpointOptions.forCheckpointWithDefaultLocation())
            .get();
    testHarness.processElement(new StreamRecord<>("Wohoo", 0));
    // now we allow the checkpoint
    delayCheckpointLatch.trigger();
    // wait for the checkpoint to go through
    ensureCheckpointLatch.await();
    testHarness.endInput();
    ExecutorService threadPool = task.getAsyncOperationsThreadPool();
    threadPool.shutdown();
    Assert.assertTrue(threadPool.awaitTermination(60_000, TimeUnit.MILLISECONDS));
    testHarness.waitForTaskCompletion();
    if (errorRef.get() != null) {
        fail("Unexpected exception during execution.");
    }
}
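The TestTaskStateManager constructor used above accepts any CheckpointResponder, so the latch-based responder can be swapped for something simpler when the checkpoint choreography itself is not under test. The following is an illustrative variant only, reusing the same constructor arguments as the test above and the hypothetical LoggingCheckpointResponder sketched earlier; imports are assumed to match those of the test class.

// Hypothetical variant: same TestTaskStateManager wiring as above, trivial responder.
JobID jobID = new JobID();
ExecutionAttemptID executionAttemptID = new ExecutionAttemptID();
CheckpointResponder simpleResponder = new LoggingCheckpointResponder(); // hypothetical class from the earlier sketch
TestTaskStateManager taskStateManager = new TestTaskStateManager(
        jobID,
        executionAttemptID,
        simpleResponder,
        TestLocalRecoveryConfig.disabled(),
        new InMemoryStateChangelogStorage(),
        new HashMap<>(),
        -1L,
        new OneShotLatch());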