Use of org.apache.flink.runtime.taskmanager.CheckpointResponder in project flink by apache.
The class SourceOperatorStreamTaskTest, method testTriggeringStopWithSavepointWithDrain.
@Test
public void testTriggeringStopWithSavepointWithDrain() throws Exception {
    SourceOperatorFactory<Integer> sourceOperatorFactory = new SourceOperatorFactory<>(
            new MockSource(Boundedness.CONTINUOUS_UNBOUNDED, 2), WatermarkStrategy.noWatermarks());

    CompletableFuture<Boolean> checkpointCompleted = new CompletableFuture<>();
    CheckpointResponder checkpointResponder = new TestCheckpointResponder() {
        @Override
        public void acknowledgeCheckpoint(
                JobID jobID, ExecutionAttemptID executionAttemptID, long checkpointId,
                CheckpointMetrics checkpointMetrics, TaskStateSnapshot subtaskState) {
            super.acknowledgeCheckpoint(jobID, executionAttemptID, checkpointId, checkpointMetrics, subtaskState);
            checkpointCompleted.complete(null);
        }
    };

    try (StreamTaskMailboxTestHarness<Integer> testHarness =
            new StreamTaskMailboxTestHarnessBuilder<>(SourceOperatorStreamTask::new, BasicTypeInfo.INT_TYPE_INFO)
                    .setupOutputForSingletonOperatorChain(sourceOperatorFactory)
                    .setCheckpointResponder(checkpointResponder)
                    .build()) {
        CompletableFuture<Boolean> triggerResult = testHarness.streamTask.triggerCheckpointAsync(
                new CheckpointMetaData(2, 2),
                CheckpointOptions.alignedNoTimeout(
                        SavepointType.terminate(SavepointFormatType.CANONICAL),
                        CheckpointStorageLocationReference.getDefault()));
        checkpointCompleted.whenComplete(
                (ignored, exception) -> testHarness.streamTask.notifyCheckpointCompleteAsync(2));
        testHarness.waitForTaskCompletion();
        testHarness.finishProcessing();

        assertTrue(triggerResult.isDone());
        assertTrue(triggerResult.get());
        assertTrue(checkpointCompleted.isDone());
    }
}
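The same override pattern can be reused whenever a test needs to observe which checkpoints were acknowledged. A minimal sketch, assuming the acknowledgeCheckpoint signature shown above; the class and field names are hypothetical and not part of the Flink sources (uses java.util.List and java.util.concurrent.CopyOnWriteArrayList):

// Hypothetical test helper: records every acknowledged checkpoint id so a test can
// assert on the count or order of acknowledgements. Relies only on the signature
// used in the example above.
class RecordingCheckpointResponder extends TestCheckpointResponder {

    final List<Long> acknowledgedCheckpointIds = new CopyOnWriteArrayList<>();

    @Override
    public void acknowledgeCheckpoint(
            JobID jobID, ExecutionAttemptID executionAttemptID, long checkpointId,
            CheckpointMetrics checkpointMetrics, TaskStateSnapshot subtaskState) {
        super.acknowledgeCheckpoint(jobID, executionAttemptID, checkpointId, checkpointMetrics, subtaskState);
        acknowledgedCheckpointIds.add(checkpointId);
    }
}

A harness built with .setCheckpointResponder(new RecordingCheckpointResponder()) could then assert that exactly one id was recorded once the savepoint completes.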
Use of org.apache.flink.runtime.taskmanager.CheckpointResponder in project flink by apache.
The class TaskExecutor, method associateWithJobManager.
private JobManagerConnection associateWithJobManager(
        JobID jobID, ResourceID resourceID, JobMasterGateway jobMasterGateway,
        UUID jobManagerLeaderId, int blobPort) {
    Preconditions.checkNotNull(jobID);
    Preconditions.checkNotNull(resourceID);
    Preconditions.checkNotNull(jobManagerLeaderId);
    Preconditions.checkNotNull(jobMasterGateway);
    Preconditions.checkArgument(
            blobPort > 0 && blobPort < MAX_BLOB_PORT, "Blob server port is out of range.");

    TaskManagerActions taskManagerActions =
            new TaskManagerActionsImpl(jobManagerLeaderId, jobMasterGateway);
    CheckpointResponder checkpointResponder = new RpcCheckpointResponder(jobMasterGateway);
    InetSocketAddress blobServerAddress =
            new InetSocketAddress(jobMasterGateway.getHostname(), blobPort);

    final LibraryCacheManager libraryCacheManager;
    try {
        final BlobCache blobCache = new BlobCache(
                blobServerAddress, taskManagerConfiguration.getConfiguration(), haServices);
        libraryCacheManager = new BlobLibraryCacheManager(
                blobCache, taskManagerConfiguration.getCleanupInterval());
    } catch (IOException e) {
        // Can't pass the IOException up - we need a RuntimeException anyway
        // two levels up where this is run asynchronously. Also, we don't
        // know whether this is caught in the thread running this method.
        final String message = "Could not create BLOB cache or library cache.";
        log.error(message, e);
        throw new RuntimeException(message, e);
    }

    ResultPartitionConsumableNotifier resultPartitionConsumableNotifier =
            new RpcResultPartitionConsumableNotifier(jobManagerLeaderId, jobMasterGateway,
                    getRpcService().getExecutor(), taskManagerConfiguration.getTimeout());
    PartitionProducerStateChecker partitionStateChecker =
            new RpcPartitionStateChecker(jobManagerLeaderId, jobMasterGateway);

    return new JobManagerConnection(jobID, resourceID, jobMasterGateway, jobManagerLeaderId,
            taskManagerActions, checkpointResponder, libraryCacheManager,
            resultPartitionConsumableNotifier, partitionStateChecker);
}
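For illustration, the RPC-backed responder created here can also be exercised directly. A hedged sketch, not taken from the Flink sources: the gateway and id variables are placeholders, the no-arg CheckpointMetrics constructor is an assumption, the acknowledgeCheckpoint signature is the one used elsewhere on this page, and a null snapshot denotes a stateless acknowledgement (see the last example in this section):

// Hypothetical illustration (not Flink source): acknowledge a checkpoint through the
// RPC-backed responder the way a stateless task would.
CheckpointResponder responder = new RpcCheckpointResponder(jobMasterGateway); // gateway is a placeholder
responder.acknowledgeCheckpoint(
        jobId,                     // JobID of the job being checkpointed (placeholder)
        executionAttemptId,        // ExecutionAttemptID of the reporting subtask (placeholder)
        42L,                       // checkpoint id
        new CheckpointMetrics(),   // assumes the no-arg constructor; real code fills in timings and sizes
        null);                     // null subtask state == stateless acknowledgement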
Use of org.apache.flink.runtime.taskmanager.CheckpointResponder in project flink by apache.
The class TaskExecutor, method submitTask.
// ======================================================================
// RPC methods
// ======================================================================
// ----------------------------------------------------------------------
// Task lifecycle RPCs
// ----------------------------------------------------------------------
@RpcMethod
public Acknowledge submitTask(TaskDeploymentDescriptor tdd, UUID jobManagerLeaderId) throws TaskSubmissionException {
    // first, deserialize the pre-serialized information
    final JobInformation jobInformation;
    final TaskInformation taskInformation;
    try {
        jobInformation = tdd.getSerializedJobInformation().deserializeValue(getClass().getClassLoader());
        taskInformation = tdd.getSerializedTaskInformation().deserializeValue(getClass().getClassLoader());
    } catch (IOException | ClassNotFoundException e) {
        throw new TaskSubmissionException("Could not deserialize the job or task information.", e);
    }

    final JobID jobId = jobInformation.getJobId();
    final JobManagerConnection jobManagerConnection = jobManagerTable.get(jobId);

    if (jobManagerConnection == null) {
        final String message = "Could not submit task because there is no JobManager " +
                "associated for the job " + jobId + '.';
        log.debug(message);
        throw new TaskSubmissionException(message);
    }

    if (!jobManagerConnection.getLeaderId().equals(jobManagerLeaderId)) {
        final String message = "Rejecting the task submission because the job manager leader id " +
                jobManagerLeaderId + " does not match the expected job manager leader id " +
                jobManagerConnection.getLeaderId() + '.';
        log.debug(message);
        throw new TaskSubmissionException(message);
    }

    if (!taskSlotTable.existsActiveSlot(jobId, tdd.getAllocationId())) {
        final String message = "No task slot allocated for job ID " + jobId +
                " and allocation ID " + tdd.getAllocationId() + '.';
        log.debug(message);
        throw new TaskSubmissionException(message);
    }

    TaskMetricGroup taskMetricGroup = taskManagerMetricGroup.addTaskForJob(
            jobInformation.getJobId(), jobInformation.getJobName(), taskInformation.getJobVertexId(),
            tdd.getExecutionAttemptId(), taskInformation.getTaskName(), tdd.getSubtaskIndex(),
            tdd.getAttemptNumber());

    InputSplitProvider inputSplitProvider = new RpcInputSplitProvider(
            jobManagerConnection.getLeaderId(), jobManagerConnection.getJobManagerGateway(),
            jobInformation.getJobId(), taskInformation.getJobVertexId(),
            tdd.getExecutionAttemptId(), taskManagerConfiguration.getTimeout());

    TaskManagerActions taskManagerActions = jobManagerConnection.getTaskManagerActions();
    CheckpointResponder checkpointResponder = jobManagerConnection.getCheckpointResponder();
    LibraryCacheManager libraryCache = jobManagerConnection.getLibraryCacheManager();
    ResultPartitionConsumableNotifier resultPartitionConsumableNotifier =
            jobManagerConnection.getResultPartitionConsumableNotifier();
    PartitionProducerStateChecker partitionStateChecker = jobManagerConnection.getPartitionStateChecker();

    Task task = new Task(
            jobInformation, taskInformation, tdd.getExecutionAttemptId(), tdd.getAllocationId(),
            tdd.getSubtaskIndex(), tdd.getAttemptNumber(), tdd.getProducedPartitions(),
            tdd.getInputGates(), tdd.getTargetSlotNumber(), tdd.getTaskStateHandles(),
            memoryManager, ioManager, networkEnvironment, broadcastVariableManager,
            taskManagerActions, inputSplitProvider, checkpointResponder, libraryCache, fileCache,
            taskManagerConfiguration, taskMetricGroup, resultPartitionConsumableNotifier,
            partitionStateChecker, getRpcService().getExecutor());

    log.info("Received task {}.", task.getTaskInfo().getTaskNameWithSubtasks());

    boolean taskAdded;
    try {
        taskAdded = taskSlotTable.addTask(task);
    } catch (SlotNotFoundException | SlotNotActiveException e) {
        throw new TaskSubmissionException("Could not submit task.", e);
    }

    if (taskAdded) {
        task.startTaskThread();
        return Acknowledge.get();
    } else {
        final String message = "TaskManager already contains a task for id " + task.getExecutionId() + '.';
        log.debug(message);
        throw new TaskSubmissionException(message);
    }
}
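The three guard clauses above share one pattern: build a message, log it at debug level, and throw a TaskSubmissionException. A small sketch of how that pattern could be factored out; the helper is hypothetical and not part of TaskExecutor:

// Hypothetical helper, not in the Flink sources: centralizes the
// "log at debug, then reject the submission" pattern used by the guard clauses above.
private static void rejectSubmission(Logger log, String message) throws TaskSubmissionException {
    log.debug(message);
    throw new TaskSubmissionException(message);
}

// Example call site (hypothetical), replacing the third guard clause:
// rejectSubmission(log, "No task slot allocated for job ID " + jobId +
//         " and allocation ID " + tdd.getAllocationId() + '.');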
Use of org.apache.flink.runtime.taskmanager.CheckpointResponder in project flink by apache.
The class StreamTaskTest, method testAsyncCheckpointingConcurrentCloseAfterAcknowledge.
/**
* FLINK-5667
*
* <p>Tests that a concurrent cancel operation does not discard the state handles of an
* acknowledged checkpoint. The situation can only happen if the cancel call is executed after
* Environment.acknowledgeCheckpoint() and before the CloseableRegistry.unregisterClosable()
* call.
*/
@Test
public void testAsyncCheckpointingConcurrentCloseAfterAcknowledge() throws Exception {
    final OneShotLatch acknowledgeCheckpointLatch = new OneShotLatch();
    final OneShotLatch completeAcknowledge = new OneShotLatch();

    CheckpointResponder checkpointResponder = mock(CheckpointResponder.class);
    doAnswer(new Answer() {
        @Override
        public Object answer(InvocationOnMock invocation) {
            acknowledgeCheckpointLatch.trigger();
            // block here so that we can issue the concurrent cancel call
            while (true) {
                try {
                    // wait until we successfully await (no pun intended)
                    completeAcknowledge.await();
                    // when await() returns normally, we break out of the loop
                    break;
                } catch (InterruptedException e) {
                    // survive interruptions that arise from thread pool shutdown; production
                    // code cannot actually throw InterruptedException from checkpoint acknowledgement
                }
            }
            return null;
        }
    }).when(checkpointResponder).acknowledgeCheckpoint(
            any(JobID.class), any(ExecutionAttemptID.class), anyLong(),
            any(CheckpointMetrics.class), any(TaskStateSnapshot.class));

    TaskStateManager taskStateManager = new TaskStateManagerImpl(
            new JobID(1L, 2L), new ExecutionAttemptID(), mock(TaskLocalStateStoreImpl.class),
            new InMemoryStateChangelogStorage(), null, checkpointResponder);

    KeyedStateHandle managedKeyedStateHandle = mock(KeyedStateHandle.class);
    KeyedStateHandle rawKeyedStateHandle = mock(KeyedStateHandle.class);
    OperatorStateHandle managedOperatorStateHandle = mock(OperatorStreamStateHandle.class);
    OperatorStateHandle rawOperatorStateHandle = mock(OperatorStreamStateHandle.class);

    OperatorSnapshotFutures operatorSnapshotResult = new OperatorSnapshotFutures(
            DoneFuture.of(SnapshotResult.of(managedKeyedStateHandle)),
            DoneFuture.of(SnapshotResult.of(rawKeyedStateHandle)),
            DoneFuture.of(SnapshotResult.of(managedOperatorStateHandle)),
            DoneFuture.of(SnapshotResult.of(rawOperatorStateHandle)),
            DoneFuture.of(SnapshotResult.empty()),
            DoneFuture.of(SnapshotResult.empty()));

    try (MockEnvironment mockEnvironment =
            new MockEnvironmentBuilder().setTaskName("mock-task").setTaskStateManager(taskStateManager).build()) {
        RunningTask<MockStreamTask> task = runTask(() ->
                createMockStreamTask(mockEnvironment, operatorChain(streamOperatorWithSnapshot(operatorSnapshotResult))));
        MockStreamTask streamTask = task.streamTask;
        waitTaskIsRunning(streamTask, task.invocationFuture);

        final long checkpointId = 42L;
        streamTask.triggerCheckpointAsync(
                new CheckpointMetaData(checkpointId, 1L), CheckpointOptions.forCheckpointWithDefaultLocation());

        acknowledgeCheckpointLatch.await();

        ArgumentCaptor<TaskStateSnapshot> subtaskStateCaptor = ArgumentCaptor.forClass(TaskStateSnapshot.class);

        // check that the checkpoint has been completed
        verify(checkpointResponder).acknowledgeCheckpoint(
                any(JobID.class), any(ExecutionAttemptID.class), eq(checkpointId),
                any(CheckpointMetrics.class), subtaskStateCaptor.capture());

        TaskStateSnapshot subtaskStates = subtaskStateCaptor.getValue();
        OperatorSubtaskState subtaskState = subtaskStates.getSubtaskStateMappings().iterator().next().getValue();

        // check that the subtask state contains the expected state handles
        assertEquals(singleton(managedKeyedStateHandle), subtaskState.getManagedKeyedState());
        assertEquals(singleton(rawKeyedStateHandle), subtaskState.getRawKeyedState());
        assertEquals(singleton(managedOperatorStateHandle), subtaskState.getManagedOperatorState());
        assertEquals(singleton(rawOperatorStateHandle), subtaskState.getRawOperatorState());

        // check that the state handles have not been discarded
        verify(managedKeyedStateHandle, never()).discardState();
        verify(rawKeyedStateHandle, never()).discardState();
        verify(managedOperatorStateHandle, never()).discardState();
        verify(rawOperatorStateHandle, never()).discardState();

        streamTask.cancel();
        completeAcknowledge.trigger();

        // canceling the stream task after it has acknowledged the checkpoint should not discard
        // the state handles
        verify(managedKeyedStateHandle, never()).discardState();
        verify(rawKeyedStateHandle, never()).discardState();
        verify(managedOperatorStateHandle, never()).discardState();
        verify(rawOperatorStateHandle, never()).discardState();

        task.waitForTaskCompletion(true);
    }
}
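The mocked Answer above can also be written as a plain TestCheckpointResponder subclass. The sketch below is hypothetical (the test itself still needs the Mockito mock for its verify/ArgumentCaptor assertions) and assumes the same latch semantics as the Answer and the acknowledgeCheckpoint signature used in the first example on this page:

// Hypothetical non-Mockito equivalent of the blocking responder above: signal one latch
// when the acknowledgement arrives, then block until the test allows it to complete.
class BlockingCheckpointResponder extends TestCheckpointResponder {
    private final OneShotLatch acknowledgeCheckpointLatch;
    private final OneShotLatch completeAcknowledge;

    BlockingCheckpointResponder(OneShotLatch acknowledgeCheckpointLatch, OneShotLatch completeAcknowledge) {
        this.acknowledgeCheckpointLatch = acknowledgeCheckpointLatch;
        this.completeAcknowledge = completeAcknowledge;
    }

    @Override
    public void acknowledgeCheckpoint(
            JobID jobID, ExecutionAttemptID executionAttemptID, long checkpointId,
            CheckpointMetrics checkpointMetrics, TaskStateSnapshot subtaskState) {
        super.acknowledgeCheckpoint(jobID, executionAttemptID, checkpointId, checkpointMetrics, subtaskState);
        acknowledgeCheckpointLatch.trigger();
        while (true) {
            try {
                completeAcknowledge.await();
                break;
            } catch (InterruptedException e) {
                // survive interruptions from thread pool shutdown, as in the mocked variant
            }
        }
    }
}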
Use of org.apache.flink.runtime.taskmanager.CheckpointResponder in project flink by apache.
The class StreamTaskTest, method testEmptySubtaskStateLeadsToStatelessAcknowledgment.
/**
* FLINK-5985
*
* <p>This test ensures that empty snapshots (no op/keyed state whatsoever) will be reported as
* stateless tasks. This happens by translating an empty {@link SubtaskState} into reporting
* 'null' to #acknowledgeCheckpoint.
*/
@Test
public void testEmptySubtaskStateLeadsToStatelessAcknowledgment() throws Exception {
    // latch blocks until the async checkpoint thread acknowledges
    final OneShotLatch checkpointCompletedLatch = new OneShotLatch();
    final List<SubtaskState> checkpointResult = new ArrayList<>(1);

    CheckpointResponder checkpointResponder = mock(CheckpointResponder.class);
    doAnswer(new Answer() {
        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            SubtaskState subtaskState = invocation.getArgument(4);
            checkpointResult.add(subtaskState);
            checkpointCompletedLatch.trigger();
            return null;
        }
    }).when(checkpointResponder).acknowledgeCheckpoint(
            any(JobID.class), any(ExecutionAttemptID.class), anyLong(),
            any(CheckpointMetrics.class), nullable(TaskStateSnapshot.class));

    TaskStateManager taskStateManager = new TaskStateManagerImpl(
            new JobID(1L, 2L), new ExecutionAttemptID(), mock(TaskLocalStateStoreImpl.class),
            new InMemoryStateChangelogStorage(), null, checkpointResponder);

    // mock the operator with empty snapshot result (all state handles are null)
    OneInputStreamOperator<String, String> statelessOperator =
            streamOperatorWithSnapshot(new OperatorSnapshotFutures());

    try (MockEnvironment mockEnvironment =
            new MockEnvironmentBuilder().setTaskStateManager(taskStateManager).build()) {
        RunningTask<MockStreamTask> task =
                runTask(() -> createMockStreamTask(mockEnvironment, operatorChain(statelessOperator)));
        waitTaskIsRunning(task.streamTask, task.invocationFuture);

        task.streamTask.triggerCheckpointAsync(
                new CheckpointMetaData(42L, 1L), CheckpointOptions.forCheckpointWithDefaultLocation());

        checkpointCompletedLatch.await(30, TimeUnit.SECONDS);

        // ensure that 'null' was acknowledged as subtask state
        Assert.assertNull(checkpointResult.get(0));

        task.streamTask.cancel();
        task.waitForTaskCompletion(true);
    }
}
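Once checkpointCompletedLatch has fired, the same "null means stateless" check could also be expressed with the captor pattern from the previous example instead of collecting the argument inside the Answer. A hedged sketch, using only the Mockito calls already shown above:

// Hypothetical alternative to the List used above: capture the acknowledged snapshot
// after the latch has fired and assert that it is null (stateless acknowledgement).
ArgumentCaptor<TaskStateSnapshot> stateCaptor = ArgumentCaptor.forClass(TaskStateSnapshot.class);
verify(checkpointResponder).acknowledgeCheckpoint(
        any(JobID.class), any(ExecutionAttemptID.class), eq(42L),
        any(CheckpointMetrics.class), stateCaptor.capture());
Assert.assertNull(stateCaptor.getValue());

Note that the latch is still required: verification must not run before the async checkpoint thread has actually invoked acknowledgeCheckpoint.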