Use of org.apache.flink.runtime.clusterframework.types.AllocationID in the project flink by apache.
Found in the class TaskExecutorLocalStateStoresManagerTest, method testSubtaskStateStoreDirectoryCreateAndDelete.
/**
 * Checks that the {@link TaskLocalStateStoreImpl} obtained from a {@link
 * TaskExecutorLocalStateStoresManager} is backed by a correctly laid out local state base
 * directory, and (via {@code checkRootDirsClean}) that the root directories are clean again
 * after an allocation is released and after the manager is shut down.
 */
@Test
public void testSubtaskStateStoreDirectoryCreateAndDelete() throws Exception {
    final JobID jobID = new JobID();
    final JobVertexID jobVertexID = new JobVertexID();
    final AllocationID allocationID = new AllocationID();
    final int subtaskIdx = 23;

    // three independent root directories for the manager to spread checkpoints over
    final File[] rootDirs = new File[3];
    for (int dirIdx = 0; dirIdx < rootDirs.length; ++dirIdx) {
        rootDirs[dirIdx] = temporaryFolder.newFolder();
    }

    final TaskExecutorLocalStateStoresManager storesManager =
            new TaskExecutorLocalStateStoresManager(
                    true, Reference.owned(rootDirs), Executors.directExecutor());

    final TaskLocalStateStore firstStore =
            storesManager.localStateStoreForSubtask(jobID, allocationID, jobVertexID, subtaskIdx);
    final LocalRecoveryDirectoryProvider firstProvider =
            firstStore
                    .getLocalRecoveryConfig()
                    .getLocalStateDirectoryProvider()
                    .orElseThrow(LocalRecoveryConfig.localRecoveryNotEnabled());

    // the allocation base directory is expected to rotate over the root directories
    for (int checkpoint = 0; checkpoint < 10; ++checkpoint) {
        final File expectedRoot =
                rootDirs[(checkpoint & Integer.MAX_VALUE) % rootDirs.length];
        final File expectedBase =
                new File(expectedRoot, storesManager.allocationSubDirString(allocationID));
        Assert.assertEquals(expectedBase, firstProvider.allocationBaseDirectory(checkpoint));
    }

    final long chkId = 42L;
    final File allocBaseDirChk42 = firstProvider.allocationBaseDirectory(chkId);
    final File checkpointDir = firstProvider.subtaskSpecificCheckpointDirectory(chkId);
    final String expectedRelativePath =
            String.join(
                    File.separator,
                    "jid_" + jobID,
                    "vtx_" + jobVertexID + "_sti_" + subtaskIdx,
                    "chk_" + chkId);
    Assert.assertEquals(new File(allocBaseDirChk42, expectedRelativePath), checkpointDir);

    Assert.assertTrue(checkpointDir.mkdirs());
    final File firstMarker = new File(checkpointDir, "test");
    Assert.assertTrue(firstMarker.createNewFile());

    // the local recovery mode of the manager must be visible on the created store
    Assert.assertEquals(
            storesManager.isLocalRecoveryEnabled(),
            firstStore.getLocalRecoveryConfig().isLocalRecoveryEnabled());
    Assert.assertTrue(firstMarker.exists());

    // releasing the allocation id has to clean up the directories
    storesManager.releaseLocalStateForAllocationId(allocationID);
    checkRootDirsClean(rootDirs);

    // create state for a second allocation that is still alive at shutdown time
    final AllocationID otherAllocationID = new AllocationID();
    final TaskLocalStateStore secondStore =
            storesManager.localStateStoreForSubtask(
                    jobID, otherAllocationID, jobVertexID, subtaskIdx);
    final LocalRecoveryDirectoryProvider secondProvider =
            secondStore
                    .getLocalRecoveryConfig()
                    .getLocalStateDirectoryProvider()
                    .orElseThrow(LocalRecoveryConfig.localRecoveryNotEnabled());
    final File chkDir = secondProvider.subtaskSpecificCheckpointDirectory(23L);
    Assert.assertTrue(chkDir.mkdirs());
    final File secondMarker = new File(chkDir, "test");
    Assert.assertTrue(secondMarker.createNewFile());

    // shutting down the manager has to clean up as well
    storesManager.shutdown();
    checkRootDirsClean(rootDirs);
}
Use of org.apache.flink.runtime.clusterframework.types.AllocationID in the project flink by apache.
Found in the class LocalStateForwardingTest, method testReportingFromTaskStateManagerToResponderAndTaskLocalStateStore.
/**
 * Verifies that snapshots handed to the {@link org.apache.flink.runtime.state.TaskStateManager}
 * are passed on both to the {@link org.apache.flink.runtime.taskmanager.CheckpointResponder}
 * and to the {@link TaskLocalStateStoreImpl}.
 */
@Test
public void testReportingFromTaskStateManagerToResponderAndTaskLocalStateStore() throws Exception {
    final int subtaskIdx = 42;
    final JobID jobID = new JobID();
    final JobVertexID jobVertexID = new JobVertexID();
    final AllocationID allocationID = new AllocationID();
    final ExecutionAttemptID executionAttemptID = new ExecutionAttemptID();
    final CheckpointMetaData checkpointMetaData = new CheckpointMetaData(42L, 4711L);
    final CheckpointMetrics checkpointMetrics = new CheckpointMetrics();

    final TaskStateSnapshot jmSnapshot = new TaskStateSnapshot();
    final TaskStateSnapshot tmSnapshot = new TaskStateSnapshot();

    // flags flipped by the two stubs below once they have been invoked
    final AtomicBoolean responderCalled = new AtomicBoolean(false);
    final AtomicBoolean localStoreCalled = new AtomicBoolean(false);

    // responder stub: validates the acknowledged coordinates and records the call
    final TestCheckpointResponder checkpointResponder =
            new TestCheckpointResponder() {
                @Override
                public void acknowledgeCheckpoint(
                        JobID lJobID,
                        ExecutionAttemptID lExecutionAttemptID,
                        long lCheckpointId,
                        CheckpointMetrics lCheckpointMetrics,
                        TaskStateSnapshot lSubtaskState) {
                    Assert.assertEquals(jobID, lJobID);
                    Assert.assertEquals(executionAttemptID, lExecutionAttemptID);
                    Assert.assertEquals(checkpointMetaData.getCheckpointId(), lCheckpointId);
                    Assert.assertEquals(checkpointMetrics, lCheckpointMetrics);
                    responderCalled.set(true);
                }
            };

    final LocalRecoveryConfig localRecoveryConfig =
            new LocalRecoveryConfig(
                    new LocalRecoveryDirectoryProviderImpl(
                            temporaryFolder.newFolder(), jobID, jobVertexID, subtaskIdx));

    // local store stub: validates the stored snapshot and records the call
    final TaskLocalStateStore taskLocalStateStore =
            new TaskLocalStateStoreImpl(
                    jobID,
                    allocationID,
                    jobVertexID,
                    subtaskIdx,
                    localRecoveryConfig,
                    Executors.directExecutor()) {
                @Override
                public void storeLocalState(
                        @Nonnegative long checkpointId, @Nullable TaskStateSnapshot localState) {
                    Assert.assertEquals(tmSnapshot, localState);
                    localStoreCalled.set(true);
                }
            };

    final StateChangelogStorage<?> changelogStorage = new InMemoryStateChangelogStorage();
    final TaskStateManagerImpl taskStateManager =
            new TaskStateManagerImpl(
                    jobID,
                    executionAttemptID,
                    taskLocalStateStore,
                    changelogStorage,
                    null,
                    checkpointResponder);

    taskStateManager.reportTaskStateSnapshots(
            checkpointMetaData, checkpointMetrics, jmSnapshot, tmSnapshot);

    Assert.assertTrue("Reporting for JM state was not called.", responderCalled.get());
    Assert.assertTrue("Reporting for TM state was not called.", localStoreCalled.get());
}
Use of org.apache.flink.runtime.clusterframework.types.AllocationID in the project flink by apache.
Found in the class InterruptSensitiveRestoreTest, method createTask.
// ------------------------------------------------------------------------
// Utilities
// ------------------------------------------------------------------------
/**
 * Builds a {@link Task} running a {@code SourceStreamTask} whose restore state (checkpoint id
 * {@code 1L}) contains the given state handle in exactly one of the four state slots.
 *
 * @param streamConfig stream configuration; its operator ID is set by this method to the ID
 *     derived from the freshly created job vertex ID
 * @param taskConfig configuration passed to the created {@code TaskInformation}
 * @param state the stream state handle wrapped into the operator / keyed state handle
 * @param mode one of {@code OPERATOR_MANAGED}, {@code OPERATOR_RAW}, {@code KEYED_MANAGED} or
 *     {@code KEYED_RAW}, selecting which state slot receives {@code state}
 * @return the assembled task
 * @throws IOException declared by the original signature
 * @throws IllegalArgumentException if {@code mode} is none of the supported values
 */
private static Task createTask(StreamConfig streamConfig, Configuration taskConfig, StreamStateHandle state, int mode) throws IOException {
    Collection<KeyedStateHandle> keyedStateFromBackend = Collections.emptyList();
    Collection<KeyedStateHandle> keyedStateFromStream = Collections.emptyList();
    Collection<OperatorStateHandle> operatorStateBackend = Collections.emptyList();
    Collection<OperatorStateHandle> operatorStateStream = Collections.emptyList();

    Map<String, OperatorStateHandle.StateMetaInfo> operatorStateMetadata = new HashMap<>(1);
    OperatorStateHandle.StateMetaInfo metaInfo =
            new OperatorStateHandle.StateMetaInfo(
                    new long[] {0}, OperatorStateHandle.Mode.SPLIT_DISTRIBUTE);
    operatorStateMetadata.put(DefaultOperatorStateBackend.DEFAULT_OPERATOR_STATE_NAME, metaInfo);

    KeyGroupRangeOffsets keyGroupRangeOffsets =
            new KeyGroupRangeOffsets(new KeyGroupRange(0, 0));

    Collection<OperatorStateHandle> operatorStateHandles =
            Collections.singletonList(
                    new OperatorStreamStateHandle(operatorStateMetadata, state));
    List<KeyedStateHandle> keyedStateHandles =
            Collections.singletonList(new KeyGroupsStateHandle(keyGroupRangeOffsets, state));

    // route the prepared handles into exactly one state slot; all others stay empty
    switch (mode) {
        case OPERATOR_MANAGED:
            operatorStateBackend = operatorStateHandles;
            break;
        case OPERATOR_RAW:
            operatorStateStream = operatorStateHandles;
            break;
        case KEYED_MANAGED:
            keyedStateFromBackend = keyedStateHandles;
            break;
        case KEYED_RAW:
            keyedStateFromStream = keyedStateHandles;
            break;
        default:
            // name the offending value so a misuse is diagnosable from the stack trace alone
            throw new IllegalArgumentException("Unsupported state restore mode: " + mode);
    }

    OperatorSubtaskState operatorSubtaskState =
            OperatorSubtaskState.builder()
                    .setManagedOperatorState(new StateObjectCollection<>(operatorStateBackend))
                    .setRawOperatorState(new StateObjectCollection<>(operatorStateStream))
                    .setManagedKeyedState(new StateObjectCollection<>(keyedStateFromBackend))
                    .setRawKeyedState(new StateObjectCollection<>(keyedStateFromStream))
                    .build();

    JobVertexID jobVertexID = new JobVertexID();
    OperatorID operatorID = OperatorID.fromJobVertexID(jobVertexID);
    streamConfig.setOperatorID(operatorID);

    TaskStateSnapshot stateSnapshot = new TaskStateSnapshot();
    stateSnapshot.putSubtaskStateByOperatorID(operatorID, operatorSubtaskState);
    JobManagerTaskRestore taskRestore = new JobManagerTaskRestore(1L, stateSnapshot);

    JobInformation jobInformation =
            new JobInformation(
                    new JobID(),
                    "test job name",
                    new SerializedValue<>(new ExecutionConfig()),
                    new Configuration(),
                    Collections.emptyList(),
                    Collections.emptyList());
    TaskInformation taskInformation =
            new TaskInformation(
                    jobVertexID,
                    "test task name",
                    1,
                    1,
                    SourceStreamTask.class.getName(),
                    taskConfig);

    TestTaskStateManager taskStateManager =
            TestTaskStateManager.builder()
                    .setReportedCheckpointId(taskRestore.getRestoreCheckpointId())
                    .setJobManagerTaskStateSnapshotsByCheckpointId(
                            Collections.singletonMap(
                                    taskRestore.getRestoreCheckpointId(),
                                    taskRestore.getTaskStateSnapshot()))
                    .build();

    // built only after the mode has been validated, so an invalid mode fails before
    // a shuffle environment is created
    ShuffleEnvironment<?, ?> shuffleEnvironment = new NettyShuffleEnvironmentBuilder().build();

    return new Task(
            jobInformation,
            taskInformation,
            new ExecutionAttemptID(),
            new AllocationID(),
            0,
            0,
            Collections.<ResultPartitionDeploymentDescriptor>emptyList(),
            Collections.<InputGateDeploymentDescriptor>emptyList(),
            mock(MemoryManager.class),
            mock(IOManager.class),
            shuffleEnvironment,
            new KvStateService(new KvStateRegistry(), null, null),
            mock(BroadcastVariableManager.class),
            new TaskEventDispatcher(),
            ExternalResourceInfoProvider.NO_EXTERNAL_RESOURCES,
            taskStateManager,
            mock(TaskManagerActions.class),
            mock(InputSplitProvider.class),
            mock(CheckpointResponder.class),
            new NoOpTaskOperatorEventGateway(),
            new TestGlobalAggregateManager(),
            TestingClassLoaderLease.newBuilder().build(),
            new FileCache(
                    new String[] {EnvironmentInformation.getTemporaryFileDirectory()},
                    VoidPermanentBlobService.INSTANCE),
            new TestingTaskManagerRuntimeInfo(),
            UnregisteredMetricGroups.createUnregisteredTaskMetricGroup(),
            new NoOpResultPartitionConsumableNotifier(),
            mock(PartitionProducerStateChecker.class),
            mock(Executor.class));
}
Use of org.apache.flink.runtime.clusterframework.types.AllocationID in the project flink by apache.
Found in the class TaskExecutorRecoveryTest, method testRecoveredTaskExecutorWillRestoreAllocationState.
/**
 * Starts a {@code TaskExecutor} with two slots and local recovery enabled, allocates one of
 * its slots, closes it, and then starts a second {@code TaskExecutor} with the same resource
 * ID and working directory. The test expects the slot report of the second executor to carry
 * the job ID and allocation ID of the previously allocated slot, and expects exactly one slot
 * with that allocation ID to be offered to the job master.
 *
 * <p>Both executors are managed with try-with-resources so that neither is left running when
 * an assertion or a call fails.
 *
 * @param tempDir temporary directory backing the working directory shared by both executors
 */
@Test
public void testRecoveredTaskExecutorWillRestoreAllocationState(@TempDir File tempDir) throws Exception {
    final ResourceID resourceId = ResourceID.generate();
    final Configuration configuration = new Configuration();
    configuration.set(TaskManagerOptions.NUM_TASK_SLOTS, 2);
    configuration.set(CheckpointingOptions.LOCAL_RECOVERY, true);

    // every slot report sent to the resource manager is captured in this queue
    final TestingResourceManagerGateway testingResourceManagerGateway = new TestingResourceManagerGateway();
    final ArrayBlockingQueue<TaskExecutorSlotReport> queue = new ArrayBlockingQueue<>(2);
    testingResourceManagerGateway.setSendSlotReportFunction(slotReportInformation -> {
        queue.offer(TaskExecutorSlotReport.create(slotReportInformation.f0, slotReportInformation.f2));
        return CompletableFuture.completedFuture(Acknowledge.get());
    });

    final TestingRpcService rpcService = rpcServiceExtension.getTestingRpcService();
    rpcService.registerGateway(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);

    final JobID jobId = new JobID();
    final TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
    highAvailabilityServices.setResourceManagerLeaderRetriever(new SettableLeaderRetrievalService(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway.getFencingToken().toUUID()));
    final SettableLeaderRetrievalService jobMasterLeaderRetriever = new SettableLeaderRetrievalService();
    highAvailabilityServices.setJobMasterLeaderRetriever(jobId, jobMasterLeaderRetriever);

    final WorkingDirectory workingDirectory = WorkingDirectory.create(tempDir);
    final AllocationID allocationId = new AllocationID();
    final SlotID allocatedSlotID;

    // first incarnation: allocate one slot; leaving the try block closes the executor at the
    // same point where the explicit close() used to be, and also on any failure in between
    try (final TaskExecutor taskExecutor = TaskExecutorBuilder.newBuilder(rpcService, highAvailabilityServices, workingDirectory).setConfiguration(configuration).setResourceId(resourceId).build()) {
        taskExecutor.start();
        final TaskExecutorGateway taskExecutorGateway = taskExecutor.getSelfGateway(TaskExecutorGateway.class);
        final TaskExecutorSlotReport taskExecutorSlotReport = queue.take();
        final SlotReport slotReport = taskExecutorSlotReport.getSlotReport();
        assertThat(slotReport.getNumSlotStatus(), is(2));
        final SlotStatus slotStatus = slotReport.iterator().next();
        allocatedSlotID = slotStatus.getSlotID();
        taskExecutorGateway.requestSlot(allocatedSlotID, jobId, allocationId, slotStatus.getResourceProfile(), "localhost", testingResourceManagerGateway.getFencingToken(), Time.seconds(10L)).join();
    }

    // the job master becomes reachable only after the first executor is gone
    final BlockingQueue<Collection<SlotOffer>> offeredSlots = new ArrayBlockingQueue<>(1);
    final TestingJobMasterGateway jobMasterGateway = new TestingJobMasterGatewayBuilder().setOfferSlotsFunction((resourceID, slotOffers) -> {
        offeredSlots.offer(new HashSet<>(slotOffers));
        return CompletableFuture.completedFuture(slotOffers);
    }).build();
    rpcService.registerGateway(jobMasterGateway.getAddress(), jobMasterGateway);
    jobMasterLeaderRetriever.notifyListener(jobMasterGateway.getAddress(), jobMasterGateway.getFencingToken().toUUID());

    // recover the TaskExecutor; it is closed when the block exits instead of being left running
    try (final TaskExecutor recoveredTaskExecutor = TaskExecutorBuilder.newBuilder(rpcService, highAvailabilityServices, workingDirectory).setConfiguration(configuration).setResourceId(resourceId).build()) {
        recoveredTaskExecutor.start();
        final TaskExecutorSlotReport recoveredSlotReport = queue.take();
        for (SlotStatus status : recoveredSlotReport.getSlotReport()) {
            if (status.getSlotID().equals(allocatedSlotID)) {
                assertThat(status.getJobID(), is(jobId));
                assertThat(status.getAllocationID(), is(allocationId));
            } else {
                assertThat(status.getJobID(), is(nullValue()));
            }
        }
        final Collection<SlotOffer> take = offeredSlots.take();
        assertThat(take, hasSize(1));
        final SlotOffer offeredSlot = take.iterator().next();
        assertThat(offeredSlot.getAllocationId(), is(allocationId));
    }
}
Use of org.apache.flink.runtime.clusterframework.types.AllocationID in the project flink by apache.
Found in the class TaskExecutorSlotLifetimeTest, method testUserCodeClassLoaderIsBoundToSlot.
/**
 * Verifies that the user code class loader lives as long as the slot does: two tasks that are
 * submitted one after the other into the same slot must observe the very same class loader
 * instance, as would happen across a failover. See FLINK-16408.
 */
@Test
public void testUserCodeClassLoaderIsBoundToSlot() throws Exception {
    final Configuration configuration = new Configuration();
    final TestingRpcService rpcService = TESTING_RPC_SERVICE_RESOURCE.getTestingRpcService();

    // resource manager stub: completes the future with the first slot report it receives
    final TestingResourceManagerGateway rmGateway = new TestingResourceManagerGateway();
    final CompletableFuture<SlotReport> initialSlotReport = new CompletableFuture<>();
    rmGateway.setSendSlotReportFunction(
            report -> {
                initialSlotReport.complete(report.f2);
                return CompletableFuture.completedFuture(Acknowledge.get());
            });

    // job master stub: accepts all offered slots and records task state updates
    final BlockingQueue<TaskExecutionState> reportedStates = new ArrayBlockingQueue<>(3);
    final OneShotLatch slotOffered = new OneShotLatch();
    final TestingJobMasterGateway jmGateway =
            new TestingJobMasterGatewayBuilder()
                    .setOfferSlotsFunction(
                            (resourceID, slotOffers) -> {
                                slotOffered.trigger();
                                return CompletableFuture.completedFuture(slotOffers);
                            })
                    .setUpdateTaskExecutionStateFunction(
                            FunctionUtils.uncheckedFunction(
                                    state -> {
                                        reportedStates.put(state);
                                        return CompletableFuture.completedFuture(
                                                Acknowledge.get());
                                    }))
                    .build();

    final LeaderRetrievalService rmLeaderRetriever =
            new SettableLeaderRetrievalService(
                    rmGateway.getAddress(), rmGateway.getFencingToken().toUUID());
    final LeaderRetrievalService jmLeaderRetriever =
            new SettableLeaderRetrievalService(
                    jmGateway.getAddress(), jmGateway.getFencingToken().toUUID());
    final TestingHighAvailabilityServices haServices =
            new TestingHighAvailabilityServicesBuilder()
                    .setResourceManagerLeaderRetriever(rmLeaderRetriever)
                    .setJobMasterLeaderRetrieverFunction(ignored -> jmLeaderRetriever)
                    .build();

    rpcService.registerGateway(rmGateway.getAddress(), rmGateway);
    rpcService.registerGateway(jmGateway.getAddress(), jmGateway);

    final LocalUnresolvedTaskManagerLocation taskManagerLocation =
            new LocalUnresolvedTaskManagerLocation();

    try (final TaskExecutor taskExecutor =
            createTaskExecutor(configuration, rpcService, haServices, taskManagerLocation)) {
        taskExecutor.start();

        final SlotID firstSlotId = initialSlotReport.join().iterator().next().getSlotID();
        final TaskExecutorGateway gateway =
                taskExecutor.getSelfGateway(TaskExecutorGateway.class);

        final JobID jobId = new JobID();
        final AllocationID allocationId = new AllocationID();
        gateway.requestSlot(
                        firstSlotId,
                        jobId,
                        allocationId,
                        ResourceProfile.ZERO,
                        jmGateway.getAddress(),
                        rmGateway.getFencingToken(),
                        RpcUtils.INF_TIMEOUT)
                .join();

        final TaskDeploymentDescriptor tdd =
                TaskDeploymentDescriptorBuilder.newBuilder(
                                jobId, UserClassLoaderExtractingInvokable.class)
                        .setAllocationId(allocationId)
                        .build();

        slotOffered.await();

        gateway.submitTask(tdd, jmGateway.getFencingToken(), RpcUtils.INF_TIMEOUT).join();
        final ClassLoader firstClassLoader = UserClassLoaderExtractingInvokable.take();

        // drain state updates until the first task has reached a terminal state
        TaskExecutionState latestState = reportedStates.take();
        while (!latestState.getExecutionState().isTerminal()) {
            latestState = reportedStates.take();
        }

        // a second task in the same slot has to see the same class loader again
        gateway.submitTask(tdd, jmGateway.getFencingToken(), RpcUtils.INF_TIMEOUT).join();
        final ClassLoader secondClassLoader = UserClassLoaderExtractingInvokable.take();

        assertThat(firstClassLoader, sameInstance(secondClassLoader));
    }
}
Aggregations