Search in sources :

Example 66 with AllocationID

Use of org.apache.flink.runtime.clusterframework.types.AllocationID in the Apache Flink project.

In class TaskExecutorLocalStateStoresManagerTest, method testSubtaskStateStoreDirectoryCreateAndDelete:

/**
 * Verifies that a store obtained from {@link TaskExecutorLocalStateStoresManager} exposes a
 * {@link LocalRecoveryDirectoryProvider} whose allocation base directories rotate over the
 * configured root directories, and that {@code checkRootDirsClean} passes both after the
 * allocation is released and after the manager is shut down.
 */
@Test
public void testSubtaskStateStoreDirectoryCreateAndDelete() throws Exception {
    final int subtaskIdx = 23;
    final long chkId = 42L;
    final JobID jobID = new JobID();
    final JobVertexID jobVertexID = new JobVertexID();
    final AllocationID allocationID = new AllocationID();

    // three independent root directories for the manager to spread state over
    final File[] rootDirs = new File[3];
    for (int dirIdx = 0; dirIdx < rootDirs.length; ++dirIdx) {
        rootDirs[dirIdx] = temporaryFolder.newFolder();
    }

    final TaskExecutorLocalStateStoresManager storesManager =
            new TaskExecutorLocalStateStoresManager(
                    true, Reference.owned(rootDirs), Executors.directExecutor());
    final TaskLocalStateStore firstStore =
            storesManager.localStateStoreForSubtask(jobID, allocationID, jobVertexID, subtaskIdx);
    final LocalRecoveryDirectoryProvider firstProvider =
            firstStore
                    .getLocalRecoveryConfig()
                    .getLocalStateDirectoryProvider()
                    .orElseThrow(LocalRecoveryConfig.localRecoveryNotEnabled());

    // the base directory for checkpoint N must live under root (N mod #roots)
    for (int checkpoint = 0; checkpoint < 10; ++checkpoint) {
        final File expectedRoot = rootDirs[(checkpoint & Integer.MAX_VALUE) % rootDirs.length];
        final File expectedBaseDir =
                new File(expectedRoot, storesManager.allocationSubDirString(allocationID));
        Assert.assertEquals(expectedBaseDir, firstProvider.allocationBaseDirectory(checkpoint));
    }

    final File allocBaseDirChk42 = firstProvider.allocationBaseDirectory(chkId);
    final File checkpointDir = firstProvider.subtaskSpecificCheckpointDirectory(chkId);
    final String expectedRelativePath =
            "jid_" + jobID + File.separator + "vtx_" + jobVertexID + "_" + "sti_" + subtaskIdx + File.separator + "chk_" + chkId;
    Assert.assertEquals(new File(allocBaseDirChk42, expectedRelativePath), checkpointDir);

    Assert.assertTrue(checkpointDir.mkdirs());
    final File firstMarker = new File(checkpointDir, "test");
    Assert.assertTrue(firstMarker.createNewFile());

    // the store must mirror the manager's local recovery setting
    Assert.assertEquals(
            storesManager.isLocalRecoveryEnabled(),
            firstStore.getLocalRecoveryConfig().isLocalRecoveryEnabled());
    Assert.assertTrue(firstMarker.exists());

    // releasing the allocation has to wipe everything below the roots
    storesManager.releaseLocalStateForAllocationId(allocationID);
    checkRootDirsClean(rootDirs);

    // repeat with a fresh allocation, this time cleaning up through shutdown
    final AllocationID otherAllocationID = new AllocationID();
    final TaskLocalStateStore secondStore =
            storesManager.localStateStoreForSubtask(jobID, otherAllocationID, jobVertexID, subtaskIdx);
    final LocalRecoveryDirectoryProvider secondProvider =
            secondStore
                    .getLocalRecoveryConfig()
                    .getLocalStateDirectoryProvider()
                    .orElseThrow(LocalRecoveryConfig.localRecoveryNotEnabled());
    final File chkDir = secondProvider.subtaskSpecificCheckpointDirectory(23L);
    Assert.assertTrue(chkDir.mkdirs());
    final File secondMarker = new File(chkDir, "test");
    Assert.assertTrue(secondMarker.createNewFile());

    storesManager.shutdown();
    checkRootDirsClean(rootDirs);
}
Also used : JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) File(java.io.File) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 67 with AllocationID

Use of org.apache.flink.runtime.clusterframework.types.AllocationID in the Apache Flink project.

In class LocalStateForwardingTest, method testReportingFromTaskStateManagerToResponderAndTaskLocalStateStore:

/**
 * Verifies that snapshots handed to {@link org.apache.flink.runtime.state.TaskStateManager} are
 * forwarded to both the {@link org.apache.flink.runtime.taskmanager.CheckpointResponder} and
 * the {@link TaskLocalStateStoreImpl}. Each collaborator is replaced by a subclass that checks
 * the arguments it receives and raises a flag when invoked.
 */
@Test
public void testReportingFromTaskStateManagerToResponderAndTaskLocalStateStore() throws Exception {
    final int subtaskIdx = 42;
    final JobID jobID = new JobID();
    final JobVertexID jobVertexID = new JobVertexID();
    final AllocationID allocationID = new AllocationID();
    final ExecutionAttemptID executionAttemptID = new ExecutionAttemptID();
    final CheckpointMetaData checkpointMetaData = new CheckpointMetaData(42L, 4711L);
    final CheckpointMetrics checkpointMetrics = new CheckpointMetrics();
    final TaskStateSnapshot jmSnapshot = new TaskStateSnapshot();
    final TaskStateSnapshot tmSnapshot = new TaskStateSnapshot();

    // flags flipped by the two instrumented collaborators below
    final AtomicBoolean responderNotified = new AtomicBoolean(false);
    final AtomicBoolean localStoreNotified = new AtomicBoolean(false);

    final TestCheckpointResponder responder = new TestCheckpointResponder() {

        @Override
        public void acknowledgeCheckpoint(JobID ackJobId, ExecutionAttemptID ackAttemptId, long ackCheckpointId, CheckpointMetrics ackMetrics, TaskStateSnapshot ackState) {
            Assert.assertEquals(jobID, ackJobId);
            Assert.assertEquals(executionAttemptID, ackAttemptId);
            Assert.assertEquals(checkpointMetaData.getCheckpointId(), ackCheckpointId);
            Assert.assertEquals(checkpointMetrics, ackMetrics);
            responderNotified.set(true);
        }
    };

    final LocalRecoveryDirectoryProviderImpl directoryProvider =
            new LocalRecoveryDirectoryProviderImpl(
                    temporaryFolder.newFolder(), jobID, jobVertexID, subtaskIdx);
    final LocalRecoveryConfig localRecoveryConfig = new LocalRecoveryConfig(directoryProvider);
    final TaskLocalStateStore localStore = new TaskLocalStateStoreImpl(jobID, allocationID, jobVertexID, subtaskIdx, localRecoveryConfig, Executors.directExecutor()) {

        @Override
        public void storeLocalState(@Nonnegative long checkpointId, @Nullable TaskStateSnapshot localState) {
            Assert.assertEquals(tmSnapshot, localState);
            localStoreNotified.set(true);
        }
    };

    final StateChangelogStorage<?> changelogStorage = new InMemoryStateChangelogStorage();
    final TaskStateManagerImpl stateManager =
            new TaskStateManagerImpl(
                    jobID, executionAttemptID, localStore, changelogStorage, null, responder);

    stateManager.reportTaskStateSnapshots(checkpointMetaData, checkpointMetrics, jmSnapshot, tmSnapshot);

    Assert.assertTrue("Reporting for JM state was not called.", responderNotified.get());
    Assert.assertTrue("Reporting for TM state was not called.", localStoreNotified.get());
}
Also used : TaskStateManagerImpl(org.apache.flink.runtime.state.TaskStateManagerImpl) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) TaskLocalStateStore(org.apache.flink.runtime.state.TaskLocalStateStore) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) CheckpointMetrics(org.apache.flink.runtime.checkpoint.CheckpointMetrics) LocalRecoveryDirectoryProviderImpl(org.apache.flink.runtime.state.LocalRecoveryDirectoryProviderImpl) LocalRecoveryConfig(org.apache.flink.runtime.state.LocalRecoveryConfig) CheckpointMetaData(org.apache.flink.runtime.checkpoint.CheckpointMetaData) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) TaskStateSnapshot(org.apache.flink.runtime.checkpoint.TaskStateSnapshot) Executor(java.util.concurrent.Executor) InMemoryStateChangelogStorage(org.apache.flink.runtime.state.changelog.inmemory.InMemoryStateChangelogStorage) TaskLocalStateStoreImpl(org.apache.flink.runtime.state.TaskLocalStateStoreImpl) Nonnegative(javax.annotation.Nonnegative) TestCheckpointResponder(org.apache.flink.runtime.taskmanager.TestCheckpointResponder) JobID(org.apache.flink.api.common.JobID) Nullable(javax.annotation.Nullable) Test(org.junit.Test)

Example 68 with AllocationID

Use of org.apache.flink.runtime.clusterframework.types.AllocationID in the Apache Flink project.

In class InterruptSensitiveRestoreTest, method createTask:

// ------------------------------------------------------------------------
// Utilities
// ------------------------------------------------------------------------
/**
 * Builds a {@link Task} running {@link SourceStreamTask} whose restore state places the given
 * stream state handle into exactly one of the four state categories (managed/raw operator
 * state, managed/raw keyed state); the other three categories stay empty.
 *
 * @param streamConfig stream configuration; its operator id is overwritten with an id derived
 *     from the freshly created job vertex id
 * @param taskConfig configuration passed to the {@link TaskInformation}
 * @param state state handle wrapped into the operator or keyed state handle
 * @param mode one of {@code OPERATOR_MANAGED}, {@code OPERATOR_RAW}, {@code KEYED_MANAGED} or
 *     {@code KEYED_RAW}, selecting the category that receives {@code state}
 * @return the assembled task; most collaborators are mocks or no-op test implementations
 * @throws IOException declared by the original signature; presumably raised while constructing
 *     one of the collaborators (not determinable from this method alone)
 * @throws IllegalArgumentException if {@code mode} is none of the four supported constants
 */
private static Task createTask(StreamConfig streamConfig, Configuration taskConfig, StreamStateHandle state, int mode) throws IOException {
    Collection<KeyedStateHandle> keyedStateFromBackend = Collections.emptyList();
    Collection<KeyedStateHandle> keyedStateFromStream = Collections.emptyList();
    Collection<OperatorStateHandle> operatorStateBackend = Collections.emptyList();
    Collection<OperatorStateHandle> operatorStateStream = Collections.emptyList();
    Map<String, OperatorStateHandle.StateMetaInfo> operatorStateMetadata = new HashMap<>(1);
    OperatorStateHandle.StateMetaInfo metaInfo = new OperatorStateHandle.StateMetaInfo(new long[] { 0 }, OperatorStateHandle.Mode.SPLIT_DISTRIBUTE);
    operatorStateMetadata.put(DefaultOperatorStateBackend.DEFAULT_OPERATOR_STATE_NAME, metaInfo);
    KeyGroupRangeOffsets keyGroupRangeOffsets = new KeyGroupRangeOffsets(new KeyGroupRange(0, 0));
    Collection<OperatorStateHandle> operatorStateHandles = Collections.singletonList(new OperatorStreamStateHandle(operatorStateMetadata, state));
    List<KeyedStateHandle> keyedStateHandles = Collections.singletonList(new KeyGroupsStateHandle(keyGroupRangeOffsets, state));
    // route the prepared handle into the single category selected by the caller
    switch(mode) {
        case OPERATOR_MANAGED:
            operatorStateBackend = operatorStateHandles;
            break;
        case OPERATOR_RAW:
            operatorStateStream = operatorStateHandles;
            break;
        case KEYED_MANAGED:
            keyedStateFromBackend = keyedStateHandles;
            break;
        case KEYED_RAW:
            keyedStateFromStream = keyedStateHandles;
            break;
        default:
            // name the offending value so a wrong call site is easy to diagnose
            throw new IllegalArgumentException("Unknown restore mode: " + mode);
    }
    OperatorSubtaskState operatorSubtaskState = OperatorSubtaskState.builder().setManagedOperatorState(new StateObjectCollection<>(operatorStateBackend)).setRawOperatorState(new StateObjectCollection<>(operatorStateStream)).setManagedKeyedState(new StateObjectCollection<>(keyedStateFromBackend)).setRawKeyedState(new StateObjectCollection<>(keyedStateFromStream)).build();
    JobVertexID jobVertexID = new JobVertexID();
    OperatorID operatorID = OperatorID.fromJobVertexID(jobVertexID);
    streamConfig.setOperatorID(operatorID);
    TaskStateSnapshot stateSnapshot = new TaskStateSnapshot();
    stateSnapshot.putSubtaskStateByOperatorID(operatorID, operatorSubtaskState);
    JobManagerTaskRestore taskRestore = new JobManagerTaskRestore(1L, stateSnapshot);
    JobInformation jobInformation = new JobInformation(new JobID(), "test job name", new SerializedValue<>(new ExecutionConfig()), new Configuration(), Collections.emptyList(), Collections.emptyList());
    TaskInformation taskInformation = new TaskInformation(jobVertexID, "test task name", 1, 1, SourceStreamTask.class.getName(), taskConfig);
    TestTaskStateManager taskStateManager = TestTaskStateManager.builder().setReportedCheckpointId(taskRestore.getRestoreCheckpointId()).setJobManagerTaskStateSnapshotsByCheckpointId(Collections.singletonMap(taskRestore.getRestoreCheckpointId(), taskRestore.getTaskStateSnapshot())).build();
    // built only after the mode has been validated, so an invalid mode leaves no environment behind
    ShuffleEnvironment<?, ?> shuffleEnvironment = new NettyShuffleEnvironmentBuilder().build();
    return new Task(jobInformation, taskInformation, new ExecutionAttemptID(), new AllocationID(), 0, 0, Collections.<ResultPartitionDeploymentDescriptor>emptyList(), Collections.<InputGateDeploymentDescriptor>emptyList(), mock(MemoryManager.class), mock(IOManager.class), shuffleEnvironment, new KvStateService(new KvStateRegistry(), null, null), mock(BroadcastVariableManager.class), new TaskEventDispatcher(), ExternalResourceInfoProvider.NO_EXTERNAL_RESOURCES, taskStateManager, mock(TaskManagerActions.class), mock(InputSplitProvider.class), mock(CheckpointResponder.class), new NoOpTaskOperatorEventGateway(), new TestGlobalAggregateManager(), TestingClassLoaderLease.newBuilder().build(), new FileCache(new String[] { EnvironmentInformation.getTemporaryFileDirectory() }, VoidPermanentBlobService.INSTANCE), new TestingTaskManagerRuntimeInfo(), UnregisteredMetricGroups.createUnregisteredTaskMetricGroup(), new NoOpResultPartitionConsumableNotifier(), mock(PartitionProducerStateChecker.class), mock(Executor.class));
}
Also used : KvStateRegistry(org.apache.flink.runtime.query.KvStateRegistry) Task(org.apache.flink.runtime.taskmanager.Task) Configuration(org.apache.flink.configuration.Configuration) HashMap(java.util.HashMap) KeyGroupRangeOffsets(org.apache.flink.runtime.state.KeyGroupRangeOffsets) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) NettyShuffleEnvironmentBuilder(org.apache.flink.runtime.io.network.NettyShuffleEnvironmentBuilder) OperatorSubtaskState(org.apache.flink.runtime.checkpoint.OperatorSubtaskState) TaskManagerActions(org.apache.flink.runtime.taskmanager.TaskManagerActions) NoOpTaskOperatorEventGateway(org.apache.flink.runtime.taskmanager.NoOpTaskOperatorEventGateway) TestingTaskManagerRuntimeInfo(org.apache.flink.runtime.util.TestingTaskManagerRuntimeInfo) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) PartitionProducerStateChecker(org.apache.flink.runtime.taskexecutor.PartitionProducerStateChecker) TaskInformation(org.apache.flink.runtime.executiongraph.TaskInformation) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) TestGlobalAggregateManager(org.apache.flink.runtime.taskexecutor.TestGlobalAggregateManager) FileCache(org.apache.flink.runtime.filecache.FileCache) StateObjectCollection(org.apache.flink.runtime.checkpoint.StateObjectCollection) OperatorStreamStateHandle(org.apache.flink.runtime.state.OperatorStreamStateHandle) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) JobID(org.apache.flink.api.common.JobID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) JobManagerTaskRestore(org.apache.flink.runtime.checkpoint.JobManagerTaskRestore) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) KeyedStateHandle(org.apache.flink.runtime.state.KeyedStateHandle) KeyGroupsStateHandle(org.apache.flink.runtime.state.KeyGroupsStateHandle) 
KvStateService(org.apache.flink.runtime.taskexecutor.KvStateService) TaskStateSnapshot(org.apache.flink.runtime.checkpoint.TaskStateSnapshot) Executor(java.util.concurrent.Executor) InputSplitProvider(org.apache.flink.runtime.jobgraph.tasks.InputSplitProvider) JobInformation(org.apache.flink.runtime.executiongraph.JobInformation) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) CheckpointResponder(org.apache.flink.runtime.taskmanager.CheckpointResponder) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) NoOpResultPartitionConsumableNotifier(org.apache.flink.runtime.io.network.partition.NoOpResultPartitionConsumableNotifier) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) TestTaskStateManager(org.apache.flink.runtime.state.TestTaskStateManager) TaskEventDispatcher(org.apache.flink.runtime.io.network.TaskEventDispatcher)

Example 69 with AllocationID

Use of org.apache.flink.runtime.clusterframework.types.AllocationID in the Apache Flink project.

In class TaskExecutorRecoveryTest, method testRecoveredTaskExecutorWillRestoreAllocationState:

/**
 * Checks that a {@link TaskExecutor} started on the working directory of a previously closed
 * executor (same resource id and configuration, local recovery enabled) reports the earlier
 * slot allocation in its slot report and offers that slot to the job master.
 *
 * <p>Both executors are managed with try-with-resources so that they are closed even when a
 * call or an assertion inside the block throws.
 *
 * @param tempDir temporary directory backing the shared {@link WorkingDirectory}
 */
@Test
public void testRecoveredTaskExecutorWillRestoreAllocationState(@TempDir File tempDir) throws Exception {
    final ResourceID resourceId = ResourceID.generate();
    final Configuration configuration = new Configuration();
    configuration.set(TaskManagerOptions.NUM_TASK_SLOTS, 2);
    configuration.set(CheckpointingOptions.LOCAL_RECOVERY, true);
    final TestingResourceManagerGateway testingResourceManagerGateway = new TestingResourceManagerGateway();
    // collects the slot reports sent by the first and by the recovered executor
    final ArrayBlockingQueue<TaskExecutorSlotReport> queue = new ArrayBlockingQueue<>(2);
    testingResourceManagerGateway.setSendSlotReportFunction(slotReportInformation -> {
        queue.offer(TaskExecutorSlotReport.create(slotReportInformation.f0, slotReportInformation.f2));
        return CompletableFuture.completedFuture(Acknowledge.get());
    });
    final TestingRpcService rpcService = rpcServiceExtension.getTestingRpcService();
    rpcService.registerGateway(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);
    final JobID jobId = new JobID();
    final TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
    highAvailabilityServices.setResourceManagerLeaderRetriever(new SettableLeaderRetrievalService(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway.getFencingToken().toUUID()));
    final SettableLeaderRetrievalService jobMasterLeaderRetriever = new SettableLeaderRetrievalService();
    highAvailabilityServices.setJobMasterLeaderRetriever(jobId, jobMasterLeaderRetriever);
    final WorkingDirectory workingDirectory = WorkingDirectory.create(tempDir);
    final AllocationID allocationId = new AllocationID();
    final SlotID allocatedSlotID;
    // first incarnation: allocate one slot, then close the executor on leaving the block
    try (final TaskExecutor taskExecutor = TaskExecutorBuilder.newBuilder(rpcService, highAvailabilityServices, workingDirectory).setConfiguration(configuration).setResourceId(resourceId).build()) {
        taskExecutor.start();
        final TaskExecutorGateway taskExecutorGateway = taskExecutor.getSelfGateway(TaskExecutorGateway.class);
        final TaskExecutorSlotReport taskExecutorSlotReport = queue.take();
        final SlotReport slotReport = taskExecutorSlotReport.getSlotReport();
        assertThat(slotReport.getNumSlotStatus(), is(2));
        final SlotStatus slotStatus = slotReport.iterator().next();
        allocatedSlotID = slotStatus.getSlotID();
        taskExecutorGateway.requestSlot(allocatedSlotID, jobId, allocationId, slotStatus.getResourceProfile(), "localhost", testingResourceManagerGateway.getFencingToken(), Time.seconds(10L)).join();
    }
    final BlockingQueue<Collection<SlotOffer>> offeredSlots = new ArrayBlockingQueue<>(1);
    final TestingJobMasterGateway jobMasterGateway = new TestingJobMasterGatewayBuilder().setOfferSlotsFunction((resourceID, slotOffers) -> {
        offeredSlots.offer(new HashSet<>(slotOffers));
        return CompletableFuture.completedFuture(slotOffers);
    }).build();
    rpcService.registerGateway(jobMasterGateway.getAddress(), jobMasterGateway);
    jobMasterLeaderRetriever.notifyListener(jobMasterGateway.getAddress(), jobMasterGateway.getFencingToken().toUUID());
    // recover the TaskExecutor; try-with-resources closes it once the checks are done
    try (final TaskExecutor recoveredTaskExecutor = TaskExecutorBuilder.newBuilder(rpcService, highAvailabilityServices, workingDirectory).setConfiguration(configuration).setResourceId(resourceId).build()) {
        recoveredTaskExecutor.start();
        final TaskExecutorSlotReport recoveredSlotReport = queue.take();
        for (SlotStatus status : recoveredSlotReport.getSlotReport()) {
            if (status.getSlotID().equals(allocatedSlotID)) {
                // the previously allocated slot must come back with its job and allocation
                assertThat(status.getJobID(), is(jobId));
                assertThat(status.getAllocationID(), is(allocationId));
            } else {
                assertThat(status.getJobID(), is(nullValue()));
            }
        }
        final Collection<SlotOffer> take = offeredSlots.take();
        assertThat(take, hasSize(1));
        final SlotOffer offeredSlot = take.iterator().next();
        assertThat(offeredSlot.getAllocationId(), is(allocationId));
    }
}
Also used : TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) CompletableFuture(java.util.concurrent.CompletableFuture) EachCallbackWrapper(org.apache.flink.core.testutils.EachCallbackWrapper) TestingJobMasterGateway(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway) HashSet(java.util.HashSet) TestLoggerExtension(org.apache.flink.util.TestLoggerExtension) TaskManagerOptions(org.apache.flink.configuration.TaskManagerOptions) SettableLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService) TestingRpcServiceExtension(org.apache.flink.runtime.rpc.TestingRpcServiceExtension) ExtendWith(org.junit.jupiter.api.extension.ExtendWith) RegisterExtension(org.junit.jupiter.api.extension.RegisterExtension) Matchers.nullValue(org.hamcrest.Matchers.nullValue) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) TestingJobMasterGatewayBuilder(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGatewayBuilder) Matchers.hasSize(org.hamcrest.Matchers.hasSize) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) Collection(java.util.Collection) Configuration(org.apache.flink.configuration.Configuration) BlockingQueue(java.util.concurrent.BlockingQueue) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) File(java.io.File) CheckpointingOptions(org.apache.flink.configuration.CheckpointingOptions) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) Test(org.junit.jupiter.api.Test) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) JobID(org.apache.flink.api.common.JobID) WorkingDirectory(org.apache.flink.runtime.entrypoint.WorkingDirectory) TempDir(org.junit.jupiter.api.io.TempDir) Matchers.is(org.hamcrest.Matchers.is) 
TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) Time(org.apache.flink.api.common.time.Time) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) Configuration(org.apache.flink.configuration.Configuration) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) SettableLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) TestingJobMasterGatewayBuilder(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGatewayBuilder) HashSet(java.util.HashSet) WorkingDirectory(org.apache.flink.runtime.entrypoint.WorkingDirectory) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) TestingJobMasterGateway(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) Collection(java.util.Collection) JobID(org.apache.flink.api.common.JobID) Test(org.junit.jupiter.api.Test)

Example 70 with AllocationID

Use of org.apache.flink.runtime.clusterframework.types.AllocationID in the Apache Flink project.

In class TaskExecutorSlotLifetimeTest, method testUserCodeClassLoaderIsBoundToSlot:

/**
 * Verifies that the user code class loader lives as long as the slot: two tasks submitted
 * one after the other into the same allocation must observe the very same class loader
 * instance. See FLINK-16408.
 */
@Test
public void testUserCodeClassLoaderIsBoundToSlot() throws Exception {
    final Configuration configuration = new Configuration();
    final TestingRpcService rpcService = TESTING_RPC_SERVICE_RESOURCE.getTestingRpcService();

    // resource manager stub: hands the slot report it receives to the test
    final TestingResourceManagerGateway resourceManagerGateway = new TestingResourceManagerGateway();
    final CompletableFuture<SlotReport> firstSlotReportFuture = new CompletableFuture<>();
    resourceManagerGateway.setSendSlotReportFunction(reportTuple -> {
        firstSlotReportFuture.complete(reportTuple.f2);
        return CompletableFuture.completedFuture(Acknowledge.get());
    });

    // job master stub: accepts every offered slot and records task state updates
    final BlockingQueue<TaskExecutionState> taskExecutionStates = new ArrayBlockingQueue<>(3);
    final OneShotLatch slotsOfferedLatch = new OneShotLatch();
    final TestingJobMasterGateway jobMasterGateway = new TestingJobMasterGatewayBuilder().setOfferSlotsFunction((ignoredResourceId, offers) -> {
        slotsOfferedLatch.trigger();
        return CompletableFuture.completedFuture(offers);
    }).setUpdateTaskExecutionStateFunction(FunctionUtils.uncheckedFunction(stateUpdate -> {
        taskExecutionStates.put(stateUpdate);
        return CompletableFuture.completedFuture(Acknowledge.get());
    })).build();

    final LeaderRetrievalService resourceManagerLeaderRetriever =
            new SettableLeaderRetrievalService(
                    resourceManagerGateway.getAddress(),
                    resourceManagerGateway.getFencingToken().toUUID());
    final LeaderRetrievalService jobMasterLeaderRetriever =
            new SettableLeaderRetrievalService(
                    jobMasterGateway.getAddress(), jobMasterGateway.getFencingToken().toUUID());
    final TestingHighAvailabilityServices haServices =
            new TestingHighAvailabilityServicesBuilder()
                    .setResourceManagerLeaderRetriever(resourceManagerLeaderRetriever)
                    .setJobMasterLeaderRetrieverFunction(ignored -> jobMasterLeaderRetriever)
                    .build();

    rpcService.registerGateway(resourceManagerGateway.getAddress(), resourceManagerGateway);
    rpcService.registerGateway(jobMasterGateway.getAddress(), jobMasterGateway);
    final LocalUnresolvedTaskManagerLocation unresolvedTaskManagerLocation = new LocalUnresolvedTaskManagerLocation();

    try (final TaskExecutor taskExecutor = createTaskExecutor(configuration, rpcService, haServices, unresolvedTaskManagerLocation)) {
        taskExecutor.start();

        final SlotID firstSlotId = firstSlotReportFuture.join().iterator().next().getSlotID();
        final TaskExecutorGateway gateway = taskExecutor.getSelfGateway(TaskExecutorGateway.class);
        final JobID jobId = new JobID();
        final AllocationID allocationId = new AllocationID();
        gateway.requestSlot(firstSlotId, jobId, allocationId, ResourceProfile.ZERO, jobMasterGateway.getAddress(), resourceManagerGateway.getFencingToken(), RpcUtils.INF_TIMEOUT).join();

        final TaskDeploymentDescriptor tdd =
                TaskDeploymentDescriptorBuilder.newBuilder(jobId, UserClassLoaderExtractingInvokable.class)
                        .setAllocationId(allocationId)
                        .build();
        slotsOfferedLatch.await();

        // first run: remember the class loader the invokable was given
        gateway.submitTask(tdd, jobMasterGateway.getFencingToken(), RpcUtils.INF_TIMEOUT).join();
        final ClassLoader firstClassLoader = UserClassLoaderExtractingInvokable.take();

        // drain state updates until the first task has reached a terminal state
        TaskExecutionState latestState = taskExecutionStates.take();
        while (!latestState.getExecutionState().isTerminal()) {
            latestState = taskExecutionStates.take();
        }

        // second run in the same slot must see the identical class loader
        gateway.submitTask(tdd, jobMasterGateway.getFencingToken(), RpcUtils.INF_TIMEOUT).join();
        final ClassLoader secondClassLoader = UserClassLoaderExtractingInvokable.take();
        assertThat(firstClassLoader, sameInstance(secondClassLoader));
    }
}
Also used : OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) CompletableFuture(java.util.concurrent.CompletableFuture) TaskDeploymentDescriptorBuilder(org.apache.flink.runtime.deployment.TaskDeploymentDescriptorBuilder) TestingJobMasterGateway(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) InetAddress(java.net.InetAddress) Assert.assertThat(org.junit.Assert.assertThat) SettableLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) FunctionUtils(org.apache.flink.util.function.FunctionUtils) ExternalResourceInfoProvider(org.apache.flink.runtime.externalresource.ExternalResourceInfoProvider) TestLogger(org.apache.flink.util.TestLogger) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) TestingJobMasterGatewayBuilder(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGatewayBuilder) TestingFatalErrorHandlerResource(org.apache.flink.runtime.util.TestingFatalErrorHandlerResource) TestingHighAvailabilityServicesBuilder(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServicesBuilder) TestFileUtils(org.apache.flink.testutils.TestFileUtils) ClassRule(org.junit.ClassRule) Before(org.junit.Before) CoreMatchers.sameInstance(org.hamcrest.CoreMatchers.sameInstance) TaskSlotUtils(org.apache.flink.runtime.taskexecutor.slot.TaskSlotUtils) LocalUnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation) Configuration(org.apache.flink.configuration.Configuration) TestingHeartbeatServices(org.apache.flink.runtime.heartbeat.TestingHeartbeatServices) AbstractInvokable(org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable) 
NoOpTaskExecutorBlobService(org.apache.flink.runtime.blob.NoOpTaskExecutorBlobService) TestingTaskExecutorPartitionTracker(org.apache.flink.runtime.io.network.partition.TestingTaskExecutorPartitionTracker) Test(org.junit.Test) IOException(java.io.IOException) BlockingQueue(java.util.concurrent.BlockingQueue) RpcUtils(org.apache.flink.runtime.rpc.RpcUtils) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) JobID(org.apache.flink.api.common.JobID) UnregisteredMetricGroups(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups) Rule(org.junit.Rule) TestingRpcServiceResource(org.apache.flink.runtime.rpc.TestingRpcServiceResource) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) Environment(org.apache.flink.runtime.execution.Environment) Configuration(org.apache.flink.configuration.Configuration) TestingHighAvailabilityServicesBuilder(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServicesBuilder) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) CompletableFuture(java.util.concurrent.CompletableFuture) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) SettableLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) TestingJobMasterGatewayBuilder(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGatewayBuilder) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) 
TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) TestingJobMasterGateway(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway) SettableLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) LocalUnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Aggregations

AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)194 Test (org.junit.Test)137 JobID (org.apache.flink.api.common.JobID)106 ResourceProfile (org.apache.flink.runtime.clusterframework.types.ResourceProfile)60 CompletableFuture (java.util.concurrent.CompletableFuture)56 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)56 SlotID (org.apache.flink.runtime.clusterframework.types.SlotID)53 ArrayList (java.util.ArrayList)39 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)36 Collection (java.util.Collection)35 Time (org.apache.flink.api.common.time.Time)35 Configuration (org.apache.flink.configuration.Configuration)34 Acknowledge (org.apache.flink.runtime.messages.Acknowledge)34 SlotOffer (org.apache.flink.runtime.taskexecutor.slot.SlotOffer)34 List (java.util.List)32 TestLogger (org.apache.flink.util.TestLogger)32 FlinkException (org.apache.flink.util.FlinkException)31 Matchers.is (org.hamcrest.Matchers.is)31 Assert.assertThat (org.junit.Assert.assertThat)31 Arrays (java.util.Arrays)30