Search in sources :

Example 1 with WorkerResourceSpec

use of org.apache.flink.runtime.resourcemanager.WorkerResourceSpec in project flink by apache.

the class ActiveResourceManager method clearStateForWorker.

/**
 * Drops the bookkeeping kept for a worker that has terminated.
 *
 * <p>The worker is removed from the worker node map and from both the current-attempt and
 * previous-attempt unregistered-worker collections. If it was still recorded as an unregistered
 * worker of the current attempt, the pending counter for its resource spec is decreased as well.
 *
 * @param resourceId Identifier of the worker
 * @return True if the worker was tracked and its state has been removed; false if the worker is
 *     not tracked (for example because an earlier call already cleared it)
 */
private boolean clearStateForWorker(ResourceID resourceId) {
    final WorkerType removedWorker = workerNodeMap.remove(resourceId);
    if (removedWorker == null) {
        // Nothing is tracked under this id, so there is nothing to clean up.
        log.debug("Ignore unrecognized worker {}.", resourceId.getStringWithMetadata());
        return false;
    }

    final WorkerResourceSpec unregisteredSpec = currentAttemptUnregisteredWorkers.remove(resourceId);
    previousAttemptUnregisteredWorkers.remove(resourceId);
    if (unregisteredSpec == null) {
        // The worker was not pending registration in the current attempt; no counter to adjust.
        return true;
    }

    final int remainingPending = pendingWorkerCounter.decreaseAndGet(unregisteredSpec);
    log.info(
            "Worker {} with resource spec {} was requested in current attempt and has not registered." + " Current pending count after removing: {}.",
            resourceId.getStringWithMetadata(),
            unregisteredSpec,
            remainingPending);
    return true;
}
Also used : WorkerResourceSpec(org.apache.flink.runtime.resourcemanager.WorkerResourceSpec)

Example 2 with WorkerResourceSpec

use of org.apache.flink.runtime.resourcemanager.WorkerResourceSpec in project flink by apache.

the class ActiveResourceManager method requestNewWorker.

// ------------------------------------------------------------------------
// Internal
// ------------------------------------------------------------------------
/**
 * Requests one new worker for the given resource spec from the resource manager driver.
 *
 * <p>The pending counter for the spec is increased before the request is issued. When the request
 * completes successfully, the worker is recorded in the worker node map and in the current-attempt
 * unregistered workers, and a registration timeout check is scheduled. When it fails, the pending
 * counter is decreased again, the failure is recorded, and a new request is issued if required.
 *
 * @param workerResourceSpec resource spec of the worker to request
 */
private void requestNewWorker(WorkerResourceSpec workerResourceSpec) {
    final TaskExecutorProcessSpec processSpec =
            TaskExecutorProcessUtils.processSpecFromWorkerResourceSpec(flinkConfig, workerResourceSpec);
    final int pendingCount = pendingWorkerCounter.increaseAndGet(workerResourceSpec);
    log.info("Requesting new worker with resource spec {}, current pending count: {}.", workerResourceSpec, pendingCount);

    // After a worker fails to start, wait for the cool-down interval before trying to start
    // new workers. Otherwise the ActiveResourceManager would keep re-requesting the worker,
    // which keeps the main thread busy.
    final CompletableFuture<WorkerType> requestResourceFuture =
            startWorkerCoolDown.thenCompose(ignored -> resourceManagerDriver.requestResource(processSpec));

    FutureUtils.assertNoException(requestResourceFuture.handle((worker, exception) -> {
        if (exception == null) {
            // Success: start tracking the worker until it registers or times out.
            final ResourceID resourceId = worker.getResourceID();
            workerNodeMap.put(resourceId, worker);
            currentAttemptUnregisteredWorkers.put(resourceId, workerResourceSpec);
            scheduleWorkerRegistrationTimeoutCheck(resourceId);
            log.info("Requested worker {} with resource spec {}.", resourceId.getStringWithMetadata(), workerResourceSpec);
            return null;
        }

        // Failure: undo the pending count taken above, then retry if workers are still needed.
        final int remainingPending = pendingWorkerCounter.decreaseAndGet(workerResourceSpec);
        log.warn("Failed requesting worker with resource spec {}, current pending count: {}", workerResourceSpec, remainingPending, exception);
        recordWorkerFailureAndPauseWorkerCreationIfNeeded();
        requestWorkerIfRequired();
        return null;
    }));
}
Also used : TaskExecutorProcessSpec(org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec) ScheduledFuture(java.util.concurrent.ScheduledFuture) WorkerResourceSpec(org.apache.flink.runtime.resourcemanager.WorkerResourceSpec) ResourceIDRetrievable(org.apache.flink.runtime.clusterframework.types.ResourceIDRetrievable) HashMap(java.util.HashMap) Callable(java.util.concurrent.Callable) CompletableFuture(java.util.concurrent.CompletableFuture) HashSet(java.util.HashSet) ResourceManagerMetricGroup(org.apache.flink.runtime.metrics.groups.ResourceManagerMetricGroup) ThresholdMeter(org.apache.flink.runtime.metrics.ThresholdMeter) RpcService(org.apache.flink.runtime.rpc.RpcService) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) Duration(java.time.Duration) Map(java.util.Map) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation) FatalErrorHandler(org.apache.flink.runtime.rpc.FatalErrorHandler) SlotManager(org.apache.flink.runtime.resourcemanager.slotmanager.SlotManager) Preconditions.checkNotNull(org.apache.flink.util.Preconditions.checkNotNull) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) Nullable(javax.annotation.Nullable) AkkaOptions(org.apache.flink.configuration.AkkaOptions) ScheduledExecutor(org.apache.flink.util.concurrent.ScheduledExecutor) Executor(java.util.concurrent.Executor) ApplicationStatus(org.apache.flink.runtime.clusterframework.ApplicationStatus) Collection(java.util.Collection) Configuration(org.apache.flink.configuration.Configuration) Set(java.util.Set) UUID(java.util.UUID) JobLeaderIdService(org.apache.flink.runtime.resourcemanager.JobLeaderIdService) Preconditions(org.apache.flink.util.Preconditions) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) 
ResourceManagerPartitionTrackerFactory(org.apache.flink.runtime.io.network.partition.ResourceManagerPartitionTrackerFactory) TimeUnit(java.util.concurrent.TimeUnit) MetricNames(org.apache.flink.runtime.metrics.MetricNames) TaskExecutorProcessUtils(org.apache.flink.runtime.clusterframework.TaskExecutorProcessUtils) ResourceManager(org.apache.flink.runtime.resourcemanager.ResourceManager) Time(org.apache.flink.api.common.time.Time) TaskExecutorProcessSpec(org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID)

Example 3 with WorkerResourceSpec

use of org.apache.flink.runtime.resourcemanager.WorkerResourceSpec in project flink by apache.

the class TaskExecutorManagerTest method testTimeoutForUnusedTaskManager.

/**
 * Verifies that a task manager which was used before is released once all of its slots have been
 * freed and the task manager timeout has been configured.
 *
 * <p>The test blocks until the release-resource callback fires and then asserts that the released
 * instance is the one that was registered.
 */
@Test
public void testTimeoutForUnusedTaskManager() throws Exception {
    final Time taskManagerTimeout = Time.milliseconds(50L);
    final WorkerResourceSpec workerSpec = new WorkerResourceSpec.Builder().setCpuCores(1).build();
    final ResourceProfile slotProfile = ResourceProfile.newBuilder().setCpuCores(1).build();

    // Completed by the resource actions as soon as the manager releases a task executor.
    final CompletableFuture<InstanceID> releasedInstanceFuture = new CompletableFuture<>();
    final ResourceActions resourceActions =
            new TestingResourceActionsBuilder()
                    .setReleaseResourceConsumer((instanceID, e) -> releasedInstanceFuture.complete(instanceID))
                    .build();
    final Executor mainThreadExecutor = TestingUtils.defaultExecutor();

    try (final TaskExecutorManager taskExecutorManager =
            createTaskExecutorManagerBuilder()
                    .setTaskManagerTimeout(taskManagerTimeout)
                    .setDefaultWorkerResourceSpec(workerSpec)
                    .setResourceActions(resourceActions)
                    .setMainThreadExecutor(mainThreadExecutor)
                    .createTaskExecutorManager()) {
        // All interaction with the manager is run on the main thread executor.
        final CompletableFuture<InstanceID> registeredInstanceFuture =
                CompletableFuture.supplyAsync(
                        () -> {
                            taskExecutorManager.allocateWorker(slotProfile);
                            InstanceID registeredId = createAndRegisterTaskExecutor(taskExecutorManager, 1, slotProfile);
                            // Occupy and immediately free the slot so the executor becomes unused again.
                            taskExecutorManager.occupySlot(registeredId);
                            taskExecutorManager.freeSlot(registeredId);
                            return registeredId;
                        },
                        mainThreadExecutor);

        registeredInstanceFuture
                .thenAcceptBoth(
                        releasedInstanceFuture,
                        (registeredInstance, releasedInstance) -> assertThat(registeredInstance, is(releasedInstance)))
                .get();
    }
}
Also used : IntStream(java.util.stream.IntStream) WorkerResourceSpec(org.apache.flink.runtime.resourcemanager.WorkerResourceSpec) CompletableFuture(java.util.concurrent.CompletableFuture) Assert.assertThat(org.junit.Assert.assertThat) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Matchers.nullValue(org.hamcrest.Matchers.nullValue) TestLogger(org.apache.flink.util.TestLogger) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) Executor(java.util.concurrent.Executor) Test(org.junit.Test) InstanceID(org.apache.flink.runtime.instance.InstanceID) Collectors(java.util.stream.Collectors) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) TestingUtils(org.apache.flink.testutils.TestingUtils) List(java.util.List) JobID(org.apache.flink.api.common.JobID) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) Matchers.equalTo(org.hamcrest.Matchers.equalTo) SlotStatus(org.apache.flink.runtime.taskexecutor.SlotStatus) Matchers.is(org.hamcrest.Matchers.is) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) Time(org.apache.flink.api.common.time.Time) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) Assert.assertEquals(org.junit.Assert.assertEquals) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) CompletableFuture(java.util.concurrent.CompletableFuture) Executor(java.util.concurrent.Executor) InstanceID(org.apache.flink.runtime.instance.InstanceID) WorkerResourceSpec(org.apache.flink.runtime.resourcemanager.WorkerResourceSpec) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) Time(org.apache.flink.api.common.time.Time) Test(org.junit.Test)

Example 4 with WorkerResourceSpec

use of org.apache.flink.runtime.resourcemanager.WorkerResourceSpec in project flink by apache.

the class TaskExecutorManagerTest method testPendingSlotNotFulfilledByAllocatedSlot.

/**
 * Verifies that a slot which is reported as already allocated does not fulfil a pending slot.
 *
 * <p>After one worker has been requested, a task executor registers with a slot report whose
 * single slot already carries a job id and an allocation id. The slot must be counted as
 * registered while the pending slot count stays at one.
 */
@Test
public void testPendingSlotNotFulfilledByAllocatedSlot() {
    final int numWorkerCpuCores = 3;
    final WorkerResourceSpec workerSpec = new WorkerResourceSpec.Builder().setCpuCores(numWorkerCpuCores).build();
    final ResourceProfile requestedSlotProfile = ResourceProfile.newBuilder().setCpuCores(numWorkerCpuCores).build();

    // One slot per worker, so that the slot profile lines up with the worker's resources
    // (both are built from numWorkerCpuCores above).
    try (final TaskExecutorManager taskExecutorManager =
            createTaskExecutorManagerBuilder()
                    .setDefaultWorkerResourceSpec(workerSpec)
                    .setNumSlotsPerWorker(1)
                    .setMaxNumSlots(2)
                    .createTaskExecutorManager()) {
        // Requesting a worker creates the pending slot.
        taskExecutorManager.allocateWorker(requestedSlotProfile);
        assertThat(taskExecutorManager.getNumberPendingTaskManagerSlots(), is(1));

        // Register a task executor whose only slot is already allocated to some job.
        final TaskExecutorConnection connection = createTaskExecutorConnection();
        final SlotID allocatedSlotId = new SlotID(connection.getResourceID(), 0);
        final SlotStatus allocatedSlotStatus =
                new SlotStatus(allocatedSlotId, requestedSlotProfile, JobID.generate(), new AllocationID());
        final SlotReport slotReport = new SlotReport(allocatedSlotStatus);
        taskExecutorManager.registerTaskManager(connection, slotReport, ResourceProfile.ANY, ResourceProfile.ANY);

        // The reported slot is accepted, yet the originally requested slot is still outstanding.
        assertThat(taskExecutorManager.getNumberRegisteredSlots(), is(1));
        assertThat(taskExecutorManager.getNumberPendingTaskManagerSlots(), is(1));
    }
}
Also used : ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) SlotStatus(org.apache.flink.runtime.taskexecutor.SlotStatus) WorkerResourceSpec(org.apache.flink.runtime.resourcemanager.WorkerResourceSpec) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) Test(org.junit.Test)

Example 5 with WorkerResourceSpec

use of org.apache.flink.runtime.resourcemanager.WorkerResourceSpec in project flink by apache.

the class SlotManagerUtilsTest method testGenerateDefaultSlotConsistentWithTaskExecutorResourceUtils.

/**
 * Verifies that {@code SlotManagerUtils.generateDefaultSlotResourceProfile} returns the same
 * default slot profile as {@code TaskExecutorResourceUtils.generateDefaultSlotResourceProfile},
 * both when it is given the total resource profile and when it is given the worker resource spec
 * derived from that profile.
 */
@Test
public void testGenerateDefaultSlotConsistentWithTaskExecutorResourceUtils() {
    final int numSlots = 5;
    final TaskExecutorResourceSpec executorSpec =
            new TaskExecutorResourceSpec(
                    new CPUResource(1.0),
                    MemorySize.parse("1m"),
                    MemorySize.parse("2m"),
                    MemorySize.parse("3m"),
                    MemorySize.parse("4m"),
                    Collections.singleton(new ExternalResource(EXTERNAL_RESOURCE_NAME, numSlots)));

    // Reference value computed by the task executor side utility.
    final ResourceProfile expectedSlotProfile =
            TaskExecutorResourceUtils.generateDefaultSlotResourceProfile(executorSpec, numSlots);

    // The two inputs accepted by the slot manager side utility.
    final ResourceProfile totalProfile =
            TaskExecutorResourceUtils.generateTotalAvailableResourceProfile(executorSpec);
    final WorkerResourceSpec workerSpec = WorkerResourceSpec.fromTotalResourceProfile(totalProfile, numSlots);

    assertThat(
            SlotManagerUtils.generateDefaultSlotResourceProfile(totalProfile, numSlots),
            is(expectedSlotProfile));
    assertThat(
            SlotManagerUtils.generateDefaultSlotResourceProfile(workerSpec, numSlots),
            is(expectedSlotProfile));
}
Also used : ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) TaskExecutorResourceSpec(org.apache.flink.runtime.taskexecutor.TaskExecutorResourceSpec) WorkerResourceSpec(org.apache.flink.runtime.resourcemanager.WorkerResourceSpec) CPUResource(org.apache.flink.api.common.resources.CPUResource) ExternalResource(org.apache.flink.api.common.resources.ExternalResource) Test(org.junit.Test)

Aggregations

WorkerResourceSpec (org.apache.flink.runtime.resourcemanager.WorkerResourceSpec)16 Test (org.junit.Test)12 ResourceProfile (org.apache.flink.runtime.clusterframework.types.ResourceProfile)7 CompletableFuture (java.util.concurrent.CompletableFuture)5 ExternalResource (org.apache.flink.api.common.resources.ExternalResource)4 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)4 TestingTaskExecutorGatewayBuilder (org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder)4 Executor (java.util.concurrent.Executor)3 Time (org.apache.flink.api.common.time.Time)3 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)3 SlotID (org.apache.flink.runtime.clusterframework.types.SlotID)3 TaskExecutorConnection (org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection)3 SlotReport (org.apache.flink.runtime.taskexecutor.SlotReport)3 SlotStatus (org.apache.flink.runtime.taskexecutor.SlotStatus)3 HashMap (java.util.HashMap)2 List (java.util.List)2 Map (java.util.Map)2 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)2 Collectors (java.util.stream.Collectors)2 IntStream (java.util.stream.IntStream)2