Search in sources :

Example 1 with TaskExecutorProcessSpec

use of org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec in project flink by apache.

Method requestNewWorker of the class ActiveResourceManager.

// ------------------------------------------------------------------------
// Internal
// ------------------------------------------------------------------------
/**
 * Requests one additional worker for the given resource spec from the resource manager driver.
 *
 * <p>The pending counter for the spec is increased before the request is issued and decreased
 * again if the request fails. On success the returned worker is stored in {@code workerNodeMap}
 * and {@code currentAttemptUnregisteredWorkers}, and a registration timeout check is scheduled.
 *
 * @param workerResourceSpec resource spec of the worker to request
 */
private void requestNewWorker(WorkerResourceSpec workerResourceSpec) {
    final TaskExecutorProcessSpec processSpec =
            TaskExecutorProcessUtils.processSpecFromWorkerResourceSpec(flinkConfig, workerResourceSpec);
    final int pendingAfterIncrease = pendingWorkerCounter.increaseAndGet(workerResourceSpec);
    log.info("Requesting new worker with resource spec {}, current pending count: {}.", workerResourceSpec, pendingAfterIncrease);

    // The request is chained behind the start-worker cool-down: after start worker failures
    // we wait for an interval before trying to start new workers. Otherwise the
    // ActiveResourceManager would keep re-requesting the worker, which keeps the main
    // thread busy.
    final CompletableFuture<WorkerType> workerFuture =
            startWorkerCoolDown.thenCompose((ignore) -> resourceManagerDriver.requestResource(processSpec));

    FutureUtils.assertNoException(workerFuture.handle((worker, failure) -> {
        if (failure == null) {
            // Success: start tracking the worker until it registers (or times out).
            final ResourceID workerId = worker.getResourceID();
            workerNodeMap.put(workerId, worker);
            currentAttemptUnregisteredWorkers.put(workerId, workerResourceSpec);
            scheduleWorkerRegistrationTimeoutCheck(workerId);
            log.info("Requested worker {} with resource spec {}.", workerId.getStringWithMetadata(), workerResourceSpec);
            return null;
        }

        // Failure: undo the pending count, record the failure and re-evaluate whether
        // another worker has to be requested.
        final int pendingAfterDecrease = pendingWorkerCounter.decreaseAndGet(workerResourceSpec);
        log.warn("Failed requesting worker with resource spec {}, current pending count: {}", workerResourceSpec, pendingAfterDecrease, failure);
        recordWorkerFailureAndPauseWorkerCreationIfNeeded();
        requestWorkerIfRequired();
        return null;
    }));
}
Also used : TaskExecutorProcessSpec(org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec) ScheduledFuture(java.util.concurrent.ScheduledFuture) WorkerResourceSpec(org.apache.flink.runtime.resourcemanager.WorkerResourceSpec) ResourceIDRetrievable(org.apache.flink.runtime.clusterframework.types.ResourceIDRetrievable) HashMap(java.util.HashMap) Callable(java.util.concurrent.Callable) CompletableFuture(java.util.concurrent.CompletableFuture) HashSet(java.util.HashSet) ResourceManagerMetricGroup(org.apache.flink.runtime.metrics.groups.ResourceManagerMetricGroup) ThresholdMeter(org.apache.flink.runtime.metrics.ThresholdMeter) RpcService(org.apache.flink.runtime.rpc.RpcService) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) Duration(java.time.Duration) Map(java.util.Map) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation) FatalErrorHandler(org.apache.flink.runtime.rpc.FatalErrorHandler) SlotManager(org.apache.flink.runtime.resourcemanager.slotmanager.SlotManager) Preconditions.checkNotNull(org.apache.flink.util.Preconditions.checkNotNull) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) Nullable(javax.annotation.Nullable) AkkaOptions(org.apache.flink.configuration.AkkaOptions) ScheduledExecutor(org.apache.flink.util.concurrent.ScheduledExecutor) Executor(java.util.concurrent.Executor) ApplicationStatus(org.apache.flink.runtime.clusterframework.ApplicationStatus) Collection(java.util.Collection) Configuration(org.apache.flink.configuration.Configuration) Set(java.util.Set) UUID(java.util.UUID) JobLeaderIdService(org.apache.flink.runtime.resourcemanager.JobLeaderIdService) Preconditions(org.apache.flink.util.Preconditions) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) 
ResourceManagerPartitionTrackerFactory(org.apache.flink.runtime.io.network.partition.ResourceManagerPartitionTrackerFactory) TimeUnit(java.util.concurrent.TimeUnit) MetricNames(org.apache.flink.runtime.metrics.MetricNames) TaskExecutorProcessUtils(org.apache.flink.runtime.clusterframework.TaskExecutorProcessUtils) ResourceManager(org.apache.flink.runtime.resourcemanager.ResourceManager) Time(org.apache.flink.api.common.time.Time) TaskExecutorProcessSpec(org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID)

Example 2 with TaskExecutorProcessSpec

use of org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec in project flink by apache.

Method requestResource of the class KubernetesResourceManagerDriver.

/**
 * Builds a TaskManager pod for the given process spec and asks the Kubernetes client to create it.
 *
 * <p>The returned future is registered in {@code requestResourceFutures} under the pod name
 * before the creation call is issued. If pod creation fails, the future is removed from that map
 * and completed exceptionally on the main thread executor. Successful creation does not complete
 * the future here (presumably it is completed elsewhere once the pod is reported; not visible in
 * this method).
 *
 * @param taskExecutorProcessSpec process spec describing the resources of the requested worker
 * @return future for the requested worker node
 */
@Override
public CompletableFuture<KubernetesWorkerNode> requestResource(TaskExecutorProcessSpec taskExecutorProcessSpec) {
    final KubernetesTaskManagerParameters parameters = createKubernetesTaskManagerParameters(taskExecutorProcessSpec);
    final KubernetesPod taskManagerPod = KubernetesTaskManagerFactory.buildTaskManagerKubernetesPod(taskManagerPodTemplate, parameters);
    final String podName = taskManagerPod.getName();
    final CompletableFuture<KubernetesWorkerNode> requestResourceFuture = new CompletableFuture<>();
    // Register the future before triggering creation so that the failure callback below
    // (and any other lookup by pod name) can always find it.
    requestResourceFutures.put(podName, requestResourceFuture);
    log.info("Creating new TaskManager pod with name {} and resource <{},{}>.", podName, parameters.getTaskManagerMemoryMB(), parameters.getTaskManagerCPU());
    final CompletableFuture<Void> createPodFuture = flinkKubeClient.createTaskManagerPod(taskManagerPod);
    FutureUtils.assertNoException(createPodFuture.handleAsync((ignore, exception) -> {
        if (exception != null) {
            // Pass the throwable as the trailing argument without a placeholder so the
            // logging backend records it as the exception (with stack trace) instead of
            // formatting it into the message.
            log.warn("Could not create pod {}.", podName, exception);
            final CompletableFuture<KubernetesWorkerNode> future = requestResourceFutures.remove(podName);
            // The future may already have been removed by another code path.
            if (future != null) {
                future.completeExceptionally(exception);
            }
        } else {
            log.info("Pod {} is created.", podName);
        }
        return null;
    }, getMainThreadExecutor()));
    return requestResourceFuture;
}
Also used : TaskExecutorProcessSpec(org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec) FlinkException(org.apache.flink.util.FlinkException) ResourceManagerUtils(org.apache.flink.runtime.util.ResourceManagerUtils) ExternalResourceUtils(org.apache.flink.runtime.externalresource.ExternalResourceUtils) ExceptionUtils(org.apache.flink.util.ExceptionUtils) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) KubernetesService(org.apache.flink.kubernetes.kubeclient.resources.KubernetesService) KubernetesWatch(org.apache.flink.kubernetes.kubeclient.resources.KubernetesWatch) KubernetesPod(org.apache.flink.kubernetes.kubeclient.resources.KubernetesPod) ArrayList(java.util.ArrayList) TaskManagerOptions(org.apache.flink.configuration.TaskManagerOptions) ProcessMemoryUtils(org.apache.flink.runtime.util.config.memory.ProcessMemoryUtils) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) Map(java.util.Map) KubernetesTaskManagerParameters(org.apache.flink.kubernetes.kubeclient.parameters.KubernetesTaskManagerParameters) ContaineredTaskManagerParameters(org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters) KubernetesResourceManagerDriverConfiguration(org.apache.flink.kubernetes.configuration.KubernetesResourceManagerDriverConfiguration) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) HighAvailabilityMode(org.apache.flink.runtime.jobmanager.HighAvailabilityMode) Nullable(javax.annotation.Nullable) BlobServerOptions(org.apache.flink.configuration.BlobServerOptions) KubernetesUtils(org.apache.flink.kubernetes.utils.KubernetesUtils) AbstractResourceManagerDriver(org.apache.flink.runtime.resourcemanager.active.AbstractResourceManagerDriver) KubernetesConfigOptions(org.apache.flink.kubernetes.configuration.KubernetesConfigOptions) 
ResourceManagerDriver(org.apache.flink.runtime.resourcemanager.active.ResourceManagerDriver) ApplicationStatus(org.apache.flink.runtime.clusterframework.ApplicationStatus) Configuration(org.apache.flink.configuration.Configuration) JobManagerOptions(org.apache.flink.configuration.JobManagerOptions) FlinkPod(org.apache.flink.kubernetes.kubeclient.FlinkPod) KubernetesTaskManagerFactory(org.apache.flink.kubernetes.kubeclient.factory.KubernetesTaskManagerFactory) Preconditions(org.apache.flink.util.Preconditions) BootstrapTools(org.apache.flink.runtime.clusterframework.BootstrapTools) File(java.io.File) List(java.util.List) GlobalConfiguration(org.apache.flink.configuration.GlobalConfiguration) KubernetesTooOldResourceVersionException(org.apache.flink.kubernetes.kubeclient.resources.KubernetesTooOldResourceVersionException) Optional(java.util.Optional) Constants(org.apache.flink.kubernetes.utils.Constants) FlinkKubeClient(org.apache.flink.kubernetes.kubeclient.FlinkKubeClient) CompletableFuture(java.util.concurrent.CompletableFuture) KubernetesTaskManagerParameters(org.apache.flink.kubernetes.kubeclient.parameters.KubernetesTaskManagerParameters) KubernetesPod(org.apache.flink.kubernetes.kubeclient.resources.KubernetesPod)

Example 3 with TaskExecutorProcessSpec

use of org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec in project flink by apache.

Method getTmResourceParams of the class BashJavaUtils.

/**
 * Derives the JVM parameters and the dynamic configs of the task executor resources from the
 * given configuration.
 *
 * <p>The computed process spec is also handed to {@code logTaskExecutorConfiguration}. The order
 * of the returned elements matters: the JVM parameters come first and the dynamic configs
 * second, so that they end up as the last two lines of the output, in that order.
 *
 * @param configuration configuration to compute the task executor process spec from
 * @return a two-element list holding the JVM parameters string followed by the dynamic configs
 *     string
 */
private static List<String> getTmResourceParams(Configuration configuration) {
    final Configuration effectiveConfiguration =
            TaskExecutorProcessUtils.getConfigurationMapLegacyTaskManagerHeapSizeToConfigOption(
                    configuration, TaskManagerOptions.TOTAL_FLINK_MEMORY);
    final TaskExecutorProcessSpec processSpec =
            TaskExecutorProcessUtils.processSpecFromConfig(effectiveConfiguration);
    logTaskExecutorConfiguration(processSpec);

    final String jvmParameters = ProcessMemoryUtils.generateJvmParametersStr(processSpec);
    final String dynamicConfigs = TaskExecutorProcessUtils.generateDynamicConfigsStr(processSpec);
    return Arrays.asList(jvmParameters, dynamicConfigs);
}
Also used : Configuration(org.apache.flink.configuration.Configuration) TaskExecutorProcessSpec(org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec)

Example 4 with TaskExecutorProcessSpec

use of org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec in project flink by apache.

Method requestResource of the class YarnResourceManagerDriver.

/**
 * Issues a Yarn container request for the given process spec.
 *
 * <p>If no priority/resource pair can be derived for the spec, the returned future is completed
 * exceptionally with a {@link ResourceManagerException} and no container request is sent.
 * Otherwise the container request is added to the resource manager client, the heartbeat
 * interval is switched to the container-request interval, and the returned future is appended to
 * the pending futures kept for this spec in {@code requestResourceFutures}.
 *
 * @param taskExecutorProcessSpec process spec describing the resources of the requested worker
 * @return future for the requested worker node
 */
@Override
public CompletableFuture<YarnWorkerNode> requestResource(TaskExecutorProcessSpec taskExecutorProcessSpec) {
    checkInitialized();

    final CompletableFuture<YarnWorkerNode> workerFuture = new CompletableFuture<>();
    final Optional<TaskExecutorProcessSpecContainerResourcePriorityAdapter.PriorityAndResource> lookup =
            taskExecutorProcessSpecContainerResourcePriorityAdapter.getPriorityAndResource(taskExecutorProcessSpec);

    // Guard: without a priority/resource pair there is nothing to request from Yarn.
    if (!lookup.isPresent()) {
        final String message = String.format(
                "Could not compute the container Resource from the given TaskExecutorProcessSpec %s. "
                        + "This usually indicates the requested resource is larger than Yarn's max container resource limit.",
                taskExecutorProcessSpec);
        workerFuture.completeExceptionally(new ResourceManagerException(message));
        return workerFuture;
    }

    final TaskExecutorProcessSpecContainerResourcePriorityAdapter.PriorityAndResource priorityAndResource = lookup.get();
    final Priority priority = priorityAndResource.getPriority();
    final Resource resource = priorityAndResource.getResource();

    resourceManagerClient.addContainerRequest(
            ContainerRequestReflector.INSTANCE.getContainerRequest(resource, priority, taskManagerNodeLabel));
    // make sure we transmit the request fast and receive fast news of granted allocations
    resourceManagerClient.setHeartbeatInterval(containerRequestHeartbeatIntervalMillis);

    requestResourceFutures
            .computeIfAbsent(taskExecutorProcessSpec, ignore -> new LinkedList<>())
            .add(workerFuture);
    log.info("Requesting new TaskExecutor container with resource {}, priority {}.", taskExecutorProcessSpec, priority);
    return workerFuture;
}
Also used : ResourceManagerUtils(org.apache.flink.runtime.util.ResourceManagerUtils) URL(java.net.URL) NMClientAsync(org.apache.hadoop.yarn.client.api.async.NMClientAsync) ExternalResourceUtils(org.apache.flink.runtime.externalresource.ExternalResourceUtils) ExceptionUtils(org.apache.flink.util.ExceptionUtils) ByteBuffer(java.nio.ByteBuffer) Map(java.util.Map) Resource(org.apache.hadoop.yarn.api.records.Resource) ContaineredTaskManagerParameters(org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters) TaskManagerOptionsInternal(org.apache.flink.configuration.TaskManagerOptionsInternal) NodeReport(org.apache.hadoop.yarn.api.records.NodeReport) Priority(org.apache.hadoop.yarn.api.records.Priority) HistoryServerUtils(org.apache.flink.runtime.webmonitor.history.HistoryServerUtils) Collection(java.util.Collection) Preconditions(org.apache.flink.util.Preconditions) Collectors(java.util.stream.Collectors) YarnConfigOptions(org.apache.flink.yarn.configuration.YarnConfigOptions) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) List(java.util.List) FinalApplicationStatus(org.apache.hadoop.yarn.api.records.FinalApplicationStatus) Optional(java.util.Optional) Queue(java.util.Queue) AMRMClientAsync(org.apache.hadoop.yarn.client.api.async.AMRMClientAsync) TaskExecutorProcessSpec(org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) ArrayList(java.util.ArrayList) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) TaskManagerOptions(org.apache.flink.configuration.TaskManagerOptions) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) ContainerExitStatus(org.apache.hadoop.yarn.api.records.ContainerExitStatus) YarnException(org.apache.hadoop.yarn.exceptions.YarnException) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) 
ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) LinkedList(java.util.LinkedList) Nullable(javax.annotation.Nullable) AbstractResourceManagerDriver(org.apache.flink.runtime.resourcemanager.active.AbstractResourceManagerDriver) Iterator(java.util.Iterator) ResourceManagerDriver(org.apache.flink.runtime.resourcemanager.active.ResourceManagerDriver) ApplicationStatus(org.apache.flink.runtime.clusterframework.ApplicationStatus) YarnResourceManagerDriverConfiguration(org.apache.flink.yarn.configuration.YarnResourceManagerDriverConfiguration) Configuration(org.apache.flink.configuration.Configuration) AMRMClient(org.apache.hadoop.yarn.client.api.AMRMClient) IOException(java.io.IOException) Container(org.apache.hadoop.yarn.api.records.Container) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) BootstrapTools(org.apache.flink.runtime.clusterframework.BootstrapTools) ResourceRequest(org.apache.hadoop.yarn.api.records.ResourceRequest) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) RegisterApplicationMasterResponse(org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse) GlobalConfiguration(org.apache.flink.configuration.GlobalConfiguration) Phaser(java.util.concurrent.Phaser) CompletableFuture(java.util.concurrent.CompletableFuture) Priority(org.apache.hadoop.yarn.api.records.Priority) Resource(org.apache.hadoop.yarn.api.records.Resource) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) LinkedList(java.util.LinkedList)

Example 5 with TaskExecutorProcessSpec

use of org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec in project flink by apache.

Method onContainersOfPriorityAllocated of the class YarnResourceManagerDriver.

// ------------------------------------------------------------------------
// Internal
// ------------------------------------------------------------------------
/**
 * Matches the given allocated containers of one priority against the pending resource requests
 * of the corresponding {@link TaskExecutorProcessSpec}.
 *
 * <p>Containers are paired one-to-one with pending container requests. For every pair, the head
 * of the pending futures queue is taken, a task executor start is triggered via
 * {@code startTaskExecutorInContainerAsync}, and the matched container request is removed.
 * Containers that remain after the pending requests are exhausted are passed to
 * {@code returnExcessContainer}.
 *
 * <p>Fails a {@code Preconditions.checkState} if the priority is not known to the adapter, or if
 * a pending container request has no matching pending future.
 *
 * @param priority priority of all given containers
 * @param containers containers that were allocated with that priority
 */
private void onContainersOfPriorityAllocated(Priority priority, List<Container> containers) {
    // Map the priority back to the process spec / resource it was derived from.
    final Optional<TaskExecutorProcessSpecContainerResourcePriorityAdapter.TaskExecutorProcessSpecAndResource> taskExecutorProcessSpecAndResourceOpt = taskExecutorProcessSpecContainerResourcePriorityAdapter.getTaskExecutorProcessSpecAndResource(priority);
    Preconditions.checkState(taskExecutorProcessSpecAndResourceOpt.isPresent(), "Receive %s containers with unrecognized priority %s. This should not happen.", containers.size(), priority.getPriority());
    final TaskExecutorProcessSpec taskExecutorProcessSpec = taskExecutorProcessSpecAndResourceOpt.get().getTaskExecutorProcessSpec();
    final Resource resource = taskExecutorProcessSpecAndResourceOpt.get().getResource();
    // Falls back to a fresh empty queue (not stored in the map) when nothing is pending for this spec.
    final Queue<CompletableFuture<YarnWorkerNode>> pendingRequestResourceFutures = requestResourceFutures.getOrDefault(taskExecutorProcessSpec, new LinkedList<>());
    log.info("Received {} containers with priority {}, {} pending container requests.", containers.size(), priority, pendingRequestResourceFutures.size());
    final Iterator<Container> containerIterator = containers.iterator();
    final Iterator<AMRMClient.ContainerRequest> pendingContainerRequestIterator = getPendingRequestsAndCheckConsistency(priority, resource, pendingRequestResourceFutures.size()).iterator();
    int numAccepted = 0;
    // Accept containers for as long as there are both containers and pending requests left.
    while (containerIterator.hasNext() && pendingContainerRequestIterator.hasNext()) {
        final Container container = containerIterator.next();
        final AMRMClient.ContainerRequest pendingRequest = pendingContainerRequestIterator.next();
        final ResourceID resourceId = getContainerResourceId(container);
        final CompletableFuture<YarnWorkerNode> requestResourceFuture = pendingRequestResourceFutures.poll();
        // A pending container request without a pending future indicates inconsistent bookkeeping.
        Preconditions.checkState(requestResourceFuture != null);
        // Drop the map entry once the last pending future for this spec has been taken.
        if (pendingRequestResourceFutures.isEmpty()) {
            requestResourceFutures.remove(taskExecutorProcessSpec);
        }
        startTaskExecutorInContainerAsync(container, taskExecutorProcessSpec, resourceId, requestResourceFuture);
        removeContainerRequest(pendingRequest);
        numAccepted++;
    }
    int numExcess = 0;
    // Whatever is left over was allocated beyond what is still pending; hand it back.
    while (containerIterator.hasNext()) {
        returnExcessContainer(containerIterator.next());
        numExcess++;
    }
    log.info("Accepted {} requested containers, returned {} excess containers, {} pending container requests of resource {}.", numAccepted, numExcess, pendingRequestResourceFutures.size(), resource);
}
Also used : AMRMClient(org.apache.hadoop.yarn.client.api.AMRMClient) TaskExecutorProcessSpec(org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec) Resource(org.apache.hadoop.yarn.api.records.Resource) CompletableFuture(java.util.concurrent.CompletableFuture) Container(org.apache.hadoop.yarn.api.records.Container) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID)

Aggregations

TaskExecutorProcessSpec (org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec)21 Test (org.junit.Test)14 CompletableFuture (java.util.concurrent.CompletableFuture)13 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)12 ArrayList (java.util.ArrayList)10 Configuration (org.apache.flink.configuration.Configuration)10 RegistrationResponse (org.apache.flink.runtime.registration.RegistrationResponse)8 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)7 List (java.util.List)5 Duration (java.time.Duration)4 HashMap (java.util.HashMap)4 UUID (java.util.UUID)4 Callable (java.util.concurrent.Callable)4 TimeUnit (java.util.concurrent.TimeUnit)4 Time (org.apache.flink.api.common.time.Time)4 ContaineredTaskManagerParameters (org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters)4 TaskExecutorProcessUtils (org.apache.flink.runtime.clusterframework.TaskExecutorProcessUtils)4 ClusterInformation (org.apache.flink.runtime.entrypoint.ClusterInformation)4 WorkerResourceSpec (org.apache.flink.runtime.resourcemanager.WorkerResourceSpec)4 SlotManager (org.apache.flink.runtime.resourcemanager.slotmanager.SlotManager)4