Search in sources :

Example 1 with JobMasterId

use of org.apache.flink.runtime.jobmaster.JobMasterId in project flink by apache.

the class ResourceManager method closeJobManagerConnection.

/**
 * Closes the connection to the job manager that is registered for the given job, if any.
 *
 * <p>This method should be called by the framework once it detects that a currently registered
 * job manager has failed. The registration is removed, heartbeat monitoring of the job manager
 * is stopped, and the job manager is told about the disconnect. If no job manager is registered
 * for the job, only a debug message is logged.
 *
 * @param jobId identifying the job whose leader shall be disconnected.
 * @param resourceRequirementHandling indicating how existing resource requirements for the
 *     corresponding job should be handled; with {@code CLEAR} they are cleared in the slot
 *     manager
 * @param cause the exception that caused the job manager connection to be closed; it is
 *     forwarded to the job manager with the disconnect call.
 */
protected void closeJobManagerConnection(JobID jobId, ResourceRequirementHandling resourceRequirementHandling, Exception cause) {
    final JobManagerRegistration registration = jobManagerRegistrations.remove(jobId);

    // Nothing to tear down when the job never had (or no longer has) a registered job manager.
    if (registration == null) {
        log.debug("There was no registered job manager for job {}.", jobId);
        return;
    }

    final ResourceID resourceId = registration.getJobManagerResourceID();
    final JobMasterGateway gateway = registration.getJobManagerGateway();
    final JobMasterId masterId = registration.getJobMasterId();

    log.info(
            "Disconnect job manager {}@{} for job {} from the resource manager.",
            masterId,
            gateway.getAddress(),
            jobId);

    // Drop all local bookkeeping for this job manager before notifying it.
    jobManagerHeartbeatManager.unmonitorTarget(resourceId);
    jmResourceIdRegistrations.remove(resourceId);

    if (ResourceRequirementHandling.CLEAR == resourceRequirementHandling) {
        slotManager.clearResourceRequirements(jobId);
    }

    // tell the job manager about the disconnect
    gateway.disconnectResourceManager(getFencingToken(), cause);
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) JobMasterId(org.apache.flink.runtime.jobmaster.JobMasterId) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) JobManagerRegistration(org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration)

Example 2 with JobMasterId

use of org.apache.flink.runtime.jobmaster.JobMasterId in project flink by apache.

the class ResourceManager method registerJobMaster.

// ------------------------------------------------------------------------
// RPC methods
// ------------------------------------------------------------------------
/**
 * Registers a job master for the given job at this resource manager.
 *
 * <p>The job is added to the job leader id service if it is not known there yet. The method then
 * connects to the job master at {@code jobManagerAddress} and, in parallel, obtains the current
 * leader id of the job. Only if the leader id equals {@code jobMasterId} is the job master
 * actually registered; otherwise a {@link RegistrationResponse.Failure} is returned.
 *
 * <p>If the job cannot be added to the job leader id service, or the leader id future cannot be
 * obtained, the error is reported via {@code onFatalError} and the returned future is completed
 * exceptionally. Failures of the asynchronous stages are not propagated as exceptions but are
 * converted into a {@link RegistrationResponse.Failure}.
 *
 * @param jobMasterId fencing id of the job master that wants to register; must not be null
 * @param jobManagerResourceId resource id of the job manager; must not be null
 * @param jobManagerAddress RPC address used to connect to the job master; must not be null
 * @param jobId id of the job the job master is responsible for; must not be null
 * @param timeout RPC timeout; not read by this method body
 * @return future holding the registration response
 */
@Override
public CompletableFuture<RegistrationResponse> registerJobMaster(final JobMasterId jobMasterId, final ResourceID jobManagerResourceId, final String jobManagerAddress, final JobID jobId, final Time timeout) {
    checkNotNull(jobMasterId);
    checkNotNull(jobManagerResourceId);
    checkNotNull(jobManagerAddress);
    checkNotNull(jobId);
    if (!jobLeaderIdService.containsJob(jobId)) {
        try {
            jobLeaderIdService.addJob(jobId);
        } catch (Exception e) {
            ResourceManagerException exception = new ResourceManagerException("Could not add the job " + jobId + " to the job id leader service.", e);
            onFatalError(exception);
            log.error("Could not add job {} to job leader id service.", jobId, e);
            return FutureUtils.completedExceptionally(exception);
        }
    }
    log.info("Registering job manager {}@{} for job {}.", jobMasterId, jobManagerAddress, jobId);
    CompletableFuture<JobMasterId> jobMasterIdFuture;
    try {
        jobMasterIdFuture = jobLeaderIdService.getLeaderId(jobId);
    } catch (Exception e) {
        // we cannot check the job leader id so let's fail
        // TODO: Maybe it's also ok to skip this check in case that we cannot check the leader
        // id
        ResourceManagerException exception = new ResourceManagerException("Cannot obtain the " + "job leader id future to verify the correct job leader.", e);
        onFatalError(exception);
        // Pass the cause as the trailing argument so the stack trace shows up in the log,
        // consistent with the error handling of the addJob failure above.
        log.debug("Could not obtain the job leader id future to verify the correct job leader.", e);
        return FutureUtils.completedExceptionally(exception);
    }
    CompletableFuture<JobMasterGateway> jobMasterGatewayFuture = getRpcService().connect(jobManagerAddress, jobMasterId, JobMasterGateway.class);
    // Both the gateway connection and the leader id lookup must complete before the leader check;
    // the check and the actual registration are executed on the main thread executor.
    CompletableFuture<RegistrationResponse> registrationResponseFuture = jobMasterGatewayFuture.thenCombineAsync(jobMasterIdFuture, (JobMasterGateway jobMasterGateway, JobMasterId leadingJobMasterId) -> {
        if (Objects.equals(leadingJobMasterId, jobMasterId)) {
            return registerJobMasterInternal(jobMasterGateway, jobId, jobManagerAddress, jobManagerResourceId);
        } else {
            final String declineMessage = String.format("The leading JobMaster id %s did not match the received JobMaster id %s. " + "This indicates that a JobMaster leader change has happened.", leadingJobMasterId, jobMasterId);
            log.debug(declineMessage);
            return new RegistrationResponse.Failure(new FlinkException(declineMessage));
        }
    }, getMainThreadExecutor());
    // handle exceptions which might have occurred in one of the futures inputs of combine
    return registrationResponseFuture.handleAsync((RegistrationResponse registrationResponse, Throwable throwable) -> {
        if (throwable != null) {
            // Only include the stack trace when debug logging is enabled.
            if (log.isDebugEnabled()) {
                log.debug("Registration of job manager {}@{} failed.", jobMasterId, jobManagerAddress, throwable);
            } else {
                log.info("Registration of job manager {}@{} failed.", jobMasterId, jobManagerAddress);
            }
            return new RegistrationResponse.Failure(throwable);
        } else {
            return registrationResponse;
        }
    }, ioExecutor);
}
Also used : JobMasterId(org.apache.flink.runtime.jobmaster.JobMasterId) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) TimeoutException(java.util.concurrent.TimeoutException) CompletionException(java.util.concurrent.CompletionException) FlinkException(org.apache.flink.util.FlinkException) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) UnknownTaskExecutorException(org.apache.flink.runtime.resourcemanager.exceptions.UnknownTaskExecutorException) FlinkException(org.apache.flink.util.FlinkException)

Example 3 with JobMasterId

use of org.apache.flink.runtime.jobmaster.JobMasterId in project flink by apache.

the class TaskExecutor method internalOfferSlotsToJobManager.

/**
 * Offers all slots that are currently allocated for the connection's job to the job's
 * JobMaster.
 *
 * <p>A fresh offer id is generated and stored in {@code currentSlotOfferPerJob} for the job
 * before the offer is sent. The JobMaster's answer is processed on the main thread executor by
 * the handler returned from {@code handleAcceptedSlotOffers}. If the job has no allocated slots,
 * only a debug message is logged.
 *
 * @param jobManagerConnection established connection to the JobMaster of the job
 */
private void internalOfferSlotsToJobManager(JobTable.Connection jobManagerConnection) {
    final JobID jobId = jobManagerConnection.getJobId();

    if (!taskSlotTable.hasAllocatedSlots(jobId)) {
        log.debug("There are no unassigned slots for the job {}.", jobId);
        return;
    }

    log.info("Offer reserved slots to the leader of job {}.", jobId);

    final JobMasterGateway gateway = jobManagerConnection.getJobManagerGateway();
    final Iterator<TaskSlot<Task>> allocatedSlots = taskSlotTable.getAllocatedSlots(jobId);
    final JobMasterId masterId = jobManagerConnection.getJobMasterId();

    // Turn every allocated slot into an offer for the JobMaster.
    final Collection<SlotOffer> offers = new HashSet<>(2);
    allocatedSlots.forEachRemaining(slot -> offers.add(slot.generateSlotOffer()));

    // Record the id of this offer round for the job before the offer goes out.
    final UUID offerId = UUID.randomUUID();
    currentSlotOfferPerJob.put(jobId, offerId);

    gateway.offerSlots(getResourceID(), offers, taskManagerConfiguration.getRpcTimeout())
            .whenCompleteAsync(
                    handleAcceptedSlotOffers(jobId, gateway, masterId, offers, offerId),
                    getMainThreadExecutor());
}
Also used : SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) TaskSlot(org.apache.flink.runtime.taskexecutor.slot.TaskSlot) JobMasterId(org.apache.flink.runtime.jobmaster.JobMasterId) Collection(java.util.Collection) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) UUID(java.util.UUID) JobID(org.apache.flink.api.common.JobID) HashSet(java.util.HashSet)

Example 4 with JobMasterId

use of org.apache.flink.runtime.jobmaster.JobMasterId in project flink by apache.

the class DeclarativeSlotPoolBridgeTest method testIfJobIsRestartingAllOfferedSlotsWillBeRegistered.

/**
 * Checks that slots offered while the bridge is flagged as "job restarting" still reach the
 * slot pool's register-slots function.
 */
@Test
public void testIfJobIsRestartingAllOfferedSlotsWillBeRegistered() throws Exception {
    // Completed by the testing slot pool as soon as its register-slots function is invoked.
    final CompletableFuture<Void> registerSlotsInvoked = new CompletableFuture<>();
    final TestingDeclarativeSlotPoolFactory slotPoolFactory =
            new TestingDeclarativeSlotPoolFactory(
                    TestingDeclarativeSlotPool.builder()
                            .setRegisterSlotsFunction(
                                    (ignoredOffers, ignoredLocation, ignoredGateway, ignoredLong) ->
                                            registerSlotsInvoked.complete(null)));

    try (DeclarativeSlotPoolBridge bridge =
            createDeclarativeSlotPoolBridge(slotPoolFactory, requestSlotMatchingStrategy)) {
        bridge.start(jobMasterId, "localhost", mainThreadExecutor);
        bridge.setIsJobRestarting(true);

        final LocalTaskManagerLocation taskManagerLocation = new LocalTaskManagerLocation();
        bridge.registerTaskManager(taskManagerLocation.getResourceID());

        final SimpleAckingTaskManagerGateway taskManagerGateway = new SimpleAckingTaskManagerGateway();
        final SlotOffer singleOffer = new SlotOffer(new AllocationID(), 0, ResourceProfile.ANY);
        bridge.offerSlots(taskManagerLocation, taskManagerGateway, Collections.singleton(singleOffer));

        // make sure that the register slots method is called
        registerSlotsInvoked.join();
    }
}
Also used : ComponentMainThreadExecutorServiceAdapter(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter) CoreMatchers.is(org.hamcrest.CoreMatchers.is) NoResourceAvailableException(org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException) Arrays(java.util.Arrays) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) SystemClock(org.apache.flink.util.clock.SystemClock) LocalTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalTaskManagerLocation) FlinkMatchers(org.apache.flink.core.testutils.FlinkMatchers) RunWith(org.junit.runner.RunWith) ResourceCounter(org.apache.flink.runtime.util.ResourceCounter) CompletableFuture(java.util.concurrent.CompletableFuture) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) RpcTaskManagerGateway(org.apache.flink.runtime.jobmaster.RpcTaskManagerGateway) Duration(java.time.Duration) TestLogger(org.apache.flink.util.TestLogger) Assert.fail(org.junit.Assert.fail) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) SlotRequestId(org.apache.flink.runtime.jobmaster.SlotRequestId) Nonnull(javax.annotation.Nonnull) Parameterized(org.junit.runners.Parameterized) Collection(java.util.Collection) JobMasterId(org.apache.flink.runtime.jobmaster.JobMasterId) Test(org.junit.Test) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) SimpleAckingTaskManagerGateway(org.apache.flink.runtime.executiongraph.utils.SimpleAckingTaskManagerGateway) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) ExecutionException(java.util.concurrent.ExecutionException) List(java.util.List) JobID(org.apache.flink.api.common.JobID) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) Collections(java.util.Collections) Time(org.apache.flink.api.common.time.Time) 
AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) SimpleAckingTaskManagerGateway(org.apache.flink.runtime.executiongraph.utils.SimpleAckingTaskManagerGateway) CompletableFuture(java.util.concurrent.CompletableFuture) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) LocalTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalTaskManagerLocation) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) Test(org.junit.Test)

Example 5 with JobMasterId

use of org.apache.flink.runtime.jobmaster.JobMasterId in project flink by apache.

the class DeclarativeSlotPoolBridgeTest method testNoConcurrentModificationWhenSuspendingAndReleasingSlot.

/**
 * Requests two slots whose completion callbacks release the slot on failure, then closes the
 * bridge and expects waiting on the slot futures to fail with an {@link ExecutionException}.
 */
@Test
public void testNoConcurrentModificationWhenSuspendingAndReleasingSlot() throws Exception {
    try (DeclarativeSlotPoolBridge bridge =
            createDeclarativeSlotPoolBridge(
                    new DefaultDeclarativeSlotPoolFactory(), requestSlotMatchingStrategy)) {
        bridge.start(jobMasterId, "localhost", mainThreadExecutor);

        final List<SlotRequestId> requestIds =
                Arrays.asList(new SlotRequestId(), new SlotRequestId());

        final List<CompletableFuture<PhysicalSlot>> pendingSlots =
                requestIds.stream()
                        .map(
                                requestId -> {
                                    final CompletableFuture<PhysicalSlot> pending =
                                            bridge.requestNewAllocatedSlot(
                                                    requestId, ResourceProfile.UNKNOWN, rpcTimeout);
                                    // Release the slot again from within the failure callback.
                                    pending.whenComplete(
                                            (ignoredSlot, failure) -> {
                                                if (failure != null) {
                                                    bridge.releaseSlot(requestId, failure);
                                                }
                                            });
                                    return pending;
                                })
                        .collect(Collectors.toList());

        // Explicit close while the requests are still pending; try-with-resources closes again on exit.
        bridge.close();

        try {
            FutureUtils.waitForAll(pendingSlots).get();
            fail("The slot futures should be completed exceptionally.");
        } catch (ExecutionException expected) {
            // expected
        }
    }
}
Also used : SlotRequestId(org.apache.flink.runtime.jobmaster.SlotRequestId) ComponentMainThreadExecutorServiceAdapter(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter) CoreMatchers.is(org.hamcrest.CoreMatchers.is) NoResourceAvailableException(org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException) Arrays(java.util.Arrays) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) SystemClock(org.apache.flink.util.clock.SystemClock) LocalTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalTaskManagerLocation) FlinkMatchers(org.apache.flink.core.testutils.FlinkMatchers) RunWith(org.junit.runner.RunWith) ResourceCounter(org.apache.flink.runtime.util.ResourceCounter) CompletableFuture(java.util.concurrent.CompletableFuture) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) RpcTaskManagerGateway(org.apache.flink.runtime.jobmaster.RpcTaskManagerGateway) Duration(java.time.Duration) TestLogger(org.apache.flink.util.TestLogger) Assert.fail(org.junit.Assert.fail) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) SlotRequestId(org.apache.flink.runtime.jobmaster.SlotRequestId) Nonnull(javax.annotation.Nonnull) Parameterized(org.junit.runners.Parameterized) Collection(java.util.Collection) JobMasterId(org.apache.flink.runtime.jobmaster.JobMasterId) Test(org.junit.Test) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) SimpleAckingTaskManagerGateway(org.apache.flink.runtime.executiongraph.utils.SimpleAckingTaskManagerGateway) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) ExecutionException(java.util.concurrent.ExecutionException) List(java.util.List) JobID(org.apache.flink.api.common.JobID) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) Collections(java.util.Collections) 
Time(org.apache.flink.api.common.time.Time) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) CompletableFuture(java.util.concurrent.CompletableFuture) ExecutionException(java.util.concurrent.ExecutionException) Test(org.junit.Test)

Aggregations

JobMasterId (org.apache.flink.runtime.jobmaster.JobMasterId)16 Test (org.junit.Test)13 JobID (org.apache.flink.api.common.JobID)12 Time (org.apache.flink.api.common.time.Time)10 IOException (java.io.IOException)7 Collection (java.util.Collection)7 Collections (java.util.Collections)7 CompletableFuture (java.util.concurrent.CompletableFuture)7 TestLogger (org.apache.flink.util.TestLogger)7 List (java.util.List)6 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)6 FutureUtils (org.apache.flink.util.concurrent.FutureUtils)6 CoreMatchers.is (org.hamcrest.CoreMatchers.is)6 ExecutionException (java.util.concurrent.ExecutionException)5 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)5 Duration (java.time.Duration)4 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)4 Configuration (org.apache.flink.configuration.Configuration)4 TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor)4 Environment (org.apache.flink.runtime.execution.Environment)4