use of java.util.UUID in project flink by apache.
the class JobClientActorTest method testConnectionTimeoutAfterJobSubmission.
/**
 * Verifies that a {@link org.apache.flink.runtime.client.JobClientActorConnectionTimeoutException}
 * is raised when the JobManager dies after the job has been submitted successfully.
 *
 * <p>The test submits a job through a {@code JobSubmissionClientActor}, waits until the
 * JobManager stand-in has replied to a {@code RegisterTest} probe, stops the stand-in with a
 * {@code PoisonPill} and then waits for the job execution result.
 *
 * @throws Exception the expected connection timeout exception, rethrown while awaiting the result
 */
@Test(expected = JobClientActorConnectionTimeoutException.class)
public void testConnectionTimeoutAfterJobSubmission() throws Exception {
    final FiniteDuration clientActorTimeout = new FiniteDuration(5, TimeUnit.SECONDS);
    // The test itself waits twice as long as the client actor's own timeout.
    final FiniteDuration awaitTimeout = clientActorTimeout.$times(2);
    final Timeout askTimeout = new Timeout(awaitTimeout);
    final UUID sessionId = UUID.randomUUID();

    final ActorRef jobManagerActor = system.actorOf(Props.create(JobAcceptingActor.class, sessionId));
    final TestingLeaderRetrievalService leaderRetrieval =
        new TestingLeaderRetrievalService(jobManagerActor.path().toString(), sessionId);

    final ActorRef clientActor = system.actorOf(
        JobSubmissionClientActor.createActorProps(leaderRetrieval, clientActorTimeout, false, clientConfig));

    final Future<Object> resultFuture =
        Patterns.ask(clientActor, new JobClientMessages.SubmitJobAndWait(testJobGraph), askTimeout);

    // Do not stop the JobManager stand-in before it has replied to the RegisterTest probe.
    final Future<Object> probeFuture = Patterns.ask(jobManagerActor, new RegisterTest(), askTimeout);
    Await.result(probeFuture, awaitTimeout);

    jobManagerActor.tell(PoisonPill.getInstance(), ActorRef.noSender());

    // Expected to throw the connection timeout exception declared on @Test.
    Await.result(resultFuture, awaitTimeout);
}
use of java.util.UUID in project flink by apache.
the class JobClientActorTest method testSubmissionTimeout.
/**
 * Verifies that a {@link JobClientActorSubmissionTimeoutException} is raised when the
 * JobSubmissionClientActor is unable to submit the job. That is the case here because the
 * JobManager stand-in started by this test never replies to a {@link SubmitJob} message.
 *
 * @throws Exception the expected submission timeout exception, rethrown while awaiting the result
 */
@Test(expected = JobClientActorSubmissionTimeoutException.class)
public void testSubmissionTimeout() throws Exception {
    final FiniteDuration clientActorTimeout = new FiniteDuration(5, TimeUnit.SECONDS);
    // The test itself waits twice as long as the client actor's own timeout.
    final FiniteDuration awaitTimeout = clientActorTimeout.$times(2);
    final UUID sessionId = UUID.randomUUID();

    final ActorRef jobManagerActor = system.actorOf(Props.create(PlainActor.class, sessionId));
    final TestingLeaderRetrievalService leaderRetrieval =
        new TestingLeaderRetrievalService(jobManagerActor.path().toString(), sessionId);

    final ActorRef clientActor = system.actorOf(
        JobSubmissionClientActor.createActorProps(leaderRetrieval, clientActorTimeout, false, clientConfig));

    final Future<Object> resultFuture = Patterns.ask(
        clientActor,
        new JobClientMessages.SubmitJobAndWait(testJobGraph),
        new Timeout(awaitTimeout));

    // Expected to throw the submission timeout exception declared on @Test.
    Await.result(resultFuture, awaitTimeout);
}
use of java.util.UUID in project flink by apache.
the class AkkaKvStateLocationLookupServiceTest method testLeaderSessionIdChange.
/**
 * Tests that messages are properly decorated with the leader session ID, also after the
 * leader (and with it the leader session ID) has changed.
 *
 * <p>Two responder actors are created, each with its own leader session ID and its own expected
 * {@link KvStateLocation}. The lookup service is notified about the first and then about the
 * second leader; after each notification a lookup must return the location of the currently
 * announced leader and exactly one lookup message must have been received.
 *
 * @throws Exception if a lookup fails or does not complete within {@code TIMEOUT}
 */
@Test
public void testLeaderSessionIdChange() throws Exception {
    TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
    Queue<LookupKvStateLocation> received = new LinkedBlockingQueue<>();
    AkkaKvStateLocationLookupService lookupService = new AkkaKvStateLocationLookupService(
        leaderRetrievalService,
        testActorSystem,
        TIMEOUT,
        new AkkaKvStateLocationLookupService.DisabledLookupRetryStrategyFactory());
    lookupService.start();

    // Create test actors with random leader session IDs
    KvStateLocation expected1 = new KvStateLocation(new JobID(), new JobVertexID(), 8282, "salt");
    UUID leaderSessionId1 = UUID.randomUUID();
    ActorRef testActor1 = LookupResponseActor.create(received, leaderSessionId1, expected1);
    String testActorAddress1 = AkkaUtils.getAkkaURL(testActorSystem, testActor1);

    KvStateLocation expected2 = new KvStateLocation(new JobID(), new JobVertexID(), 22321, "pepper");
    UUID leaderSessionId2 = UUID.randomUUID();
    // The second actor must be created with the second session ID, i.e. the same ID it is
    // announced with below; previously it was created with leaderSessionId1.
    ActorRef testActor2 = LookupResponseActor.create(received, leaderSessionId2, expected2);
    String testActorAddress2 = AkkaUtils.getAkkaURL(testActorSystem, testActor2);

    JobID jobId = new JobID();

    //
    // Notify about first leader
    //
    leaderRetrievalService.notifyListener(testActorAddress1, leaderSessionId1);
    KvStateLocation location = Await.result(lookupService.getKvStateLookupInfo(jobId, "rock"), TIMEOUT);
    assertEquals(expected1, location);
    assertEquals(1, received.size());
    verifyLookupMsg(received.poll(), jobId, "rock");

    //
    // Notify about second leader
    //
    leaderRetrievalService.notifyListener(testActorAddress2, leaderSessionId2);
    location = Await.result(lookupService.getKvStateLookupInfo(jobId, "roll"), TIMEOUT);
    assertEquals(expected2, location);
    assertEquals(1, received.size());
    verifyLookupMsg(received.poll(), jobId, "roll");
}
use of java.util.UUID in project flink by apache.
the class TaskExecutor method offerSlotsToJobManager.
// ------------------------------------------------------------------------
// Internal job manager connection methods
// ------------------------------------------------------------------------
/**
 * Offers all slots which are currently allocated for the given job to the job manager this
 * task executor is connected to for that job.
 *
 * <p>Nothing is offered if there is no job manager connection for the job or if the slot table
 * holds no allocated slots for it. Each allocated slot is first marked active; a slot which
 * cannot be marked active is reported to the job manager via {@code failSlot} and is left out
 * of the offer. When the job manager answers and the connection is still valid, every offered
 * slot which is not part of the answer is freed. If the offer times out it is repeated; on any
 * other failure all offered slots are freed.
 *
 * @param jobId id of the job whose allocated slots are offered
 */
private void offerSlotsToJobManager(final JobID jobId) {
    final JobManagerConnection jobManagerConnection = jobManagerTable.get(jobId);
    if (jobManagerConnection == null) {
        log.debug("There is no job manager connection to the leader of job {}.", jobId);
    } else {
        if (taskSlotTable.hasAllocatedSlots(jobId)) {
            log.info("Offer reserved slots to the leader of job {}.", jobId);
            final JobMasterGateway jobMasterGateway = jobManagerConnection.getJobManagerGateway();
            final Iterator<TaskSlot> reservedSlotsIterator = taskSlotTable.getAllocatedSlots(jobId);
            // remember the leader id the offer is made to, so that late answers can be detected
            final UUID leaderId = jobManagerConnection.getLeaderId();
            final Collection<SlotOffer> reservedSlots = new HashSet<>(2);
            while (reservedSlotsIterator.hasNext()) {
                SlotOffer offer = reservedSlotsIterator.next().generateSlotOffer();
                try {
                    if (!taskSlotTable.markSlotActive(offer.getAllocationId())) {
                        // the slot is either free or releasing at the moment
                        final String message = "Could not mark slot " + offer.getAllocationId() + " active.";
                        log.debug(message);
                        jobMasterGateway.failSlot(getResourceID(), offer.getAllocationId(), leaderId, new Exception(message));
                        // do not offer a slot which has just been reported as failed
                        continue;
                    }
                } catch (SlotNotFoundException e) {
                    final String message = "Could not mark slot " + offer.getAllocationId() + " active.";
                    log.debug(message, e);
                    // keep the original exception as cause so that the failure reason is not lost
                    jobMasterGateway.failSlot(getResourceID(), offer.getAllocationId(), leaderId, new Exception(message, e));
                    continue;
                }
                reservedSlots.add(offer);
            }
            Future<Iterable<SlotOffer>> acceptedSlotsFuture = jobMasterGateway.offerSlots(getResourceID(), reservedSlots, leaderId, taskManagerConfiguration.getTimeout());
            acceptedSlotsFuture.thenAcceptAsync(new AcceptFunction<Iterable<SlotOffer>>() {
                @Override
                public void accept(Iterable<SlotOffer> acceptedSlots) {
                    // check if the response is still valid
                    if (isJobManagerConnectionValid(jobId, leaderId)) {
                        // drop the accepted offers; whatever remains afterwards was rejected
                        for (SlotOffer acceptedSlot : acceptedSlots) {
                            reservedSlots.remove(acceptedSlot);
                        }
                        final Exception e = new Exception("The slot was rejected by the JobManager.");
                        for (SlotOffer rejectedSlot : reservedSlots) {
                            freeSlot(rejectedSlot.getAllocationId(), e);
                        }
                    } else {
                        // discard the response since there is a new leader for the job
                        log.debug("Discard offer slot response since there is a new leader " + "for the job {}.", jobId);
                    }
                }
            }, getMainThreadExecutor());
            acceptedSlotsFuture.exceptionallyAsync(new ApplyFunction<Throwable, Void>() {
                @Override
                public Void apply(Throwable throwable) {
                    if (throwable instanceof TimeoutException) {
                        // We ran into a timeout. Try again.
                        offerSlotsToJobManager(jobId);
                    } else {
                        // We encountered an exception. Free the slots and return them to the RM.
                        for (SlotOffer reservedSlot : reservedSlots) {
                            freeSlot(reservedSlot.getAllocationId(), throwable);
                        }
                    }
                    return null;
                }
            }, getMainThreadExecutor());
        } else {
            log.debug("There are no unassigned slots for the job {}.", jobId);
        }
    }
}
use of java.util.UUID in project flink by apache.
the class ResourceManager method registerJobManager.
// ------------------------------------------------------------------------
// RPC methods
// ------------------------------------------------------------------------
/**
 * Registers the job manager of the given job at this resource manager.
 *
 * <p>The request is declined immediately if {@code resourceManagerLeaderId} is not valid.
 * Otherwise the job is added to the job leader id service (unless it is already contained), a
 * connection to the job manager is opened and combined with the job's leader id obtained from
 * the job leader id service. The registration succeeds only if the resource manager leader id
 * is still valid at that point and the obtained job leader id equals
 * {@code jobManagerLeaderId}. A registration of a different leader for the same job replaces
 * the old one, after the old job manager has been disconnected. Failures of the connection or
 * of the leader id future are turned into a {@code RegistrationResponse.Decline}.
 *
 * @param resourceManagerLeaderId leader id the caller assumes for this resource manager
 * @param jobManagerLeaderId leader id of the registering job manager
 * @param jobManagerAddress address under which the job manager can be reached
 * @param jobId id of the job the job manager is responsible for
 * @return future with the registration response; it is completed exceptionally if the job
 *         could not be added to the job leader id service or if the job leader id future could
 *         not be obtained
 */
@RpcMethod
public Future<RegistrationResponse> registerJobManager(final UUID resourceManagerLeaderId, final UUID jobManagerLeaderId, final String jobManagerAddress, final JobID jobId) {
    checkNotNull(resourceManagerLeaderId);
    checkNotNull(jobManagerLeaderId);
    checkNotNull(jobManagerAddress);
    checkNotNull(jobId);
    if (isValid(resourceManagerLeaderId)) {
        if (!jobLeaderIdService.containsJob(jobId)) {
            try {
                jobLeaderIdService.addJob(jobId);
            } catch (Exception e) {
                ResourceManagerException exception = new ResourceManagerException("Could not add the job " + jobId + " to the job id leader service.", e);
                onFatalErrorAsync(exception);
                log.error("Could not add job {} to job leader id service.", jobId, e);
                return FlinkCompletableFuture.completedExceptionally(exception);
            }
        }
        log.info("Registering job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);
        Future<UUID> jobLeaderIdFuture;
        try {
            jobLeaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
        } catch (Exception e) {
            // we cannot check the job leader id so let's fail
            // TODO: Maybe it's also ok to skip this check in case that we cannot check the leader id
            ResourceManagerException exception = new ResourceManagerException("Cannot obtain the " + "job leader id future to verify the correct job leader.", e);
            onFatalErrorAsync(exception);
            // this is reported as a fatal error, so log it like the failure above: at error
            // level and together with its cause
            log.error("Could not obtain the job leader id future to verify the correct job leader.", e);
            return FlinkCompletableFuture.completedExceptionally(exception);
        }
        Future<JobMasterGateway> jobMasterGatewayFuture = getRpcService().connect(jobManagerAddress, JobMasterGateway.class);
        Future<RegistrationResponse> registrationResponseFuture = jobMasterGatewayFuture.thenCombineAsync(jobLeaderIdFuture, new BiFunction<JobMasterGateway, UUID, RegistrationResponse>() {
            @Override
            public RegistrationResponse apply(JobMasterGateway jobMasterGateway, UUID jobLeaderId) {
                // the resource manager leader id is checked a second time here, when both futures are done
                if (isValid(resourceManagerLeaderId)) {
                    if (jobLeaderId.equals(jobManagerLeaderId)) {
                        if (jobManagerRegistrations.containsKey(jobId)) {
                            JobManagerRegistration oldJobManagerRegistration = jobManagerRegistrations.get(jobId);
                            if (oldJobManagerRegistration.getLeaderID().equals(jobLeaderId)) {
                                // same registration
                                log.debug("Job manager {}@{} was already registered.", jobManagerLeaderId, jobManagerAddress);
                            } else {
                                // tell old job manager that he is no longer the job leader
                                disconnectJobManager(oldJobManagerRegistration.getJobID(), new Exception("New job leader for job " + jobId + " found."));
                                JobManagerRegistration jobManagerRegistration = new JobManagerRegistration(jobId, jobLeaderId, jobMasterGateway);
                                jobManagerRegistrations.put(jobId, jobManagerRegistration);
                            }
                        } else {
                            // new registration for the job
                            JobManagerRegistration jobManagerRegistration = new JobManagerRegistration(jobId, jobLeaderId, jobMasterGateway);
                            jobManagerRegistrations.put(jobId, jobManagerRegistration);
                        }
                        log.info("Registered job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);
                        return new JobMasterRegistrationSuccess(resourceManagerConfiguration.getHeartbeatInterval().toMilliseconds(), getLeaderSessionId());
                    } else {
                        log.debug("The job manager leader id {} did not match the job " + "leader id {}.", jobManagerLeaderId, jobLeaderId);
                        return new RegistrationResponse.Decline("Job manager leader id did not match.");
                    }
                } else {
                    log.debug("The resource manager leader id changed {}. Discarding job " + "manager registration from {}.", getLeaderSessionId(), jobManagerAddress);
                    return new RegistrationResponse.Decline("Resource manager leader id changed.");
                }
            }
        }, getMainThreadExecutor());
        // handle exceptions which might have occurred in one of the futures inputs of combine
        return registrationResponseFuture.handleAsync(new BiFunction<RegistrationResponse, Throwable, RegistrationResponse>() {
            @Override
            public RegistrationResponse apply(RegistrationResponse registrationResponse, Throwable throwable) {
                if (throwable != null) {
                    // include the stack trace only when debug logging is enabled
                    if (log.isDebugEnabled()) {
                        log.debug("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress, throwable);
                    } else {
                        log.info("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress);
                    }
                    return new RegistrationResponse.Decline(throwable.getMessage());
                } else {
                    return registrationResponse;
                }
            }
        }, getRpcService().getExecutor());
    } else {
        log.debug("Discard register job manager message from {}, because the leader id " + "{} did not match the expected leader id {}.", jobManagerAddress, resourceManagerLeaderId, leaderSessionId);
        return FlinkCompletableFuture.<RegistrationResponse>completed(new RegistrationResponse.Decline("Resource manager leader id did not match."));
    }
}
Aggregations