Use of org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation in the flink project by Apache.
Class TaskExecutorTest, method testInitialSlotReportFailure.
/**
 * Tests that the {@link TaskExecutor} tries to reconnect if the initial slot report fails.
 *
 * <p>The testing ResourceManager gateway answers the first slot report with a failure and the
 * second one with an acknowledgement. The test then waits until the register function of the
 * gateway has been invoked twice.
 */
@Test
public void testInitialSlotReportFailure() throws Exception {
    final TaskSlotTable<Task> taskSlotTable = TaskSlotUtils.createTaskSlotTable(1);
    final UnresolvedTaskManagerLocation unresolvedTaskManagerLocation = new LocalUnresolvedTaskManagerLocation();
    final TaskManagerServices taskManagerServices =
            new TaskManagerServicesBuilder()
                    .setTaskSlotTable(taskSlotTable)
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .build();
    final TaskExecutor taskExecutor = createTaskExecutor(taskManagerServices);
    taskExecutor.start();
    try {
        final TestingResourceManagerGateway testingResourceManagerGateway = new TestingResourceManagerGateway();

        // responses handed out, in FIFO order, to successive slot reports
        final BlockingQueue<CompletableFuture<Acknowledge>> responseQueue = new ArrayBlockingQueue<>(2);
        testingResourceManagerGateway.setSendSlotReportFunction(resourceIDInstanceIDSlotReportTuple3 -> {
            try {
                return responseQueue.take();
            } catch (InterruptedException e) {
                // restore the interrupt flag so the calling thread can still observe the
                // interruption after we have translated it into a failed future
                Thread.currentThread().interrupt();
                return FutureUtils.completedExceptionally(e);
            }
        });

        final CompletableFuture<RegistrationResponse> registrationResponse =
                CompletableFuture.completedFuture(
                        new TaskExecutorRegistrationSuccess(
                                new InstanceID(),
                                testingResourceManagerGateway.getOwnResourceId(),
                                new ClusterInformation("foobar", 1234)));

        // counts the registration attempts; two are expected (initial one + reconnect)
        final CountDownLatch numberRegistrations = new CountDownLatch(2);
        testingResourceManagerGateway.setRegisterTaskExecutorFunction(taskExecutorRegistration -> {
            numberRegistrations.countDown();
            return registrationResponse;
        });

        // first slot report fails, second one is acknowledged
        responseQueue.offer(FutureUtils.completedExceptionally(new FlinkException("Test exception")));
        responseQueue.offer(CompletableFuture.completedFuture(Acknowledge.get()));

        rpc.registerGateway(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);
        resourceManagerLeaderRetriever.notifyListener(
                testingResourceManagerGateway.getAddress(),
                testingResourceManagerGateway.getFencingToken().toUUID());

        // wait for the second registration attempt
        numberRegistrations.await();
    } finally {
        RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
    }
}
Use of org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation in the flink project by Apache.
Class TaskExecutorTest, method setup.
/**
 * Creates fresh RPC, configuration, high-availability and shuffle fixtures before each test.
 */
@Before
public void setup() throws IOException {
    rpc = new TestingRpcService();

    // configuration adjusted for local execution
    configuration = new Configuration();
    TaskExecutorResourceUtils.adjustForLocalExecution(configuration);

    unresolvedTaskManagerLocation = new LocalUnresolvedTaskManagerLocation();
    testingFatalErrorHandler = new TestingFatalErrorHandler();

    // two job ids, each paired with its own settable JobMaster leader retriever
    jobId = new JobID();
    jobManagerLeaderRetriever = new SettableLeaderRetrievalService();
    jobId2 = new JobID();
    jobManagerLeaderRetriever2 = new SettableLeaderRetrievalService();
    resourceManagerLeaderRetriever = new SettableLeaderRetrievalService();

    // wire all leader retrievers into the testing HA services
    haServices = new TestingHighAvailabilityServices();
    haServices.setResourceManagerLeaderRetriever(resourceManagerLeaderRetriever);
    haServices.setJobMasterLeaderRetriever(jobId, jobManagerLeaderRetriever);
    haServices.setJobMasterLeaderRetriever(jobId2, jobManagerLeaderRetriever2);

    nettyShuffleEnvironment = new NettyShuffleEnvironmentBuilder().build();
}
Use of org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation in the flink project by Apache.
Class JobMasterTest, method testRequestPartitionState.
/**
 * Tests the {@link JobMaster#requestPartitionState(IntermediateDataSetID, ResultPartitionID)}
 * call for a finished result partition.
 *
 * <p>After the producer task has been reported as finished, the partition state request must
 * return {@link ExecutionState#FINISHED}. Requests with an unknown result partition, a wrong
 * intermediate data set id or a different execution attempt must fail.
 */
@Test
public void testRequestPartitionState() throws Exception {
    final JobGraph producerConsumerJobGraph = producerConsumerJobGraph();
    final JobMaster jobMaster =
            new JobMasterBuilder(producerConsumerJobGraph, rpcService)
                    .withConfiguration(configuration)
                    .withHighAvailabilityServices(haServices)
                    .withHeartbeatServices(heartbeatServices)
                    .createJobMaster();
    jobMaster.start();
    try {
        // captures the first task deployment descriptor submitted to the TaskExecutor
        final CompletableFuture<TaskDeploymentDescriptor> tddFuture = new CompletableFuture<>();
        final TestingTaskExecutorGateway testingTaskExecutorGateway =
                new TestingTaskExecutorGatewayBuilder()
                        .setSubmitTaskConsumer((taskDeploymentDescriptor, jobMasterId) -> {
                            tddFuture.complete(taskDeploymentDescriptor);
                            return CompletableFuture.completedFuture(Acknowledge.get());
                        })
                        .createTestingTaskExecutorGateway();
        final LocalUnresolvedTaskManagerLocation taskManagerLocation = new LocalUnresolvedTaskManagerLocation();
        final JobMasterGateway jobMasterGateway = jobMaster.getSelfGateway(JobMasterGateway.class);
        final Collection<SlotOffer> slotOffers =
                registerSlotsAtJobMaster(
                        1,
                        jobMasterGateway,
                        producerConsumerJobGraph.getJobID(),
                        testingTaskExecutorGateway,
                        taskManagerLocation);
        assertThat(slotOffers, hasSize(1));

        // obtain tdd for the result partition ids
        final TaskDeploymentDescriptor tdd = tddFuture.get();
        assertThat(tdd.getProducedPartitions(), hasSize(1));
        final ResultPartitionDeploymentDescriptor partition = tdd.getProducedPartitions().iterator().next();
        final ExecutionAttemptID executionAttemptId = tdd.getExecutionAttemptId();
        final ExecutionAttemptID copiedExecutionAttemptId = new ExecutionAttemptID(executionAttemptId);

        // finish the producer task
        jobMasterGateway.updateTaskExecutionState(new TaskExecutionState(executionAttemptId, ExecutionState.FINISHED)).get();

        // request the state of the result partition of the producer
        final ResultPartitionID partitionId = new ResultPartitionID(partition.getPartitionId(), copiedExecutionAttemptId);
        final CompletableFuture<ExecutionState> partitionStateFuture =
                jobMasterGateway.requestPartitionState(partition.getResultId(), partitionId);
        assertThat(partitionStateFuture.get(), equalTo(ExecutionState.FINISHED));

        // ask for unknown result partition
        assertPartitionStateRequestFailsWith(
                jobMasterGateway.requestPartitionState(partition.getResultId(), new ResultPartitionID()),
                IllegalArgumentException.class);

        // ask for wrong intermediate data set id
        assertPartitionStateRequestFailsWith(
                jobMasterGateway.requestPartitionState(new IntermediateDataSetID(), partitionId),
                IllegalArgumentException.class);

        // ask for "old" execution
        assertPartitionStateRequestFailsWith(
                jobMasterGateway.requestPartitionState(
                        partition.getResultId(),
                        new ResultPartitionID(partition.getPartitionId(), new ExecutionAttemptID())),
                PartitionProducerDisposedException.class);
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}

/**
 * Asserts that the given partition state request completes exceptionally and that a throwable
 * of the expected type can be found in the resulting {@link ExecutionException}.
 *
 * @param partitionStateFuture future returned by the partition state request
 * @param expectedCause type of throwable that must be found in the failure
 * @throws InterruptedException if the calling thread is interrupted while waiting for the future
 */
private static void assertPartitionStateRequestFailsWith(
        CompletableFuture<ExecutionState> partitionStateFuture,
        Class<? extends Throwable> expectedCause) throws InterruptedException {
    try {
        partitionStateFuture.get();
        fail("Expected failure.");
    } catch (ExecutionException e) {
        assertThat(ExceptionUtils.findThrowable(e, expectedCause).isPresent(), is(true));
    }
}
Use of org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation in the flink project by Apache.
Class JobMasterTest, method testTaskExecutorNotReleasedOnFailedAllocationIfPartitionIsAllocated.
/**
 * Tests that failing an accepted slot frees that slot on the TaskExecutor gateway but does not
 * trigger a disconnect of the gateway while the partition tracker reports tracked partitions.
 */
@Test
public void testTaskExecutorNotReleasedOnFailedAllocationIfPartitionIsAllocated() throws Exception {
    final JobGraph jobGraph = JobGraphTestUtils.singleNoOpJobGraph();
    final JobManagerSharedServices sharedServices = new TestingJobManagerSharedServicesBuilder().build();
    final LocalUnresolvedTaskManagerLocation taskManagerLocation = new LocalUnresolvedTaskManagerLocation();

    // the tracker answers "still tracking" regardless of its argument
    final AtomicBoolean trackingPartitions = new AtomicBoolean(true);
    final TestingJobMasterPartitionTracker partitionTracker = new TestingJobMasterPartitionTracker();
    partitionTracker.setIsTrackingPartitionsForFunction(ignored -> trackingPartitions.get());

    final JobMaster jobMaster =
            new JobMasterBuilder(jobGraph, rpcService)
                    .withConfiguration(configuration)
                    .withHighAvailabilityServices(haServices)
                    .withJobManagerSharedServices(sharedServices)
                    .withHeartbeatServices(heartbeatServices)
                    .withPartitionTrackerFactory(ignored -> partitionTracker)
                    .createJobMaster();

    // futures recording what the JobMaster asks of the TaskExecutor gateway
    final CompletableFuture<JobID> disconnectFuture = new CompletableFuture<>();
    final CompletableFuture<AllocationID> freedSlot = new CompletableFuture<>();
    final TestingTaskExecutorGateway taskExecutorGateway =
            new TestingTaskExecutorGatewayBuilder()
                    .setFreeSlotFunction((allocationID, throwable) -> {
                        freedSlot.complete(allocationID);
                        return CompletableFuture.completedFuture(Acknowledge.get());
                    })
                    .setDisconnectJobManagerConsumer((jobID, throwable) -> disconnectFuture.complete(jobID))
                    .createTestingTaskExecutorGateway();

    try {
        jobMaster.start();
        final JobMasterGateway gateway = jobMaster.getSelfGateway(JobMasterGateway.class);
        final Collection<SlotOffer> acceptedOffers =
                registerSlotsAtJobMaster(1, gateway, jobGraph.getJobID(), taskExecutorGateway, taskManagerLocation);

        // check that we accepted the offered slot
        assertThat(acceptedOffers, hasSize(1));
        final AllocationID failedAllocation = acceptedOffers.iterator().next().getAllocationId();

        gateway.failSlot(
                taskManagerLocation.getResourceID(),
                failedAllocation,
                new FlinkException("Fail allocation test exception"));

        // we should free the slot, but not disconnect from the TaskExecutor as we still have an
        // allocated partition
        assertThat(freedSlot.get(), equalTo(failedAllocation));

        // trigger another request and wait for it, so that the processing of the slot
        // allocation failure is complete before checking for a disconnect
        gateway.requestJobStatus(Time.seconds(5)).get();
        assertThat(disconnectFuture.isDone(), is(false));
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}
Use of org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation in the flink project by Apache.
Class JobMasterExecutionDeploymentReconciliationTest, method testExecutionDeploymentReconciliation.
/**
 * Tests how the job master handles unknown/missing executions.
 *
 * <p>A heartbeat is sent whose deployment report omits the deployed execution and instead lists
 * an unknown one. The test expects the unknown execution to be cancelled on the TaskExecutor
 * gateway, tracking of the deployed execution to be stopped, and the job to end up
 * {@link JobStatus#FAILED}.
 */
@Test
public void testExecutionDeploymentReconciliation() throws Exception {
    final JobMasterBuilder.TestingOnCompletionActions completionActions =
            new JobMasterBuilder.TestingOnCompletionActions();
    final TestingExecutionDeploymentTrackerWrapper trackerWrapper =
            new TestingExecutionDeploymentTrackerWrapper();
    final JobGraph jobGraph = JobGraphTestUtils.singleNoOpJobGraph();

    final JobMaster jobMaster = createAndStartJobMaster(completionActions, trackerWrapper, jobGraph);
    final JobMasterGateway gateway = jobMaster.getSelfGateway(JobMasterGateway.class);
    RPC_SERVICE_RESOURCE.getTestingRpcService().registerGateway(gateway.getAddress(), gateway);

    final CompletableFuture<ExecutionAttemptID> taskCancellationFuture = new CompletableFuture<>();
    final TaskExecutorGateway taskExecutorGateway = createTaskExecutorGateway(taskCancellationFuture);
    final LocalUnresolvedTaskManagerLocation taskManagerLocation = new LocalUnresolvedTaskManagerLocation();
    registerTaskExecutorAndOfferSlots(gateway, jobGraph.getJobID(), taskExecutorGateway, taskManagerLocation);

    final ExecutionAttemptID deployedExecution = trackerWrapper.getTaskDeploymentFuture().get();
    assertFalse(taskCancellationFuture.isDone());

    // the deployment report is missing the just deployed task, but contains the ID of some
    // other unknown deployment
    final ExecutionAttemptID unknownDeployment = new ExecutionAttemptID();
    final ExecutionDeploymentReport deploymentReport =
            new ExecutionDeploymentReport(Collections.singleton(unknownDeployment));
    final TaskExecutorToJobManagerHeartbeatPayload heartbeatPayload =
            new TaskExecutorToJobManagerHeartbeatPayload(
                    new AccumulatorReport(Collections.emptyList()), deploymentReport);

    // the job master should cancel the unknown deployment, and fail the job
    gateway.heartbeatFromTaskManager(taskManagerLocation.getResourceID(), heartbeatPayload);

    assertThat(taskCancellationFuture.get(), is(unknownDeployment));
    assertThat(trackerWrapper.getStopFuture().get(), is(deployedExecution));

    final JobStatus terminalStatus =
            completionActions
                    .getJobReachedGloballyTerminalStateFuture()
                    .get()
                    .getArchivedExecutionGraph()
                    .getState();
    assertThat(terminalStatus, is(JobStatus.FAILED));
}
Aggregations