use of org.apache.flink.runtime.dispatcher.DispatcherGateway in project flink by apache.
the class EmbeddedExecutor method submitJob.
/**
 * Uploads the files of the given job graph to the dispatcher's blob server and then submits
 * the job graph to the dispatcher.
 *
 * <p>The blob server port is requested from the dispatcher first and combined with the
 * dispatcher's hostname to form the blob server address. A {@link FlinkException} raised
 * during the upload is rethrown wrapped in a {@link CompletionException}, so it fails the
 * returned future.
 *
 * @param configuration configuration passed to the {@link BlobClient} used for the upload
 * @param dispatcherGateway gateway used to look up the blob server port and submit the job
 * @param jobGraph job graph to submit; must not be {@code null}
 * @param rpcTimeout timeout passed to the gateway calls
 * @return future that yields the job's id once the dispatcher's submit call has completed
 */
private static CompletableFuture<JobID> submitJob(
        final Configuration configuration,
        final DispatcherGateway dispatcherGateway,
        final JobGraph jobGraph,
        final Time rpcTimeout) {
    checkNotNull(jobGraph);

    LOG.info("Submitting Job with JobId={}.", jobGraph.getJobID());

    // Stage 1: resolve the address of the blob server that belongs to the dispatcher.
    final CompletableFuture<InetSocketAddress> blobServerAddressFuture =
            dispatcherGateway
                    .getBlobServerPort(rpcTimeout)
                    .thenApply(
                            port ->
                                    new InetSocketAddress(
                                            dispatcherGateway.getHostname(), port));

    // Stage 2: upload the job's files, then submit; the acknowledgement itself is dropped
    // in favour of the job id.
    return blobServerAddressFuture
            .thenCompose(
                    address -> {
                        try {
                            ClientUtils.extractAndUploadJobGraphFiles(
                                    jobGraph, () -> new BlobClient(address, configuration));
                        } catch (FlinkException uploadFailure) {
                            throw new CompletionException(uploadFailure);
                        }
                        return dispatcherGateway.submitJob(jobGraph, rpcTimeout);
                    })
            .thenApply(ignoredAck -> jobGraph.getJobID());
}
use of org.apache.flink.runtime.dispatcher.DispatcherGateway in project flink by apache.
the class ApplicationDispatcherBootstrapTest method testClusterIsShutdownInAttachedModeWhenJobCancelled.
/**
 * Runs an {@link ApplicationDispatcherBootstrap} in attached mode against the gateway built
 * from {@code canceledJobGatewayBuilder()} and checks two things: the application completion
 * future is checked with {@code assertException} against
 * {@link UnsuccessfulExecutionException}, and the status handed to the gateway's
 * cluster-shutdown function is {@link ApplicationStatus#CANCELED}.
 */
@Test
public void testClusterIsShutdownInAttachedModeWhenJobCancelled() throws Exception {
    // Completed by the gateway's shutdown function so the test can inspect the status.
    final CompletableFuture<ApplicationStatus> clusterShutdown = new CompletableFuture<>();

    final TestingDispatcherGateway dispatcherGateway =
            canceledJobGatewayBuilder()
                    .setClusterShutdownFunction(
                            status -> {
                                clusterShutdown.complete(status);
                                return CompletableFuture.completedFuture(Acknowledge.get());
                            })
                    .build();

    final PackagedProgram program = getProgram(2);

    final Configuration configuration = getConfiguration();
    configuration.set(DeploymentOptions.ATTACHED, true);

    final ApplicationDispatcherBootstrap bootstrap =
            new ApplicationDispatcherBootstrap(
                    program,
                    Collections.emptyList(),
                    configuration,
                    dispatcherGateway,
                    scheduledExecutor,
                    e -> {});

    final CompletableFuture<Void> applicationFuture =
            bootstrap.getApplicationCompletionFuture();
    assertException(applicationFuture, UnsuccessfulExecutionException.class);

    // JUnit's contract is assertEquals(expected, actual); keeping that order makes the
    // failure message report the right value as "expected".
    assertEquals(ApplicationStatus.CANCELED, clusterShutdown.get());
}
use of org.apache.flink.runtime.dispatcher.DispatcherGateway in project flink by apache.
the class ApplicationDispatcherBootstrapTest method testSubmitFailedJobOnApplicationError.
/**
 * Bootstraps {@code FailingJob.getProgram()} against a testing gateway and passes the job id
 * and throwable received by the gateway's submit-failed function to the given assertion.
 *
 * <p>If the assertion throws, the throwable fails both the future returned by the
 * submit-failed function and the internal future that the job-status and job-result
 * functions are chained on.
 *
 * @param configuration configuration handed to the {@link ApplicationDispatcherBootstrap}
 * @param failedJobAssertion check applied to the job id and throwable reported to the gateway
 * @throws Exception if waiting on the bootstrap completion future fails
 */
private void testSubmitFailedJobOnApplicationError(
        Configuration configuration, BiConsumer<JobID, Throwable> failedJobAssertion)
        throws Exception {
    // Completes once the failed job was reported and the caller's assertion passed.
    final CompletableFuture<Void> failedJobReported = new CompletableFuture<>();

    final TestingDispatcherGateway dispatcherGateway =
            TestingDispatcherGateway.newBuilder()
                    .setSubmitFailedFunction(
                            (failedJobId, failedJobName, failureCause) -> {
                                try {
                                    failedJobAssertion.accept(failedJobId, failureCause);
                                    failedJobReported.complete(null);
                                    return CompletableFuture.completedFuture(Acknowledge.get());
                                } catch (Throwable assertionFailure) {
                                    failedJobReported.completeExceptionally(assertionFailure);
                                    return FutureUtils.completedExceptionally(assertionFailure);
                                }
                            })
                    .setRequestJobStatusFunction(
                            id -> failedJobReported.thenApply(unused -> JobStatus.FAILED))
                    .setRequestJobResultFunction(
                            id ->
                                    failedJobReported.thenApply(
                                            unused ->
                                                    createJobResult(
                                                            id, ApplicationStatus.FAILED)))
                    .build();

    // Block until the bootstrap has finished.
    new ApplicationDispatcherBootstrap(
                    FailingJob.getProgram(),
                    Collections.emptyList(),
                    configuration,
                    dispatcherGateway,
                    scheduledExecutor,
                    error -> {})
            .getBootstrapCompletionFuture()
            .get();
}
use of org.apache.flink.runtime.dispatcher.DispatcherGateway in project flink by apache.
the class JobManagerHAProcessFailureRecoveryITCase method testDispatcherProcessFailure.
/**
 * Destroys the running dispatcher process while a program is executing and checks that the
 * program still finishes.
 *
 * <p>Sequence, as visible in this method: start dispatcher process 0 and the task manager
 * runners, wait for a dispatcher leader and connect to it, wait for the task managers, run
 * {@code testJobManagerFailure(...)} in a separate thread, wait for the ready marker files,
 * destroy dispatcher process 0 and start process 1, create the proceed marker file, then wait
 * for the trigger thread and the finish marker file. The test fails if the trigger thread is
 * still alive afterwards or recorded a throwable.
 *
 * <p>All started components are shut down in the {@code finally} block.
 */
@Test
public void testDispatcherProcessFailure() throws Exception {
final Time timeout = Time.seconds(30L);
final File zookeeperStoragePath = temporaryFolder.newFolder();
// Config
final int numberOfJobManagers = 2;
final int numberOfTaskManagers = 2;
final int numberOfSlotsPerTaskManager = 2;
// Sanity check: the configured slots must add up to the PARALLELISM constant.
assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);
// Job managers
final DispatcherProcess[] dispatcherProcesses = new DispatcherProcess[numberOfJobManagers];
// Task managers
TaskManagerRunner[] taskManagerRunners = new TaskManagerRunner[numberOfTaskManagers];
// Declared outside the try block (initially null) so the finally block can clean them up.
HighAvailabilityServices highAvailabilityServices = null;
LeaderRetrievalService leaderRetrievalService = null;
// Coordination between the processes goes through a directory
File coordinateTempDir = null;
// Cluster config
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeper.getConnectString(), zookeeperStoragePath.getPath());
// Task manager configuration
config.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("4m"));
config.set(TaskManagerOptions.NETWORK_MEMORY_MIN, MemorySize.parse("3200k"));
config.set(TaskManagerOptions.NETWORK_MEMORY_MAX, MemorySize.parse("3200k"));
config.set(NettyShuffleEnvironmentOptions.NETWORK_SORT_SHUFFLE_MIN_BUFFERS, 16);
// NOTE(review): literal 2 here duplicates numberOfSlotsPerTaskManager above - keep in sync.
config.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, 2);
config.set(TaskManagerOptions.TASK_HEAP_MEMORY, MemorySize.parse("128m"));
config.set(TaskManagerOptions.CPU_CORES, 1.0);
TaskExecutorResourceUtils.adjustForLocalExecution(config);
// RPC service of the test itself; used below to connect to the leading dispatcher.
final RpcService rpcService = RpcSystem.load().remoteServiceBuilder(config, "localhost", "0").createAndStart();
try {
// All waits below are bounded by the time left on this single deadline.
final Deadline deadline = Deadline.fromNow(TEST_TIMEOUT);
// Coordination directory
coordinateTempDir = temporaryFolder.newFolder();
// Start first process
dispatcherProcesses[0] = new DispatcherProcess(0, config);
dispatcherProcesses[0].startProcess();
highAvailabilityServices = HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(config, TestingUtils.defaultExecutor(), NoOpFatalErrorHandler.INSTANCE);
final PluginManager pluginManager = PluginUtils.createPluginManagerFromRootFolder(config);
// Start the task manager process
for (int i = 0; i < numberOfTaskManagers; i++) {
taskManagerRunners[i] = new TaskManagerRunner(config, pluginManager, TaskManagerRunner::createTaskExecutorService);
taskManagerRunners[i].start();
}
// Leader listener
TestingListener leaderListener = new TestingListener();
leaderRetrievalService = highAvailabilityServices.getDispatcherLeaderRetriever();
leaderRetrievalService.start(leaderListener);
// Initial submission
leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
final CompletableFuture<DispatcherGateway> dispatcherGatewayFuture = rpcService.connect(leaderAddress, DispatcherId.fromUuid(leaderId), DispatcherGateway.class);
final DispatcherGateway dispatcherGateway = dispatcherGatewayFuture.get();
// Wait for all task managers to connect to the leading job manager
waitForTaskManagers(numberOfTaskManagers, dispatcherGateway, deadline.timeLeft());
// Final copy, because coordinateTempDir is reassigned and cannot be captured by the thread.
final File coordinateDirClosure = coordinateTempDir;
// Single-element array so the trigger thread can hand a failure back to this thread.
final Throwable[] errorRef = new Throwable[1];
// we trigger program execution in a separate thread
Thread programTrigger = new Thread("Program Trigger") {
@Override
public void run() {
try {
testJobManagerFailure(zooKeeper.getConnectString(), coordinateDirClosure, zookeeperStoragePath);
} catch (Throwable t) {
t.printStackTrace();
errorRef[0] = t;
}
}
};
// start the test program
programTrigger.start();
// wait until all marker files are in place, indicating that all tasks have started
AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());
// Kill one of the job managers and trigger recovery
dispatcherProcesses[0].destroy();
dispatcherProcesses[1] = new DispatcherProcess(1, config);
dispatcherProcesses[1].startProcess();
// we create the marker file which signals the program functions tasks that they can
// complete
AbstractTaskManagerProcessFailureRecoveryTest.touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));
// Bounded join: it may return while the thread is still running; liveness is asserted below.
programTrigger.join(deadline.timeLeft().toMillis());
// We wait for the finish marker file. We don't wait for the program trigger, because
// we submit in detached mode.
AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());
// check that the program really finished
assertFalse("The program did not finish in time", programTrigger.isAlive());
// check whether the program encountered an error
if (errorRef[0] != null) {
Throwable error = errorRef[0];
error.printStackTrace();
fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
}
} catch (Throwable t) {
// Print early (in some situations the process logs get too big
// for Travis and the root problem is not shown)
t.printStackTrace();
for (DispatcherProcess p : dispatcherProcesses) {
if (p != null) {
p.printProcessLog();
}
}
// Rethrow so the failure still fails the test after the logs were printed.
throw t;
} finally {
// Teardown of everything started above; null checks cover components never created.
for (int i = 0; i < numberOfTaskManagers; i++) {
if (taskManagerRunners[i] != null) {
taskManagerRunners[i].close();
}
}
if (leaderRetrievalService != null) {
leaderRetrievalService.stop();
}
for (DispatcherProcess dispatcherProcess : dispatcherProcesses) {
if (dispatcherProcess != null) {
dispatcherProcess.destroy();
}
}
if (highAvailabilityServices != null) {
highAvailabilityServices.closeAndCleanupAllData();
}
RpcUtils.terminateRpcService(rpcService, timeout);
// Delete coordination directory
if (coordinateTempDir != null) {
try {
FileUtils.deleteDirectory(coordinateTempDir);
} catch (Throwable ignored) {
// Best-effort cleanup: a failure to delete the directory is deliberately ignored.
}
}
}
}
use of org.apache.flink.runtime.dispatcher.DispatcherGateway in project flink by apache.
the class JobSubmitHandlerTest method testFailedJobSubmission.
/**
 * Checks that a failure reported by the dispatcher gateway's submit function surfaces from
 * {@link JobSubmitHandler#handleRequest} with the original error message.
 *
 * <p>The gateway is built so that its submit function returns a future failed with an
 * exception carrying {@code errorMessage}. An empty job graph is serialized to a temporary
 * file and passed to the handler as the uploaded job graph file.
 */
@Test
public void testFailedJobSubmission() throws Exception {
    final String errorMessage = "test";

    DispatcherGateway mockGateway =
            TestingDispatcherGateway.newBuilder()
                    .setSubmitFunction(
                            jobgraph ->
                                    FutureUtils.completedExceptionally(
                                            new Exception(errorMessage)))
                    .build();

    JobSubmitHandler handler =
            new JobSubmitHandler(
                    () -> CompletableFuture.completedFuture(mockGateway),
                    RpcUtils.INF_TIMEOUT,
                    Collections.emptyMap(),
                    TestingUtils.defaultExecutor(),
                    configuration);

    final Path jobGraphFile = TEMPORARY_FOLDER.newFile().toPath();

    JobGraph jobGraph = JobGraphTestUtils.emptyJobGraph();
    try (ObjectOutputStream objectOut =
            new ObjectOutputStream(Files.newOutputStream(jobGraphFile))) {
        objectOut.writeObject(jobGraph);
    }

    JobSubmitRequestBody request =
            new JobSubmitRequestBody(
                    jobGraphFile.getFileName().toString(),
                    Collections.emptyList(),
                    Collections.emptyList());

    try {
        handler.handleRequest(
                        HandlerRequest.create(
                                request,
                                EmptyMessageParameters.getInstance(),
                                Collections.singletonList(jobGraphFile.toFile())),
                        mockGateway)
                .get();
        // Without this, the test would pass silently if the submission unexpectedly
        // succeeded. fail() throws AssertionError, which the catch below does not intercept.
        Assert.fail("Expected the job submission to fail with message: " + errorMessage);
    } catch (Exception e) {
        Throwable t = ExceptionUtils.stripExecutionException(e);
        Assert.assertEquals(errorMessage, t.getMessage());
    }
}
Aggregations