use of org.apache.flink.api.common.JobID in project flink by apache.
the class JobClient method submitJobDetached.
/**
* Submits a job in detached mode. The method sends the JobGraph to the
* JobManager and waits for the answer whether the job could be started or not.
*
* @param jobManagerGateway Gateway to the JobManager which will execute the jobs
* @param config The cluster wide configuration.
* @param jobGraph The job
* @param timeout Timeout in which the JobManager must have responded.
*/
public static void submitJobDetached(ActorGateway jobManagerGateway, Configuration config, JobGraph jobGraph, FiniteDuration timeout, ClassLoader classLoader) throws JobExecutionException {
checkNotNull(jobManagerGateway, "The jobManagerGateway must not be null.");
checkNotNull(jobGraph, "The jobGraph must not be null.");
checkNotNull(timeout, "The timeout must not be null.");
LOG.info("Checking and uploading JAR files");
try {
jobGraph.uploadUserJars(jobManagerGateway, timeout, config);
} catch (IOException e) {
throw new JobSubmissionException(jobGraph.getJobID(), "Could not upload the program's JAR files to the JobManager.", e);
}
Object result;
try {
Future<Object> future = jobManagerGateway.ask(new JobManagerMessages.SubmitJob(jobGraph, // only receive the Acknowledge for the job submission message
ListeningBehaviour.DETACHED), timeout);
result = Await.result(future, timeout);
} catch (TimeoutException e) {
throw new JobTimeoutException(jobGraph.getJobID(), "JobManager did not respond within " + timeout.toString(), e);
} catch (Throwable t) {
throw new JobSubmissionException(jobGraph.getJobID(), "Failed to send job to JobManager: " + t.getMessage(), t.getCause());
}
if (result instanceof JobManagerMessages.JobSubmitSuccess) {
JobID respondedID = ((JobManagerMessages.JobSubmitSuccess) result).jobId();
// validate response
if (!respondedID.equals(jobGraph.getJobID())) {
throw new JobExecutionException(jobGraph.getJobID(), "JobManager responded for wrong Job. This Job: " + jobGraph.getJobID() + ", response: " + respondedID);
}
} else if (result instanceof JobManagerMessages.JobResultFailure) {
try {
SerializedThrowable t = ((JobManagerMessages.JobResultFailure) result).cause();
throw t.deserializeError(classLoader);
} catch (JobExecutionException e) {
throw e;
} catch (Throwable t) {
throw new JobExecutionException(jobGraph.getJobID(), "JobSubmission failed: " + t.getMessage(), t);
}
} else {
throw new JobExecutionException(jobGraph.getJobID(), "Unexpected response from JobManager: " + result);
}
}
use of org.apache.flink.api.common.JobID in project flink by apache.
the class JobClient method awaitJobResult.
/**
* Given a JobListeningContext, awaits the result of the job execution that this context is bound to
* @param listeningContext The listening context of the job execution
* @return The result of the execution
* @throws JobExecutionException if anything goes wrong while monitoring the job
*/
public static JobExecutionResult awaitJobResult(JobListeningContext listeningContext) throws JobExecutionException {
final JobID jobID = listeningContext.getJobID();
final ActorRef jobClientActor = listeningContext.getJobClientActor();
final Future<Object> jobSubmissionFuture = listeningContext.getJobResultFuture();
final FiniteDuration askTimeout = listeningContext.getTimeout();
// retrieves class loader if necessary
final ClassLoader classLoader = listeningContext.getClassLoader();
// ping the JobClientActor from time to time to check if it is still running
while (!jobSubmissionFuture.isCompleted()) {
try {
Await.ready(jobSubmissionFuture, askTimeout);
} catch (InterruptedException e) {
throw new JobExecutionException(jobID, "Interrupted while waiting for job completion.");
} catch (TimeoutException e) {
try {
Await.result(Patterns.ask(jobClientActor, // Ping the Actor to see if it is alive
new Identify(true), Timeout.durationToTimeout(askTimeout)), askTimeout);
// we got a reply, continue waiting for the job result
} catch (Exception eInner) {
// thus the health check failed
if (!jobSubmissionFuture.isCompleted()) {
throw new JobExecutionException(jobID, "JobClientActor seems to have died before the JobExecutionResult could be retrieved.", eInner);
}
}
}
}
final Object answer;
try {
// we have already awaited the result, zero time to wait here
answer = Await.result(jobSubmissionFuture, Duration.Zero());
} catch (Throwable throwable) {
throw new JobExecutionException(jobID, "Couldn't retrieve the JobExecutionResult from the JobManager.", throwable);
} finally {
// failsafe shutdown of the client actor
jobClientActor.tell(PoisonPill.getInstance(), ActorRef.noSender());
}
// second block handles the actual response
if (answer instanceof JobManagerMessages.JobResultSuccess) {
LOG.info("Job execution complete");
SerializedJobExecutionResult result = ((JobManagerMessages.JobResultSuccess) answer).result();
if (result != null) {
try {
return result.toJobExecutionResult(classLoader);
} catch (Throwable t) {
throw new JobExecutionException(jobID, "Job was successfully executed but JobExecutionResult could not be deserialized.");
}
} else {
throw new JobExecutionException(jobID, "Job was successfully executed but result contained a null JobExecutionResult.");
}
} else if (answer instanceof JobManagerMessages.JobResultFailure) {
LOG.info("Job execution failed");
SerializedThrowable serThrowable = ((JobManagerMessages.JobResultFailure) answer).cause();
if (serThrowable != null) {
Throwable cause = serThrowable.deserializeError(classLoader);
if (cause instanceof JobExecutionException) {
throw (JobExecutionException) cause;
} else {
throw new JobExecutionException(jobID, "Job execution failed", cause);
}
} else {
throw new JobExecutionException(jobID, "Job execution failed with null as failure cause.");
}
} else if (answer instanceof JobManagerMessages.JobNotFound) {
throw new JobRetrievalException(((JobManagerMessages.JobNotFound) answer).jobID(), "Couldn't retrieve Job " + jobID + " because it was not running.");
} else {
throw new JobExecutionException(jobID, "Unknown answer from JobManager after submitting the job: " + answer);
}
}
use of org.apache.flink.api.common.JobID in project flink by apache.
the class ExecutionGraphBuilder method buildGraph.
/**
* Builds the ExecutionGraph from the JobGraph.
* If a prior execution graph exists, the JobGraph will be attached. If no prior execution
* graph exists, then the JobGraph will become attach to a new empty execution graph.
*/
public static ExecutionGraph buildGraph(@Nullable ExecutionGraph prior, JobGraph jobGraph, Configuration jobManagerConfig, ScheduledExecutorService futureExecutor, Executor ioExecutor, SlotProvider slotProvider, ClassLoader classLoader, CheckpointRecoveryFactory recoveryFactory, Time timeout, RestartStrategy restartStrategy, MetricGroup metrics, int parallelismForAutoMax, Logger log) throws JobExecutionException, JobException {
checkNotNull(jobGraph, "job graph cannot be null");
final String jobName = jobGraph.getName();
final JobID jobId = jobGraph.getJobID();
// create a new execution graph, if none exists so far
final ExecutionGraph executionGraph;
try {
executionGraph = (prior != null) ? prior : new ExecutionGraph(futureExecutor, ioExecutor, jobId, jobName, jobGraph.getJobConfiguration(), jobGraph.getSerializedExecutionConfig(), timeout, restartStrategy, jobGraph.getUserJarBlobKeys(), jobGraph.getClasspaths(), slotProvider, classLoader, metrics);
} catch (IOException e) {
throw new JobException("Could not create the execution graph.", e);
}
// set the basic properties
executionGraph.setScheduleMode(jobGraph.getScheduleMode());
executionGraph.setQueuedSchedulingAllowed(jobGraph.getAllowQueuedScheduling());
try {
executionGraph.setJsonPlan(JsonPlanGenerator.generatePlan(jobGraph));
} catch (Throwable t) {
log.warn("Cannot create JSON plan for job", t);
// give the graph an empty plan
executionGraph.setJsonPlan("{}");
}
// initialize the vertices that have a master initialization hook
// file output formats create directories here, input formats create splits
final long initMasterStart = System.nanoTime();
log.info("Running initialization on master for job {} ({}).", jobName, jobId);
for (JobVertex vertex : jobGraph.getVertices()) {
String executableClass = vertex.getInvokableClassName();
if (executableClass == null || executableClass.isEmpty()) {
throw new JobSubmissionException(jobId, "The vertex " + vertex.getID() + " (" + vertex.getName() + ") has no invokable class.");
}
if (vertex.getParallelism() == ExecutionConfig.PARALLELISM_AUTO_MAX) {
vertex.setParallelism(parallelismForAutoMax);
}
try {
vertex.initializeOnMaster(classLoader);
} catch (Throwable t) {
throw new JobExecutionException(jobId, "Cannot initialize task '" + vertex.getName() + "': " + t.getMessage(), t);
}
}
log.info("Successfully ran initialization on master in {} ms.", (System.nanoTime() - initMasterStart) / 1_000_000);
// topologically sort the job vertices and attach the graph to the existing one
List<JobVertex> sortedTopology = jobGraph.getVerticesSortedTopologicallyFromSources();
if (log.isDebugEnabled()) {
log.debug("Adding {} vertices from job graph {} ({}).", sortedTopology.size(), jobName, jobId);
}
executionGraph.attachJobGraph(sortedTopology);
if (log.isDebugEnabled()) {
log.debug("Successfully created execution graph from job graph {} ({}).", jobName, jobId);
}
// configure the state checkpointing
JobSnapshottingSettings snapshotSettings = jobGraph.getSnapshotSettings();
if (snapshotSettings != null) {
List<ExecutionJobVertex> triggerVertices = idToVertex(snapshotSettings.getVerticesToTrigger(), executionGraph);
List<ExecutionJobVertex> ackVertices = idToVertex(snapshotSettings.getVerticesToAcknowledge(), executionGraph);
List<ExecutionJobVertex> confirmVertices = idToVertex(snapshotSettings.getVerticesToConfirm(), executionGraph);
CompletedCheckpointStore completedCheckpoints;
CheckpointIDCounter checkpointIdCounter;
try {
int maxNumberOfCheckpointsToRetain = jobManagerConfig.getInteger(CoreOptions.MAX_RETAINED_CHECKPOINTS);
if (maxNumberOfCheckpointsToRetain <= 0) {
// warning and use 1 as the default value if the setting in
// state.checkpoints.max-retained-checkpoints is not greater than 0.
log.warn("The setting for '{} : {}' is invalid. Using default value of {}", CoreOptions.MAX_RETAINED_CHECKPOINTS.key(), maxNumberOfCheckpointsToRetain, CoreOptions.MAX_RETAINED_CHECKPOINTS.defaultValue());
maxNumberOfCheckpointsToRetain = CoreOptions.MAX_RETAINED_CHECKPOINTS.defaultValue();
}
completedCheckpoints = recoveryFactory.createCheckpointStore(jobId, maxNumberOfCheckpointsToRetain, classLoader);
checkpointIdCounter = recoveryFactory.createCheckpointIDCounter(jobId);
} catch (Exception e) {
throw new JobExecutionException(jobId, "Failed to initialize high-availability checkpoint handler", e);
}
// Maximum number of remembered checkpoints
int historySize = jobManagerConfig.getInteger(ConfigConstants.JOB_MANAGER_WEB_CHECKPOINTS_HISTORY_SIZE, ConfigConstants.DEFAULT_JOB_MANAGER_WEB_CHECKPOINTS_HISTORY_SIZE);
CheckpointStatsTracker checkpointStatsTracker = new CheckpointStatsTracker(historySize, ackVertices, snapshotSettings, metrics);
// The default directory for externalized checkpoints
String externalizedCheckpointsDir = jobManagerConfig.getString(ConfigConstants.CHECKPOINTS_DIRECTORY_KEY, null);
// load the state backend for checkpoint metadata.
// if specified in the application, use from there, otherwise load from configuration
final StateBackend metadataBackend;
final StateBackend applicationConfiguredBackend = snapshotSettings.getDefaultStateBackend();
if (applicationConfiguredBackend != null) {
metadataBackend = applicationConfiguredBackend;
log.info("Using application-defined state backend for checkpoint/savepoint metadata: {}.", applicationConfiguredBackend);
} else {
try {
metadataBackend = AbstractStateBackend.loadStateBackendFromConfigOrCreateDefault(jobManagerConfig, classLoader, log);
} catch (IllegalConfigurationException | IOException | DynamicCodeLoadingException e) {
throw new JobExecutionException(jobId, "Could not instantiate configured state backend", e);
}
}
executionGraph.enableCheckpointing(snapshotSettings.getCheckpointInterval(), snapshotSettings.getCheckpointTimeout(), snapshotSettings.getMinPauseBetweenCheckpoints(), snapshotSettings.getMaxConcurrentCheckpoints(), snapshotSettings.getExternalizedCheckpointSettings(), triggerVertices, ackVertices, confirmVertices, checkpointIdCounter, completedCheckpoints, externalizedCheckpointsDir, metadataBackend, checkpointStatsTracker);
}
return executionGraph;
}
use of org.apache.flink.api.common.JobID in project flink by apache.
the class LocalInputChannelTest method testConcurrentConsumeMultiplePartitions.
/**
* Tests the consumption of multiple subpartitions via local input channels.
*
* <p> Multiple producer tasks produce pipelined partitions, which are consumed by multiple
* tasks via local input channels.
*/
@Test
public void testConcurrentConsumeMultiplePartitions() throws Exception {
// Config
final int parallelism = 32;
final int producerBufferPoolSize = parallelism + 1;
final int numberOfBuffersPerChannel = 1024;
checkArgument(parallelism >= 1);
checkArgument(producerBufferPoolSize >= parallelism);
checkArgument(numberOfBuffersPerChannel >= 1);
// Setup
// One thread per produced partition and one per consumer
final ExecutorService executor = Executors.newFixedThreadPool(2 * parallelism);
final NetworkBufferPool networkBuffers = new NetworkBufferPool((parallelism * producerBufferPoolSize) + (parallelism * parallelism), TestBufferFactory.BUFFER_SIZE, MemoryType.HEAP);
final ResultPartitionConsumableNotifier partitionConsumableNotifier = mock(ResultPartitionConsumableNotifier.class);
final TaskActions taskActions = mock(TaskActions.class);
final IOManager ioManager = mock(IOManager.class);
final JobID jobId = new JobID();
final ResultPartitionManager partitionManager = new ResultPartitionManager();
final ResultPartitionID[] partitionIds = new ResultPartitionID[parallelism];
final TestPartitionProducer[] partitionProducers = new TestPartitionProducer[parallelism];
// Create all partitions
for (int i = 0; i < parallelism; i++) {
partitionIds[i] = new ResultPartitionID();
final ResultPartition partition = new ResultPartition("Test Name", taskActions, jobId, partitionIds[i], ResultPartitionType.PIPELINED, parallelism, parallelism, partitionManager, partitionConsumableNotifier, ioManager, true);
// Create a buffer pool for this partition
partition.registerBufferPool(networkBuffers.createBufferPool(producerBufferPoolSize, producerBufferPoolSize));
// Create the producer
partitionProducers[i] = new TestPartitionProducer(partition, false, new TestPartitionProducerBufferSource(parallelism, partition.getBufferProvider(), numberOfBuffersPerChannel));
// Register with the partition manager in order to allow the local input channels to
// request their respective partitions.
partitionManager.registerResultPartition(partition);
}
// Test
try {
// Submit producer tasks
List<Future<?>> results = Lists.newArrayListWithCapacity(parallelism + 1);
for (int i = 0; i < parallelism; i++) {
results.add(executor.submit(partitionProducers[i]));
}
// Submit consumer
for (int i = 0; i < parallelism; i++) {
results.add(executor.submit(new TestLocalInputChannelConsumer(i, parallelism, numberOfBuffersPerChannel, networkBuffers.createBufferPool(parallelism, parallelism), partitionManager, new TaskEventDispatcher(), partitionIds)));
}
// Wait for all to finish
for (Future<?> result : results) {
result.get();
}
} finally {
networkBuffers.destroy();
executor.shutdown();
}
}
use of org.apache.flink.api.common.JobID in project flink by apache.
the class LocalInputChannelTest method testConcurrentReleaseAndRetriggerPartitionRequest.
/**
* Verifies that concurrent release via the SingleInputGate and re-triggering
* of a partition request works smoothly.
*
* - SingleInputGate acquires its request lock and tries to release all
* registered channels. When releasing a channel, it needs to acquire
* the channel's shared request-release lock.
* - If a LocalInputChannel concurrently retriggers a partition request via
* a Timer Thread it acquires the channel's request-release lock and calls
* the retrigger callback on the SingleInputGate, which again tries to
* acquire the gate's request lock.
*
* For certain timings this obviously leads to a deadlock. This test reliably
* reproduced such a timing (reported in FLINK-5228). This test is pretty much
* testing the buggy implementation and has not much more general value. If it
* becomes obsolete at some point (future greatness ;)), feel free to remove it.
*
* The fix in the end was to to not acquire the channels lock when releasing it
* and/or not doing any input gate callbacks while holding the channel's lock.
* I decided to do both.
*/
@Test
public void testConcurrentReleaseAndRetriggerPartitionRequest() throws Exception {
final SingleInputGate gate = new SingleInputGate("test task name", new JobID(), new IntermediateDataSetID(), ResultPartitionType.PIPELINED, 0, 1, mock(TaskActions.class), new UnregisteredTaskMetricsGroup.DummyTaskIOMetricGroup());
ResultPartitionManager partitionManager = mock(ResultPartitionManager.class);
when(partitionManager.createSubpartitionView(any(ResultPartitionID.class), anyInt(), any(BufferProvider.class), any(BufferAvailabilityListener.class))).thenAnswer(new Answer<ResultSubpartitionView>() {
@Override
public ResultSubpartitionView answer(InvocationOnMock invocationOnMock) throws Throwable {
// Sleep here a little to give the releaser Thread
// time to acquire the input gate lock. We throw
// the Exception to retrigger the request.
Thread.sleep(100);
throw new PartitionNotFoundException(new ResultPartitionID());
}
});
final LocalInputChannel channel = new LocalInputChannel(gate, 0, new ResultPartitionID(), partitionManager, new TaskEventDispatcher(), 1, 1, new UnregisteredTaskMetricsGroup.DummyTaskIOMetricGroup());
gate.setInputChannel(new IntermediateResultPartitionID(), channel);
Thread releaser = new Thread() {
@Override
public void run() {
try {
gate.releaseAllResources();
} catch (IOException ignored) {
}
}
};
Thread requester = new Thread() {
@Override
public void run() {
try {
channel.requestSubpartition(0);
} catch (IOException | InterruptedException ignored) {
}
}
};
requester.start();
releaser.start();
releaser.join();
requester.join();
}
Aggregations