Use of org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup in project flink by apache.
From the class TaskCancelAsyncProducerConsumerITCase, method testCancelAsyncProducerAndConsumer.
/**
* Tests that a task waiting on an async producer/consumer that is stuck
* in a blocking buffer request can be properly cancelled.
*
* <p>This is currently required for the Flink Kafka sources, which spawn
* a separate Thread consuming from Kafka and producing the intermediate
* streams in the spawned Thread instead of the main task Thread.
*/
@Test
public void testCancelAsyncProducerAndConsumer() throws Exception {
    Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
    TestingCluster flink = null;
    try {
        // Cluster
        Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 1);
        config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SEGMENT_SIZE_KEY, 4096);
        config.setInteger(ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY, 8);
        flink = new TestingCluster(config, true);
        flink.start();
        // Job with async producer and consumer
        JobVertex producer = new JobVertex("AsyncProducer");
        producer.setParallelism(1);
        producer.setInvokableClass(AsyncProducer.class);
        JobVertex consumer = new JobVertex("AsyncConsumer");
        consumer.setParallelism(1);
        consumer.setInvokableClass(AsyncConsumer.class);
        consumer.connectNewDataSetAsInput(producer, DistributionPattern.POINTWISE, ResultPartitionType.PIPELINED);
        SlotSharingGroup slot = new SlotSharingGroup(producer.getID(), consumer.getID());
        producer.setSlotSharingGroup(slot);
        consumer.setSlotSharingGroup(slot);
        JobGraph jobGraph = new JobGraph(producer, consumer);
        // Submit job and wait until running
        ActorGateway jobManager = flink.getLeaderGateway(deadline.timeLeft());
        flink.submitJobDetached(jobGraph);
        Object msg = new WaitForAllVerticesToBeRunning(jobGraph.getJobID());
        Future<?> runningFuture = jobManager.ask(msg, deadline.timeLeft());
        Await.ready(runningFuture, deadline.timeLeft());
        // Wait for blocking requests, cancel and wait for cancellation
        msg = new NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.CANCELED);
        Future<?> cancelledFuture = jobManager.ask(msg, deadline.timeLeft());
        boolean producerBlocked = false;
        for (int i = 0; i < 50; i++) {
            Thread thread = ASYNC_PRODUCER_THREAD;
            if (thread != null && thread.isAlive()) {
                StackTraceElement[] stackTrace = thread.getStackTrace();
                producerBlocked = isInBlockingBufferRequest(stackTrace);
            }
            if (producerBlocked) {
                break;
            } else {
                // Retry
                Thread.sleep(500);
            }
        }
        // Verify that the async producer is stuck in a blocking buffer request
        assertTrue("Producer thread is not blocked: " + Arrays.toString(ASYNC_PRODUCER_THREAD.getStackTrace()), producerBlocked);
        boolean consumerWaiting = false;
        for (int i = 0; i < 50; i++) {
            Thread thread = ASYNC_CONSUMER_THREAD;
            if (thread != null && thread.isAlive()) {
                consumerWaiting = thread.getState() == Thread.State.WAITING;
            }
            if (consumerWaiting) {
                break;
            } else {
                // Retry
                Thread.sleep(500);
            }
        }
        // Verify that the async consumer is waiting for data
        assertTrue("Consumer thread is not blocked.", consumerWaiting);
        msg = new CancelJob(jobGraph.getJobID());
        Future<?> cancelFuture = jobManager.ask(msg, deadline.timeLeft());
        Await.ready(cancelFuture, deadline.timeLeft());
        Await.ready(cancelledFuture, deadline.timeLeft());
        // Verify the expected exceptions
        assertNotNull(ASYNC_PRODUCER_EXCEPTION);
        assertEquals(IllegalStateException.class, ASYNC_PRODUCER_EXCEPTION.getClass());
        assertNotNull(ASYNC_CONSUMER_EXCEPTION);
        assertEquals(IllegalStateException.class, ASYNC_CONSUMER_EXCEPTION.getClass());
    } finally {
        if (flink != null) {
            flink.shutdown();
        }
    }
}
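The isInBlockingBufferRequest helper referenced above is not part of this snippet. A minimal sketch of what such a stack-trace check could look like, assuming the blocking request parks via Object.wait() inside Flink's LocalBufferPool (both of those frame names are assumptions here, not taken from the snippet):

// Hypothetical sketch; the real helper in the Flink test may check differently.
private static boolean isInBlockingBufferRequest(StackTraceElement[] stackTrace) {
    if (stackTrace.length < 2) {
        return false;
    }
    // Top of stack: Object.wait(), called from the buffer pool's request method.
    return stackTrace[0].getMethodName().equals("wait")
            && stackTrace[1].getClassName().endsWith("LocalBufferPool");
}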
Use of org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup in project flink by apache.
From the class ZooKeeperLeaderElectionITCase, method testJobExecutionOnClusterWithLeaderReelection.
/**
 * Tests that a job can be executed after a new leader has been elected. For all leaders
 * except the last, the job is blocking. The JobManager will be terminated while executing
 * the blocking job. Once only one JobManager is left, it is checked that a non-blocking
 * job can be successfully executed.
 */
@Test
public void testJobExecutionOnClusterWithLeaderReelection() throws Exception {
    int numJMs = 10;
    int numTMs = 2;
    int numSlotsPerTM = 3;
    int parallelism = numTMs * numSlotsPerTM;
    File rootFolder = tempFolder.getRoot();
    Configuration configuration = ZooKeeperTestUtils.createZooKeeperHAConfig(zkServer.getConnectString(), rootFolder.getPath());
    configuration.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, numJMs);
    configuration.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTMs);
    configuration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTM);
    // We "effectively" disable the automatic RecoverAllJobs message and send it manually,
    // to make sure that all TMs have registered with the JM prior to issuing the
    // RecoverAllJobs message.
    configuration.setString(ConfigConstants.AKKA_ASK_TIMEOUT, AkkaUtils.INF_TIMEOUT().toString());
    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(true);
    JobVertex sender = new JobVertex("sender");
    JobVertex receiver = new JobVertex("receiver");
    sender.setInvokableClass(Tasks.Sender.class);
    receiver.setInvokableClass(Tasks.BlockingOnceReceiver.class);
    sender.setParallelism(parallelism);
    receiver.setParallelism(parallelism);
    receiver.connectNewDataSetAsInput(sender, DistributionPattern.POINTWISE, ResultPartitionType.PIPELINED);
    SlotSharingGroup slotSharingGroup = new SlotSharingGroup();
    sender.setSlotSharingGroup(slotSharingGroup);
    receiver.setSlotSharingGroup(slotSharingGroup);
    final JobGraph graph = new JobGraph("Blocking test job", sender, receiver);
    final TestingCluster cluster = new TestingCluster(configuration);
    ActorSystem clientActorSystem = null;
    Thread thread = null;
    JobSubmitterRunnable jobSubmission = null;
    try {
        cluster.start();
        clientActorSystem = cluster.startJobClientActorSystem(graph.getJobID());
        final ActorSystem clientAS = clientActorSystem;
        jobSubmission = new JobSubmitterRunnable(clientAS, cluster, graph);
        thread = new Thread(jobSubmission);
        thread.start();
        Deadline deadline = timeout.$times(3).fromNow();
        // Kill all JobManagers except for the last one
        for (int i = 0; i < numJMs; i++) {
            ActorGateway jm = cluster.getLeaderGateway(deadline.timeLeft());
            cluster.waitForTaskManagersToBeRegisteredAtJobManager(jm.actor());
            // Recover all jobs, sent manually
            log.info("Sent recover all jobs manually to job manager {}.", jm.path());
            jm.tell(JobManagerMessages.getRecoverAllJobs());
            if (i < numJMs - 1) {
                Future<Object> future = jm.ask(new WaitForAllVerticesToBeRunningOrFinished(graph.getJobID()), deadline.timeLeft());
                Await.ready(future, deadline.timeLeft());
                cluster.clearLeader();
                if (i == numJMs - 2) {
                    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);
                }
                log.info("Kill job manager {}.", jm.path());
                jm.tell(TestingJobManagerMessages.getDisablePostStop());
                jm.tell(Kill.getInstance());
            }
        }
        log.info("Waiting for submitter thread to terminate.");
        thread.join(deadline.timeLeft().toMillis());
        log.info("Submitter thread has terminated.");
        if (thread.isAlive()) {
            fail("The job submission thread did not stop (meaning it did not succeed in executing the test job).");
        }
        Await.result(jobSubmission.resultPromise.future(), deadline.timeLeft());
    } finally {
        if (clientActorSystem != null) {
            cluster.shutdownJobClientActorSystem(clientActorSystem);
        }
        if (thread != null && thread.isAlive()) {
            jobSubmission.finished = true;
        }
        cluster.stop();
    }
}
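Note how the SlotSharingGroup makes the resource math work out: the cluster offers numTMs * numSlotsPerTM = 6 slots, while sender and receiver together have 12 subtasks. A minimal, self-contained sketch of the same setup in isolation, using only the JobGraph API already shown above:

// With both vertices in one SlotSharingGroup, each slot can host one sender
// and one receiver subtask, so 6 slots suffice for 12 subtasks. Without the
// shared group, the pipelined job would need 12 slots at once and could not
// run on this cluster.
SlotSharingGroup group = new SlotSharingGroup();
JobVertex sender = new JobVertex("sender");
JobVertex receiver = new JobVertex("receiver");
sender.setInvokableClass(Tasks.Sender.class);
receiver.setInvokableClass(Tasks.BlockingOnceReceiver.class);
sender.setParallelism(6);
receiver.setParallelism(6);
receiver.connectNewDataSetAsInput(sender, DistributionPattern.POINTWISE, ResultPartitionType.PIPELINED);
sender.setSlotSharingGroup(group);
receiver.setSlotSharingGroup(group);
JobGraph graph = new JobGraph("Blocking test job", sender, receiver);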
Use of org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup in project flink by apache.
From the class DefaultExecutionGraph, method initializeJobVertex.
@Override
public void initializeJobVertex(ExecutionJobVertex ejv, long createTimestamp) throws JobException {
    checkNotNull(ejv);
    ejv.initialize(maxPriorAttemptsHistoryLength, rpcTimeout, createTimestamp, this.initialAttemptCounts.getAttemptCounts(ejv.getJobVertexId()), coordinatorStore);
    ejv.connectToPredecessors(this.intermediateResults);
    for (IntermediateResult res : ejv.getProducedDataSets()) {
        IntermediateResult previousDataSet = this.intermediateResults.putIfAbsent(res.getId(), res);
        if (previousDataSet != null) {
            throw new JobException(String.format("Encountered two intermediate data sets with ID %s : previous=[%s] / new=[%s]", res.getId(), res, previousDataSet));
        }
    }
    registerExecutionVerticesAndResultPartitionsFor(ejv);
    // Enrich network memory once all vertices in the slot sharing group are initialized.
    SlotSharingGroup slotSharingGroup = ejv.getSlotSharingGroup();
    if (areJobVerticesAllInitialized(slotSharingGroup)) {
        SsgNetworkMemoryCalculationUtils.enrichNetworkMemory(slotSharingGroup, this::getJobVertex, shuffleMaster);
    }
}
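The areJobVerticesAllInitialized guard delays the network-memory enrichment until every vertex in the slot sharing group has been initialized, since the calculation needs all of them. A minimal sketch of what such a predicate could look like (only its call site appears above; the body below, including the isInitialized() accessor, is an assumption):

private boolean areJobVerticesAllInitialized(SlotSharingGroup group) {
    for (JobVertexID jobVertexId : group.getJobVertexIds()) {
        ExecutionJobVertex jobVertex = getJobVertex(jobVertexId);
        // Hypothetical check; the real predicate may differ.
        if (jobVertex == null || !jobVertex.isInitialized()) {
            return false;
        }
    }
    return true;
}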
Use of org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup in project flink by apache.
From the class SlotSharingSlotAllocator, method determineParallelism.
@Override
public Optional<VertexParallelismWithSlotSharing> determineParallelism(JobInformation jobInformation, Collection<? extends SlotInfo> freeSlots) {
    // TODO: This can waste slots if the max parallelism for slot sharing groups is not equal
    final int slotsPerSlotSharingGroup = freeSlots.size() / jobInformation.getSlotSharingGroups().size();
    if (slotsPerSlotSharingGroup == 0) {
        // => fewer slots than slot-sharing groups
        return Optional.empty();
    }
    final Iterator<? extends SlotInfo> slotIterator = freeSlots.iterator();
    final Collection<ExecutionSlotSharingGroupAndSlot> assignments = new ArrayList<>();
    final Map<JobVertexID, Integer> allVertexParallelism = new HashMap<>();
    for (SlotSharingGroup slotSharingGroup : jobInformation.getSlotSharingGroups()) {
        final List<JobInformation.VertexInformation> containedJobVertices = slotSharingGroup.getJobVertexIds().stream().map(jobInformation::getVertexInformation).collect(Collectors.toList());
        final Map<JobVertexID, Integer> vertexParallelism = determineParallelism(containedJobVertices, slotsPerSlotSharingGroup);
        final Iterable<ExecutionSlotSharingGroup> sharedSlotToVertexAssignment = createExecutionSlotSharingGroups(vertexParallelism);
        for (ExecutionSlotSharingGroup executionSlotSharingGroup : sharedSlotToVertexAssignment) {
            final SlotInfo slotInfo = slotIterator.next();
            assignments.add(new ExecutionSlotSharingGroupAndSlot(executionSlotSharingGroup, slotInfo));
        }
        allVertexParallelism.putAll(vertexParallelism);
    }
    return Optional.of(new VertexParallelismWithSlotSharing(allVertexParallelism, assignments));
}
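To make the integer division above concrete: with 7 free slots and 2 slot sharing groups, each group is assigned floor(7/2) = 3 slots and one slot stays unused, which is exactly the waste the TODO comment refers to. A plain-Java illustration (no Flink API involved):

// Slot math from the allocator above, with concrete numbers.
static int slotsPerGroup(int freeSlots, int numSlotSharingGroups) {
    // 7 / 2 == 3; the leftover slot is never handed to any group.
    return freeSlots / numSlotSharingGroups;
}
// Each group then caps its vertices' parallelism at slotsPerGroup and creates
// one ExecutionSlotSharingGroup per subtask index, e.g. for vertices
// {A: parallelism 3, B: parallelism 2} the shared slots are
// {A_0, B_0}, {A_1, B_1}, {A_2}.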
Use of org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup in project flink by apache.
From the class ExecutionGraphCoLocationRestartTest, method testConstraintsAfterRestart.
@Test
public void testConstraintsAfterRestart() throws Exception {
    final long timeout = 5000L;
    JobVertex groupVertex = ExecutionGraphTestUtils.createNoOpVertex(NUM_TASKS);
    JobVertex groupVertex2 = ExecutionGraphTestUtils.createNoOpVertex(NUM_TASKS);
    groupVertex2.connectNewDataSetAsInput(groupVertex, DistributionPattern.POINTWISE, ResultPartitionType.PIPELINED);
    SlotSharingGroup sharingGroup = new SlotSharingGroup();
    groupVertex.setSlotSharingGroup(sharingGroup);
    groupVertex2.setSlotSharingGroup(sharingGroup);
    groupVertex.setStrictlyCoLocatedWith(groupVertex2);
    // Initiate and schedule job
    final JobGraph jobGraph = JobGraphTestUtils.streamingJobGraph(groupVertex, groupVertex2);
    final ManuallyTriggeredScheduledExecutorService delayExecutor = new ManuallyTriggeredScheduledExecutorService();
    final SchedulerBase scheduler =
            SchedulerTestingUtils.newSchedulerBuilder(jobGraph, ComponentMainThreadExecutorServiceAdapter.forMainThread())
                    .setExecutionSlotAllocatorFactory(
                            SchedulerTestingUtils.newSlotSharingExecutionSlotAllocatorFactory(
                                    TestingPhysicalSlotProvider.create(
                                            (ignored) -> CompletableFuture.completedFuture(TestingPhysicalSlot.builder().build()))))
                    .setDelayExecutor(delayExecutor)
                    .setRestartBackoffTimeStrategy(
                            new FixedDelayRestartBackoffTimeStrategy.FixedDelayRestartBackoffTimeStrategyFactory(1, 0).create())
                    .build();
    final ExecutionGraph eg = scheduler.getExecutionGraph();
    // The job should still be in CREATED state before scheduling starts
    assertEquals(JobStatus.CREATED, eg.getState());
    scheduler.startScheduling();
    Predicate<AccessExecution> isDeploying = ExecutionGraphTestUtils.isInExecutionState(ExecutionState.DEPLOYING);
    ExecutionGraphTestUtils.waitForAllExecutionsPredicate(eg, isDeploying, timeout);
    assertEquals(JobStatus.RUNNING, eg.getState());
    // Sanity checks
    validateConstraints(eg);
    eg.getAllExecutionVertices().iterator().next().fail(new FlinkException("Test exception"));
    assertEquals(JobStatus.RESTARTING, eg.getState());
    // Trigger registration of the restartTasks(...) callback on cancelFuture before completing
    // the cancellation. This ensures the restarting actions are performed in the main thread.
    delayExecutor.triggerNonPeriodicScheduledTask();
    for (ExecutionVertex vertex : eg.getAllExecutionVertices()) {
        if (vertex.getExecutionState() == ExecutionState.CANCELING) {
            vertex.getCurrentExecutionAttempt().completeCancelling();
        }
    }
    // Wait until we have restarted
    ExecutionGraphTestUtils.waitUntilJobStatus(eg, JobStatus.RUNNING, timeout);
    ExecutionGraphTestUtils.waitForAllExecutionsPredicate(eg, isDeploying, timeout);
    // Check execution vertex properties
    validateConstraints(eg);
    ExecutionGraphTestUtils.finishAllVertices(eg);
    assertThat(eg.getState(), is(FINISHED));
}
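The validateConstraints helper is not shown in this snippet. A plausible sketch of what it could assert, assuming strict co-location means that subtask i of both vertices must run in the same location before and after the restart (the body, including the use of getCurrentAssignedResourceLocation(), is an assumption):

private void validateConstraints(ExecutionGraph eg) {
    ExecutionJobVertex[] tasks = eg.getAllVertices().values().toArray(new ExecutionJobVertex[0]);
    for (int i = 0; i < NUM_TASKS; i++) {
        // Hypothetical check: co-located subtasks share a TaskManager location.
        assertEquals(
                tasks[0].getTaskVertices()[i].getCurrentAssignedResourceLocation(),
                tasks[1].getTaskVertices()[i].getCurrentAssignedResourceLocation());
    }
}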