Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
From the class OneInputStreamTaskTest, method testSnapshottingAndRestoring.
/**
* Tests that the stream operator can snapshot and restore the operator state of chained
* operators.
*/
@Test
public void testSnapshottingAndRestoring() throws Exception {
final Deadline deadline = Deadline.fromNow(Duration.ofMinutes(2));
final OneInputStreamTaskTestHarness<String, String> testHarness = new OneInputStreamTaskTestHarness<>(OneInputStreamTask::new, BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO);
testHarness.setupOutputForSingletonOperatorChain();
IdentityKeySelector<String> keySelector = new IdentityKeySelector<>();
testHarness.configureForKeyedStream(keySelector, BasicTypeInfo.STRING_TYPE_INFO);
long checkpointId = 1L;
long checkpointTimestamp = 1L;
int numberChainedTasks = 11;
StreamConfig streamConfig = testHarness.getStreamConfig();
configureChainedTestingStreamOperator(streamConfig, numberChainedTasks);
TestTaskStateManager taskStateManager = testHarness.taskStateManager;
// reset number of restore calls
TestingStreamOperator.numberRestoreCalls = 0;
testHarness.invoke();
testHarness.waitForTaskRunning();
final OneInputStreamTask<String, String> streamTask = testHarness.getTask();
CheckpointMetaData checkpointMetaData = new CheckpointMetaData(checkpointId, checkpointTimestamp);
streamTask.triggerCheckpointAsync(checkpointMetaData, CheckpointOptions.forCheckpointWithDefaultLocation()).get();
// since no state was set, there shouldn't be restore calls
assertEquals(0, TestingStreamOperator.numberRestoreCalls);
taskStateManager.getWaitForReportLatch().await();
assertEquals(checkpointId, taskStateManager.getReportedCheckpointId());
testHarness.endInput();
testHarness.waitForTaskCompletion(deadline.timeLeft().toMillis());
final OneInputStreamTaskTestHarness<String, String> restoredTaskHarness = new OneInputStreamTaskTestHarness<>(OneInputStreamTask::new, BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO);
restoredTaskHarness.configureForKeyedStream(keySelector, BasicTypeInfo.STRING_TYPE_INFO);
restoredTaskHarness.setTaskStateSnapshot(checkpointId, taskStateManager.getLastJobManagerTaskStateSnapshot());
StreamConfig restoredTaskStreamConfig = restoredTaskHarness.getStreamConfig();
configureChainedTestingStreamOperator(restoredTaskStreamConfig, numberChainedTasks);
TaskStateSnapshot stateHandles = taskStateManager.getLastJobManagerTaskStateSnapshot();
Assert.assertEquals(numberChainedTasks, stateHandles.getSubtaskStateMappings().size());
TestingStreamOperator.numberRestoreCalls = 0;
// transfer state to new harness
restoredTaskHarness.taskStateManager.restoreLatestCheckpointState(taskStateManager.getJobManagerTaskStateSnapshotsByCheckpointId());
restoredTaskHarness.invoke();
restoredTaskHarness.endInput();
restoredTaskHarness.waitForTaskCompletion(deadline.timeLeft().toMillis());
// restore of every chained operator should have been called
assertEquals(numberChainedTasks, TestingStreamOperator.numberRestoreCalls);
TestingStreamOperator.numberRestoreCalls = 0;
TestingStreamOperator.numberSnapshotCalls = 0;
}
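The test above bounds every potentially blocking harness call with the same two-minute budget by handing deadline.timeLeft().toMillis() to each wait. Below is a minimal, hedged sketch of that pattern outside the test harness; the waitUntil helper and its trivially true condition are hypothetical, and only the Deadline class itself comes from Flink.

import org.apache.flink.api.common.time.Deadline;

import java.time.Duration;
import java.util.concurrent.TimeoutException;
import java.util.function.BooleanSupplier;

// Hypothetical helper: create one Deadline per test and derive every per-call timeout
// from whatever time is still left on it.
final class DeadlineWaitSketch {

    static void waitUntil(BooleanSupplier condition, Deadline deadline)
            throws InterruptedException, TimeoutException {
        while (!condition.getAsBoolean()) {
            if (!deadline.hasTimeLeft()) {
                throw new TimeoutException("condition not met before the deadline");
            }
            Thread.sleep(10L); // short poll interval between checks
        }
    }

    public static void main(String[] args) throws Exception {
        Deadline deadline = Deadline.fromNow(Duration.ofMinutes(2));
        // the test above passes the remaining budget to blocking calls, e.g.
        // testHarness.waitForTaskCompletion(deadline.timeLeft().toMillis());
        waitUntil(() -> System.nanoTime() > 0, deadline); // trivially true condition for the sketch
        System.out.println("time left: " + deadline.timeLeft().toMillis() + " ms");
    }
}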
Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
From the class TaskCancelAsyncProducerConsumerITCase, method testCancelAsyncProducerAndConsumer.
/**
* Tests that a task waiting on an async producer/consumer that is stuck in a blocking buffer
* request can be properly cancelled.
*
* <p>This is currently required for the Flink Kafka sources, which spawn a separate Thread
* consuming from Kafka and producing the intermediate streams in the spawned Thread instead of
* the main task Thread.
*/
@Test
public void testCancelAsyncProducerAndConsumer(@InjectMiniCluster MiniCluster flink) throws Exception {
Deadline deadline = Deadline.now().plus(Duration.ofMinutes(2));
// Job with async producer and consumer
JobVertex producer = new JobVertex("AsyncProducer");
producer.setParallelism(1);
producer.setInvokableClass(AsyncProducer.class);
JobVertex consumer = new JobVertex("AsyncConsumer");
consumer.setParallelism(1);
consumer.setInvokableClass(AsyncConsumer.class);
consumer.connectNewDataSetAsInput(producer, DistributionPattern.POINTWISE, ResultPartitionType.PIPELINED);
SlotSharingGroup slot = new SlotSharingGroup();
producer.setSlotSharingGroup(slot);
consumer.setSlotSharingGroup(slot);
JobGraph jobGraph = JobGraphTestUtils.streamingJobGraph(producer, consumer);
// Submit job and wait until running
flink.runDetached(jobGraph);
FutureUtils.retrySuccessfulWithDelay(
        () -> flink.getJobStatus(jobGraph.getJobID()),
        Time.milliseconds(10),
        deadline,
        status -> status == JobStatus.RUNNING,
        TestingUtils.defaultScheduledExecutor())
    .get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
boolean producerBlocked = false;
for (int i = 0; i < 50; i++) {
    Thread thread = ASYNC_PRODUCER_THREAD;
    if (thread != null && thread.isAlive()) {
        StackTraceElement[] stackTrace = thread.getStackTrace();
        producerBlocked = isInBlockingBufferRequest(stackTrace);
    }
    if (producerBlocked) {
        break;
    } else {
        // Retry
        Thread.sleep(500L);
    }
}
// Verify that async producer is in blocking request
assertTrue("Producer thread is not blocked: " + Arrays.toString(ASYNC_PRODUCER_THREAD.getStackTrace()), producerBlocked);
boolean consumerWaiting = false;
for (int i = 0; i < 50; i++) {
    Thread thread = ASYNC_CONSUMER_THREAD;
    if (thread != null && thread.isAlive()) {
        consumerWaiting = thread.getState() == Thread.State.WAITING;
    }
    if (consumerWaiting) {
        break;
    } else {
        // Retry
        Thread.sleep(500L);
    }
}
// Verify that the async consumer thread is waiting on data
assertTrue("Consumer thread is not blocked.", consumerWaiting);
flink.cancelJob(jobGraph.getJobID()).get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
// wait until the job is canceled
FutureUtils.retrySuccessfulWithDelay(
        () -> flink.getJobStatus(jobGraph.getJobID()),
        Time.milliseconds(10),
        deadline,
        status -> status == JobStatus.CANCELED,
        TestingUtils.defaultScheduledExecutor())
    .get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
// Verify the expected Exceptions
assertNotNull(ASYNC_PRODUCER_EXCEPTION);
assertEquals(CancelTaskException.class, ASYNC_PRODUCER_EXCEPTION.getClass());
assertNotNull(ASYNC_CONSUMER_EXCEPTION);
assertEquals(IllegalStateException.class, ASYNC_CONSUMER_EXCEPTION.getClass());
}
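This test repeatedly asks the MiniCluster for the job status and accepts it once the expected state is reached, using FutureUtils.retrySuccessfulWithDelay bounded by the Deadline. A simplified, hedged sketch of that polling idea without the Flink test utilities follows; pollUntil and its sample supplier are hypothetical, and the real test retries asynchronously on a scheduled executor rather than sleeping in a loop.

import org.apache.flink.api.common.time.Deadline;

import java.time.Duration;
import java.util.concurrent.TimeoutException;
import java.util.function.Predicate;
import java.util.function.Supplier;

// Hypothetical poller: query a value until the predicate accepts it or the Deadline runs out.
final class StatusPollSketch {

    static <T> T pollUntil(Supplier<T> poll, Predicate<T> accept, Deadline deadline)
            throws InterruptedException, TimeoutException {
        while (deadline.hasTimeLeft()) {
            T value = poll.get();
            if (accept.test(value)) {
                return value;
            }
            Thread.sleep(10L); // fixed retry delay, mirroring Time.milliseconds(10) in the test
        }
        throw new TimeoutException("predicate not satisfied before the deadline");
    }

    public static void main(String[] args) throws Exception {
        Deadline deadline = Deadline.now().plus(Duration.ofMinutes(2));
        // stand-in for flink.getJobStatus(...): the supplier here just returns a constant
        String status = pollUntil(() -> "RUNNING", "RUNNING"::equals, deadline);
        System.out.println("reached status: " + status);
    }
}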
Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
From the class JobManagerHAProcessFailureRecoveryITCase, method testDispatcherProcessFailure.
@Test
public void testDispatcherProcessFailure() throws Exception {
final Time timeout = Time.seconds(30L);
final File zookeeperStoragePath = temporaryFolder.newFolder();
// Config
final int numberOfJobManagers = 2;
final int numberOfTaskManagers = 2;
final int numberOfSlotsPerTaskManager = 2;
assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);
// Job managers
final DispatcherProcess[] dispatcherProcesses = new DispatcherProcess[numberOfJobManagers];
// Task managers
TaskManagerRunner[] taskManagerRunners = new TaskManagerRunner[numberOfTaskManagers];
HighAvailabilityServices highAvailabilityServices = null;
LeaderRetrievalService leaderRetrievalService = null;
// Coordination between the processes goes through a directory
File coordinateTempDir = null;
// Cluster config
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeper.getConnectString(), zookeeperStoragePath.getPath());
// Task manager configuration
config.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("4m"));
config.set(TaskManagerOptions.NETWORK_MEMORY_MIN, MemorySize.parse("3200k"));
config.set(TaskManagerOptions.NETWORK_MEMORY_MAX, MemorySize.parse("3200k"));
config.set(NettyShuffleEnvironmentOptions.NETWORK_SORT_SHUFFLE_MIN_BUFFERS, 16);
config.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, 2);
config.set(TaskManagerOptions.TASK_HEAP_MEMORY, MemorySize.parse("128m"));
config.set(TaskManagerOptions.CPU_CORES, 1.0);
TaskExecutorResourceUtils.adjustForLocalExecution(config);
final RpcService rpcService = RpcSystem.load().remoteServiceBuilder(config, "localhost", "0").createAndStart();
try {
final Deadline deadline = Deadline.fromNow(TEST_TIMEOUT);
// Coordination directory
coordinateTempDir = temporaryFolder.newFolder();
// Start first process
dispatcherProcesses[0] = new DispatcherProcess(0, config);
dispatcherProcesses[0].startProcess();
highAvailabilityServices = HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(config, TestingUtils.defaultExecutor(), NoOpFatalErrorHandler.INSTANCE);
final PluginManager pluginManager = PluginUtils.createPluginManagerFromRootFolder(config);
// Start the task manager process
for (int i = 0; i < numberOfTaskManagers; i++) {
    taskManagerRunners[i] = new TaskManagerRunner(config, pluginManager, TaskManagerRunner::createTaskExecutorService);
    taskManagerRunners[i].start();
}
// Leader listener
TestingListener leaderListener = new TestingListener();
leaderRetrievalService = highAvailabilityServices.getDispatcherLeaderRetriever();
leaderRetrievalService.start(leaderListener);
// Initial submission
leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
final CompletableFuture<DispatcherGateway> dispatcherGatewayFuture = rpcService.connect(leaderAddress, DispatcherId.fromUuid(leaderId), DispatcherGateway.class);
final DispatcherGateway dispatcherGateway = dispatcherGatewayFuture.get();
// Wait for all task managers to connect to the leading job manager
waitForTaskManagers(numberOfTaskManagers, dispatcherGateway, deadline.timeLeft());
final File coordinateDirClosure = coordinateTempDir;
final Throwable[] errorRef = new Throwable[1];
// we trigger program execution in a separate thread
Thread programTrigger = new Thread("Program Trigger") {
    @Override
    public void run() {
        try {
            testJobManagerFailure(zooKeeper.getConnectString(), coordinateDirClosure, zookeeperStoragePath);
        } catch (Throwable t) {
            t.printStackTrace();
            errorRef[0] = t;
        }
    }
};
// start the test program
programTrigger.start();
// wait until all marker files are in place, indicating that all tasks have started
AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());
// Kill one of the job managers and trigger recovery
dispatcherProcesses[0].destroy();
dispatcherProcesses[1] = new DispatcherProcess(1, config);
dispatcherProcesses[1].startProcess();
// we create the marker file which signals the program functions tasks that they can
// complete
AbstractTaskManagerProcessFailureRecoveryTest.touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));
programTrigger.join(deadline.timeLeft().toMillis());
// We wait for the finish marker file. We don't wait for the program trigger, because
// we submit in detached mode.
AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());
// check that the program really finished
assertFalse("The program did not finish in time", programTrigger.isAlive());
// check whether the program encountered an error
if (errorRef[0] != null) {
    Throwable error = errorRef[0];
    error.printStackTrace();
    fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
}
} catch (Throwable t) {
    // Print early (in some situations the process logs get too big
    // for Travis and the root problem is not shown)
    t.printStackTrace();
    for (DispatcherProcess p : dispatcherProcesses) {
        if (p != null) {
            p.printProcessLog();
        }
    }
    throw t;
} finally {
    for (int i = 0; i < numberOfTaskManagers; i++) {
        if (taskManagerRunners[i] != null) {
            taskManagerRunners[i].close();
        }
    }
    if (leaderRetrievalService != null) {
        leaderRetrievalService.stop();
    }
    for (DispatcherProcess dispatcherProcess : dispatcherProcesses) {
        if (dispatcherProcess != null) {
            dispatcherProcess.destroy();
        }
    }
    if (highAvailabilityServices != null) {
        highAvailabilityServices.closeAndCleanupAllData();
    }
    RpcUtils.terminateRpcService(rpcService, timeout);
    // Delete coordination directory
    if (coordinateTempDir != null) {
        try {
            FileUtils.deleteDirectory(coordinateTempDir);
        } catch (Throwable ignored) {
        }
    }
}
}
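The HA recovery test threads a single Deadline through the whole scenario: leader election, task manager registration, marker files, and the program trigger all consume from the same budget via deadline.timeLeft(). A small, hedged sketch of that shared-budget idea follows; the two latches stand in for "leader elected" and "job finished" and are not Flink APIs.

import org.apache.flink.api.common.time.Deadline;

import java.time.Duration;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

// Sketch only: several sequential waits that all draw on one overall time budget.
final class SharedBudgetSketch {
    public static void main(String[] args) throws Exception {
        Deadline deadline = Deadline.fromNow(Duration.ofMinutes(5));
        CountDownLatch leaderElected = new CountDownLatch(1);
        CountDownLatch jobFinished = new CountDownLatch(1);
        // pretend both events happen immediately so the sketch terminates
        leaderElected.countDown();
        jobFinished.countDown();
        // each await only gets whatever is left of the shared budget
        if (!leaderElected.await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS)) {
            throw new AssertionError("no leader before the deadline");
        }
        if (!jobFinished.await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS)) {
            throw new AssertionError("job did not finish before the deadline");
        }
        System.out.println("both waits completed with " + deadline.timeLeft().toMillis() + " ms to spare");
    }
}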
Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
From the class CheckpointedInputGateTest, method testPriorityBeforeClose.
/**
* Tests a priority notification happening right before cancellation. The mail would be
* processed while draining the mailbox but cannot pull any data anymore.
*/
@Test
public void testPriorityBeforeClose() throws IOException, InterruptedException {
NetworkBufferPool bufferPool = new NetworkBufferPool(10, 1024);
try (Closer closer = Closer.create()) {
    closer.register(bufferPool::destroy);
    for (int repeat = 0; repeat < 100; repeat++) {
        setUp();
        SingleInputGate singleInputGate =
                new SingleInputGateBuilder()
                        .setNumberOfChannels(2)
                        .setBufferPoolFactory(bufferPool.createBufferPool(2, Integer.MAX_VALUE))
                        .setSegmentProvider(bufferPool)
                        .setChannelFactory(InputChannelBuilder::buildRemoteChannel)
                        .build();
        singleInputGate.setup();
        ((RemoteInputChannel) singleInputGate.getChannel(0)).requestSubpartition();
        final TaskMailboxImpl mailbox = new TaskMailboxImpl();
        MailboxExecutorImpl mailboxExecutor = new MailboxExecutorImpl(mailbox, 0, StreamTaskActionExecutor.IMMEDIATE);
        ValidatingCheckpointHandler validatingHandler = new ValidatingCheckpointHandler(1);
        SingleCheckpointBarrierHandler barrierHandler =
                TestBarrierHandlerFactory.forTarget(validatingHandler).create(singleInputGate, new MockChannelStateWriter());
        CheckpointedInputGate checkpointedInputGate =
                new CheckpointedInputGate(singleInputGate, barrierHandler, mailboxExecutor, UpstreamRecoveryTracker.forInputGate(singleInputGate));
        final int oldSize = mailbox.size();
        enqueue(checkpointedInputGate, 0, barrier(1));
        // wait for the priority mail to be enqueued
        Deadline deadline = Deadline.fromNow(Duration.ofMinutes(1));
        while (deadline.hasTimeLeft() && oldSize >= mailbox.size()) {
            Thread.sleep(1);
        }
        // Test the race condition: either the priority event is handled first, in which case
        // we expect a checkpoint to be triggered, or closing comes first, in which case we
        // expect a CancelTaskException.
        CountDownLatch beforeLatch = new CountDownLatch(2);
        final CheckedThread canceler =
                new CheckedThread("Canceler") {
                    @Override
                    public void go() throws IOException {
                        beforeLatch.countDown();
                        singleInputGate.close();
                    }
                };
        canceler.start();
        beforeLatch.countDown();
        try {
            // drain the mailbox; the priority mail either triggers the checkpoint
            // or runs into the already closed gate
            while (mailboxExecutor.tryYield()) {
            }
            assertEquals(1L, validatingHandler.triggeredCheckpointCounter);
        } catch (CancelTaskException e) {
            // expected when the gate was closed before the priority mail was processed
        }
        canceler.join();
    }
}
}
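The wait for the priority mail above is a bounded spin: keep checking the mailbox size until it grows or Deadline.hasTimeLeft() turns false, sleeping one millisecond between checks. A self-contained, hedged sketch of that spin-wait is below; the AtomicInteger is a stand-in for mailbox.size() and is not part of the Flink API.

import org.apache.flink.api.common.time.Deadline;

import java.time.Duration;
import java.util.concurrent.atomic.AtomicInteger;

// Sketch only: poll a cheap condition until it flips or the Deadline expires.
final class BoundedSpinWaitSketch {
    public static void main(String[] args) throws InterruptedException {
        AtomicInteger mailCount = new AtomicInteger(0);
        new Thread(mailCount::incrementAndGet).start(); // simulates the priority mail arriving
        Deadline deadline = Deadline.fromNow(Duration.ofMinutes(1));
        while (deadline.hasTimeLeft() && mailCount.get() == 0) {
            Thread.sleep(1);
        }
        System.out.println(mailCount.get() > 0 ? "mail enqueued" : "deadline expired first");
    }
}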
Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
From the class BlobsCleanupITCase, method waitForEmptyBlobDir.
/**
* Waits until the given {@link org.apache.flink.runtime.blob.BlobService} storage directory
* does not contain any job-related folders any more.
*
* @param blobDir directory of a {@link org.apache.flink.runtime.blob.BlobServer} or {@link
* org.apache.flink.runtime.blob.BlobCacheService}
* @param remaining remaining time for this test
* @see org.apache.flink.runtime.blob.BlobUtils
*/
private static void waitForEmptyBlobDir(File blobDir, Duration remaining) throws InterruptedException {
long deadline = System.currentTimeMillis() + remaining.toMillis();
String[] blobDirContents;
final FilenameFilter jobDirFilter = (dir, name) -> name.startsWith("job_");
do {
    blobDirContents = blobDir.list(jobDirFilter);
    if (blobDirContents == null || blobDirContents.length == 0) {
        return;
    }
    Thread.sleep(RETRY_INTERVAL);
} while (System.currentTimeMillis() < deadline);
fail("Timeout while waiting for " + blobDir.getAbsolutePath() + " to become empty. Current contents: " + Arrays.toString(blobDirContents));
}
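Unlike the other examples on this page, waitForEmptyBlobDir tracks its budget with raw System.currentTimeMillis() arithmetic. As a hedged illustration only, the same loop could be written against Deadline; this variant is not part of the Flink sources, and RETRY_INTERVAL is assumed to be a millisecond constant like the one defined in the original test class.

import org.apache.flink.api.common.time.Deadline;

import java.io.File;
import java.io.FilenameFilter;
import java.time.Duration;

// Hypothetical variant of waitForEmptyBlobDir expressed with Deadline instead of
// System.currentTimeMillis(); it reports success instead of calling fail() so the
// caller decides how to handle a timeout.
final class BlobDirWaitSketch {

    private static final long RETRY_INTERVAL = 50L; // assumed poll interval in milliseconds

    static boolean waitForEmptyBlobDir(File blobDir, Duration remaining) throws InterruptedException {
        Deadline deadline = Deadline.fromNow(remaining);
        FilenameFilter jobDirFilter = (dir, name) -> name.startsWith("job_");
        do {
            String[] contents = blobDir.list(jobDirFilter);
            if (contents == null || contents.length == 0) {
                return true;
            }
            Thread.sleep(RETRY_INTERVAL);
        } while (deadline.hasTimeLeft());
        return false;
    }
}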