Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
The class BlobsCleanupITCase, method testBlobServerCleanup.
private void testBlobServerCleanup(final TestCase testCase) throws Exception {
    final MiniCluster miniCluster = miniClusterResource.getMiniCluster();
    final int numTasks = 2;
    final Deadline timeout = Deadline.fromNow(Duration.ofSeconds(30L));
    final JobGraph jobGraph = createJobGraph(testCase, numTasks);
    final JobID jid = jobGraph.getJobID();
    // upload a blob
    final File tempBlob = File.createTempFile("Required", ".jar");
    final int blobPort = miniCluster.getClusterInformation().getBlobServerPort();
    List<PermanentBlobKey> keys =
            BlobClient.uploadFiles(
                    new InetSocketAddress("localhost", blobPort),
                    configuration,
                    jid,
                    Collections.singletonList(new Path(tempBlob.getAbsolutePath())));
    assertThat(keys, hasSize(1));
    jobGraph.addUserJarBlobKey(keys.get(0));
    if (testCase == TestCase.JOB_SUBMISSION_FAILS) {
        // add an invalid key so that the submission fails
        jobGraph.addUserJarBlobKey(new PermanentBlobKey());
    }
    final CompletableFuture<JobSubmissionResult> submissionFuture = miniCluster.submitJob(jobGraph);
    if (testCase == TestCase.JOB_SUBMISSION_FAILS) {
        try {
            submissionFuture.get();
            fail("Expected job submission failure.");
        } catch (ExecutionException e) {
            assertThat(ExceptionUtils.findThrowable(e, JobSubmissionException.class).isPresent(), is(true));
        }
    } else {
        final JobSubmissionResult jobSubmissionResult = submissionFuture.get();
        assertThat(jobSubmissionResult.getJobID(), is(jid));
        final CompletableFuture<JobResult> resultFuture = miniCluster.requestJobResult(jid);
        if (testCase == TestCase.JOB_FAILS) {
            // fail a task so that the job is going to be recovered (we actually do not
            // need the blocking part of the invokable and can start throwing right away)
            FailingBlockingInvokable.unblock();
            // job will get restarted, BlobCache may re-download the BLOB if already deleted
            // then the tasks will fail again and the restart strategy will finalise the job
            final JobResult jobResult = resultFuture.get();
            assertThat(jobResult.isSuccess(), is(false));
            assertThat(jobResult.getApplicationStatus(), is(ApplicationStatus.FAILED));
        } else if (testCase == TestCase.JOB_IS_CANCELLED) {
            miniCluster.cancelJob(jid);
            final JobResult jobResult = resultFuture.get();
            assertThat(jobResult.isSuccess(), is(false));
            assertThat(jobResult.getApplicationStatus(), is(ApplicationStatus.CANCELED));
        } else {
            final JobResult jobResult = resultFuture.get();
            Throwable cause =
                    jobResult
                            .getSerializedThrowable()
                            .map(throwable -> throwable.deserializeError(getClass().getClassLoader()))
                            .orElse(null);
            assertThat(ExceptionUtils.stringifyException(cause), jobResult.isSuccess(), is(true));
        }
    }
    // both BlobServer and BlobCache should eventually delete all files
    File[] blobDirs = blobBaseDir.listFiles((dir, name) -> name.startsWith("blobStore-"));
    assertNotNull(blobDirs);
    for (File blobDir : blobDirs) {
        waitForEmptyBlobDir(blobDir, timeout.timeLeft());
    }
}
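The helper waitForEmptyBlobDir is not shown in the snippet above. Because it receives timeout.timeLeft(), every directory in the final loop is checked against the same shrinking 30-second budget rather than getting a fresh timeout of its own. A minimal sketch of such a helper, assuming the shown name and a (File, Duration) signature rather than the actual Flink implementation, could look like this:

// Minimal sketch (assumed signature, not the actual Flink helper): poll a blob storage
// directory until it is empty or the remaining time budget is used up.
private static void waitForEmptyBlobDir(File blobDir, Duration remainingTime) throws InterruptedException {
    final Deadline deadline = Deadline.fromNow(remainingTime);
    String[] files = blobDir.list();
    while (files != null && files.length > 0 && deadline.hasTimeLeft()) {
        Thread.sleep(100);
        files = blobDir.list();
    }
    assertThat("Timed out waiting for " + blobDir + " to become empty",
            files == null ? 0 : files.length, is(0));
}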
Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
The class RescalingITCase, method testSavepointRescalingPartitionedOperatorState.
/**
 * Tests rescaling of partitioned operator state. More specifically, we test the mechanism with
 * {@link ListCheckpointed} as it subsumes
 * {@link org.apache.flink.streaming.api.checkpoint.CheckpointedFunction}.
 */
public void testSavepointRescalingPartitionedOperatorState(
        boolean scaleOut, OperatorCheckpointMethod checkpointMethod) throws Exception {
    final int parallelism = scaleOut ? numSlots : numSlots / 2;
    final int parallelism2 = scaleOut ? numSlots / 2 : numSlots;
    final int maxParallelism = 13;
    Duration timeout = Duration.ofMinutes(3);
    Deadline deadline = Deadline.now().plus(timeout);
    ClusterClient<?> client = cluster.getClusterClient();
    int counterSize = Math.max(parallelism, parallelism2);
    if (checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION
            || checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION_BROADCAST) {
        PartitionedStateSource.checkCorrectSnapshot = new int[counterSize];
        PartitionedStateSource.checkCorrectRestore = new int[counterSize];
    } else {
        PartitionedStateSourceListCheckpointed.checkCorrectSnapshot = new int[counterSize];
        PartitionedStateSourceListCheckpointed.checkCorrectRestore = new int[counterSize];
    }
    try {
        JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, checkpointMethod);
        // make sure the job does not finish before we take the savepoint
        StateSourceBase.canFinishLatch = new CountDownLatch(1);
        final JobID jobID = jobGraph.getJobID();
        client.submitJob(jobGraph).get();
        // wait until the operator is started
        waitForAllTaskRunning(cluster.getMiniCluster(), jobGraph.getJobID(), false);
        // wait until the operator handles some data
        StateSourceBase.workStartedLatch.await();
        CompletableFuture<String> savepointPathFuture =
                FutureUtils.retryWithDelay(
                        () -> client.triggerSavepoint(jobID, null, SavepointFormatType.CANONICAL),
                        (int) deadline.timeLeft().getSeconds() / 10,
                        Time.seconds(10),
                        (throwable) -> true,
                        TestingUtils.defaultScheduledExecutor());
        final String savepointPath =
                savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // we took a savepoint, the job can finish now
        StateSourceBase.canFinishLatch.countDown();
        client.cancel(jobID).get();
        while (!getRunningJobs(client).isEmpty()) {
            Thread.sleep(50);
        }
        JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, checkpointMethod);
        scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        submitJobAndWaitForResult(client, scaledJobGraph, getClass().getClassLoader());
        int sumExp = 0;
        int sumAct = 0;
        if (checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION) {
            for (int c : PartitionedStateSource.checkCorrectSnapshot) {
                sumExp += c;
            }
            for (int c : PartitionedStateSource.checkCorrectRestore) {
                sumAct += c;
            }
        } else if (checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION_BROADCAST) {
            for (int c : PartitionedStateSource.checkCorrectSnapshot) {
                sumExp += c;
            }
            for (int c : PartitionedStateSource.checkCorrectRestore) {
                sumAct += c;
            }
            sumExp *= parallelism2;
        } else {
            for (int c : PartitionedStateSourceListCheckpointed.checkCorrectSnapshot) {
                sumExp += c;
            }
            for (int c : PartitionedStateSourceListCheckpointed.checkCorrectRestore) {
                sumAct += c;
            }
        }
        assertEquals(sumExp, sumAct);
    } finally {
    }
}
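Here the number of savepoint retries is derived from the remaining budget: with a 10-second delay between attempts, deadline.timeLeft().getSeconds() / 10 retries roughly fill whatever is left of the three-minute deadline. The busy-wait on getRunningJobs, by contrast, is unbounded; a deadline-bounded variant, shown only as a sketch and not as the code Flink ships, could reuse the same Deadline:

// Hypothetical variant: bound the cancel-wait with the surrounding Deadline instead of
// looping indefinitely if the job never disappears from the running set.
while (!getRunningJobs(client).isEmpty()) {
    if (!deadline.hasTimeLeft()) {
        throw new TimeoutException("Cancelled job was still listed as running when the deadline expired.");
    }
    Thread.sleep(50);
}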
Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
The class SavepointITCase, method testCanRestoreWithModifiedStatelessOperators.
/**
* FLINK-5985
*
* <p>This test ensures we can restore from a savepoint under modifications to the job graph
* that only concern stateless operators.
*/
@Test
public void testCanRestoreWithModifiedStatelessOperators() throws Exception {
    // Config
    int numTaskManagers = 2;
    int numSlotsPerTaskManager = 2;
    int parallelism = 2;
    // Test deadline
    final Deadline deadline = Deadline.now().plus(Duration.ofMinutes(5));
    // Flink configuration
    final Configuration config = new Configuration();
    config.setString(CheckpointingOptions.SAVEPOINT_DIRECTORY, savepointDir.toURI().toString());
    String savepointPath;
    LOG.info("Flink configuration: " + config + ".");
    // Start Flink
    MiniClusterWithClientResource cluster =
            new MiniClusterWithClientResource(
                    new MiniClusterResourceConfiguration.Builder()
                            .setConfiguration(config)
                            .setNumberTaskManagers(numTaskManagers)
                            .setNumberSlotsPerTaskManager(numSlotsPerTaskManager)
                            .build());
    LOG.info("Starting Flink cluster.");
    cluster.before();
    ClusterClient<?> client = cluster.getClusterClient();
    try {
        final StatefulCounter statefulCounter = new StatefulCounter();
        StatefulCounter.resetForTest(parallelism);
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);
        env.addSource(new InfiniteTestSource())
                .shuffle()
                .map(value -> 4 * value)
                .shuffle()
                .map(statefulCounter)
                .uid("statefulCounter")
                .shuffle()
                .map(value -> 2 * value)
                .addSink(new DiscardingSink<>());
        JobGraph originalJobGraph = env.getStreamGraph().getJobGraph();
        JobID jobID = client.submitJob(originalJobGraph).get();
        // wait for the tasks to be ready
        waitForAllTaskRunning(cluster.getMiniCluster(), jobID, false);
        assertTrue(
                StatefulCounter.getProgressLatch()
                        .await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
        savepointPath = client.triggerSavepoint(jobID, null, SavepointFormatType.CANONICAL).get();
        LOG.info("Retrieved savepoint: " + savepointPath + ".");
    } finally {
        // Shut down the Flink cluster (thereby canceling the job)
        LOG.info("Shutting down Flink cluster.");
        cluster.after();
    }
    // create a new MiniCluster to make sure we start with completely new resources
    cluster =
            new MiniClusterWithClientResource(
                    new MiniClusterResourceConfiguration.Builder()
                            .setConfiguration(config)
                            .setNumberTaskManagers(numTaskManagers)
                            .setNumberSlotsPerTaskManager(numSlotsPerTaskManager)
                            .build());
    LOG.info("Restarting Flink cluster.");
    cluster.before();
    client = cluster.getClusterClient();
    try {
        // Reset static test helpers
        StatefulCounter.resetForTest(parallelism);
        // generate a modified job graph that differs from the original only in its stateless operators
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);
        env.addSource(new InfiniteTestSource())
                .shuffle()
                .map(new StatefulCounter())
                .uid("statefulCounter")
                .shuffle()
                .map(value -> value)
                .addSink(new DiscardingSink<>());
        JobGraph modifiedJobGraph = env.getStreamGraph().getJobGraph();
        // Set the savepoint path
        modifiedJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        LOG.info("Resubmitting job " + modifiedJobGraph.getJobID()
                + " with savepoint path " + savepointPath + " in detached mode.");
        // Submit the job
        client.submitJob(modifiedJobGraph).get();
        // Await restored state
        assertTrue(
                StatefulCounter.getRestoreLatch()
                        .await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
        // Await some progress after restore
        assertTrue(
                StatefulCounter.getProgressLatch()
                        .await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
    } finally {
        cluster.after();
    }
}
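Note that a single Deadline spans both cluster incarnations: every await(deadline.timeLeft().toMillis(), ...) call draws from the same shrinking five-minute budget, so the savepoint, restore, and post-restore progress phases are bounded together rather than individually. A small helper capturing that pattern might look like the following sketch (the name and failure message are illustrative assumptions, not part of SavepointITCase):

// Hypothetical helper: await a latch within whatever is left of a shared test Deadline.
private static void awaitWithinDeadline(CountDownLatch latch, Deadline deadline, String description)
        throws InterruptedException {
    if (!latch.await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS)) {
        fail("Timed out waiting for " + description + " before the test deadline expired.");
    }
}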
Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
The class UnalignedCheckpointTestBase, method waitForCleanShutdown.
private void waitForCleanShutdown() throws InterruptedException {
    // slow down when half the memory is taken and wait for gc
    if (PlatformDependent.usedDirectMemory() > PlatformDependent.maxDirectMemory() / 2) {
        final Duration waitTime = Duration.ofSeconds(10);
        Deadline deadline = Deadline.fromNow(waitTime);
        while (PlatformDependent.usedDirectMemory() > 0 && deadline.hasTimeLeft()) {
            System.gc();
            Thread.sleep(100);
        }
        final Duration timeLeft = deadline.timeLeft();
        if (timeLeft.isNegative()) {
            LOG.warn("Waited 10s for clean shutdown of previous runs but there is still direct memory in use: "
                    + PlatformDependent.usedDirectMemory());
        } else {
            LOG.info("Needed to wait {} ms for full cleanup of previous runs.", waitTime.minus(timeLeft).toMillis());
        }
    }
}
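The loop exits either because the direct memory drained or because the ten-second budget ran out, and the sign of timeLeft() afterwards is used to tell the two cases apart. A variant that records the outcome explicitly, shown only as a hypothetical sketch rather than the shipped code, would be:

// Hypothetical variant: remember whether the direct memory actually drained instead of
// inferring the outcome from the sign of timeLeft() after the loop.
boolean drained = false;
final Deadline deadline = Deadline.fromNow(Duration.ofSeconds(10));
while (deadline.hasTimeLeft()) {
    if (PlatformDependent.usedDirectMemory() == 0) {
        drained = true;
        break;
    }
    System.gc();
    Thread.sleep(100);
}
if (!drained) {
    LOG.warn("Direct memory still in use after waiting: {}", PlatformDependent.usedDirectMemory());
}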
Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
The class KinesisTableApiITCase, method readAllOrdersFromKinesis.
private List<Order> readAllOrdersFromKinesis(final KinesisPubsubClient client) throws Exception {
    Deadline deadline = Deadline.fromNow(Duration.ofSeconds(5));
    List<Order> orders;
    do {
        Thread.sleep(1000);
        orders = client.readAllMessages(LARGE_ORDERS_STREAM).stream()
                .map(order -> fromJson(order, Order.class))
                .collect(Collectors.toList());
    } while (deadline.hasTimeLeft() && orders.size() < 3);
    return orders;
}
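The do/while form guarantees at least one read even if the five-second deadline has already expired, and the caller is left to assert on the returned list. A generic version of this poll-until-enough pattern could look like the sketch below; the helper name and signature are assumptions for illustration, not something KinesisTableApiITCase provides:

// Hypothetical generic helper: poll a supplier until its result satisfies a predicate
// or the deadline expires, returning the last observed result either way.
private static <T> T pollUntil(Callable<T> supplier, Predicate<T> isDone, Deadline deadline) throws Exception {
    T result;
    do {
        Thread.sleep(1000);
        result = supplier.call();
    } while (deadline.hasTimeLeft() && !isDone.test(result));
    return result;
}

With such a helper, readAllOrdersFromKinesis would reduce to a single pollUntil call whose predicate checks orders.size() >= 3.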