Use of scala.concurrent.duration.Deadline in project flink by apache.
The class AbstractQueryableStateITCase, method testQueryNonStartedJobState.
/**
 * Similar test to {@link #testValueState()}, but before submitting the
 * job we already issue one request, which fails.
 */
@Test
public void testQueryNonStartedJobState() throws Exception {
    // Config
    final Deadline deadline = TEST_TIMEOUT.fromNow();
    final int numElements = 1024;
    final QueryableStateClient client = new QueryableStateClient(cluster.configuration());
    JobID jobId = null;
    try {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStateBackend(stateBackend);
        env.setParallelism(NUM_SLOTS);
        // Very important, because the cluster is shared between tests and we
        // don't explicitly check that all slots are available before submitting.
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 1000));

        DataStream<Tuple2<Integer, Long>> source =
                env.addSource(new TestAscendingValueSource(numElements));

        // Value state
        ValueStateDescriptor<Tuple2<Integer, Long>> valueState =
                new ValueStateDescriptor<>("any", source.getType(), null);

        QueryableStateStream<Integer, Tuple2<Integer, Long>> queryableState =
                source.keyBy(new KeySelector<Tuple2<Integer, Long>, Integer>() {
                    @Override
                    public Integer getKey(Tuple2<Integer, Long> value) throws Exception {
                        return value.f0;
                    }
                }).asQueryableState("hakuna", valueState);

        // Build the job graph, but do not submit it yet
        JobGraph jobGraph = env.getStreamGraph().getJobGraph();
        jobId = jobGraph.getJobID();

        long expected = numElements;

        // Query once before the job is started; this request is expected to fail
        client.getKvState(
                jobId,
                queryableState.getQueryableStateName(),
                0,
                KvStateRequestSerializer.serializeKeyAndNamespace(
                        0,
                        queryableState.getKeySerializer(),
                        VoidNamespace.INSTANCE,
                        VoidNamespaceSerializer.INSTANCE));

        cluster.submitJobDetached(jobGraph);

        executeValueQuery(deadline, client, jobId, queryableState, expected);
    } finally {
        // Free cluster resources
        if (jobId != null) {
            Future<CancellationSuccess> cancellation = cluster
                    .getLeaderGateway(deadline.timeLeft())
                    .ask(new JobManagerMessages.CancelJob(jobId), deadline.timeLeft())
                    .mapTo(ClassTag$.MODULE$.<CancellationSuccess>apply(CancellationSuccess.class));
            Await.ready(cancellation, deadline.timeLeft());
        }
        client.shutDown();
    }
}
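All of these examples share one idiom with scala.concurrent.duration.Deadline: derive a deadline once from the overall test timeout, then hand deadline.timeLeft() to every blocking call so that all steps draw from a single time budget. Below is a minimal, self-contained sketch of that idiom called from Java; the pollSomeCondition helper is hypothetical and stands in for whatever check a test performs.

import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;
import java.util.concurrent.TimeUnit;

public class DeadlineSketch {
    public static void main(String[] args) throws InterruptedException {
        // One overall budget for the whole sequence of steps.
        FiniteDuration timeout = new FiniteDuration(5, TimeUnit.SECONDS);
        Deadline deadline = timeout.fromNow();

        // timeLeft() shrinks as wall-clock time passes, so each later
        // step only gets whatever remains of the original budget.
        while (deadline.hasTimeLeft()) {
            if (pollSomeCondition()) { // hypothetical helper
                return;
            }
            // Sleep at most 100 ms, but never past the deadline.
            Thread.sleep(Math.min(100L, Math.max(0L, deadline.timeLeft().toMillis())));
        }
        throw new IllegalStateException("Condition not met within " + timeout);
    }

    private static boolean pollSomeCondition() {
        return false; // stand-in for a real check
    }
}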
Use of scala.concurrent.duration.Deadline in project flink by apache.
The class ChaosMonkeyITCase, method testChaosMonkey.
@Test
public void testChaosMonkey() throws Exception {
    // Test config
    final int numberOfJobManagers = 3;
    final int numberOfTaskManagers = 3;
    final int numberOfSlotsPerTaskManager = 2;

    // The final count each source is counting to: 1...n
    final int n = 5000;

    // Parallelism for the program
    final int parallelism = numberOfTaskManagers * numberOfSlotsPerTaskManager;

    // The test should not run longer than this
    final FiniteDuration testDuration = new FiniteDuration(10, TimeUnit.MINUTES);

    // Every x seconds a random job or task manager is killed
    //
    // The job will be running for $killEvery seconds and then a random Job/TaskManager
    // will be killed. On recovery (which takes some time to bring up the new process etc.),
    // this test will wait for task managers to reconnect before starting the next count down.
    // Therefore the delay between retries is not important in this setup.
    final FiniteDuration killEvery = new FiniteDuration(5, TimeUnit.SECONDS);

    // Trigger a checkpoint every checkpointingIntervalMs milliseconds
    final int checkpointingIntervalMs = 1000;

    // Total number of kills
    final int totalNumberOfKills = 10;

    // -----------------------------------------------------------------------------------------

    // Setup
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(
            ZooKeeper.getConnectString(), FileStateBackendBasePath.toURI().toString());

    // Akka and restart timeouts
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "1000 ms");
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "6 s");
    config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 9);

    if (checkpointingIntervalMs >= killEvery.toMillis()) {
        throw new IllegalArgumentException(
                "Relax! You want to kill processes every " + killEvery
                        + ", but the checkpointing interval is " + checkpointingIntervalMs / 1000
                        + " seconds. Either decrease the interval or increase the kill interval. "
                        + "Otherwise, the program will not complete any checkpoint.");
    }

    // Task manager
    config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numberOfSlotsPerTaskManager);

    ActorSystem testActorSystem = null;
    LeaderRetrievalService leaderRetrievalService = null;
    List<JobManagerProcess> jobManagerProcesses = new ArrayList<>();
    List<TaskManagerProcess> taskManagerProcesses = new ArrayList<>();

    try {
        // Initial state
        for (int i = 0; i < numberOfJobManagers; i++) {
            jobManagerProcesses.add(createAndStartJobManagerProcess(config));
        }
        for (int i = 0; i < numberOfTaskManagers; i++) {
            taskManagerProcesses.add(createAndStartTaskManagerProcess(config));
        }

        testActorSystem = AkkaUtils.createDefaultActorSystem();

        // Leader listener
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService.start(leaderListener);

        Deadline deadline = testDuration.fromNow();

        // Wait for the new leader
        int leaderIndex = waitForNewLeader(leaderListener, jobManagerProcesses, deadline.timeLeft());

        // Wait for the task managers to connect
        waitForTaskManagers(numberOfTaskManagers, jobManagerProcesses.get(leaderIndex),
                testActorSystem, deadline.timeLeft());

        // The job
        JobGraph jobGraph = createJobGraph(n, CheckpointCompletedCoordination.getPath(),
                ProceedCoordination.getPath(), parallelism, checkpointingIntervalMs);

        LOG.info("Submitting job {}", jobGraph.getJobID());
        submitJobGraph(jobGraph, jobManagerProcesses.get(leaderIndex), leaderListener,
                testActorSystem, deadline.timeLeft());

        LOG.info("Waiting for a checkpoint to complete before kicking off chaos");

        // Wait for a checkpoint to complete
        TestJvmProcess.waitForMarkerFiles(FileStateBackendBasePath, COMPLETED_PREFIX,
                parallelism, deadline.timeLeft().toMillis());

        LOG.info("Checkpoint completed... ready for chaos");

        int currentKillNumber = 1;
        int currentJobManagerKills = 0;
        int currentTaskManagerKills = 0;

        for (int i = 0; i < totalNumberOfKills; i++) {
            LOG.info("Waiting for {} before next kill ({}/{})", killEvery, currentKillNumber++, totalNumberOfKills);
            Thread.sleep(killEvery.toMillis());

            LOG.info("Checking job status...");
            JobStatus jobStatus = requestJobStatus(jobGraph.getJobID(),
                    jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());

            if (jobStatus != JobStatus.RUNNING && jobStatus != JobStatus.FINISHED) {
                // Wait for it to run
                LOG.info("Waiting for job status {}", JobStatus.RUNNING);
                waitForJobRunning(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex),
                        testActorSystem, deadline.timeLeft());
            } else if (jobStatus == JobStatus.FINISHED) {
                // Early finish
                LOG.info("Job finished");
                return;
            } else {
                LOG.info("Job status is {}", jobStatus);
            }

            if (rand.nextBoolean()) {
                LOG.info("Killing the leading JobManager");

                JobManagerProcess newJobManager = createAndStartJobManagerProcess(config);
                JobManagerProcess leader = jobManagerProcesses.remove(leaderIndex);
                leader.destroy();
                currentJobManagerKills++;

                LOG.info("Killed {}", leader);

                // Make sure to add the new job manager before looking for a new leader
                jobManagerProcesses.add(newJobManager);

                // Wait for the new leader
                leaderIndex = waitForNewLeader(leaderListener, jobManagerProcesses, deadline.timeLeft());

                // Wait for the task managers to connect
                waitForTaskManagers(numberOfTaskManagers, jobManagerProcesses.get(leaderIndex),
                        testActorSystem, deadline.timeLeft());
            } else {
                LOG.info("Killing a random TaskManager");
                TaskManagerProcess newTaskManager = createAndStartTaskManagerProcess(config);

                // Wait for this new task manager to be connected
                waitForTaskManagers(numberOfTaskManagers + 1, jobManagerProcesses.get(leaderIndex),
                        testActorSystem, deadline.timeLeft());

                // Now it's safe to kill a process
                int next = rand.nextInt(numberOfTaskManagers);
                TaskManagerProcess taskManager = taskManagerProcesses.remove(next);

                LOG.info("{} has been chosen. Killing process...", taskManager);

                taskManager.destroy();
                currentTaskManagerKills++;

                // Add the new task manager after killing an old one
                taskManagerProcesses.add(newTaskManager);
            }
        }

        LOG.info("Chaos is over. Total kills: {} ({} job managers + {} task managers). "
                + "Checking job status...", totalNumberOfKills, currentJobManagerKills, currentTaskManagerKills);

        // Signal the job to speed up (if it is not done yet)
        TestJvmProcess.touchFile(ProceedCoordination);

        // Wait for the job to finish
        LOG.info("Waiting for job status {}", JobStatus.FINISHED);
        waitForJobFinished(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex),
                testActorSystem, deadline.timeLeft());
        LOG.info("Job finished");

        LOG.info("Waiting for job removal");
        waitForJobRemoved(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex),
                testActorSystem, deadline.timeLeft());
        LOG.info("Job removed");

        LOG.info("Checking clean recovery state...");
        checkCleanRecoveryState(config);
        LOG.info("Recovery state clean");
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();

        System.out.println("#################################################");
        System.out.println(" TASK MANAGERS");
        System.out.println("#################################################");
        for (TaskManagerProcess taskManagerProcess : taskManagerProcesses) {
            taskManagerProcess.printProcessLog();
        }

        System.out.println("#################################################");
        System.out.println(" JOB MANAGERS");
        System.out.println("#################################################");
        for (JobManagerProcess jobManagerProcess : jobManagerProcesses) {
            jobManagerProcess.printProcessLog();
        }

        throw t;
    } finally {
        for (JobManagerProcess jobManagerProcess : jobManagerProcesses) {
            if (jobManagerProcess != null) {
                jobManagerProcess.destroy();
            }
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (testActorSystem != null) {
            testActorSystem.shutdown();
        }
    }
}
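The chaos test threads a single Deadline through every wait: each phase receives deadline.timeLeft(), so however long the earlier phases took, later phases only get what remains of the ten-minute budget. A hedged sketch of that pattern in isolation; the awaitCondition helper is illustrative and not part of Flink's API.

import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;
import java.util.concurrent.TimeUnit;
import java.util.function.BooleanSupplier;

public class SharedBudget {

    // Polls until the condition holds or the remaining budget runs out.
    static void awaitCondition(BooleanSupplier condition, FiniteDuration remaining)
            throws InterruptedException {
        Deadline local = remaining.fromNow();
        while (!condition.getAsBoolean()) {
            if (!local.hasTimeLeft()) {
                throw new IllegalStateException("Timed out after " + remaining);
            }
            Thread.sleep(50);
        }
    }

    public static void main(String[] args) throws InterruptedException {
        Deadline deadline = new FiniteDuration(10, TimeUnit.MINUTES).fromNow();

        // The two phases share one budget: whatever the first phase
        // consumes is no longer available to the second.
        awaitCondition(() -> true, deadline.timeLeft()); // e.g. wait for a leader
        awaitCondition(() -> true, deadline.timeLeft()); // e.g. wait for task managers
    }
}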
Use of scala.concurrent.duration.Deadline in project flink by apache.
The class ChaosMonkeyITCase, method waitForJobRemoved.
private void waitForJobRemoved(JobID jobId, JobManagerProcess jobManager,
        ActorSystem actorSystem, FiniteDuration timeout) throws Exception {

    ActorRef jobManagerRef = jobManager.getActorRef(actorSystem, timeout);
    AkkaActorGateway jobManagerGateway = new AkkaActorGateway(jobManagerRef, null);

    // Ask the job manager for its archive actor; a removed job can be
    // looked up there once it has left the running job manager.
    Future<Object> archiveFuture = jobManagerGateway.ask(JobManagerMessages.getRequestArchive(), timeout);
    ActorRef archive = ((JobManagerMessages.ResponseArchive) Await.result(archiveFuture, timeout)).actor();
    AkkaActorGateway archiveGateway = new AkkaActorGateway(archive, null);

    Deadline deadline = timeout.fromNow();

    while (deadline.hasTimeLeft()) {
        // Poll the archive: as long as it does not know the job, the job
        // has not been removed yet.
        JobManagerMessages.JobStatusResponse resp = JobManagerActorTestUtils.requestJobStatus(
                jobId, archiveGateway, deadline.timeLeft());
        if (resp instanceof JobManagerMessages.JobNotFound) {
            Thread.sleep(100);
        } else {
            return;
        }
    }
}
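Note that this helper simply falls out of the while loop when the deadline expires, so a timeout is silent for the caller. A variant of the same poll-until-found shape that reports exhaustion explicitly might look like the sketch below; lookupJob is a hypothetical stand-in for the archive status request.

import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;

public class PollUntilFound {

    // Re-derive a local Deadline from the duration argument, as the
    // helper above does, and poll until found or out of budget.
    static boolean waitUntilFound(FiniteDuration timeout) throws InterruptedException {
        Deadline deadline = timeout.fromNow();
        while (deadline.hasTimeLeft()) {
            if (lookupJob()) {
                return true;   // found: the job reached the archive
            }
            Thread.sleep(100); // not found yet: back off briefly
        }
        return false;          // budget exhausted: make the timeout visible
    }

    private static boolean lookupJob() {
        return false; // hypothetical stand-in for the archive query
    }
}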
Use of scala.concurrent.duration.Deadline in project flink by apache.
The class ZooKeeperLeaderElectionITCase, method testJobExecutionOnClusterWithLeaderReelection.
/**
 * Tests that a job can be executed after a new leader has been elected. For all but the
 * last leader, the job is blocking. The JobManager is terminated while executing the
 * blocking job. Once only one JobManager is left, it is checked that a non-blocking job
 * can be successfully executed.
 */
@Test
public void testJobExecutionOnClusterWithLeaderReelection() throws Exception {
    int numJMs = 10;
    int numTMs = 2;
    int numSlotsPerTM = 3;
    int parallelism = numTMs * numSlotsPerTM;

    File rootFolder = tempFolder.getRoot();

    Configuration configuration = ZooKeeperTestUtils.createZooKeeperHAConfig(
            zkServer.getConnectString(), rootFolder.getPath());

    configuration.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, numJMs);
    configuration.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTMs);
    configuration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTM);

    // We "effectively" disable the automatic RecoverAllJobs message and send it manually to
    // make sure that all TMs have registered at the JM prior to issuing the RecoverAllJobs
    // message.
    configuration.setString(ConfigConstants.AKKA_ASK_TIMEOUT, AkkaUtils.INF_TIMEOUT().toString());

    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(true);

    JobVertex sender = new JobVertex("sender");
    JobVertex receiver = new JobVertex("receiver");

    sender.setInvokableClass(Tasks.Sender.class);
    receiver.setInvokableClass(Tasks.BlockingOnceReceiver.class);

    sender.setParallelism(parallelism);
    receiver.setParallelism(parallelism);

    receiver.connectNewDataSetAsInput(sender, DistributionPattern.POINTWISE, ResultPartitionType.PIPELINED);

    SlotSharingGroup slotSharingGroup = new SlotSharingGroup();
    sender.setSlotSharingGroup(slotSharingGroup);
    receiver.setSlotSharingGroup(slotSharingGroup);

    final JobGraph graph = new JobGraph("Blocking test job", sender, receiver);

    final TestingCluster cluster = new TestingCluster(configuration);

    ActorSystem clientActorSystem = null;
    Thread thread = null;
    JobSubmitterRunnable jobSubmission = null;

    try {
        cluster.start();

        clientActorSystem = cluster.startJobClientActorSystem(graph.getJobID());

        final ActorSystem clientAS = clientActorSystem;

        jobSubmission = new JobSubmitterRunnable(clientAS, cluster, graph);

        thread = new Thread(jobSubmission);
        thread.start();

        Deadline deadline = timeout.$times(3).fromNow();

        // Kill all JobManagers except for the last one
        for (int i = 0; i < numJMs; i++) {
            ActorGateway jm = cluster.getLeaderGateway(deadline.timeLeft());

            cluster.waitForTaskManagersToBeRegisteredAtJobManager(jm.actor());

            // Recover all jobs, sent manually
            log.info("Sent recover all jobs manually to job manager {}.", jm.path());
            jm.tell(JobManagerMessages.getRecoverAllJobs());

            if (i < numJMs - 1) {
                Future<Object> future = jm.ask(
                        new WaitForAllVerticesToBeRunningOrFinished(graph.getJobID()),
                        deadline.timeLeft());
                Await.ready(future, deadline.timeLeft());

                cluster.clearLeader();

                if (i == numJMs - 2) {
                    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);
                }

                log.info("Kill job manager {}.", jm.path());

                jm.tell(TestingJobManagerMessages.getDisablePostStop());
                jm.tell(Kill.getInstance());
            }
        }

        log.info("Waiting for submitter thread to terminate.");

        thread.join(deadline.timeLeft().toMillis());

        log.info("Submitter thread has terminated.");

        if (thread.isAlive()) {
            fail("The job submission thread did not stop (meaning it did not succeed in "
                    + "executing the test job).");
        }

        Await.result(jobSubmission.resultPromise.future(), deadline.timeLeft());
    } finally {
        if (clientActorSystem != null) {
            cluster.shutdownJobClientActorSystem(clientActorSystem);
        }
        if (thread != null && thread.isAlive()) {
            jobSubmission.finished = true;
        }
        cluster.stop();
    }
}
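A Java-specific wrinkle in the snippet above: the deadline is built as timeout.$times(3).fromNow(), because Scala's operator method FiniteDuration.*(factor) compiles to the name $times and must be called that way from Java. A small interop sketch; the three-second base value is an arbitrary example.

import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;
import java.util.concurrent.TimeUnit;

public class DurationInterop {
    public static void main(String[] args) {
        FiniteDuration timeout = new FiniteDuration(3, TimeUnit.SECONDS);

        // Scala's `timeout * 3` is the method `$times` when called from Java.
        FiniteDuration tripled = timeout.$times(3);

        Deadline deadline = tripled.fromNow();
        System.out.println("Budget: " + tripled + ", time left: " + deadline.timeLeft());
    }
}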
Use of scala.concurrent.duration.Deadline in project flink by apache.
The class AbstractQueryableStateITCase, method testValueStateDefault.
/**
 * Tests a simple queryable value state instance with a default value set.
 * Each source emits (subtaskIndex, 0)..(subtaskIndex, numElements) tuples,
 * the key is mapped to 1, but key 0 is queried, which should throw an
 * {@link UnknownKeyOrNamespace} exception.
 *
 * @throws UnknownKeyOrNamespace thrown due to querying a non-existent key
 */
@Test(expected = UnknownKeyOrNamespace.class)
public void testValueStateDefault() throws Exception, UnknownKeyOrNamespace {
    // Config
    final Deadline deadline = TEST_TIMEOUT.fromNow();
    final int numElements = 1024;
    final QueryableStateClient client = new QueryableStateClient(cluster.configuration());
    JobID jobId = null;
    try {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStateBackend(stateBackend);
        env.setParallelism(NUM_SLOTS);
        // Very important, because the cluster is shared between tests and we
        // don't explicitly check that all slots are available before submitting.
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 1000));

        DataStream<Tuple2<Integer, Long>> source =
                env.addSource(new TestAscendingValueSource(numElements));

        // Value state with a default value
        ValueStateDescriptor<Tuple2<Integer, Long>> valueState =
                new ValueStateDescriptor<>("any", source.getType(), Tuple2.of(0, 1337L));

        // Only expose key "1"
        QueryableStateStream<Integer, Tuple2<Integer, Long>> queryableState =
                source.keyBy(new KeySelector<Tuple2<Integer, Long>, Integer>() {
                    @Override
                    public Integer getKey(Tuple2<Integer, Long> value) throws Exception {
                        return 1;
                    }
                }).asQueryableState("hakuna", valueState);

        // Submit the job graph
        JobGraph jobGraph = env.getStreamGraph().getJobGraph();
        jobId = jobGraph.getJobID();

        cluster.submitJobDetached(jobGraph);

        // Now query key 0, which was never written
        int key = 0;
        final byte[] serializedKey = KvStateRequestSerializer.serializeKeyAndNamespace(
                key,
                queryableState.getKeySerializer(),
                VoidNamespace.INSTANCE,
                VoidNamespaceSerializer.INSTANCE);

        Future<byte[]> future = getKvStateWithRetries(client, jobId,
                queryableState.getQueryableStateName(), key, serializedKey,
                QUERY_RETRY_DELAY, true);

        Await.result(future, deadline.timeLeft());
    } finally {
        // Free cluster resources
        if (jobId != null) {
            Future<CancellationSuccess> cancellation = cluster
                    .getLeaderGateway(deadline.timeLeft())
                    .ask(new JobManagerMessages.CancelJob(jobId), deadline.timeLeft())
                    .mapTo(ClassTag$.MODULE$.<CancellationSuccess>apply(CancellationSuccess.class));
            Await.ready(cancellation, deadline.timeLeft());
        }
        client.shutDown();
    }
}
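The cleanup block shows another interop detail: mapping a raw Future<Object> from an actor ask to a typed future requires passing a ClassTag explicitly, via ClassTag$.MODULE$.apply, since Java cannot supply Scala implicits. A minimal sketch, assuming Akka's Futures helper is available to fabricate a completed future in place of a real ask reply.

import akka.dispatch.Futures;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;
import scala.reflect.ClassTag$;
import java.util.concurrent.TimeUnit;

public class ScalaFutureFromJava {
    public static void main(String[] args) throws Exception {
        Deadline deadline = new FiniteDuration(5, TimeUnit.SECONDS).fromNow();

        // A pre-completed future standing in for an actor `ask` reply.
        Future<Object> reply = Futures.successful((Object) "done");

        // mapTo needs an explicit ClassTag when called from Java.
        Future<String> typed = reply.mapTo(ClassTag$.MODULE$.<String>apply(String.class));

        // Block for at most the remaining budget.
        String result = Await.result(typed, deadline.timeLeft());
        System.out.println(result);
    }
}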