Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
From class TaskManagerRegistrationTest, method testTaskManagerResumesConnectAfterRefusedRegistration.
/**
 * Make sure that the TaskManager keeps trying to register, even after
 * registration attempts have been refused.
 */
@Test
public void testTaskManagerResumesConnectAfterRefusedRegistration() {
    new JavaTestKit(actorSystem) {
        {
            ActorGateway jm = null;
            ActorGateway taskManager = null;
            try {
                jm = TestingUtils.createForwardingActor(actorSystem, getTestActor(), Option.<String>empty());
                final ActorGateway jmGateway = jm;

                FiniteDuration refusedRegistrationPause = new FiniteDuration(500, TimeUnit.MILLISECONDS);
                Configuration tmConfig = new Configuration(config);
                tmConfig.setString(ConfigConstants.TASK_MANAGER_REFUSED_REGISTRATION_PAUSE, refusedRegistrationPause.toString());

                // we make the test actor (the test kit) the JobManager to intercept
                // the messages
                taskManager = createTaskManager(actorSystem, jmGateway, tmConfig, true, false);
                final ActorGateway taskManagerGateway = taskManager;

                // check and decline initial registration
                new Within(timeout) {
                    @Override
                    protected void run() {
                        // the TaskManager should try to register
                        expectMsgClass(RegisterTaskManager.class);
                        // we decline the registration
                        taskManagerGateway.tell(new RefuseRegistration(new Exception("test reason")), jmGateway);
                    }
                };

                // the TaskManager should wait a bit and retry...
                FiniteDuration maxDelay = (FiniteDuration) refusedRegistrationPause.$times(3.0);
                new Within(maxDelay) {
                    @Override
                    protected void run() {
                        expectMsgClass(RegisterTaskManager.class);
                    }
                };
            } catch (Throwable e) {
                e.printStackTrace();
                fail(e.getMessage());
            } finally {
                stopActor(taskManager);
                stopActor(jm);
            }
        }
    };
}
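For orientation, here is a minimal sketch of the ActorGateway surface that this test (and the ones below) lean on: fire-and-forget tell, tell with an explicit sender, and ask with a timeout. The class and variable names (ActorGatewaySketch, target, message) are illustrative, not taken from the test:

import java.util.UUID;
import java.util.concurrent.TimeUnit;

import akka.actor.ActorRef;

import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.instance.AkkaActorGateway;

import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;

public class ActorGatewaySketch {

    // Wraps a raw Akka ActorRef together with the current leader session ID
    // and shows the three interaction styles used throughout these tests.
    public static Object interact(ActorRef target, ActorGateway sender, UUID leaderSessionID, Object message) throws Exception {
        ActorGateway gateway = new AkkaActorGateway(target, leaderSessionID);

        // fire-and-forget; the recipient sees the gateway's own actor as sender
        gateway.tell(message);

        // same message, but the recipient sees 'sender' as the sender;
        // this is how the test above poses as the JobManager
        gateway.tell(message, sender);

        // request-response with an explicit timeout
        FiniteDuration timeout = new FiniteDuration(10, TimeUnit.SECONDS);
        Future<Object> reply = gateway.ask(message, timeout);
        return Await.result(reply, timeout);
    }
}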
Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
From class TaskManagerTest, method testLocalPartitionNotFound.
/**
 * Tests that repeated local {@link PartitionNotFoundException}s ultimately fail the receiver.
 */
@Test
public void testLocalPartitionNotFound() throws Exception {
    new JavaTestKit(system) {
        {
            ActorGateway jobManager = null;
            ActorGateway taskManager = null;
            final ActorGateway testActorGateway = new AkkaActorGateway(getTestActor(), leaderSessionID);
            try {
                final IntermediateDataSetID resultId = new IntermediateDataSetID();

                // Create the JM
                ActorRef jm = system.actorOf(Props.create(new SimplePartitionStateLookupJobManagerCreator(leaderSessionID, getTestActor())));
                jobManager = new AkkaActorGateway(jm, leaderSessionID);

                final Configuration config = new Configuration();
                config.setInteger(TaskManagerOptions.NETWORK_REQUEST_BACKOFF_INITIAL, 100);
                config.setInteger(TaskManagerOptions.NETWORK_REQUEST_BACKOFF_MAX, 200);
                taskManager = TestingUtils.createTaskManager(system, jobManager, config, true, true);

                // ---------------------------------------------------------------------------------

                final ActorGateway tm = taskManager;

                final JobID jid = new JobID();
                final JobVertexID vid = new JobVertexID();
                final ExecutionAttemptID eid = new ExecutionAttemptID();
                final ResultPartitionID partitionId = new ResultPartitionID();

                // Local location (on the same TM though) for the partition
                final ResultPartitionLocation loc = ResultPartitionLocation.createLocal();
                final InputChannelDeploymentDescriptor[] icdd = new InputChannelDeploymentDescriptor[] { new InputChannelDeploymentDescriptor(partitionId, loc) };
                final InputGateDeploymentDescriptor igdd = new InputGateDeploymentDescriptor(resultId, ResultPartitionType.PIPELINED, 0, icdd);

                final TaskDeploymentDescriptor tdd = createTaskDeploymentDescriptor(
                        jid, "TestJob", vid, eid,
                        new SerializedValue<>(new ExecutionConfig()),
                        "Receiver", 1, 0, 1, 0,
                        new Configuration(), new Configuration(),
                        Tasks.AgnosticReceiver.class.getName(),
                        Collections.<ResultPartitionDeploymentDescriptor>emptyList(),
                        Collections.singletonList(igdd),
                        Collections.<BlobKey>emptyList(),
                        Collections.<URL>emptyList(),
                        0);

                new Within(new FiniteDuration(120, TimeUnit.SECONDS)) {
                    @Override
                    protected void run() {
                        // Submit the task
                        tm.tell(new SubmitTask(tdd), testActorGateway);
                        expectMsgClass(Acknowledge.get().getClass());

                        // Wait to be notified about the final execution state by the mock JM
                        TaskExecutionState msg = expectMsgClass(TaskExecutionState.class);

                        // The task should fail after repeated requests
                        assertEquals(ExecutionState.FAILED, msg.getExecutionState());

                        Throwable error = msg.getError(getClass().getClassLoader());
                        if (error.getClass() != PartitionNotFoundException.class) {
                            error.printStackTrace();
                            fail("Wrong exception: " + error.getMessage());
                        }
                    }
                };
            } catch (Exception e) {
                e.printStackTrace();
                fail(e.getMessage());
            } finally {
                TestingUtils.stopActor(taskManager);
                TestingUtils.stopActor(jobManager);
            }
        }
    };
}
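The structure above is the standard intercept pattern in these tests: the JavaTestKit's test actor is wrapped in an AkkaActorGateway and handed to the component under test as if it were the JobManager, so every message the component sends can be asserted with expectMsg*. A stripped-down skeleton of that pattern, as a sketch; 'component' and 'request' are placeholders for whatever gateway and message a concrete test uses:

import java.util.UUID;

import akka.actor.ActorSystem;
import akka.testkit.JavaTestKit;

import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.instance.AkkaActorGateway;

public class InterceptSketch {

    // Skeleton of the intercept pattern: the test kit's actor poses as the
    // JobManager, so replies addressed to the "JM" can be asserted directly.
    public static Object askViaTestKit(ActorSystem system, final ActorGateway component, final Object request, final UUID leaderSessionID) {
        final Object[] reply = new Object[1];
        new JavaTestKit(system) {
            {
                // pose as the JobManager
                ActorGateway self = new AkkaActorGateway(getTestActor(), leaderSessionID);

                // the component will answer to 'self', i.e. to the test actor
                component.tell(request, self);

                // block until any reply arrives; real tests use a concrete class
                reply[0] = expectMsgClass(Object.class);
            }
        };
        return reply[0];
    }
}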
Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
From class AccumulatorLiveITCase, method verifyResults.
private static void verifyResults() {
    new JavaTestKit(system) {
        {
            ActorGateway selfGateway = new AkkaActorGateway(getRef(), jobManagerGateway.leaderSessionID());

            // register for accumulator changes
            jobManagerGateway.tell(new TestingJobManagerMessages.NotifyWhenAccumulatorChange(jobID), selfGateway);
            expectMsgEquals(TIMEOUT, true);

            // submit job
            jobManagerGateway.tell(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.EXECUTION_RESULT), selfGateway);
            expectMsgClass(TIMEOUT, JobManagerMessages.JobSubmitSuccess.class);

            TestingJobManagerMessages.UpdatedAccumulators msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
            Map<String, Accumulator<?, ?>> userAccumulators = msg.userAccumulators();

            /* Check for accumulator values */
            if (checkUserAccumulators(0, userAccumulators)) {
                LOG.info("Passed initial check for map task.");
            } else {
                fail("Wrong accumulator results when map task begins execution.");
            }

            int expectedAccVal = 0;

            /* for mapper task */
            for (int i = 1; i <= NUM_ITERATIONS; i++) {
                expectedAccVal += i;

                // receive message
                msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
                userAccumulators = msg.userAccumulators();
                LOG.info("{}", userAccumulators);

                if (checkUserAccumulators(expectedAccVal, userAccumulators)) {
                    LOG.info("Passed round #" + i);
                } else {
                    fail("Failed in round #" + i);
                }
            }

            msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
            userAccumulators = msg.userAccumulators();

            if (checkUserAccumulators(expectedAccVal, userAccumulators)) {
                LOG.info("Passed initial check for sink task.");
            } else {
                fail("Wrong accumulator results when sink task begins execution.");
            }

            /* for sink task */
            for (int i = 1; i <= NUM_ITERATIONS; i++) {
                // receive message
                msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
                userAccumulators = msg.userAccumulators();
                LOG.info("{}", userAccumulators);

                if (checkUserAccumulators(expectedAccVal, userAccumulators)) {
                    LOG.info("Passed round #" + i);
                } else {
                    fail("Failed in round #" + i);
                }
            }

            expectMsgClass(TIMEOUT, JobManagerMessages.JobResultSuccess.class);
        }
    };
}
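The helper checkUserAccumulators is not part of this excerpt. A plausible reconstruction, assuming a single integer accumulator registered under a well-known name (ACCUMULATOR_NAME is an assumption, not taken from the test):

import java.util.Map;

import org.apache.flink.api.common.accumulators.Accumulator;

public class AccumulatorCheckSketch {

    // Assumed name of the counter the user function registers (hypothetical).
    private static final String ACCUMULATOR_NAME = "test-accumulator";

    // Returns true if the accumulator exists and carries the expected count.
    static boolean checkUserAccumulators(int expected, Map<String, Accumulator<?, ?>> accumulators) {
        Accumulator<?, ?> acc = accumulators.get(ACCUMULATOR_NAME);
        return acc != null && Integer.valueOf(expected).equals(acc.getLocalValue());
    }
}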
Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
From class SavepointITCase, method testTriggerSavepointAndResumeWithFileBasedCheckpoints.
/**
 * Triggers a savepoint for a job that uses the FsStateBackend. We expect
 * that all checkpoint files are written to a new savepoint directory.
 *
 * <ol>
 * <li>Submit job, wait for some progress</li>
 * <li>Trigger savepoint and verify that savepoint has been created</li>
 * <li>Shut down the cluster, re-submit the job from the savepoint,
 * verify that the initial state has been reset, and
 * all tasks are running again</li>
 * <li>Cancel job, dispose the savepoint, and verify that everything
 * has been cleaned up</li>
 * </ol>
 */
@Test
public void testTriggerSavepointAndResumeWithFileBasedCheckpoints() throws Exception {
    // Config
    final int numTaskManagers = 2;
    final int numSlotsPerTaskManager = 2;
    final int parallelism = numTaskManagers * numSlotsPerTaskManager;
    final Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();
    final File testRoot = folder.newFolder();

    TestingCluster flink = null;

    try {
        // Create a test actor system
        ActorSystem testActorSystem = AkkaUtils.createDefaultActorSystem();

        // Flink configuration
        final Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTaskManagers);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTaskManager);

        final File checkpointDir = new File(testRoot, "checkpoints");
        final File savepointRootDir = new File(testRoot, "savepoints");

        if (!checkpointDir.mkdir() || !savepointRootDir.mkdirs()) {
            fail("Test setup failed: failed to create temporary directories.");
        }

        // Use file based checkpoints
        config.setString(CoreOptions.STATE_BACKEND, "filesystem");
        config.setString(FsStateBackendFactory.CHECKPOINT_DIRECTORY_URI_CONF_KEY, checkpointDir.toURI().toString());
        config.setString(FsStateBackendFactory.MEMORY_THRESHOLD_CONF_KEY, "0");
        config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, savepointRootDir.toURI().toString());

        // Start Flink
        flink = new TestingCluster(config);
        flink.start(true);

        // Submit the job
        final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
        final JobID jobId = jobGraph.getJobID();

        // Reset the static test job helpers
        StatefulCounter.resetForTest(parallelism);

        // Retrieve the job manager
        ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());

        LOG.info("Submitting job " + jobGraph.getJobID() + " in detached mode.");
        flink.submitJobDetached(jobGraph);

        LOG.info("Waiting for some progress.");

        // wait for the JobManager to be ready
        Future<Object> allRunning = jobManager.ask(new WaitForAllVerticesToBeRunning(jobId), deadline.timeLeft());
        Await.ready(allRunning, deadline.timeLeft());

        // wait for the Tasks to be ready
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

        LOG.info("Triggering a savepoint.");
        Future<Object> savepointPathFuture = jobManager.ask(new TriggerSavepoint(jobId, Option.<String>empty()), deadline.timeLeft());
        final String savepointPath = ((TriggerSavepointSuccess) Await.result(savepointPathFuture, deadline.timeLeft())).savepointPath();
        LOG.info("Retrieved savepoint path: " + savepointPath + ".");

        // Retrieve the savepoint from the testing job manager
        LOG.info("Requesting the savepoint.");
        Future<Object> savepointFuture = jobManager.ask(new RequestSavepoint(savepointPath), deadline.timeLeft());
        SavepointV1 savepoint = (SavepointV1) ((ResponseSavepoint) Await.result(savepointFuture, deadline.timeLeft())).savepoint();
        LOG.info("Retrieved savepoint: " + savepointPath + ".");

        // Shut down the Flink cluster (thereby canceling the job)
        LOG.info("Shutting down Flink cluster.");
        flink.shutdown();
        flink.awaitTermination();

        // - Verification START -------------------------------------------

        // Only one savepoint should exist
        File[] files = savepointRootDir.listFiles();

        if (files != null) {
            assertEquals("Savepoint not created in expected directory", 1, files.length);
            assertTrue("Savepoint did not create self-contained directory", files[0].isDirectory());

            File savepointDir = files[0];
            File[] savepointFiles = savepointDir.listFiles();
            assertNotNull(savepointFiles);

            // Expect one metadata file and one checkpoint file per stateful
            // parallel subtask
            String errMsg = "Did not write expected number of savepoint/checkpoint files to directory: " + Arrays.toString(savepointFiles);
            assertEquals(errMsg, 1 + parallelism, savepointFiles.length);
        } else {
            fail("Savepoint not created in expected directory");
        }

        // We currently have the following directory layout: checkpointDir/jobId/chk-ID
        File jobCheckpoints = new File(checkpointDir, jobId.toString());

        if (jobCheckpoints.exists()) {
            files = jobCheckpoints.listFiles();
            assertNotNull("Checkpoint directory empty", files);
            assertEquals("Checkpoints directory not clean: " + Arrays.toString(files), 0, files.length);
        }

        // - Verification END ---------------------------------------------

        // Restart the cluster
        LOG.info("Restarting Flink cluster.");
        flink.start();

        // Retrieve the job manager
        LOG.info("Retrieving JobManager.");
        jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("JobManager: " + jobManager + ".");

        // Reset static test helpers
        StatefulCounter.resetForTest(parallelism);

        // Gather all task deployment descriptors
        final Throwable[] error = new Throwable[1];
        final TestingCluster finalFlink = flink;
        final Multimap<JobVertexID, TaskDeploymentDescriptor> tdds = HashMultimap.create();

        new JavaTestKit(testActorSystem) {
            {
                new Within(deadline.timeLeft()) {
                    @Override
                    protected void run() {
                        try {
                            // Register to all submit task messages for job
                            for (ActorRef taskManager : finalFlink.getTaskManagersAsJava()) {
                                taskManager.tell(new TestingTaskManagerMessages.RegisterSubmitTaskListener(jobId), getTestActor());
                            }

                            // Set the savepoint path
                            jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));

                            LOG.info("Resubmitting job " + jobGraph.getJobID() + " with savepoint path " + savepointPath + " in detached mode.");

                            // Submit the job
                            finalFlink.submitJobDetached(jobGraph);

                            int numTasks = 0;
                            for (JobVertex jobVertex : jobGraph.getVertices()) {
                                numTasks += jobVertex.getParallelism();
                            }

                            // Gather the task deployment descriptors
                            LOG.info("Gathering " + numTasks + " submitted TaskDeploymentDescriptor instances.");

                            for (int i = 0; i < numTasks; i++) {
                                ResponseSubmitTaskListener resp = (ResponseSubmitTaskListener) expectMsgAnyClassOf(getRemainingTime(), ResponseSubmitTaskListener.class);

                                TaskDeploymentDescriptor tdd = resp.tdd();
                                LOG.info("Received: " + tdd.toString() + ".");

                                TaskInformation taskInformation = tdd.getSerializedTaskInformation().deserializeValue(getClass().getClassLoader());

                                tdds.put(taskInformation.getJobVertexId(), tdd);
                            }
                        } catch (Throwable t) {
                            error[0] = t;
                        }
                    }
                };
            }
        };

        // - Verification START -------------------------------------------

        String errMsg = "Error during gathering of TaskDeploymentDescriptors";
        assertNull(errMsg, error[0]);

        // Verify that every task state in the savepoint has a matching
        // task deployment descriptor.
        for (TaskState taskState : savepoint.getTaskStates()) {
            Collection<TaskDeploymentDescriptor> taskTdds = tdds.get(taskState.getJobVertexID());

            errMsg = "Missing task for savepoint state for operator " + taskState.getJobVertexID() + ".";
            assertTrue(errMsg, taskTdds.size() > 0);

            assertEquals(taskState.getNumberCollectedStates(), taskTdds.size());

            for (TaskDeploymentDescriptor tdd : taskTdds) {
                SubtaskState subtaskState = taskState.getState(tdd.getSubtaskIndex());

                assertNotNull(subtaskState);

                errMsg = "Initial operator state mismatch.";
                assertEquals(errMsg, subtaskState.getLegacyOperatorState(), tdd.getTaskStateHandles().getLegacyOperatorState());
            }
        }

        // Await state is restored
        StatefulCounter.getRestoreLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

        // Await some progress after restore
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

        // - Verification END ---------------------------------------------

        LOG.info("Cancelling job " + jobId + ".");
        jobManager.tell(new CancelJob(jobId));

        LOG.info("Disposing savepoint " + savepointPath + ".");
        Future<Object> disposeFuture = jobManager.ask(new DisposeSavepoint(savepointPath), deadline.timeLeft());

        errMsg = "Failed to dispose savepoint " + savepointPath + ".";
        Object resp = Await.result(disposeFuture, deadline.timeLeft());
        assertTrue(errMsg, resp.getClass() == getDisposeSavepointSuccess().getClass());

        // - Verification START -------------------------------------------

        // The checkpoint files
        List<File> checkpointFiles = new ArrayList<>();

        for (TaskState stateForTaskGroup : savepoint.getTaskStates()) {
            for (SubtaskState subtaskState : stateForTaskGroup.getStates()) {
                ChainedStateHandle<StreamStateHandle> streamTaskState = subtaskState.getLegacyOperatorState();

                for (int i = 0; i < streamTaskState.getLength(); i++) {
                    if (streamTaskState.get(i) != null) {
                        FileStateHandle fileStateHandle = (FileStateHandle) streamTaskState.get(i);
                        checkpointFiles.add(new File(fileStateHandle.getFilePath().toUri()));
                    }
                }
            }
        }

        // The checkpoint files of the savepoint should have been discarded
        for (File f : checkpointFiles) {
            errMsg = "Checkpoint file " + f + " not cleaned up properly.";
            assertFalse(errMsg, f.exists());
        }

        if (checkpointFiles.size() > 0) {
            File parent = checkpointFiles.get(0).getParentFile();
            errMsg = "Checkpoint parent directory " + parent + " not cleaned up properly.";
            assertFalse(errMsg, parent.exists());
        }

        // All savepoints should have been cleaned up
        errMsg = "Savepoints directory not cleaned up properly: " + Arrays.toString(savepointRootDir.listFiles()) + ".";
        assertEquals(errMsg, 0, savepointRootDir.listFiles().length);

        // - Verification END ---------------------------------------------
    } finally {
        if (flink != null) {
            flink.shutdown();
        }
    }
}
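Boiled down, the savepoint lifecycle in this test is a pair of ask round trips against the JobManager gateway. A condensed sketch using the same message types as the test above (import paths are assumed from the Flink 1.2-era JobManagerMessages), minus all verification:

import org.apache.flink.api.common.JobID;
import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.messages.JobManagerMessages.DisposeSavepoint;
import org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint;
import org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepointSuccess;

import scala.Option;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;

public class SavepointRoundTripSketch {

    // Trigger a savepoint, then dispose it: the two ask round trips the
    // test above performs; 'timeout' plays the role of deadline.timeLeft().
    static String triggerAndDispose(ActorGateway jobManager, JobID jobId, FiniteDuration timeout) throws Exception {
        // 1) trigger: the success reply carries the path the savepoint was written to
        Future<Object> trigger = jobManager.ask(new TriggerSavepoint(jobId, Option.<String>empty()), timeout);
        String savepointPath = ((TriggerSavepointSuccess) Await.result(trigger, timeout)).savepointPath();

        // 2) dispose: deletes the savepoint files again
        Future<Object> dispose = jobManager.ask(new DisposeSavepoint(savepointPath), timeout);
        Await.result(dispose, timeout); // reply should be the dispose-success message

        return savepointPath;
    }
}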
Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
From class SavepointMigrationTestBase, method executeAndSavepoint.
@SafeVarargs
protected final void executeAndSavepoint(StreamExecutionEnvironment env, String savepointPath, Tuple2<String, Integer>... expectedAccumulators) throws Exception {
    // Retrieve the job manager
    ActorGateway jobManager = Await.result(cluster.leaderGateway().future(), DEADLINE.timeLeft());

    // Submit the job
    JobGraph jobGraph = env.getStreamGraph().getJobGraph();
    JobSubmissionResult jobSubmissionResult = cluster.submitJobDetached(jobGraph);

    LOG.info("Submitted job {} and waiting...", jobSubmissionResult.getJobID());

    StandaloneClusterClient clusterClient = new StandaloneClusterClient(cluster.configuration());

    boolean done = false;
    while (DEADLINE.hasTimeLeft()) {
        Thread.sleep(100);
        Map<String, Object> accumulators = clusterClient.getAccumulators(jobSubmissionResult.getJobID());

        boolean allDone = true;
        for (Tuple2<String, Integer> acc : expectedAccumulators) {
            Integer numFinished = (Integer) accumulators.get(acc.f0);
            if (numFinished == null) {
                allDone = false;
                break;
            }
            if (!numFinished.equals(acc.f1)) {
                allDone = false;
                break;
            }
        }
        if (allDone) {
            done = true;
            break;
        }
    }

    if (!done) {
        fail("Did not see the expected accumulator results within time limit.");
    }

    LOG.info("Triggering savepoint.");

    // Flink 1.2
    final Future<Object> savepointResultFuture = jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobSubmissionResult.getJobID(), Option.<String>empty()), DEADLINE.timeLeft());

    // Flink 1.1
    // final Future<Object> savepointResultFuture =
    //     jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobSubmissionResult.getJobID()), DEADLINE.timeLeft());

    Object savepointResult = Await.result(savepointResultFuture, DEADLINE.timeLeft());

    if (savepointResult instanceof JobManagerMessages.TriggerSavepointFailure) {
        fail("Error drawing savepoint: " + ((JobManagerMessages.TriggerSavepointFailure) savepointResult).cause());
    }

    // The JobManager stores the savepoint on the heap; we have to retrieve it.
    final String jobmanagerSavepointPath = ((JobManagerMessages.TriggerSavepointSuccess) savepointResult).savepointPath();
    LOG.info("Saved savepoint: " + jobmanagerSavepointPath);

    // Flink 1.2
    FileUtils.moveFile(new File(new URI(jobmanagerSavepointPath).getPath()), new File(savepointPath));

    // Flink 1.1
    // Retrieve the savepoint from the testing job manager
    // LOG.info("Requesting the savepoint.");
    // Future<Object> savepointFuture = jobManager.ask(new TestingJobManagerMessages.RequestSavepoint(jobmanagerSavepointPath), DEADLINE.timeLeft());
    //
    // Savepoint savepoint = ((TestingJobManagerMessages.ResponseSavepoint) Await.result(savepointFuture, DEADLINE.timeLeft())).savepoint();
    // LOG.info("Retrieved savepoint: " + jobmanagerSavepointPath + ".");
    //
    // LOG.info("Storing savepoint to file.");
    // Configuration config = new Configuration();
    // config.setString(org.apache.flink.runtime.checkpoint.savepoint.SavepointStoreFactory.SAVEPOINT_BACKEND_KEY, "filesystem");
    // config.setString(org.apache.flink.runtime.checkpoint.savepoint.SavepointStoreFactory.SAVEPOINT_DIRECTORY_KEY, "file:///Users/aljoscha/Downloads");
    // String path = org.apache.flink.runtime.checkpoint.savepoint.SavepointStoreFactory.createFromConfig(config).storeSavepoint(savepoint);
    //
    // FileUtils.moveFile(new File(new URI(path).getPath()), new File(savepointPath));
}
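The accumulator wait above is a poll-until-deadline loop. Distilled into a reusable helper, as a sketch; the 100 ms poll interval is carried over from the method above:

import java.util.concurrent.Callable;

import scala.concurrent.duration.Deadline;

public class PollingSketch {

    // Repeatedly evaluate 'condition' until it returns true or the
    // deadline expires; mirrors the accumulator wait loop above.
    static boolean pollUntil(Deadline deadline, Callable<Boolean> condition) throws Exception {
        while (deadline.hasTimeLeft()) {
            if (condition.call()) {
                return true;
            }
            Thread.sleep(100);
        }
        return false;
    }
}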