use of org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService in project flink by apache.
the class JobManagerTest method testKvStateMessages.
/**
* Tests that the JobManager handles {@link org.apache.flink.runtime.query.KvStateMessage}
* instances as expected.
*/
@Test
public void testKvStateMessages() throws Exception {
Deadline deadline = new FiniteDuration(100, TimeUnit.SECONDS).fromNow();
Configuration config = new Configuration();
config.setString(ConfigConstants.AKKA_ASK_TIMEOUT, "100ms");
UUID leaderSessionId = null;
ActorGateway jobManager = new AkkaActorGateway(JobManager.startJobManagerActors(config, system, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), TestingJobManager.class, MemoryArchivist.class)._1(), leaderSessionId);
LeaderRetrievalService leaderRetrievalService = new StandaloneLeaderRetrievalService(AkkaUtils.getAkkaURL(system, jobManager.actor()));
Configuration tmConfig = new Configuration();
tmConfig.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 4);
tmConfig.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 8);
ActorRef taskManager = TaskManager.startTaskManagerComponentsAndActor(tmConfig, ResourceID.generate(), system, "localhost", scala.Option.<String>empty(), scala.Option.apply(leaderRetrievalService), true, TestingTaskManager.class);
Future<Object> registrationFuture = jobManager.ask(new NotifyWhenAtLeastNumTaskManagerAreRegistered(1), deadline.timeLeft());
Await.ready(registrationFuture, deadline.timeLeft());
//
// Location lookup
//
LookupKvStateLocation lookupNonExistingJob = new LookupKvStateLocation(new JobID(), "any-name");
Future<KvStateLocation> lookupFuture = jobManager.ask(lookupNonExistingJob, deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<KvStateLocation>apply(KvStateLocation.class));
try {
Await.result(lookupFuture, deadline.timeLeft());
fail("Did not throw expected Exception");
} catch (IllegalStateException ignored) {
// Expected
}
JobGraph jobGraph = new JobGraph("croissant");
JobVertex jobVertex1 = new JobVertex("cappuccino");
jobVertex1.setParallelism(4);
jobVertex1.setMaxParallelism(16);
jobVertex1.setInvokableClass(BlockingNoOpInvokable.class);
JobVertex jobVertex2 = new JobVertex("americano");
jobVertex2.setParallelism(4);
jobVertex2.setMaxParallelism(16);
jobVertex2.setInvokableClass(BlockingNoOpInvokable.class);
jobGraph.addVertex(jobVertex1);
jobGraph.addVertex(jobVertex2);
Future<JobSubmitSuccess> submitFuture = jobManager.ask(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED), deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<JobSubmitSuccess>apply(JobSubmitSuccess.class));
Await.result(submitFuture, deadline.timeLeft());
Object lookupUnknownRegistrationName = new LookupKvStateLocation(jobGraph.getJobID(), "unknown");
lookupFuture = jobManager.ask(lookupUnknownRegistrationName, deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<KvStateLocation>apply(KvStateLocation.class));
try {
Await.result(lookupFuture, deadline.timeLeft());
fail("Did not throw expected Exception");
} catch (UnknownKvStateLocation ignored) {
// Expected
}
//
// Registration
//
NotifyKvStateRegistered registerNonExistingJob = new NotifyKvStateRegistered(new JobID(), new JobVertexID(), new KeyGroupRange(0, 0), "any-name", new KvStateID(), new KvStateServerAddress(InetAddress.getLocalHost(), 1233));
jobManager.tell(registerNonExistingJob);
LookupKvStateLocation lookupAfterRegistration = new LookupKvStateLocation(registerNonExistingJob.getJobId(), registerNonExistingJob.getRegistrationName());
lookupFuture = jobManager.ask(lookupAfterRegistration, deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<KvStateLocation>apply(KvStateLocation.class));
try {
Await.result(lookupFuture, deadline.timeLeft());
fail("Did not throw expected Exception");
} catch (IllegalStateException ignored) {
// Expected
}
NotifyKvStateRegistered registerForExistingJob = new NotifyKvStateRegistered(jobGraph.getJobID(), jobVertex1.getID(), new KeyGroupRange(0, 0), "register-me", new KvStateID(), new KvStateServerAddress(InetAddress.getLocalHost(), 1293));
jobManager.tell(registerForExistingJob);
lookupAfterRegistration = new LookupKvStateLocation(registerForExistingJob.getJobId(), registerForExistingJob.getRegistrationName());
lookupFuture = jobManager.ask(lookupAfterRegistration, deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<KvStateLocation>apply(KvStateLocation.class));
KvStateLocation location = Await.result(lookupFuture, deadline.timeLeft());
assertNotNull(location);
assertEquals(jobGraph.getJobID(), location.getJobId());
assertEquals(jobVertex1.getID(), location.getJobVertexId());
assertEquals(jobVertex1.getMaxParallelism(), location.getNumKeyGroups());
assertEquals(1, location.getNumRegisteredKeyGroups());
KeyGroupRange keyGroupRange = registerForExistingJob.getKeyGroupRange();
assertEquals(1, keyGroupRange.getNumberOfKeyGroups());
assertEquals(registerForExistingJob.getKvStateId(), location.getKvStateID(keyGroupRange.getStartKeyGroup()));
assertEquals(registerForExistingJob.getKvStateServerAddress(), location.getKvStateServerAddress(keyGroupRange.getStartKeyGroup()));
//
// Unregistration
//
NotifyKvStateUnregistered unregister = new NotifyKvStateUnregistered(registerForExistingJob.getJobId(), registerForExistingJob.getJobVertexId(), registerForExistingJob.getKeyGroupRange(), registerForExistingJob.getRegistrationName());
jobManager.tell(unregister);
lookupFuture = jobManager.ask(lookupAfterRegistration, deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<KvStateLocation>apply(KvStateLocation.class));
try {
Await.result(lookupFuture, deadline.timeLeft());
fail("Did not throw expected Exception");
} catch (UnknownKvStateLocation ignored) {
// Expected
}
//
// Duplicate registration fails task
//
NotifyKvStateRegistered register = new NotifyKvStateRegistered(jobGraph.getJobID(), jobVertex1.getID(), new KeyGroupRange(0, 0), "duplicate-me", new KvStateID(), new KvStateServerAddress(InetAddress.getLocalHost(), 1293));
NotifyKvStateRegistered duplicate = new NotifyKvStateRegistered(jobGraph.getJobID(), // <--- different operator, but...
jobVertex2.getID(), new KeyGroupRange(0, 0), // ...same name
"duplicate-me", new KvStateID(), new KvStateServerAddress(InetAddress.getLocalHost(), 1293));
Future<TestingJobManagerMessages.JobStatusIs> failedFuture = jobManager.ask(new NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.FAILED), deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<JobStatusIs>apply(JobStatusIs.class));
jobManager.tell(register);
jobManager.tell(duplicate);
// Wait for failure
JobStatusIs jobStatus = Await.result(failedFuture, deadline.timeLeft());
assertEquals(JobStatus.FAILED, jobStatus.state());
}
use of org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService in project flink by apache.
the class TaskManagerMetricsTest method testMetricRegistryLifeCycle.
/**
* Tests the metric registry life cycle on JobManager re-connects.
*/
@Test
public void testMetricRegistryLifeCycle() throws Exception {
ActorSystem actorSystem = null;
try {
actorSystem = AkkaUtils.createLocalActorSystem(new Configuration());
// ================================================================
// Start JobManager
// ================================================================
final ActorRef jobManager = JobManager.startJobManagerActors(new Configuration(), actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
LeaderRetrievalService leaderRetrievalService = new StandaloneLeaderRetrievalService(jobManager.path().toString());
// ================================================================
// Start TaskManager
// ================================================================
final Configuration config = new Configuration();
final ResourceID tmResourceID = ResourceID.generate();
TaskManagerServicesConfiguration taskManagerServicesConfiguration = TaskManagerServicesConfiguration.fromConfiguration(config, InetAddress.getLocalHost(), false);
TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(config);
TaskManagerServices taskManagerServices = TaskManagerServices.fromConfiguration(taskManagerServicesConfiguration, tmResourceID);
final MetricRegistry tmRegistry = taskManagerServices.getMetricRegistry();
// create the task manager
final Props tmProps = TaskManager.getTaskManagerProps(TaskManager.class, taskManagerConfiguration, tmResourceID, taskManagerServices.getTaskManagerLocation(), taskManagerServices.getMemoryManager(), taskManagerServices.getIOManager(), taskManagerServices.getNetworkEnvironment(), leaderRetrievalService, tmRegistry);
final ActorRef taskManager = actorSystem.actorOf(tmProps);
new JavaTestKit(actorSystem) {
{
new Within(new FiniteDuration(5000, TimeUnit.SECONDS)) {
@Override
protected void run() {
taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
// wait for the TM to be registered
expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
// trigger re-registration of TM; this should include a disconnect from the current JM
taskManager.tell(new TaskManagerMessages.JobManagerLeaderAddress(jobManager.path().toString(), null), jobManager);
// wait for re-registration to be completed
taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
}
};
}
};
// verify that the registry was not shutdown due to the disconnect
Assert.assertFalse(tmRegistry.isShutdown());
// shut down the actors and the actor system
actorSystem.shutdown();
actorSystem.awaitTermination();
} finally {
if (actorSystem != null) {
actorSystem.shutdown();
}
}
}
use of org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService in project flink by apache.
the class TaskManagerProcessReapingTestBase method testReapProcessOnFailure.
@Test
public void testReapProcessOnFailure() {
Process taskManagerProcess = null;
ActorSystem jmActorSystem = null;
final StringWriter processOutput = new StringWriter();
try {
String javaCommand = getJavaCommandPath();
// is available on this machine
if (javaCommand == null) {
System.out.println("---- Skipping TaskManagerProcessReapingTest : Could not find java executable ----");
return;
}
// create a logging file for the process
File tempLogFile = File.createTempFile("testlogconfig", "properties");
tempLogFile.deleteOnExit();
CommonTestUtils.printLog4jDebugConfig(tempLogFile);
final int jobManagerPort = NetUtils.getAvailablePort();
// start a JobManager
Tuple2<String, Object> localAddress = new Tuple2<String, Object>("localhost", jobManagerPort);
jmActorSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<Tuple2<String, Object>>(localAddress));
ActorRef jmActor = JobManager.startJobManagerActors(new Configuration(), jmActorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1;
// start a ResourceManager
StandaloneLeaderRetrievalService standaloneLeaderRetrievalService = new StandaloneLeaderRetrievalService(AkkaUtils.getAkkaURL(jmActorSystem, jmActor));
FlinkResourceManager.startResourceManagerActors(new Configuration(), jmActorSystem, standaloneLeaderRetrievalService, StandaloneResourceManager.class);
final int taskManagerPort = NetUtils.getAvailablePort();
// start the task manager process
String[] command = new String[] { javaCommand, "-Dlog.level=DEBUG", "-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath(), "-Xms256m", "-Xmx256m", "-classpath", getCurrentClasspath(), TaskManagerTestEntryPoint.class.getName(), String.valueOf(jobManagerPort), String.valueOf(taskManagerPort) };
ProcessBuilder bld = new ProcessBuilder(command);
taskManagerProcess = bld.start();
new PipeForwarder(taskManagerProcess.getErrorStream(), processOutput);
// grab the reference to the TaskManager. try multiple times, until the process
// is started and the TaskManager is up
String taskManagerActorName = String.format("akka.tcp://flink@%s/user/%s", "localhost:" + taskManagerPort, TaskManager.TASK_MANAGER_NAME());
ActorRef taskManagerRef = null;
Throwable lastError = null;
for (int i = 0; i < 40; i++) {
try {
taskManagerRef = TaskManager.getTaskManagerRemoteReference(taskManagerActorName, jmActorSystem, new FiniteDuration(25, TimeUnit.SECONDS));
break;
} catch (Throwable t) {
// TaskManager probably not ready yet
lastError = t;
}
Thread.sleep(500);
}
assertTrue("TaskManager process died", isProcessAlive(taskManagerProcess));
if (taskManagerRef == null) {
if (lastError != null) {
lastError.printStackTrace();
}
fail("TaskManager process did not launch the TaskManager properly. Failed to look up " + taskManagerActorName);
}
// kill the TaskManager actor
onTaskManagerProcessRunning(taskManagerRef);
// wait for max 5 seconds for the process to terminate
{
long now = System.currentTimeMillis();
long deadline = now + 10000;
while (now < deadline && isProcessAlive(taskManagerProcess)) {
Thread.sleep(100);
now = System.currentTimeMillis();
}
}
assertFalse("TaskManager process did not terminate upon actor death", isProcessAlive(taskManagerProcess));
int returnCode = taskManagerProcess.exitValue();
assertEquals("TaskManager died, but not because of the process reaper", TaskManager.RUNTIME_FAILURE_RETURN_CODE(), returnCode);
onTaskManagerProcessTerminated(processOutput.toString());
} catch (Exception e) {
e.printStackTrace();
printProcessLog(processOutput.toString());
fail(e.getMessage());
} catch (Error e) {
e.printStackTrace();
printProcessLog(processOutput.toString());
throw e;
} finally {
if (taskManagerProcess != null) {
taskManagerProcess.destroy();
}
if (jmActorSystem != null) {
jmActorSystem.shutdown();
}
}
}
use of org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService in project flink by apache.
the class TaskManagerComponentsStartupShutdownTest method testComponentsStartupShutdown.
/**
* Makes sure that all components are shut down when the TaskManager
* actor is shut down.
*/
@Test
public void testComponentsStartupShutdown() {
final String[] TMP_DIR = new String[] { ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH };
final Time timeout = Time.seconds(100);
final int BUFFER_SIZE = 32 * 1024;
Configuration config = new Configuration();
config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "200 ms");
config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "1 s");
config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 1);
ActorSystem actorSystem = null;
try {
actorSystem = AkkaUtils.createLocalActorSystem(config);
final ActorRef jobManager = JobManager.startJobManagerActors(config, actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
FlinkResourceManager.startResourceManagerActors(config, actorSystem, LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager), StandaloneResourceManager.class);
final int numberOfSlots = 1;
// create the components for the TaskManager manually
final TaskManagerConfiguration tmConfig = new TaskManagerConfiguration(numberOfSlots, TMP_DIR, timeout, null, Time.milliseconds(500), Time.seconds(30), Time.seconds(10), // cleanup interval
1000000, config, // exit-jvm-on-fatal-error
false);
final NetworkEnvironmentConfiguration netConf = new NetworkEnvironmentConfiguration(32, BUFFER_SIZE, MemoryType.HEAP, IOManager.IOMode.SYNC, 0, 0, 2, 8, null);
ResourceID taskManagerId = ResourceID.generate();
final TaskManagerLocation connectionInfo = new TaskManagerLocation(taskManagerId, InetAddress.getLocalHost(), 10000);
final MemoryManager memManager = new MemoryManager(32 * BUFFER_SIZE, 1, BUFFER_SIZE, MemoryType.HEAP, false);
final IOManager ioManager = new IOManagerAsync(TMP_DIR);
final NetworkEnvironment network = new NetworkEnvironment(new NetworkBufferPool(netConf.numNetworkBuffers(), netConf.networkBufferSize(), netConf.memoryType()), new LocalConnectionManager(), new ResultPartitionManager(), new TaskEventDispatcher(), new KvStateRegistry(), null, netConf.ioMode(), netConf.partitionRequestInitialBackoff(), netConf.partitionRequestMaxBackoff(), netConf.networkBuffersPerChannel(), netConf.extraNetworkBuffersPerGate());
network.start();
LeaderRetrievalService leaderRetrievalService = new StandaloneLeaderRetrievalService(jobManager.path().toString());
MetricRegistryConfiguration metricRegistryConfiguration = MetricRegistryConfiguration.fromConfiguration(config);
// create the task manager
final Props tmProps = Props.create(TaskManager.class, tmConfig, taskManagerId, connectionInfo, memManager, ioManager, network, numberOfSlots, leaderRetrievalService, new MetricRegistry(metricRegistryConfiguration));
final ActorRef taskManager = actorSystem.actorOf(tmProps);
new JavaTestKit(actorSystem) {
{
// wait for the TaskManager to be registered
new Within(new FiniteDuration(5000, TimeUnit.SECONDS)) {
@Override
protected void run() {
taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
}
};
}
};
// shut down all actors and the actor system
// Kill the Task down the JobManager
taskManager.tell(Kill.getInstance(), ActorRef.noSender());
jobManager.tell(Kill.getInstance(), ActorRef.noSender());
// shut down the actors and the actor system
actorSystem.shutdown();
actorSystem.awaitTermination();
actorSystem = null;
// now that the TaskManager is shut down, the components should be shut down as well
assertTrue(network.isShutdown());
assertTrue(ioManager.isProperlyShutDown());
assertTrue(memManager.isShutdown());
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
} finally {
if (actorSystem != null) {
actorSystem.shutdown();
}
}
}
use of org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService in project flink by apache.
the class StandaloneUtils method createLeaderRetrievalService.
/**
* Creates a {@link StandaloneLeaderRetrievalService} form the given configuration and the
* JobManager name. The host and port for the remote Akka URL are retrieved from the provided
* configuration. Instead of using the standard JobManager Akka name, the provided one is used
* for the remote Akka URL.
*
* @param configuration Configuration instance containing hte host and port information
* @param resolveInitialHostName If true, resolves the hostname of the StandaloneLeaderRetrievalService
* @param jobManagerName Name of the JobManager actor
* @return StandaloneLeaderRetrievalService
* @throws UnknownHostException if the host name cannot be resolved into an {@link InetAddress}
*/
public static StandaloneLeaderRetrievalService createLeaderRetrievalService(Configuration configuration, boolean resolveInitialHostName, String jobManagerName) throws UnknownHostException {
Tuple3<String, String, Object> stringIntPair = TaskManager.getAndCheckJobManagerAddress(configuration);
String protocol = stringIntPair._1();
String jobManagerHostname = stringIntPair._2();
int jobManagerPort = (Integer) stringIntPair._3();
// Do not try to resolve a hostname to prevent resolving to the wrong IP address
String hostPort = NetUtils.unresolvedHostAndPortToNormalizedString(jobManagerHostname, jobManagerPort);
if (resolveInitialHostName) {
try {
//noinspection ResultOfMethodCallIgnored
InetAddress.getByName(jobManagerHostname);
} catch (UnknownHostException e) {
throw new UnknownHostException("Cannot resolve the JobManager hostname '" + jobManagerHostname + "' specified in the configuration");
}
}
String jobManagerAkkaUrl = JobManager.getRemoteJobManagerAkkaURL(protocol, hostPort, Option.apply(jobManagerName));
return new StandaloneLeaderRetrievalService(jobManagerAkkaUrl);
}
Aggregations