Usage of org.apache.flink.runtime.jobmanager.JobManager in the Apache Flink project.
Class: AbstractTaskManagerProcessFailureRecoveryTest; method: testTaskManagerProcessFailure.
/**
 * Checks that a running program recovers when one TaskManager process is killed.
 *
 * <p>A JobManager is started inside this JVM and two TaskManagers are launched as separate
 * java processes. The test program is submitted from a background thread; once its tasks have
 * written their "ready" marker files, a third TaskManager is started, the first one is
 * destroyed, and the "proceed" marker file is written so that the program can complete. The
 * test fails if the program does not finish within five minutes or reports an error. When no
 * java executable can be found, the test prints a notice and returns without failing.
 */
@Test
public void testTaskManagerProcessFailure() {
    // CI environments can be very slow, so every "wait until ready" step gets two minutes.
    // Kept as an int so it fits both int and long timeout parameters of the wait helpers.
    final int startupTimeoutMillis = 120000;

    final StringWriter processOutput1 = new StringWriter();
    final StringWriter processOutput2 = new StringWriter();
    final StringWriter processOutput3 = new StringWriter();
    ActorSystem jmActorSystem = null;
    Process taskManagerProcess1 = null;
    Process taskManagerProcess2 = null;
    Process taskManagerProcess3 = null;
    File coordinateTempDir = null;
    try {
        // the test spawns child JVMs, so it only runs if the java command
        // is available on this machine
        String javaCommand = getJavaCommandPath();
        if (javaCommand == null) {
            System.out.println("---- Skipping Process Failure test : Could not find java executable ----");
            return;
        }

        // create a logging file for the process
        File tempLogFile = File.createTempFile(getClass().getSimpleName() + "-", "-log4j.properties");
        tempLogFile.deleteOnExit();
        CommonTestUtils.printLog4jDebugConfig(tempLogFile);

        // coordination between the processes goes through a directory
        coordinateTempDir = CommonTestUtils.createTempDirectory();

        // find a free port to start the JobManager
        final int jobManagerPort = NetUtils.getAvailablePort();

        // start a JobManager
        Tuple2<String, Object> localAddress = new Tuple2<String, Object>("localhost", jobManagerPort);
        Configuration jmConfig = new Configuration();
        jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "1000 ms");
        jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "6 s");
        jmConfig.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 9);
        jmConfig.setString(ConfigConstants.RESTART_STRATEGY_FIXED_DELAY_DELAY, "10 s");
        jmConfig.setString(ConfigConstants.AKKA_ASK_TIMEOUT, "100 s");
        jmActorSystem = AkkaUtils.createActorSystem(jmConfig, new Some<>(localAddress));
        ActorRef jmActor = JobManager.startJobManagerActors(
                jmConfig,
                jmActorSystem,
                TestingUtils.defaultExecutor(),
                TestingUtils.defaultExecutor(),
                JobManager.class,
                MemoryArchivist.class)._1();

        // the TaskManager java command
        String[] command = new String[] {
                javaCommand,
                "-Dlog.level=DEBUG",
                "-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath(),
                "-Xms80m",
                "-Xmx80m",
                "-classpath",
                getCurrentClasspath(),
                TaskManagerProcessEntryPoint.class.getName(),
                String.valueOf(jobManagerPort) };

        // start the first two TaskManager processes; their stderr is collected so that it
        // can be printed if the test fails
        taskManagerProcess1 = new ProcessBuilder(command).start();
        new CommonTestUtils.PipeForwarder(taskManagerProcess1.getErrorStream(), processOutput1);
        taskManagerProcess2 = new ProcessBuilder(command).start();
        new CommonTestUtils.PipeForwarder(taskManagerProcess2.getErrorStream(), processOutput2);

        // we wait for the JobManager to have the two TaskManagers available
        waitUntilNumTaskManagersAreRegistered(jmActor, 2, startupTimeoutMillis);

        // the program will set a marker file in each of its parallel tasks once they are ready, so that
        // this coordinating code is aware of this.
        // the program will very slowly consume elements until the marker file (later created by the
        // test driver code) is present
        final File coordinateDirClosure = coordinateTempDir;
        final AtomicReference<Throwable> errorRef = new AtomicReference<>();

        // we trigger program execution in a separate thread
        Thread programTrigger = new Thread("Program Trigger") {
            @Override
            public void run() {
                try {
                    testTaskManagerFailure(jobManagerPort, coordinateDirClosure);
                } catch (Throwable t) {
                    t.printStackTrace();
                    errorRef.set(t);
                }
            }
        };

        // start the test program
        programTrigger.start();

        // wait (up to the startup timeout) for all parallel tasks to report that they are ready
        if (!waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, startupTimeoutMillis)) {
            // check if the program failed for some reason
            if (errorRef.get() != null) {
                Throwable error = errorRef.get();
                error.printStackTrace();
                fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
            } else {
                // no error occurred, simply a timeout
                fail("The tasks were not started within time (" + startupTimeoutMillis + "msecs)");
            }
        }

        // start the third TaskManager
        taskManagerProcess3 = new ProcessBuilder(command).start();
        new CommonTestUtils.PipeForwarder(taskManagerProcess3.getErrorStream(), processOutput3);

        // we wait for the third TaskManager to register
        waitUntilNumTaskManagersAreRegistered(jmActor, 3, startupTimeoutMillis);

        // kill one of the previous TaskManagers, triggering a failure and recovery;
        // clearing the reference keeps the finally block from destroying it a second time
        taskManagerProcess1.destroy();
        taskManagerProcess1 = null;

        // we create the marker file which signals the program functions tasks that they can complete
        touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));

        // wait for at most 5 minutes for the program to complete
        programTrigger.join(300000);

        // check that the program really finished
        assertFalse("The program did not finish in time", programTrigger.isAlive());

        // check whether the program encountered an error
        if (errorRef.get() != null) {
            Throwable error = errorRef.get();
            error.printStackTrace();
            fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
        }
        // all seems well :-)
    } catch (Exception e) {
        e.printStackTrace();
        printProcessLog("TaskManager 1", processOutput1.toString());
        printProcessLog("TaskManager 2", processOutput2.toString());
        printProcessLog("TaskManager 3", processOutput3.toString());
        // fail the test but keep the original exception as the cause, so the test report
        // shows where the problem came from even when the exception has no message
        throw new AssertionError(e.getMessage(), e);
    } catch (Error e) {
        // assertion failures end up here; dump the process logs and let the error propagate
        e.printStackTrace();
        printProcessLog("TaskManager 1", processOutput1.toString());
        printProcessLog("TaskManager 2", processOutput2.toString());
        printProcessLog("TaskManager 3", processOutput3.toString());
        throw e;
    } finally {
        // clean up whatever is still running, regardless of the test outcome
        if (taskManagerProcess1 != null) {
            taskManagerProcess1.destroy();
        }
        if (taskManagerProcess2 != null) {
            taskManagerProcess2.destroy();
        }
        if (taskManagerProcess3 != null) {
            taskManagerProcess3.destroy();
        }
        if (jmActorSystem != null) {
            jmActorSystem.shutdown();
        }
        if (coordinateTempDir != null) {
            try {
                FileUtils.deleteDirectory(coordinateTempDir);
            } catch (Throwable t) {
                // best-effort cleanup of a temp directory; a failure here must not fail the test
            }
        }
    }
}
Usage of org.apache.flink.runtime.jobmanager.JobManager in the Apache Flink project.
Class: TaskManagerProcessReapingTestBase; method: testReapProcessOnFailure.
/**
 * Verifies that the TaskManager JVM terminates with the expected exit code after the
 * TaskManager actor has gone down.
 *
 * <p>A JobManager and a ResourceManager are started in this JVM, the TaskManager is launched
 * as a separate java process, and its actor reference is looked up remotely. The reference is
 * handed to {@code onTaskManagerProcessRunning}; afterwards the process must exit within ten
 * seconds with {@code TaskManager.RUNTIME_FAILURE_RETURN_CODE()}. The test is skipped
 * (returns without failing) when no java executable can be found.
 */
@Test
public void testReapProcessOnFailure() {
    final StringWriter processOutput = new StringWriter();
    ActorSystem jmActorSystem = null;
    Process taskManagerProcess = null;
    try {
        // the test spawns a child JVM, so it only runs if the java command
        // is available on this machine
        final String javaCommand = getJavaCommandPath();
        if (javaCommand == null) {
            System.out.println("---- Skipping TaskManagerProcessReapingTest : Could not find java executable ----");
            return;
        }

        // logging configuration handed to the child process
        final File tempLogFile = File.createTempFile("testlogconfig", "properties");
        tempLogFile.deleteOnExit();
        CommonTestUtils.printLog4jDebugConfig(tempLogFile);

        // JobManager on a free local port
        final int jobManagerPort = NetUtils.getAvailablePort();
        final Tuple2<String, Object> jobManagerAddress = new Tuple2<String, Object>("localhost", jobManagerPort);
        jmActorSystem = AkkaUtils.createActorSystem(
                new Configuration(),
                new Some<Tuple2<String, Object>>(jobManagerAddress));
        final ActorRef jmActor = JobManager.startJobManagerActors(
                new Configuration(),
                jmActorSystem,
                TestingUtils.defaultExecutor(),
                TestingUtils.defaultExecutor(),
                JobManager.class,
                MemoryArchivist.class)._1;

        // ResourceManager that follows the JobManager started above
        final StandaloneLeaderRetrievalService leaderRetrieval =
                new StandaloneLeaderRetrievalService(AkkaUtils.getAkkaURL(jmActorSystem, jmActor));
        FlinkResourceManager.startResourceManagerActors(
                new Configuration(),
                jmActorSystem,
                leaderRetrieval,
                StandaloneResourceManager.class);

        // launch the TaskManager in its own JVM and capture its stderr for diagnostics
        final int taskManagerPort = NetUtils.getAvailablePort();
        final String log4jOption = "-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath();
        final String[] command = new String[] {
                javaCommand,
                "-Dlog.level=DEBUG",
                log4jOption,
                "-Xms256m",
                "-Xmx256m",
                "-classpath",
                getCurrentClasspath(),
                TaskManagerTestEntryPoint.class.getName(),
                String.valueOf(jobManagerPort),
                String.valueOf(taskManagerPort) };
        taskManagerProcess = new ProcessBuilder(command).start();
        new PipeForwarder(taskManagerProcess.getErrorStream(), processOutput);

        // look up the remote TaskManager actor; the child process needs time to come up,
        // so retry up to 40 times with a 500 ms pause after each failed attempt
        final String taskManagerActorName = String.format(
                "akka.tcp://flink@%s/user/%s",
                "localhost:" + taskManagerPort,
                TaskManager.TASK_MANAGER_NAME());
        ActorRef taskManagerRef = null;
        Throwable lastError = null;
        for (int attempt = 1; attempt <= 40; attempt++) {
            try {
                taskManagerRef = TaskManager.getTaskManagerRemoteReference(
                        taskManagerActorName,
                        jmActorSystem,
                        new FiniteDuration(25, TimeUnit.SECONDS));
                break;
            } catch (Throwable notReadyYet) {
                // TaskManager probably not ready yet
                lastError = notReadyYet;
            }
            Thread.sleep(500);
        }

        assertTrue("TaskManager process died", isProcessAlive(taskManagerProcess));
        if (taskManagerRef == null) {
            if (lastError != null) {
                lastError.printStackTrace();
            }
            fail("TaskManager process did not launch the TaskManager properly. Failed to look up " + taskManagerActorName);
        }

        // hand the live TaskManager to the hook, which is expected to take the actor down
        onTaskManagerProcessRunning(taskManagerRef);

        // poll for at most 10 seconds for the process to terminate
        final long deadline = System.currentTimeMillis() + 10000;
        while (System.currentTimeMillis() < deadline && isProcessAlive(taskManagerProcess)) {
            Thread.sleep(100);
        }
        assertFalse("TaskManager process did not terminate upon actor death", isProcessAlive(taskManagerProcess));

        // the exit code tells the process reaper apart from any other cause of death
        final int returnCode = taskManagerProcess.exitValue();
        assertEquals("TaskManager died, but not because of the process reaper", TaskManager.RUNTIME_FAILURE_RETURN_CODE(), returnCode);

        onTaskManagerProcessTerminated(processOutput.toString());
    } catch (Exception e) {
        e.printStackTrace();
        printProcessLog(processOutput.toString());
        fail(e.getMessage());
    } catch (Error e) {
        // assertion failures end up here; dump the process log and let the error propagate
        e.printStackTrace();
        printProcessLog(processOutput.toString());
        throw e;
    } finally {
        if (taskManagerProcess != null) {
            taskManagerProcess.destroy();
        }
        if (jmActorSystem != null) {
            jmActorSystem.shutdown();
        }
    }
}
Usage of org.apache.flink.runtime.jobmanager.JobManager in the Apache Flink project.
Class: TaskManagerMetricsTest; method: testMetricRegistryLifeCycle.
/**
 * Tests the metric registry life cycle on JobManager re-connects.
 *
 * <p>A JobManager and a TaskManager are started in one local actor system. After the
 * TaskManager has registered, it is sent a {@code JobManagerLeaderAddress} message pointing at
 * the same JobManager, and the test waits for the second registration. The TaskManager's
 * metric registry must not be shut down afterwards.
 *
 * @throws Exception if setting up the components or terminating the actor system fails
 */
@Test
public void testMetricRegistryLifeCycle() throws Exception {
    ActorSystem system = null;
    try {
        system = AkkaUtils.createLocalActorSystem(new Configuration());

        // ---------------------------- JobManager ----------------------------
        final ActorRef jobManager = JobManager.startJobManagerActors(
                new Configuration(),
                system,
                TestingUtils.defaultExecutor(),
                TestingUtils.defaultExecutor(),
                JobManager.class,
                MemoryArchivist.class)._1();
        final String jobManagerPath = jobManager.path().toString();
        final LeaderRetrievalService leaderRetrieval = new StandaloneLeaderRetrievalService(jobManagerPath);

        // ---------------------------- TaskManager ---------------------------
        final Configuration flinkConfig = new Configuration();
        final ResourceID resourceId = ResourceID.generate();
        final TaskManagerServicesConfiguration servicesConfig =
                TaskManagerServicesConfiguration.fromConfiguration(flinkConfig, InetAddress.getLocalHost(), false);
        final TaskManagerConfiguration tmConfig = TaskManagerConfiguration.fromConfiguration(flinkConfig);
        final TaskManagerServices services = TaskManagerServices.fromConfiguration(servicesConfig, resourceId);

        // the registry under test is the one owned by the TaskManager services
        final MetricRegistry tmRegistry = services.getMetricRegistry();

        final Props tmProps = TaskManager.getTaskManagerProps(
                TaskManager.class,
                tmConfig,
                resourceId,
                services.getTaskManagerLocation(),
                services.getMemoryManager(),
                services.getIOManager(),
                services.getNetworkEnvironment(),
                leaderRetrieval,
                tmRegistry);
        final ActorRef taskManager = system.actorOf(tmProps);

        // NOTE(review): the bound is 5000 seconds, not milliseconds - confirm this is intended
        final FiniteDuration registrationTimeout = new FiniteDuration(5000, TimeUnit.SECONDS);

        new JavaTestKit(system) {
            {
                new Within(registrationTimeout) {
                    @Override
                    protected void run() {
                        // first registration
                        taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());

                        // announce the same JobManager again as leader; this should make the
                        // TaskManager disconnect from the current JobManager and re-register
                        taskManager.tell(new TaskManagerMessages.JobManagerLeaderAddress(jobManagerPath, null), jobManager);

                        // second registration
                        taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
                    }
                };
            }
        };

        // the disconnect must have left the registry running
        Assert.assertFalse(tmRegistry.isShutdown());

        // regular teardown: stop everything and wait until the actor system is gone
        system.shutdown();
        system.awaitTermination();
    } finally {
        // also covers the failure paths, where the regular teardown above was not reached
        if (system != null) {
            system.shutdown();
        }
    }
}
Usage of org.apache.flink.runtime.jobmanager.JobManager in the Apache Flink project.
Class: TaskManagerComponentsStartupShutdownTest; method: testComponentsStartupShutdown.
/**
 * Makes sure that all components are shut down when the TaskManager
 * actor is shut down.
 *
 * <p>The test starts a JobManager and a ResourceManager in a local actor system, builds the
 * TaskManager's memory manager, I/O manager and network environment by hand, and creates the
 * TaskManager actor from them. Once the TaskManager has registered at the JobManager, both
 * actors are killed and the actor system is terminated; afterwards each of the three
 * components must report that it has been shut down.
 */
@Test
public void testComponentsStartupShutdown() {
final String[] TMP_DIR = new String[] { ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH };
final Time timeout = Time.seconds(100);
final int BUFFER_SIZE = 32 * 1024;
Configuration config = new Configuration();
// Akka watch (heartbeat) settings: 200 ms interval, 1 s pause, threshold 1
config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "200 ms");
config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "1 s");
config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 1);
ActorSystem actorSystem = null;
try {
actorSystem = AkkaUtils.createLocalActorSystem(config);
// start the JobManager and a standalone ResourceManager in the same actor system
final ActorRef jobManager = JobManager.startJobManagerActors(config, actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
FlinkResourceManager.startResourceManagerActors(config, actorSystem, LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager), StandaloneResourceManager.class);
final int numberOfSlots = 1;
// create the components for the TaskManager manually
final TaskManagerConfiguration tmConfig = new TaskManagerConfiguration(numberOfSlots, TMP_DIR, timeout, null, Time.milliseconds(500), Time.seconds(30), Time.seconds(10), // NOTE(review): the original "cleanup interval" label presumably belongs to the next argument (1000000); it looks shifted by a formatter - confirm
1000000, config, // NOTE(review): the original "exit-jvm-on-fatal-error" label presumably belongs to the final argument (false) - confirm
false);
final NetworkEnvironmentConfiguration netConf = new NetworkEnvironmentConfiguration(32, BUFFER_SIZE, MemoryType.HEAP, IOManager.IOMode.SYNC, 0, 0, 2, 8, null);
ResourceID taskManagerId = ResourceID.generate();
final TaskManagerLocation connectionInfo = new TaskManagerLocation(taskManagerId, InetAddress.getLocalHost(), 10000);
final MemoryManager memManager = new MemoryManager(32 * BUFFER_SIZE, 1, BUFFER_SIZE, MemoryType.HEAP, false);
final IOManager ioManager = new IOManagerAsync(TMP_DIR);
// the network environment is assembled from the values held in netConf
final NetworkEnvironment network = new NetworkEnvironment(new NetworkBufferPool(netConf.numNetworkBuffers(), netConf.networkBufferSize(), netConf.memoryType()), new LocalConnectionManager(), new ResultPartitionManager(), new TaskEventDispatcher(), new KvStateRegistry(), null, netConf.ioMode(), netConf.partitionRequestInitialBackoff(), netConf.partitionRequestMaxBackoff(), netConf.networkBuffersPerChannel(), netConf.extraNetworkBuffersPerGate());
network.start();
LeaderRetrievalService leaderRetrievalService = new StandaloneLeaderRetrievalService(jobManager.path().toString());
MetricRegistryConfiguration metricRegistryConfiguration = MetricRegistryConfiguration.fromConfiguration(config);
// create the task manager from the manually built components
final Props tmProps = Props.create(TaskManager.class, tmConfig, taskManagerId, connectionInfo, memManager, ioManager, network, numberOfSlots, leaderRetrievalService, new MetricRegistry(metricRegistryConfiguration));
final ActorRef taskManager = actorSystem.actorOf(tmProps);
new JavaTestKit(actorSystem) {
{
// wait for the TaskManager to be registered
// NOTE(review): the bound is 5000 seconds, not milliseconds - confirm this is intended
new Within(new FiniteDuration(5000, TimeUnit.SECONDS)) {
@Override
protected void run() {
taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
}
};
}
};
// shut down all actors and the actor system:
// first kill the TaskManager and the JobManager actors
taskManager.tell(Kill.getInstance(), ActorRef.noSender());
jobManager.tell(Kill.getInstance(), ActorRef.noSender());
// then shut down the actor system and wait until it has terminated
actorSystem.shutdown();
actorSystem.awaitTermination();
// the system is already terminated; clearing the reference keeps the finally block from shutting it down again
actorSystem = null;
// now that the TaskManager is shut down, the components should be shut down as well
assertTrue(network.isShutdown());
assertTrue(ioManager.isProperlyShutDown());
assertTrue(memManager.isShutdown());
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
} finally {
// only reached with a live actor system when the test failed before the regular shutdown
if (actorSystem != null) {
actorSystem.shutdown();
}
}
}
Usage of org.apache.flink.runtime.jobmanager.JobManager in the Apache Flink project.
Class: ProcessFailureCancelingITCase; method: testCancelingOnProcessFailure.
/**
 * Checks that a job can be cancelled when its only TaskManager process has been killed but
 * the JobManager has not yet noticed the loss.
 *
 * <p>A JobManager is started in this JVM with a very long heartbeat pause (2000 s), and one
 * TaskManager is launched as a separate java process. After the TaskManager has registered,
 * its process is destroyed, a job whose map function blocks forever is submitted from a
 * background thread, and the job is cancelled. The submitting thread must end within two
 * minutes with a {@code ProgramInvocationException}. The test is skipped (returns without
 * failing) when no java executable can be found.
 */
@Test
public void testCancelingOnProcessFailure() {
    final StringWriter processOutput = new StringWriter();
    ActorSystem jmActorSystem = null;
    Process taskManagerProcess = null;
    try {
        // the test spawns a child JVM, so it only runs if the java command
        // is available on this machine
        String javaCommand = getJavaCommandPath();
        if (javaCommand == null) {
            System.out.println("---- Skipping Process Failure test : Could not find java executable ----");
            return;
        }

        // create a logging file for the process
        File tempLogFile = File.createTempFile(getClass().getSimpleName() + "-", "-log4j.properties");
        tempLogFile.deleteOnExit();
        CommonTestUtils.printLog4jDebugConfig(tempLogFile);

        // find a free port to start the JobManager
        final int jobManagerPort = NetUtils.getAvailablePort();

        // start a JobManager; the heartbeat pause is huge so that the heartbeats do not
        // report the TaskManager loss during the test
        Tuple2<String, Object> localAddress = new Tuple2<String, Object>("localhost", jobManagerPort);
        Configuration jmConfig = new Configuration();
        jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "5 s");
        jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "2000 s");
        jmConfig.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 10);
        jmConfig.setString(ConfigConstants.AKKA_ASK_TIMEOUT, "100 s");
        jmActorSystem = AkkaUtils.createActorSystem(jmConfig, new Some<>(localAddress));
        ActorRef jmActor = JobManager.startJobManagerActors(
                jmConfig,
                jmActorSystem,
                TestingUtils.defaultExecutor(),
                TestingUtils.defaultExecutor(),
                JobManager.class,
                MemoryArchivist.class)._1();

        // the TaskManager java command
        String[] command = new String[] {
                javaCommand,
                "-Dlog.level=DEBUG",
                "-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath(),
                "-Xms80m",
                "-Xmx80m",
                "-classpath",
                getCurrentClasspath(),
                AbstractTaskManagerProcessFailureRecoveryTest.TaskManagerProcessEntryPoint.class.getName(),
                String.valueOf(jobManagerPort) };

        // start the single TaskManager process; its stderr is collected so that it can be
        // printed if the test fails
        taskManagerProcess = new ProcessBuilder(command).start();
        new CommonTestUtils.PipeForwarder(taskManagerProcess.getErrorStream(), processOutput);

        // wait until the TaskManager has registered at the JobManager
        // since some of the CI environments are very hostile, we need to give this a lot of time (2 minutes)
        waitUntilNumTaskManagersAreRegistered(jmActor, 1, 120000);

        // one-element holder through which the program thread reports its failure; it is only
        // read after the thread has been joined and found dead
        final Throwable[] errorRef = new Throwable[1];

        // the test program, which infinitely blocks in its map function
        Runnable programRunner = new Runnable() {
            @Override
            public void run() {
                try {
                    ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", jobManagerPort);
                    env.setParallelism(2);
                    env.setRestartStrategy(RestartStrategies.noRestart());
                    env.getConfig().disableSysoutLogging();
                    env.generateSequence(0, Long.MAX_VALUE).map(new MapFunction<Long, Long>() {
                        @Override
                        public Long map(Long value) throws Exception {
                            synchronized (this) {
                                wait();
                            }
                            return 0L;
                        }
                    }).output(new DiscardingOutputFormat<Long>());
                    env.execute();
                } catch (Throwable t) {
                    errorRef[0] = t;
                }
            }
        };
        Thread programThread = new Thread(programRunner);

        // kill the TaskManager; clearing the reference keeps the finally block from
        // destroying it a second time
        taskManagerProcess.destroy();
        taskManagerProcess = null;

        // immediately submit the job. this should hit the case
        // where the JobManager still thinks it has the TaskManager and tries to send it tasks
        programThread.start();

        // try to cancel the job
        cancelRunningJob(jmActor);

        // we should see a failure within reasonable time (the ask timeout is configured to 100 s above).
        // since the CI environment is often slow, we conservatively give it up to 2 minutes
        // to fail, which is much lower than the failure time given by the heartbeats ( > 2000s)
        programThread.join(120000);
        assertFalse("The program did not cancel in time (2 minutes)", programThread.isAlive());

        Throwable error = errorRef[0];
        assertNotNull("The program did not fail properly", error);
        assertTrue("The program failed with an unexpected error: " + error, error instanceof ProgramInvocationException);
        // all seems well :-)
    } catch (Exception e) {
        e.printStackTrace();
        printProcessLog("TaskManager", processOutput.toString());
        // fail the test but keep the original exception as the cause, so the test report
        // shows where the problem came from even when the exception has no message
        throw new AssertionError(e.getMessage(), e);
    } catch (Error e) {
        // assertion failures end up here; dump the process log and let the error propagate
        e.printStackTrace();
        printProcessLog("TaskManager", processOutput.toString());
        throw e;
    } finally {
        if (taskManagerProcess != null) {
            taskManagerProcess.destroy();
        }
        if (jmActorSystem != null) {
            jmActorSystem.shutdown();
        }
    }
}
Aggregations