Use of org.apache.flink.test.util.TestProcessBuilder.TestProcess in project flink by apache.
The class AbstractTaskManagerProcessFailureRecoveryTest, method testTaskManagerProcessFailure.
/**
 * Checks that a running program survives the loss of one TaskManager process.
 *
 * <p>The test starts a standalone session cluster and two external TaskManager processes,
 * launches the test program in a background thread, waits until all parallel tasks have
 * written their "ready" marker files, starts a third TaskManager, kills the first one, and
 * finally writes the "proceed" marker file so that the program can complete.
 *
 * <p>If no java executable can be found, the test prints a notice and returns without
 * failing.
 *
 * @throws Exception if setting up or shutting down the cluster or the processes fails
 */
@Test
public void testTaskManagerProcessFailure() throws Exception {
    // upper bound, in milliseconds, for all parallel tasks to signal that they are ready
    final int readyTimeoutMillis = 120000;

    TestProcess taskManagerProcess1 = null;
    TestProcess taskManagerProcess2 = null;
    TestProcess taskManagerProcess3 = null;

    Configuration config = new Configuration();
    config.setString(JobManagerOptions.ADDRESS, "localhost");
    config.setString(RestOptions.BIND_PORT, "0");
    config.setLong(HeartbeatManagerOptions.HEARTBEAT_INTERVAL, 200L);
    config.setLong(HeartbeatManagerOptions.HEARTBEAT_TIMEOUT, 10000L);
    config.set(HeartbeatManagerOptions.HEARTBEAT_RPC_FAILURE_THRESHOLD, 1);
    config.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
    config.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, zooKeeperResource.getConnectString());
    config.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().getAbsolutePath());
    config.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, 2);
    config.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("4m"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MIN, MemorySize.parse("3200k"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MAX, MemorySize.parse("3200k"));
    config.set(NettyShuffleEnvironmentOptions.NETWORK_SORT_SHUFFLE_MIN_BUFFERS, 16);
    config.set(TaskManagerOptions.TASK_HEAP_MEMORY, MemorySize.parse("128m"));
    config.set(TaskManagerOptions.CPU_CORES, 1.0);
    config.setString(JobManagerOptions.EXECUTION_FAILOVER_STRATEGY, "full");
    config.set(JobManagerOptions.RESOURCE_WAIT_TIMEOUT, Duration.ofSeconds(30L));

    try (final StandaloneSessionClusterEntrypoint clusterEntrypoint = new StandaloneSessionClusterEntrypoint(config)) {
        // run this test only if the java command is available on this machine
        String javaCommand = getJavaCommandPath();
        if (javaCommand == null) {
            System.out.println("---- Skipping Process Failure test : Could not find java executable ----");
            return;
        }

        clusterEntrypoint.startCluster();

        // coordination between the processes goes through a directory
        final File coordinateTempDir = temporaryFolder.newFolder();

        TestProcessBuilder taskManagerProcessBuilder = new TestProcessBuilder(TaskExecutorProcessEntryPoint.class.getName());
        taskManagerProcessBuilder.addConfigAsMainClassArgs(config);

        // start the first two TaskManager processes
        taskManagerProcess1 = taskManagerProcessBuilder.start();
        taskManagerProcess2 = taskManagerProcessBuilder.start();

        // The program sets a marker file in each of its parallel tasks once they are ready,
        // so that this coordinating code is aware of it. The program then consumes elements
        // very slowly until the "proceed" marker file (created further below by this test
        // driver) is present.
        final AtomicReference<Throwable> errorRef = new AtomicReference<>();

        // the program execution is triggered in a separate thread
        Thread programTrigger = new Thread(() -> {
            try {
                testTaskManagerFailure(config, coordinateTempDir);
            } catch (Throwable t) {
                t.printStackTrace();
                errorRef.set(t);
            }
        }, "Program Trigger");

        // start the test program
        programTrigger.start();

        // wait (bounded by readyTimeoutMillis) until every parallel task has reported readiness
        if (!waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, readyTimeoutMillis)) {
            // check if the program failed for some reason
            Throwable error = errorRef.get();
            if (error != null) {
                error.printStackTrace();
                fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
            } else {
                // no error occurred, simply a timeout
                fail("The tasks were not started within time (" + readyTimeoutMillis + "msecs)");
            }
        }

        // start the third TaskManager
        taskManagerProcess3 = taskManagerProcessBuilder.start();

        // kill one of the previous TaskManagers, triggering a failure and recovery
        taskManagerProcess1.destroy();
        waitForShutdown("TaskManager 1", taskManagerProcess1);

        // create the marker file which signals the program's tasks that they can complete
        touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));

        // wait for at most 5 minutes for the program to complete
        programTrigger.join(300000);

        // check that the program really finished
        assertFalse("The program did not finish in time", programTrigger.isAlive());

        // check whether the program encountered an error
        Throwable error = errorRef.get();
        if (error != null) {
            error.printStackTrace();
            fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
        }
        // all seems well :-)
    } catch (Exception | Error e) {
        e.printStackTrace();
        printProcessLog("TaskManager 1", taskManagerProcess1);
        printProcessLog("TaskManager 2", taskManagerProcess2);
        printProcessLog("TaskManager 3", taskManagerProcess3);
        if (e instanceof Error) {
            // includes the AssertionErrors raised by fail(...) and assertFalse(...) above
            throw (Error) e;
        }
        // fail the test, but keep the original exception as the cause so its stack trace
        // is not lost in the test report
        throw new AssertionError(e.getMessage(), e);
    } finally {
        if (taskManagerProcess1 != null) {
            taskManagerProcess1.destroy();
        }
        if (taskManagerProcess2 != null) {
            taskManagerProcess2.destroy();
        }
        if (taskManagerProcess3 != null) {
            taskManagerProcess3.destroy();
        }
        waitForShutdown("TaskManager 1", taskManagerProcess1);
        waitForShutdown("TaskManager 2", taskManagerProcess2);
        waitForShutdown("TaskManager 3", taskManagerProcess3);
    }
}
Use of org.apache.flink.test.util.TestProcessBuilder.TestProcess in project flink by apache.
The class ExceptionUtilsITCase, method run.
/**
 * Runs the given main class in a separate process, waits for the process to exit, and
 * returns what it wrote to its outputs.
 *
 * @param className name of the main class to launch
 * @param args arguments handed to the main class, in iteration order
 * @param directMemorySize value for {@code -XX:MaxDirectMemorySize}; the flag is only added
 *     when this is greater than zero
 * @param metaspaceSize value for {@code -XX:MaxMetaspaceSize}; when greater than zero, this
 *     flag is added together with {@code -XX:-UseCompressedOops}
 * @return a {@code RunResult} built from the trimmed error output and the trimmed process
 *     output, in that order
 * @throws InterruptedException if the calling thread is interrupted while waiting for the
 *     process to exit; the process is destroyed before the exception propagates
 * @throws IOException declared by the signature; presumably raised when the process cannot
 *     be started
 */
private static RunResult run(String className, Iterable<String> args, long directMemorySize, long metaspaceSize) throws InterruptedException, IOException {
    final TestProcessBuilder processBuilder = new TestProcessBuilder(className);

    if (directMemorySize > 0) {
        processBuilder.addJvmArg(String.format("-XX:MaxDirectMemorySize=%d", directMemorySize));
    }
    if (metaspaceSize > 0) {
        processBuilder.addJvmArg("-XX:-UseCompressedOops");
        processBuilder.addJvmArg(String.format("-XX:MaxMetaspaceSize=%d", metaspaceSize));
    }
    for (String arg : args) {
        processBuilder.addMainClassArg(arg);
    }

    // JAVA_TOOL_OPTIONS is configured on CI which would affect the process output
    processBuilder.withCleanEnvironment();

    final TestProcess testProcess = processBuilder.start();
    try {
        testProcess.getProcess().waitFor();
    } catch (InterruptedException e) {
        // do not leave the child JVM running when the wait is abandoned
        testProcess.destroy();
        throw e;
    }

    final String errorOutput = testProcess.getErrorOutput().toString().trim();
    final String standardOutput = testProcess.getProcessOutput().toString().trim();
    return new RunResult(errorOutput, standardOutput);
}
Use of org.apache.flink.test.util.TestProcessBuilder.TestProcess in project flink by apache.
The class ProcessFailureCancelingITCase, method testCancelingOnProcessFailure.
/**
 * Checks that a blocking program fails once its only TaskManager process is killed.
 *
 * <p>The test creates a dispatcher/resource-manager component in this JVM, starts one external
 * TaskManager process, submits a program whose map function blocks forever, waits until at
 * least one task has been deployed, and then destroys the TaskManager process. The program
 * thread is expected to terminate within {@code TIMEOUT} with a
 * {@code ProgramInvocationException}.
 *
 * <p>The test is skipped (via a JUnit assumption) when no java executable can be found.
 *
 * @throws Throwable any failure of the test itself, or an error reported to the fatal error
 *     handler
 */
@Test
public void testCancelingOnProcessFailure() throws Throwable {
    Assume.assumeTrue("---- Skipping Process Failure test : Could not find java executable ----", getJavaCommandPath() != null);

    TestProcess taskManagerProcess = null;
    final TestingFatalErrorHandler fatalErrorHandler = new TestingFatalErrorHandler();

    Configuration config = new Configuration();
    config.setString(JobManagerOptions.ADDRESS, "localhost");
    config.set(AkkaOptions.ASK_TIMEOUT_DURATION, Duration.ofSeconds(100));
    config.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
    config.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, zooKeeperResource.getConnectString());
    config.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().getAbsolutePath());
    config.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, 2);
    config.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("4m"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MIN, MemorySize.parse("3200k"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MAX, MemorySize.parse("3200k"));
    config.set(TaskManagerOptions.TASK_HEAP_MEMORY, MemorySize.parse("128m"));
    config.set(TaskManagerOptions.CPU_CORES, 1.0);
    config.setInteger(RestOptions.PORT, 0);

    // the RPC service is started on port "0"; the port it actually got is then published
    // as the JobManager port in the configuration
    final RpcService rpcService = RpcSystem.load().remoteServiceBuilder(config, "localhost", "0").createAndStart();
    final int jobManagerPort = rpcService.getPort();
    config.setInteger(JobManagerOptions.PORT, jobManagerPort);

    final DispatcherResourceManagerComponentFactory resourceManagerComponentFactory = DefaultDispatcherResourceManagerComponentFactory.createSessionComponentFactory(StandaloneResourceManagerFactory.getInstance());
    DispatcherResourceManagerComponent dispatcherResourceManagerComponent = null;
    final ScheduledExecutorService ioExecutor = TestingUtils.defaultExecutor();
    final HighAvailabilityServices haServices = HighAvailabilityServicesUtils.createHighAvailabilityServices(config, ioExecutor, AddressResolution.NO_ADDRESS_RESOLUTION, RpcSystem.load(), NoOpFatalErrorHandler.INSTANCE);

    // holds whatever the program thread threw, so the main thread can inspect it
    final AtomicReference<Throwable> programException = new AtomicReference<>();

    try {
        dispatcherResourceManagerComponent = resourceManagerComponentFactory.create(config, ResourceID.generate(), ioExecutor, rpcService, haServices, blobServerResource.getBlobServer(), new HeartbeatServices(100L, 10000L, 2), NoOpMetricRegistry.INSTANCE, new MemoryExecutionGraphInfoStore(), VoidMetricQueryServiceRetriever.INSTANCE, fatalErrorHandler);

        TestProcessBuilder taskManagerProcessBuilder = new TestProcessBuilder(TaskExecutorProcessEntryPoint.class.getName());
        taskManagerProcessBuilder.addConfigAsMainClassArgs(config);
        taskManagerProcess = taskManagerProcessBuilder.start();

        // start the test program, which infinitely blocks
        Runnable programRunner = new Runnable() {
            @Override
            public void run() {
                try {
                    ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", 1337, config);
                    env.setParallelism(2);
                    env.setRestartStrategy(RestartStrategies.noRestart());
                    env.generateSequence(0, Long.MAX_VALUE).map(new MapFunction<Long, Long>() {
                        @Override
                        public Long map(Long value) throws Exception {
                            synchronized (this) {
                                // announce the deployment on stdout, then block forever
                                System.out.println(TASK_DEPLOYED_MARKER);
                                wait();
                            }
                            return 0L;
                        }
                    }).output(new DiscardingOutputFormat<>());
                    env.execute();
                } catch (Throwable t) {
                    programException.set(t);
                }
            }
        };

        Thread programThread = new Thread(programRunner);
        programThread.start();

        waitUntilAtLeastOneTaskHasBeenDeployed(taskManagerProcess);

        // kill the TaskManager after the job started to run
        taskManagerProcess.destroy();
        // cleared so that the handlers below neither print its output nor destroy it again
        taskManagerProcess = null;

        // the job should fail due to heartbeat timeouts; since the CI environment is often
        // slow, we conservatively wait for up to TIMEOUT
        programThread.join(TIMEOUT.toMillis());

        assertFalse("The program did not cancel in time", programThread.isAlive());

        Throwable error = programException.get();
        assertNotNull("The program did not fail properly", error);
        assertTrue(error instanceof ProgramInvocationException);
        // all seems well :-)
    } catch (Exception | Error e) {
        if (taskManagerProcess != null) {
            printOutput("TaskManager OUT", taskManagerProcess.getProcessOutput().toString());
            printOutput("TaskManager ERR", taskManagerProcess.getErrorOutput().toString());
        }
        throw ExceptionUtils.firstOrSuppressed(e, programException.get());
    } finally {
        // The cleanup steps are nested so that each one runs even if an earlier one throws;
        // in particular, an error rethrown by the fatal error handler must not prevent the
        // RPC service and the HA services from being shut down.
        try {
            if (taskManagerProcess != null) {
                taskManagerProcess.destroy();
            }
            if (dispatcherResourceManagerComponent != null) {
                dispatcherResourceManagerComponent.stopApplication(ApplicationStatus.SUCCEEDED, null);
            }
            fatalErrorHandler.rethrowError();
        } finally {
            try {
                RpcUtils.terminateRpcService(rpcService, Time.seconds(100L));
            } finally {
                haServices.closeAndCleanupAllData();
            }
        }
    }
}
Aggregations