use of org.apache.hadoop.yarn.api.records.ApplicationReport in project flink by apache.
the class AbstractYarnClusterDescriptor method deployInternal.
/**
 * Validates the requested resources against the YARN cluster and starts the
 * ApplicationMaster/JobManager.
 *
 * <p>This method will block until the ApplicationMaster/JobManager have been
 * deployed on YARN.
 *
 * <p>Most resource checks only log warnings. The deployment is aborted only when the
 * JobManager or TaskManager memory exceeds the maximum resource capability reported
 * by YARN for the new application.
 *
 * @return a client representing the deployed cluster
 * @throws Exception if the deployment fails; a {@code YarnDeploymentException} is thrown
 *         when the requested JobManager/TaskManager memory exceeds the YARN maximum
 */
protected YarnClusterClient deployInternal() throws Exception {
    isReadyForDeployment();
    LOG.info("Using values:");
    LOG.info("\tTaskManager count = {}", taskManagerCount);
    LOG.info("\tJobManager memory = {}", jobManagerMemoryMb);
    LOG.info("\tTaskManager memory = {}", taskManagerMemoryMb);
    final YarnClient yarnClient = getYarnClient();

    // Queue validation is best effort: any failure here is logged and deployment continues.
    try {
        List<QueueInfo> queues = yarnClient.getAllQueues();
        if (queues.size() > 0 && this.yarnQueue != null) {
            // check only if there are queues configured in yarn and for this session.
            boolean queueFound = false;
            for (QueueInfo queue : queues) {
                if (queue.getQueueName().equals(this.yarnQueue)) {
                    queueFound = true;
                    break;
                }
            }
            if (!queueFound) {
                // 'queues' is non-empty in this branch, so the trailing separator can be cut off safely
                StringBuilder queueNames = new StringBuilder();
                for (QueueInfo queue : queues) {
                    queueNames.append(queue.getQueueName()).append(", ");
                }
                queueNames.setLength(queueNames.length() - 2);
                LOG.warn("The specified queue '{}' does not exist. Available queues: {}", this.yarnQueue, queueNames);
            }
        } else {
            // note: this branch is also taken when no queue was requested for this session
            LOG.debug("The YARN cluster does not have any queues configured");
        }
    } catch (Throwable e) {
        LOG.warn("Error while getting queue information from YARN: " + e.getMessage());
        if (LOG.isDebugEnabled()) {
            LOG.debug("Error details", e);
        }
    }

    // ------------------ Add dynamic properties to local flinkConfiguraton ------
    Map<String, String> dynProperties = getDynamicProperties(dynamicPropertiesEncoded);
    for (Map.Entry<String, String> dynProperty : dynProperties.entrySet()) {
        flinkConfiguration.setString(dynProperty.getKey(), dynProperty.getValue());
    }

    // ------------------ Check if the YARN ClusterClient has the requested resources --------------
    // the yarnMinAllocationMB specifies the smallest possible container allocation size.
    // all allocations below this value are automatically set to this value.
    final int yarnMinAllocationMB = conf.getInt("yarn.scheduler.minimum-allocation-mb", 0);
    if (jobManagerMemoryMb < yarnMinAllocationMB || taskManagerMemoryMb < yarnMinAllocationMB) {
        LOG.warn("The JobManager or TaskManager memory is below the smallest possible YARN Container size. "
            + "The value of 'yarn.scheduler.minimum-allocation-mb' is '" + yarnMinAllocationMB + "'. Please increase the memory size. "
            + "YARN will allocate the smaller containers but the scheduler will account for the minimum-allocation-mb, maybe not all instances "
            + "you requested will start.");
    }
    // set the memory to minAllocationMB to do the next checks correctly
    if (jobManagerMemoryMb < yarnMinAllocationMB) {
        jobManagerMemoryMb = yarnMinAllocationMB;
    }
    if (taskManagerMemoryMb < yarnMinAllocationMB) {
        taskManagerMemoryMb = yarnMinAllocationMB;
    }

    // Create application via yarnClient
    final YarnClientApplication yarnApplication = yarnClient.createApplication();
    GetNewApplicationResponse appResponse = yarnApplication.getNewApplicationResponse();
    Resource maxRes = appResponse.getMaximumResourceCapability();
    final String note = "Please check the 'yarn.scheduler.maximum-allocation-mb' and the 'yarn.nodemanager.resource.memory-mb' configuration values\n";

    // Requests above the maximum capability are the only hard failures in this method.
    if (jobManagerMemoryMb > maxRes.getMemory()) {
        failSessionDuringDeployment(yarnClient, yarnApplication);
        throw new YarnDeploymentException("The cluster does not have the requested resources for the JobManager available!\n"
            + "Maximum Memory: " + maxRes.getMemory() + "MB Requested: " + jobManagerMemoryMb + "MB. " + note);
    }
    if (taskManagerMemoryMb > maxRes.getMemory()) {
        failSessionDuringDeployment(yarnClient, yarnApplication);
        throw new YarnDeploymentException("The cluster does not have the requested resources for the TaskManagers available!\n"
            + "Maximum Memory: " + maxRes.getMemory() + "MB Requested: " + taskManagerMemoryMb + "MB. " + note);
    }

    final String noteRsc = "\nThe Flink YARN client will try to allocate the YARN session, but maybe not all TaskManagers are "
        + "connecting from the beginning because the resources are currently not available in the cluster. "
        + "The allocation might take more time than usual because the Flink YARN client needs to wait until "
        + "the resources become available.";
    int totalMemoryRequired = jobManagerMemoryMb + taskManagerMemoryMb * taskManagerCount;
    ClusterResourceDescription freeClusterMem = getCurrentFreeClusterResources(yarnClient);
    if (freeClusterMem.totalFreeMemory < totalMemoryRequired) {
        LOG.warn("This YARN session requires " + totalMemoryRequired + "MB of memory in the cluster. "
            + "There are currently only " + freeClusterMem.totalFreeMemory + "MB available." + noteRsc);
    }
    if (taskManagerMemoryMb > freeClusterMem.containerLimit) {
        LOG.warn("The requested amount of memory for the TaskManagers (" + taskManagerMemoryMb + "MB) is more than "
            + "the largest possible YARN container: " + freeClusterMem.containerLimit + noteRsc);
    }
    if (jobManagerMemoryMb > freeClusterMem.containerLimit) {
        LOG.warn("The requested amount of memory for the JobManager (" + jobManagerMemoryMb + "MB) is more than "
            + "the largest possible YARN container: " + freeClusterMem.containerLimit + noteRsc);
    }

    // ----------------- check if the requested containers fit into the cluster.
    // work on a copy so that the original free-memory snapshot can still be reported in the warnings
    int[] nmFree = Arrays.copyOf(freeClusterMem.nodeManagersFree, freeClusterMem.nodeManagersFree.length);
    // first, allocate the jobManager somewhere.
    if (!allocateResource(nmFree, jobManagerMemoryMb)) {
        LOG.warn("Unable to find a NodeManager that can fit the JobManager/Application master. "
            + "The JobManager requires " + jobManagerMemoryMb + "MB. NodeManagers available: "
            + Arrays.toString(freeClusterMem.nodeManagersFree) + noteRsc);
    }
    // allocate TaskManagers
    for (int i = 0; i < taskManagerCount; i++) {
        if (!allocateResource(nmFree, taskManagerMemoryMb)) {
            LOG.warn("There is not enough memory available in the YARN cluster. "
                + "The TaskManager(s) require " + taskManagerMemoryMb + "MB each. "
                + "NodeManagers available: " + Arrays.toString(freeClusterMem.nodeManagersFree) + "\n"
                + "After allocating the JobManager (" + jobManagerMemoryMb + "MB) and (" + i + "/" + taskManagerCount + ") TaskManagers, "
                + "the following NodeManagers are available: " + Arrays.toString(nmFree) + noteRsc);
        }
    }

    ApplicationReport report = startAppMaster(null, yarnClient, yarnApplication);
    String host = report.getHost();
    int port = report.getRpcPort();
    // Correctly initialize the Flink config
    flinkConfiguration.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, host);
    flinkConfiguration.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, port);
    // the Flink cluster is deployed in YARN. Represent cluster
    return createYarnClusterClient(this, yarnClient, report, flinkConfiguration, sessionFilesDir, true);
}
use of org.apache.hadoop.yarn.api.records.ApplicationReport in project flink by apache.
the class YARNSessionCapacitySchedulerITCase method testDetachedPerJobYarnClusterInternal.
/**
 * Submits the given job as a detached per-job YARN cluster via the CLI frontend, waits for
 * the application to finish, and verifies the job output, the JobManager log and the
 * application tags.
 *
 * <p>The YARN client created for this test is stopped when the method returns, whether
 * the test passed or failed.
 *
 * @param job path of the job jar handed to the CLI frontend
 */
private void testDetachedPerJobYarnClusterInternal(String job) {
    YarnClient yc = YarnClient.createYarnClient();
    yc.init(yarnConfiguration);
    yc.start();
    try {
        runDetachedPerJobAndVerify(yc, job);
    } finally {
        // release the client started above, also when an assertion fails
        yc.stop();
    }
}

/**
 * Runs the detached per-job submission and all verifications using an already started YARN client.
 */
private void runDetachedPerJobAndVerify(YarnClient yc, String job) {
    // get temporary folder for writing output of wordcount example
    File tmpOutFolder = null;
    try {
        tmpOutFolder = tmp.newFolder();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    // get temporary file for reading input data for wordcount example
    File tmpInFile;
    try {
        tmpInFile = tmp.newFile();
        FileUtils.writeStringToFile(tmpInFile, WordCountData.TEXT);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    Runner runner = startWithArgs(new String[] {
        "run", "-m", "yarn-cluster",
        "-yj", flinkUberjar.getAbsolutePath(),
        "-yt", flinkLibFolder.getAbsolutePath(),
        "-yn", "1",
        "-yjm", "768",
        // test if the cutoff is passed correctly
        "-yD", "yarn.heap-cutoff-ratio=0.5",
        "-yD", "yarn.tags=test-tag",
        "-ytm", "1024",
        // test requesting slots from YARN.
        "-ys", "2",
        "--yarndetached", job,
        "--input", tmpInFile.getAbsoluteFile().toString(),
        "--output", tmpOutFolder.getAbsoluteFile().toString() },
        "Job has been submitted with JobID", RunTypes.CLI_FRONTEND);
    // it should usually be 2, but on slow machines, the number varies
    Assert.assertTrue("There should be at most 2 containers running", getRunningContainers() <= 2);
    // give the runner some time to detach
    for (int attempt = 0; runner.isAlive() && attempt < 5; attempt++) {
        try {
            Thread.sleep(500);
        } catch (InterruptedException e) {
            // restore the interrupt flag and stop waiting; further sleeps would fail immediately anyway
            Thread.currentThread().interrupt();
            break;
        }
    }
    Assert.assertFalse("The runner should detach.", runner.isAlive());
    LOG.info("CLI Frontend has returned, so the job is running");
    // find out the application id and wait until it has finished.
    try {
        List<ApplicationReport> apps = yc.getApplications(EnumSet.of(YarnApplicationState.RUNNING));
        ApplicationId tmpAppId;
        if (apps.size() == 1) {
            // Better method to find the right appId. But sometimes the app is shutting down very fast
            // Only one running
            tmpAppId = apps.get(0).getApplicationId();
            LOG.info("waiting for the job with appId {} to finish", tmpAppId);
            // wait until the app has finished
            while (yc.getApplications(EnumSet.of(YarnApplicationState.RUNNING)).size() > 0) {
                sleep(500);
            }
        } else {
            // get appId by finding the latest finished appid
            apps = yc.getApplications();
            // sort descending by application id so that the newest application comes first
            Collections.sort(apps, new Comparator<ApplicationReport>() {
                @Override
                public int compare(ApplicationReport o1, ApplicationReport o2) {
                    return o1.getApplicationId().compareTo(o2.getApplicationId()) * -1;
                }
            });
            tmpAppId = apps.get(0).getApplicationId();
            LOG.info("Selected {} as the last appId from {}", tmpAppId, Arrays.toString(apps.toArray()));
        }
        // final copy, needed for the anonymous FilenameFilter below
        final ApplicationId id = tmpAppId;
        // now it has finished.
        // check the output files.
        File[] listOfOutputFiles = tmpOutFolder.listFiles();
        Assert.assertNotNull("Taskmanager output not found", listOfOutputFiles);
        LOG.info("The job has finished. TaskManager output files found in {}", tmpOutFolder);
        // read all output files in output folder to one output string
        StringBuilder outputBuilder = new StringBuilder();
        for (File f : listOfOutputFiles) {
            if (f.isFile()) {
                outputBuilder.append(FileUtils.readFileToString(f)).append("\n");
            }
        }
        String content = outputBuilder.toString();
        // check for some of the wordcount outputs.
        Assert.assertTrue("Expected string 'da 5' or '(all,2)' not found in string '" + content + "'", content.contains("da 5") || content.contains("(da,5)") || content.contains("(all,2)"));
        Assert.assertTrue("Expected string 'der 29' or '(mind,1)' not found in string'" + content + "'", content.contains("der 29") || content.contains("(der,29)") || content.contains("(mind,1)"));
        // check if the heap size for the TaskManager was set correctly
        File jobmanagerLog = YarnTestBase.findFile("..", new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.contains("jobmanager.log") && dir.getAbsolutePath().contains(id.toString());
            }
        });
        Assert.assertNotNull("Unable to locate JobManager log", jobmanagerLog);
        content = FileUtils.readFileToString(jobmanagerLog);
        // TM was started with 1024 but we cut off 50% (NOT THE DEFAULT VALUE)
        String expected = "Starting TaskManagers with command: $JAVA_HOME/bin/java -Xms424m -Xmx424m";
        Assert.assertTrue("Expected string '" + expected + "' not found in JobManager log: '" + jobmanagerLog + "'", content.contains(expected));
        expected = " (2/2) (attempt #0) to ";
        Assert.assertTrue("Expected string '" + expected + "' not found in JobManager log." + "This string checks that the job has been started with a parallelism of 2. Log contents: '" + jobmanagerLog + "'", content.contains(expected));
        // make sure the detached app is really finished.
        LOG.info("Checking again that app has finished");
        ApplicationReport rep;
        do {
            sleep(500);
            rep = yc.getApplicationReport(id);
            LOG.info("Got report {}", rep);
        } while (rep.getYarnApplicationState() == YarnApplicationState.RUNNING);
        verifyApplicationTags(rep);
    } catch (Throwable t) {
        LOG.warn("Error while detached yarn session was running", t);
        Assert.fail(t.getMessage());
    } finally {
        //cleanup the yarn-properties file
        String confDirPath = System.getenv("FLINK_CONF_DIR");
        File configDirectory = new File(confDirPath);
        LOG.info("testDetachedPerJobYarnClusterInternal: Using configuration directory " + configDirectory.getAbsolutePath());
        // load the configuration
        LOG.info("testDetachedPerJobYarnClusterInternal: Trying to load configuration file");
        GlobalConfiguration.loadConfiguration(configDirectory.getAbsolutePath());
        try {
            File yarnPropertiesFile = FlinkYarnSessionCli.getYarnPropertiesLocation(GlobalConfiguration.loadConfiguration());
            if (yarnPropertiesFile.exists()) {
                LOG.info("testDetachedPerJobYarnClusterInternal: Cleaning up temporary Yarn address reference: {}", yarnPropertiesFile.getAbsolutePath());
                yarnPropertiesFile.delete();
            }
        } catch (Exception e) {
            LOG.warn("testDetachedPerJobYarnClusterInternal: Exception while deleting the JobManager address file", e);
        }
    }
}
use of org.apache.hadoop.yarn.api.records.ApplicationReport in project flink by apache.
the class YARNSessionCapacitySchedulerITCase method verifyApplicationTags.
/**
 * Ensures that the YARN application tags were set properly.
 *
 * <p>Since YARN application tags were only added in Hadoop 2.4, but Flink still supports Hadoop 2.3, reflection is
 * required to invoke the methods. If the method does not exist, this test passes.
 *
 * @param report the application report whose tags are checked
 * @throws InvocationTargetException if the reflectively invoked tag getter throws
 * @throws IllegalAccessException if the tag getter is not accessible
 */
private void verifyApplicationTags(final ApplicationReport report) throws InvocationTargetException, IllegalAccessException {
    final Method applicationTagsMethod;
    Class<ApplicationReport> clazz = ApplicationReport.class;
    try {
        // this method is only supported by Hadoop 2.4.0 onwards
        applicationTagsMethod = clazz.getMethod("getApplicationTags");
    } catch (NoSuchMethodException e) {
        // only verify the tags if the method exists
        return;
    }
    @SuppressWarnings("unchecked")
    Set<String> applicationTags = (Set<String>) applicationTagsMethod.invoke(report);
    // expected value goes first so that a failure message reports expected/actual the right way round
    Assert.assertEquals(Sets.newHashSet("test-tag"), applicationTags);
}
use of org.apache.hadoop.yarn.api.records.ApplicationReport in project flink by apache.
the class YarnClusterClient method getApplicationStatus.
/**
 * Derives the application status from the last YARN application report seen by the polling runner.
 *
 * @return {@code UNKNOWN} if no report is available yet; {@code FAILED} if YARN reported the
 *         state {@code FAILED} or {@code KILLED}; {@code SUCCEEDED} for every other YARN state
 * @throws IllegalStateException if the client is not connected
 */
public ApplicationStatus getApplicationStatus() {
    if (!isConnected) {
        throw new IllegalStateException("The cluster has not been connected to the ApplicationMaster.");
    }
    ApplicationReport lastReport = null;
    if (pollingRunner == null) {
        LOG.warn("YarnClusterClient.getApplicationStatus() has been called on an uninitialized cluster. " + "The system might be in an erroneous state");
    } else {
        lastReport = pollingRunner.getLastReport();
    }
    if (lastReport == null) {
        LOG.warn("YarnClusterClient.getApplicationStatus() has been called on a cluster that didn't receive a status so far. " + "The system might be in an erroneous state");
        return ApplicationStatus.UNKNOWN;
    } else {
        YarnApplicationState appState = lastReport.getYarnApplicationState();
        // only FAILED and KILLED count as failure; all other states are reported as SUCCEEDED
        ApplicationStatus status = (appState == YarnApplicationState.FAILED || appState == YarnApplicationState.KILLED) ? ApplicationStatus.FAILED : ApplicationStatus.SUCCEEDED;
        if (status != ApplicationStatus.SUCCEEDED) {
            LOG.warn("YARN reported application state {}", appState);
            LOG.warn("Diagnostics: {}", lastReport.getDiagnostics());
        }
        return status;
    }
}
use of org.apache.hadoop.yarn.api.records.ApplicationReport in project flink by apache.
the class YarnClusterClient method shutdownCluster.
/**
 * Shuts down the Yarn application.
 *
 * <p>Only the first call has an effect; later calls return immediately. The shutdown sends a
 * stop request to the ApplicationMaster, deletes the Yarn properties file and the session
 * files, stops the polling runner, logs the final application report and stops the YARN client.
 * Failures in the individual steps are logged and do not abort the remaining steps.
 *
 * @throws IllegalStateException if the client is not connected
 */
public void shutdownCluster() {
    // guard against repeated shutdowns
    if (hasBeenShutDown.getAndSet(true)) {
        return;
    }
    if (!isConnected) {
        throw new IllegalStateException("The cluster has not been connected to the ApplicationMaster.");
    }
    try {
        Runtime.getRuntime().removeShutdownHook(clientShutdownHook);
    } catch (IllegalStateException e) {
        // we are already in the shutdown hook
    }
    LOG.info("Sending shutdown request to the Application Master");
    try {
        Future<Object> response = Patterns.ask(applicationClient.get(), new YarnMessages.LocalStopYarnSession(getApplicationStatus(), "Flink YARN Client requested shutdown"), new Timeout(akkaDuration));
        Await.ready(response, akkaDuration);
    } catch (Exception e) {
        LOG.warn("Error while stopping YARN cluster.", e);
    }
    try {
        File propertiesFile = FlinkYarnSessionCli.getYarnPropertiesLocation(flinkConfig);
        if (propertiesFile.isFile()) {
            if (propertiesFile.delete()) {
                LOG.info("Deleted Yarn properties file at {}", propertiesFile.getAbsoluteFile().toString());
            } else {
                LOG.warn("Couldn't delete Yarn properties file at {}", propertiesFile.getAbsoluteFile().toString());
            }
        }
    } catch (Exception e) {
        LOG.warn("Exception while deleting the JobManager address file", e);
    }
    if (sessionFilesDir != null) {
        LOG.info("Deleting files in " + sessionFilesDir);
        try {
            FileSystem shutFS = FileSystem.get(hadoopConfig);
            // delete conf and jar file.
            shutFS.delete(sessionFilesDir, true);
            shutFS.close();
        } catch (IOException e) {
            LOG.error("Could not delete the Flink jar and configuration files in HDFS..", e);
        }
    } else {
        LOG.warn("Session file directory not set. Not deleting session files");
    }
    // getApplicationStatus() treats a missing polling runner as possible, so do the same here
    if (pollingRunner != null) {
        try {
            pollingRunner.stopRunner();
            pollingRunner.join(1000);
        } catch (InterruptedException e) {
            LOG.warn("Shutdown of the polling runner was interrupted", e);
            Thread.currentThread().interrupt();
        }
    }
    try {
        ApplicationReport appReport = yarnClient.getApplicationReport(appId);
        LOG.info("Application " + appId + " finished with state " + appReport.getYarnApplicationState() + " and final state " + appReport.getFinalApplicationStatus() + " at " + appReport.getFinishTime());
        if (appReport.getYarnApplicationState() == YarnApplicationState.FAILED || appReport.getYarnApplicationState() == YarnApplicationState.KILLED) {
            LOG.warn("Application failed. Diagnostics " + appReport.getDiagnostics());
            LOG.warn("If log aggregation is activated in the Hadoop cluster, we recommend to retrieve " + "the full application log using this command:" + System.lineSeparator() + "\tyarn logs -applicationId " + appReport.getApplicationId() + System.lineSeparator() + "(It sometimes takes a few seconds until the logs are aggregated)");
        }
    } catch (Exception e) {
        LOG.warn("Couldn't get final report", e);
    }
    LOG.info("YARN Client is shutting down");
    // actorRunner is using the yarnClient.
    yarnClient.stop();
    // set null to clearly see if somebody wants to access it afterwards.
    yarnClient = null;
}
Aggregations