Use of com.datatorrent.api.StorageAgent in the Apache apex-core project.
Class StreamingContainerAgent, method createOperatorDeployInfo.
/**
 * Builds the deployment descriptor for one physical operator.
 * <p>
 * A unifier is described by a {@link UnifierDeployInfo} that carries a clone of the unified
 * operator's attributes. Any other operator is typed as {@code INPUT} when its logical operator
 * is an {@link InputOperator} and none of its inputs is fed by a logical stream with a source;
 * otherwise it is typed as {@code GENERIC}. For the AT_MOST_ONCE and EXACTLY_ONCE processing
 * modes the recovery checkpoint is replaced by the highest window id the storage agent reports
 * for the operator, unless the recovery checkpoint already points at that window.
 *
 * @param oper the physical operator to describe
 * @return {@link com.datatorrent.stram.api.OperatorDeployInfo}
 */
private OperatorDeployInfo createOperatorDeployInfo(PTOperator oper) {
  final OperatorDeployInfo deployInfo;
  if (oper.isUnifier()) {
    /* the constructor auto sets the type */
    final UnifierDeployInfo unifierInfo = new UnifierDeployInfo();
    try {
      unifierInfo.operatorAttributes = oper.getUnifiedOperatorMeta().getAttributes().clone();
    } catch (CloneNotSupportedException cloneFailure) {
      throw new RuntimeException("Cannot clone unifier attributes", cloneFailure);
    }
    deployInfo = unifierInfo;
  } else {
    deployInfo = new OperatorDeployInfo();
    OperatorType resolvedType = OperatorType.GENERIC;
    if (oper.getOperatorMeta().getOperator() instanceof InputOperator) {
      resolvedType = OperatorType.INPUT;
      // a connected input port turns an input operator into a generic one
      for (PTOperator.PTInput input : oper.getInputs()) {
        if (input.logicalStream != null && input.logicalStream.getSource() != null) {
          resolvedType = OperatorType.GENERIC;
          break;
        }
      }
    }
    deployInfo.type = resolvedType;
  }

  Checkpoint recoveryCheckpoint = oper.getRecoveryCheckpoint();
  final ProcessingMode mode = oper.getOperatorMeta().getValue(OperatorContext.PROCESSING_MODE);
  if (mode == ProcessingMode.AT_MOST_ONCE || mode == ProcessingMode.EXACTLY_ONCE) {
    // TODO: following should be handled in the container at deploy time
    // for exactly once container should also purge previous checkpoint
    // whenever new checkpoint is written.
    StorageAgent storageAgent = oper.getOperatorMeta().getAttributes().get(OperatorContext.STORAGE_AGENT);
    if (storageAgent == null) {
      // fall back to the agent configured on the initial context
      storageAgent = initCtx.getValue(OperatorContext.STORAGE_AGENT);
    }
    try {
      // the most recently written checkpoint is the one with the largest window id
      long latestWindowId = Stateless.WINDOW_ID;
      for (long candidate : storageAgent.getWindowIds(oper.getId())) {
        latestWindowId = Math.max(latestWindowId, candidate);
      }
      if (recoveryCheckpoint == null || recoveryCheckpoint.windowId != latestWindowId) {
        recoveryCheckpoint = new Checkpoint(latestWindowId, 0, 0);
      }
    } catch (Exception e) {
      throw new RuntimeException("Failed to determine checkpoint window id " + oper, e);
    }
  }
  LOG.debug("{} recovery checkpoint {}", oper, recoveryCheckpoint);

  deployInfo.checkpoint = recoveryCheckpoint;
  deployInfo.name = oper.getOperatorMeta().getName();
  deployInfo.id = oper.getId();
  try {
    // work on a copy so the STATELESS flag below does not leak into the operator meta
    deployInfo.contextAttributes = oper.getOperatorMeta().getAttributes().clone();
  } catch (CloneNotSupportedException cloneFailure) {
    throw new RuntimeException("Cannot clone operator attributes", cloneFailure);
  }
  if (oper.isOperatorStateLess()) {
    deployInfo.contextAttributes.put(OperatorContext.STATELESS, true);
  }
  return deployInfo;
}
Use of com.datatorrent.api.StorageAgent in the Apache apex-core project.
Class Node, method checkpoint.
/**
 * Checkpoints the operator's state for the given window.
 * <p>
 * Stateless operators skip persistence entirely. Otherwise the operator is notified through
 * {@code beforeCheckpoint} (if it listens for it) and saved with the configured storage agent.
 * When the agent is an {@link AsyncStorageAgent} that is not in sync mode and the processing
 * mode is not EXACTLY_ONCE, the remaining work is handed to {@code executorService} and this
 * method returns early with {@code checkpoint} and {@code checkpointStats} reset to null. In
 * every other case the checkpoint bookkeeping is updated here and a {@code CheckpointListener}
 * operator is told the window was checkpointed.
 *
 * @param windowId id of the window being checkpointed
 * @throws RuntimeException wrapping the {@link IOException} if saving or flushing fails; a
 *         rollback (delete) of the checkpoint is attempted first
 */
void checkpoint(long windowId) {
  if (!context.stateless) {
    if (operator instanceof Operator.CheckpointNotificationListener) {
      ((Operator.CheckpointNotificationListener) operator).beforeCheckpoint(windowId);
    }
    final StorageAgent storageAgent = context.getValue(OperatorContext.STORAGE_AGENT);
    if (storageAgent != null) {
      try {
        checkpointStats = new Stats.CheckpointStats();
        checkpointStats.checkpointStartTime = System.currentTimeMillis();
        storageAgent.save(operator, id, windowId);
        if (storageAgent instanceof AsyncStorageAgent) {
          final AsyncStorageAgent asyncAgent = (AsyncStorageAgent) storageAgent;
          if (!asyncAgent.isSyncCheckpoint()) {
            if (PROCESSING_MODE == ProcessingMode.EXACTLY_ONCE) {
              // exactly-once flushes inline instead of deferring to the executor
              asyncAgent.flush(id, windowId);
            } else {
              final CheckpointWindowInfo windowInfo = new CheckpointWindowInfo();
              windowInfo.windowId = windowId;
              windowInfo.applicationWindowCount = applicationWindowCount;
              windowInfo.checkpointWindowCount = checkpointWindowCount;

              final CheckpointHandler handler = new CheckpointHandler();
              handler.agent = asyncAgent;
              handler.operatorId = id;
              handler.windowId = windowId;
              handler.stats = checkpointStats;

              final FutureTask<Stats.CheckpointStats> pendingCheckpoint = new FutureTask<>(handler);
              taskQueue.add(new Pair<>(pendingCheckpoint, windowInfo));
              executorService.submit(pendingCheckpoint);
              // the queued task now owns the stats; nothing is recorded as checkpointed here
              checkpoint = null;
              checkpointStats = null;
              return;
            }
          }
        }
        checkpointStats.checkpointTime = System.currentTimeMillis() - checkpointStats.checkpointStartTime;
      } catch (IOException saveFailure) {
        try {
          logger.warn("Rolling back checkpoint {} for Operator {} due to the exception {}", Codec.getStringWindowId(windowId), operator, saveFailure);
          storageAgent.delete(id, windowId);
        } catch (IOException rollbackFailure) {
          logger.warn("Error while rolling back checkpoint", rollbackFailure);
        }
        throw new RuntimeException(saveFailure);
      }
    }
  }

  calculateNextCheckpointWindow();
  dagCheckpointOffsetCount = 0;
  checkpoint = new Checkpoint(windowId, applicationWindowCount, checkpointWindowCount);
  if (operator instanceof Operator.CheckpointListener) {
    ((Operator.CheckpointListener) operator).checkpointed(windowId);
  }
}
Use of com.datatorrent.api.StorageAgent in the Apache apex-core project.
Class StramRecoveryTest, method checkpoint.
/**
 * Saves the operator instance behind {@code oper} with the storage agent configured on its
 * operator meta, under the window id of the supplied checkpoint.
 *
 * @param scm not used by this helper
 * @param oper physical operator whose logical operator instance is saved
 * @param checkpoint supplies the window id to save under
 * @throws Exception if the storage agent fails to save
 */
public static void checkpoint(StreamingContainerManager scm, PTOperator oper, Checkpoint checkpoint) throws Exception {
  // write checkpoint while AM is out,
  // it needs to be picked up as part of restore
  final StorageAgent storageAgent = oper.getOperatorMeta().getValue(OperatorContext.STORAGE_AGENT);
  final int operatorId = oper.getId();
  final long windowId = checkpoint.windowId;
  storageAgent.save(oper.getOperatorMeta().getOperator(), operatorId, windowId);
}
Use of com.datatorrent.api.StorageAgent in the Apache apex-core project.
Class StreamingContainer, method deployNodes.
/**
 * Restores a node for every operator in the deploy list and registers it in {@code nodes}.
 * <p>
 * Each operator is loaded through its storage agent, either from its recovery checkpoint window
 * or, when the operator context is stateless, from {@code Stateless.WINDOW_ID}. Unifiers get a
 * port context layered over a context built from the unified operator's attributes as their
 * parent; all other operators use the container context directly.
 *
 * @param nodeList deploy descriptors of the operators to restore
 * @throws IOException if the storage agent fails to load an operator
 */
private void deployNodes(List<OperatorDeployInfo> nodeList) throws IOException {
  for (OperatorDeployInfo deployInfo : nodeList) {
    final StorageAgent storageAgent = getValue(OperatorContext.STORAGE_AGENT, deployInfo);
    assert (storageAgent != null);

    final Context parentContext;
    if (!(deployInfo instanceof UnifierDeployInfo)) {
      parentContext = containerContext;
    } else {
      final UnifierDeployInfo unifierInfo = (UnifierDeployInfo) deployInfo;
      OperatorContext unifiedOperatorContext = new OperatorContext(0, deployInfo.name, unifierInfo.operatorAttributes, containerContext);
      // the parent context is built from the first input before the deploy info is massaged
      parentContext = new PortContext(deployInfo.inputs.get(0).contextAttributes, unifiedOperatorContext);
      massageUnifierDeployInfo(deployInfo);
    }

    final OperatorContext operatorContext = new OperatorContext(deployInfo.id, deployInfo.name, deployInfo.contextAttributes, parentContext);
    final long checkpointWindowId = deployInfo.checkpoint.windowId;
    operatorContext.attributes.put(OperatorContext.ACTIVATION_WINDOW_ID, checkpointWindowId);
    logger.debug("Restoring operator {} to checkpoint {} stateless={}.", deployInfo.id, Codec.getStringWindowId(checkpointWindowId), operatorContext.stateless);

    // stateless operators are always loaded from the stateless window
    final long restoreWindowId;
    if (operatorContext.stateless) {
      restoreWindowId = Stateless.WINDOW_ID;
    } else {
      restoreWindowId = checkpointWindowId;
    }
    Node<?> node = Node.retrieveNode(storageAgent.load(deployInfo.id, restoreWindowId), operatorContext, deployInfo.type);
    node.currentWindowId = checkpointWindowId;
    node.applicationWindowCount = deployInfo.checkpoint.applicationWindowCount;
    node.firstWindowMillis = firstWindowMillis;
    node.windowWidthMillis = windowWidthMillis;
    node.setId(deployInfo.id);
    nodes.put(deployInfo.id, node);
    logger.debug("Marking operator {} as deployed.", node);
  }
}
Use of com.datatorrent.api.StorageAgent in the Apache apex-core project.
Class StramClient, method startApplication.
/**
 * Launch application for the dag represented by this client.
 * <p>
 * The method collects the jars the DAG depends on, obtains a new application id from YARN,
 * copies jars, archives, files and the optional log4j configuration to the application
 * directory on the file system, fills in default DAG attributes (application id, application
 * path, storage agent, container options configurator), writes the serialized logical plan and
 * the launch configuration, assembles the application master launch context (tokens, local
 * resources, environment, command line) and finally submits the application.
 *
 * @throws YarnException propagated from the YARN client calls made here
 * @throws IOException if a file system operation fails, or if security is enabled and no RM
 *         principal is configured to act as token renewer
 * @throws IllegalStateException if the application type is not {@code YARN_APPLICATION_TYPE}
 */
public void startApplication() throws YarnException, IOException {
  Class<?>[] defaultClasses;
  if (applicationType.equals(YARN_APPLICATION_TYPE)) {
    //TODO restrict the security check to only check if security is enabled for webservices.
    if (UserGroupInformation.isSecurityEnabled()) {
      defaultClasses = APEX_SECURITY_CLASSES;
    } else {
      defaultClasses = APEX_CLASSES;
    }
  } else {
    throw new IllegalStateException(applicationType + " is not a valid application type.");
  }
  LinkedHashSet<String> localJarFiles = findJars(dag, defaultClasses);
  if (resources != null) {
    localJarFiles.addAll(resources);
  }
  YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics();
  LOG.info("Got Cluster metric info from ASM, numNodeManagers={}", clusterMetrics.getNumNodeManagers());
  //GetClusterNodesRequest clusterNodesReq = Records.newRecord(GetClusterNodesRequest.class);
  //GetClusterNodesResponse clusterNodesResp = rmClient.clientRM.getClusterNodes(clusterNodesReq);
  //LOG.info("Got Cluster node info from ASM");
  //for (NodeReport node : clusterNodesResp.getNodeReports()) {
  // LOG.info("Got node report from ASM for"
  // + ", nodeId=" + node.getNodeId()
  // + ", nodeAddress" + node.getHttpAddress()
  // + ", nodeRackName" + node.getRackName()
  // + ", nodeNumContainers" + node.getNumContainers()
  // + ", nodeHealthStatus" + node.getHealthReport());
  //}
  List<QueueUserACLInfo> listAclInfo = yarnClient.getQueueAclsInfo();
  for (QueueUserACLInfo aclInfo : listAclInfo) {
    for (QueueACL userAcl : aclInfo.getUserAcls()) {
      LOG.info("User ACL Info for Queue queueName={}, userAcl={}", aclInfo.getQueueName(), userAcl.name());
    }
  }
  // Get a new application id
  YarnClientApplication newApp = yarnClient.createApplication();
  appId = newApp.getNewApplicationResponse().getApplicationId();
  // Dump out information about cluster capability as seen by the resource manager
  int maxMem = newApp.getNewApplicationResponse().getMaximumResourceCapability().getMemory();
  LOG.info("Max mem capability of resources in this cluster {}", maxMem);
  int amMemory = dag.getMasterMemoryMB();
  if (amMemory > maxMem) {
    LOG.info("AM memory specified above max threshold of cluster. Using max value, specified={}, max={}", amMemory, maxMem);
    amMemory = maxMem;
  }
  if (dag.getAttributes().get(LogicalPlan.APPLICATION_ID) == null) {
    dag.setAttribute(LogicalPlan.APPLICATION_ID, appId.toString());
  }
  // Create launch context for app master
  LOG.info("Setting up application submission context for ASM");
  ApplicationSubmissionContext appContext = Records.newRecord(ApplicationSubmissionContext.class);
  // set the application id
  appContext.setApplicationId(appId);
  // set the application name
  appContext.setApplicationName(dag.getValue(LogicalPlan.APPLICATION_NAME));
  appContext.setApplicationType(this.applicationType);
  // Max app attempts is deliberately left untouched; the disabled call was
  // appContext.setMaxAppAttempts(1) with the remark "no retries until Stram is HA".
  appContext.setKeepContainersAcrossApplicationAttempts(true);
  // Set up the container launch context for the application master
  ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class);
  // application master launch.
  if (UserGroupInformation.isSecurityEnabled()) {
    Credentials credentials = new Credentials();
    String tokenRenewer = conf.get(YarnConfiguration.RM_PRINCIPAL);
    if (tokenRenewer == null || tokenRenewer.length() == 0) {
      throw new IOException("Can't get Master Kerberos principal for the RM to use as renewer");
    }
    // For now, only getting tokens for the default file-system.
    try (FileSystem fs = StramClientUtils.newFileSystemInstance(conf)) {
      final Token<?>[] tokens = fs.addDelegationTokens(tokenRenewer, credentials);
      if (tokens != null) {
        for (Token<?> token : tokens) {
          LOG.info("Got dt for {}; {}", fs.getUri(), token);
        }
      }
    }
    new ClientRMHelper(yarnClient, conf).addRMDelegationToken(tokenRenewer, credentials);
    DataOutputBuffer dob = new DataOutputBuffer();
    credentials.writeTokenStorageToStream(dob);
    ByteBuffer fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
    amContainer.setTokens(fsTokens);
  }
  // Setup ACLs for the impersonating user
  LOG.debug("ACL login user {} current user {}", UserGroupInformation.getLoginUser(), UserGroupInformation.getCurrentUser());
  if (!UserGroupInformation.getCurrentUser().equals(UserGroupInformation.getLoginUser())) {
    ACLManager.setupUserACLs(amContainer, UserGroupInformation.getLoginUser().getShortUserName(), conf);
  }
  // set local resources for the application master
  // local files or archives as needed
  // In this scenario, the jar file for the application master is part of the local resources
  Map<String, LocalResource> localResources = new HashMap<>();
  // copy required jar files to dfs, to be localized for containers
  try (FileSystem fs = StramClientUtils.newFileSystemInstance(conf)) {
    Path appsBasePath = new Path(StramClientUtils.getDTDFSRootDir(fs, conf), StramClientUtils.SUBDIR_APPS);
    Path appPath;
    String configuredAppPath = dag.getValue(LogicalPlan.APPLICATION_PATH);
    if (configuredAppPath == null) {
      appPath = new Path(appsBasePath, appId.toString());
    } else {
      appPath = new Path(configuredAppPath);
    }
    String libJarsCsv = copyFromLocal(fs, appPath, localJarFiles.toArray(new String[] {}));
    LOG.info("libjars: {}", libJarsCsv);
    dag.getAttributes().put(Context.DAGContext.LIBRARY_JARS, libJarsCsv);
    LaunchContainerRunnable.addFilesToLocalResources(LocalResourceType.FILE, libJarsCsv, localResources, fs);
    if (archives != null) {
      String[] localFiles = archives.split(",");
      String archivesCsv = copyFromLocal(fs, appPath, localFiles);
      LOG.info("archives: {}", archivesCsv);
      dag.getAttributes().put(LogicalPlan.ARCHIVES, archivesCsv);
      LaunchContainerRunnable.addFilesToLocalResources(LocalResourceType.ARCHIVE, archivesCsv, localResources, fs);
    }
    if (files != null) {
      String[] localFiles = files.split(",");
      String filesCsv = copyFromLocal(fs, appPath, localFiles);
      LOG.info("files: {}", filesCsv);
      dag.getAttributes().put(LogicalPlan.FILES, filesCsv);
      LaunchContainerRunnable.addFilesToLocalResources(LocalResourceType.FILE, filesCsv, localResources, fs);
    }
    dag.getAttributes().put(LogicalPlan.APPLICATION_PATH, appPath.toString());
    StorageAgent agent = dag.getAttributes().get(OperatorContext.STORAGE_AGENT);
    // instanceof is already false for null, so no separate null check is needed
    if (agent instanceof StorageAgent.ApplicationAwareStorageAgent) {
      ((StorageAgent.ApplicationAwareStorageAgent) agent).setApplicationAttributes(dag.getAttributes());
    }
    if (dag.getAttributes().get(OperatorContext.STORAGE_AGENT) == null) {
      /* which would be the most likely case */
      Path checkpointPath = new Path(appPath, LogicalPlan.SUBDIR_CHECKPOINTS);
      // use conf client side to pickup any proxy settings from dt-site.xml
      dag.setAttribute(OperatorContext.STORAGE_AGENT, new AsyncFSStorageAgent(checkpointPath.toString(), conf));
    }
    if (dag.getAttributes().get(LogicalPlan.CONTAINER_OPTS_CONFIGURATOR) == null) {
      dag.setAttribute(LogicalPlan.CONTAINER_OPTS_CONFIGURATOR, new BasicContainerOptConfigurator());
    }
    // Set the log4j properties if needed
    if (!log4jPropFile.isEmpty()) {
      Path log4jSrc = new Path(log4jPropFile);
      Path log4jDst = new Path(appPath, "log4j.props");
      fs.copyFromLocalFile(false, true, log4jSrc, log4jDst);
      FileStatus log4jFileStatus = fs.getFileStatus(log4jDst);
      LocalResource log4jRsrc = Records.newRecord(LocalResource.class);
      log4jRsrc.setType(LocalResourceType.FILE);
      log4jRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
      log4jRsrc.setResource(ConverterUtils.getYarnUrlFromURI(log4jDst.toUri()));
      log4jRsrc.setTimestamp(log4jFileStatus.getModificationTime());
      log4jRsrc.setSize(log4jFileStatus.getLen());
      localResources.put("log4j.properties", log4jRsrc);
    }
    if (originalAppId != null) {
      Path origAppPath = new Path(appsBasePath, this.originalAppId);
      LOG.info("Restart from {}", origAppPath);
      copyInitialState(origAppPath);
    }
    // push logical plan to DFS location; try-with-resources closes the stream even when the
    // write fails, which the previous explicit close() did not
    Path cfgDst = new Path(appPath, LogicalPlan.SER_FILE_NAME);
    try (FSDataOutputStream outStream = fs.create(cfgDst, true)) {
      LogicalPlan.write(this.dag, outStream);
    }
    Path launchConfigDst = new Path(appPath, LogicalPlan.LAUNCH_CONFIG_FILE_NAME);
    try (FSDataOutputStream outStream = fs.create(launchConfigDst, true)) {
      conf.writeXml(outStream);
    }
    LaunchContainerRunnable.addFileToLocalResources(LogicalPlan.SER_FILE_NAME, fs.getFileStatus(cfgDst), LocalResourceType.FILE, localResources);
    // Set local resource info into app master container launch context
    amContainer.setLocalResources(localResources);
    // Set the necessary security tokens as needed
    //amContainer.setContainerTokens(containerToken);
    // Set the env variables to be setup in the env where the application master will be run
    LOG.info("Set the environment for the application master");
    Map<String, String> env = new HashMap<>();
    // Add application jar(s) location to classpath
    // At some point we should not be required to add
    // the hadoop specific classpaths to the env.
    // It should be provided out of the box.
    // For now setting all required classpaths including
    // the classpath to "." for the application jar(s)
    // including ${CLASSPATH} will duplicate the class path in app master, removing it for now
    //StringBuilder classPathEnv = new StringBuilder("${CLASSPATH}:./*");
    StringBuilder classPathEnv = new StringBuilder("./*");
    String classpath = conf.get(YarnConfiguration.YARN_APPLICATION_CLASSPATH);
    for (String c : StringUtils.isBlank(classpath) ? YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH : classpath.split(",")) {
      if (c.equals("$HADOOP_CLIENT_CONF_DIR")) {
        // SPOI-2501
        continue;
      }
      classPathEnv.append(':');
      classPathEnv.append(c.trim());
    }
    env.put("CLASSPATH", classPathEnv.toString());
    // propagate to replace node managers user name (effective in non-secure mode)
    // also to indicate original login user during impersonation and important for setting ACLs
    env.put("HADOOP_USER_NAME", UserGroupInformation.getLoginUser().getUserName());
    amContainer.setEnvironment(env);
    // Set the necessary command to execute the application master
    ArrayList<CharSequence> vargs = new ArrayList<>(30);
    // Set java executable command
    LOG.info("Setting up app master command");
    vargs.add(javaCmd);
    if (dag.isDebug()) {
      vargs.add("-agentlib:jdwp=transport=dt_socket,server=y,suspend=n");
    }
    // default heap size 75% of total memory
    if (dag.getMasterJVMOptions() != null) {
      vargs.add(dag.getMasterJVMOptions());
    }
    Path tmpDir = new Path(ApplicationConstants.Environment.PWD.$(), YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR);
    vargs.add("-Djava.io.tmpdir=" + tmpDir);
    vargs.add("-Xmx" + (amMemory * 3 / 4) + "m");
    vargs.add("-XX:+HeapDumpOnOutOfMemoryError");
    vargs.add("-XX:HeapDumpPath=" + System.getProperty("java.io.tmpdir") + "/dt-heap-" + appId.getId() + ".bin");
    vargs.add("-Dhadoop.root.logger=" + (dag.isDebug() ? "DEBUG" : "INFO") + ",RFA");
    vargs.add("-Dhadoop.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR);
    vargs.add(String.format("-D%s=%s", StreamingContainer.PROP_APP_PATH, dag.assertAppPath()));
    StramClientUtils.addAttributeToArgs(LogicalPlan.APPLICATION_NAME, dag, vargs);
    StramClientUtils.addAttributeToArgs(LogicalPlan.LOGGER_APPENDER, dag, vargs);
    if (dag.isDebug()) {
      vargs.add("-Dlog4j.debug=true");
    }
    String loggersLevel = conf.get(StramUtils.DT_LOGGERS_LEVEL);
    if (loggersLevel != null) {
      vargs.add(String.format("-D%s=%s", StramUtils.DT_LOGGERS_LEVEL, loggersLevel));
    }
    vargs.add(StreamingAppMaster.class.getName());
    vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stdout");
    vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stderr");
    // Get final command
    StringBuilder command = new StringBuilder(9 * vargs.size());
    for (CharSequence str : vargs) {
      command.append(str).append(" ");
    }
    LOG.info("Completed setting up app master command {}", command);
    List<String> commands = new ArrayList<>();
    commands.add(command.toString());
    amContainer.setCommands(commands);
    // Set up resource type requirements
    // For now, only memory is supported so we set memory requirements
    Resource capability = Records.newRecord(Resource.class);
    capability.setMemory(amMemory);
    appContext.setResource(capability);
    // Service data is a binary blob that can be passed to the application
    // Not needed in this scenario
    // amContainer.setServiceData(serviceData);
    appContext.setAMContainerSpec(amContainer);
    // Set the priority for the application master
    Priority pri = Records.newRecord(Priority.class);
    pri.setPriority(amPriority);
    appContext.setPriority(pri);
    // Set the queue to which this application is to be submitted in the RM
    appContext.setQueue(queueName);
    // set the application tags
    appContext.setApplicationTags(tags);
    // Submit the application to the applications manager
    // SubmitApplicationResponse submitResp = rmClient.submitApplication(appRequest);
    // Ignore the response as either a valid response object is returned on success
    // or an exception thrown to denote some form of a failure
    String specStr = Objects.toStringHelper("Submitting application: ").add("name", appContext.getApplicationName()).add("queue", appContext.getQueue()).add("user", UserGroupInformation.getLoginUser()).add("resource", appContext.getResource()).toString();
    LOG.info(specStr);
    // The full submission context is intentionally not logged, even in debug mode.
    yarnClient.submitApplication(appContext);
  }
}
Aggregations