use of org.apache.oozie.SLAEventBean in project oozie by apache.
the class CoordActionStartXCommand method execute.
/**
 * Starts (or re-runs) the workflow behind a coordinator action that is in SUBMITTED state.
 * <p>
 * The merged run configuration is stored on the action and handed to the DagEngine: when the
 * action already carries an external id the workflow is re-run with RERUN_FAIL_NODES set,
 * otherwise a new workflow is submitted and its id is recorded as the external id. On success
 * the action is set to RUNNING and the workflow/action updates, together with any SLA STARTED
 * event, are written in a single batch.
 * <p>
 * If the start path throws, the action is set to FAILED with the captured error code and
 * message, an SLA FAILED event is staged, the batch is written and a
 * {@code CoordActionReadyXCommand} is queued for the parent job.
 * <p>
 * Actions in any other status are left untouched.
 *
 * @return always {@code null}
 * @throws CommandException e.g. when the batch write on the failure path raises a
 *         JPAExecutorException
 */
@Override
protected Void execute() throws CommandException {
    boolean makeFail = true;
    String errCode = "";
    String errMsg = "";
    ParamChecker.notEmpty(user, "user");
    log.debug("actionid=" + actionId + ", status=" + coordAction.getStatus());
    if (coordAction.getStatus() == CoordinatorAction.Status.SUBMITTED) {
        // create merged runConf to pass to WF Engine
        Configuration runConf = mergeConfig(coordAction);
        coordAction.setRunConf(XmlUtils.prettyPrint(runConf).toString());
        DagEngine dagEngine = Services.get().get(DagEngineService.class).getDagEngine(user);
        try {
            Configuration conf = new XConfiguration(new StringReader(coordAction.getRunConf()));
            SLAEventBean slaEvent = SLADbOperations.createStatusEvent(coordAction.getSlaXml(), coordAction.getId(), Status.STARTED, SlaAppType.COORDINATOR_ACTION, log);
            if (slaEvent != null) {
                insertList.add(slaEvent);
            }
            if (OozieJobInfo.isJobInfoEnabled()) {
                conf.set(OozieJobInfo.COORD_ID, actionId);
                conf.set(OozieJobInfo.COORD_NAME, appName);
                conf.set(OozieJobInfo.COORD_NOMINAL_TIME, coordAction.getNominalTimestamp().toString());
            }
            // Normalize workflow appPath here;
            JobUtils.normalizeAppPath(conf.get(OozieClient.USER_NAME), conf.get(OozieClient.GROUP_NAME), conf);
            if (coordAction.getExternalId() != null) {
                // A workflow already exists for this action: re-run it instead of submitting a new one.
                conf.setBoolean(OozieClient.RERUN_FAIL_NODES, true);
                dagEngine.reRun(coordAction.getExternalId(), conf);
            } else {
                // Pushing the nominal time in conf to use for launcher tag search
                conf.set(OOZIE_COORD_ACTION_NOMINAL_TIME, String.valueOf(coordAction.getNominalTime().getTime()));
                String wfId = dagEngine.submitJobFromCoordinator(conf, actionId);
                coordAction.setExternalId(wfId);
            }
            coordAction.setStatus(CoordinatorAction.Status.RUNNING);
            coordAction.incrementAndGetPending();
            JPAService jpaService = Services.get().get(JPAService.class);
            if (jpaService != null) {
                log.debug("Updating WF record for WFID :" + coordAction.getExternalId() + " with parent id: " + actionId);
                WorkflowJobBean wfJob = WorkflowJobQueryExecutor.getInstance().get(WorkflowJobQuery.GET_WORKFLOW_STARTTIME, coordAction.getExternalId());
                wfJob.setParentId(actionId);
                wfJob.setLastModifiedTime(new Date());
                BatchQueryExecutor executor = BatchQueryExecutor.getInstance();
                updateList.add(new UpdateEntry<WorkflowJobQuery>(WorkflowJobQuery.UPDATE_WORKFLOW_PARENT_MODIFIED, wfJob));
                updateList.add(new UpdateEntry<CoordActionQuery>(CoordActionQuery.UPDATE_COORD_ACTION_FOR_START, coordAction));
                try {
                    executor.executeBatchInsertUpdateDelete(insertList, updateList, null);
                    queue(new CoordActionNotificationXCommand(coordAction), 100);
                    if (EventHandlerService.isEnabled()) {
                        generateEvent(coordAction, user, appName, wfJob.getStartTime());
                    }
                } catch (JPAExecutorException je) {
                    // Rethrown as CommandException; the enclosing catch below records it as the failure cause.
                    throw new CommandException(je);
                }
            } else {
                log.error(ErrorCode.E0610);
            }
            makeFail = false;
        } catch (DagEngineException dee) {
            errMsg = dee.getMessage();
            errCode = dee.getErrorCode().toString();
            log.warn("can not create DagEngine for submitting jobs", dee);
        } catch (CommandException ce) {
            errMsg = ce.getMessage();
            errCode = ce.getErrorCode().toString();
            log.warn("command exception occurred ", ce);
        } catch (java.io.IOException ioe) {
            errMsg = ioe.getMessage();
            errCode = "E1005";
            log.warn("Configuration parse error. read from DB :" + coordAction.getRunConf(), ioe);
        } catch (Exception ex) {
            errMsg = ex.getMessage();
            errCode = "E1005";
            log.warn("can not create DagEngine for submitting jobs", ex);
        } finally {
            if (makeFail) {
                // Throwable.getMessage() may return null (e.g. a bare NullPointerException).
                // Without this guard the length check below would itself throw, skipping the
                // FAILED bookkeeping and hiding the original error.
                if (errMsg == null) {
                    errMsg = "";
                }
                log.error("Failing the action " + coordAction.getId() + ". Because " + errCode + " : " + errMsg);
                coordAction.setStatus(CoordinatorAction.Status.FAILED);
                // Because table column size is 255
                final int maxErrMsgLength = 255;
                if (errMsg.length() > maxErrMsgLength) {
                    errMsg = errMsg.substring(0, maxErrMsgLength);
                }
                coordAction.setErrorMessage(errMsg);
                coordAction.setErrorCode(errCode);
                // Start from fresh lists so nothing staged by the aborted start path is written.
                updateList = new ArrayList<UpdateEntry>();
                updateList.add(new UpdateEntry<CoordActionQuery>(CoordActionQuery.UPDATE_COORD_ACTION_FOR_START, coordAction));
                insertList = new ArrayList<JsonBean>();
                SLAEventBean slaEvent = SLADbOperations.createStatusEvent(coordAction.getSlaXml(), coordAction.getId(), Status.FAILED, SlaAppType.COORDINATOR_ACTION, log);
                if (slaEvent != null) {
                    // Update SLA events
                    insertList.add(slaEvent);
                }
                try {
                    // call JPAExecutor to do the bulk writes
                    BatchQueryExecutor.getInstance().executeBatchInsertUpdateDelete(insertList, updateList, null);
                    if (EventHandlerService.isEnabled()) {
                        generateEvent(coordAction, user, appName, null);
                    }
                } catch (JPAExecutorException je) {
                    throw new CommandException(je);
                }
                queue(new CoordActionReadyXCommand(coordAction.getJobId()));
            }
        }
    }
    return null;
}
use of org.apache.oozie.SLAEventBean in project oozie by apache.
the class ActionEndXCommand method execute.
/**
 * Runs the end phase of a workflow action through its executor and persists the result.
 * <p>
 * Retry limits are taken from the job configuration (falling back to the executor's own
 * values) unless the executor is a control node, in which case both are zero. After
 * {@code executor.end(...)} the action status is mapped to an SLA status and, unless a user
 * retry takes over, an SLA status event is staged. An ActionExecutorException is dispatched
 * on its error type. In every case the staged inserts/updates are written in one batch, an
 * event is generated for non-control-node actions when the event handler is enabled, and a
 * SignalXCommand is invoked for the job.
 *
 * @return always {@code null}
 * @throws CommandException if the batch write raises a JPAExecutorException
 */
@Override
protected Void execute() throws CommandException {
    LOG.debug("STARTED ActionEndXCommand for action " + actionId);

    Configuration jobConf = wfJob.getWorkflowInstance().getConf();
    final boolean controlNode = executor instanceof ControlNodeActionExecutor;
    int retryLimit = controlNode ? 0 : jobConf.getInt(OozieClient.ACTION_MAX_RETRIES, executor.getMaxRetries());
    long retryDelay = controlNode ? 0 : jobConf.getLong(OozieClient.ACTION_RETRY_INTERVAL, executor.getRetryInterval());
    executor.setMaxRetries(retryLimit);
    executor.setRetryInterval(retryDelay);

    // An action coming back from END_RETRY / END_MANUAL is a retry of the end phase.
    boolean isRetry = wfAction.getStatus() == WorkflowActionBean.Status.END_RETRY
            || wfAction.getStatus() == WorkflowActionBean.Status.END_MANUAL;
    ActionExecutorContext context = new ActionXCommand.ActionExecutorContext(wfJob, wfAction, isRetry, false);

    try {
        LOG.debug("End, name [{0}] type [{1}] status[{2}] external status [{3}] signal value [{4}]", wfAction.getName(), wfAction.getType(), wfAction.getStatus(), wfAction.getExternalStatus(), wfAction.getSignalValue());
        Instrumentation.Cron endTimer = new Instrumentation.Cron();
        endTimer.start();
        executor.end(context, wfAction);
        endTimer.stop();
        addActionCron(wfAction.getType(), endTimer);
        incrActionCounter(wfAction.getType(), 1);

        if (context.isEnded()) {
            wfAction.setRetries(0);
            wfAction.setEndTime(new Date());

            boolean userRetryCandidate;
            Status slaStatus;
            switch (wfAction.getStatus()) {
                case OK:
                    slaStatus = Status.SUCCEEDED;
                    userRetryCandidate = false;
                    break;
                case KILLED:
                    slaStatus = Status.KILLED;
                    userRetryCandidate = false;
                    break;
                case ERROR:
                    // NOTE(review): the log text says FAILED but the SLA status recorded is KILLED;
                    // kept exactly as the original behaves -- confirm which one is intended.
                    LOG.info("ERROR is considered as FAILED for SLA");
                    slaStatus = Status.KILLED;
                    userRetryCandidate = true;
                    break;
                case FAILED:
                default:
                    slaStatus = Status.FAILED;
                    userRetryCandidate = true;
                    break;
            }

            // The SLA event is skipped only when a user retry was both applicable and handled.
            boolean userRetryHandled = userRetryCandidate && handleUserRetry(context, wfAction);
            if (!userRetryHandled) {
                SLAEventBean statusEvent = SLADbXOperations.createStatusEvent(wfAction.getSlaXml(), wfAction.getId(), slaStatus, SlaAppType.WORKFLOW_ACTION);
                if (statusEvent != null) {
                    insertList.add(statusEvent);
                }
            }
        } else {
            // The executor returned without reporting end data: treat as a job failure.
            LOG.warn(XLog.OPS, "Action Ended, ActionExecutor [{0}] must call setEndData()", executor.getType());
            wfAction.setErrorInfo(END_DATA_MISSING, "Execution Ended, but End Data Missing from Action");
            failJob(context);
        }
        stageEndUpdates();
    } catch (ActionExecutorException ex) {
        LOG.warn("Error ending action [{0}]. ErrorType [{1}], ErrorCode [{2}], Message [{3}]", wfAction.getName(), ex.getErrorType(), ex.getErrorCode(), ex.getMessage());
        wfAction.setErrorInfo(ex.getErrorCode(), ex.getMessage());
        wfAction.setEndTime(null);
        switch (ex.getErrorType()) {
            case TRANSIENT:
                // Fall back to manual handling when the transient retry is not accepted.
                if (!handleTransient(context, executor, WorkflowAction.Status.END_RETRY)) {
                    handleNonTransient(context, executor, WorkflowAction.Status.END_MANUAL);
                    wfAction.setPendingAge(new Date());
                    wfAction.setRetries(0);
                }
                wfAction.setEndTime(null);
                break;
            case NON_TRANSIENT:
                handleNonTransient(context, executor, WorkflowAction.Status.END_MANUAL);
                wfAction.setEndTime(null);
                break;
            case ERROR:
                handleError(context, executor, COULD_NOT_END, false, WorkflowAction.Status.ERROR);
                break;
            case FAILED:
                failJob(context);
                break;
        }
        stageEndUpdates();
    } finally {
        try {
            BatchQueryExecutor.getInstance().executeBatchInsertUpdateDelete(insertList, updateList, null);
        } catch (JPAExecutorException e) {
            throw new CommandException(e);
        }
        if (!(executor instanceof ControlNodeActionExecutor) && EventHandlerService.isEnabled()) {
            generateEvent(wfAction, wfJob.getUser());
        }
        new SignalXCommand(jobId, actionId).call();
    }

    LOG.debug("ENDED ActionEndXCommand for action " + actionId);
    return null;
}

/**
 * Copies the action info into the workflow instance and stages the action and job updates
 * (UPDATE_ACTION_END, UPDATE_WORKFLOW_STATUS_INSTANCE_MODIFIED) for the batch write.
 */
private void stageEndUpdates() throws CommandException {
    WorkflowInstance instance = wfJob.getWorkflowInstance();
    DagELFunctions.setActionInfo(instance, wfAction);
    wfJob.setWorkflowInstance(instance);
    updateList.add(new UpdateEntry<WorkflowActionQuery>(WorkflowActionQuery.UPDATE_ACTION_END, wfAction));
    wfJob.setLastModifiedTime(new Date());
    updateList.add(new UpdateEntry<WorkflowJobQuery>(WorkflowJobQuery.UPDATE_WORKFLOW_STATUS_INSTANCE_MODIFIED, wfJob));
}
use of org.apache.oozie.SLAEventBean in project oozie by apache.
the class ActionKillXCommand method execute.
/**
 * Kills a pending workflow action through its executor and persists the outcome.
 * <p>
 * Nothing is done when the action is not pending or when no executor is registered for its
 * type. On a successful kill the action is set to KILLED and an SLA KILLED event is staged;
 * when the executor throws an ActionExecutorException the action is set to FAILED, the job to
 * KILLED, and an SLA FAILED event is staged. Either way the action directory is cleaned up and
 * the staged inserts/updates are written in one batch.
 *
 * @return always {@code null}
 * @throws CommandException if the batch write raises a JPAExecutorException
 */
@Override
protected Void execute() throws CommandException {
    LOG.debug("STARTED WorkflowActionKillXCommand for action " + actionId);

    // The executor is only looked up for pending actions; null means "nothing to kill".
    ActionExecutor actionExecutor = wfAction.isPending()
            ? Services.get().get(ActionService.class).getExecutor(wfAction.getType())
            : null;

    if (actionExecutor != null) {
        ActionExecutorContext context = null;
        try {
            context = new ActionXCommand.ActionExecutorContext(wfJob, wfAction, false, false);
            incrActionCounter(wfAction.getType(), 1);

            Instrumentation.Cron killTimer = new Instrumentation.Cron();
            killTimer.start();
            actionExecutor.kill(context, wfAction);
            killTimer.stop();
            addActionCron(wfAction.getType(), killTimer);

            wfAction.resetPending();
            wfAction.setStatus(WorkflowActionBean.Status.KILLED);
            wfAction.setEndTime(new Date());
            updateList.add(new UpdateEntry<WorkflowActionQuery>(WorkflowActionQuery.UPDATE_ACTION_END, wfAction));
            wfJob.setLastModifiedTime(new Date());
            updateList.add(new UpdateEntry<WorkflowJobQuery>(WorkflowJobQuery.UPDATE_WORKFLOW_MODTIME, wfJob));
            // Add SLA status event (KILLED) for WF_ACTION
            stageSlaStatusEvent(Status.KILLED);
            queue(new WorkflowNotificationXCommand(wfJob, wfAction));
        } catch (ActionExecutorException ex) {
            wfAction.resetPending();
            wfAction.setStatus(WorkflowActionBean.Status.FAILED);
            wfAction.setErrorInfo(ex.getErrorCode().toString(), "KILL COMMAND FAILED - exception while executing job kill");
            wfAction.setEndTime(new Date());
            wfJob.setStatus(WorkflowJobBean.Status.KILLED);
            updateList.add(new UpdateEntry<WorkflowActionQuery>(WorkflowActionQuery.UPDATE_ACTION_END, wfAction));
            wfJob.setLastModifiedTime(new Date());
            updateList.add(new UpdateEntry<WorkflowJobQuery>(WorkflowJobQuery.UPDATE_WORKFLOW_STATUS_MODTIME, wfJob));
            // What will happen to WF and COORD_ACTION, NOTIFICATION?
            stageSlaStatusEvent(Status.FAILED);
            LOG.warn("Exception while executing kill(). Error Code [{0}], Message[{1}]", ex.getErrorCode(), ex.getMessage(), ex);
        } finally {
            try {
                // context is still null here if constructing it threw.
                cleanupActionDir(context);
                BatchQueryExecutor.getInstance().executeBatchInsertUpdateDelete(insertList, updateList, null);
                if (!(actionExecutor instanceof ControlNodeActionExecutor) && EventHandlerService.isEnabled()) {
                    generateEvent(wfAction, wfJob.getUser());
                }
            } catch (JPAExecutorException e) {
                throw new CommandException(e);
            }
        }
    }

    LOG.debug("ENDED WorkflowActionKillXCommand for action " + actionId);
    return null;
}

/**
 * Creates an SLA status event for the workflow action and, when one is produced, adds it to
 * the pending insert list.
 */
private void stageSlaStatusEvent(Status slaStatus) throws CommandException {
    SLAEventBean statusEvent = SLADbXOperations.createStatusEvent(wfAction.getSlaXml(), wfAction.getId(), slaStatus, SlaAppType.WORKFLOW_ACTION);
    if (statusEvent != null) {
        insertList.add(statusEvent);
    }
}
use of org.apache.oozie.SLAEventBean in project oozie by apache.
the class ActionStartXCommand method execute.
/**
 * Starts a workflow action through its executor and persists the outcome.
 * <p>
 * Retry limits come from the job configuration (falling back to the executor's own values)
 * unless the executor is a control node, in which case both are zero. For non-control-node
 * executors the action configuration is stripped of comments and EL-evaluated first; an
 * ELEvaluationException is rethrown as a TRANSIENT ActionExecutorException, any other failure
 * in that step is recorded on the context and passed to {@code handleError}, and the start is
 * skipped. Otherwise the executor is started, the follow-up (ActionEndXCommand, synchronous
 * end for control nodes, or a notification) is arranged, and an SLA STARTED event is staged.
 * <p>
 * An ActionExecutorException is dispatched on its error type. In every case the staged
 * inserts/updates are written in one batch in the {@code finally} block.
 *
 * @return always {@code null}; the declared ActionExecutorContext is never returned
 * @throws CommandException if the batch write raises a JPAExecutorException
 */
@Override
protected ActionExecutorContext execute() throws CommandException {
    LOG.debug("STARTED ActionStartXCommand for wf actionId=" + actionId);
    Configuration conf = wfJob.getWorkflowInstance().getConf();
    int maxRetries = 0;
    long retryInterval = 0;
    boolean execSynchronous = false;
    if (!(executor instanceof ControlNodeActionExecutor)) {
        maxRetries = conf.getInt(OozieClient.ACTION_MAX_RETRIES, executor.getMaxRetries());
        retryInterval = conf.getLong(OozieClient.ACTION_RETRY_INTERVAL, executor.getRetryInterval());
    }
    executor.setMaxRetries(maxRetries);
    executor.setRetryInterval(retryInterval);
    try {
        boolean isRetry = false;
        if (wfAction.getStatus() == WorkflowActionBean.Status.START_RETRY || wfAction.getStatus() == WorkflowActionBean.Status.START_MANUAL) {
            isRetry = true;
            prepareForRetry(wfAction);
        }
        boolean isUserRetry = false;
        if (wfAction.getStatus() == WorkflowActionBean.Status.USER_RETRY) {
            isUserRetry = true;
            prepareForRetry(wfAction);
        }
        context = getContext(isRetry, isUserRetry);
        // Set when configuration resolution failed and was already handled; the start is then skipped.
        boolean caught = false;
        try {
            if (!(executor instanceof ControlNodeActionExecutor)) {
                String tmpActionConf = XmlUtils.removeComments(wfAction.getConf());
                String actionConf = context.getELEvaluator().evaluate(tmpActionConf, String.class);
                wfAction.setConf(actionConf);
                LOG.debug("Start, name [{0}] type [{1}] configuration{E}{E}{2}{E}", wfAction.getName(), wfAction.getType(), actionConf);
            }
        } catch (ELEvaluationException ex) {
            caught = true;
            throw new ActionExecutorException(ActionExecutorException.ErrorType.TRANSIENT, EL_EVAL_ERROR, ex.getMessage(), ex);
        } catch (ELException ex) {
            caught = true;
            context.setErrorInfo(EL_ERROR, ex.getMessage());
            // The template needs a placeholder, otherwise the message parameter is never rendered.
            LOG.warn("ELException in ActionStartXCommand: {0}", ex.getMessage(), ex);
            handleError(context, wfJob, wfAction);
        } catch (org.jdom.JDOMException je) {
            caught = true;
            context.setErrorInfo("ParsingError", je.getMessage());
            LOG.warn("JDOMException in ActionStartXCommand: {0}", je.getMessage(), je);
            handleError(context, wfJob, wfAction);
        } catch (Exception ex) {
            caught = true;
            context.setErrorInfo(EL_ERROR, ex.getMessage());
            LOG.warn("Exception in ActionStartXCommand: {0}", ex.getMessage(), ex);
            handleError(context, wfJob, wfAction);
        }
        if (!caught) {
            wfAction.setErrorInfo(null, null);
            incrActionCounter(wfAction.getType(), 1);
            LOG.info("Start action [{0}] with user-retry state : userRetryCount [{1}], userRetryMax [{2}], userRetryInterval [{3}]", wfAction.getId(), wfAction.getUserRetryCount(), wfAction.getUserRetryMax(), wfAction.getUserRetryInterval());
            Instrumentation.Cron cron = new Instrumentation.Cron();
            cron.start();
            // do not override starttime for retries
            if (wfAction.getStartTime() == null) {
                context.setStartTime();
            }
            context.setVar(JobUtils.getRetryKey(wfAction, JsonTags.WORKFLOW_ACTION_START_TIME), String.valueOf(new Date().getTime()));
            executor.start(context, wfAction);
            cron.stop();
            FaultInjection.activate("org.apache.oozie.command.SkipCommitFaultInjection");
            addActionCron(wfAction.getType(), cron);
            wfAction.setRetries(0);
            if (wfAction.isExecutionComplete()) {
                if (!context.isExecuted()) {
                    LOG.warn(XLog.OPS, "Action Completed, ActionExecutor [{0}] must call setExecutionData()", executor.getType());
                    wfAction.setErrorInfo(EXEC_DATA_MISSING, "Execution Complete, but Execution Data Missing from Action");
                    failJob(context);
                } else {
                    wfAction.setPending();
                    if (!(executor instanceof ControlNodeActionExecutor)) {
                        queue(new ActionEndXCommand(wfAction.getId(), wfAction.getType()));
                    } else {
                        // Control nodes are ended synchronously in the finally block below.
                        execSynchronous = true;
                    }
                }
            } else {
                if (!context.isStarted()) {
                    LOG.warn(XLog.OPS, "Action Started, ActionExecutor [{0}] must call setStartData()", executor.getType());
                    wfAction.setErrorInfo(START_DATA_MISSING, "Execution Started, but Start Data Missing from Action");
                    failJob(context);
                } else {
                    queue(new WorkflowNotificationXCommand(wfJob, wfAction));
                }
            }
            LOG.info(XLog.STD, "[***" + wfAction.getId() + "***]" + "Action status=" + wfAction.getStatusStr());
            updateList.add(new UpdateEntry<WorkflowActionQuery>(WorkflowActionQuery.UPDATE_ACTION_START, wfAction));
            updateJobLastModified();
            // Add SLA status event (STARTED) for WF_ACTION
            SLAEventBean slaEvent = SLADbXOperations.createStatusEvent(wfAction.getSlaXml(), wfAction.getId(), Status.STARTED, SlaAppType.WORKFLOW_ACTION);
            if (slaEvent != null) {
                insertList.add(slaEvent);
            }
            LOG.info(XLog.STD, "[***" + wfAction.getId() + "***]" + "Action updated in DB!");
        }
    } catch (ActionExecutorException ex) {
        LOG.warn("Error starting action [{0}]. ErrorType [{1}], ErrorCode [{2}], Message [{3}]", wfAction.getName(), ex.getErrorType(), ex.getErrorCode(), ex.getMessage(), ex);
        wfAction.setErrorInfo(ex.getErrorCode(), ex.getMessage());
        switch (ex.getErrorType()) {
            case TRANSIENT:
                // Fall back to manual handling when the transient retry is not accepted.
                if (!handleTransient(context, executor, WorkflowAction.Status.START_RETRY)) {
                    handleNonTransient(context, executor, WorkflowAction.Status.START_MANUAL);
                    wfAction.setPendingAge(new Date());
                    wfAction.setRetries(0);
                    wfAction.setStartTime(null);
                }
                break;
            case NON_TRANSIENT:
                handleNonTransient(context, executor, WorkflowAction.Status.START_MANUAL);
                break;
            case ERROR:
                handleError(context, executor, WorkflowAction.Status.ERROR.toString(), true, WorkflowAction.Status.DONE);
                break;
            case FAILED:
                try {
                    failJob(context);
                    endWF();
                    SLAEventBean slaEvent1 = SLADbXOperations.createStatusEvent(wfAction.getSlaXml(), wfAction.getId(), Status.FAILED, SlaAppType.WORKFLOW_ACTION);
                    if (slaEvent1 != null) {
                        insertList.add(slaEvent1);
                    }
                } catch (XException x) {
                    // Deliberately best-effort, but log the message and the exception itself:
                    // previously neither was rendered, so the failure left no trace.
                    LOG.warn("ActionStartXCommand - case:FAILED {0}", x.getMessage(), x);
                }
                break;
        }
        updateList.add(new UpdateEntry<WorkflowActionQuery>(WorkflowActionQuery.UPDATE_ACTION_START, wfAction));
        updateJobLastModified();
    } finally {
        try {
            BatchQueryExecutor.getInstance().executeBatchInsertUpdateDelete(insertList, updateList, null);
            if (!(executor instanceof ControlNodeActionExecutor) && EventHandlerService.isEnabled()) {
                generateEvent(wfAction, wfJob.getUser());
            }
            if (execSynchronous) {
                // Changing to synchronous call from asynchronous queuing to prevent
                // undue delay from ::start:: to action due to queuing
                callActionEnd();
            }
        } catch (JPAExecutorException e) {
            throw new CommandException(e);
        }
    }
    LOG.debug("ENDED ActionStartXCommand for wf actionId=" + actionId + ", jobId=" + jobId);
    return null;
}
use of org.apache.oozie.SLAEventBean in project oozie by apache.
the class SubmitXCommand method writeSLARegistration.
/**
 * Registers SLA information for a workflow job and for each of its actions that defines one.
 * <p>
 * When {@code slaXml} is non-empty it is parsed, a registration event for the job is staged in
 * the insert list (if one is produced) and the job is also registered through
 * {@code SLAOperations}. Then every {@code action} child of {@code eWfJob} that has an SLA
 * element gets that element resolved with {@code evalSla}, re-parsed and registered under a
 * child id generated from the job id and the action name.
 *
 * @param eWfJob workflow definition element whose {@code action} children are scanned
 * @param slaXml SLA XML of the workflow job; {@code null} or empty skips the job registration
 * @param jobId id of the workflow job
 * @param parentId parent id passed to the job-level registration
 * @param user user passed to the registrations
 * @param group group passed to the job-level registration event
 * @param appName application name passed to the registrations
 * @param log logger handed to the SLA operations
 * @param evalSla EL evaluator used to resolve each action's SLA element
 * @throws CommandException with error code E1007 wrapping any exception raised while registering
 */
private void writeSLARegistration(Element eWfJob, String slaXml, String jobId, String parentId, String user, String group, String appName, XLog log, ELEvaluator evalSla) throws CommandException {
    try {
        if (slaXml != null && slaXml.length() > 0) {
            Element eSla = XmlUtils.parseXml(slaXml);
            SLAEventBean slaEvent = SLADbOperations.createSlaRegistrationEvent(eSla, jobId, SlaAppType.WORKFLOW_JOB, user, group, log);
            if (slaEvent != null) {
                insertList.add(slaEvent);
            }
            // insert into new table
            SLAOperations.createSlaRegistrationEvent(eSla, jobId, parentId, AppType.WORKFLOW_JOB, user, appName, log, false);
        }
        // Add sla for wf actions
        // The cast is unchecked because the element API hands back an untyped list here.
        @SuppressWarnings("unchecked")
        List<Element> actions = (List<Element>) eWfJob.getChildren("action", eWfJob.getNamespace());
        for (Element action : actions) {
            Element actionSla = XmlUtils.getSLAElement(action);
            if (actionSla != null) {
                String actionSlaXml = SubmitXCommand.resolveSla(actionSla, evalSla);
                actionSla = XmlUtils.parseXml(actionSlaXml);
                String actionId = Services.get().get(UUIDService.class).generateChildId(jobId, action.getAttributeValue("name") + "");
                SLAOperations.createSlaRegistrationEvent(actionSla, actionId, jobId, AppType.WORKFLOW_ACTION, user, appName, log, false);
            }
        }
    } catch (Exception e) {
        // No printStackTrace(): the exception travels as the cause of the CommandException,
        // so the caller's logging keeps the full stack trace without writing to stderr.
        throw new CommandException(ErrorCode.E1007, "workflow " + jobId, e.getMessage(), e);
    }
}
Aggregations