Search in sources :

Example 1 with RUNNING

Use of org.apache.tez.dag.api.client.DAGStatus.State.RUNNING in the Apache Hive project.

Class TezJobMonitor, method monitorExecution.

/**
 * Polls the DAG status in a loop until the DAG reaches a terminal state or
 * monitoring gives up, reporting state transitions on the console.
 *
 * <p>Each iteration:
 * <ol>
 *   <li>if a {@code context} is available, checks it for a heartbeater lock
 *       exception and refreshes the {@code WmContext} from it;</li>
 *   <li>fetches the DAG status through {@code dagClient}, requesting counters
 *       only when the {@code WmContext} has a non-empty set of subscribed
 *       counters;</li>
 *   <li>publishes the current counter values to the {@code WmContext};</li>
 *   <li>dispatches on the DAG state (always for {@code RUNNING}, otherwise only
 *       when the state changed since the previous iteration).</li>
 * </ol>
 *
 * <p>Any {@link Exception} raised inside an iteration is printed and counted.
 * Monitoring aborts (attempting to kill the DAG first) when the exception is
 * an interruption according to {@code hasInterruptedException}, or when at
 * least {@code MAX_RETRY_FAILURES} consecutive failures have occurred and the
 * failure timer exceeds {@code MAX_RETRY_INTERVAL}; otherwise the iteration is
 * retried.
 *
 * @return {@code 0} if the DAG succeeded, {@code 1} if it was killed or
 *         monitoring was aborted after an exception, {@code 2} if the DAG
 *         ended in {@code FAILED} or {@code ERROR}
 */
public int monitorExecution() {
    boolean done = false;
    boolean success = false;
    int failedCounter = 0;
    final StopWatch failureTimer = new StopWatch();
    int rc = 0;
    DAGStatus status = null;
    Map<String, Progress> vertexProgressMap = null;
    long monitorStartTime = System.currentTimeMillis();
    // Register the client while monitoring is in progress; it is removed again
    // in the finally block once the loop is done.
    synchronized (shutdownList) {
        shutdownList.add(dagClient);
    }
    perfLogger.perfLogBegin(CLASS_NAME, PerfLogger.TEZ_RUN_DAG);
    perfLogger.perfLogBegin(CLASS_NAME, PerfLogger.TEZ_SUBMIT_TO_RUNNING);
    DAGStatus.State lastState = null;
    boolean running = false;
    long checkInterval = HiveConf.getTimeVar(hiveConf, HiveConf.ConfVars.TEZ_DAG_STATUS_CHECK_INTERVAL, TimeUnit.MILLISECONDS);
    WmContext wmContext = null;
    while (true) {
        try {
            // context may be null; only touch it (including getWmContext) under
            // the guard. Previously getWmContext() was called unconditionally,
            // which threw a NullPointerException on every iteration when
            // context was null.
            if (context != null) {
                context.checkHeartbeaterLockException();
                wmContext = context.getWmContext();
            }
            // Only ask for counters when somebody subscribed to them.
            EnumSet<StatusGetOpts> opts = null;
            if (wmContext != null) {
                Set<String> desiredCounters = wmContext.getSubscribedCounters();
                if (desiredCounters != null && !desiredCounters.isEmpty()) {
                    opts = EnumSet.of(StatusGetOpts.GET_COUNTERS);
                }
            }
            status = dagClient.getDAGStatus(opts, checkInterval);
            vertexProgressMap = status.getVertexProgress();
            // Vertex names with spaces replaced by underscores, passed to getCounterValues.
            List<String> vertexNames = vertexProgressMap.keySet().stream().map(k -> k.replaceAll(" ", "_")).collect(Collectors.toList());
            if (wmContext != null) {
                // Re-read the subscription after the status call.
                Set<String> desiredCounters = wmContext.getSubscribedCounters();
                TezCounters dagCounters = status.getDAGCounters();
                // if initial counters exists, merge it with dag counters to get aggregated view
                TezCounters mergedCounters = counters == null ? dagCounters : Utils.mergeTezCounters(dagCounters, counters);
                if (mergedCounters != null && desiredCounters != null && !desiredCounters.isEmpty()) {
                    Map<String, Long> currentCounters = getCounterValues(mergedCounters, vertexNames, vertexProgressMap, desiredCounters, done);
                    LOG.debug("Requested DAG status. checkInterval: {}. currentCounters: {}", checkInterval, currentCounters);
                    wmContext.setCurrentCounters(currentCounters);
                }
            }
            DAGStatus.State state = status.getState();
            // AM is responsive again (recovery?)
            failedCounter = 0;
            failureTimer.reset();
            // RUNNING is handled on every iteration so progress keeps updating;
            // every other state is handled only on a transition.
            if (state != lastState || state == RUNNING) {
                lastState = state;
                switch(state) {
                    case SUBMITTED:
                        console.printInfo("Status: Submitted");
                        break;
                    case INITING:
                        console.printInfo("Status: Initializing");
                        this.executionStartTime = System.currentTimeMillis();
                        break;
                    case RUNNING:
                        if (!running) {
                            // First time RUNNING is observed.
                            perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.TEZ_SUBMIT_TO_RUNNING);
                            console.printInfo("Status: Running (" + dagClient.getExecutionContext() + ")\n");
                            this.executionStartTime = System.currentTimeMillis();
                            running = true;
                        }
                        updateFunction.update(status, vertexProgressMap);
                        break;
                    case SUCCEEDED:
                        if (!running) {
                            // RUNNING was never observed; fall back to the monitor start time.
                            this.executionStartTime = monitorStartTime;
                        }
                        updateFunction.update(status, vertexProgressMap);
                        success = true;
                        running = false;
                        done = true;
                        break;
                    case KILLED:
                        if (!running) {
                            this.executionStartTime = monitorStartTime;
                        }
                        updateFunction.update(status, vertexProgressMap);
                        console.printInfo("Status: Killed");
                        running = false;
                        done = true;
                        rc = 1;
                        break;
                    case FAILED:
                    case ERROR:
                        if (!running) {
                            this.executionStartTime = monitorStartTime;
                        }
                        updateFunction.update(status, vertexProgressMap);
                        console.printError("Status: Failed");
                        running = false;
                        done = true;
                        rc = 2;
                        break;
                }
            }
        } catch (Exception e) {
            console.printInfo("Exception: " + e.getMessage());
            boolean isInterrupted = hasInterruptedException(e);
            if (failedCounter == 0) {
                // First failure of a streak: start measuring how long it lasts.
                failureTimer.reset();
                failureTimer.start();
            }
            if (isInterrupted || (++failedCounter >= MAX_RETRY_FAILURES && failureTimer.now(TimeUnit.MILLISECONDS) > MAX_RETRY_INTERVAL)) {
                try {
                    if (isInterrupted) {
                        console.printInfo("Killing DAG...");
                    } else {
                        console.printInfo(String.format("Killing DAG... after %d seconds", failureTimer.now(TimeUnit.SECONDS)));
                    }
                    dagClient.tryKillDAG();
                } catch (IOException | TezException ignored) {
                    // best effort: monitoring is being aborted regardless of
                    // whether the kill request went through
                }
                console.printError("Execution has failed. stack trace: " + ExceptionUtils.getStackTrace(e));
                rc = 1;
                done = true;
            } else {
                console.printInfo("Retrying...");
            }
        } finally {
            if (done) {
                // Single completion notification for both the normal and the
                // exceptional path; this block runs right after either of them.
                if (wmContext != null) {
                    wmContext.setQueryCompleted(true);
                }
                if (rc != 0 && status != null) {
                    for (String diag : status.getDiagnostics()) {
                        console.printError(diag);
                        diagnostics.append(diag);
                    }
                }
                synchronized (shutdownList) {
                    shutdownList.remove(dagClient);
                }
                // NOTE: leaving the loop from inside finally also discards any
                // throwable that is still in flight at this point.
                break;
            }
        }
    }
    perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.TEZ_RUN_DAG);
    printSummary(success, vertexProgressMap);
    return rc;
}
Also used : DAGClient(org.apache.tez.dag.api.client.DAGClient) StatusGetOpts(org.apache.tez.dag.api.client.StatusGetOpts) TezCounter(org.apache.tez.common.counters.TezCounter) ConfVars(org.apache.hadoop.hive.conf.HiveConf.ConfVars) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) InterruptedIOException(java.io.InterruptedIOException) TimeCounterLimit(org.apache.hadoop.hive.ql.wm.TimeCounterLimit) VertexCounterLimit(org.apache.hadoop.hive.ql.wm.VertexCounterLimit) Utilities(org.apache.hadoop.hive.ql.exec.Utilities) DAGStatus(org.apache.tez.dag.api.client.DAGStatus) ProgressMonitor(org.apache.hadoop.hive.common.log.ProgressMonitor) Map(java.util.Map) InPlaceUpdate(org.apache.hadoop.hive.common.log.InPlaceUpdate) Context(org.apache.hadoop.hive.ql.Context) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) CounterGroup(org.apache.tez.common.counters.CounterGroup) LinkedList(java.util.LinkedList) EnumSet(java.util.EnumSet) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) Progress(org.apache.tez.dag.api.client.Progress) Logger(org.slf4j.Logger) StringWriter(java.io.StringWriter) HiveConf(org.apache.hadoop.hive.conf.HiveConf) Set(java.util.Set) StopWatch(org.apache.tez.util.StopWatch) IOException(java.io.IOException) DAG(org.apache.tez.dag.api.DAG) Collectors(java.util.stream.Collectors) SessionState(org.apache.hadoop.hive.ql.session.SessionState) RUNNING(org.apache.tez.dag.api.client.DAGStatus.State.RUNNING) ShutdownHookManager(org.apache.hive.common.util.ShutdownHookManager) LogHelper(org.apache.hadoop.hive.ql.session.SessionState.LogHelper) TezException(org.apache.tez.dag.api.TezException) TimeUnit(java.util.concurrent.TimeUnit) TezCounters(org.apache.tez.common.counters.TezCounters) List(java.util.List) WmContext(org.apache.hadoop.hive.ql.wm.WmContext) Utils(org.apache.hadoop.hive.ql.exec.tez.Utils) Preconditions(com.google.common.base.Preconditions) TezSessionPoolManager(org.apache.hadoop.hive.ql.exec.tez.TezSessionPoolManager) 
ExceptionUtils(org.apache.commons.lang3.exception.ExceptionUtils) Progress(org.apache.tez.dag.api.client.Progress) WmContext(org.apache.hadoop.hive.ql.wm.WmContext) TezCounters(org.apache.tez.common.counters.TezCounters) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) TezException(org.apache.tez.dag.api.TezException) StopWatch(org.apache.tez.util.StopWatch) StatusGetOpts(org.apache.tez.dag.api.client.StatusGetOpts) DAGStatus(org.apache.tez.dag.api.client.DAGStatus)

Aggregations

Preconditions (com.google.common.base.Preconditions)1 IOException (java.io.IOException)1 InterruptedIOException (java.io.InterruptedIOException)1 StringWriter (java.io.StringWriter)1 EnumSet (java.util.EnumSet)1 HashMap (java.util.HashMap)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 Map (java.util.Map)1 Set (java.util.Set)1 TimeUnit (java.util.concurrent.TimeUnit)1 Collectors (java.util.stream.Collectors)1 ExceptionUtils (org.apache.commons.lang3.exception.ExceptionUtils)1 InPlaceUpdate (org.apache.hadoop.hive.common.log.InPlaceUpdate)1 ProgressMonitor (org.apache.hadoop.hive.common.log.ProgressMonitor)1 HiveConf (org.apache.hadoop.hive.conf.HiveConf)1 ConfVars (org.apache.hadoop.hive.conf.HiveConf.ConfVars)1 Context (org.apache.hadoop.hive.ql.Context)1 Utilities (org.apache.hadoop.hive.ql.exec.Utilities)1 TezSessionPoolManager (org.apache.hadoop.hive.ql.exec.tez.TezSessionPoolManager)1