Search in sources :

Example 1 with TezException

use of org.apache.tez.dag.api.TezException in project hive by apache.

the class TezJobMonitor method monitorExecution.

public int monitorExecution() {
    boolean done = false;
    boolean success = false;
    int failedCounter = 0;
    int rc = 0;
    DAGStatus status = null;
    Map<String, Progress> vertexProgressMap = null;
    long monitorStartTime = System.currentTimeMillis();
    synchronized (shutdownList) {
        shutdownList.add(dagClient);
    }
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_RUN_DAG);
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_SUBMIT_TO_RUNNING);
    DAGStatus.State lastState = null;
    boolean running = false;
    while (true) {
        try {
            if (context != null) {
                context.checkHeartbeaterLockException();
            }
            status = dagClient.getDAGStatus(new HashSet<StatusGetOpts>(), CHECK_INTERVAL);
            vertexProgressMap = status.getVertexProgress();
            DAGStatus.State state = status.getState();
            if (state != lastState || state == RUNNING) {
                lastState = state;
                switch(state) {
                    case SUBMITTED:
                        console.printInfo("Status: Submitted");
                        break;
                    case INITING:
                        console.printInfo("Status: Initializing");
                        this.executionStartTime = System.currentTimeMillis();
                        break;
                    case RUNNING:
                        if (!running) {
                            perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_SUBMIT_TO_RUNNING);
                            console.printInfo("Status: Running (" + dagClient.getExecutionContext() + ")\n");
                            this.executionStartTime = System.currentTimeMillis();
                            running = true;
                        }
                        updateFunction.update(status, vertexProgressMap);
                        break;
                    case SUCCEEDED:
                        if (!running) {
                            this.executionStartTime = monitorStartTime;
                        }
                        updateFunction.update(status, vertexProgressMap);
                        success = true;
                        running = false;
                        done = true;
                        break;
                    case KILLED:
                        if (!running) {
                            this.executionStartTime = monitorStartTime;
                        }
                        updateFunction.update(status, vertexProgressMap);
                        console.printInfo("Status: Killed");
                        running = false;
                        done = true;
                        rc = 1;
                        break;
                    case FAILED:
                    case ERROR:
                        if (!running) {
                            this.executionStartTime = monitorStartTime;
                        }
                        updateFunction.update(status, vertexProgressMap);
                        console.printError("Status: Failed");
                        running = false;
                        done = true;
                        rc = 2;
                        break;
                }
            }
        } catch (Exception e) {
            console.printInfo("Exception: " + e.getMessage());
            boolean isInterrupted = hasInterruptedException(e);
            if (isInterrupted || (++failedCounter % MAX_RETRY_INTERVAL / CHECK_INTERVAL == 0)) {
                try {
                    console.printInfo("Killing DAG...");
                    dagClient.tryKillDAG();
                } catch (IOException | TezException tezException) {
                // best effort
                }
                console.printError("Execution has failed. stack trace: " + ExceptionUtils.getStackTrace(e));
                rc = 1;
                done = true;
            } else {
                console.printInfo("Retrying...");
            }
        } finally {
            if (done) {
                if (rc != 0 && status != null) {
                    for (String diag : status.getDiagnostics()) {
                        console.printError(diag);
                        diagnostics.append(diag);
                    }
                }
                synchronized (shutdownList) {
                    shutdownList.remove(dagClient);
                }
                break;
            }
        }
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_RUN_DAG);
    printSummary(success, vertexProgressMap);
    return rc;
}
Also used : Progress(org.apache.tez.dag.api.client.Progress) DAGStatus(org.apache.tez.dag.api.client.DAGStatus) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) TezException(org.apache.tez.dag.api.TezException) HashSet(java.util.HashSet)

Example 2 with TezException

use of org.apache.tez.dag.api.TezException in project hive by apache.

the class TezSessionState method openInternal.

protected void openInternal(final HiveConf conf, Collection<String> additionalFiles, boolean isAsync, LogHelper console, Path scratchDir) throws IOException, LoginException, IllegalArgumentException, URISyntaxException, TezException {
    this.conf = conf;
    // TODO Why is the queue name set again. It has already been setup via setQueueName. Do only one of the two.
    String confQueueName = conf.get(TezConfiguration.TEZ_QUEUE_NAME);
    if (queueName != null && !queueName.equals(confQueueName)) {
        LOG.warn("Resetting a queue name that was already set: was " + queueName + ", now " + confQueueName);
    }
    this.queueName = confQueueName;
    this.doAsEnabled = conf.getBoolVar(HiveConf.ConfVars.HIVE_SERVER2_ENABLE_DOAS);
    final boolean llapMode = "llap".equalsIgnoreCase(HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_MODE));
    // TODO This - at least for the session pool - will always be the hive user. How does doAs above this affect things ?
    UserGroupInformation ugi = Utils.getUGI();
    user = ugi.getShortUserName();
    LOG.info("User of session id " + sessionId + " is " + user);
    // create the tez tmp dir
    tezScratchDir = scratchDir == null ? createTezDir(sessionId) : scratchDir;
    additionalFilesNotFromConf.clear();
    if (additionalFiles != null) {
        additionalFilesNotFromConf.addAll(additionalFiles);
    }
    refreshLocalResourcesFromConf(conf);
    // unless already installed on all the cluster nodes, we'll have to
    // localize hive-exec.jar as well.
    appJarLr = createJarLocalResource(utils.getExecJarPathLocal());
    // configuration for the application master
    final Map<String, LocalResource> commonLocalResources = new HashMap<String, LocalResource>();
    commonLocalResources.put(utils.getBaseName(appJarLr), appJarLr);
    for (LocalResource lr : localizedResources) {
        commonLocalResources.put(utils.getBaseName(lr), lr);
    }
    if (llapMode) {
        // localize llap client jars
        addJarLRByClass(LlapTaskSchedulerService.class, commonLocalResources);
        addJarLRByClass(LlapProtocolClientImpl.class, commonLocalResources);
        addJarLRByClass(LlapProtocolClientProxy.class, commonLocalResources);
        addJarLRByClassName("org.apache.hadoop.registry.client.api.RegistryOperations", commonLocalResources);
    }
    // Create environment for AM.
    Map<String, String> amEnv = new HashMap<String, String>();
    MRHelpers.updateEnvBasedOnMRAMEnv(conf, amEnv);
    // and finally we're ready to create and start the session
    // generate basic tez config
    final TezConfiguration tezConfig = new TezConfiguration(conf);
    // set up the staging directory to use
    tezConfig.set(TezConfiguration.TEZ_AM_STAGING_DIR, tezScratchDir.toUri().toString());
    conf.stripHiddenConfigurations(tezConfig);
    ServicePluginsDescriptor servicePluginsDescriptor;
    Credentials llapCredentials = null;
    if (llapMode) {
        if (UserGroupInformation.isSecurityEnabled()) {
            llapCredentials = new Credentials();
            llapCredentials.addToken(LlapTokenIdentifier.KIND_NAME, getLlapToken(user, tezConfig));
        }
        // TODO Change this to not serialize the entire Configuration - minor.
        UserPayload servicePluginPayload = TezUtils.createUserPayloadFromConf(tezConfig);
        // we need plugins to handle llap and uber mode
        servicePluginsDescriptor = ServicePluginsDescriptor.create(true, new TaskSchedulerDescriptor[] { TaskSchedulerDescriptor.create(LLAP_SERVICE, LLAP_SCHEDULER).setUserPayload(servicePluginPayload) }, new ContainerLauncherDescriptor[] { ContainerLauncherDescriptor.create(LLAP_SERVICE, LLAP_LAUNCHER) }, new TaskCommunicatorDescriptor[] { TaskCommunicatorDescriptor.create(LLAP_SERVICE, LLAP_TASK_COMMUNICATOR).setUserPayload(servicePluginPayload) });
    } else {
        servicePluginsDescriptor = ServicePluginsDescriptor.create(true);
    }
    // container prewarming. tell the am how many containers we need
    if (HiveConf.getBoolVar(conf, ConfVars.HIVE_PREWARM_ENABLED)) {
        int n = HiveConf.getIntVar(conf, ConfVars.HIVE_PREWARM_NUM_CONTAINERS);
        n = Math.max(tezConfig.getInt(TezConfiguration.TEZ_AM_SESSION_MIN_HELD_CONTAINERS, TezConfiguration.TEZ_AM_SESSION_MIN_HELD_CONTAINERS_DEFAULT), n);
        tezConfig.setInt(TezConfiguration.TEZ_AM_SESSION_MIN_HELD_CONTAINERS, n);
    }
    setupSessionAcls(tezConfig, conf);
    final TezClient session = TezClient.newBuilder("HIVE-" + sessionId, tezConfig).setIsSession(true).setLocalResources(commonLocalResources).setCredentials(llapCredentials).setServicePluginDescriptor(servicePluginsDescriptor).build();
    LOG.info("Opening new Tez Session (id: " + sessionId + ", scratch dir: " + tezScratchDir + ")");
    TezJobMonitor.initShutdownHook();
    if (!isAsync) {
        startSessionAndContainers(session, conf, commonLocalResources, tezConfig, false);
        this.session = session;
    } else {
        FutureTask<TezClient> sessionFuture = new FutureTask<>(new Callable<TezClient>() {

            @Override
            public TezClient call() throws Exception {
                try {
                    return startSessionAndContainers(session, conf, commonLocalResources, tezConfig, true);
                } catch (Throwable t) {
                    LOG.error("Failed to start Tez session", t);
                    throw (t instanceof Exception) ? (Exception) t : new Exception(t);
                }
            }
        });
        new Thread(sessionFuture, "Tez session start thread").start();
        // We assume here nobody will try to get session before open() returns.
        this.console = console;
        this.sessionFuture = sessionFuture;
    }
}
Also used : UserPayload(org.apache.tez.dag.api.UserPayload) TaskSchedulerDescriptor(org.apache.tez.serviceplugins.api.TaskSchedulerDescriptor) HashMap(java.util.HashMap) ServicePluginsDescriptor(org.apache.tez.serviceplugins.api.ServicePluginsDescriptor) LoginException(javax.security.auth.login.LoginException) URISyntaxException(java.net.URISyntaxException) TimeoutException(java.util.concurrent.TimeoutException) CancellationException(java.util.concurrent.CancellationException) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException) TezException(org.apache.tez.dag.api.TezException) ExecutionException(java.util.concurrent.ExecutionException) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) TaskCommunicatorDescriptor(org.apache.tez.serviceplugins.api.TaskCommunicatorDescriptor) TezClient(org.apache.tez.client.TezClient) ContainerLauncherDescriptor(org.apache.tez.serviceplugins.api.ContainerLauncherDescriptor) FutureTask(java.util.concurrent.FutureTask) Credentials(org.apache.hadoop.security.Credentials) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) TezConfiguration(org.apache.tez.dag.api.TezConfiguration)

Aggregations

IOException (java.io.IOException)2 TezException (org.apache.tez.dag.api.TezException)2 FileNotFoundException (java.io.FileNotFoundException)1 InterruptedIOException (java.io.InterruptedIOException)1 URISyntaxException (java.net.URISyntaxException)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 CancellationException (java.util.concurrent.CancellationException)1 ExecutionException (java.util.concurrent.ExecutionException)1 FutureTask (java.util.concurrent.FutureTask)1 TimeoutException (java.util.concurrent.TimeoutException)1 LoginException (javax.security.auth.login.LoginException)1 Credentials (org.apache.hadoop.security.Credentials)1 UserGroupInformation (org.apache.hadoop.security.UserGroupInformation)1 LocalResource (org.apache.hadoop.yarn.api.records.LocalResource)1 TezClient (org.apache.tez.client.TezClient)1 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)1 UserPayload (org.apache.tez.dag.api.UserPayload)1 DAGStatus (org.apache.tez.dag.api.client.DAGStatus)1 Progress (org.apache.tez.dag.api.client.Progress)1