
Example 36 with PerfLogger

use of org.apache.hadoop.hive.ql.log.PerfLogger in project hive by apache.

the class Driver method execute.

private void execute() throws CommandProcessorResponse {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.DRIVER_EXECUTE);
    boolean noName = StringUtils.isEmpty(conf.get(MRJobConfig.JOB_NAME));
    int maxlen = conf.getIntVar(HiveConf.ConfVars.HIVEJOBNAMELENGTH);
    Metrics metrics = MetricsFactory.getInstance();
    String queryId = queryState.getQueryId();
    // Get the query string from the conf file as the compileInternal() method might
    // hide sensitive information during query redaction.
    String queryStr = conf.getQueryString();
    lDrvState.stateLock.lock();
    try {
        // if the query is not in the compiled state, or in the executing state that is carried over
        // from a combined compile/execute in runInternal, throw an error
        if (lDrvState.driverState != DriverState.COMPILED && lDrvState.driverState != DriverState.EXECUTING) {
            SQLState = "HY008";
            errorMessage = "FAILED: unexpected driverstate: " + lDrvState + ", for query " + queryStr;
            console.printError(errorMessage);
            throw createProcessorResponse(1000);
        } else {
            lDrvState.driverState = DriverState.EXECUTING;
        }
    } finally {
        lDrvState.stateLock.unlock();
    }
    maxthreads = HiveConf.getIntVar(conf, HiveConf.ConfVars.EXECPARALLETHREADNUMBER);
    HookContext hookContext = null;
    // Whether any error occurred during query execution. Used for the query lifetime hooks.
    boolean executionError = false;
    try {
        LOG.info("Executing command(queryId=" + queryId + "): " + queryStr);
        // compile and execute can get called from different threads in case of HS2
        // so clear timing in this thread's Hive object before proceeding.
        Hive.get().clearMetaCallTiming();
        plan.setStarted();
        if (SessionState.get() != null) {
            SessionState.get().getHiveHistory().startQuery(queryStr, queryId);
            SessionState.get().getHiveHistory().logPlanProgress(plan);
        }
        resStream = null;
        SessionState ss = SessionState.get();
        hookContext = new PrivateHookContext(plan, queryState, ctx.getPathToCS(), SessionState.get().getUserName(), ss.getUserIpAddress(), InetAddress.getLocalHost().getHostAddress(), operationId, ss.getSessionId(), Thread.currentThread().getName(), ss.isHiveServerQuery(), perfLogger, queryInfo, ctx);
        hookContext.setHookType(HookContext.HookType.PRE_EXEC_HOOK);
        hookRunner.runPreHooks(hookContext);
        // Trigger query hooks before query execution.
        hookRunner.runBeforeExecutionHook(queryStr, hookContext);
        setQueryDisplays(plan.getRootTasks());
        int mrJobs = Utilities.getMRTasks(plan.getRootTasks()).size();
        int jobs = mrJobs + Utilities.getTezTasks(plan.getRootTasks()).size() + Utilities.getSparkTasks(plan.getRootTasks()).size();
        if (jobs > 0) {
            logMrWarning(mrJobs);
            console.printInfo("Query ID = " + queryId);
            console.printInfo("Total jobs = " + jobs);
        }
        if (SessionState.get() != null) {
            SessionState.get().getHiveHistory().setQueryProperty(queryId, Keys.QUERY_NUM_TASKS, String.valueOf(jobs));
            SessionState.get().getHiveHistory().setIdToTableMap(plan.getIdToTableNameMap());
        }
        String jobname = Utilities.abbreviate(queryStr, maxlen - 6);
        // A runtime that launches runnable tasks as separate threads through TaskRunners.
        // As soon as a task is runnable, it is put in a queue.
        // At any time, at most maxthreads tasks can be running.
        // The main thread polls the TaskRunners to check whether they have finished.
        checkInterrupted("before running tasks.", hookContext, perfLogger);
        DriverContext driverCxt = new DriverContext(ctx);
        driverCxt.prepare(plan);
        ctx.setHDFSCleanup(true);
        // for canceling the query (should be bound to session?)
        this.driverCxt = driverCxt;
        SessionState.get().setMapRedStats(new LinkedHashMap<>());
        SessionState.get().setStackTraces(new HashMap<>());
        SessionState.get().setLocalMapRedErrors(new HashMap<>());
        // Add root Tasks to runnable
        for (Task<? extends Serializable> tsk : plan.getRootTasks()) {
            // Root tasks must not have parent tasks; launching a task whose parents have not run could produce incorrect results.
            assert tsk.getParentTasks() == null || tsk.getParentTasks().isEmpty();
            driverCxt.addToRunnable(tsk);
            if (metrics != null) {
                tsk.updateTaskMetrics(metrics);
            }
        }
        preExecutionCacheActions();
        perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.RUN_TASKS);
        // Loop while you either have tasks running, or tasks queued up
        while (driverCxt.isRunning()) {
            // Launch up to maxthreads tasks
            Task<? extends Serializable> task;
            while ((task = driverCxt.getRunnable(maxthreads)) != null) {
                TaskRunner runner = launchTask(task, queryId, noName, jobname, jobs, driverCxt);
                if (!runner.isRunning()) {
                    break;
                }
            }
            // poll the Tasks to see which one completed
            TaskRunner tskRun = driverCxt.pollFinished();
            if (tskRun == null) {
                continue;
            }
            /*
              This should be removed eventually. HIVE-17814 gives a more detailed explanation of
              what is happening, and HIVE-17815 explains why this is done. Briefly: for replication
              the graph is huge, so memory pressure would be severe if we kept a lot of references around.
            */
            String opName = plan.getOperationName();
            boolean isReplicationOperation = opName.equals(HiveOperation.REPLDUMP.getOperationName()) || opName.equals(HiveOperation.REPLLOAD.getOperationName());
            if (!isReplicationOperation) {
                hookContext.addCompleteTask(tskRun);
            }
            queryDisplay.setTaskResult(tskRun.getTask().getId(), tskRun.getTaskResult());
            Task<? extends Serializable> tsk = tskRun.getTask();
            TaskResult result = tskRun.getTaskResult();
            int exitVal = result.getExitVal();
            checkInterrupted("when checking the execution result.", hookContext, perfLogger);
            if (exitVal != 0) {
                Task<? extends Serializable> backupTask = tsk.getAndInitBackupTask();
                if (backupTask != null) {
                    setErrorMsgAndDetail(exitVal, result.getTaskError(), tsk);
                    console.printError(errorMessage);
                    errorMessage = "ATTEMPT: Execute BackupTask: " + backupTask.getClass().getName();
                    console.printError(errorMessage);
                    // add backup task to runnable
                    if (DriverContext.isLaunchable(backupTask)) {
                        driverCxt.addToRunnable(backupTask);
                    }
                    continue;
                } else {
                    setErrorMsgAndDetail(exitVal, result.getTaskError(), tsk);
                    if (driverCxt.isShutdown()) {
                        errorMessage = "FAILED: Operation cancelled. " + errorMessage;
                    }
                    invokeFailureHooks(perfLogger, hookContext, errorMessage + Strings.nullToEmpty(tsk.getDiagnosticsMessage()), result.getTaskError());
                    SQLState = "08S01";
                    // Override the generic SQLState based on the ErrorMsg set in HiveException.
                    if (result.getTaskError() instanceof HiveException) {
                        ErrorMsg errorMsg = ((HiveException) result.getTaskError()).getCanonicalErrorMsg();
                        if (errorMsg != ErrorMsg.GENERIC_ERROR) {
                            SQLState = errorMsg.getSQLState();
                        }
                    }
                    console.printError(errorMessage);
                    driverCxt.shutdown();
                    // in case we decided to run everything in local mode, restore
                    // the jobtracker setting to its initial value
                    ctx.restoreOriginalTracker();
                    throw createProcessorResponse(exitVal);
                }
            }
            driverCxt.finished(tskRun);
            if (SessionState.get() != null) {
                SessionState.get().getHiveHistory().setTaskProperty(queryId, tsk.getId(), Keys.TASK_RET_CODE, String.valueOf(exitVal));
                SessionState.get().getHiveHistory().endTask(queryId, tsk);
            }
            if (tsk.getChildTasks() != null) {
                for (Task<? extends Serializable> child : tsk.getChildTasks()) {
                    if (DriverContext.isLaunchable(child)) {
                        driverCxt.addToRunnable(child);
                    }
                }
            }
        }
        perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.RUN_TASKS);
        postExecutionCacheActions();
        // in case we decided to run everything in local mode, restore
        // the jobtracker setting to its initial value
        ctx.restoreOriginalTracker();
        if (driverCxt.isShutdown()) {
            SQLState = "HY008";
            errorMessage = "FAILED: Operation cancelled";
            invokeFailureHooks(perfLogger, hookContext, errorMessage, null);
            console.printError(errorMessage);
            throw createProcessorResponse(1000);
        }
        // Remove incomplete outputs.
        // Some incomplete outputs may have been added at the beginning, e.g. for dynamic partitions;
        // remove them here.
        HashSet<WriteEntity> remOutputs = new LinkedHashSet<WriteEntity>();
        for (WriteEntity output : plan.getOutputs()) {
            if (!output.isComplete()) {
                remOutputs.add(output);
            }
        }
        for (WriteEntity output : remOutputs) {
            plan.getOutputs().remove(output);
        }
        hookContext.setHookType(HookContext.HookType.POST_EXEC_HOOK);
        hookRunner.runPostExecHooks(hookContext);
        if (SessionState.get() != null) {
            SessionState.get().getHiveHistory().setQueryProperty(queryId, Keys.QUERY_RET_CODE, String.valueOf(0));
            SessionState.get().getHiveHistory().printRowCount(queryId);
        }
        releasePlan(plan);
    } catch (CommandProcessorResponse cpr) {
        executionError = true;
        throw cpr;
    } catch (Throwable e) {
        executionError = true;
        checkInterrupted("during query execution: \n" + e.getMessage(), hookContext, perfLogger);
        ctx.restoreOriginalTracker();
        if (SessionState.get() != null) {
            SessionState.get().getHiveHistory().setQueryProperty(queryId, Keys.QUERY_RET_CODE, String.valueOf(12));
        }
        // TODO: do better with handling types of Exception here
        errorMessage = "FAILED: Hive Internal Error: " + Utilities.getNameMessage(e);
        if (hookContext != null) {
            try {
                invokeFailureHooks(perfLogger, hookContext, errorMessage, e);
            } catch (Exception t) {
                LOG.warn("Failed to invoke failure hook", t);
            }
        }
        SQLState = "08S01";
        downstreamError = e;
        console.printError(errorMessage + "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
        throw createProcessorResponse(12);
    } finally {
        // Trigger query hooks after query completes its execution.
        try {
            hookRunner.runAfterExecutionHook(queryStr, hookContext, executionError);
        } catch (Exception e) {
            LOG.warn("Failed when invoking query after execution hook", e);
        }
        if (SessionState.get() != null) {
            SessionState.get().getHiveHistory().endQuery(queryId);
        }
        if (noName) {
            conf.set(MRJobConfig.JOB_NAME, "");
        }
        double duration = perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.DRIVER_EXECUTE) / 1000.00;
        ImmutableMap<String, Long> executionHMSTimings = dumpMetaCallTimingWithoutEx("execution");
        queryDisplay.setHmsTimings(QueryDisplay.Phase.EXECUTION, executionHMSTimings);
        Map<String, MapRedStats> stats = SessionState.get().getMapRedStats();
        if (stats != null && !stats.isEmpty()) {
            long totalCpu = 0;
            console.printInfo("MapReduce Jobs Launched: ");
            for (Map.Entry<String, MapRedStats> entry : stats.entrySet()) {
                console.printInfo("Stage-" + entry.getKey() + ": " + entry.getValue());
                totalCpu += entry.getValue().getCpuMSec();
            }
            console.printInfo("Total MapReduce CPU Time Spent: " + Utilities.formatMsecToStr(totalCpu));
        }
        lDrvState.stateLock.lock();
        try {
            lDrvState.driverState = executionError ? DriverState.ERROR : DriverState.EXECUTED;
        } finally {
            lDrvState.stateLock.unlock();
        }
        if (lDrvState.isAborted()) {
            LOG.info("Executing command(queryId=" + queryId + ") has been interrupted after " + duration + " seconds");
        } else {
            LOG.info("Completed executing command(queryId=" + queryId + "); Time taken: " + duration + " seconds");
        }
    }
    if (console != null) {
        console.printInfo("OK");
    }
}
Also used : LinkedHashSet(java.util.LinkedHashSet) SessionState(org.apache.hadoop.hive.ql.session.SessionState) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) CommandProcessorResponse(org.apache.hadoop.hive.ql.processors.CommandProcessorResponse) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) PrivateHookContext(org.apache.hadoop.hive.ql.hooks.PrivateHookContext) HookContext(org.apache.hadoop.hive.ql.hooks.HookContext) HiveSemanticAnalyzerHookContext(org.apache.hadoop.hive.ql.parse.HiveSemanticAnalyzerHookContext) TaskRunner(org.apache.hadoop.hive.ql.exec.TaskRunner) Metrics(org.apache.hadoop.hive.common.metrics.common.Metrics) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) LockException(org.apache.hadoop.hive.ql.lockmgr.LockException) IOException(java.io.IOException) ParseException(org.apache.hadoop.hive.ql.parse.ParseException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) AuthorizationException(org.apache.hadoop.hive.ql.metadata.AuthorizationException) TaskResult(org.apache.hadoop.hive.ql.exec.TaskResult) PrivateHookContext(org.apache.hadoop.hive.ql.hooks.PrivateHookContext) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap)
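
The essence of this example is the timing pattern around DRIVER_EXECUTE: the PerfLogger is fetched from the SessionState, the phase is opened with PerfLogBegin, and the matching PerfLogEnd in the finally block returns the elapsed milliseconds that feed the "Time taken" log line. Below is a minimal sketch of that pattern, assuming the older capitalized API shown in this example (newer Hive versions spell it perfLogBegin/perfLogEnd); the class name and the Runnable parameter are illustrative, not part of the Hive source.

import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.session.SessionState;

public class DriverExecuteTimingSketch {
    private static final String CLASS_NAME = DriverExecuteTimingSketch.class.getName();

    void timedExecute(Runnable work) {
        PerfLogger perfLogger = SessionState.getPerfLogger();
        perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.DRIVER_EXECUTE);
        try {
            work.run(); // the driver's task-launching loop goes here
        } finally {
            // PerfLogEnd returns the elapsed time in milliseconds.
            double duration = perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.DRIVER_EXECUTE) / 1000.00;
            System.out.println("Time taken: " + duration + " seconds");
        }
    }
}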

Example 37 with PerfLogger

use of org.apache.hadoop.hive.ql.log.PerfLogger in project hive by apache.

the class TezCompiler method optimizeTaskPlan.

@Override
protected void optimizeTaskPlan(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx, Context ctx) throws SemanticException {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
    PhysicalContext physicalCtx = new PhysicalContext(conf, pCtx, pCtx.getContext(), rootTasks, pCtx.getFetchTask());
    if (conf.getBoolVar(HiveConf.ConfVars.HIVENULLSCANOPTIMIZE)) {
        physicalCtx = new NullScanOptimizer().resolve(physicalCtx);
    } else {
        LOG.debug("Skipping null scan query optimization");
    }
    if (conf.getBoolVar(HiveConf.ConfVars.HIVEMETADATAONLYQUERIES)) {
        physicalCtx = new MetadataOnlyOptimizer().resolve(physicalCtx);
    } else {
        LOG.debug("Skipping metadata only query optimization");
    }
    if (conf.getBoolVar(HiveConf.ConfVars.HIVE_CHECK_CROSS_PRODUCT)) {
        physicalCtx = new CrossProductHandler().resolve(physicalCtx);
    } else {
        LOG.debug("Skipping cross product analysis");
    }
    if ("llap".equalsIgnoreCase(conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_MODE))) {
        physicalCtx = new LlapPreVectorizationPass().resolve(physicalCtx);
    } else {
        LOG.debug("Skipping llap pre-vectorization pass");
    }
    if (conf.getBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED)) {
        physicalCtx = new Vectorizer().resolve(physicalCtx);
    } else {
        LOG.debug("Skipping vectorization");
    }
    if (!"none".equalsIgnoreCase(conf.getVar(HiveConf.ConfVars.HIVESTAGEIDREARRANGE))) {
        physicalCtx = new StageIDsRearranger().resolve(physicalCtx);
    } else {
        LOG.debug("Skipping stage id rearranger");
    }
    if ((conf.getBoolVar(HiveConf.ConfVars.HIVE_TEZ_ENABLE_MEMORY_MANAGER)) && (conf.getBoolVar(HiveConf.ConfVars.HIVEUSEHYBRIDGRACEHASHJOIN))) {
        physicalCtx = new MemoryDecider().resolve(physicalCtx);
    }
    if ("llap".equalsIgnoreCase(conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_MODE))) {
        LlapClusterStateForCompile llapInfo = LlapClusterStateForCompile.getClusterInfo(conf);
        physicalCtx = new LlapDecider(llapInfo).resolve(physicalCtx);
    } else {
        LOG.debug("Skipping llap decider");
    }
    // This optimizer will serialize all filters that made it to the
    // table scan operator to avoid having to do it multiple times on
    // the backend. If you have a physical optimization that changes
    // table scans or filters, you have to invoke it before this one.
    physicalCtx = new SerializeFilter().resolve(physicalCtx);
    if (physicalCtx.getContext().getExplainAnalyze() != null) {
        new AnnotateRunTimeStatsOptimizer().resolve(physicalCtx);
    }
    perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "optimizeTaskPlan");
    return;
}
Also used : LlapDecider(org.apache.hadoop.hive.ql.optimizer.physical.LlapDecider) LlapClusterStateForCompile(org.apache.hadoop.hive.ql.optimizer.physical.LlapClusterStateForCompile) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) AnnotateRunTimeStatsOptimizer(org.apache.hadoop.hive.ql.optimizer.physical.AnnotateRunTimeStatsOptimizer) MemoryDecider(org.apache.hadoop.hive.ql.optimizer.physical.MemoryDecider) MetadataOnlyOptimizer(org.apache.hadoop.hive.ql.optimizer.physical.MetadataOnlyOptimizer) PhysicalContext(org.apache.hadoop.hive.ql.optimizer.physical.PhysicalContext) LlapPreVectorizationPass(org.apache.hadoop.hive.ql.optimizer.physical.LlapPreVectorizationPass) NullScanOptimizer(org.apache.hadoop.hive.ql.optimizer.physical.NullScanOptimizer) Vectorizer(org.apache.hadoop.hive.ql.optimizer.physical.Vectorizer) SerializeFilter(org.apache.hadoop.hive.ql.optimizer.physical.SerializeFilter) CrossProductHandler(org.apache.hadoop.hive.ql.optimizer.physical.CrossProductHandler) StageIDsRearranger(org.apache.hadoop.hive.ql.optimizer.physical.StageIDsRearranger)
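
Most of the physical passes above follow the same guard pattern: a HiveConf switch decides whether the pass resolves the PhysicalContext or is skipped with a debug log, and the whole sequence is bracketed by a single TEZ_COMPILER begin/end pair. A hedged sketch of that guard, reusing the vectorization flag from the example; the helper class and its method are illustrative, not Hive API.

import org.apache.hadoop.hive.conf.HiveConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class GuardedPassSketch {
    private static final Logger LOG = LoggerFactory.getLogger(GuardedPassSketch.class);

    // Returns true when the pass should run; otherwise logs the skip, mirroring optimizeTaskPlan.
    static boolean shouldVectorize(HiveConf conf) {
        if (conf.getBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED)) {
            return true;
        }
        LOG.debug("Skipping vectorization");
        return false;
    }
}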

Example 38 with PerfLogger

use of org.apache.hadoop.hive.ql.log.PerfLogger in project hive by apache.

the class MoveTask method moveFile.

private void moveFile(Path sourcePath, Path targetPath, boolean isDfsDir) throws HiveException {
    try {
        PerfLogger perfLogger = SessionState.getPerfLogger();
        perfLogger.perfLogBegin("MoveTask", PerfLogger.FILE_MOVES);
        String mesg = "Moving data to " + (isDfsDir ? "" : "local ") + "directory " + targetPath.toString();
        String mesg_detail = " from " + sourcePath.toString();
        console.printInfo(mesg, mesg_detail);
        FileSystem fs = sourcePath.getFileSystem(conf);
        if (work.isCTAS() && BlobStorageUtils.isBlobStorageFileSystem(conf, fs)) {
            if (fs.exists(new Path(sourcePath, BLOB_MANIFEST_FILE))) {
                LOG.debug("Attempting to copy using the paths available in {}", new Path(sourcePath, BLOB_MANIFEST_FILE));
                ArrayList<String> filesKept;
                try (FSDataInputStream inStream = fs.open(new Path(sourcePath, BLOB_MANIFEST_FILE))) {
                    String paths = IOUtils.toString(inStream, Charset.defaultCharset());
                    filesKept = new ArrayList<>(Arrays.asList(paths.split(System.lineSeparator())));
                }
                // Remove the first entry from the list, it is the source path.
                Path srcPath = new Path(filesKept.remove(0));
                LOG.info("Copying files {} from {} to {}", filesKept, srcPath, targetPath);
                // Do the move using the filesKept now directly to the target dir.
                Utilities.moveSpecifiedFilesInParallel(conf, fs, srcPath, targetPath, new HashSet<>(filesKept));
                perfLogger.perfLogEnd("MoveTask", PerfLogger.FILE_MOVES);
                return;
            }
        // Fallback case: if the _blob_files_kept manifest was not created (it is also absent for an
        // empty source table), fall through to the normal move logic below.
        }
        if (isDfsDir) {
            moveFileInDfs(sourcePath, targetPath, conf);
        } else {
            // This is a local file
            FileSystem dstFs = FileSystem.getLocal(conf);
            moveFileFromDfsToLocal(sourcePath, targetPath, fs, dstFs);
        }
        perfLogger.perfLogEnd("MoveTask", PerfLogger.FILE_MOVES);
    } catch (Exception e) {
        throw new HiveException("Unable to move source " + sourcePath + " to destination " + targetPath, e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FileSystem(org.apache.hadoop.fs.FileSystem) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) ArrayList(java.util.ArrayList) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) LockException(org.apache.hadoop.hive.ql.lockmgr.LockException) InvalidOperationException(org.apache.hadoop.hive.metastore.api.InvalidOperationException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) IOException(java.io.IOException)
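
Here the lower-case perfLogBegin/perfLogEnd variants time the move under the FILE_MOVES key, with perfLogEnd called on each successful path (the early manifest-based return and the end of the normal branch). A minimal sketch of that timing wrapper, assuming only the PerfLogger API shown in the example; the helper class and its Runnable parameter are hypothetical.

import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.session.SessionState;

public class FileMoveTimingSketch {
    void timedMove(Runnable doMove) {
        PerfLogger perfLogger = SessionState.getPerfLogger();
        perfLogger.perfLogBegin("MoveTask", PerfLogger.FILE_MOVES);
        try {
            doMove.run(); // moveFileInDfs / moveFileFromDfsToLocal in the real MoveTask
        } finally {
            // The example calls perfLogEnd on each successful branch; a finally block
            // also records the time when the move fails.
            perfLogger.perfLogEnd("MoveTask", PerfLogger.FILE_MOVES);
        }
    }
}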

Example 39 with PerfLogger

use of org.apache.hadoop.hive.ql.log.PerfLogger in project hive by apache.

the class CombineHiveInputFormat method getSplits.

/**
 * Create Hive splits based on CombineFileSplit.
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.perfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
    init(job);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();
    Path[] paths = getInputPaths(job);
    List<Path> nonCombinablePaths = new ArrayList<Path>(paths.length / 2);
    List<Path> combinablePaths = new ArrayList<Path>(paths.length / 2);
    int numThreads = Math.min(MAX_CHECK_NONCOMBINABLE_THREAD_NUM, (int) Math.ceil((double) paths.length / DEFAULT_NUM_PATH_PER_THREAD));
    try {
        Set<Integer> nonCombinablePathIndices = getNonCombinablePathIndices(job, paths, numThreads);
        for (int i = 0; i < paths.length; i++) {
            if (nonCombinablePathIndices.contains(i)) {
                nonCombinablePaths.add(paths[i]);
            } else {
                combinablePaths.add(paths[i]);
            }
        }
    } catch (Exception e) {
        LOG.error("Error checking non-combinable path", e);
        perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
        throw new IOException(e);
    }
    // Store the previous value for the path specification
    String oldPaths = job.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
    LOG.debug("The received input paths are: [{}] against the property {}", oldPaths, org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
    // Process the normal splits
    if (nonCombinablePaths.size() > 0) {
        FileInputFormat.setInputPaths(job, nonCombinablePaths.toArray(new Path[nonCombinablePaths.size()]));
        InputSplit[] splits = super.getSplits(job, numSplits);
        for (InputSplit split : splits) {
            result.add(split);
        }
    }
    // Process the combine splits
    if (combinablePaths.size() > 0) {
        FileInputFormat.setInputPaths(job, combinablePaths.toArray(new Path[combinablePaths.size()]));
        Map<Path, PartitionDesc> pathToPartitionInfo = this.pathToPartitionInfo != null ? this.pathToPartitionInfo : Utilities.getMapWork(job).getPathToPartitionInfo();
        InputSplit[] splits = getCombineSplits(job, numSplits, pathToPartitionInfo);
        for (InputSplit split : splits) {
            result.add(split);
        }
    }
    // Restore the previous input path specification in case some application depends on the original value being set.
    if (oldPaths != null) {
        job.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, oldPaths);
    }
    // Clear work from the ThreadLocal after splits are generated, in case the thread is reused in a pool.
    Utilities.clearWorkMapForConf(job);
    if (result.isEmpty() && paths.length > 0 && job.getBoolean(Utilities.ENSURE_OPERATORS_EXECUTED, false)) {
        // If there are no inputs, the execution engine skips the operator tree.
        // To prevent that, an opaque ZeroRows input is added here when needed.
        result.add(new HiveInputSplit(new NullRowsInputFormat.DummyInputSplit(paths[0]), ZeroRowsInputFormat.class.getName()));
    }
    LOG.info("Number of all splits " + result.size());
    perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
    return result.toArray(new InputSplit[result.size()]);
}
Also used : Path(org.apache.hadoop.fs.Path) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) ArrayList(java.util.ArrayList) IOException(java.io.IOException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) InputSplit(org.apache.hadoop.mapred.InputSplit)
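
Besides the GET_SPLITS timing, getSplits temporarily rewrites the job's input-path property while producing the non-combinable and combinable splits, then puts the old value back. A hedged sketch of that save/override/restore pattern; the helper method is illustrative, while the class and property names come from Hadoop.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class InputDirRestoreSketch {
    static void withTemporaryInputPaths(JobConf job, Path[] tempPaths, Runnable work) {
        // Remember the current input-path specification before overriding it.
        String oldPaths = job.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
        try {
            FileInputFormat.setInputPaths(job, tempPaths);
            work.run(); // e.g. compute splits against the temporary paths
        } finally {
            if (oldPaths != null) {
                // Restore the original value in case something downstream depends on it.
                job.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, oldPaths);
            }
        }
    }
}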

Example 40 with PerfLogger

use of org.apache.hadoop.hive.ql.log.PerfLogger in project hive by apache.

the class TezCompiler method optimizeOperatorPlan.

@Override
protected void optimizeOperatorPlan(ParseContext pCtx) throws SemanticException {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    // Create the context for the walker
    OptimizeTezProcContext procCtx = new OptimizeTezProcContext(conf, pCtx);
    perfLogger.perfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
    runTopNKeyOptimization(procCtx);
    perfLogger.perfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Run top n key optimization");
    perfLogger.perfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
    // setup dynamic partition pruning where possible
    runDynamicPartitionPruning(procCtx);
    perfLogger.perfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Setup dynamic partition pruning");
    if (procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_MULTICOLUMN)) {
        SemiJoinReductionMerge sjmerge = new SemiJoinReductionMerge();
        sjmerge.beginPerfLogging();
        sjmerge.transform(procCtx.parseContext);
        sjmerge.endPerfLogging("Merge single column semi-join reducers to composite");
    }
    // Need to run constant propagation to get consistent FilterOperator conditions (for operator tree matching).
    if (procCtx.conf.getBoolVar(ConfVars.HIVEOPTCONSTANTPROPAGATION)) {
        new ConstantPropagate(ConstantPropagateOption.SHORTCUT).transform(procCtx.parseContext);
    }
    perfLogger.perfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
    // setup stats in the operator plan
    runStatsAnnotation(procCtx);
    perfLogger.perfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Setup stats in the operator plan");
    // run Sorted dynamic partition optimization
    if (HiveConf.getBoolVar(procCtx.conf, HiveConf.ConfVars.DYNAMICPARTITIONING) && HiveConf.getVar(procCtx.conf, HiveConf.ConfVars.DYNAMICPARTITIONINGMODE).equals("nonstrict") && !HiveConf.getBoolVar(procCtx.conf, HiveConf.ConfVars.HIVEOPTLISTBUCKETING)) {
        perfLogger.perfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
        new SortedDynPartitionOptimizer().transform(procCtx.parseContext);
        perfLogger.perfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Sorted dynamic partition optimization");
    }
    if (HiveConf.getBoolVar(procCtx.conf, HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION)) {
        perfLogger.perfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
        // Dynamic sort partition adds an extra RS therefore need to de-dup
        new ReduceSinkDeDuplication().transform(procCtx.parseContext);
        // there is an issue with dedup logic wherein SELECT is created with wrong columns
        // NonBlockingOpDeDupProc fixes that
        // (kind of hackish, the issue in de-dup should be fixed but it needs more investigation)
        new NonBlockingOpDeDupProc().transform(procCtx.parseContext);
        perfLogger.perfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Reduce Sink de-duplication");
    }
    perfLogger.perfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
    // run the optimizations that use stats for optimization
    runStatsDependentOptimizations(procCtx);
    perfLogger.perfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Run the optimizations that use stats for optimization");
    // repopulate bucket versions; join conversion may have created some new reducesinks
    new BucketVersionPopulator().transform(pCtx);
    perfLogger.perfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
    if (procCtx.conf.getBoolVar(ConfVars.HIVEOPTJOINREDUCEDEDUPLICATION)) {
        new ReduceSinkJoinDeDuplication().transform(procCtx.parseContext);
    }
    perfLogger.perfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Run reduce sink after join algorithm selection");
    semijoinRemovalBasedTransformations(procCtx);
    perfLogger.perfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
    if (procCtx.conf.getBoolVar(ConfVars.HIVE_SHARED_WORK_OPTIMIZATION)) {
        new SharedWorkOptimizer().transform(procCtx.parseContext);
        new ParallelEdgeFixer().transform(procCtx.parseContext);
    }
    perfLogger.perfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Shared scans optimization");
    // Run constant folding again to shortcut AND/OR expressions involving constant true/false values.
    if (procCtx.conf.getBoolVar(ConfVars.HIVEOPTCONSTANTPROPAGATION)) {
        new ConstantPropagate(ConstantPropagateOption.SHORTCUT).transform(procCtx.parseContext);
    }
    perfLogger.perfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
    AuxOpTreeSignature.linkAuxSignatures(procCtx.parseContext);
    markOperatorsWithUnstableRuntimeStats(procCtx);
    perfLogger.perfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "markOperatorsWithUnstableRuntimeStats");
    if (procCtx.conf.getBoolVar(ConfVars.HIVE_IN_TEST)) {
        bucketingVersionSanityCheck(procCtx);
    }
}
Also used : SemiJoinReductionMerge(org.apache.hadoop.hive.ql.optimizer.SemiJoinReductionMerge) BucketVersionPopulator(org.apache.hadoop.hive.ql.optimizer.BucketVersionPopulator) SortedDynPartitionOptimizer(org.apache.hadoop.hive.ql.optimizer.SortedDynPartitionOptimizer) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) ParallelEdgeFixer(org.apache.hadoop.hive.ql.optimizer.ParallelEdgeFixer) ReduceSinkDeDuplication(org.apache.hadoop.hive.ql.optimizer.correlation.ReduceSinkDeDuplication) ReduceSinkJoinDeDuplication(org.apache.hadoop.hive.ql.optimizer.correlation.ReduceSinkJoinDeDuplication) SharedWorkOptimizer(org.apache.hadoop.hive.ql.optimizer.SharedWorkOptimizer) NonBlockingOpDeDupProc(org.apache.hadoop.hive.ql.optimizer.NonBlockingOpDeDupProc) ConstantPropagate(org.apache.hadoop.hive.ql.optimizer.ConstantPropagate)
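
optimizeOperatorPlan reuses a single TEZ_COMPILER key for many stages and distinguishes them with the third argument to perfLogEnd, which names the stage that just finished. A small sketch of that per-stage labeling, assuming only the PerfLogger API shown in the example; the wrapper class itself is hypothetical.

import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.session.SessionState;

public class StageTimingSketch {
    void timeStage(String stageName, Runnable stage) {
        PerfLogger perfLogger = SessionState.getPerfLogger();
        perfLogger.perfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
        stage.run();
        // The trailing string labels this stage in the recorded timings.
        perfLogger.perfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, stageName);
    }
}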

Aggregations

PerfLogger (org.apache.hadoop.hive.ql.log.PerfLogger) 60
ArrayList (java.util.ArrayList) 22
IOException (java.io.IOException) 21
LockException (org.apache.hadoop.hive.ql.lockmgr.LockException) 16
MetaException (org.apache.hadoop.hive.metastore.api.MetaException) 13
ExecutionException (java.util.concurrent.ExecutionException) 11
Path (org.apache.hadoop.fs.Path) 11
InvalidOperationException (org.apache.hadoop.hive.metastore.api.InvalidOperationException) 11
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException) 11
TException (org.apache.thrift.TException) 11
HiveMetaException (org.apache.hadoop.hive.metastore.HiveMetaException) 10
AlreadyExistsException (org.apache.hadoop.hive.metastore.api.AlreadyExistsException) 9
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 9
FileNotFoundException (java.io.FileNotFoundException) 8
UnknownHostException (java.net.UnknownHostException) 8
LinkedList (java.util.LinkedList) 8
JDODataStoreException (javax.jdo.JDODataStoreException) 8
ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList) 8
NoSuchObjectException (org.apache.hadoop.hive.metastore.api.NoSuchObjectException) 8
SerDeException (org.apache.hadoop.hive.serde2.SerDeException) 8