
Example 1 with FetchTask

use of org.apache.hadoop.hive.ql.exec.FetchTask in project hive by apache.

the class Driver method useFetchFromCache.

private void useFetchFromCache(CacheEntry cacheEntry) {
    // Change query FetchTask to use new location specified in results cache.
    FetchTask fetchTaskFromCache = (FetchTask) TaskFactory.get(cacheEntry.getFetchWork());
    fetchTaskFromCache.initialize(queryState, plan, null, ctx.getOpContext());
    plan.setFetchTask(fetchTaskFromCache);
    cacheUsage = new CacheUsage(CacheUsage.CacheStatus.QUERY_USING_CACHE, cacheEntry);
}
Also used : FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask) CacheUsage(org.apache.hadoop.hive.ql.cache.results.CacheUsage)
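
Once a plan's FetchTask has been swapped in and initialized like this, result rows are read back through it rather than by re-running the query. The sketch below shows that consumer side in rough form; it is not code from the Hive sources. plan.getFetchTask() appears in Example 4, while setMaxRows and the batched fetch(List) call are the calls Hive's Driver uses to drain results, though their exact signatures vary between Hive versions, and the helper name drainResults is ours.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.QueryPlan;
import org.apache.hadoop.hive.ql.exec.FetchTask;

// Illustrative sketch only: drain all rows from an already-initialized FetchTask.
static List<Object> drainResults(QueryPlan plan) throws IOException {
    FetchTask fetchTask = plan.getFetchTask();
    // Batch size per fetch(...) call; the Driver sets this from the client's request.
    fetchTask.setMaxRows(1000);
    List<Object> batch = new ArrayList<>();
    List<Object> all = new ArrayList<>();
    // fetch(...) fills the list and returns false once no more rows are produced.
    while (fetchTask.fetch(batch)) {
        all.addAll(batch);
        batch.clear();
    }
    return all;
}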

Example 2 with FetchTask

use of org.apache.hadoop.hive.ql.exec.FetchTask in project hive by apache.

the class SemanticAnalyzer method useCachedResult.

/**
 * Set the query plan to use cache entry passed in to return the query results.
 * @param cacheEntry The results cache entry that will be used to resolve the query.
 */
private void useCachedResult(QueryResultsCache.CacheEntry cacheEntry) {
    // Change query FetchTask to use new location specified in results cache.
    FetchTask fetchTask = (FetchTask) TaskFactory.get(cacheEntry.getFetchWork());
    setFetchTask(fetchTask);
    queryState.setCommandType(cacheEntry.getQueryInfo().getHiveOperation());
    resultSchema = cacheEntry.getQueryInfo().getResultSchema();
    setTableAccessInfo(cacheEntry.getQueryInfo().getTableAccessInfo());
    setColumnAccessInfo(cacheEntry.getQueryInfo().getColumnAccessInfo());
    inputs.addAll(cacheEntry.getQueryInfo().getInputs());
    // Set recursive traversal in case the cached query was UNION generated by Tez.
    conf.setBoolean(FileInputFormat.INPUT_DIR_RECURSIVE, true);
    // Indicate that the query will use a cached result.
    setCacheUsage(new CacheUsage(CacheUsage.CacheStatus.QUERY_USING_CACHE, cacheEntry));
}
Also used : FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask) CacheUsage(org.apache.hadoop.hive.ql.cache.results.CacheUsage)
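
The cacheEntry.getFetchWork() consumed above is, conceptually, a FetchWork whose source path is the directory where the cached results were written, paired with the original query's result TableDesc. The fragment below is a conceptual sketch of that idea, not the actual QueryResultsCache code; cachedResultsDir, resultTableDesc, and the helper name are hypothetical.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.TableDesc;

// Conceptual sketch only: a FetchTask reading from a cached results directory.
static FetchTask fetchTaskOverCachedDir(Path cachedResultsDir, TableDesc resultTableDesc, int limit) {
    // Same constructor TaskCompiler uses for ordinary query results (see Example 3):
    // source path, result row description, outer query limit.
    FetchWork fetchWork = new FetchWork(cachedResultsDir, resultTableDesc, limit);
    return (FetchTask) TaskFactory.get(fetchWork);
}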

Example 3 with FetchTask

use of org.apache.hadoop.hive.ql.exec.FetchTask in project hive by apache.

the class TaskCompiler method compile.

@SuppressWarnings({ "nls", "unchecked" })
public void compile(final ParseContext pCtx, final List<Task<? extends Serializable>> rootTasks, final HashSet<ReadEntity> inputs, final HashSet<WriteEntity> outputs) throws SemanticException {
    Context ctx = pCtx.getContext();
    GlobalLimitCtx globalLimitCtx = pCtx.getGlobalLimitCtx();
    List<Task<MoveWork>> mvTask = new ArrayList<>();
    List<LoadTableDesc> loadTableWork = pCtx.getLoadTableWork();
    List<LoadFileDesc> loadFileWork = pCtx.getLoadFileWork();
    boolean isCStats = pCtx.getQueryProperties().isAnalyzeRewrite();
    int outerQueryLimit = pCtx.getQueryProperties().getOuterQueryLimit();
    if (pCtx.getFetchTask() != null) {
        if (pCtx.getFetchTask().getTblDesc() == null) {
            return;
        }
        pCtx.getFetchTask().getWork().setHiveServerQuery(SessionState.get().isHiveServerQuery());
        TableDesc resultTab = pCtx.getFetchTask().getTblDesc();
        // If the result serde is not ThriftJDBCBinarySerDe, then either the ThriftFormatter
        // or the DefaultFetchFormatter should be used.
        if (!resultTab.getSerdeClassName().equalsIgnoreCase(ThriftJDBCBinarySerDe.class.getName())) {
            if (SessionState.get().isHiveServerQuery()) {
                conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, ThriftFormatter.class.getName());
            } else {
                String formatterName = conf.get(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER);
                if (formatterName == null || formatterName.isEmpty()) {
                    conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, DefaultFetchFormatter.class.getName());
                }
            }
        }
        return;
    }
    optimizeOperatorPlan(pCtx, inputs, outputs);
    /*
     * In case of a select, use a fetch task instead of a move task.
     * If the select is from analyze table column rewrite, don't create a fetch task. Instead create
     * a column stats task later.
     */
    if (pCtx.getQueryProperties().isQuery() && !isCStats) {
        if ((!loadTableWork.isEmpty()) || (loadFileWork.size() != 1)) {
            throw new SemanticException(ErrorMsg.INVALID_LOAD_TABLE_FILE_WORK.getMsg());
        }
        LoadFileDesc loadFileDesc = loadFileWork.get(0);
        String cols = loadFileDesc.getColumns();
        String colTypes = loadFileDesc.getColumnTypes();
        String resFileFormat;
        TableDesc resultTab = pCtx.getFetchTableDesc();
        if (resultTab == null) {
            resFileFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT);
            if (SessionState.get().getIsUsingThriftJDBCBinarySerDe() && (resFileFormat.equalsIgnoreCase("SequenceFile"))) {
                resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat, ThriftJDBCBinarySerDe.class);
                // Set the fetch formatter to be a no-op for the ListSinkOperator, since we'll
                // read formatted thrift objects from the output SequenceFile written by Tasks.
                conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, NoOpFetchFormatter.class.getName());
            } else {
                resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat, LazySimpleSerDe.class);
            }
        } else {
            if (resultTab.getProperties().getProperty(serdeConstants.SERIALIZATION_LIB).equalsIgnoreCase(ThriftJDBCBinarySerDe.class.getName())) {
                // Set the fetch formatter to be a no-op for the ListSinkOperator, since we'll
                // read formatted thrift objects from the output SequenceFile written by Tasks.
                conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, NoOpFetchFormatter.class.getName());
            }
        }
        FetchWork fetch = new FetchWork(loadFileDesc.getSourcePath(), resultTab, outerQueryLimit);
        boolean isHiveServerQuery = SessionState.get().isHiveServerQuery();
        fetch.setHiveServerQuery(isHiveServerQuery);
        fetch.setSource(pCtx.getFetchSource());
        fetch.setSink(pCtx.getFetchSink());
        if (isHiveServerQuery && null != resultTab && resultTab.getSerdeClassName().equalsIgnoreCase(ThriftJDBCBinarySerDe.class.getName()) && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_SERVER2_THRIFT_RESULTSET_SERIALIZE_IN_TASKS)) {
            fetch.setIsUsingThriftJDBCBinarySerDe(true);
        } else {
            fetch.setIsUsingThriftJDBCBinarySerDe(false);
        }
        pCtx.setFetchTask((FetchTask) TaskFactory.get(fetch));
        // For the FetchTask, the limit optimization requires we fetch all the rows
        // in memory and count how many rows we get. It's not practical if the
        // limit factor is too big
        int fetchLimit = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVELIMITOPTMAXFETCH);
        if (globalLimitCtx.isEnable() && globalLimitCtx.getGlobalLimit() > fetchLimit) {
            LOG.info("For FetchTask, LIMIT " + globalLimitCtx.getGlobalLimit() + " > " + fetchLimit + ". Doesn't qualify limit optimization.");
            globalLimitCtx.disableOpt();
        }
        if (outerQueryLimit == 0) {
            // Believe it or not, some tools do generate queries with limit 0 and then expect
            // the query to run quickly. Let's meet that requirement.
            LOG.info("Limit 0. No query execution needed.");
            return;
        }
    } else if (!isCStats) {
        for (LoadTableDesc ltd : loadTableWork) {
            Task<MoveWork> tsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false));
            mvTask.add(tsk);
        }
        boolean oneLoadFileForCtas = true;
        for (LoadFileDesc lfd : loadFileWork) {
            if (pCtx.getQueryProperties().isCTAS() || pCtx.getQueryProperties().isMaterializedView()) {
                if (!oneLoadFileForCtas) {
                    // should not have more than 1 load file for CTAS.
                    throw new SemanticException("One query is not expected to contain multiple CTAS loads statements");
                }
                setLoadFileLocation(pCtx, lfd);
                oneLoadFileForCtas = false;
            }
            mvTask.add(TaskFactory.get(new MoveWork(null, null, null, lfd, false)));
        }
    }
    generateTaskTree(rootTasks, pCtx, mvTask, inputs, outputs);
    // For each task, set the key descriptor for the reducer
    for (Task<? extends Serializable> rootTask : rootTasks) {
        GenMapRedUtils.setKeyAndValueDescForTaskTree(rootTask);
    }
    // If a task contains an operator which instructs BucketizedHiveInputFormat to be used, set the input format here.
    for (Task<? extends Serializable> rootTask : rootTasks) {
        setInputFormat(rootTask);
    }
    optimizeTaskPlan(rootTasks, pCtx, ctx);
    /*
     * If the query was the result of analyze table column compute statistics rewrite, create
     * a column stats task instead of a fetch task to persist stats to the metastore.
     * As per HIVE-15903, we will also collect table stats when user computes column stats.
     * That means, if isCStats || !pCtx.getColumnStatsAutoGatherContexts().isEmpty()
     * We need to collect table stats
     * if isCStats, we need to include a basic stats task
     * else it is ColumnStatsAutoGather, which should have a move task with a stats task already.
     */
    if (isCStats || !pCtx.getColumnStatsAutoGatherContexts().isEmpty()) {
        // map from tablename to task (ColumnStatsTask which includes a BasicStatsTask)
        Map<String, StatsTask> map = new LinkedHashMap<>();
        if (isCStats) {
            if (rootTasks == null || rootTasks.size() != 1 || pCtx.getTopOps() == null || pCtx.getTopOps().size() != 1) {
                throw new SemanticException("Can not find correct root task!");
            }
            try {
                Task<? extends Serializable> root = rootTasks.iterator().next();
                StatsTask tsk = (StatsTask) genTableStats(pCtx, pCtx.getTopOps().values().iterator().next(), root, outputs);
                root.addDependentTask(tsk);
                map.put(extractTableFullName(tsk), tsk);
            } catch (HiveException e) {
                throw new SemanticException(e);
            }
            genColumnStatsTask(pCtx.getAnalyzeRewrite(), loadFileWork, map, outerQueryLimit, 0);
        } else {
            Set<Task<? extends Serializable>> leafTasks = new LinkedHashSet<Task<? extends Serializable>>();
            getLeafTasks(rootTasks, leafTasks);
            List<Task<? extends Serializable>> nonStatsLeafTasks = new ArrayList<>();
            for (Task<? extends Serializable> tsk : leafTasks) {
                // map table name to the correct ColumnStatsTask
                if (tsk instanceof StatsTask) {
                    map.put(extractTableFullName((StatsTask) tsk), (StatsTask) tsk);
                } else {
                    nonStatsLeafTasks.add(tsk);
                }
            }
            // add cStatsTask as a dependent of all the nonStatsLeafTasks
            for (Task<? extends Serializable> tsk : nonStatsLeafTasks) {
                for (Task<? extends Serializable> cStatsTask : map.values()) {
                    tsk.addDependentTask(cStatsTask);
                }
            }
            for (ColumnStatsAutoGatherContext columnStatsAutoGatherContext : pCtx.getColumnStatsAutoGatherContexts()) {
                if (!columnStatsAutoGatherContext.isInsertInto()) {
                    genColumnStatsTask(columnStatsAutoGatherContext.getAnalyzeRewrite(), columnStatsAutoGatherContext.getLoadFileWork(), map, outerQueryLimit, 0);
                } else {
                    int numBitVector;
                    try {
                        numBitVector = HiveStatsUtils.getNumBitVectorsForNDVEstimation(conf);
                    } catch (Exception e) {
                        throw new SemanticException(e.getMessage());
                    }
                    genColumnStatsTask(columnStatsAutoGatherContext.getAnalyzeRewrite(), columnStatsAutoGatherContext.getLoadFileWork(), map, outerQueryLimit, numBitVector);
                }
            }
        }
    }
    decideExecMode(rootTasks, ctx, globalLimitCtx);
    if (pCtx.getQueryProperties().isCTAS() && !pCtx.getCreateTable().isMaterialization()) {
        // generate a DDL task and make it a dependent task of the leaf
        CreateTableDesc crtTblDesc = pCtx.getCreateTable();
        crtTblDesc.validate(conf);
        Task<? extends Serializable> crtTblTask = TaskFactory.get(new DDLWork(inputs, outputs, crtTblDesc));
        patchUpAfterCTASorMaterializedView(rootTasks, outputs, crtTblTask);
    } else if (pCtx.getQueryProperties().isMaterializedView()) {
        // generate a DDL task and make it a dependent task of the leaf
        CreateViewDesc viewDesc = pCtx.getCreateViewDesc();
        Task<? extends Serializable> crtViewTask = TaskFactory.get(new DDLWork(inputs, outputs, viewDesc));
        patchUpAfterCTASorMaterializedView(rootTasks, outputs, crtViewTask);
    } else if (pCtx.getMaterializedViewUpdateDesc() != null) {
        // If there is a materialized view update desc, introduce it at the end
        // of the task tree.
        MaterializedViewDesc materializedViewDesc = pCtx.getMaterializedViewUpdateDesc();
        Set<Task<? extends Serializable>> leafTasks = new LinkedHashSet<Task<? extends Serializable>>();
        getLeafTasks(rootTasks, leafTasks);
        Task<? extends Serializable> materializedViewTask = TaskFactory.get(materializedViewDesc, conf);
        for (Task<? extends Serializable> task : leafTasks) {
            task.addDependentTask(materializedViewTask);
        }
    }
    if (globalLimitCtx.isEnable() && pCtx.getFetchTask() != null) {
        LOG.info("set least row check for FetchTask: " + globalLimitCtx.getGlobalLimit());
        pCtx.getFetchTask().getWork().setLeastNumRows(globalLimitCtx.getGlobalLimit());
    }
    if (globalLimitCtx.isEnable() && globalLimitCtx.getLastReduceLimitDesc() != null) {
        LOG.info("set least row check for LimitDesc: " + globalLimitCtx.getGlobalLimit());
        globalLimitCtx.getLastReduceLimitDesc().setLeastRows(globalLimitCtx.getGlobalLimit());
    }
    Interner<TableDesc> interner = Interners.newStrongInterner();
    for (Task<? extends Serializable> rootTask : rootTasks) {
        GenMapRedUtils.internTableDesc(rootTask, interner);
        GenMapRedUtils.deriveFinalExplainAttributes(rootTask, pCtx.getConf());
    }
}
Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LinkedHashSet(java.util.LinkedHashSet) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask) DDLTask(org.apache.hadoop.hive.ql.exec.DDLTask) Task(org.apache.hadoop.hive.ql.exec.Task) StatsTask(org.apache.hadoop.hive.ql.exec.StatsTask) Serializable(java.io.Serializable) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) MaterializedViewDesc(org.apache.hadoop.hive.ql.exec.MaterializedViewDesc) LazySimpleSerDe(org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe) StatsTask(org.apache.hadoop.hive.ql.exec.StatsTask) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) ThriftFormatter(org.apache.hadoop.hive.serde2.thrift.ThriftFormatter) CreateViewDesc(org.apache.hadoop.hive.ql.plan.CreateViewDesc) ThriftJDBCBinarySerDe(org.apache.hadoop.hive.serde2.thrift.ThriftJDBCBinarySerDe) NoOpFetchFormatter(org.apache.hadoop.hive.serde2.NoOpFetchFormatter) Context(org.apache.hadoop.hive.ql.Context) AnalyzeRewriteContext(org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.AnalyzeRewriteContext) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) CreateTableDesc(org.apache.hadoop.hive.ql.plan.CreateTableDesc) DDLWork(org.apache.hadoop.hive.ql.plan.DDLWork) DefaultFetchFormatter(org.apache.hadoop.hive.serde2.DefaultFetchFormatter) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) CreateTableDesc(org.apache.hadoop.hive.ql.plan.CreateTableDesc)
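
Stripped of the Thrift/formatter bookkeeping, the SELECT branch of compile boils down to four steps: take the single LoadFileDesc, describe the result row layout as a TableDesc, wrap the output path in a FetchWork capped by the outer LIMIT, and register the resulting FetchTask on the parse context. The condensed sketch below reuses only calls that already appear in the method above and assumes the plain LazySimpleSerDe output path; it is a simplification, not a drop-in replacement.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.LoadFileDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;

// Condensed sketch of the SELECT path (no Thrift serde handling).
static void attachSimpleFetchTask(ParseContext pCtx, HiveConf conf) {
    LoadFileDesc loadFileDesc = pCtx.getLoadFileWork().get(0);
    int limit = pCtx.getQueryProperties().getOuterQueryLimit();
    // Describe the result layout: columns, types, and the configured result file format.
    String resFileFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT);
    TableDesc resultTab = PlanUtils.getDefaultQueryOutputTableDesc(
        loadFileDesc.getColumns(), loadFileDesc.getColumnTypes(), resFileFormat, LazySimpleSerDe.class);
    // Fetch straight from the query's output location, honouring the outer LIMIT.
    FetchWork fetch = new FetchWork(loadFileDesc.getSourcePath(), resultTab, limit);
    fetch.setSource(pCtx.getFetchSource());
    fetch.setSink(pCtx.getFetchSink());
    pCtx.setFetchTask((FetchTask) TaskFactory.get(fetch));
}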

Example 4 with FetchTask

use of org.apache.hadoop.hive.ql.exec.FetchTask in project hive by apache.

the class PostExecOrcFileDump method run.

@Override
public void run(HookContext hookContext) throws Exception {
    assert (hookContext.getHookType() == HookContext.HookType.POST_EXEC_HOOK);
    HiveConf conf = hookContext.getConf();
    LOG.info("Executing post execution hook to print orc file dump..");
    QueryPlan plan = hookContext.getQueryPlan();
    if (plan == null) {
        return;
    }
    FetchTask fetchTask = plan.getFetchTask();
    if (fetchTask != null) {
        SessionState ss = SessionState.get();
        SessionState.LogHelper console = ss.getConsole();
        // file dump should write to session state console's error stream
        PrintStream old = System.out;
        System.setOut(console.getErrStream());
        FetchWork fetchWork = fetchTask.getWork();
        boolean partitionedTable = fetchWork.isPartitioned();
        List<Path> directories;
        if (partitionedTable) {
            LOG.info("Printing orc file dump for files from partitioned directory..");
            directories = fetchWork.getPartDir();
        } else {
            LOG.info("Printing orc file dump for files from table directory..");
            directories = Lists.newArrayList();
            directories.add(fetchWork.getTblDir());
        }
        for (Path dir : directories) {
            printFileStatus(console, dir.getFileSystem(conf), dir);
        }
        // restore the old out stream
        System.out.flush();
        System.setOut(old);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SessionState(org.apache.hadoop.hive.ql.session.SessionState) PrintStream(java.io.PrintStream) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) HiveConf(org.apache.hadoop.hive.conf.HiveConf) QueryPlan(org.apache.hadoop.hive.ql.QueryPlan) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask)
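
The reusable piece of this hook is how it resolves the directories a FetchTask will read: a partitioned FetchWork carries one directory per partition, while a non-partitioned one carries a single table (or query result) directory. Below is a small helper distilled from the code above; the name resolveFetchDirs is ours, not Hive's.

import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import com.google.common.collect.Lists;

// Helper sketch: the directories a FetchTask will read, for partitioned
// and non-partitioned sources alike.
static List<Path> resolveFetchDirs(FetchWork fetchWork) {
    if (fetchWork.isPartitioned()) {
        // One directory per partition being fetched.
        return fetchWork.getPartDir();
    }
    // Single table (or query result) directory.
    return Lists.newArrayList(fetchWork.getTblDir());
}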

Example 5 with FetchTask

use of org.apache.hadoop.hive.ql.exec.FetchTask in project hive by apache.

the class SimpleFetchOptimizer method optimize.

// returns non-null FetchTask instance when succeeded
@SuppressWarnings("unchecked")
private FetchTask optimize(ParseContext pctx, String alias, TableScanOperator source) throws Exception {
    String mode = HiveConf.getVar(pctx.getConf(), HiveConf.ConfVars.HIVEFETCHTASKCONVERSION);
    boolean aggressive = "more".equals(mode);
    final int limit = pctx.getQueryProperties().getOuterQueryLimit();
    // limit = 0 means that we do not need any task.
    if (limit == 0) {
        return null;
    }
    FetchData fetch = checkTree(aggressive, pctx, alias, source);
    if (fetch != null && checkThreshold(fetch, limit, pctx)) {
        FetchWork fetchWork = fetch.convertToWork();
        FetchTask fetchTask = (FetchTask) TaskFactory.get(fetchWork);
        fetchWork.setSink(fetch.completed(pctx, fetchWork));
        fetchWork.setSource(source);
        fetchWork.setLimit(limit);
        return fetchTask;
    }
    return null;
}
Also used : FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask)
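
The mode string checked here comes from hive.fetch.task.conversion (HiveConf.ConfVars.HIVEFETCHTASKCONVERSION); its supported values are none, minimal, and more, and only more takes the aggressive path above. The optimizer is also bounded by an input-size threshold, hive.fetch.task.conversion.threshold. The snippet below is a configuration sketch using the HiveConf constant names as they appear in recent Hive releases; the value descriptions in the comments are approximate rather than exhaustive.

import org.apache.hadoop.hive.conf.HiveConf;

public class FetchConversionConfSketch {
    // Sketch: the configuration that steers SimpleFetchOptimizer.
    public static HiveConf configure() {
        HiveConf conf = new HiveConf();
        // "none"    - never rewrite a query into a plain FetchTask
        // "minimal" - only trivial scans (roughly SELECT *, partition filters, LIMIT)
        // "more"    - also filters and projections; the "aggressive" mode checked above
        HiveConf.setVar(conf, HiveConf.ConfVars.HIVEFETCHTASKCONVERSION, "more");
        // Upper bound on the input size (in bytes) eligible for conversion.
        conf.setLong(HiveConf.ConfVars.HIVEFETCHTASKCONVERSIONTHRESHOLD.varname, 1024L * 1024 * 1024);
        return conf;
    }
}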

Aggregations

FetchTask (org.apache.hadoop.hive.ql.exec.FetchTask): 27
ArrayList (java.util.ArrayList): 11
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 8
List (java.util.List): 7
ValidTxnList (org.apache.hadoop.hive.common.ValidTxnList): 7
Path (org.apache.hadoop.fs.Path): 6
FetchWork (org.apache.hadoop.hive.ql.plan.FetchWork): 6
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 6
Test (org.junit.Test): 6
IOException (java.io.IOException): 5
HiveConf (org.apache.hadoop.hive.conf.HiveConf): 5
Context (org.apache.hadoop.hive.ql.Context): 4
CacheUsage (org.apache.hadoop.hive.ql.cache.results.CacheUsage): 4
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 4
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 4
Operator (org.apache.hadoop.hive.ql.exec.Operator): 3
Task (org.apache.hadoop.hive.ql.exec.Task): 3
CommandProcessorException (org.apache.hadoop.hive.ql.processors.CommandProcessorException): 3
LinkedHashMap (java.util.LinkedHashMap): 2
LinkedHashSet (java.util.LinkedHashSet): 2