
Example 1 with FetchWork

use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.

the class MapredLocalTask method initializeOperators.

private void initializeOperators(Map<FetchOperator, JobConf> fetchOpJobConfMap) throws HiveException {
    for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : work.getAliasToWork().entrySet()) {
        LOG.debug("initializeOperators: " + entry.getKey() + ", children = " + entry.getValue().getChildOperators());
    }
    // this mapper operator is used to initialize all the operators
    for (Map.Entry<String, FetchWork> entry : work.getAliasToFetchWork().entrySet()) {
        if (entry.getValue() == null) {
            continue;
        }
        JobConf jobClone = new JobConf(job);
        TableScanOperator ts = (TableScanOperator) work.getAliasToWork().get(entry.getKey());
        // push down projections
        ColumnProjectionUtils.appendReadColumns(jobClone, ts.getNeededColumnIDs(), ts.getNeededColumns(), ts.getNeededNestedColumnPaths());
        // push down filters
        HiveInputFormat.pushFilters(jobClone, ts);
        AcidUtils.setTransactionalTableScan(jobClone, ts.getConf().isAcidTable());
        AcidUtils.setAcidOperationalProperties(jobClone, ts.getConf().getAcidOperationalProperties());
        // create a fetch operator
        FetchOperator fetchOp = new FetchOperator(entry.getValue(), jobClone);
        fetchOpJobConfMap.put(fetchOp, jobClone);
        fetchOperators.put(entry.getKey(), fetchOp);
        l4j.info("fetchoperator for " + entry.getKey() + " created");
    }
    // initialize all forward operators
    for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) {
        // get the forward op
        String alias = entry.getKey();
        Operator<? extends OperatorDesc> forwardOp = work.getAliasToWork().get(alias);
        // put the exe context into all the operators
        forwardOp.passExecContext(execContext);
        // All the operators need to be initialized before processing
        FetchOperator fetchOp = entry.getValue();
        JobConf jobConf = fetchOpJobConfMap.get(fetchOp);
        if (jobConf == null) {
            jobConf = job;
        }
        // initialize the forward operator
        ObjectInspector objectInspector = fetchOp.getOutputObjectInspector();
        forwardOp.initialize(jobConf, new ObjectInspector[] { objectInspector });
        l4j.info("fetchoperator for " + entry.getKey() + " initialized");
    }
}
Also used : FetchOperator(org.apache.hadoop.hive.ql.exec.FetchOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) Map(java.util.Map) HashMap(java.util.HashMap) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) JobConf(org.apache.hadoop.mapred.JobConf)
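
The pattern above boils down to pairing each FetchWork with a FetchOperator and handing the fetch side's ObjectInspector to the downstream operator. A condensed sketch of that wiring follows; it is not the Hive task code itself, and it assumes the same imports as the example and that job, work, execContext, alias and a populated fetchWork are in scope as in initializeOperators.

// Sketch only: wire one FetchWork to a FetchOperator and initialize the forward operator.
JobConf jobClone = new JobConf(job);                              // per-alias copy of the job conf
FetchOperator fetchOp = new FetchOperator(fetchWork, jobClone);   // reads the rows described by the FetchWork
Operator<? extends OperatorDesc> forwardOp = work.getAliasToWork().get(alias);
forwardOp.passExecContext(execContext);                           // share the execution context
ObjectInspector oi = fetchOp.getOutputObjectInspector();
forwardOp.initialize(jobClone, new ObjectInspector[] { oi });     // must happen before any row is processed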

Example 2 with FetchWork

use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.

the class ExecDriver method handleSampling.

private void handleSampling(Context context, MapWork mWork, JobConf job) throws Exception {
    assert mWork.getAliasToWork().keySet().size() == 1;
    String alias = mWork.getAliases().get(0);
    Operator<?> topOp = mWork.getAliasToWork().get(alias);
    PartitionDesc partDesc = mWork.getAliasToPartnInfo().get(alias);
    ArrayList<PartitionDesc> parts = mWork.getPartitionDescs();
    List<Path> inputPaths = mWork.getPaths();
    Path tmpPath = context.getExternalTmpPath(inputPaths.get(0));
    Path partitionFile = new Path(tmpPath, ".partitions");
    ShimLoader.getHadoopShims().setTotalOrderPartitionFile(job, partitionFile);
    PartitionKeySampler sampler = new PartitionKeySampler();
    if (mWork.getSamplingType() == MapWork.SAMPLING_ON_PREV_MR) {
        console.printInfo("Use sampling data created in previous MR");
        // merge sampling data from the previous MR and make partition keys for the total sort
        for (Path path : inputPaths) {
            FileSystem fs = path.getFileSystem(job);
            for (FileStatus status : fs.globStatus(new Path(path, ".sampling*"))) {
                sampler.addSampleFile(status.getPath(), job);
            }
        }
    } else if (mWork.getSamplingType() == MapWork.SAMPLING_ON_START) {
        console.printInfo("Creating sampling data..");
        assert topOp instanceof TableScanOperator;
        TableScanOperator ts = (TableScanOperator) topOp;
        FetchWork fetchWork;
        if (!partDesc.isPartitioned()) {
            assert inputPaths.size() == 1;
            fetchWork = new FetchWork(inputPaths.get(0), partDesc.getTableDesc());
        } else {
            fetchWork = new FetchWork(inputPaths, parts, partDesc.getTableDesc());
        }
        fetchWork.setSource(ts);
        // random sampling
        FetchOperator fetcher = PartitionKeySampler.createSampler(fetchWork, job, ts);
        try {
            ts.initialize(job, new ObjectInspector[] { fetcher.getOutputObjectInspector() });
            OperatorUtils.setChildrenCollector(ts.getChildOperators(), sampler);
            while (fetcher.pushRow()) {
            }
        } finally {
            fetcher.clearFetchContext();
        }
    } else {
        throw new IllegalArgumentException("Invalid sampling type " + mWork.getSamplingType());
    }
    sampler.writePartitionKeys(partitionFile, job);
}
Also used : Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) FileStatus(org.apache.hadoop.fs.FileStatus) PartitionKeySampler(org.apache.hadoop.hive.ql.exec.PartitionKeySampler) FileSystem(org.apache.hadoop.fs.FileSystem) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) FetchOperator(org.apache.hadoop.hive.ql.exec.FetchOperator)
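
FetchWork is built here with one of two constructors, depending on whether the scanned source is partitioned. A minimal sketch of that choice with the arguments annotated; the names partDesc, inputPaths, parts and ts are assumed to be in scope exactly as in handleSampling above.

// Sketch only: pick the FetchWork constructor that matches the input layout.
FetchWork fetchWork;
if (!partDesc.isPartitioned()) {
    // unpartitioned source: a single directory plus its TableDesc
    fetchWork = new FetchWork(inputPaths.get(0), partDesc.getTableDesc());
} else {
    // partitioned source: partition directories, matching PartitionDescs, and the TableDesc
    fetchWork = new FetchWork(inputPaths, parts, partDesc.getTableDesc());
}
fetchWork.setSource(ts);   // operator tree that will consume the fetched rows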

Example 3 with FetchWork

use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.

the class PostExecOrcFileDump method run.

@Override
public void run(HookContext hookContext) throws Exception {
    assert (hookContext.getHookType() == HookContext.HookType.POST_EXEC_HOOK);
    HiveConf conf = hookContext.getConf();
    LOG.info("Executing post execution hook to print orc file dump..");
    QueryPlan plan = hookContext.getQueryPlan();
    if (plan == null) {
        return;
    }
    FetchTask fetchTask = plan.getFetchTask();
    if (fetchTask != null) {
        SessionState ss = SessionState.get();
        SessionState.LogHelper console = ss.getConsole();
        // file dump should write to session state console's error stream
        PrintStream old = System.out;
        System.setOut(console.getErrStream());
        FetchWork fetchWork = fetchTask.getWork();
        boolean partitionedTable = fetchWork.isPartitioned();
        List<Path> directories;
        if (partitionedTable) {
            LOG.info("Printing orc file dump for files from partitioned directory..");
            directories = fetchWork.getPartDir();
        } else {
            LOG.info("Printing orc file dump for files from table directory..");
            directories = Lists.newArrayList();
            directories.add(fetchWork.getTblDir());
        }
        for (Path dir : directories) {
            FileSystem fs = dir.getFileSystem(conf);
            List<FileStatus> fileList = HdfsUtils.listLocatedStatus(fs, dir, hiddenFileFilter);
            for (FileStatus fileStatus : fileList) {
                LOG.info("Printing orc file dump for " + fileStatus.getPath());
                if (fileStatus.getLen() > 0) {
                    try {
                        // just creating the orc reader is going to do sanity checks to make sure it's a valid ORC file
                        OrcFile.createReader(fs, fileStatus.getPath());
                        console.printError("-- BEGIN ORC FILE DUMP --");
                        FileDump.main(new String[] { fileStatus.getPath().toString(), "--rowindex=*" });
                        console.printError("-- END ORC FILE DUMP --");
                    } catch (FileFormatException e) {
                        LOG.warn("File " + fileStatus.getPath() + " is not ORC. Skip printing orc file dump");
                    } catch (IOException e) {
                        LOG.warn("Skip printing orc file dump. Exception: " + e.getMessage());
                    }
                } else {
                    LOG.warn("Zero length file encountered. Skip printing orc file dump.");
                }
            }
        }
        // restore the old out stream
        System.out.flush();
        System.setOut(old);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SessionState(org.apache.hadoop.hive.ql.session.SessionState) PrintStream(java.io.PrintStream) FileStatus(org.apache.hadoop.fs.FileStatus) FileFormatException(org.apache.orc.FileFormatException) IOException(java.io.IOException) QueryPlan(org.apache.hadoop.hive.ql.QueryPlan) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask) FileSystem(org.apache.hadoop.fs.FileSystem) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) HiveConf(org.apache.hadoop.hive.conf.HiveConf)
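
Going the other way, the hook reads the input locations back out of an existing FetchWork. A short sketch of just that lookup, assuming fetchWork and conf are in scope as in the hook above and that Guava's Lists is imported as in the example.

// Sketch only: resolve the directories a FetchWork points at.
List<Path> directories;
if (fetchWork.isPartitioned()) {
    directories = fetchWork.getPartDir();                     // one directory per partition
} else {
    directories = Lists.newArrayList(fetchWork.getTblDir());  // single table directory
}
for (Path dir : directories) {
    FileSystem fs = dir.getFileSystem(conf);                  // each directory may live on a different FileSystem
    // ... list and inspect the files under dir ...
}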

Example 4 with FetchWork

use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.

the class GenMapRedUtils method setMapWork.

/**
   * initialize MapWork
   *
   * @param alias_id
   *          current alias
   * @param topOp
   *          the top operator of the stack
   * @param plan
   *          map work to initialize
   * @param local
   *          whether you need to add to map-reduce or local work
   * @param pList
   *          pruned partition list. If it is null it will be computed on-the-fly.
   * @param inputs
   *          read entities for the map work
   * @param conf
   *          current instance of hive conf
   */
public static void setMapWork(MapWork plan, ParseContext parseCtx, Set<ReadEntity> inputs, PrunedPartitionList partsList, TableScanOperator tsOp, String alias_id, HiveConf conf, boolean local) throws SemanticException {
    ArrayList<Path> partDir = new ArrayList<Path>();
    ArrayList<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
    boolean isAcidTable = false;
    Path tblDir = null;
    plan.setNameToSplitSample(parseCtx.getNameToSplitSample());
    if (partsList == null) {
        try {
            partsList = PartitionPruner.prune(tsOp, parseCtx, alias_id);
            isAcidTable = tsOp.getConf().isAcidTable();
        } catch (SemanticException e) {
            throw e;
        }
    }
    // Generate the map work for this alias_id
    // pass both confirmed and unknown partitions through the map-reduce
    // framework
    Set<Partition> parts = partsList.getPartitions();
    PartitionDesc aliasPartnDesc = null;
    try {
        if (!parts.isEmpty()) {
            aliasPartnDesc = Utilities.getPartitionDesc(parts.iterator().next());
        }
    } catch (HiveException e) {
        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
        throw new SemanticException(e.getMessage(), e);
    }
    // The table does not have any partitions
    if (aliasPartnDesc == null) {
        aliasPartnDesc = new PartitionDesc(Utilities.getTableDesc(tsOp.getConf().getTableMetadata()), null);
    }
    Map<String, String> props = tsOp.getConf().getOpProps();
    if (props != null) {
        Properties target = aliasPartnDesc.getProperties();
        target.putAll(props);
    }
    plan.getAliasToPartnInfo().put(alias_id, aliasPartnDesc);
    long sizeNeeded = Integer.MAX_VALUE;
    int fileLimit = -1;
    if (parseCtx.getGlobalLimitCtx().isEnable()) {
        if (isAcidTable) {
            LOG.info("Skip Global Limit optimization for ACID table");
            parseCtx.getGlobalLimitCtx().disableOpt();
        } else {
            long sizePerRow = HiveConf.getLongVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITMAXROWSIZE);
            sizeNeeded = (parseCtx.getGlobalLimitCtx().getGlobalOffset() + parseCtx.getGlobalLimitCtx().getGlobalLimit()) * sizePerRow;
            // For the optimization that reduces the number of input files, we limit the
            // number of files allowed. If more than that many files would have to be
            // selected, we skip the optimization, since having too many files as
            // inputs can cause unpredictable latency and is not necessarily cheaper.
            fileLimit = HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITOPTLIMITFILE);
            if (sizePerRow <= 0 || fileLimit <= 0) {
                LOG.info("Skip optimization to reduce input size of 'limit'");
                parseCtx.getGlobalLimitCtx().disableOpt();
            } else if (parts.isEmpty()) {
                LOG.info("Empty input: skip limit optimization");
            } else {
                LOG.info("Try to reduce input size for 'limit' " + "sizeNeeded: " + sizeNeeded + "  file limit : " + fileLimit);
            }
        }
    }
    boolean isFirstPart = true;
    boolean emptyInput = true;
    boolean singlePartition = (parts.size() == 1);
    // Track the dependencies for the view. Consider a query like: select * from V;
    // where V is a view of the form: select * from T
    // The dependencies should include V at depth 0, and T at depth 1 (inferred).
    Map<String, ReadEntity> viewToInput = parseCtx.getViewAliasToInput();
    ReadEntity parentViewInfo = PlanUtils.getParentViewInfo(alias_id, viewToInput);
    // The table should also be considered a part of inputs, even if the table is a
    // partitioned table and whether any partition is selected or not
    //This read entity is a direct read entity and not an indirect read (that is when
    // this is being read because it is a dependency of a view).
    boolean isDirectRead = (parentViewInfo == null);
    TableDesc tblDesc = null;
    boolean initTableDesc = false;
    PlanUtils.addPartitionInputs(parts, inputs, parentViewInfo, isDirectRead);
    for (Partition part : parts) {
        // Later the properties have to come from the partition as opposed
        // to from the table in order to support versioning.
        Path[] paths = null;
        SampleDesc sampleDescr = parseCtx.getOpToSamplePruner().get(tsOp);
        // Lookup list bucketing pruner
        Map<String, ExprNodeDesc> partToPruner = parseCtx.getOpToPartToSkewedPruner().get(tsOp);
        ExprNodeDesc listBucketingPruner = (partToPruner != null) ? partToPruner.get(part.getName()) : null;
        if (sampleDescr != null) {
            assert (listBucketingPruner == null) : "Sampling and list bucketing can't coexist.";
            paths = SamplePruner.prune(part, sampleDescr);
            parseCtx.getGlobalLimitCtx().disableOpt();
        } else if (listBucketingPruner != null) {
            assert (sampleDescr == null) : "Sampling and list bucketing can't coexist.";
            /* Use list bucketing pruner's path. */
            paths = ListBucketingPruner.prune(parseCtx, part, listBucketingPruner);
        } else {
            // If the global limit optimization is enabled, prune the input files by size;
            // when the pruned files do not contain enough size, we change to normal mode.
            if (parseCtx.getGlobalLimitCtx().isEnable()) {
                if (isFirstPart) {
                    long sizeLeft = sizeNeeded;
                    ArrayList<Path> retPathList = new ArrayList<Path>();
                    SamplePruner.LimitPruneRetStatus status = SamplePruner.limitPrune(part, sizeLeft, fileLimit, retPathList);
                    if (status.equals(SamplePruner.LimitPruneRetStatus.NoFile)) {
                        continue;
                    } else if (status.equals(SamplePruner.LimitPruneRetStatus.NotQualify)) {
                        LOG.info("Use full input -- first " + fileLimit + " files are more than " + sizeNeeded + " bytes");
                        parseCtx.getGlobalLimitCtx().disableOpt();
                    } else {
                        emptyInput = false;
                        paths = new Path[retPathList.size()];
                        int index = 0;
                        for (Path path : retPathList) {
                            paths[index++] = path;
                        }
                        if (status.equals(SamplePruner.LimitPruneRetStatus.NeedAllFiles) && singlePartition) {
                            // if all files are needed to meet the size limit, we disable
                            // optimization. It usually happens for empty table/partition or
                            // table/partition with only one file. By disabling this
                            // optimization, we can avoid retrying the query if there is
                            // not sufficient rows.
                            parseCtx.getGlobalLimitCtx().disableOpt();
                        }
                    }
                    isFirstPart = false;
                } else {
                    paths = new Path[0];
                }
            }
            if (!parseCtx.getGlobalLimitCtx().isEnable()) {
                paths = part.getPath();
            }
        }
        // is it a partitioned table ?
        if (!part.getTable().isPartitioned()) {
            assert (tblDir == null);
            tblDir = paths[0];
            if (!initTableDesc) {
                tblDesc = Utilities.getTableDesc(part.getTable());
                initTableDesc = true;
            }
        } else if (tblDesc == null) {
            if (!initTableDesc) {
                tblDesc = Utilities.getTableDesc(part.getTable());
                initTableDesc = true;
            }
        }
        if (props != null) {
            Properties target = tblDesc.getProperties();
            target.putAll(props);
        }
        for (Path p : paths) {
            if (p == null) {
                continue;
            }
            String path = p.toString();
            if (LOG.isDebugEnabled()) {
                LOG.debug("Adding " + path + " of table" + alias_id);
            }
            partDir.add(p);
            try {
                if (part.getTable().isPartitioned()) {
                    partDesc.add(Utilities.getPartitionDesc(part));
                } else {
                    partDesc.add(Utilities.getPartitionDescFromTableDesc(tblDesc, part, false));
                }
            } catch (HiveException e) {
                LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
                throw new SemanticException(e.getMessage(), e);
            }
        }
    }
    if (emptyInput) {
        parseCtx.getGlobalLimitCtx().disableOpt();
    }
    Utilities.addSchemaEvolutionToTableScanOperator(partsList.getSourceTable(), tsOp);
    Iterator<Path> iterPath = partDir.iterator();
    Iterator<PartitionDesc> iterPartnDesc = partDesc.iterator();
    if (!local) {
        while (iterPath.hasNext()) {
            assert iterPartnDesc.hasNext();
            Path path = iterPath.next();
            PartitionDesc prtDesc = iterPartnDesc.next();
            // Add the path to alias mapping
            plan.addPathToAlias(path, alias_id);
            plan.addPathToPartitionInfo(path, prtDesc);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Information added for path " + path);
            }
        }
        assert plan.getAliasToWork().get(alias_id) == null;
        plan.getAliasToWork().put(alias_id, tsOp);
    } else {
        // populate local work if needed
        MapredLocalWork localPlan = plan.getMapRedLocalWork();
        if (localPlan == null) {
            localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
        }
        assert localPlan.getAliasToWork().get(alias_id) == null;
        assert localPlan.getAliasToFetchWork().get(alias_id) == null;
        localPlan.getAliasToWork().put(alias_id, tsOp);
        if (tblDir == null) {
            tblDesc = Utilities.getTableDesc(partsList.getSourceTable());
            localPlan.getAliasToFetchWork().put(alias_id, new FetchWork(partDir, partDesc, tblDesc));
        } else {
            localPlan.getAliasToFetchWork().put(alias_id, new FetchWork(tblDir, tblDesc));
        }
        plan.setMapRedLocalWork(localPlan);
    }
}
Also used : HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ArrayList(java.util.ArrayList) Properties(java.util.Properties) LinkedHashMap(java.util.LinkedHashMap) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) SampleDesc(org.apache.hadoop.hive.ql.plan.FilterDesc.SampleDesc) ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
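
The local-work branch at the end of setMapWork is where FetchWork enters MapredLocalWork: each alias gets both its operator tree and a FetchWork describing where its rows come from. A condensed sketch of that branch, assuming plan, alias_id, tsOp, tblDir, tblDesc, partDir and partDesc are in scope as above.

// Sketch only: register a per-alias FetchWork in the plan's local work.
MapredLocalWork localPlan = plan.getMapRedLocalWork();
if (localPlan == null) {
    localPlan = new MapredLocalWork(
        new LinkedHashMap<String, Operator<? extends OperatorDesc>>(),
        new LinkedHashMap<String, FetchWork>());
}
localPlan.getAliasToWork().put(alias_id, tsOp);
FetchWork fw = (tblDir == null)
    ? new FetchWork(partDir, partDesc, tblDesc)   // partitioned: per-partition dirs and descriptors
    : new FetchWork(tblDir, tblDesc);             // unpartitioned: single table directory
// (the full method also recomputes tblDesc from the source table in the partitioned case)
localPlan.getAliasToFetchWork().put(alias_id, fw);
plan.setMapRedLocalWork(localPlan);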

Example 5 with FetchWork

use of org.apache.hadoop.hive.ql.plan.FetchWork in project hive by apache.

the class SimpleFetchOptimizer method optimize.

// returns non-null FetchTask instance when succeeded
@SuppressWarnings("unchecked")
private FetchTask optimize(ParseContext pctx, String alias, TableScanOperator source) throws Exception {
    String mode = HiveConf.getVar(pctx.getConf(), HiveConf.ConfVars.HIVEFETCHTASKCONVERSION);
    boolean aggressive = "more".equals(mode);
    final int limit = pctx.getQueryProperties().getOuterQueryLimit();
    // limit = 0 means that we do not need any task.
    if (limit == 0) {
        return null;
    }
    FetchData fetch = checkTree(aggressive, pctx, alias, source);
    if (fetch != null && checkThreshold(fetch, limit, pctx)) {
        FetchWork fetchWork = fetch.convertToWork();
        FetchTask fetchTask = (FetchTask) TaskFactory.get(fetchWork, pctx.getConf());
        fetchWork.setSink(fetch.completed(pctx, fetchWork));
        fetchWork.setSource(source);
        fetchWork.setLimit(limit);
        return fetchTask;
    }
    return null;
}
Also used : FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask)
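
Once checkTree and checkThreshold accept the plan, the FetchWork produced by convertToWork is wrapped in a FetchTask and configured with a sink, a source and the query limit. A compact sketch of those calls, in the same order as optimize above, assuming fetch, pctx, source and limit are in scope as in that method.

// Sketch only: create the FetchTask and configure the converted FetchWork.
FetchWork fetchWork = fetch.convertToWork();          // plan fragment for the direct fetch
FetchTask fetchTask = (FetchTask) TaskFactory.get(fetchWork, pctx.getConf());
fetchWork.setSink(fetch.completed(pctx, fetchWork));  // sink operator that receives the fetched rows
fetchWork.setSource(source);                          // the TableScanOperator the fetch replaces
fetchWork.setLimit(limit);                            // outer-query LIMIT pushed into the fetch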

Aggregations

FetchWork (org.apache.hadoop.hive.ql.plan.FetchWork): 15
Path (org.apache.hadoop.fs.Path): 9
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 7
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 7
ArrayList (java.util.ArrayList): 6
HashMap (java.util.HashMap): 5
LinkedHashMap (java.util.LinkedHashMap): 5
Map (java.util.Map): 5
FetchTask (org.apache.hadoop.hive.ql.exec.FetchTask): 5
MapredLocalWork (org.apache.hadoop.hive.ql.plan.MapredLocalWork): 5
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 5
Operator (org.apache.hadoop.hive.ql.exec.Operator): 4
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 4
HiveConf (org.apache.hadoop.hive.conf.HiveConf): 3
FetchOperator (org.apache.hadoop.hive.ql.exec.FetchOperator): 3
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 3
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 3
LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc): 3
LazySimpleSerDe (org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe): 3
IOException (java.io.IOException): 2