
Example 16 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From the class SharedWorkOptimizer, method extractSharedOptimizationInfoForRoot.

private static SharedResult extractSharedOptimizationInfoForRoot(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, TableScanOperator retainableTsOp, TableScanOperator discardableTsOp) throws SemanticException {
    LinkedHashSet<Operator<?>> retainableOps = new LinkedHashSet<>();
    LinkedHashSet<Operator<?>> discardableOps = new LinkedHashSet<>();
    Set<Operator<?>> discardableInputOps = new HashSet<>();
    long dataSize = 0L;
    long maxDataSize = 0L;
    retainableOps.add(retainableTsOp);
    discardableOps.add(discardableTsOp);
    Operator<?> equalOp1 = retainableTsOp;
    Operator<?> equalOp2 = discardableTsOp;
    if (equalOp1.getNumChild() > 1 || equalOp2.getNumChild() > 1) {
        // TODO: Support checking multiple child operators to merge further.
        discardableInputOps.addAll(gatherDPPBranchOps(pctx, optimizerCache, discardableOps));
        return new SharedResult(retainableOps, discardableOps, discardableInputOps, dataSize, maxDataSize);
    }
    Operator<?> currentOp1 = retainableTsOp.getChildOperators().get(0);
    Operator<?> currentOp2 = discardableTsOp.getChildOperators().get(0);
    // Special treatment for Filter operator that ignores the DPP predicates
    if (currentOp1 instanceof FilterOperator && currentOp2 instanceof FilterOperator) {
        boolean equalFilters = false;
        FilterDesc op1Conf = ((FilterOperator) currentOp1).getConf();
        FilterDesc op2Conf = ((FilterOperator) currentOp2).getConf();
        if (op1Conf.getIsSamplingPred() == op2Conf.getIsSamplingPred() && StringUtils.equals(op1Conf.getSampleDescExpr(), op2Conf.getSampleDescExpr())) {
            Multiset<String> conjsOp1String = extractConjsIgnoringDPPPreds(op1Conf.getPredicate());
            Multiset<String> conjsOp2String = extractConjsIgnoringDPPPreds(op2Conf.getPredicate());
            if (conjsOp1String.equals(conjsOp2String)) {
                equalFilters = true;
            }
        }
        if (equalFilters) {
            equalOp1 = currentOp1;
            equalOp2 = currentOp2;
            retainableOps.add(equalOp1);
            discardableOps.add(equalOp2);
            if (currentOp1.getChildOperators().size() > 1 || currentOp2.getChildOperators().size() > 1) {
                // TODO: Support checking multiple child operators to merge further.
                discardableInputOps.addAll(gatherDPPBranchOps(pctx, optimizerCache, discardableOps));
                discardableInputOps.addAll(gatherDPPBranchOps(pctx, optimizerCache, retainableOps, discardableInputOps));
                return new SharedResult(retainableOps, discardableOps, discardableInputOps, dataSize, maxDataSize);
            }
            currentOp1 = currentOp1.getChildOperators().get(0);
            currentOp2 = currentOp2.getChildOperators().get(0);
        } else {
            // Bail out
            discardableInputOps.addAll(gatherDPPBranchOps(pctx, optimizerCache, discardableOps));
            discardableInputOps.addAll(gatherDPPBranchOps(pctx, optimizerCache, retainableOps, discardableInputOps));
            return new SharedResult(retainableOps, discardableOps, discardableInputOps, dataSize, maxDataSize);
        }
    }
    return extractSharedOptimizationInfo(pctx, optimizerCache, equalOp1, equalOp2, currentOp1, currentOp2, retainableOps, discardableOps, discardableInputOps, false);
}
Also used: ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator), MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator), UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator), FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator), AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator), TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), Operator (org.apache.hadoop.hive.ql.exec.Operator), DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator), FilterDesc (org.apache.hadoop.hive.ql.plan.FilterDesc), LinkedHashSet (java.util.LinkedHashSet), HashSet (java.util.HashSet)
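
The merge decision above hinges on comparing the two Filter predicates as multisets of conjuncts, so that conjunct order does not affect the result. Below is a minimal sketch of how such a conjunct extraction can be written against Hive's expression classes; it is illustrative only and, unlike the real extractConjsIgnoringDPPPreds, it does not skip the predicates injected by dynamic partition pruning.

import java.util.List;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;

public final class ConjunctSketch {

    // Flatten an AND tree into the string forms of its conjuncts so two filter
    // predicates can be compared order-insensitively as multisets. This sketch
    // does not ignore dynamic-partition-pruning predicates.
    static Multiset<String> conjuncts(ExprNodeDesc predicate) {
        Multiset<String> result = HashMultiset.create();
        collect(predicate, result);
        return result;
    }

    private static void collect(ExprNodeDesc node, Multiset<String> out) {
        if (node instanceof ExprNodeGenericFuncDesc
                && ((ExprNodeGenericFuncDesc) node).getGenericUDF() instanceof GenericUDFOPAnd) {
            // recurse into each operand of the AND
            List<ExprNodeDesc> children = node.getChildren();
            for (ExprNodeDesc child : children) {
                collect(child, out);
            }
        } else {
            // leaf conjunct: compare by its expression string
            out.add(node.getExprString());
        }
    }
}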

Example 17 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From the class HiveInputFormat, method pushProjectionsAndFilters.

protected void pushProjectionsAndFilters(JobConf jobConf, Class inputFormatClass, Path splitPath, boolean nonNative) {
    Path splitPathWithNoSchema = Path.getPathWithoutSchemeAndAuthority(splitPath);
    if (this.mrwork == null) {
        init(job);
    }
    if (this.mrwork.getPathToAliases() == null) {
        return;
    }
    ArrayList<String> aliases = new ArrayList<String>();
    Iterator<Entry<Path, ArrayList<String>>> iterator = this.mrwork.getPathToAliases().entrySet().iterator();
    Set<Path> splitParentPaths = null;
    int pathsSize = this.mrwork.getPathToAliases().entrySet().size();
    while (iterator.hasNext()) {
        Entry<Path, ArrayList<String>> entry = iterator.next();
        Path key = entry.getKey();
        boolean match;
        if (nonNative) {
            // For non-native tables, we need to do an exact match to avoid
            // HIVE-1903.  (The table location contains no files, and the string
            // representation of its path does not have a trailing slash.)
            match = splitPath.equals(key) || splitPathWithNoSchema.equals(key);
        } else {
            // For native tables, do a prefix match instead: partition
            // directories live under the table location, so the split path may
            // point to something deeper than the table location.
            if (pathsSize > 1) {
                // In such cases, use pre-computed paths for comparison
                if (splitParentPaths == null) {
                    splitParentPaths = new HashSet<>();
                    FileUtils.populateParentPaths(splitParentPaths, splitPath);
                    FileUtils.populateParentPaths(splitParentPaths, splitPathWithNoSchema);
                }
                match = splitParentPaths.contains(key);
            } else {
                match = FileUtils.isPathWithinSubtree(splitPath, key) || FileUtils.isPathWithinSubtree(splitPathWithNoSchema, key);
            }
        }
        if (match) {
            ArrayList<String> list = entry.getValue();
            for (String val : list) {
                aliases.add(val);
            }
        }
    }
    for (String alias : aliases) {
        Operator<? extends OperatorDesc> op = this.mrwork.getAliasToWork().get(alias);
        if (op instanceof TableScanOperator) {
            TableScanOperator ts = (TableScanOperator) op;
            // push down projections.
            ColumnProjectionUtils.appendReadColumns(jobConf, ts.getNeededColumnIDs(), ts.getNeededColumns(), ts.getNeededNestedColumnPaths());
            // push down filters
            pushFilters(jobConf, ts, this.mrwork);
            AcidUtils.setAcidOperationalProperties(job, ts.getConf().isTranscationalTable(), ts.getConf().getAcidOperationalProperties());
            AcidUtils.setValidWriteIdList(job, ts.getConf());
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), ArrayList (java.util.ArrayList), Entry (java.util.Map.Entry)
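
ColumnProjectionUtils.appendReadColumns records the scan's projection in the JobConf so the underlying record readers can skip unneeded columns. Below is a minimal sketch of observing that effect; the configuration-key constant names are assumed from ColumnProjectionUtils and should be checked against your Hive version.

import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.mapred.JobConf;

public final class ProjectionSketch {

    // Push a TableScanOperator's needed columns into a fresh JobConf and print
    // the properties ColumnProjectionUtils writes. The constant names below are
    // an assumption, not taken from the example above.
    static void showProjection(TableScanOperator ts) {
        JobConf conf = new JobConf();
        ColumnProjectionUtils.appendReadColumns(conf,
            ts.getNeededColumnIDs(), ts.getNeededColumns(), ts.getNeededNestedColumnPaths());
        System.out.println(conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
        System.out.println(conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR));
    }
}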

Example 18 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From the class LlapInputFormat, method findTsOp.

static TableScanOperator findTsOp(MapWork mapWork) throws HiveException {
    if (mapWork.getAliasToWork() == null) {
        throw new HiveException("Unexpected - aliasToWork is missing; " + NONVECTOR_SETTING_MESSAGE);
    }
    Iterator<Operator<?>> ops = mapWork.getAliasToWork().values().iterator();
    TableScanOperator tableScanOperator = null;
    while (ops.hasNext()) {
        Operator<?> op = ops.next();
        if (op instanceof TableScanOperator) {
            if (tableScanOperator != null) {
                throw new HiveException("Unexpected - more than one TSOP; " + NONVECTOR_SETTING_MESSAGE);
            }
            tableScanOperator = (TableScanOperator) op;
        }
    }
    return tableScanOperator;
}
Also used: TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), Operator (org.apache.hadoop.hive.ql.exec.Operator), LimitOperator (org.apache.hadoop.hive.ql.exec.LimitOperator), HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)
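
findTsOp encodes the assumption that a vectorizable MapWork contains exactly one table scan: more than one is an error, and none yields null. A hypothetical caller might look like the sketch below; note that findTsOp is package-private, so this only compiles from the same package, and the wrapper method is made up for illustration.

import java.util.Collections;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.MapWork;

public final class TsOpLookupSketch {

    // Hypothetical caller: resolve the single table scan of a MapWork and
    // return its projected column ids. Throws if the MapWork has more than
    // one table scan (propagated from findTsOp).
    static List<Integer> neededColumnIds(MapWork mapWork) throws HiveException {
        TableScanOperator ts = LlapInputFormat.findTsOp(mapWork);
        if (ts == null) {
            // the MapWork contains no TableScanOperator at all
            return Collections.emptyList();
        }
        return ts.getNeededColumnIDs();
    }
}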

Example 19 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From the class ExecDriver, method handleSampling.

private void handleSampling(Context context, MapWork mWork, JobConf job) throws Exception {
    assert mWork.getAliasToWork().keySet().size() == 1;
    String alias = mWork.getAliases().get(0);
    Operator<?> topOp = mWork.getAliasToWork().get(alias);
    PartitionDesc partDesc = mWork.getAliasToPartnInfo().get(alias);
    ArrayList<PartitionDesc> parts = mWork.getPartitionDescs();
    List<Path> inputPaths = mWork.getPaths();
    Path tmpPath = context.getExternalTmpPath(inputPaths.get(0));
    Path partitionFile = new Path(tmpPath, ".partitions");
    ShimLoader.getHadoopShims().setTotalOrderPartitionFile(job, partitionFile);
    PartitionKeySampler sampler = new PartitionKeySampler();
    if (mWork.getSamplingType() == MapWork.SAMPLING_ON_PREV_MR) {
        console.printInfo("Use sampling data created in previous MR");
        // merge sampling data from the previous MR and make partition keys for the total sort
        for (Path path : inputPaths) {
            FileSystem fs = path.getFileSystem(job);
            for (FileStatus status : fs.globStatus(new Path(path, ".sampling*"))) {
                sampler.addSampleFile(status.getPath(), job);
            }
        }
    } else if (mWork.getSamplingType() == MapWork.SAMPLING_ON_START) {
        console.printInfo("Creating sampling data..");
        assert topOp instanceof TableScanOperator;
        TableScanOperator ts = (TableScanOperator) topOp;
        FetchWork fetchWork;
        if (!partDesc.isPartitioned()) {
            assert inputPaths.size() == 1;
            fetchWork = new FetchWork(inputPaths.get(0), partDesc.getTableDesc());
        } else {
            fetchWork = new FetchWork(inputPaths, parts, partDesc.getTableDesc());
        }
        fetchWork.setSource(ts);
        // random sampling
        FetchOperator fetcher = PartitionKeySampler.createSampler(fetchWork, job, ts);
        try {
            ts.initialize(job, new ObjectInspector[] { fetcher.getOutputObjectInspector() });
            OperatorUtils.setChildrenCollector(ts.getChildOperators(), sampler);
            while (fetcher.pushRow()) {
            }
        } finally {
            fetcher.clearFetchContext();
        }
    } else {
        throw new IllegalArgumentException("Invalid sampling type " + mWork.getSamplingType());
    }
    sampler.writePartitionKeys(partitionFile, job);
}
Also used: Path (org.apache.hadoop.fs.Path), TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector), FileStatus (org.apache.hadoop.fs.FileStatus), PartitionKeySampler (org.apache.hadoop.hive.ql.exec.PartitionKeySampler), FileSystem (org.apache.hadoop.fs.FileSystem), FetchWork (org.apache.hadoop.hive.ql.plan.FetchWork), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), FetchOperator (org.apache.hadoop.hive.ql.exec.FetchOperator)
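
The SAMPLING_ON_START branch above follows a fetch-and-push pattern: the TableScanOperator is initialized with the fetcher's output ObjectInspector, a collector is installed on its children, and rows are pushed until the fetcher runs dry. The same core loop, extracted as a standalone helper under the assumption that the surrounding setup (FetchWork, sampler, job) already exists as in the example:

import org.apache.hadoop.hive.ql.exec.FetchOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.mapred.JobConf;

public final class SamplingDrainSketch {

    // Row-pumping core of the SAMPLING_ON_START branch: initialize the scan
    // with the fetcher's output ObjectInspector, then push rows until the
    // fetcher is exhausted. Callers are expected to have set a collector on
    // ts's children (the example uses the PartitionKeySampler).
    static void drainRows(FetchOperator fetcher, TableScanOperator ts, JobConf job)
            throws Exception {
        ts.initialize(job, new ObjectInspector[] { fetcher.getOutputObjectInspector() });
        try {
            while (fetcher.pushRow()) {
                // each pushed row flows through ts into the configured collector
            }
        } finally {
            fetcher.clearFetchContext();
        }
    }
}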

Example 20 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From the class MapredLocalTask, method initializeOperators.

private void initializeOperators(Map<FetchOperator, JobConf> fetchOpJobConfMap) throws HiveException {
    for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : work.getAliasToWork().entrySet()) {
        LOG.debug("initializeOperators: " + entry.getKey() + ", children = " + entry.getValue().getChildOperators());
    }
    // this mapper operator is used to initialize all the operators
    for (Map.Entry<String, FetchWork> entry : work.getAliasToFetchWork().entrySet()) {
        if (entry.getValue() == null) {
            continue;
        }
        JobConf jobClone = new JobConf(job);
        TableScanOperator ts = (TableScanOperator) work.getAliasToWork().get(entry.getKey());
        // push down projections
        ColumnProjectionUtils.appendReadColumns(jobClone, ts.getNeededColumnIDs(), ts.getNeededColumns(), ts.getNeededNestedColumnPaths());
        // push down filters and AS OF (time travel) information
        HiveInputFormat.pushFiltersAndAsOf(jobClone, ts, null);
        AcidUtils.setAcidOperationalProperties(jobClone, ts.getConf().isTranscationalTable(), ts.getConf().getAcidOperationalProperties());
        AcidUtils.setValidWriteIdList(jobClone, ts.getConf());
        // create a fetch operator
        FetchOperator fetchOp = new FetchOperator(entry.getValue(), jobClone);
        fetchOpJobConfMap.put(fetchOp, jobClone);
        fetchOperators.put(entry.getKey(), fetchOp);
        l4j.info("fetchoperator for " + entry.getKey() + " created");
    }
    // initialize all forward operators
    for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) {
        // get the forward op
        String alias = entry.getKey();
        Operator<? extends OperatorDesc> forwardOp = work.getAliasToWork().get(alias);
        // put the exe context into all the operators
        forwardOp.passExecContext(execContext);
        // All the operators need to be initialized before process
        FetchOperator fetchOp = entry.getValue();
        JobConf jobConf = fetchOpJobConfMap.get(fetchOp);
        if (jobConf == null) {
            jobConf = job;
        }
        // initialize the forward operator
        ObjectInspector objectInspector = fetchOp.getOutputObjectInspector();
        forwardOp.initialize(jobConf, new ObjectInspector[] { objectInspector });
        l4j.info("fetchoperator for " + entry.getKey() + " initialized");
    }
}
Also used: FetchOperator (org.apache.hadoop.hive.ql.exec.FetchOperator), TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), Operator (org.apache.hadoop.hive.ql.exec.Operator), ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector), FetchWork (org.apache.hadoop.hive.ql.plan.FetchWork), Map (java.util.Map), HashMap (java.util.HashMap), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc), JobConf (org.apache.hadoop.mapred.JobConf)
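
initializeOperators only wires and initializes the fetch/forward pairs; rows are driven through them in a later phase of MapredLocalTask. The sketch below is a simplified stand-in for that driving loop, not the actual Hive code, and assumes FetchOperator.getNextRow as the pull-side API.

import org.apache.hadoop.hive.ql.exec.FetchOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject;

public final class LocalForwardSketch {

    // Simplified driving loop (a sketch, not MapredLocalTask's actual code):
    // pull rows from an initialized FetchOperator and push each one into the
    // matching forward operator until the fetch source is exhausted.
    static void forwardAll(FetchOperator fetchOp, Operator<? extends OperatorDesc> forwardOp)
            throws Exception {
        InspectableObject row;
        while ((row = fetchOp.getNextRow()) != null) {
            // tag 0: the forward operator has a single upstream source here
            forwardOp.process(row.o, 0);
        }
        forwardOp.close(false);
    }
}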

Aggregations

TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 133 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 52 usages
ArrayList (java.util.ArrayList): 47 usages
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 44 usages
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 36 usages
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 35 usages
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 32 usages
HashMap (java.util.HashMap): 30 usages
Path (org.apache.hadoop.fs.Path): 30 usages
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 29 usages
Table (org.apache.hadoop.hive.ql.metadata.Table): 26 usages
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 25 usages
AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator): 24 usages
DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator): 24 usages
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 23 usages
LinkedHashMap (java.util.LinkedHashMap): 22 usages
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 22 usages
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 22 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 22 usages
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 21 usages