use of org.apache.hadoop.hive.ql.parse.SplitSample in project hive by apache.
the class GlobalLimitOptimizer method transform.
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
Context ctx = pctx.getContext();
Map<String, TableScanOperator> topOps = pctx.getTopOps();
GlobalLimitCtx globalLimitCtx = pctx.getGlobalLimitCtx();
Map<String, SplitSample> nameToSplitSample = pctx.getNameToSplitSample();
// Determine whether the query qualifies for reducing input size for LIMIT:
// it only qualifies when there is exactly one top operator, no TRANSFORM or
// UDTF is involved, and no block sampling is used.
if (ctx.getTryCount() == 0 && topOps.size() == 1 && !globalLimitCtx.ifHasTransformOrUDTF() && nameToSplitSample.isEmpty()) {
// Here we recursively check:
// 1. whether there is exactly one LIMIT in the query
// 2. whether there is no aggregation, group-by, distinct, sort by,
// distribute by, or table sampling in any of the sub-queries.
// The query only qualifies if both conditions are satisfied.
//
// Example qualified queries:
// CREATE TABLE ... AS SELECT col1, col2 FROM tbl LIMIT ..
// INSERT OVERWRITE TABLE ... SELECT col1, hash(col2), split(col1)
// FROM ... LIMIT...
// SELECT * FROM (SELECT col1 AS col2 FROM (SELECT * FROM ...) t1 LIMIT ...) t2;
//
TableScanOperator ts = topOps.values().iterator().next();
LimitOperator tempGlobalLimit = checkQbpForGlobalLimit(ts);
// the query qualifies for the optimization
if (tempGlobalLimit != null) {
LimitDesc tempGlobalLimitDesc = tempGlobalLimit.getConf();
Table tab = ts.getConf().getTableMetadata();
Set<FilterOperator> filterOps = OperatorUtils.findOperators(ts, FilterOperator.class);
if (!tab.isPartitioned()) {
if (filterOps.size() == 0) {
Integer tempOffset = tempGlobalLimitDesc.getOffset();
globalLimitCtx.enableOpt(tempGlobalLimitDesc.getLimit(), (tempOffset == null) ? 0 : tempOffset);
}
} else {
// check if the pruner only contains partition columns
if (onlyContainsPartnCols(tab, filterOps)) {
String alias = (String) topOps.keySet().toArray()[0];
PrunedPartitionList partsList = pctx.getPrunedPartitions(alias, ts);
// If there is any unknown partition, a map-reduce job is still needed for
// the filter to prune correctly.
if (!partsList.hasUnknownPartitions()) {
Integer tempOffset = tempGlobalLimitDesc.getOffset();
globalLimitCtx.enableOpt(tempGlobalLimitDesc.getLimit(), (tempOffset == null) ? 0 : tempOffset);
}
}
}
if (globalLimitCtx.isEnable()) {
LOG.info("Qualify the optimize that reduces input size for 'offset' for offset " + globalLimitCtx.getGlobalOffset());
LOG.info("Qualify the optimize that reduces input size for 'limit' for limit " + globalLimitCtx.getGlobalLimit());
}
}
}
return pctx;
}
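The recursive check referenced in the comments above is performed by checkQbpForGlobalLimit(ts), whose body is not shown here. As a rough illustration of its shape only, the sketch below walks a single-child operator chain below the TableScanOperator and returns the lone LimitOperator when nothing but scan/select/filter/file-sink operators appear along the way. The findSoleLimit name and its simplifications (no branching plans, no inspection of the query block itself) are assumptions of this sketch, not Hive's actual implementation.

// A minimal sketch only: the real GlobalLimitOptimizer.checkQbpForGlobalLimit
// also inspects the query block; this version just walks the operator chain.
import java.util.List;

import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.LimitOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;

public class GlobalLimitCheckSketch {
  // Returns the single LimitOperator below the scan if the chain contains only
  // TS/SEL/FIL/LIM/FS operators; returns null otherwise.
  static LimitOperator findSoleLimit(Operator<? extends OperatorDesc> op) {
    LimitOperator found = null;
    while (op != null) {
      if (op instanceof LimitOperator) {
        if (found != null) {
          return null;                       // more than one LIMIT: not eligible
        }
        found = (LimitOperator) op;
      } else if (!(op instanceof TableScanOperator || op instanceof SelectOperator
          || op instanceof FilterOperator || op instanceof FileSinkOperator)) {
        return null;                         // group-by, sort, sampling, etc.: not eligible
      }
      List<Operator<? extends OperatorDesc>> children = op.getChildOperators();
      if (children == null || children.isEmpty()) {
        break;
      }
      if (children.size() != 1) {
        return null;                         // branching plans are out of scope for this sketch
      }
      op = children.get(0);
    }
    return found;
  }
}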
use of org.apache.hadoop.hive.ql.parse.SplitSample in project hive by apache.
the class CombineHiveInputFormat method sampleSplits.
/**
* This function is used to sample inputs for clauses like "TABLESAMPLE(1 PERCENT)"
*
* First, splits are grouped by the alias they serve. If a split serves more than
* one alias, or serves an alias that is not sampled, it is added directly to the
* returned list.
* Then we find a list of exclusive splits for every alias to be sampled.
* For each alias, we start at position seedNumber % totalNumber and keep adding
* splits until the accumulated size reaches the target percentage.
* @param splits the candidate splits
* @return the sampled splits
*/
private List<CombineFileSplit> sampleSplits(List<CombineFileSplit> splits) {
HashMap<String, SplitSample> nameToSamples = mrwork.getNameToSplitSample();
List<CombineFileSplit> retLists = new ArrayList<CombineFileSplit>();
Map<String, ArrayList<CombineFileSplit>> aliasToSplitList = new HashMap<String, ArrayList<CombineFileSplit>>();
Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
Map<Path, ArrayList<String>> pathToAliasesNoScheme = removeScheme(pathToAliases);
// Group each split under the single sampled alias it exclusively serves;
// splits that do not qualify go straight to the result list.
for (CombineFileSplit split : splits) {
String alias = null;
for (Path path : split.getPaths()) {
boolean schemeless = path.toUri().getScheme() == null;
List<String> l = HiveFileFormatUtils.doGetAliasesFromPath(schemeless ? pathToAliasesNoScheme : pathToAliases, path);
// A path disqualifies its split from sampling if:
// 1. it serves more than one alias
// 2. the alias it serves is not sampled
// 3. it serves a different alias than another path of the same split
if (l.size() != 1 || !nameToSamples.containsKey(l.get(0)) || (alias != null && !l.get(0).equals(alias))) {
alias = null;
break;
}
alias = l.get(0);
}
if (alias != null) {
// The split exclusively serves a sampled alias; add it to that alias's split list.
if (!aliasToSplitList.containsKey(alias)) {
aliasToSplitList.put(alias, new ArrayList<CombineFileSplit>());
}
aliasToSplitList.get(alias).add(split);
} else {
// The split doesn't exclusively serve one alias
retLists.add(split);
}
}
// For every sampled alias, pick splits starting at seedNum % size until the
// accumulated size reaches the alias's target size.
for (Map.Entry<String, ArrayList<CombineFileSplit>> entry : aliasToSplitList.entrySet()) {
ArrayList<CombineFileSplit> splitList = entry.getValue();
long totalSize = 0;
for (CombineFileSplit split : splitList) {
totalSize += split.getLength();
}
SplitSample splitSample = nameToSamples.get(entry.getKey());
long targetSize = splitSample.getTargetSize(totalSize);
int startIndex = splitSample.getSeedNum() % splitList.size();
long size = 0;
for (int i = 0; i < splitList.size(); i++) {
CombineFileSplit split = splitList.get((startIndex + i) % splitList.size());
retLists.add(split);
long splitLength = split.getLength();
if (size + splitLength >= targetSize) {
LOG.info("Sample alias " + entry.getKey() + " using " + (i + 1) + " splits");
if (size + splitLength > targetSize) {
// shrink the last split so the sampled bytes do not overshoot the target
((InputSplitShim) split).shrinkSplit(targetSize - size);
}
break;
}
size += splitLength;
}
}
return retLists;
}
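To make the numbers in the sampling loop above concrete, here is a small hypothetical driver. It assumes SplitSample's percent/seed constructor (new SplitSample(1.0, 42) standing in for a TABLESAMPLE(1 PERCENT) clause with seed 42); getTargetSize and getSeedNum are the same accessors used in sampleSplits. The 10 GB total and the 80-split count are made-up figures for illustration only.

// Hypothetical driver, not Hive code: the percent/seed constructor of
// org.apache.hadoop.hive.ql.parse.SplitSample is assumed here; the figures are made up.
import org.apache.hadoop.hive.ql.parse.SplitSample;

public class SplitSampleTargetDemo {
  public static void main(String[] args) {
    long totalSize = 10L * 1024 * 1024 * 1024;          // 10 GB of splits for one alias
    SplitSample sample = new SplitSample(1.0, 42);       // TABLESAMPLE(1 PERCENT), seed 42
    long targetSize = sample.getTargetSize(totalSize);   // roughly 1% of the total
    int numSplits = 80;                                  // pretend the alias has 80 splits
    int startIndex = sample.getSeedNum() % numSplits;    // rotation start, as in sampleSplits
    System.out.println("target=" + targetSize + " bytes, start at split " + startIndex);
  }
}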
use of org.apache.hadoop.hive.ql.parse.SplitSample in project hive by apache.
the class SimpleFetchOptimizer method checkTree.
// all we can handle is LimitOperator, FilterOperator, SelectOperator and a final FileSinkOperator
//
// for non-aggressive mode (minimal):
// 1. sampling is not allowed
// 2. for a partitioned table, all filters must target only partition columns
// 3. SelectOperator may use only simple casts/column accesses
private FetchData checkTree(boolean aggressive, ParseContext pctx, String alias, TableScanOperator ts) throws HiveException {
SplitSample splitSample = pctx.getNameToSplitSample().get(alias);
if (!aggressive && splitSample != null) {
return null;
}
if (!aggressive && ts.getConf().getTableSample() != null) {
return null;
}
Table table = ts.getConf().getTableMetadata();
if (table == null) {
return null;
}
ReadEntity parent = PlanUtils.getParentViewInfo(alias, pctx.getViewAliasToInput());
if (!table.isPartitioned()) {
FetchData fetch = new FetchData(ts, parent, table, splitSample);
return checkOperators(fetch, aggressive, false);
}
boolean bypassFilter = false;
if (HiveConf.getBoolVar(pctx.getConf(), HiveConf.ConfVars.HIVEOPTPPD)) {
ExprNodeDesc pruner = pctx.getOpToPartPruner().get(ts);
if (PartitionPruner.onlyContainsPartnCols(table, pruner)) {
bypassFilter = !pctx.getPrunedPartitions(alias, ts).hasUnknownPartitions();
}
}
if (!aggressive && !bypassFilter) {
return null;
}
PrunedPartitionList partitions = pctx.getPrunedPartitions(alias, ts);
FetchData fetch = new FetchData(ts, parent, table, partitions, splitSample, bypassFilter);
return checkOperators(fetch, aggressive, bypassFilter);
}
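The "filters only reference partition columns" condition above is decided by PartitionPruner.onlyContainsPartnCols(table, pruner). The sketch below illustrates the general idea of such a test by recursing over the expression tree; the real check examines the expression (including function calls) in more detail, and the onlyPartitionColumns name is an assumption of this sketch, not Hive's API.

// Simplified sketch only: the real test is PartitionPruner.onlyContainsPartnCols(table, pruner).
import java.util.List;

import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;

public class PartnColsOnlySketch {
  // Returns true if every column referenced by expr is a partition key of table.
  static boolean onlyPartitionColumns(Table table, ExprNodeDesc expr) {
    if (expr == null) {
      return true;                                   // no filter at all
    }
    if (expr instanceof ExprNodeColumnDesc) {
      String col = ((ExprNodeColumnDesc) expr).getColumn();
      for (FieldSchema partKey : table.getPartitionKeys()) {
        if (partKey.getName().equalsIgnoreCase(col)) {
          return true;
        }
      }
      return false;                                  // references a non-partition column
    }
    List<ExprNodeDesc> children = expr.getChildren();
    if (children != null) {
      for (ExprNodeDesc child : children) {
        if (!onlyPartitionColumns(table, child)) {
          return false;
        }
      }
    }
    return true;                                     // constants and qualifying sub-expressions
  }
}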