use of org.apache.hadoop.hive.ql.parse.SplitSample in project hive by apache.
the class GlobalLimitOptimizer method transform.
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
Context ctx = pctx.getContext();
Map<String, TableScanOperator> topOps = pctx.getTopOps();
GlobalLimitCtx globalLimitCtx = pctx.getGlobalLimitCtx();
Map<String, SplitSample> nameToSplitSample = pctx.getNameToSplitSample();
// Determine whether the query qualifies for reducing input size for LIMIT:
// it only qualifies when there is exactly one top operator, no TRANSFORM or
// UDTF is involved, and no block sampling is used.
if (ctx.getTryCount() == 0 && topOps.size() == 1 && !globalLimitCtx.ifHasTransformOrUDTF() && nameToSplitSample.isEmpty()) {
// Here we recursively check:
// 1. whether there is exactly one LIMIT in the query
// 2. whether there is no aggregation, group-by, distinct, sort by,
// distribute by, or table sampling in any of the sub-queries.
// The query only qualifies if both conditions are satisfied.
//
// Example qualified queries:
// CREATE TABLE ... AS SELECT col1, col2 FROM tbl LIMIT ..
// INSERT OVERWRITE TABLE ... SELECT col1, hash(col2), split(col1)
// FROM ... LIMIT...
// SELECT * FROM (SELECT col1 AS col2 FROM (SELECT * FROM ...) t1 LIMIT ...) t2;
//
TableScanOperator ts = topOps.values().iterator().next();
LimitOperator tempGlobalLimit = checkQbpForGlobalLimit(ts);
// the query qualifies for the optimization
if (tempGlobalLimit != null) {
LimitDesc tempGlobalLimitDesc = tempGlobalLimit.getConf();
Table tab = ts.getConf().getTableMetadata();
Set<FilterOperator> filterOps = OperatorUtils.findOperators(ts, FilterOperator.class);
if (!tab.isPartitioned()) {
if (filterOps.size() == 0) {
Integer tempOffset = tempGlobalLimitDesc.getOffset();
globalLimitCtx.enableOpt(tempGlobalLimitDesc.getLimit(), (tempOffset == null) ? 0 : tempOffset);
}
} else {
// check if the pruner only contains partition columns
if (onlyContainsPartnCols(tab, filterOps)) {
String alias = (String) topOps.keySet().toArray()[0];
PrunedPartitionList partsList = pctx.getPrunedPartitions(alias, ts);
// If there is any unknown partition, a map-reduce job is still needed for
// the filter to prune correctly.
if (!partsList.hasUnknownPartitions()) {
Integer tempOffset = tempGlobalLimitDesc.getOffset();
globalLimitCtx.enableOpt(tempGlobalLimitDesc.getLimit(), (tempOffset == null) ? 0 : tempOffset);
}
}
}
if (globalLimitCtx.isEnable()) {
LOG.info("Qualify the optimize that reduces input size for 'offset' for offset " + globalLimitCtx.getGlobalOffset());
LOG.info("Qualify the optimize that reduces input size for 'limit' for limit " + globalLimitCtx.getGlobalLimit());
}
}
}
return pctx;
}
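The recursive check referenced in the comments above is performed by checkQbpForGlobalLimit(ts), whose body is not shown here. As a rough illustration of its shape only, the sketch below walks a single-child operator chain below the TableScanOperator and returns the lone LimitOperator when nothing but scan/select/filter/file-sink operators appear along the way. The findSoleLimit name and its simplifications (no branching plans, no inspection of the query block itself) are assumptions of this sketch, not Hive's actual implementation.

// A minimal sketch only: the real GlobalLimitOptimizer.checkQbpForGlobalLimit
// also inspects the query block; this version just walks the operator chain.
import java.util.List;

import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.LimitOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;

public class GlobalLimitCheckSketch {
  // Returns the single LimitOperator below the scan if the chain contains only
  // TS/SEL/FIL/LIM/FS operators; returns null otherwise.
  static LimitOperator findSoleLimit(Operator<? extends OperatorDesc> op) {
    LimitOperator found = null;
    while (op != null) {
      if (op instanceof LimitOperator) {
        if (found != null) {
          return null;                       // more than one LIMIT: not eligible
        }
        found = (LimitOperator) op;
      } else if (!(op instanceof TableScanOperator || op instanceof SelectOperator
          || op instanceof FilterOperator || op instanceof FileSinkOperator)) {
        return null;                         // group-by, sort, sampling, etc.: not eligible
      }
      List<Operator<? extends OperatorDesc>> children = op.getChildOperators();
      if (children == null || children.isEmpty()) {
        break;
      }
      if (children.size() != 1) {
        return null;                         // branching plans are out of scope for this sketch
      }
      op = children.get(0);
    }
    return found;
  }
}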
use of org.apache.hadoop.hive.ql.parse.SplitSample in project hive by apache.
the class CombineHiveInputFormat method sampleSplits.
/**
* This function is used to sample inputs for clauses like "TABLESAMPLE(1 PERCENT)"
*
* First, splits are grouped by the alias they serve. If a split serves more than
* one alias, or serves an alias that is not sampled, it is added directly to the
* returned list.
* Then we find a list of exclusive splits for every alias to be sampled.
* For each alias, we start at position seedNumber % totalNumber and keep adding
* splits until the accumulated size reaches the target percentage.
* @param splits the candidate splits
* @return the sampled splits
*/
private List<CombineFileSplit> sampleSplits(List<CombineFileSplit> splits) {
HashMap<String, SplitSample> nameToSamples = mrwork.getNameToSplitSample();
List<CombineFileSplit> retLists = new ArrayList<CombineFileSplit>();
Map<String, ArrayList<CombineFileSplit>> aliasToSplitList = new HashMap<String, ArrayList<CombineFileSplit>>();
Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
Map<Path, ArrayList<String>> pathToAliasesNoScheme = removeScheme(pathToAliases);
// Group each split under the single sampled alias it exclusively serves;
// splits that do not qualify go straight to the result list.
for (CombineFileSplit split : splits) {
String alias = null;
for (Path path : split.getPaths()) {
boolean schemeless = path.toUri().getScheme() == null;
List<String> l = HiveFileFormatUtils.doGetAliasesFromPath(schemeless ? pathToAliasesNoScheme : pathToAliases, path);
// A path disqualifies its split from sampling if:
// 1. it serves more than one alias
// 2. the alias it serves is not sampled
// 3. it serves a different alias than another path of the same split
if (l.size() != 1 || !nameToSamples.containsKey(l.get(0)) || (alias != null && !l.get(0).equals(alias))) {
alias = null;
break;
}
alias = l.get(0);
}
if (alias != null) {
// The split exclusively serves a sampled alias; add it to that alias's split list.
if (!aliasToSplitList.containsKey(alias)) {
aliasToSplitList.put(alias, new ArrayList<CombineFileSplit>());
}
aliasToSplitList.get(alias).add(split);
} else {
// The split doesn't exclusively serve one alias
retLists.add(split);
}
}
// For every sampled alias, pick splits starting at seedNum % size until the
// accumulated size reaches the alias's target size.
for (Map.Entry<String, ArrayList<CombineFileSplit>> entry : aliasToSplitList.entrySet()) {
ArrayList<CombineFileSplit> splitList = entry.getValue();
long totalSize = 0;
for (CombineFileSplit split : splitList) {
totalSize += split.getLength();
}
SplitSample splitSample = nameToSamples.get(entry.getKey());
long targetSize = splitSample.getTargetSize(totalSize);
int startIndex = splitSample.getSeedNum() % splitList.size();
long size = 0;
for (int i = 0; i < splitList.size(); i++) {
CombineFileSplit split = splitList.get((startIndex + i) % splitList.size());
retLists.add(split);
long splitLength = split.getLength();
if (size + splitLength >= targetSize) {
LOG.info("Sample alias " + entry.getKey() + " using " + (i + 1) + " splits");
if (size + splitLength > targetSize) {
// shrink the last split so the sampled bytes do not overshoot the target
((InputSplitShim) split).shrinkSplit(targetSize - size);
}
break;
}
size += splitLength;
}
}
return retLists;
}
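To make the numbers in the sampling loop above concrete, here is a small hypothetical driver. It assumes SplitSample's percent/seed constructor (new SplitSample(1.0, 42) standing in for a TABLESAMPLE(1 PERCENT) clause with seed 42); getTargetSize and getSeedNum are the same accessors used in sampleSplits. The 10 GB total and the 80-split count are made-up figures for illustration only.

// Hypothetical driver, not Hive code: the percent/seed constructor of
// org.apache.hadoop.hive.ql.parse.SplitSample is assumed here; the figures are made up.
import org.apache.hadoop.hive.ql.parse.SplitSample;

public class SplitSampleTargetDemo {
  public static void main(String[] args) {
    long totalSize = 10L * 1024 * 1024 * 1024;          // 10 GB of splits for one alias
    SplitSample sample = new SplitSample(1.0, 42);       // TABLESAMPLE(1 PERCENT), seed 42
    long targetSize = sample.getTargetSize(totalSize);   // roughly 1% of the total
    int numSplits = 80;                                  // pretend the alias has 80 splits
    int startIndex = sample.getSeedNum() % numSplits;    // rotation start, as in sampleSplits
    System.out.println("target=" + targetSize + " bytes, start at split " + startIndex);
  }
}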
use of org.apache.hadoop.hive.ql.parse.SplitSample in project hive by apache.
the class SimpleFetchOptimizer method checkTree.
// all we can handle is LimitOperator, FilterOperator, SelectOperator and a final FileSinkOperator
//
// for non-aggressive mode (minimal):
// 1. sampling is not allowed
// 2. for a partitioned table, all filters must target only partition columns
// 3. SelectOperator may use only simple casts/column accesses
private FetchData checkTree(boolean aggressive, ParseContext pctx, String alias, TableScanOperator ts) throws HiveException {
SplitSample splitSample = pctx.getNameToSplitSample().get(alias);
if (!aggressive && splitSample != null) {
return null;
}
if (!aggressive && ts.getConf().getTableSample() != null) {
return null;
}
Table table = ts.getConf().getTableMetadata();
if (table == null) {
return null;
}
ReadEntity parent = PlanUtils.getParentViewInfo(alias, pctx.getViewAliasToInput());
if (!table.isPartitioned()) {
FetchData fetch = new FetchData(ts, parent, table, splitSample);
return checkOperators(fetch, aggressive, false);
}
boolean bypassFilter = false;
if (HiveConf.getBoolVar(pctx.getConf(), HiveConf.ConfVars.HIVEOPTPPD)) {
ExprNodeDesc pruner = pctx.getOpToPartPruner().get(ts);
if (PartitionPruner.onlyContainsPartnCols(table, pruner)) {
bypassFilter = !pctx.getPrunedPartitions(alias, ts).hasUnknownPartitions();
}
}
if (!aggressive && !bypassFilter) {
return null;
}
PrunedPartitionList partitions = pctx.getPrunedPartitions(alias, ts);
FetchData fetch = new FetchData(ts, parent, table, partitions, splitSample, bypassFilter);
return checkOperators(fetch, aggressive, bypassFilter);
}
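The "filters only reference partition columns" condition above is decided by PartitionPruner.onlyContainsPartnCols(table, pruner). The sketch below illustrates the general idea of such a test by recursing over the expression tree; the real check examines the expression (including function calls) in more detail, and the onlyPartitionColumns name is an assumption of this sketch, not Hive's API.

// Simplified sketch only: the real test is PartitionPruner.onlyContainsPartnCols(table, pruner).
import java.util.List;

import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;

public class PartnColsOnlySketch {
  // Returns true if every column referenced by expr is a partition key of table.
  static boolean onlyPartitionColumns(Table table, ExprNodeDesc expr) {
    if (expr == null) {
      return true;                                   // no filter at all
    }
    if (expr instanceof ExprNodeColumnDesc) {
      String col = ((ExprNodeColumnDesc) expr).getColumn();
      for (FieldSchema partKey : table.getPartitionKeys()) {
        if (partKey.getName().equalsIgnoreCase(col)) {
          return true;
        }
      }
      return false;                                  // references a non-partition column
    }
    List<ExprNodeDesc> children = expr.getChildren();
    if (children != null) {
      for (ExprNodeDesc child : children) {
        if (!onlyPartitionColumns(table, child)) {
          return false;
        }
      }
    }
    return true;                                     // constants and qualifying sub-expressions
  }
}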