use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class SharedWorkOptimizer method pushFilterToTopOfTableScan.
/**
 * Makes sure the filter expression stored on a table scan is also applied by the operators
 * directly below it.
 *
 * <p>For every direct child of {@code tsOp}:
 * <ul>
 *   <li>a {@link FilterOperator} child gets its predicate replaced by
 *       {@code (tableScanFilter AND existingPredicate)};</li>
 *   <li>any other child gets a new filter operator, carrying a clone of the table scan filter,
 *       inserted between {@code tsOp} and that child. The new operator is registered in
 *       {@code optimizerCache} through {@code putIfWorkExists(newOp, tsOp)}.</li>
 * </ul>
 *
 * <p>If a filter child already carries the same predicate as the table scan, or the table scan
 * filter is an OR whose direct operand is that predicate, the method returns immediately.
 *
 * @param optimizerCache cache that receives every newly created filter operator
 * @param tsOp table scan whose filter expression is pushed on top of its children
 * @throws UDFArgumentException if the AND expression cannot be instantiated
 */
private static void pushFilterToTopOfTableScan(SharedWorkOptimizerCache optimizerCache, TableScanOperator tsOp) throws UDFArgumentException {
  ExprNodeGenericFuncDesc tableScanExprNode = tsOp.getConf().getFilterExpr();
  if (tableScanExprNode == null) {
    // There is no filter on the table scan, hence nothing to push. Without this guard the
    // isSame()/clone() calls below would fail with a NullPointerException.
    return;
  }
  // Work on a snapshot: the loop body replaces entries in tsOp's child list.
  List<Operator<? extends OperatorDesc>> allChildren = Lists.newArrayList(tsOp.getChildOperators());
  for (Operator<? extends OperatorDesc> op : allChildren) {
    if (op instanceof FilterOperator) {
      FilterOperator filterOp = (FilterOperator) op;
      ExprNodeDesc filterExprNode = filterOp.getConf().getPredicate();
      if (tableScanExprNode.isSame(filterExprNode)) {
        // We do not need to do anything
        // NOTE(review): this leaves the whole method, so children that come after this one
        // are not visited - confirm that skipping them is intended.
        return;
      }
      if (tableScanExprNode.getGenericUDF() instanceof GenericUDFOPOr) {
        for (ExprNodeDesc childExprNode : tableScanExprNode.getChildren()) {
          if (childExprNode.isSame(filterExprNode)) {
            // The predicate is already one operand of the OR expression,
            // so probably we pushed previously
            return;
          }
        }
      }
      // Combine the table scan filter (cloned, so the original is not shared) with the
      // predicate the filter operator already had.
      ExprNodeGenericFuncDesc newPred = ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPAnd(), Arrays.<ExprNodeDesc>asList(tableScanExprNode.clone(), filterExprNode));
      filterOp.getConf().setPredicate(newPred);
    } else {
      // Splice a new filter operator between the table scan and this child.
      Operator<FilterDesc> newOp = OperatorFactory.get(tsOp.getCompilationOpContext(), new FilterDesc(tableScanExprNode.clone(), false), new RowSchema(tsOp.getSchema().getSignature()));
      tsOp.replaceChild(op, newOp);
      newOp.getParentOperators().add(tsOp);
      op.replaceParent(tsOp, newOp);
      newOp.getChildOperators().add(op);
      // Add to cache (same group as tsOp)
      optimizerCache.putIfWorkExists(newOp, tsOp);
    }
  }
}
use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class ColumnPrunerProcCtx method handleFilterUnionChildren.
/**
 * If the input filter operator has direct child(ren) which are union operators,
 * and the number of pruned columns of the filter differs from that of the union,
 * creates a select operator between them. The select operator projects exactly the
 * columns of the union's schema that appear in the union's pruned column list.
 *
 * <p>Nothing is done when {@code curOp} is not a {@link FilterOperator}, has no children,
 * or has no (or an empty) pruned column list.
 *
 * @param curOp
 *          The filter operator which need to handle children.
 * @throws SemanticException
 *           if generating the column list of a child fails
 */
public void handleFilterUnionChildren(Operator<? extends OperatorDesc> curOp) throws SemanticException {
  if (curOp.getChildOperators() == null || !(curOp instanceof FilterOperator)) {
    return;
  }
  List<FieldNode> parentPrunList = prunedColLists.get(curOp);
  if (parentPrunList == null || parentPrunList.size() == 0) {
    return;
  }
  // Iterate over a snapshot of the children: the loop body removes the union from curOp and
  // attaches a new select operator to it, and modifying the live list while iterating over it
  // can fail with a ConcurrentModificationException.
  List<Operator<? extends OperatorDesc>> children = new ArrayList<Operator<? extends OperatorDesc>>(curOp.getChildOperators());
  for (Operator<? extends OperatorDesc> child : children) {
    if (!(child instanceof UnionOperator)) {
      continue;
    }
    List<FieldNode> prunList = genColLists(child);
    if (prunList == null || prunList.size() == 0 || parentPrunList.size() == prunList.size()) {
      // Either nothing is known about the union's columns or the column counts already agree.
      continue;
    }
    ArrayList<ExprNodeDesc> exprs = new ArrayList<ExprNodeDesc>();
    ArrayList<String> outputColNames = new ArrayList<String>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    ArrayList<ColumnInfo> outputRS = new ArrayList<ColumnInfo>();
    for (ColumnInfo colInfo : child.getSchema().getSignature()) {
      if (lookupColumn(prunList, colInfo.getInternalName()) == null) {
        // Column was pruned from the union; do not project it.
        continue;
      }
      ExprNodeDesc colDesc = new ExprNodeColumnDesc(colInfo.getType(), colInfo.getInternalName(), colInfo.getTabAlias(), colInfo.getIsVirtualCol());
      exprs.add(colDesc);
      outputColNames.add(colInfo.getInternalName());
      ColumnInfo newCol = new ColumnInfo(colInfo.getInternalName(), colInfo.getType(), colInfo.getTabAlias(), colInfo.getIsVirtualCol(), colInfo.isHiddenVirtualCol());
      newCol.setAlias(colInfo.getAlias());
      outputRS.add(newCol);
      colExprMap.put(colInfo.getInternalName(), colDesc);
    }
    SelectDesc select = new SelectDesc(exprs, outputColNames, false);
    // Rewire curOp -> child into curOp -> sel -> child.
    curOp.removeChild(child);
    SelectOperator sel = (SelectOperator) OperatorFactory.getAndMakeChild(select, new RowSchema(outputRS), curOp);
    OperatorFactory.makeChild(sel, child);
    sel.setColumnExprMap(colExprMap);
  }
}
use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class ConstantPropagateProcFactory method foldOperator.
/**
 * Applies the constants known for an operator to its row schema and its column expression map.
 *
 * <p>For each schema column that has a constant registered in the context, the constant is cast
 * to the column type when the types differ, and the column's object inspector is replaced by the
 * constant's writable object inspector (skipped when the cast yields {@code null}). Afterwards,
 * every entry of the operator's column expression map whose key equals the internal name of a
 * constant column is overwritten with the registered (un-cast) constant expression.
 *
 * @param op operator whose schema columns and column expression map are updated in place
 * @param cppCtx context supplying the column-to-constant map registered for {@code op}
 * @throws SemanticException propagated from the helpers invoked here, if they raise it
 */
private static void foldOperator(Operator<? extends Serializable> op, ConstantPropagateProcCtx cppCtx) throws SemanticException {
  RowSchema rowSchema = op.getSchema();
  Map<ColumnInfo, ExprNodeDesc> constants = cppCtx.getOpToConstantExprs().get(op);

  // Step 1: swap the object inspector of every schema column that is a known constant.
  boolean hasSignature = rowSchema != null && rowSchema.getSignature() != null;
  if (hasSignature) {
    for (ColumnInfo column : rowSchema.getSignature()) {
      ExprNodeDesc replacement = constants.get(column);
      if (replacement == null) {
        continue;
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("Replacing column " + column + " with constant " + replacement + " in " + op);
      }
      boolean sameType = column.getType().equals(replacement.getTypeInfo());
      if (!sameType) {
        replacement = typeCast(replacement, column.getType());
      }
      if (replacement == null) {
        // The cast produced nothing usable; leave the column untouched.
        continue;
      }
      column.setObjectinspector(replacement.getWritableObjectInspector());
    }
  }

  // Step 2: point existing column expression map entries at the constant expressions.
  Map<String, ExprNodeDesc> exprMap = op.getColumnExprMap();
  if (exprMap == null) {
    return;
  }
  for (Entry<ColumnInfo, ExprNodeDesc> binding : constants.entrySet()) {
    String name = binding.getKey().getInternalName();
    if (exprMap.containsKey(name)) {
      exprMap.put(name, binding.getValue());
    }
  }
}
use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class DynamicPartitionPruningOptimization method generateSemiJoinOperatorPlan.
/**
 * Generates plan for min/max (and bloom filter) when dynamic partition pruning is ruled out.
 *
 * <p>The branch is forked off the first parent of the reduce sink held in {@code ctx.generator}
 * and is built in this order: a Select projecting the key column, a HASH-mode GroupBy with
 * PARTIAL1 {@code min}, {@code max} and {@code bloom_filter} aggregations, a ReduceSink, and a
 * FINAL-mode GroupBy. The last GroupBy is then handed to {@code createFinalRsForSemiJoinOp}.
 * When {@code parseContext.getColExprToGBMap()} already holds a GroupBy for the key expression,
 * that GroupBy is passed to {@code createFinalRsForSemiJoinOp} instead and nothing new is built.
 *
 * @param ctx supplies the generating reduce sink, the key index ({@code ctx.desc}) and the
 *          parent expression whose first child is forwarded to {@code createFinalRsForSemiJoinOp}
 * @param parseContext provides the configuration and the key-expression-to-GroupBy map
 * @param ts table scan forwarded to {@code createFinalRsForSemiJoinOp}
 * @param keyBaseAlias alias forwarded to {@code createFinalRsForSemiJoinOp}; also used in a
 *          debug message
 * @param internalColName internal column name looked up in the schema and column expression map
 *          of the reduce sink's parent
 * @param colName column name tested against the partition keys of the source table; asserted
 *          to be non-null
 * @param sjHint optional hint; when non-null with a positive entry count, that count is set as
 *          hint entries on the bloom filter evaluators
 * @return {@code false} when the upward walk ends at a table scan whose table reports
 *         {@code colName} as a partition key, {@code true} otherwise
 * @throws SemanticException declared by this method; note that a SemanticException raised while
 *           creating the aggregations is caught here and rethrown as IllegalStateException
 */
private boolean generateSemiJoinOperatorPlan(DynamicListContext ctx, ParseContext parseContext, TableScanOperator ts, String keyBaseAlias, String internalColName, String colName, SemiJoinHint sjHint) throws SemanticException {
// we will put a fork in the plan at the source of the reduce sink
Operator<? extends OperatorDesc> parentOfRS = ctx.generator.getParentOperators().get(0);
// we need the expr that generated the key of the reduce sink
ExprNodeDesc key = ctx.generator.getConf().getKeyCols().get(ctx.desc.getKeyIndex());
assert colName != null;
// Fetch the TableScan Operator.
// Walk up along the first parent of each operator until a TableScanOperator or a
// ReduceSinkOperator is reached.
Operator<?> op = parentOfRS;
while (!(op == null || op instanceof TableScanOperator || op instanceof ReduceSinkOperator)) {
op = op.getParentOperators().get(0);
}
Preconditions.checkNotNull(op);
if (op instanceof TableScanOperator) {
Table table = ((TableScanOperator) op).getConf().getTableMetadata();
if (table.isPartitionKey(colName)) {
// The column is partition column, skip the optimization.
return false;
}
}
// Check if there already exists a semijoin branch
GroupByOperator gb = parseContext.getColExprToGBMap().get(key);
if (gb != null) {
// Already an existing semijoin branch, reuse it
createFinalRsForSemiJoinOp(parseContext, ts, gb, key, keyBaseAlias, ctx.parent.getChildren().get(0), sjHint != null);
// done!
return true;
}
List<ExprNodeDesc> keyExprs = new ArrayList<ExprNodeDesc>();
keyExprs.add(key);
// group by requires "ArrayList", don't ask.
ArrayList<String> outputNames = new ArrayList<String>();
outputNames.add(HiveConf.getColumnInternalName(0));
// project the relevant key column
SelectDesc select = new SelectDesc(keyExprs, outputNames);
// Create the new RowSchema for the projected column
ColumnInfo columnInfo = parentOfRS.getSchema().getColumnInfo(internalColName);
ArrayList<ColumnInfo> signature = new ArrayList<ColumnInfo>();
signature.add(columnInfo);
RowSchema rowSchema = new RowSchema(signature);
// Create the column expr map
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
ExprNodeDesc exprNode = null;
// Prefer a clone of the parent's expression for the column; fall back to a fresh column
// reference when the parent has no column expression map.
if (parentOfRS.getColumnExprMap() != null) {
exprNode = parentOfRS.getColumnExprMap().get(internalColName).clone();
} else {
exprNode = new ExprNodeColumnDesc(columnInfo);
}
// A plain column reference is re-pointed at internalColName.
if (exprNode instanceof ExprNodeColumnDesc) {
ExprNodeColumnDesc encd = (ExprNodeColumnDesc) exprNode;
encd.setColumn(internalColName);
}
colExprMap.put(internalColName, exprNode);
// Create the Select Operator
SelectOperator selectOp = (SelectOperator) OperatorFactory.getAndMakeChild(select, rowSchema, colExprMap, parentOfRS);
// do a group by to aggregate min,max and bloom filter.
float groupByMemoryUsage = HiveConf.getFloatVar(parseContext.getConf(), HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
float memoryThreshold = HiveConf.getFloatVar(parseContext.getConf(), HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
// Add min/max and bloom filter aggregations
List<ObjectInspector> aggFnOIs = new ArrayList<ObjectInspector>();
aggFnOIs.add(key.getWritableObjectInspector());
// All three aggregations take the single column projected by the select as parameter.
ArrayList<ExprNodeDesc> params = new ArrayList<ExprNodeDesc>();
params.add(new ExprNodeColumnDesc(key.getTypeInfo(), outputNames.get(0), "", false));
ArrayList<AggregationDesc> aggs = new ArrayList<AggregationDesc>();
try {
AggregationDesc min = new AggregationDesc("min", FunctionRegistry.getGenericUDAFEvaluator("min", aggFnOIs, false, false), params, false, Mode.PARTIAL1);
AggregationDesc max = new AggregationDesc("max", FunctionRegistry.getGenericUDAFEvaluator("max", aggFnOIs, false, false), params, false, Mode.PARTIAL1);
AggregationDesc bloomFilter = new AggregationDesc("bloom_filter", FunctionRegistry.getGenericUDAFEvaluator("bloom_filter", aggFnOIs, false, false), params, false, Mode.PARTIAL1);
// Configure the bloom filter evaluator: source operator, optional hinted size and the
// min/max/factor settings read from the configuration.
GenericUDAFBloomFilterEvaluator bloomFilterEval = (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator();
bloomFilterEval.setSourceOperator(selectOp);
if (sjHint != null && sjHint.getNumEntries() > 0) {
LOG.debug("Setting size for " + keyBaseAlias + " to " + sjHint.getNumEntries() + " based on the hint");
bloomFilterEval.setHintEntries(sjHint.getNumEntries());
}
bloomFilterEval.setMaxEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES));
bloomFilterEval.setMinEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MIN_BLOOM_FILTER_ENTRIES));
bloomFilterEval.setFactor(parseContext.getConf().getFloatVar(ConfVars.TEZ_BLOOM_FILTER_FACTOR));
bloomFilter.setGenericUDAFWritableEvaluator(bloomFilterEval);
// Order matters: later code addresses the aggregations by position (0 = min, 1 = max,
// 2 = bloom filter).
aggs.add(min);
aggs.add(max);
aggs.add(bloomFilter);
} catch (SemanticException e) {
// Surfaced to the caller as an unchecked exception, with the original cause attached.
LOG.error("Error creating min/max aggregations on key", e);
throw new IllegalStateException("Error creating min/max aggregations on key", e);
}
// Create the Group by Operator
ArrayList<String> gbOutputNames = new ArrayList<String>();
gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(0));
gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(1));
gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(2));
GroupByDesc groupBy = new GroupByDesc(GroupByDesc.Mode.HASH, gbOutputNames, new ArrayList<ExprNodeDesc>(), aggs, false, groupByMemoryUsage, memoryThreshold, null, false, -1, false);
// NOTE(review): all three schema columns are declared with the key's type here, while the
// reduce sink value for the third one (bloom filter) is declared as binary further down -
// confirm this mismatch is intended.
ArrayList<ColumnInfo> groupbyColInfos = new ArrayList<ColumnInfo>();
groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(0), key.getTypeInfo(), "", false));
groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(1), key.getTypeInfo(), "", false));
groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(2), key.getTypeInfo(), "", false));
GroupByOperator groupByOp = (GroupByOperator) OperatorFactory.getAndMakeChild(groupBy, new RowSchema(groupbyColInfos), selectOp);
groupByOp.setColumnExprMap(new HashMap<String, ExprNodeDesc>());
// Get the column names of the aggregations for reduce sink
int colPos = 0;
ArrayList<ExprNodeDesc> rsValueCols = new ArrayList<ExprNodeDesc>();
// Every aggregation except the last one is emitted with the key's type.
for (int i = 0; i < aggs.size() - 1; i++) {
ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(key.getTypeInfo(), gbOutputNames.get(colPos++), "", false);
rsValueCols.add(colExpr);
}
// Bloom Filter uses binary
ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(TypeInfoFactory.binaryTypeInfo, gbOutputNames.get(colPos++), "", false);
rsValueCols.add(colExpr);
// Create the reduce sink operator
// The reduce sink has no key columns; only the three aggregation results are sent as values.
ReduceSinkDesc rsDesc = PlanUtils.getReduceSinkDesc(new ArrayList<ExprNodeDesc>(), rsValueCols, gbOutputNames, false, -1, 0, 1, Operation.NOT_ACID);
ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(rsDesc, new RowSchema(groupByOp.getSchema()), groupByOp);
Map<String, ExprNodeDesc> columnExprMap = new HashMap<String, ExprNodeDesc>();
rsOp.setColumnExprMap(columnExprMap);
rsOp.getConf().setReducerTraits(EnumSet.of(ReduceSinkDesc.ReducerTraits.QUICKSTART));
// Create the final Group By Operator
ArrayList<AggregationDesc> aggsFinal = new ArrayList<AggregationDesc>();
try {
List<ObjectInspector> minFinalFnOIs = new ArrayList<ObjectInspector>();
List<ObjectInspector> maxFinalFnOIs = new ArrayList<ObjectInspector>();
List<ObjectInspector> bloomFilterFinalFnOIs = new ArrayList<ObjectInspector>();
ArrayList<ExprNodeDesc> minFinalParams = new ArrayList<ExprNodeDesc>();
ArrayList<ExprNodeDesc> maxFinalParams = new ArrayList<ExprNodeDesc>();
ArrayList<ExprNodeDesc> bloomFilterFinalParams = new ArrayList<ExprNodeDesc>();
// Use the expressions from Reduce Sink.
minFinalFnOIs.add(rsValueCols.get(0).getWritableObjectInspector());
maxFinalFnOIs.add(rsValueCols.get(1).getWritableObjectInspector());
bloomFilterFinalFnOIs.add(rsValueCols.get(2).getWritableObjectInspector());
// Coming from a ReduceSink the aggregations would be in the form VALUE._col0, VALUE._col1
minFinalParams.add(new ExprNodeColumnDesc(rsValueCols.get(0).getTypeInfo(), Utilities.ReduceField.VALUE + "." + gbOutputNames.get(0), "", false));
maxFinalParams.add(new ExprNodeColumnDesc(rsValueCols.get(1).getTypeInfo(), Utilities.ReduceField.VALUE + "." + gbOutputNames.get(1), "", false));
bloomFilterFinalParams.add(new ExprNodeColumnDesc(rsValueCols.get(2).getTypeInfo(), Utilities.ReduceField.VALUE + "." + gbOutputNames.get(2), "", false));
AggregationDesc min = new AggregationDesc("min", FunctionRegistry.getGenericUDAFEvaluator("min", minFinalFnOIs, false, false), minFinalParams, false, Mode.FINAL);
AggregationDesc max = new AggregationDesc("max", FunctionRegistry.getGenericUDAFEvaluator("max", maxFinalFnOIs, false, false), maxFinalParams, false, Mode.FINAL);
AggregationDesc bloomFilter = new AggregationDesc("bloom_filter", FunctionRegistry.getGenericUDAFEvaluator("bloom_filter", bloomFilterFinalFnOIs, false, false), bloomFilterFinalParams, false, Mode.FINAL);
// Same bloom filter evaluator settings as for the partial aggregation above (without the
// debug message).
GenericUDAFBloomFilterEvaluator bloomFilterEval = (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator();
bloomFilterEval.setSourceOperator(selectOp);
if (sjHint != null && sjHint.getNumEntries() > 0) {
bloomFilterEval.setHintEntries(sjHint.getNumEntries());
}
bloomFilterEval.setMaxEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES));
bloomFilterEval.setMinEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MIN_BLOOM_FILTER_ENTRIES));
bloomFilterEval.setFactor(parseContext.getConf().getFloatVar(ConfVars.TEZ_BLOOM_FILTER_FACTOR));
bloomFilter.setGenericUDAFWritableEvaluator(bloomFilterEval);
aggsFinal.add(min);
aggsFinal.add(max);
aggsFinal.add(bloomFilter);
} catch (SemanticException e) {
// Surfaced to the caller as an unchecked exception, with the original cause attached.
LOG.error("Error creating min/max aggregations on key", e);
throw new IllegalStateException("Error creating min/max aggregations on key", e);
}
GroupByDesc groupByDescFinal = new GroupByDesc(GroupByDesc.Mode.FINAL, gbOutputNames, new ArrayList<ExprNodeDesc>(), aggsFinal, false, groupByMemoryUsage, memoryThreshold, null, false, 0, false);
GroupByOperator groupByOpFinal = (GroupByOperator) OperatorFactory.getAndMakeChild(groupByDescFinal, new RowSchema(rsOp.getSchema()), rsOp);
groupByOpFinal.setColumnExprMap(new HashMap<String, ExprNodeDesc>());
// Hand the final group-by over to build the remainder of the semijoin branch.
createFinalRsForSemiJoinOp(parseContext, ts, groupByOpFinal, key, keyBaseAlias, ctx.parent.getChildren().get(0), sjHint != null);
return true;
}
use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class ProjectionPusher method pushProjectionsAndFilters.
/**
 * Pushes the column projection and the filter expression of the table scans that read a split
 * into the job configuration.
 *
 * <p>The aliases registered for {@code splitPath} are resolved from {@code mapWork}; for every
 * alias whose work is a {@link TableScanOperator} the needed column ids, the needed nested column
 * paths and the filter expression are collected. The filter that is pushed is the OR of the
 * filters of all those table scans. If at least one of them has no filter expression, no filter
 * is pushed at all, because that scan needs every row.
 *
 * <p>Does nothing when {@code mapWork} or its path-to-aliases map is {@code null}.
 *
 * @param jobConf configuration that receives the projection and filter settings
 * @param splitPath path of the split, used to look up the aliases
 * @param splitPathWithNoSchema not used by this method
 * @throws RuntimeException wrapping the IllegalArgumentException or IOException raised while
 *           resolving the aliases
 */
private void pushProjectionsAndFilters(final JobConf jobConf, final String splitPath, final String splitPathWithNoSchema) {
  if (mapWork == null || mapWork.getPathToAliases() == null) {
    return;
  }
  final Set<String> aliases = new HashSet<String>();
  try {
    ArrayList<String> a = HiveFileFormatUtils.getFromPathRecursively(mapWork.getPathToAliases(), new Path(splitPath), null, false, true);
    if (a != null) {
      aliases.addAll(a);
    }
    if (a == null || a.isEmpty()) {
      // TODO: not having aliases for path usually means some bug. Should it give up?
      LOG.warn("Couldn't find aliases for " + splitPath);
    }
  } catch (IllegalArgumentException | IOException e) {
    throw new RuntimeException(e);
  }
  // Collect the needed columns from all the aliases and create ORed filter
  // expression for the table.
  boolean allColumnsNeeded = false;
  boolean noFilters = false;
  Set<Integer> neededColumnIDs = new HashSet<Integer>();
  // To support nested column pruning, we need to track the path from the top to the nested
  // fields
  Set<String> neededNestedColumnPaths = new HashSet<String>();
  List<ExprNodeGenericFuncDesc> filterExprs = new ArrayList<ExprNodeGenericFuncDesc>();
  RowSchema rowSchema = null;
  for (String alias : aliases) {
    final Operator<? extends Serializable> op = mapWork.getAliasToWork().get(alias);
    // instanceof is false for null, so no separate null check is required.
    if (op instanceof TableScanOperator) {
      final TableScanOperator ts = (TableScanOperator) op;
      if (ts.getNeededColumnIDs() == null) {
        allColumnsNeeded = true;
      } else {
        neededColumnIDs.addAll(ts.getNeededColumnIDs());
        if (ts.getNeededNestedColumnPaths() != null) {
          neededNestedColumnPaths.addAll(ts.getNeededNestedColumnPaths());
        }
      }
      rowSchema = ts.getSchema();
      ExprNodeGenericFuncDesc filterExpr = ts.getConf() == null ? null : ts.getConf().getFilterExpr();
      if (filterExpr == null) {
        // No filter if any TS has no filter expression. The flag must stay set once raised:
        // overwriting it per iteration would let a later, filtered table scan re-enable the
        // push down and drop rows that this unfiltered scan still needs.
        noFilters = true;
      } else {
        filterExprs.add(filterExpr);
      }
    }
  }
  ExprNodeGenericFuncDesc tableFilterExpr = null;
  if (!noFilters) {
    try {
      for (ExprNodeGenericFuncDesc filterExpr : filterExprs) {
        if (tableFilterExpr == null) {
          tableFilterExpr = filterExpr;
        } else {
          tableFilterExpr = ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPOr(), Arrays.<ExprNodeDesc>asList(tableFilterExpr, filterExpr));
        }
      }
    } catch (UDFArgumentException ex) {
      // Best effort: without a valid OR expression simply push no filter.
      LOG.debug("Turn off filtering due to " + ex);
      tableFilterExpr = null;
    }
  }
  // push down projections
  if (!allColumnsNeeded) {
    if (!neededColumnIDs.isEmpty()) {
      ColumnProjectionUtils.appendReadColumns(jobConf, new ArrayList<Integer>(neededColumnIDs));
      ColumnProjectionUtils.appendNestedColumnPaths(jobConf, new ArrayList<String>(neededNestedColumnPaths));
    }
  } else {
    ColumnProjectionUtils.setReadAllColumns(jobConf);
  }
  pushFilters(jobConf, rowSchema, tableFilterExpr);
}
Aggregations