Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
In class DynamicPartitionPruningOptimization, method generateSemiJoinOperatorPlan:
// Generates plan for min/max when dynamic partition pruning is ruled out.
private boolean generateSemiJoinOperatorPlan(DynamicListContext ctx, ParseContext parseContext,
    TableScanOperator ts, String keyBaseAlias, String internalColName, String colName,
    SemiJoinHint sjHint) throws SemanticException {
  // we will put a fork in the plan at the source of the reduce sink
  Operator<? extends OperatorDesc> parentOfRS = ctx.generator.getParentOperators().get(0);
  // we need the expr that generated the key of the reduce sink
  ExprNodeDesc key = ctx.getKeyCol();
  assert colName != null;
  // Fetch the TableScan Operator.
  Operator<?> op = parentOfRS;
  while (!(op == null || op instanceof TableScanOperator || op instanceof ReduceSinkOperator)) {
    op = op.getParentOperators().get(0);
  }
  Preconditions.checkNotNull(op);
  if (op instanceof TableScanOperator) {
    Table table = ((TableScanOperator) op).getConf().getTableMetadata();
    if (table.isPartitionKey(colName)) {
      // The column is a partition column, skip the optimization.
      return false;
    }
  }
  // Check if there already exists a semijoin branch
  GroupByOperator gb = parseContext.getColExprToGBMap().get(key);
  if (gb != null) {
    // Already an existing semijoin branch, reuse it
    createFinalRsForSemiJoinOp(parseContext, ts, gb, key, keyBaseAlias,
        ctx.parent.getChildren().get(0), sjHint != null);
    // done!
    return true;
  }
  List<ExprNodeDesc> keyExprs = new ArrayList<ExprNodeDesc>();
  keyExprs.add(key);
  // group by requires "ArrayList", don't ask.
  ArrayList<String> outputNames = new ArrayList<String>();
  // project the relevant key column
  SelectDesc select = new SelectDesc(keyExprs, outputNames);
  // Create the new RowSchema for the projected column
  ColumnInfo columnInfo = parentOfRS.getSchema().getColumnInfo(internalColName);
  if (columnInfo == null) {
    LOG.debug("No ColumnInfo found in {} for {}", parentOfRS.getOperatorId(), internalColName);
    return false;
  }
  columnInfo = new ColumnInfo(columnInfo);
  outputNames.add(internalColName);
  ArrayList<ColumnInfo> signature = new ArrayList<ColumnInfo>();
  signature.add(columnInfo);
  RowSchema rowSchema = new RowSchema(signature);
  // Create the column expr map
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  ExprNodeDesc exprNode = new ExprNodeColumnDesc(columnInfo);
  colExprMap.put(internalColName, exprNode);
  // Create the Select Operator
  SelectOperator selectOp = (SelectOperator) OperatorFactory.getAndMakeChild(select, rowSchema,
      colExprMap, parentOfRS);
  // do a group by to aggregate min,max and bloom filter.
  float groupByMemoryUsage =
      HiveConf.getFloatVar(parseContext.getConf(), HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
  float memoryThreshold =
      HiveConf.getFloatVar(parseContext.getConf(), HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
  float minReductionHashAggr =
      HiveConf.getFloatVar(parseContext.getConf(), ConfVars.HIVEMAPAGGRHASHMINREDUCTION);
  float minReductionHashAggrLowerBound =
      HiveConf.getFloatVar(parseContext.getConf(), ConfVars.HIVEMAPAGGRHASHMINREDUCTIONLOWERBOUND);
  // Add min/max and bloom filter aggregations
  List<ObjectInspector> aggFnOIs = new ArrayList<ObjectInspector>();
  aggFnOIs.add(key.getWritableObjectInspector());
  ArrayList<ExprNodeDesc> params = new ArrayList<ExprNodeDesc>();
  params.add(new ExprNodeColumnDesc(key.getTypeInfo(), outputNames.get(0), "", false));
  ArrayList<AggregationDesc> aggs = new ArrayList<AggregationDesc>();
  try {
    AggregationDesc min = new AggregationDesc("min",
        FunctionRegistry.getGenericUDAFEvaluator("min", aggFnOIs, false, false),
        params, false, Mode.PARTIAL1);
    AggregationDesc max = new AggregationDesc("max",
        FunctionRegistry.getGenericUDAFEvaluator("max", aggFnOIs, false, false),
        params, false, Mode.PARTIAL1);
    AggregationDesc bloomFilter = new AggregationDesc("bloom_filter",
        FunctionRegistry.getGenericUDAFEvaluator("bloom_filter", aggFnOIs, false, false),
        params, false, Mode.PARTIAL1);
    GenericUDAFBloomFilterEvaluator bloomFilterEval =
        (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator();
    bloomFilterEval.setSourceOperator(selectOp);
    if (sjHint != null && sjHint.getNumEntries() > 0) {
      LOG.debug("Setting size for " + keyBaseAlias + " to " + sjHint.getNumEntries()
          + " based on the hint");
      bloomFilterEval.setHintEntries(sjHint.getNumEntries());
    }
    bloomFilterEval.setMaxEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES));
    bloomFilterEval.setMinEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MIN_BLOOM_FILTER_ENTRIES));
    bloomFilterEval.setFactor(parseContext.getConf().getFloatVar(ConfVars.TEZ_BLOOM_FILTER_FACTOR));
    bloomFilter.setGenericUDAFWritableEvaluator(bloomFilterEval);
    aggs.add(min);
    aggs.add(max);
    aggs.add(bloomFilter);
  } catch (SemanticException e) {
    LOG.error("Error creating min/max aggregations on key", e);
    throw new IllegalStateException("Error creating min/max aggregations on key", e);
  }
  // Create the Group by Operator
  ArrayList<String> gbOutputNames = new ArrayList<String>();
  gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(0));
  gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(1));
  gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(2));
  GroupByDesc groupBy = new GroupByDesc(GroupByDesc.Mode.HASH, gbOutputNames,
      new ArrayList<ExprNodeDesc>(), aggs, false, groupByMemoryUsage, memoryThreshold,
      minReductionHashAggr, minReductionHashAggrLowerBound, null, false, -1, false);
  ArrayList<ColumnInfo> groupbyColInfos = new ArrayList<ColumnInfo>();
  groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(0), key.getTypeInfo(), "", false));
  groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(1), key.getTypeInfo(), "", false));
  groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(2), key.getTypeInfo(), "", false));
  GroupByOperator groupByOp = (GroupByOperator) OperatorFactory.getAndMakeChild(groupBy,
      new RowSchema(groupbyColInfos), selectOp);
  groupByOp.setColumnExprMap(new HashMap<String, ExprNodeDesc>());
  // Get the column names of the aggregations for reduce sink
  int colPos = 0;
  ArrayList<ExprNodeDesc> rsValueCols = new ArrayList<ExprNodeDesc>();
  Map<String, ExprNodeDesc> columnExprMap = new HashMap<String, ExprNodeDesc>();
  for (int i = 0; i < aggs.size() - 1; i++) {
    ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(key.getTypeInfo(),
        gbOutputNames.get(colPos), "", false);
    rsValueCols.add(colExpr);
    columnExprMap.put(gbOutputNames.get(colPos), colExpr);
    colPos++;
  }
  // Bloom Filter uses binary
  ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(TypeInfoFactory.binaryTypeInfo,
      gbOutputNames.get(colPos), "", false);
  rsValueCols.add(colExpr);
  columnExprMap.put(gbOutputNames.get(colPos), colExpr);
  colPos++;
  // Create the reduce sink operator
  ReduceSinkDesc rsDesc = PlanUtils.getReduceSinkDesc(new ArrayList<ExprNodeDesc>(), rsValueCols,
      gbOutputNames, false, -1, 0, 1, Operation.NOT_ACID,
      NullOrdering.defaultNullOrder(parseContext.getConf()));
  ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(rsDesc,
      new RowSchema(groupByOp.getSchema()), groupByOp);
  rsOp.setColumnExprMap(columnExprMap);
  rsOp.getConf().setReducerTraits(EnumSet.of(ReduceSinkDesc.ReducerTraits.QUICKSTART));
  // Create the final Group By Operator
  ArrayList<AggregationDesc> aggsFinal = new ArrayList<AggregationDesc>();
  try {
    List<ObjectInspector> minFinalFnOIs = new ArrayList<ObjectInspector>();
    List<ObjectInspector> maxFinalFnOIs = new ArrayList<ObjectInspector>();
    List<ObjectInspector> bloomFilterFinalFnOIs = new ArrayList<ObjectInspector>();
    ArrayList<ExprNodeDesc> minFinalParams = new ArrayList<ExprNodeDesc>();
    ArrayList<ExprNodeDesc> maxFinalParams = new ArrayList<ExprNodeDesc>();
    ArrayList<ExprNodeDesc> bloomFilterFinalParams = new ArrayList<ExprNodeDesc>();
    // Use the expressions from Reduce Sink.
    minFinalFnOIs.add(rsValueCols.get(0).getWritableObjectInspector());
    maxFinalFnOIs.add(rsValueCols.get(1).getWritableObjectInspector());
    bloomFilterFinalFnOIs.add(rsValueCols.get(2).getWritableObjectInspector());
    // Coming from a ReduceSink the aggregations would be in the form VALUE._col0, VALUE._col1
    minFinalParams.add(new ExprNodeColumnDesc(rsValueCols.get(0).getTypeInfo(),
        Utilities.ReduceField.VALUE + "." + gbOutputNames.get(0), "", false));
    maxFinalParams.add(new ExprNodeColumnDesc(rsValueCols.get(1).getTypeInfo(),
        Utilities.ReduceField.VALUE + "." + gbOutputNames.get(1), "", false));
    bloomFilterFinalParams.add(new ExprNodeColumnDesc(rsValueCols.get(2).getTypeInfo(),
        Utilities.ReduceField.VALUE + "." + gbOutputNames.get(2), "", false));
    AggregationDesc min = new AggregationDesc("min",
        FunctionRegistry.getGenericUDAFEvaluator("min", minFinalFnOIs, false, false),
        minFinalParams, false, Mode.FINAL);
    AggregationDesc max = new AggregationDesc("max",
        FunctionRegistry.getGenericUDAFEvaluator("max", maxFinalFnOIs, false, false),
        maxFinalParams, false, Mode.FINAL);
    AggregationDesc bloomFilter = new AggregationDesc("bloom_filter",
        FunctionRegistry.getGenericUDAFEvaluator("bloom_filter", bloomFilterFinalFnOIs, false, false),
        bloomFilterFinalParams, false, Mode.FINAL);
    GenericUDAFBloomFilterEvaluator bloomFilterEval =
        (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator();
    bloomFilterEval.setSourceOperator(selectOp);
    if (sjHint != null && sjHint.getNumEntries() > 0) {
      bloomFilterEval.setHintEntries(sjHint.getNumEntries());
    }
    bloomFilterEval.setMaxEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES));
    bloomFilterEval.setMinEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MIN_BLOOM_FILTER_ENTRIES));
    bloomFilterEval.setFactor(parseContext.getConf().getFloatVar(ConfVars.TEZ_BLOOM_FILTER_FACTOR));
    bloomFilter.setGenericUDAFWritableEvaluator(bloomFilterEval);
    aggsFinal.add(min);
    aggsFinal.add(max);
    aggsFinal.add(bloomFilter);
  } catch (SemanticException e) {
    LOG.error("Error creating min/max aggregations on key", e);
    throw new IllegalStateException("Error creating min/max aggregations on key", e);
  }
  GroupByDesc groupByDescFinal = new GroupByDesc(GroupByDesc.Mode.FINAL, gbOutputNames,
      new ArrayList<ExprNodeDesc>(), aggsFinal, false, groupByMemoryUsage, memoryThreshold,
      minReductionHashAggr, minReductionHashAggrLowerBound, null, false, 0, false);
  GroupByOperator groupByOpFinal = (GroupByOperator) OperatorFactory.getAndMakeChild(
      groupByDescFinal, new RowSchema(rsOp.getSchema()), rsOp);
  groupByOpFinal.setColumnExprMap(new HashMap<String, ExprNodeDesc>());
  createFinalRsForSemiJoinOp(parseContext, ts, groupByOpFinal, key, keyBaseAlias,
      ctx.parent.getChildren().get(0), sjHint != null);
  return true;
}
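The piece of this method that actually touches ReduceSinkDesc is the block near the end: a reduce sink with no key columns, a single reducer, and the QUICKSTART reducer trait carries the min/max/bloom_filter partials to the final group-by. The fragment below is a minimal sketch of just that construction. The class and method names are made up for illustration, the import paths are the usual Hive package locations, and the inline notes on the positional arguments to PlanUtils.getReduceSinkDesc are my reading of the call above rather than documented parameter names.

import java.util.ArrayList;
import java.util.EnumSet;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.AcidUtils.Operation;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
import org.apache.hadoop.hive.ql.util.NullOrdering;

// Illustrative helper, not part of Hive: builds the semijoin branch's sink the same
// way the method above does.
class SemiJoinSinkSketch {
  static ReduceSinkDesc buildSemiJoinSink(ArrayList<ExprNodeDesc> rsValueCols,
      ArrayList<String> gbOutputNames, HiveConf conf) {
    ReduceSinkDesc rsDesc = PlanUtils.getReduceSinkDesc(
        new ArrayList<ExprNodeDesc>(), // no key columns: every row lands in one group
        rsValueCols,                   // min, max and bloom_filter partial results
        gbOutputNames, false, -1, 0,
        1,                             // a single reducer merges the partials
        Operation.NOT_ACID,
        NullOrdering.defaultNullOrder(conf));
    // Same trait the optimizer sets above, so the downstream reducer can start early.
    rsDesc.setReducerTraits(EnumSet.of(ReduceSinkDesc.ReducerTraits.QUICKSTART));
    return rsDesc;
  }
}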
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
In class GenMapRedUtils, method initPlan:
/**
 * Initialize the current plan by adding it to root tasks.
 *
 * @param op
 *          the reduce sink operator encountered
 * @param opProcCtx
 *          processing context
 */
public static void initPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx)
    throws SemanticException {
  Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
  GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0));
  Task<?> currTask = mapredCtx.getCurrTask();
  MapredWork plan = (MapredWork) currTask.getWork();
  HashMap<Operator<? extends OperatorDesc>, Task<?>> opTaskMap = opProcCtx.getOpTaskMap();
  TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
  opTaskMap.put(reducer, currTask);
  plan.setReduceWork(new ReduceWork());
  plan.getReduceWork().setReducer(reducer);
  ReduceSinkDesc desc = op.getConf();
  plan.getReduceWork().setNumReduceTasks(desc.getNumReducers());
  if (needsTagging(plan.getReduceWork())) {
    plan.getReduceWork().setNeedsTagging(true);
  }
  assert currTopOp != null;
  String currAliasId = opProcCtx.getCurrAliasId();
  if (!opProcCtx.isSeenOp(currTask, currTopOp)) {
    setTaskPlan(currAliasId, currTopOp, currTask, false, opProcCtx);
  }
  currTopOp = null;
  currAliasId = null;
  opProcCtx.setCurrTask(currTask);
  opProcCtx.setCurrTopOp(currTopOp);
  opProcCtx.setCurrAliasId(currAliasId);
}
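initPlan, together with initUnionPlan and splitPlan below, repeats the same wiring pattern: take the operator under the ReduceSinkOperator as the reducer of a freshly created ReduceWork, and copy the reducer count from the ReduceSinkDesc onto it. A minimal sketch of that shared pattern, using only calls that appear in these snippets; the class and method names are illustrative, not Hive's, and the import paths are the usual Hive packages.

import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.ReduceWork;

// Illustrative helper, not part of Hive.
class ReduceWorkWiringSketch {
  static void attachReducer(ReduceSinkOperator rs, MapredWork plan) {
    // The first child of the reduce sink is the operator tree that runs on the reduce side.
    Operator<? extends OperatorDesc> reducer = rs.getChildOperators().get(0);
    ReduceWork reduceWork = new ReduceWork();
    plan.setReduceWork(reduceWork);
    reduceWork.setReducer(reducer);
    // The sink's descriptor carries the reducer parallelism chosen during planning.
    ReduceSinkDesc desc = rs.getConf();
    reduceWork.setNumReduceTasks(desc.getNumReducers());
  }
}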
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
In class GenMapRedUtils, method initUnionPlan:
/**
 * Initialize the current union plan.
 *
 * @param op
 *          the reduce sink operator encountered
 * @param opProcCtx
 *          processing context
 */
public static void initUnionPlan(ReduceSinkOperator op, UnionOperator currUnionOp,
    GenMRProcContext opProcCtx, Task<?> unionTask) throws SemanticException {
  Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
  MapredWork plan = (MapredWork) unionTask.getWork();
  HashMap<Operator<? extends OperatorDesc>, Task<?>> opTaskMap = opProcCtx.getOpTaskMap();
  opTaskMap.put(reducer, unionTask);
  plan.setReduceWork(new ReduceWork());
  plan.getReduceWork().setReducer(reducer);
  ReduceSinkDesc desc = op.getConf();
  plan.getReduceWork().setNumReduceTasks(desc.getNumReducers());
  if (needsTagging(plan.getReduceWork())) {
    plan.getReduceWork().setNeedsTagging(true);
  }
  initUnionPlan(opProcCtx, currUnionOp, unionTask, false);
}
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
In class GenMapRedUtils, method splitPlan:
/**
 * Handles the case where a ReduceSinkOperator (cRS) is met in a
 * pOP(parentTask with RS)-cRS-cOP(noTask) chain: create a new child task for
 * cRS-cOP and link the two tasks by a temporary file: pOP-FS / TS-cRS-cOP.
 *
 * @param cRS
 *          the reduce sink operator encountered
 * @param opProcCtx
 *          processing context
 */
static void splitPlan(ReduceSinkOperator cRS, GenMRProcContext opProcCtx)
    throws SemanticException {
  // Generate a new task
  ParseContext parseCtx = opProcCtx.getParseCtx();
  Task<?> parentTask = opProcCtx.getCurrTask();
  MapredWork childPlan = getMapRedWork(parseCtx);
  Task<?> childTask = TaskFactory.get(childPlan);
  Operator<? extends OperatorDesc> reducer = cRS.getChildOperators().get(0);
  // Add the reducer
  ReduceWork rWork = new ReduceWork();
  childPlan.setReduceWork(rWork);
  rWork.setReducer(reducer);
  ReduceSinkDesc desc = cRS.getConf();
  childPlan.getReduceWork().setNumReduceTasks(Integer.valueOf(desc.getNumReducers()));
  opProcCtx.getOpTaskMap().put(reducer, childTask);
  splitTasks(cRS, parentTask, childTask, opProcCtx);
}
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
In class ColumnPrunerProcFactory, method getPruneReduceSinkOpRetainFlags:
private static boolean[] getPruneReduceSinkOpRetainFlags(
    List<String> retainedParentOpOutputCols, ReduceSinkOperator reduce) {
  ReduceSinkDesc reduceConf = reduce.getConf();
  List<ExprNodeDesc> originalValueEval = reduceConf.getValueCols();
  boolean[] flags = new boolean[originalValueEval.size()];
  for (int i = 0; i < originalValueEval.size(); i++) {
    flags[i] = false;
    List<String> current = originalValueEval.get(i).getCols();
    if (current == null || current.size() == 0) {
      flags[i] = true;
    } else {
      for (int j = 0; j < current.size(); j++) {
        if (retainedParentOpOutputCols.contains(current.get(j))) {
          flags[i] = true;
          break;
        }
      }
    }
  }
  return flags;
}
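The retain-flag computation does not depend on Hive's operator machinery, so it is easiest to read with the Hive types stripped away. The sketch below restates it over plain lists; the helper name and the List<List<String>> stand-in for ExprNodeDesc.getCols() are illustrative only. The rule it encodes: a value expression is kept if it references no columns at all (for example a constant), or if at least one column it references is still produced by the parent operator.

import java.util.List;

// Illustrative restatement of getPruneReduceSinkOpRetainFlags, not Hive code.
class RetainFlagsSketch {
  static boolean[] retainFlags(List<String> retainedParentOpOutputCols,
      List<List<String>> valueExprColumns) {
    boolean[] flags = new boolean[valueExprColumns.size()];
    for (int i = 0; i < valueExprColumns.size(); i++) {
      List<String> cols = valueExprColumns.get(i);
      if (cols == null || cols.isEmpty()) {
        // Expressions that reference no columns are always kept.
        flags[i] = true;
      } else {
        for (String col : cols) {
          if (retainedParentOpOutputCols.contains(col)) {
            flags[i] = true; // keep the value expression if any referenced column survives
            break;
          }
        }
      }
    }
    return flags;
  }
}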