Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
The class SemanticAnalyzer, method genGroupByPlan1ReduceMultiGBY.
@SuppressWarnings({ "nls" })
private Operator genGroupByPlan1ReduceMultiGBY(List<String> dests, QB qb, Operator input, Map<String, Operator> aliasToOpInfo) throws SemanticException {
  QBParseInfo parseInfo = qb.getParseInfo();
  ExprNodeDesc previous = null;
  Operator selectInput = input;
  // In order to facilitate partition pruning, OR the where clauses together and put them at the
  // top of the operator tree; this can also reduce the amount of data going to the reducer
  List<ExprNodeDesc.ExprNodeDescEqualityWrapper> whereExpressions = new ArrayList<ExprNodeDesc.ExprNodeDescEqualityWrapper>();
  for (String dest : dests) {
    ObjectPair<List<ASTNode>, List<Long>> grpByExprsGroupingSets = getGroupByGroupingSetsForClause(parseInfo, dest);
    List<Long> groupingSets = grpByExprsGroupingSets.getSecond();
    if (!groupingSets.isEmpty()) {
      throw new SemanticException(ErrorMsg.HIVE_GROUPING_SETS_AGGR_NOMAPAGGR_MULTIGBY.getMsg());
    }
    ASTNode whereExpr = parseInfo.getWhrForClause(dest);
    if (whereExpr != null) {
      OpParseContext inputCtx = opParseCtx.get(input);
      RowResolver inputRR = inputCtx.getRowResolver();
      ExprNodeDesc current = genExprNodeDesc((ASTNode) whereExpr.getChild(0), inputRR);
      // Check the list of where expressions already added so they aren't duplicated
      ExprNodeDesc.ExprNodeDescEqualityWrapper currentWrapped = new ExprNodeDesc.ExprNodeDescEqualityWrapper(current);
      if (!whereExpressions.contains(currentWrapped)) {
        whereExpressions.add(currentWrapped);
      } else {
        continue;
      }
      if (previous == null) {
        // If this is the first expression
        previous = current;
        continue;
      }
      GenericUDFOPOr or = new GenericUDFOPOr();
      List<ExprNodeDesc> expressions = new ArrayList<ExprNodeDesc>(2);
      expressions.add(current);
      expressions.add(previous);
      ExprNodeDesc orExpr = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, or, expressions);
      previous = orExpr;
    } else {
      // If an expression does not have a where clause, there can be no common filter
      previous = null;
      break;
    }
  }
  if (previous != null) {
    OpParseContext inputCtx = opParseCtx.get(input);
    RowResolver inputRR = inputCtx.getRowResolver();
    FilterDesc orFilterDesc = new FilterDesc(previous, false);
    orFilterDesc.setGenerated(true);
    selectInput = putOpInsertMap(OperatorFactory.getAndMakeChild(orFilterDesc, new RowSchema(inputRR.getColumnInfos()), input), inputRR);
  }
  // Insert a select operator here, used by the ColumnPruner to reduce
  // the data to shuffle
  Operator select = genSelectAllDesc(selectInput);
  // Generate ReduceSinkOperator
  ReduceSinkOperator reduceSinkOperatorInfo = genCommonGroupByPlanReduceSinkOperator(qb, dests, select);
  // It is assumed throughout the code that a reducer has a single child; add a
  // ForwardOperator so that we can add multiple filter/group by operators as children
  RowResolver reduceSinkOperatorInfoRR = opParseCtx.get(reduceSinkOperatorInfo).getRowResolver();
  Operator forwardOp = putOpInsertMap(OperatorFactory.getAndMakeChild(new ForwardDesc(), new RowSchema(reduceSinkOperatorInfoRR.getColumnInfos()), reduceSinkOperatorInfo), reduceSinkOperatorInfoRR);
  Operator curr = forwardOp;
  for (String dest : dests) {
    curr = forwardOp;
    if (parseInfo.getWhrForClause(dest) != null) {
      ASTNode whereExpr = qb.getParseInfo().getWhrForClause(dest);
      curr = genFilterPlan((ASTNode) whereExpr.getChild(0), qb, forwardOp, aliasToOpInfo, false, true);
    }
    // Generate GroupbyOperator
    Operator groupByOperatorInfo = genGroupByPlanGroupByOperator(parseInfo, dest, curr, reduceSinkOperatorInfo, GroupByDesc.Mode.COMPLETE, null);
    // TODO: should we pass curr instead of null?
    curr = genPostGroupByBodyPlan(groupByOperatorInfo, dest, qb, aliasToOpInfo, null);
  }
  return curr;
}
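The method above ORs the per-destination WHERE predicates into a single common filter, after skipping predicates that are structurally equal, so that one generated Filter can sit in front of the shared ReduceSink. As an illustration of that folding pattern only, here is a minimal, self-contained sketch in plain Java; Predicate, Wrapped, and commonFilter are hypothetical stand-ins for ExprNodeDesc, ExprNodeDescEqualityWrapper, and the loop above, not Hive APIs.

import java.util.*;

public class CommonFilterSketch {
  interface Predicate { boolean eval(Map<String, Object> row); }

  // Stand-in for ExprNodeDescEqualityWrapper: value-based equality so duplicates can be detected.
  static final class Wrapped {
    final String text;
    final Predicate p;
    Wrapped(String text, Predicate p) { this.text = text; this.p = p; }
    @Override public boolean equals(Object o) { return o instanceof Wrapped && ((Wrapped) o).text.equals(text); }
    @Override public int hashCode() { return text.hashCode(); }
  }

  // Returns the OR of all distinct predicates, or null if any destination has no WHERE clause.
  static Predicate commonFilter(List<Wrapped> perDestWhere) {
    Predicate previous = null;
    Set<Wrapped> seen = new HashSet<>();
    for (Wrapped w : perDestWhere) {
      if (w == null) {
        return null;                       // one destination has no WHERE clause -> no common filter
      }
      if (!seen.add(w)) {
        continue;                          // structurally equal predicate already added, skip it
      }
      if (previous == null) {
        previous = w.p;                    // first expression
        continue;
      }
      Predicate left = previous;
      Predicate right = w.p;
      previous = row -> left.eval(row) || right.eval(row);   // fold with OR
    }
    return previous;
  }

  public static void main(String[] args) {
    Wrapped a = new Wrapped("key > 10", row -> ((Integer) row.get("key")) > 10);
    Wrapped b = new Wrapped("key > 10", row -> ((Integer) row.get("key")) > 10);   // duplicate, deduplicated
    Wrapped c = new Wrapped("ds = '2024'", row -> "2024".equals(row.get("ds")));
    Predicate common = commonFilter(Arrays.asList(a, b, c));
    Map<String, Object> row = new HashMap<>();
    row.put("key", 3);
    row.put("ds", "2024");
    System.out.println(common.eval(row));   // true, via the second branch of the OR
  }
}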
Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
The class SemanticAnalyzer, method genJoinOperatorChildren.
private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, Operator[] right, HashSet<Integer> omitOpts, ExprNodeDesc[][] joinKeys) throws SemanticException {
  RowResolver outputRR = new RowResolver();
  ArrayList<String> outputColumnNames = new ArrayList<String>();
  // all children are base classes
  Operator<?>[] rightOps = new Operator[right.length];
  int outputPos = 0;
  Map<String, Byte> reversedExprs = new HashMap<String, Byte>();
  HashMap<Byte, List<ExprNodeDesc>> exprMap = new HashMap<Byte, List<ExprNodeDesc>>();
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  HashMap<Integer, Set<String>> posToAliasMap = new HashMap<Integer, Set<String>>();
  HashMap<Byte, List<ExprNodeDesc>> filterMap = new HashMap<Byte, List<ExprNodeDesc>>();
  // Only used for semijoin with residual predicates
  List<ColumnInfo> topSelectInputColumns = new ArrayList<>();
  for (int pos = 0; pos < right.length; ++pos) {
    Operator<?> input = right[pos] == null ? left : right[pos];
    if (input == null) {
      input = left;
    }
    ReduceSinkOperator rs = (ReduceSinkOperator) input;
    if (rs.getNumParent() != 1) {
      throw new SemanticException("RS should have single parent");
    }
    Operator<?> parent = rs.getParentOperators().get(0);
    ReduceSinkDesc rsDesc = (ReduceSinkDesc) (input.getConf());
    int[] index = rs.getValueIndex();
    ArrayList<ExprNodeDesc> valueDesc = new ArrayList<ExprNodeDesc>();
    ArrayList<ExprNodeDesc> filterDesc = new ArrayList<ExprNodeDesc>();
    Byte tag = (byte) rsDesc.getTag();
    // we will add a Select on top of the join
    if (omitOpts != null && omitOpts.contains(pos) && join.getPostJoinFilters().size() == 0) {
      exprMap.put(tag, valueDesc);
      filterMap.put(tag, filterDesc);
      rightOps[pos] = input;
      continue;
    }
    List<String> keyColNames = rsDesc.getOutputKeyColumnNames();
    List<String> valColNames = rsDesc.getOutputValueColumnNames();
    // prepare output descriptors for the input operator
    RowResolver inputRR = opParseCtx.get(input).getRowResolver();
    RowResolver parentRR = opParseCtx.get(parent).getRowResolver();
    posToAliasMap.put(pos, new HashSet<String>(inputRR.getTableNames()));
    List<ColumnInfo> columns = parentRR.getColumnInfos();
    for (int i = 0; i < index.length; i++) {
      ColumnInfo prev = columns.get(i);
      String[] nm = parentRR.reverseLookup(prev.getInternalName());
      String[] nm2 = parentRR.getAlternateMappings(prev.getInternalName());
      if (outputRR.get(nm[0], nm[1]) != null) {
        continue;
      }
      ColumnInfo info = new ColumnInfo(prev);
      String field;
      if (index[i] >= 0) {
        field = Utilities.ReduceField.KEY + "." + keyColNames.get(index[i]);
      } else {
        field = Utilities.ReduceField.VALUE + "." + valColNames.get(-index[i] - 1);
      }
      String internalName = getColumnInternalName(outputColumnNames.size());
      ExprNodeColumnDesc desc = new ExprNodeColumnDesc(info.getType(), field, info.getTabAlias(), info.getIsVirtualCol());
      info.setInternalName(internalName);
      colExprMap.put(internalName, desc);
      outputRR.put(nm[0], nm[1], info);
      if (nm2 != null) {
        outputRR.addMappingOnly(nm2[0], nm2[1], info);
      }
      valueDesc.add(desc);
      outputColumnNames.add(internalName);
      reversedExprs.put(internalName, tag);
      // Populate semijoin select if needed
      if (omitOpts == null || !omitOpts.contains(pos)) {
        topSelectInputColumns.add(info);
      }
    }
    for (ASTNode cond : join.getFilters().get(tag)) {
      filterDesc.add(genExprNodeDesc(cond, inputRR));
    }
    exprMap.put(tag, valueDesc);
    filterMap.put(tag, filterDesc);
    rightOps[pos] = input;
  }
  JoinCondDesc[] joinCondns = new JoinCondDesc[join.getJoinCond().length];
  for (int i = 0; i < join.getJoinCond().length; i++) {
    JoinCond condn = join.getJoinCond()[i];
    joinCondns[i] = new JoinCondDesc(condn);
  }
  JoinDesc desc = new JoinDesc(exprMap, outputColumnNames, join.getNoOuterJoin(), joinCondns, filterMap, joinKeys, null);
  desc.setReversedExprs(reversedExprs);
  desc.setFilterMap(join.getFilterMap());
  // Add filters that apply to more than one input
  if (join.getPostJoinFilters().size() != 0 && (!join.getNoOuterJoin() || !join.getNoSemiJoin() || HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PUSH_RESIDUAL_INNER))) {
    LOG.debug("Generate JOIN with post-filtering conditions");
    List<ExprNodeDesc> residualFilterExprs = new ArrayList<ExprNodeDesc>();
    for (ASTNode cond : join.getPostJoinFilters()) {
      residualFilterExprs.add(genExprNodeDesc(cond, outputRR, false, isCBOExecuted()));
    }
    desc.setResidualFilterExprs(residualFilterExprs);
    // Clean post-conditions
    join.getPostJoinFilters().clear();
  }
  JoinOperator joinOp = (JoinOperator) OperatorFactory.getAndMakeChild(getOpContext(), desc, new RowSchema(outputRR.getColumnInfos()), rightOps);
  joinOp.setColumnExprMap(colExprMap);
  joinOp.setPosToAliasMap(posToAliasMap);
  if (join.getNullSafes() != null) {
    boolean[] nullsafes = new boolean[join.getNullSafes().size()];
    for (int i = 0; i < nullsafes.length; i++) {
      nullsafes[i] = join.getNullSafes().get(i);
    }
    desc.setNullSafes(nullsafes);
  }
  Operator<?> topOp = putOpInsertMap(joinOp, outputRR);
  if (omitOpts != null && !omitOpts.isEmpty() && desc.getResidualFilterExprs() != null && !desc.getResidualFilterExprs().isEmpty()) {
    // Add a select operator on top of the semijoin to ensure that only the correct columns are projected
    final List<ExprNodeDesc> topSelectExprs = new ArrayList<>();
    final List<String> topSelectOutputColNames = new ArrayList<>();
    final RowResolver topSelectRR = new RowResolver();
    final Map<String, ExprNodeDesc> topSelectColExprMap = new HashMap<String, ExprNodeDesc>();
    for (ColumnInfo colInfo : topSelectInputColumns) {
      ExprNodeColumnDesc columnExpr = new ExprNodeColumnDesc(colInfo);
      topSelectExprs.add(columnExpr);
      topSelectOutputColNames.add(colInfo.getInternalName());
      topSelectColExprMap.put(colInfo.getInternalName(), columnExpr);
      String[] nm = outputRR.reverseLookup(columnExpr.getColumn());
      String[] nm2 = outputRR.getAlternateMappings(columnExpr.getColumn());
      topSelectRR.put(nm[0], nm[1], colInfo);
      if (nm2 != null) {
        topSelectRR.addMappingOnly(nm2[0], nm2[1], colInfo);
      }
    }
    final SelectDesc topSelect = new SelectDesc(topSelectExprs, topSelectOutputColNames);
    topOp = putOpInsertMap(OperatorFactory.getAndMakeChild(topSelect, new RowSchema(topSelectRR.getColumnInfos()), topOp), topSelectRR);
    topOp.setColumnExprMap(topSelectColExprMap);
  }
  return topOp;
}
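The inner loop above decodes the ReduceSink's value index: a non-negative entry i selects key column keyColNames[i], while a negative entry n selects value column valColNames[-n - 1]. Below is a small, self-contained sketch of that convention; the "KEY."/"VALUE." prefixes stand in for Utilities.ReduceField, and resolveField plus the example column names are hypothetical, not Hive APIs.

import java.util.List;

public class ValueIndexSketch {
  static String resolveField(int indexEntry, List<String> keyColNames, List<String> valColNames) {
    if (indexEntry >= 0) {
      return "KEY." + keyColNames.get(indexEntry);          // stands in for Utilities.ReduceField.KEY
    }
    return "VALUE." + valColNames.get(-indexEntry - 1);     // stands in for Utilities.ReduceField.VALUE
  }

  public static void main(String[] args) {
    List<String> keys = List.of("reducesinkkey0", "reducesinkkey1");
    List<String> vals = List.of("_col0", "_col1");
    System.out.println(resolveField(1, keys, vals));    // KEY.reducesinkkey1
    System.out.println(resolveField(-2, keys, vals));   // VALUE._col1
  }
}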
Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
The class SetSparkReducerParallelism, method process.
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
  OptimizeSparkProcContext context = (OptimizeSparkProcContext) procContext;
  ReduceSinkOperator sink = (ReduceSinkOperator) nd;
  ReduceSinkDesc desc = sink.getConf();
  Set<ReduceSinkOperator> parentSinks = null;
  int maxReducers = context.getConf().getIntVar(HiveConf.ConfVars.MAXREDUCERS);
  int constantReducers = context.getConf().getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
  if (!useOpStats) {
    parentSinks = OperatorUtils.findOperatorsUpstream(sink, ReduceSinkOperator.class);
    parentSinks.remove(sink);
    if (!context.getVisitedReduceSinks().containsAll(parentSinks)) {
      // We haven't processed all the parent sinks, and we need
      // them to be done in order to compute the parallelism for this sink.
      // In this case, skip. We should visit this again from another path.
      LOG.debug("Skipping sink " + sink + " for now as we haven't seen all its parents.");
      return false;
    }
  }
  if (context.getVisitedReduceSinks().contains(sink)) {
    // skip walking the children
    LOG.debug("Already processed reduce sink: " + sink.getName());
    return true;
  }
  context.getVisitedReduceSinks().add(sink);
  if (needSetParallelism(sink, context.getConf())) {
    if (constantReducers > 0) {
      LOG.info("Parallelism for reduce sink " + sink + " set by user to " + constantReducers);
      desc.setNumReducers(constantReducers);
    } else {
      // If it's a FileSink to bucketed files, use the bucket count as the reducer number
      FileSinkOperator fso = GenSparkUtils.getChildOperator(sink, FileSinkOperator.class);
      if (fso != null) {
        String bucketCount = fso.getConf().getTableInfo().getProperties().getProperty(hive_metastoreConstants.BUCKET_COUNT);
        int numBuckets = bucketCount == null ? 0 : Integer.parseInt(bucketCount);
        if (numBuckets > 0) {
          LOG.info("Set parallelism for reduce sink " + sink + " to: " + numBuckets + " (buckets)");
          desc.setNumReducers(numBuckets);
          return false;
        }
      }
      if (useOpStats || parentSinks.isEmpty()) {
        long numberOfBytes = 0;
        if (useOpStats) {
          // we need to add up all the estimates from the siblings of this reduce sink
          for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
            if (sibling.getStatistics() != null) {
              numberOfBytes = StatsUtils.safeAdd(numberOfBytes, sibling.getStatistics().getDataSize());
              if (LOG.isDebugEnabled()) {
                LOG.debug("Sibling " + sibling + " has stats: " + sibling.getStatistics());
              }
            } else {
              LOG.warn("No stats available from: " + sibling);
            }
          }
        } else {
          // we should use TS stats to infer parallelism
          for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
            Set<TableScanOperator> sources = OperatorUtils.findOperatorsUpstream(sibling, TableScanOperator.class);
            for (TableScanOperator source : sources) {
              if (source.getStatistics() != null) {
                numberOfBytes = StatsUtils.safeAdd(numberOfBytes, source.getStatistics().getDataSize());
                if (LOG.isDebugEnabled()) {
                  LOG.debug("Table source " + source + " has stats: " + source.getStatistics());
                }
              } else {
                LOG.warn("No stats available from table source: " + source);
              }
            }
          }
          LOG.debug("Gathered stats for sink " + sink + ". Total size is " + numberOfBytes + " bytes.");
        }
        // Divide it by 2 so that we can have more reducers
        long bytesPerReducer = context.getConf().getLongVar(HiveConf.ConfVars.BYTESPERREDUCER) / 2;
        int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, maxReducers, false);
        getSparkMemoryAndCores(context);
        if (sparkMemoryAndCores != null && sparkMemoryAndCores.getFirst() > 0 && sparkMemoryAndCores.getSecond() > 0) {
          // warn the user if bytes per reducer is much larger than memory per task
          if ((double) sparkMemoryAndCores.getFirst() / bytesPerReducer < 0.5) {
            LOG.warn("Average load of a reducer is much larger than its available memory. " + "Consider decreasing hive.exec.reducers.bytes.per.reducer");
          }
          // If there are more cores, use the number of cores
          numReducers = Math.max(numReducers, sparkMemoryAndCores.getSecond());
        }
        numReducers = Math.min(numReducers, maxReducers);
        LOG.info("Set parallelism for reduce sink " + sink + " to: " + numReducers + " (calculated)");
        desc.setNumReducers(numReducers);
      } else {
        // Use the maximum parallelism from all parent reduce sinks
        int numberOfReducers = 0;
        for (ReduceSinkOperator parent : parentSinks) {
          numberOfReducers = Math.max(numberOfReducers, parent.getConf().getNumReducers());
        }
        desc.setNumReducers(numberOfReducers);
        LOG.debug("Set parallelism for sink " + sink + " to " + numberOfReducers + " based on its parents");
      }
      final Collection<ExprNodeDesc.ExprNodeDescEqualityWrapper> keyCols = ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getKeyCols());
      final Collection<ExprNodeDesc.ExprNodeDescEqualityWrapper> partCols = ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getPartitionCols());
      if (keyCols != null && keyCols.equals(partCols)) {
        desc.setReducerTraits(EnumSet.of(UNIFORM));
      }
    }
  } else {
    LOG.info("Number of reducers for sink " + sink + " was already determined to be: " + desc.getNumReducers());
  }
  return false;
}
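The sizing logic above halves hive.exec.reducers.bytes.per.reducer, estimates a reducer count from the gathered byte total, optionally raises it to the number of available cores, and caps it at the configured maximum. Here is a minimal sketch of that arithmetic only, assuming estimateReducers behaves roughly like a ceiling division clamped to [1, maxReducers]; the real Utilities.estimateReducers contains additional logic, and ReducerEstimateSketch is a hypothetical name.

public class ReducerEstimateSketch {
  static int estimate(long totalBytes, long configuredBytesPerReducer, int maxReducers, int availableCores) {
    long bytesPerReducer = configuredBytesPerReducer / 2;          // halved so that we get more reducers
    long estimated = Math.max(1, (totalBytes + bytesPerReducer - 1) / bytesPerReducer);
    int numReducers = (int) Math.min(estimated, maxReducers);
    numReducers = Math.max(numReducers, availableCores);           // if there are more cores, use them
    return Math.min(numReducers, maxReducers);                     // never exceed the configured maximum
  }

  public static void main(String[] args) {
    // 10 GiB of input, 256 MiB per reducer before halving, max 1009 reducers, 40 executor cores
    System.out.println(estimate(10L << 30, 256L << 20, 1009, 40));   // 80
  }
}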
Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
The class SparkSMBJoinHintOptimizer, method removeSmallTableReduceSink.
/**
 * In a bucket map join, there are ReduceSinks that mark a small-table parent (the ReduceSinks
 * are removed from the big table). In an SMB join these are not expected for any parent,
 * whether from the small or the big table.
 * @param mapJoinOp
 */
@SuppressWarnings("unchecked")
private void removeSmallTableReduceSink(MapJoinOperator mapJoinOp) {
  SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf());
  List<Operator<? extends OperatorDesc>> parentOperators = mapJoinOp.getParentOperators();
  for (int i = 0; i < parentOperators.size(); i++) {
    Operator<? extends OperatorDesc> par = parentOperators.get(i);
    if (i != smbJoinDesc.getPosBigTable()) {
      if (par instanceof ReduceSinkOperator) {
        List<Operator<? extends OperatorDesc>> grandParents = par.getParentOperators();
        Preconditions.checkArgument(grandParents.size() == 1, "AssertionError: expect # of parents to be 1, but was " + grandParents.size());
        Operator<? extends OperatorDesc> grandParent = grandParents.get(0);
        grandParent.removeChild(par);
        grandParent.setChildOperators(Utilities.makeList(mapJoinOp));
        mapJoinOp.getParentOperators().set(i, grandParent);
      }
    }
  }
}
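The rewiring above drops the small-table ReduceSink and connects its single parent directly to the map join. The following heavily simplified sketch shows the same pointer surgery on a toy operator tree; Node and bypassReduceSink are hypothetical stand-ins, not Hive classes, and the real code also replaces the whole child list via Utilities.makeList.

import java.util.ArrayList;
import java.util.List;

public class BypassSketch {
  static final class Node {
    final String name;
    final List<Node> parents = new ArrayList<>();
    final List<Node> children = new ArrayList<>();
    Node(String name) { this.name = name; }
  }

  // Replace join.parents[pos] (a ReduceSink) with that sink's single parent.
  static void bypassReduceSink(Node join, int pos) {
    Node sink = join.parents.get(pos);
    if (sink.parents.size() != 1) {
      throw new IllegalStateException("expected exactly one parent, found " + sink.parents.size());
    }
    Node grandParent = sink.parents.get(0);
    grandParent.children.remove(sink);        // detach the ReduceSink from its parent
    grandParent.children.add(join);           // the parent now feeds the join directly
    join.parents.set(pos, grandParent);       // the join sees the grandparent instead of the sink
  }

  public static void main(String[] args) {
    Node scan = new Node("TS_small");
    Node sink = new Node("RS_small");
    Node join = new Node("MAPJOIN");
    scan.children.add(sink); sink.parents.add(scan);
    sink.children.add(join); join.parents.add(sink);
    bypassReduceSink(join, 0);
    System.out.println(join.parents.get(0).name);    // TS_small
    System.out.println(scan.children.get(0).name);   // MAPJOIN
  }
}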
Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
The class ReduceSinkDeDuplicationUtils, method merge.
// For the JOIN-RS case, it is generally not possible to merge if the child has
// fewer key/partition columns than its parents
public static boolean merge(ReduceSinkOperator cRS, JoinOperator pJoin, int minReducer) throws SemanticException {
  List<Operator<?>> parents = pJoin.getParentOperators();
  ReduceSinkOperator[] pRSs = parents.toArray(new ReduceSinkOperator[parents.size()]);
  ReduceSinkDesc cRSc = cRS.getConf();
  for (ReduceSinkOperator pRSNs : pRSs) {
    ReduceSinkDesc pRSNc = pRSNs.getConf();
    if (cRSc.getKeyCols().size() != pRSNc.getKeyCols().size()) {
      return false;
    }
    if (cRSc.getPartitionCols().size() != pRSNc.getPartitionCols().size()) {
      return false;
    }
    Integer moveReducerNumTo = checkNumReducer(cRSc.getNumReducers(), pRSNc.getNumReducers());
    if (moveReducerNumTo == null || moveReducerNumTo > 0 && cRSc.getNumReducers() < minReducer) {
      return false;
    }
    Integer moveRSOrderTo = checkOrder(true, cRSc.getOrder(), pRSNc.getOrder(), cRSc.getNullOrder(), pRSNc.getNullOrder());
    if (moveRSOrderTo == null) {
      return false;
    }
  }
  boolean[] sorted = CorrelationUtilities.getSortedTags(pJoin);
  int cKeySize = cRSc.getKeyCols().size();
  for (int i = 0; i < cKeySize; i++) {
    ExprNodeDesc cexpr = cRSc.getKeyCols().get(i);
    ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
    for (int tag = 0; tag < pRSs.length; tag++) {
      pexprs[tag] = pRSs[tag].getConf().getKeyCols().get(i);
    }
    int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
    if (found != i) {
      return false;
    }
  }
  int cPartSize = cRSc.getPartitionCols().size();
  for (int i = 0; i < cPartSize; i++) {
    ExprNodeDesc cexpr = cRSc.getPartitionCols().get(i);
    ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
    for (int tag = 0; tag < pRSs.length; tag++) {
      pexprs[tag] = pRSs[tag].getConf().getPartitionCols().get(i);
    }
    int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
    if (found != i) {
      return false;
    }
  }
  for (ReduceSinkOperator pRS : pRSs) {
    pRS.getConf().setNumReducers(cRS.getConf().getNumReducers());
  }
  return true;
}
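merge() refuses to collapse the child ReduceSink into the join's parent ReduceSinks unless key and partition columns line up position by position across every parent. The following minimal sketch shows only that alignment test, using plain column names; sameColumnOrder is a hypothetical stand-in for the CorrelationUtilities.indexOf-based loops, not a Hive API.

import java.util.List;

public class DedupAlignmentSketch {
  static boolean sameColumnOrder(List<String> childKeys, List<List<String>> parentKeys) {
    for (List<String> pKeys : parentKeys) {
      if (pKeys.size() != childKeys.size()) {
        return false;                            // differing key widths can never merge
      }
    }
    for (int i = 0; i < childKeys.size(); i++) {
      for (List<String> pKeys : parentKeys) {
        if (pKeys.indexOf(childKeys.get(i)) != i) {
          return false;                          // column missing or at a different position
        }
      }
    }
    return true;
  }

  public static void main(String[] args) {
    List<String> child = List.of("key", "ds");
    System.out.println(sameColumnOrder(child, List.of(List.of("key", "ds"), List.of("key", "ds"))));   // true
    System.out.println(sameColumnOrder(child, List.of(List.of("ds", "key"))));                         // false
  }
}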