Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project Hive by Apache.
The class SharedWorkOptimizer, method compareOperator.
private static boolean compareOperator(ParseContext pctx, Operator<?> op1, Operator<?> op2)
    throws SemanticException {
  if (!op1.getClass().getName().equals(op2.getClass().getName())) {
    return false;
  }
  // TODO: move this to logicalEquals
  if (op1 instanceof ReduceSinkOperator) {
    ReduceSinkDesc op1Conf = ((ReduceSinkOperator) op1).getConf();
    ReduceSinkDesc op2Conf = ((ReduceSinkOperator) op2).getConf();
    // note: getParitionColsString is the actual (misspelled) method name in ReduceSinkDesc
    if (StringUtils.equals(op1Conf.getKeyColString(), op2Conf.getKeyColString())
        && StringUtils.equals(op1Conf.getValueColsString(), op2Conf.getValueColsString())
        && StringUtils.equals(op1Conf.getParitionColsString(), op2Conf.getParitionColsString())
        && op1Conf.getTag() == op2Conf.getTag()
        && StringUtils.equals(op1Conf.getOrder(), op2Conf.getOrder())
        && op1Conf.getTopN() == op2Conf.getTopN()
        && op1Conf.isAutoParallel() == op2Conf.isAutoParallel()) {
      return true;
    } else {
      return false;
    }
  }
  // TODO: move this to logicalEquals
  if (op1 instanceof TableScanOperator) {
    TableScanOperator tsOp1 = (TableScanOperator) op1;
    TableScanOperator tsOp2 = (TableScanOperator) op2;
    TableScanDesc op1Conf = tsOp1.getConf();
    TableScanDesc op2Conf = tsOp2.getConf();
    Table tableMeta1 = op1Conf.getTableMetadata();
    Table tableMeta2 = op2Conf.getTableMetadata();
    if (StringUtils.equals(tableMeta1.getFullyQualifiedName(), tableMeta2.getFullyQualifiedName())
        && op1Conf.getNeededColumns().equals(op2Conf.getNeededColumns())
        && StringUtils.equals(op1Conf.getFilterExprString(), op2Conf.getFilterExprString())
        && pctx.getPrunedPartitions(tsOp1).getPartitions().equals(
            pctx.getPrunedPartitions(tsOp2).getPartitions())
        && op1Conf.getRowLimit() == op2Conf.getRowLimit()) {
      return true;
    } else {
      return false;
    }
  }
  return op1.logicalEquals(op2);
}
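For orientation, here is a minimal sketch of how such an operator comparison might be used when checking whether two candidate branches are equivalent bottom-up. The helper name areBranchesEquivalent and the parent-by-parent recursion are assumptions made for this illustration, not code from SharedWorkOptimizer; only compareOperator(...) and Operator#getParentOperators() from the snippet above are relied on.

// Hypothetical helper (illustration only): two branches are treated as equivalent
// only if their operators match pairwise all the way up to the table scans.
private static boolean areBranchesEquivalent(ParseContext pctx, Operator<?> op1, Operator<?> op2)
    throws SemanticException {
  if (!compareOperator(pctx, op1, op2)) {
    return false;
  }
  List<Operator<? extends OperatorDesc>> parents1 = op1.getParentOperators();
  List<Operator<? extends OperatorDesc>> parents2 = op2.getParentOperators();
  boolean root1 = (parents1 == null || parents1.isEmpty());
  boolean root2 = (parents2 == null || parents2.isEmpty());
  if (root1 || root2) {
    // root operators (e.g. table scans) have no parents; both branches must end here
    return root1 && root2;
  }
  if (parents1.size() != parents2.size()) {
    return false;
  }
  for (int i = 0; i < parents1.size(); i++) {
    if (!areBranchesEquivalent(pctx, parents1.get(i), parents2.get(i))) {
      return false;
    }
  }
  return true;
}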
Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project Hive by Apache.
The class SharedWorkOptimizer, method gatherDPPTableScanOps.
/**
* This method gathers the TS operators with DPP from the context and
* stores them into the input optimization cache.
*/
private static void gatherDPPTableScanOps(
    ParseContext pctx, SharedWorkOptimizerCache optimizerCache) throws SemanticException {
  // Find TS operators with partition pruning enabled in the plan,
  // because these TS may potentially read different data for
  // different pipelines.
  // These can be:
  // 1) TS with DPP.
  // 2) TS with semijoin DPP.
  Map<String, TableScanOperator> topOps = pctx.getTopOps();
  Collection<Operator<? extends OperatorDesc>> tableScanOps =
      Lists.<Operator<?>>newArrayList(topOps.values());
  Set<AppMasterEventOperator> s =
      OperatorUtils.findOperators(tableScanOps, AppMasterEventOperator.class);
  for (AppMasterEventOperator a : s) {
    if (a.getConf() instanceof DynamicPruningEventDesc) {
      DynamicPruningEventDesc dped = (DynamicPruningEventDesc) a.getConf();
      optimizerCache.tableScanToDPPSource.put(dped.getTableScan(), a);
    }
  }
  for (Entry<ReduceSinkOperator, SemiJoinBranchInfo> e : pctx.getRsToSemiJoinBranchInfo().entrySet()) {
    optimizerCache.tableScanToDPPSource.put(e.getValue().getTsOp(), e.getKey());
  }
  LOG.debug("DPP information stored in the cache: {}", optimizerCache.tableScanToDPPSource);
}
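A small sketch of how the populated cache might be consulted later, for instance before deciding that two table scans can be merged. The helper name hasDPPSource is made up for this illustration; only the tableScanToDPPSource structure used above is assumed, and containsKey is available whether it is a plain map or a multimap.

// Hypothetical helper (illustration only): is this table scan the target of any DPP
// source, event-based or semijoin, recorded by gatherDPPTableScanOps above?
private static boolean hasDPPSource(SharedWorkOptimizerCache optimizerCache, TableScanOperator ts) {
  return optimizerCache.tableScanToDPPSource.containsKey(ts);
}

A scan that appears as a key here may read different data depending on runtime pruning, which is exactly why these scans are collected before any work-sharing decision is made.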
Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project Hive by Apache.
The class ConvertJoinMapJoin, method removeCycleCreatingSemiJoinOps.
// Remove any semijoin branch associated with the hashjoin's parent operator
// pipeline which can cause a cycle after hashjoin optimization.
private void removeCycleCreatingSemiJoinOps(MapJoinOperator mapjoinOp,
    Operator<?> parentSelectOpOfBigTable, ParseContext parseContext) throws SemanticException {
  Map<ReduceSinkOperator, TableScanOperator> semiJoinMap =
      new HashMap<ReduceSinkOperator, TableScanOperator>();
  for (Operator<?> op : parentSelectOpOfBigTable.getChildOperators()) {
    if (!(op instanceof SelectOperator)) {
      continue;
    }
    while (op.getChildOperators().size() > 0) {
      op = op.getChildOperators().get(0);
    }
    // If not a ReduceSink Op, skip
    if (!(op instanceof ReduceSinkOperator)) {
      continue;
    }
    ReduceSinkOperator rs = (ReduceSinkOperator) op;
    // Look up the branch info first so an RS without a semijoin branch does not NPE.
    SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(rs);
    TableScanOperator ts = (sjInfo == null) ? null : sjInfo.getTsOp();
    if (ts == null) {
      // skip, no semijoin branch
      continue;
    }
    // Found a semijoin branch.
    // There can be more than one semijoin branch coming from the parent
    // GBY Operator of the RS Operator.
    Operator<?> parentGB = op.getParentOperators().get(0);
    for (Operator<?> childRS : parentGB.getChildOperators()) {
      // Get the RS and TS for this branch
      rs = (ReduceSinkOperator) childRS;
      ts = parseContext.getRsToSemiJoinBranchInfo().get(rs).getTsOp();
      assert ts != null;
      for (Operator<?> parent : mapjoinOp.getParentOperators()) {
        if (!(parent instanceof ReduceSinkOperator)) {
          continue;
        }
        Set<TableScanOperator> tsOps =
            OperatorUtils.findOperatorsUpstream(parent, TableScanOperator.class);
        boolean found = false;
        for (TableScanOperator parentTS : tsOps) {
          // If the parent is the same as the ts, then we have a cycle.
          if (ts == parentTS) {
            semiJoinMap.put(rs, ts);
            found = true;
            break;
          }
        }
        if (found) {
          break;
        }
      }
    }
  }
  if (semiJoinMap.size() > 0) {
    for (ReduceSinkOperator rs : semiJoinMap.keySet()) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Found semijoin optimization from the big table side of a map join, "
            + "which will cause a task cycle. Removing semijoin "
            + OperatorUtils.getOpNamePretty(rs) + " - "
            + OperatorUtils.getOpNamePretty(semiJoinMap.get(rs)));
      }
      GenTezUtils.removeBranch(rs);
      GenTezUtils.removeSemiJoinOperator(parseContext, rs, semiJoinMap.get(rs));
    }
  }
}
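The cycle test at the core of the inner loops can be read in isolation: a semijoin branch is a problem exactly when the table scan it targets also sits upstream of the map join it was meant to speed up. Here is a minimal restatement of that predicate, assuming only OperatorUtils.findOperatorsUpstream and the operator accessors used above; the helper name createsTaskCycle is invented for this sketch.

// Hypothetical restatement (illustration only): true when the semijoin's target table
// scan also feeds the map join through one of its ReduceSink parents, which would form
// a cycle between the corresponding Tez tasks.
private static boolean createsTaskCycle(MapJoinOperator mapjoinOp, TableScanOperator semiJoinTargetTS) {
  for (Operator<?> parent : mapjoinOp.getParentOperators()) {
    if (!(parent instanceof ReduceSinkOperator)) {
      continue;
    }
    for (TableScanOperator upstreamTS :
        OperatorUtils.findOperatorsUpstream(parent, TableScanOperator.class)) {
      if (upstreamTS == semiJoinTargetTS) {
        return true;
      }
    }
  }
  return false;
}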
Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project Hive by Apache.
The class ConvertJoinMapJoin, method checkConvertJoinSMBJoin.
/*
 * This method tries to convert a join to an SMB join. This is done based on
 * traits: if the sort columns are the same as the join columns, we can
 * convert the join to an SMB join. Otherwise we retain the bucket map join, as it
 * is still more efficient than a regular join.
 */
private boolean checkConvertJoinSMBJoin(JoinOperator joinOp, OptimizeTezProcContext context,
    int bigTablePosition, TezBucketJoinProcCtx tezBucketJoinProcCtx) throws SemanticException {
  ReduceSinkOperator bigTableRS =
      (ReduceSinkOperator) joinOp.getParentOperators().get(bigTablePosition);
  int numBuckets = bigTableRS.getParentOperators().get(0).getOpTraits().getNumBuckets();
  int size = -1;
  for (Operator<?> parentOp : joinOp.getParentOperators()) {
    // Every join input must agree on whether it has ReduceSink operators upstream;
    // if the sides are unbalanced in this respect, we cannot convert.
    // This is a workaround for now. The right fix would be to refactor the code in
    // MapRecordProcessor and ReduceRecordProcessor with respect to the sources.
    Set<ReduceSinkOperator> set = OperatorUtils.findOperatorsUpstream(
        parentOp.getParentOperators(), ReduceSinkOperator.class);
    if (size < 0) {
      size = set.size();
      continue;
    }
    if (((size > 0) && (set.size() > 0)) || ((size == 0) && (set.size() == 0))) {
      continue;
    } else {
      return false;
    }
  }
  // transformation of the join operation
  for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
    if (!(parentOp instanceof ReduceSinkOperator)) {
      // could be mux/demux operators. Currently not supported
      LOG.info("Found correlation optimizer operators. Cannot convert to SMB at this time.");
      return false;
    }
    ReduceSinkOperator rsOp = (ReduceSinkOperator) parentOp;
    if (!checkColEquality(rsOp.getParentOperators().get(0).getOpTraits().getSortCols(),
        rsOp.getOpTraits().getSortCols(), rsOp.getColumnExprMap(), false)) {
      LOG.info("We cannot convert to SMB because the sort column names do not match.");
      return false;
    }
    if (!checkColEquality(rsOp.getParentOperators().get(0).getOpTraits().getBucketColNames(),
        rsOp.getOpTraits().getBucketColNames(), rsOp.getColumnExprMap(), true)) {
      LOG.info("We cannot convert to SMB because the bucket column names do not match.");
      return false;
    }
  }
  if (numBuckets < 0) {
    numBuckets = bigTableRS.getConf().getNumReducers();
  }
  tezBucketJoinProcCtx.setNumBuckets(numBuckets);
  LOG.info("We can convert the join to an SMB join.");
  return true;
}
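The first loop's balance requirement can be stated more directly: either every join input has ReduceSink operators somewhere upstream, or none of them does. A minimal restatement under that reading, using only OperatorUtils.findOperatorsUpstream and getParentOperators from the snippet above; the helper name hasBalancedUpstreamReduceSinks is invented for this sketch.

// Hypothetical restatement (illustration only) of the balance check performed in the
// first loop of checkConvertJoinSMBJoin.
private static boolean hasBalancedUpstreamReduceSinks(JoinOperator joinOp) {
  Boolean expected = null;
  for (Operator<?> parentOp : joinOp.getParentOperators()) {
    boolean hasUpstreamRS = !OperatorUtils.findOperatorsUpstream(
        parentOp.getParentOperators(), ReduceSinkOperator.class).isEmpty();
    if (expected == null) {
      expected = hasUpstreamRS;
    } else if (expected != hasUpstreamRS) {
      // one input is fed by a shuffle pipeline while another is not
      return false;
    }
  }
  return true;
}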
Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project Hive by Apache.
The class ConvertJoinMapJoin, method checkConvertJoinBucketMapJoin.
/*
* If the parent reduce sink of the big table side has the same emit key cols as its parent, we
* can create a bucket map join eliminating the reduce sink.
*/
private boolean checkConvertJoinBucketMapJoin(JoinOperator joinOp, int bigTablePosition,
    TezBucketJoinProcCtx tezBucketJoinProcCtx) throws SemanticException {
  // bail on mux operators, because a mux operator masks the emit keys of its
  // constituent reduce sinks
  if (!(joinOp.getParentOperators().get(0) instanceof ReduceSinkOperator)) {
    LOG.info("Operator is " + joinOp.getParentOperators().get(0).getName()
        + ". Cannot convert to bucket map join");
    return false;
  }
  ReduceSinkOperator rs = (ReduceSinkOperator) joinOp.getParentOperators().get(bigTablePosition);
  List<List<String>> parentColNames = rs.getOpTraits().getBucketColNames();
  Operator<? extends OperatorDesc> parentOfParent = rs.getParentOperators().get(0);
  List<List<String>> grandParentColNames = parentOfParent.getOpTraits().getBucketColNames();
  int numBuckets = parentOfParent.getOpTraits().getNumBuckets();
  // all keys must match between the RS and its parent; otherwise bail out.
  if (!checkColEquality(grandParentColNames, parentColNames, rs.getColumnExprMap(), true)) {
    LOG.info("No info available to check for bucket map join. Cannot convert");
    return false;
  }
  /*
   * This is the case when the big table is a sub-query and is probably already bucketed by the
   * join column, for example by a group by operation.
   */
  if (numBuckets < 0) {
    numBuckets = rs.getConf().getNumReducers();
  }
  tezBucketJoinProcCtx.setNumBuckets(numBuckets);
  return true;
}
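To make the List<List<String>> shapes above concrete: getBucketColNames() returns one inner list of column names per bucketing specification carried in the operator's traits, and the ReduceSink side may refer to the same columns under its internal output names, which is why the column expression map is consulted. The column names below are made up for illustration, and the snippet deliberately does not call checkColEquality, whose body is not shown here; it only shows the kind of data that method receives.

// Toy data only (illustration): a grandparent bucketed by "key" and a ReduceSink whose
// traits carry the corresponding internal column name. A real comparison also has to
// translate names through rs.getColumnExprMap(), as checkConvertJoinBucketMapJoin does above.
List<List<String>> grandParentColNames =
    Collections.singletonList(Collections.singletonList("key"));
List<List<String>> parentColNames =
    Collections.singletonList(Collections.singletonList("KEY._col0"));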