use of org.apache.calcite.rel.core.SemiJoin in project hive by apache.
the class HiveRelMdSelectivity method computeInnerJoinSelectivity.
/**
 * Estimates the selectivity of a join predicate as the reciprocal of the
 * estimated number of distinct join-key combinations (1 / NDV).
 *
 * <p>If the flag returned by {@code getCombinedPredicateForJoin} is false, the
 * estimate is delegated to a {@code FilterSelectivityEstimator} over the
 * returned predicate instead.
 *
 * @param j the join being estimated; must be a {@code SemiJoin} or a
 *        {@code HiveJoin} whenever the predicate has equi-join elements
 * @param mq metadata query used for row counts and distinct row counts
 * @param predicate the predicate to estimate
 * @return the estimated selectivity; 1.0 when there are no equi-join elements
 * @throws RuntimeException if the join predicate info cannot be constructed,
 *         or if {@code j} is of an unexpected join class
 */
private Double computeInnerJoinSelectivity(Join j, RelMetadataQuery mq, RexNode predicate) {
  final Pair<Boolean, RexNode> combined = getCombinedPredicateForJoin(j, predicate);
  if (!combined.getKey()) {
    return new FilterSelectivityEstimator(j).estimateSelectivity(combined.getValue());
  }

  // 1. Break the combined predicate down into its join-key components.
  final JoinPredicateInfo joinInfo;
  try {
    joinInfo = JoinPredicateInfo.constructJoinPredicateInfo(j, combined.getValue());
  } catch (CalciteSemanticException e) {
    throw new RuntimeException(e);
  }

  // 2. Collect the NDV of every join-key column, keyed by its position in the
  //    join's output row: left columns keep their index, right columns are
  //    shifted by the number of left fields.
  final ImmutableMap.Builder<Integer, Double> ndvByJoinColumn = ImmutableMap.builder();
  final int leftFieldCount = j.getLeft().getRowType().getFieldCount();
  for (Integer leftKey : joinInfo.getProjsFromLeftPartOfJoinKeysInChildSchema()) {
    ndvByJoinColumn.put(leftKey,
        HiveRelMdDistinctRowCount.getDistinctRowCount(j.getLeft(), mq, leftKey));
  }
  for (Integer rightKey : joinInfo.getProjsFromRightPartOfJoinKeysInChildSchema()) {
    ndvByJoinColumn.put(rightKey + leftFieldCount,
        HiveRelMdDistinctRowCount.getDistinctRowCount(j.getRight(), mq, rightKey));
  }
  final ImmutableMap<Integer, Double> colStatMap = ndvByJoinColumn.build();

  // 3. Without equi-join elements the NDV stays at 1, i.e. selectivity 1.
  final List<JoinLeafPredicateInfo> equiPredicates = joinInfo.getEquiJoinPredicateElements();
  if (equiPredicates.isEmpty()) {
    return 1.0;
  }

  // 4. Combine the per-predicate NDVs and cap the result: the NDV of the join
  //    cannot exceed the left row count (semi join) or the cardinality of the
  //    cross join (regular join).
  double ndvEstimate = exponentialBackoff(equiPredicates, colStatMap);
  if (j instanceof SemiJoin) {
    ndvEstimate = Math.min(mq.getRowCount(j.getLeft()), ndvEstimate);
  } else if (j instanceof HiveJoin) {
    ndvEstimate = Math.min(mq.getRowCount(j.getLeft()) * mq.getRowCount(j.getRight()), ndvEstimate);
  } else {
    throw new RuntimeException("Unexpected Join type: " + j.getClass().getName());
  }

  // 5. Join selectivity = 1 / NDV
  return 1 / ndvEstimate;
}
use of org.apache.calcite.rel.core.SemiJoin in project hive by apache.
the class HiveOpConverter method translateJoin.
/**
 * Translates a join rel (HiveJoin, HiveMultiJoin or HiveSemiJoin) into a Hive
 * join operator.
 *
 * <p>Every input is translated first via {@code dispatch}; the first operator
 * of each translated input is cast to {@code ReduceSinkOperator}, and each
 * input rel is cast to {@code HiveSortExchange} to read its join expressions.
 *
 * @param joinRel the join to translate
 * @return the translated join together with its table alias and the positions
 *         of virtual columns in the join output
 * @throws SemanticException if {@code joinRel} is not one of the supported
 *         join classes, or if translating an input fails
 */
private OpAttr translateJoin(RelNode joinRel) throws SemanticException {
  // 0. Additional data structures needed for the join optimization
  //    through Hive
  final int inputCount = joinRel.getInputs().size();
  final String[] baseSrc = new String[inputCount];
  final String tabAlias = getHiveDerivedTableAlias();

  // 1. Convert inputs
  final OpAttr[] inputs = new OpAttr[inputCount];
  final List<Operator<?>> children = new ArrayList<Operator<?>>(inputCount);
  for (int pos = 0; pos < inputCount; pos++) {
    final OpAttr translated = dispatch(joinRel.getInput(pos));
    inputs[pos] = translated;
    children.add(translated.inputs.get(0));
    baseSrc[pos] = translated.tabAlias;
  }

  // 2. Generate tags: each child reduce sink is tagged with its position
  int nextTag = 0;
  for (Operator<?> child : children) {
    final ReduceSinkOperator reduceSink = (ReduceSinkOperator) child;
    reduceSink.getConf().setTag(nextTag);
    nextTag++;
  }

  // 3. Virtual columns: those of the first input are kept as they are; for
  //    anything but a semi join, those of the remaining inputs are shifted by
  //    the width of all inputs to their left.
  final Set<Integer> newVcolsInCalcite = new HashSet<Integer>();
  newVcolsInCalcite.addAll(inputs[0].vcolsInCalcite);
  final boolean includeOtherInputs =
      joinRel instanceof HiveMultiJoin || !(joinRel instanceof SemiJoin);
  if (includeOtherInputs) {
    int columnsToTheLeft = inputs[0].inputs.get(0).getSchema().getSignature().size();
    for (int pos = 1; pos < inputs.length; pos++) {
      final OpAttr input = inputs[pos];
      newVcolsInCalcite.addAll(HiveCalciteUtil.shiftVColsSet(input.vcolsInCalcite, columnsToTheLeft));
      columnsToTheLeft += input.inputs.get(0).getSchema().getSignature().size();
    }
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug("Translating operator rel#" + joinRel.getId() + ":" + joinRel.getRelTypeName() + " with row type: [" + joinRel.getRowType() + "]");
  }

  // 4. Extract join key expressions from HiveSortExchange
  final ExprNodeDesc[][] joinExpressions = new ExprNodeDesc[inputs.length][];
  for (int pos = 0; pos < joinExpressions.length; pos++) {
    final HiveSortExchange exchange = (HiveSortExchange) joinRel.getInput(pos);
    joinExpressions[pos] = exchange.getJoinExpressions();
  }

  // 5. Extract rest of join predicate info. We infer the rest of join condition
  //    that will be added to the filters (join conditions that are not part of
  //    the join key)
  final List<RexNode> joinFilters;
  if (joinRel instanceof HiveJoin) {
    joinFilters = ImmutableList.of(((HiveJoin) joinRel).getJoinFilter());
  } else if (joinRel instanceof HiveMultiJoin) {
    joinFilters = ((HiveMultiJoin) joinRel).getJoinFilters();
  } else if (joinRel instanceof HiveSemiJoin) {
    joinFilters = ImmutableList.of(((HiveSemiJoin) joinRel).getJoinFilter());
  } else {
    throw new SemanticException("Can't handle join type: " + joinRel.getClass().getName());
  }

  // One (possibly empty) list of converted conjuncts per join filter.
  final List<List<ExprNodeDesc>> filterExpressions = new ArrayList<List<ExprNodeDesc>>();
  for (RexNode joinFilter : joinFilters) {
    final List<ExprNodeDesc> converted = new ArrayList<ExprNodeDesc>();
    if (joinFilter != null) {
      for (RexNode conjunct : RelOptUtil.conjunctions(joinFilter)) {
        converted.add(convertToExprNode(conjunct, joinRel, null, newVcolsInCalcite));
      }
    }
    filterExpressions.add(converted);
  }

  // 6. Generate Join operator
  final JoinOperator joinOp =
      genJoin(joinRel, joinExpressions, filterExpressions, children, baseSrc, tabAlias);

  // 7. Return result
  return new OpAttr(tabAlias, newVcolsInCalcite, joinOp);
}
use of org.apache.calcite.rel.core.SemiJoin in project hive by apache.
the class ASTConverter method convertSource.
/**
 * Converts a rel node that acts as a query source into its AST together with
 * the schema it exposes.
 *
 * <ul>
 * <li>Table scans and Druid queries become table references.</li>
 * <li>Joins are converted recursively on both inputs. For a non-semi join
 * whose right input is itself a join, the inputs are swapped (and LEFT/RIGHT
 * join types mirrored). A semi join exposes only the left schema.</li>
 * <li>Unions are folded input by input; each step wraps the accumulated
 * union in a sub-query with a fresh alias.</li>
 * <li>Anything else is converted as a nested query and wrapped in a
 * sub-query with a fresh alias.</li>
 * </ul>
 *
 * @param r the source rel node
 * @return the schema and AST for {@code r}
 * @throws CalciteSemanticException if a nested conversion fails
 */
private QueryBlockInfo convertSource(RelNode r) throws CalciteSemanticException {
  Schema s = null;
  ASTNode ast = null;

  if (r instanceof TableScan) {
    TableScan scan = (TableScan) r;
    s = new Schema(scan);
    ast = ASTBuilder.table(scan);
  } else if (r instanceof DruidQuery) {
    DruidQuery druid = (DruidQuery) r;
    s = new Schema(druid);
    ast = ASTBuilder.table(druid);
  } else if (r instanceof Join) {
    Join join = (Join) r;
    QueryBlockInfo left = convertSource(join.getLeft());
    QueryBlockInfo right = convertSource(join.getRight());
    // The condition is always translated against the combined schema.
    Schema combined = new Schema(left.schema, right.schema);
    ASTNode cond = join.getCondition().accept(new RexVisitor(combined));
    boolean semiJoin = join instanceof SemiJoin;

    if (semiJoin || !(join.getRight() instanceof Join)) {
      ast = ASTBuilder.join(left.ast, right.ast, join.getJoinType(), cond, semiJoin);
    } else {
      // Invert join inputs; this is done because otherwise the SemanticAnalyzer
      // methods to merge joins will not kick in. It is not done for semijoin
      // since it would change the semantics.
      JoinRelType original = join.getJoinType();
      JoinRelType mirrored = original == JoinRelType.LEFT ? JoinRelType.RIGHT
          : original == JoinRelType.RIGHT ? JoinRelType.LEFT
          : original;
      ast = ASTBuilder.join(right.ast, left.ast, mirrored, cond, semiJoin);
    }

    s = semiJoin ? left.schema : combined;
  } else if (r instanceof Union) {
    Union u = (Union) r;
    ASTNode accumulated = new ASTConverter(u.getInput(0), this.derivedTableCount).convert();
    int inputCount = u.getInputs().size();
    for (int ind = 1; ind < inputCount; ind++) {
      ASTNode next = new ASTConverter(u.getInput(ind), this.derivedTableCount).convert();
      accumulated = getUnionAllAST(accumulated, next);
      String sqAlias = nextAlias();
      ast = ASTBuilder.subQuery(accumulated, sqAlias);
      s = new Schema(u, sqAlias);
    }
  } else {
    ASTConverter nested = new ASTConverter(r, this.derivedTableCount);
    ASTNode nestedAst = nested.convert();
    String sqAlias = nextAlias();
    s = nested.getRowSchema(sqAlias);
    ast = ASTBuilder.subQuery(nestedAst, sqAlias);
  }

  return new QueryBlockInfo(s, ast);
}
use of org.apache.calcite.rel.core.SemiJoin in project hive by apache.
the class HiveOpConverter method genJoin.
/**
 * Builds the Hive JoinOperator (and its JoinDesc) for a join rel whose inputs
 * have already been translated into ReduceSink operators.
 *
 * @param join the join rel; a HiveMultiJoin, a SemiJoin, or otherwise a rel
 *        that is cast to Join to extract its join type
 * @param joinExpressions join key expressions per input, handed to the JoinDesc
 * @param filterExpressions residual filter expressions; entry i is paired with
 *        join condition i
 * @param children the translated inputs; each is cast to ReduceSinkOperator and
 *        must have exactly one parent
 * @param baseSrc per-input source alias; used when backtracking through each
 *        ReduceSink and stored on the join descriptor
 * @param tabAlias table alias assigned to every output column
 * @return the generated join operator
 * @throws SemanticException if a ReduceSink input does not have exactly one
 *         parent
 */
private static JoinOperator genJoin(RelNode join, ExprNodeDesc[][] joinExpressions, List<List<ExprNodeDesc>> filterExpressions, List<Operator<?>> children, String[] baseSrc, String tabAlias) throws SemanticException {
// 1. Extract join type
JoinCondDesc[] joinCondns;
boolean semiJoin;
boolean noOuterJoin;
if (join instanceof HiveMultiJoin) {
// A multi-join carries one join condition per entry of getJoinInputs().
HiveMultiJoin hmj = (HiveMultiJoin) join;
joinCondns = new JoinCondDesc[hmj.getJoinInputs().size()];
for (int i = 0; i < hmj.getJoinInputs().size(); i++) {
joinCondns[i] = new JoinCondDesc(new JoinCond(hmj.getJoinInputs().get(i).left, hmj.getJoinInputs().get(i).right, transformJoinType(hmj.getJoinTypes().get(i))));
}
// Multi-joins are never handled as semijoins in this method.
semiJoin = false;
noOuterJoin = !hmj.isOuterJoin();
} else {
// Binary join: a single condition between input 0 and input 1.
joinCondns = new JoinCondDesc[1];
semiJoin = join instanceof SemiJoin;
JoinType joinType;
if (semiJoin) {
joinType = JoinType.LEFTSEMI;
} else {
joinType = extractJoinType((Join) join);
}
joinCondns[0] = new JoinCondDesc(new JoinCond(0, 1, joinType));
noOuterJoin = joinType != JoinType.FULLOUTER && joinType != JoinType.LEFTOUTER && joinType != JoinType.RIGHTOUTER;
}
// 2. We create the join aux structures
ArrayList<ColumnInfo> outputColumns = new ArrayList<ColumnInfo>();
ArrayList<String> outputColumnNames = new ArrayList<String>(join.getRowType().getFieldNames());
Operator<?>[] childOps = new Operator[children.size()];
Map<String, Byte> reversedExprs = new HashMap<String, Byte>();
Map<Byte, List<ExprNodeDesc>> exprMap = new HashMap<Byte, List<ExprNodeDesc>>();
Map<Byte, List<ExprNodeDesc>> filters = new HashMap<Byte, List<ExprNodeDesc>>();
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
HashMap<Integer, Set<String>> posToAliasMap = new HashMap<Integer, Set<String>>();
// Running index into outputColumnNames; it advances across all inputs, so
// the inputs must be processed in order.
int outputPos = 0;
for (int pos = 0; pos < children.size(); pos++) {
// 2.1. Backtracking from RS
ReduceSinkOperator inputRS = (ReduceSinkOperator) children.get(pos);
if (inputRS.getNumParent() != 1) {
throw new SemanticException("RS should have single parent");
}
Operator<?> parent = inputRS.getParentOperators().get(0);
ReduceSinkDesc rsDesc = inputRS.getConf();
int[] index = inputRS.getValueIndex();
Byte tag = (byte) rsDesc.getTag();
// 2.1.1. If semijoin, every input other than the first contributes no
// output columns: it gets an empty expression list and is otherwise skipped
// (no alias entry, no output columns, outputPos unchanged).
if (semiJoin && pos != 0) {
exprMap.put(tag, new ArrayList<ExprNodeDesc>());
childOps[pos] = inputRS;
continue;
}
posToAliasMap.put(pos, new HashSet<String>(inputRS.getSchema().getTableNames()));
List<String> keyColNames = rsDesc.getOutputKeyColumnNames();
List<String> valColNames = rsDesc.getOutputValueColumnNames();
Map<String, ExprNodeDesc> descriptors = buildBacktrackFromReduceSinkForJoin(outputPos, outputColumnNames, keyColNames, valColNames, index, parent, baseSrc[pos]);
List<ColumnInfo> parentColumns = parent.getSchema().getSignature();
// One output column per entry of the RS value index, copied positionally
// from the parent's schema and renamed to the join's output column name.
for (int i = 0; i < index.length; i++) {
ColumnInfo info = new ColumnInfo(parentColumns.get(i));
info.setInternalName(outputColumnNames.get(outputPos));
info.setTabAlias(tabAlias);
outputColumns.add(info);
// Remember which input (by tag) produces this output column.
reversedExprs.put(outputColumnNames.get(outputPos), tag);
outputPos++;
}
exprMap.put(tag, new ArrayList<ExprNodeDesc>(descriptors.values()));
colExprMap.putAll(descriptors);
childOps[pos] = inputRS;
}
// 3. We populate the filters and filterMap structure needed in the join descriptor
// 3.1. Start with one empty filter list per input
List<List<ExprNodeDesc>> filtersPerInput = Lists.newArrayList();
int[][] filterMap = new int[children.size()][];
for (int i = 0; i < children.size(); i++) {
filtersPerInput.add(new ArrayList<ExprNodeDesc>());
}
// 3.2. We populate the filters structure; filterExpressions.get(i) belongs
// to join condition i
for (int i = 0; i < filterExpressions.size(); i++) {
int leftPos = joinCondns[i].getLeft();
int rightPos = joinCondns[i].getRight();
for (ExprNodeDesc expr : filterExpressions.get(i)) {
// We need to update the exprNode, as currently
// they refer to columns in the output of the join;
// they should refer to the columns output by the RS
int inputPos = updateExprNode(expr, reversedExprs, colExprMap);
// -1 means updateExprNode did not report an input for this expression
// (presumably it references no input column - confirm against
// updateExprNode); such filters are attached to the left input.
if (inputPos == -1) {
inputPos = leftPos;
}
filtersPerInput.get(inputPos).add(expr);
// The filter map is only maintained for outer join conditions.
if (joinCondns[i].getType() == JoinDesc.FULL_OUTER_JOIN || joinCondns[i].getType() == JoinDesc.LEFT_OUTER_JOIN || joinCondns[i].getType() == JoinDesc.RIGHT_OUTER_JOIN) {
if (inputPos == leftPos) {
updateFilterMap(filterMap, leftPos, rightPos);
} else {
updateFilterMap(filterMap, rightPos, leftPos);
}
}
}
}
// 3.3. Register each input's filter list under the tag of its RS
for (int pos = 0; pos < children.size(); pos++) {
ReduceSinkOperator inputRS = (ReduceSinkOperator) children.get(pos);
ReduceSinkDesc rsDesc = inputRS.getConf();
Byte tag = (byte) rsDesc.getTag();
filters.put(tag, filtersPerInput.get(pos));
}
// 4. We create the join operator with its descriptor
JoinDesc desc = new JoinDesc(exprMap, outputColumnNames, noOuterJoin, joinCondns, filters, joinExpressions);
desc.setReversedExprs(reversedExprs);
desc.setFilterMap(filterMap);
JoinOperator joinOp = (JoinOperator) OperatorFactory.getAndMakeChild(childOps[0].getCompilationOpContext(), desc, new RowSchema(outputColumns), childOps);
joinOp.setColumnExprMap(colExprMap);
joinOp.setPosToAliasMap(posToAliasMap);
joinOp.getConf().setBaseSrc(baseSrc);
if (LOG.isDebugEnabled()) {
LOG.debug("Generated " + joinOp + " with row schema: [" + joinOp.getSchema() + "]");
}
return joinOp;
}
use of org.apache.calcite.rel.core.SemiJoin in project hive by apache.
the class HiveRelMdRowCount method analyzeJoinForPKFK.
/*
 * For T1 join T2 on T1.x = T2.y, if 'y' is identified as a key of T2 then the
 * join cardinality can be inferred as rowCount(T1) * selectivity(T2), i.e. this
 * is like a SemiJoin where T1 (the Fact/FK side) is filtered by a factor based
 * on the selectivity of the PK/Dim table side.
 *
 * 1. If both T1.x and T2.y are keys, the input with fewer rows is used as the
 *    PK side (the left input when the row counts are equal).
 *    NOTE(review): this comment used to say "use the larger one as the PK
 *    side", which is not what the code does - confirm the intended rule.
 * 2. In case of outer joins: a) the FK side should be the null-preserving
 *    side; it doesn't make sense to apply this heuristic for Dim loj Fact or
 *    Fact roj Dim. b) The selectivity factor applied on the Fact table should
 *    be 1.
 *
 * Returns null whenever no PK-FK relationship can be derived: the join has no
 * condition, it is a SemiJoin, canHandleJoin rejects it, neither join column
 * is a key usable for the join type, or a required row count / distinct row
 * count is not available from the metadata query.
 */
public static PKFKRelationInfo analyzeJoinForPKFK(Join joinRel, RelMetadataQuery mq) {
  final RelNode left = joinRel.getInputs().get(0);
  final RelNode right = joinRel.getInputs().get(1);
  final List<RexNode> initJoinFilters = RelOptUtil.conjunctions(joinRel.getCondition());
  /*
   * No joining condition.
   */
  if (initJoinFilters.isEmpty()) {
    return null;
  }
  // Semi joins are not analyzed. The original note here is truncated; it
  // refers to a column-count assumption that is not true for semi joins
  // (presumably one made by RelOptUtil.classifyFilters below - confirm).
  if (joinRel instanceof SemiJoin) {
    return null;
  }

  final JoinRelType joinType = joinRel.getJoinType();
  final List<RexNode> leftFilters = new ArrayList<RexNode>();
  final List<RexNode> rightFilters = new ArrayList<RexNode>();
  final List<RexNode> joinFilters = new ArrayList<RexNode>(initJoinFilters);
  // Distribute the conjuncts over joinFilters / leftFilters / rightFilters;
  // the two flags derived from the join type control which side conjuncts may
  // be moved to (see RelOptUtil.classifyFilters).
  RelOptUtil.classifyFilters(joinRel, joinFilters, joinType, false, !joinType.generatesNullsOnRight(), !joinType.generatesNullsOnLeft(), joinFilters, leftFilters, rightFilters);

  final Pair<Integer, Integer> joinCols = canHandleJoin(joinRel, leftFilters, rightFilters, joinFilters);
  if (joinCols == null) {
    return null;
  }
  final int leftColIdx = joinCols.left;
  final int rightColIdx = joinCols.right;
  final RexBuilder rexBuilder = joinRel.getCluster().getRexBuilder();
  final RexNode leftPred = RexUtil.composeConjunction(rexBuilder, leftFilters, true);
  final RexNode rightPred = RexUtil.composeConjunction(rexBuilder, rightFilters, true);
  final ImmutableBitSet lBitSet = ImmutableBitSet.of(leftColIdx);
  final ImmutableBitSet rBitSet = ImmutableBitSet.of(rightColIdx);

  /*
   * If the form is Dim loj F or Fact roj Dim then return null: a side can only
   * act as the PK side for an inner join, or for the outer join type accepted
   * for that side below. (Semi joins were already rejected above.)
   */
  final boolean leftIsKey = (joinType == JoinRelType.INNER || joinType == JoinRelType.RIGHT) && isKey(lBitSet, left, mq);
  final boolean rightIsKey = (joinType == JoinRelType.INNER || joinType == JoinRelType.LEFT) && isKey(rBitSet, right, mq);
  if (!leftIsKey && !rightIsKey) {
    return null;
  }

  // The metadata query returns boxed values that may be null when no estimate
  // is available; treat that as "cannot analyze" instead of failing with a
  // NullPointerException on unboxing.
  final Double leftRows = mq.getRowCount(left);
  final Double rightRows = mq.getRowCount(right);
  if (leftRows == null || rightRows == null) {
    return null;
  }
  final double leftRowCount = leftRows;
  final double rightRowCount = rightRows;

  // At least one side is a key here. The left side is the PK side unless the
  // right side is also a key and has strictly fewer rows.
  final boolean pkOnLeft = leftIsKey && !(rightIsKey && rightRowCount < leftRowCount);
  final boolean isPKSideSimpleTree = IsSimpleTreeOnJoinKey.check(pkOnLeft ? left : right, pkOnLeft ? leftColIdx : rightColIdx, mq);

  /*
   * If the ndv of the PK - FK side don't match, and the PK side is a filter
   * on the Key column then scale the NDV on the FK side.
   *
   * As described by Peter Boncz: http://databasearchitects.blogspot.com/
   * in such cases we can be off by a large margin in the Join cardinality
   * estimate. The e.g. he provides is on the join of StoreSales and DateDim
   * on the TPCDS dataset. Since the DateDim is populated for 20 years into
   * the future, while the StoreSales only has 5 years worth of data, there
   * are 40 times fewer distinct dates in StoreSales.
   *
   * In general it is hard to infer the range for the foreign key on an
   * arbitrary expression. For e.g. the NDV for DayofWeek is the same
   * irrespective of NDV on the number of unique days, whereas the
   * NDV of Quarters has the same ratio as the NDV on the keys.
   *
   * But for expressions that apply only on columns that have the same NDV
   * as the key (implying that they are alternate keys) we can apply the
   * ratio. So in the case of StoreSales - DateDim joins for predicate on the
   * d_date column we can apply the scaling factor.
   */
  double leftNDV = -1;
  double rightNDV = -1;
  double ndvScalingFactor = 1.0;
  if (isPKSideSimpleTree) {
    final Double leftDistinct = mq.getDistinctRowCount(left, lBitSet, leftPred);
    final Double rightDistinct = mq.getDistinctRowCount(right, rBitSet, rightPred);
    if (leftDistinct == null || rightDistinct == null) {
      return null;
    }
    leftNDV = leftDistinct;
    rightNDV = rightDistinct;
    // PK-side NDV divided by FK-side NDV.
    ndvScalingFactor = pkOnLeft ? leftNDV / rightNDV : rightNDV / leftNDV;
  }

  if (pkOnLeft) {
    final FKSideInfo fkInfo = new FKSideInfo(rightRowCount, rightNDV);
    final double pkSel = pkSelectivity(joinRel, mq, true, left, leftRowCount);
    // When the FK (right) side can be null-extended, the PK selectivity is 1.
    final PKSideInfo pkInfo = new PKSideInfo(leftRowCount, leftNDV, joinType.generatesNullsOnRight() ? 1.0 : pkSel);
    return new PKFKRelationInfo(1, fkInfo, pkInfo, ndvScalingFactor, isPKSideSimpleTree);
  }

  final FKSideInfo fkInfo = new FKSideInfo(leftRowCount, leftNDV);
  final double pkSel = pkSelectivity(joinRel, mq, false, right, rightRowCount);
  // When the FK (left) side can be null-extended, the PK selectivity is 1.
  final PKSideInfo pkInfo = new PKSideInfo(rightRowCount, rightNDV, joinType.generatesNullsOnLeft() ? 1.0 : pkSel);
  // NOTE(review): both branches pass 1 as the first constructor argument, as
  // the original code did. If that argument identifies the FK side, this
  // branch (FK on the left) may need 0 - confirm against PKFKRelationInfo.
  return new PKFKRelationInfo(1, fkInfo, pkInfo, ndvScalingFactor, isPKSideSimpleTree);
}
Aggregations