
Example 1 with SemiJoin

use of org.apache.calcite.rel.core.SemiJoin in project hive by apache.

the class HiveRelMdSelectivity method computeInnerJoinSelectivity.

private Double computeInnerJoinSelectivity(Join j, RelMetadataQuery mq, RexNode predicate) {
    Pair<Boolean, RexNode> predInfo = getCombinedPredicateForJoin(j, predicate);
    if (!predInfo.getKey()) {
        return new FilterSelectivityEstimator(j).estimateSelectivity(predInfo.getValue());
    }
    RexNode combinedPredicate = predInfo.getValue();
    JoinPredicateInfo jpi;
    try {
        jpi = JoinPredicateInfo.constructJoinPredicateInfo(j, combinedPredicate);
    } catch (CalciteSemanticException e) {
        throw new RuntimeException(e);
    }
    ImmutableMap.Builder<Integer, Double> colStatMapBuilder = ImmutableMap.builder();
    ImmutableMap<Integer, Double> colStatMap;
    int rightOffSet = j.getLeft().getRowType().getFieldCount();
    // 1. Update the col stats map with NDVs for columns from the left side of the
    // join that are part of the join keys
    for (Integer ljk : jpi.getProjsFromLeftPartOfJoinKeysInChildSchema()) {
        colStatMapBuilder.put(ljk, HiveRelMdDistinctRowCount.getDistinctRowCount(j.getLeft(), mq, ljk));
    }
    // 2. Update the col stats map with NDVs for columns from the right side of the
    // join that are part of the join keys
    for (Integer rjk : jpi.getProjsFromRightPartOfJoinKeysInChildSchema()) {
        colStatMapBuilder.put(rjk + rightOffSet, HiveRelMdDistinctRowCount.getDistinctRowCount(j.getRight(), mq, rjk));
    }
    colStatMap = colStatMapBuilder.build();
    // 3. Walk through the join condition, building the NDV for selectivity.
    // The NDV of the join cannot exceed the cardinality of the cross join.
    List<JoinLeafPredicateInfo> peLst = jpi.getEquiJoinPredicateElements();
    int noOfPE = peLst.size();
    double ndvCrossProduct = 1;
    if (noOfPE > 0) {
        ndvCrossProduct = exponentialBackoff(peLst, colStatMap);
        if (j instanceof SemiJoin) {
            ndvCrossProduct = Math.min(mq.getRowCount(j.getLeft()), ndvCrossProduct);
        } else if (j instanceof HiveJoin) {
            ndvCrossProduct = Math.min(mq.getRowCount(j.getLeft()) * mq.getRowCount(j.getRight()), ndvCrossProduct);
        } else {
            throw new RuntimeException("Unexpected Join type: " + j.getClass().getName());
        }
    }
    // 4. Join Selectivity = 1/NDV
    return (1 / ndvCrossProduct);
}
Also used : HiveJoin(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin) ImmutableMap(com.google.common.collect.ImmutableMap) JoinLeafPredicateInfo(org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinLeafPredicateInfo) JoinPredicateInfo(org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinPredicateInfo) SemiJoin(org.apache.calcite.rel.core.SemiJoin) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException) RexNode(org.apache.calcite.rex.RexNode)
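
The final step divides 1 by an NDV estimate that exponentialBackoff builds from the per-predicate NDVs. A minimal standalone sketch of that backoff idea, assuming the NDVs have already been extracted as plain doubles (the class and method below are illustrative stand-ins, not Hive's own):

import java.util.Arrays;
import java.util.List;

public class JoinSelectivitySketch {

    // Exponential backoff: sort the per-predicate NDVs in descending order,
    // then dampen each successive NDV by a halving exponent, i.e.
    // ndv[0] * ndv[1]^(1/2) * ndv[2]^(1/4) * ...
    static double exponentialBackoff(List<Double> ndvs) {
        double[] sorted = ndvs.stream().mapToDouble(Double::doubleValue).toArray();
        Arrays.sort(sorted);
        double ndvCrossProduct = 1;
        double exponent = 1;
        for (int i = sorted.length - 1; i >= 0; i--) { // largest NDV first
            ndvCrossProduct *= Math.pow(sorted[i], exponent);
            exponent /= 2;
        }
        return ndvCrossProduct;
    }

    public static void main(String[] args) {
        // Two equi-join predicates with NDVs 1000 and 100:
        // 1000 * 100^0.5 = 10000, so selectivity = 1 / 10000 = 1.0E-4
        System.out.println(1 / exponentialBackoff(List.of(1000.0, 100.0)));
    }
}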

Example 2 with SemiJoin

use of org.apache.calcite.rel.core.SemiJoin in project hive by apache.

the class HiveOpConverter method translateJoin.

private OpAttr translateJoin(RelNode joinRel) throws SemanticException {
    // 0. Additional data structures needed for the join optimization
    // through Hive
    String[] baseSrc = new String[joinRel.getInputs().size()];
    String tabAlias = getHiveDerivedTableAlias();
    // 1. Convert inputs
    OpAttr[] inputs = new OpAttr[joinRel.getInputs().size()];
    List<Operator<?>> children = new ArrayList<Operator<?>>(joinRel.getInputs().size());
    for (int i = 0; i < inputs.length; i++) {
        inputs[i] = dispatch(joinRel.getInput(i));
        children.add(inputs[i].inputs.get(0));
        baseSrc[i] = inputs[i].tabAlias;
    }
    // 2. Generate tags
    for (int tag = 0; tag < children.size(); tag++) {
        ReduceSinkOperator reduceSinkOp = (ReduceSinkOperator) children.get(tag);
        reduceSinkOp.getConf().setTag(tag);
    }
    // 3. Virtual columns
    Set<Integer> newVcolsInCalcite = new HashSet<Integer>();
    newVcolsInCalcite.addAll(inputs[0].vcolsInCalcite);
    if (joinRel instanceof HiveMultiJoin || !(joinRel instanceof SemiJoin)) {
        int shift = inputs[0].inputs.get(0).getSchema().getSignature().size();
        for (int i = 1; i < inputs.length; i++) {
            newVcolsInCalcite.addAll(HiveCalciteUtil.shiftVColsSet(inputs[i].vcolsInCalcite, shift));
            shift += inputs[i].inputs.get(0).getSchema().getSignature().size();
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Translating operator rel#" + joinRel.getId() + ":" + joinRel.getRelTypeName() + " with row type: [" + joinRel.getRowType() + "]");
    }
    // 4. Extract join key expressions from HiveSortExchange
    ExprNodeDesc[][] joinExpressions = new ExprNodeDesc[inputs.length][];
    for (int i = 0; i < inputs.length; i++) {
        joinExpressions[i] = ((HiveSortExchange) joinRel.getInput(i)).getJoinExpressions();
    }
    // 5. Extract rest of join predicate info. We infer the rest of join condition
    //    that will be added to the filters (join conditions that are not part of
    //    the join key)
    List<RexNode> joinFilters;
    if (joinRel instanceof HiveJoin) {
        joinFilters = ImmutableList.of(((HiveJoin) joinRel).getJoinFilter());
    } else if (joinRel instanceof HiveMultiJoin) {
        joinFilters = ((HiveMultiJoin) joinRel).getJoinFilters();
    } else if (joinRel instanceof HiveSemiJoin) {
        joinFilters = ImmutableList.of(((HiveSemiJoin) joinRel).getJoinFilter());
    } else {
        throw new SemanticException("Can't handle join type: " + joinRel.getClass().getName());
    }
    List<List<ExprNodeDesc>> filterExpressions = Lists.newArrayList();
    for (int i = 0; i < joinFilters.size(); i++) {
        List<ExprNodeDesc> filterExpressionsForInput = new ArrayList<ExprNodeDesc>();
        if (joinFilters.get(i) != null) {
            for (RexNode conj : RelOptUtil.conjunctions(joinFilters.get(i))) {
                ExprNodeDesc expr = convertToExprNode(conj, joinRel, null, newVcolsInCalcite);
                filterExpressionsForInput.add(expr);
            }
        }
        filterExpressions.add(filterExpressionsForInput);
    }
    // 6. Generate Join operator
    JoinOperator joinOp = genJoin(joinRel, joinExpressions, filterExpressions, children, baseSrc, tabAlias);
    // 7. Return result
    return new OpAttr(tabAlias, newVcolsInCalcite, joinOp);
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) HiveMultiJoin(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveMultiJoin) ArrayList(java.util.ArrayList) HiveJoin(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin) SemiJoin(org.apache.calcite.rel.core.SemiJoin) HiveSemiJoin(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSemiJoin) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) HashSet(java.util.HashSet) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) RexNode(org.apache.calcite.rex.RexNode)
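
Step 3 above shifts each input's virtual-column indices by the accumulated field count of the inputs to its left, so they address positions in the concatenated join schema. A toy illustration of that shifting, assuming virtual columns are tracked as plain integer positions (the sketch is a simplified stand-in for HiveCalciteUtil.shiftVColsSet, not the utility itself):

import java.util.Set;
import java.util.stream.Collectors;

public class VColShiftSketch {

    // Offset every virtual-column index by the combined width of the
    // join inputs to its left.
    static Set<Integer> shiftVCols(Set<Integer> vcols, int shift) {
        return vcols.stream().map(c -> c + shift).collect(Collectors.toSet());
    }

    public static void main(String[] args) {
        // The left input has 3 columns, so the right input's virtual columns
        // {0, 2} land at {3, 5} in the joined row type.
        System.out.println(shiftVCols(Set.of(0, 2), 3));
    }
}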

Example 3 with SemiJoin

use of org.apache.calcite.rel.core.SemiJoin in project hive by apache.

the class ASTConverter method convertSource.

private QueryBlockInfo convertSource(RelNode r) throws CalciteSemanticException {
    Schema s = null;
    ASTNode ast = null;
    if (r instanceof TableScan) {
        TableScan f = (TableScan) r;
        s = new Schema(f);
        ast = ASTBuilder.table(f);
    } else if (r instanceof DruidQuery) {
        DruidQuery f = (DruidQuery) r;
        s = new Schema(f);
        ast = ASTBuilder.table(f);
    } else if (r instanceof Join) {
        Join join = (Join) r;
        QueryBlockInfo left = convertSource(join.getLeft());
        QueryBlockInfo right = convertSource(join.getRight());
        s = new Schema(left.schema, right.schema);
        ASTNode cond = join.getCondition().accept(new RexVisitor(s));
        boolean semiJoin = join instanceof SemiJoin;
        if (join.getRight() instanceof Join && !semiJoin) {
            // Invert the join inputs; otherwise the SemanticAnalyzer methods
            // that merge joins will not kick in. This must not be done for a
            // semijoin, since it would change the semantics.
            JoinRelType type;
            if (join.getJoinType() == JoinRelType.LEFT) {
                type = JoinRelType.RIGHT;
            } else if (join.getJoinType() == JoinRelType.RIGHT) {
                type = JoinRelType.LEFT;
            } else {
                type = join.getJoinType();
            }
            ast = ASTBuilder.join(right.ast, left.ast, type, cond, semiJoin);
        } else {
            ast = ASTBuilder.join(left.ast, right.ast, join.getJoinType(), cond, semiJoin);
        }
        if (semiJoin) {
            s = left.schema;
        }
    } else if (r instanceof Union) {
        Union u = (Union) r;
        ASTNode left = new ASTConverter(u.getInput(0), this.derivedTableCount).convert();
        for (int ind = 1; ind < u.getInputs().size(); ind++) {
            left = getUnionAllAST(left, new ASTConverter(u.getInput(ind), this.derivedTableCount).convert());
            String sqAlias = nextAlias();
            ast = ASTBuilder.subQuery(left, sqAlias);
            s = new Schema(u, sqAlias);
        }
    } else {
        ASTConverter src = new ASTConverter(r, this.derivedTableCount);
        ASTNode srcAST = src.convert();
        String sqAlias = nextAlias();
        s = src.getRowSchema(sqAlias);
        ast = ASTBuilder.subQuery(srcAST, sqAlias);
    }
    return new QueryBlockInfo(s, ast);
}
Also used : HiveTableScan(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan) TableScan(org.apache.calcite.rel.core.TableScan) DruidQuery(org.apache.calcite.adapter.druid.DruidQuery) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) SemiJoin(org.apache.calcite.rel.core.SemiJoin) Join(org.apache.calcite.rel.core.Join) Union(org.apache.calcite.rel.core.Union) JoinRelType(org.apache.calcite.rel.core.JoinRelType) ASTNode(org.apache.hadoop.hive.ql.parse.ASTNode)
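
The inversion branch above swaps the join inputs, so it must also swap LEFT and RIGHT outer-join types to preserve semantics (INNER and FULL are symmetric). A self-contained sketch of just that type flip (the enum is a local stand-in for Calcite's JoinRelType):

public class JoinInversionSketch {

    enum JoinRelType { INNER, LEFT, RIGHT, FULL }

    // Swapping the inputs of "A left join B" yields "B right join A";
    // symmetric join types are unchanged.
    static JoinRelType invert(JoinRelType type) {
        switch (type) {
            case LEFT:  return JoinRelType.RIGHT;
            case RIGHT: return JoinRelType.LEFT;
            default:    return type;
        }
    }

    public static void main(String[] args) {
        System.out.println(invert(JoinRelType.LEFT));  // RIGHT
        System.out.println(invert(JoinRelType.INNER)); // INNER
    }
}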

Example 4 with SemiJoin

use of org.apache.calcite.rel.core.SemiJoin in project hive by apache.

the class HiveOpConverter method genJoin.

private static JoinOperator genJoin(RelNode join, ExprNodeDesc[][] joinExpressions, List<List<ExprNodeDesc>> filterExpressions, List<Operator<?>> children, String[] baseSrc, String tabAlias) throws SemanticException {
    // 1. Extract join type
    JoinCondDesc[] joinCondns;
    boolean semiJoin;
    boolean noOuterJoin;
    if (join instanceof HiveMultiJoin) {
        HiveMultiJoin hmj = (HiveMultiJoin) join;
        joinCondns = new JoinCondDesc[hmj.getJoinInputs().size()];
        for (int i = 0; i < hmj.getJoinInputs().size(); i++) {
            joinCondns[i] = new JoinCondDesc(new JoinCond(hmj.getJoinInputs().get(i).left, hmj.getJoinInputs().get(i).right, transformJoinType(hmj.getJoinTypes().get(i))));
        }
        semiJoin = false;
        noOuterJoin = !hmj.isOuterJoin();
    } else {
        joinCondns = new JoinCondDesc[1];
        semiJoin = join instanceof SemiJoin;
        JoinType joinType;
        if (semiJoin) {
            joinType = JoinType.LEFTSEMI;
        } else {
            joinType = extractJoinType((Join) join);
        }
        joinCondns[0] = new JoinCondDesc(new JoinCond(0, 1, joinType));
        noOuterJoin = joinType != JoinType.FULLOUTER && joinType != JoinType.LEFTOUTER && joinType != JoinType.RIGHTOUTER;
    }
    // 2. We create the join aux structures
    ArrayList<ColumnInfo> outputColumns = new ArrayList<ColumnInfo>();
    ArrayList<String> outputColumnNames = new ArrayList<String>(join.getRowType().getFieldNames());
    Operator<?>[] childOps = new Operator[children.size()];
    Map<String, Byte> reversedExprs = new HashMap<String, Byte>();
    Map<Byte, List<ExprNodeDesc>> exprMap = new HashMap<Byte, List<ExprNodeDesc>>();
    Map<Byte, List<ExprNodeDesc>> filters = new HashMap<Byte, List<ExprNodeDesc>>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    HashMap<Integer, Set<String>> posToAliasMap = new HashMap<Integer, Set<String>>();
    int outputPos = 0;
    for (int pos = 0; pos < children.size(); pos++) {
        // 2.1. Backtracking from RS
        ReduceSinkOperator inputRS = (ReduceSinkOperator) children.get(pos);
        if (inputRS.getNumParent() != 1) {
            throw new SemanticException("RS should have single parent");
        }
        Operator<?> parent = inputRS.getParentOperators().get(0);
        ReduceSinkDesc rsDesc = inputRS.getConf();
        int[] index = inputRS.getValueIndex();
        Byte tag = (byte) rsDesc.getTag();
        // 2.1.1. If semijoin...
        if (semiJoin && pos != 0) {
            exprMap.put(tag, new ArrayList<ExprNodeDesc>());
            childOps[pos] = inputRS;
            continue;
        }
        posToAliasMap.put(pos, new HashSet<String>(inputRS.getSchema().getTableNames()));
        List<String> keyColNames = rsDesc.getOutputKeyColumnNames();
        List<String> valColNames = rsDesc.getOutputValueColumnNames();
        Map<String, ExprNodeDesc> descriptors = buildBacktrackFromReduceSinkForJoin(outputPos, outputColumnNames, keyColNames, valColNames, index, parent, baseSrc[pos]);
        List<ColumnInfo> parentColumns = parent.getSchema().getSignature();
        for (int i = 0; i < index.length; i++) {
            ColumnInfo info = new ColumnInfo(parentColumns.get(i));
            info.setInternalName(outputColumnNames.get(outputPos));
            info.setTabAlias(tabAlias);
            outputColumns.add(info);
            reversedExprs.put(outputColumnNames.get(outputPos), tag);
            outputPos++;
        }
        exprMap.put(tag, new ArrayList<ExprNodeDesc>(descriptors.values()));
        colExprMap.putAll(descriptors);
        childOps[pos] = inputRS;
    }
    // 3. We populate the filters and filterMap structure needed in the join descriptor
    List<List<ExprNodeDesc>> filtersPerInput = Lists.newArrayList();
    int[][] filterMap = new int[children.size()][];
    for (int i = 0; i < children.size(); i++) {
        filtersPerInput.add(new ArrayList<ExprNodeDesc>());
    }
    // Assign each filter expression to the input it refers to
    for (int i = 0; i < filterExpressions.size(); i++) {
        int leftPos = joinCondns[i].getLeft();
        int rightPos = joinCondns[i].getRight();
        for (ExprNodeDesc expr : filterExpressions.get(i)) {
            // We need to update the exprNode, as currently
            // they refer to columns in the output of the join;
            // they should refer to the columns output by the RS
            int inputPos = updateExprNode(expr, reversedExprs, colExprMap);
            if (inputPos == -1) {
                inputPos = leftPos;
            }
            filtersPerInput.get(inputPos).add(expr);
            if (joinCondns[i].getType() == JoinDesc.FULL_OUTER_JOIN || joinCondns[i].getType() == JoinDesc.LEFT_OUTER_JOIN || joinCondns[i].getType() == JoinDesc.RIGHT_OUTER_JOIN) {
                if (inputPos == leftPos) {
                    updateFilterMap(filterMap, leftPos, rightPos);
                } else {
                    updateFilterMap(filterMap, rightPos, leftPos);
                }
            }
        }
    }
    for (int pos = 0; pos < children.size(); pos++) {
        ReduceSinkOperator inputRS = (ReduceSinkOperator) children.get(pos);
        ReduceSinkDesc rsDesc = inputRS.getConf();
        Byte tag = (byte) rsDesc.getTag();
        filters.put(tag, filtersPerInput.get(pos));
    }
    // 4. We create the join operator with its descriptor
    JoinDesc desc = new JoinDesc(exprMap, outputColumnNames, noOuterJoin, joinCondns, filters, joinExpressions);
    desc.setReversedExprs(reversedExprs);
    desc.setFilterMap(filterMap);
    JoinOperator joinOp = (JoinOperator) OperatorFactory.getAndMakeChild(childOps[0].getCompilationOpContext(), desc, new RowSchema(outputColumns), childOps);
    joinOp.setColumnExprMap(colExprMap);
    joinOp.setPosToAliasMap(posToAliasMap);
    joinOp.getConf().setBaseSrc(baseSrc);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Generated " + joinOp + " with row schema: [" + joinOp.getSchema() + "]");
    }
    return joinOp;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ImmutableBitSet(org.apache.calcite.util.ImmutableBitSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) HashSet(java.util.HashSet) HiveMultiJoin(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveMultiJoin) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) JoinCond(org.apache.hadoop.hive.ql.parse.JoinCond) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) SemiJoin(org.apache.calcite.rel.core.SemiJoin) HiveSemiJoin(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSemiJoin) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) JoinCondDesc(org.apache.hadoop.hive.ql.plan.JoinCondDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) JoinType(org.apache.hadoop.hive.ql.parse.JoinType) HiveJoin(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin) Join(org.apache.calcite.rel.core.Join) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc)
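
In the "2.1.1. If semijoin..." branch above, every input after the first gets an empty value-expression list, because a LEFT SEMI join only emits columns from its first input. A toy sketch of that tag-to-expressions mapping, using plain strings instead of ExprNodeDesc and assuming tag == input position, as set in translateJoin (all names here are illustrative):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SemiJoinExprMapSketch {

    // For a LEFT SEMI join, only input 0 produces output columns; the other
    // inputs exist solely to filter, so their expression lists stay empty.
    static Map<Byte, List<String>> buildExprMap(boolean semiJoin, List<List<String>> inputCols) {
        Map<Byte, List<String>> exprMap = new HashMap<>();
        for (int pos = 0; pos < inputCols.size(); pos++) {
            byte tag = (byte) pos;
            exprMap.put(tag, semiJoin && pos != 0
                ? new ArrayList<>()
                : new ArrayList<>(inputCols.get(pos)));
        }
        return exprMap;
    }

    public static void main(String[] args) {
        System.out.println(buildExprMap(true, List.of(List.of("a", "b"), List.of("c"))));
        // prints {0=[a, b], 1=[]}
    }
}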

Example 5 with SemiJoin

use of org.apache.calcite.rel.core.SemiJoin in project hive by apache.

the class HiveRelMdRowCount method analyzeJoinForPKFK.

/*
   * For T1 join T2 on T1.x = T2.y, if we identify 'y' as a key of T2 then we
   * can infer the join cardinality as rowCount(T1) * selectivity(T2), i.e.
   * this is like a SemiJoin where T1 (the Fact/FK side) is filtered by a
   * factor based on the selectivity of the PK/Dim table side.
   *
   * 1. If both T1.x and T2.y are keys, use the larger one as the PK side.
   * 2. In case of outer joins: a) The FK side should be the null-preserving
   * side; it doesn't make sense to apply this heuristic for Dim loj Fact or
   * Fact roj Dim. b) The selectivity factor applied on the Fact table should
   * be 1.
   */
public static PKFKRelationInfo analyzeJoinForPKFK(Join joinRel, RelMetadataQuery mq) {
    RelNode left = joinRel.getInputs().get(0);
    RelNode right = joinRel.getInputs().get(1);
    final List<RexNode> initJoinFilters = RelOptUtil.conjunctions(joinRel.getCondition());
    /*
     * No joining condition.
     */
    if (initJoinFilters.isEmpty()) {
        return null;
    }
    List<RexNode> leftFilters = new ArrayList<RexNode>();
    List<RexNode> rightFilters = new ArrayList<RexNode>();
    List<RexNode> joinFilters = new ArrayList<RexNode>(initJoinFilters);
    // Bail out for semijoins: RelOptUtil.classifyFilters makes assumptions
    // about column counts that do not hold for semijoins.
    if (joinRel instanceof SemiJoin) {
        return null;
    }
    RelOptUtil.classifyFilters(joinRel, joinFilters, joinRel.getJoinType(), false, !joinRel.getJoinType().generatesNullsOnRight(), !joinRel.getJoinType().generatesNullsOnLeft(), joinFilters, leftFilters, rightFilters);
    Pair<Integer, Integer> joinCols = canHandleJoin(joinRel, leftFilters, rightFilters, joinFilters);
    if (joinCols == null) {
        return null;
    }
    int leftColIdx = joinCols.left;
    int rightColIdx = joinCols.right;
    RexBuilder rexBuilder = joinRel.getCluster().getRexBuilder();
    RexNode leftPred = RexUtil.composeConjunction(rexBuilder, leftFilters, true);
    RexNode rightPred = RexUtil.composeConjunction(rexBuilder, rightFilters, true);
    ImmutableBitSet lBitSet = ImmutableBitSet.of(leftColIdx);
    ImmutableBitSet rBitSet = ImmutableBitSet.of(rightColIdx);
    /*
     * If the form is Dim loj Fact, Fact roj Dim, or Dim semij Fact, then
     * return null.
     */
    boolean leftIsKey = (joinRel.getJoinType() == JoinRelType.INNER || joinRel.getJoinType() == JoinRelType.RIGHT) && !(joinRel instanceof SemiJoin) && isKey(lBitSet, left, mq);
    boolean rightIsKey = (joinRel.getJoinType() == JoinRelType.INNER || joinRel.getJoinType() == JoinRelType.LEFT) && isKey(rBitSet, right, mq);
    if (!leftIsKey && !rightIsKey) {
        return null;
    }
    double leftRowCount = mq.getRowCount(left);
    double rightRowCount = mq.getRowCount(right);
    if (leftIsKey && rightIsKey) {
        if (rightRowCount < leftRowCount) {
            leftIsKey = false;
        }
    }
    int pkSide = leftIsKey ? 0 : rightIsKey ? 1 : -1;
    boolean isPKSideSimpleTree = pkSide != -1 ? IsSimpleTreeOnJoinKey.check(pkSide == 0 ? left : right, pkSide == 0 ? leftColIdx : rightColIdx, mq) : false;
    double leftNDV = isPKSideSimpleTree ? mq.getDistinctRowCount(left, lBitSet, leftPred) : -1;
    double rightNDV = isPKSideSimpleTree ? mq.getDistinctRowCount(right, rBitSet, rightPred) : -1;
    /*
     * If the NDVs of the PK and FK sides don't match, and the PK side has a
     * filter on the key column, then scale the NDV on the FK side.
     *
     * As described by Peter Boncz (http://databasearchitects.blogspot.com/),
     * in such cases we can be off by a large margin in the join cardinality
     * estimate. The example he provides is the join of StoreSales and DateDim
     * in the TPC-DS dataset. Since DateDim is populated for 20 years into
     * the future, while StoreSales only has 5 years' worth of data, there
     * are 40 times fewer distinct dates in StoreSales.
     *
     * In general it is hard to infer the range of the foreign key for an
     * arbitrary expression. For example, the NDV of DayOfWeek is the same
     * irrespective of the number of unique days, whereas the NDV of Quarter
     * has the same ratio as the NDV of the keys.
     *
     * But for expressions that apply only to columns with the same NDV as
     * the key (implying that they are alternate keys) we can apply the
     * ratio. So for StoreSales-DateDim joins with a predicate on the d_date
     * column, we can apply the scaling factor.
     */
    double ndvScalingFactor = 1.0;
    if (isPKSideSimpleTree) {
        ndvScalingFactor = pkSide == 0 ? leftNDV / rightNDV : rightNDV / leftNDV;
    }
    if (pkSide == 0) {
        FKSideInfo fkInfo = new FKSideInfo(rightRowCount, rightNDV);
        double pkSelectivity = pkSelectivity(joinRel, mq, true, left, leftRowCount);
        PKSideInfo pkInfo = new PKSideInfo(leftRowCount, leftNDV, joinRel.getJoinType().generatesNullsOnRight() ? 1.0 : pkSelectivity);
        return new PKFKRelationInfo(1, fkInfo, pkInfo, ndvScalingFactor, isPKSideSimpleTree);
    }
    if (pkSide == 1) {
        FKSideInfo fkInfo = new FKSideInfo(leftRowCount, leftNDV);
        double pkSelectivity = pkSelectivity(joinRel, mq, false, right, rightRowCount);
        PKSideInfo pkInfo = new PKSideInfo(rightRowCount, rightNDV, joinRel.getJoinType().generatesNullsOnLeft() ? 1.0 : pkSelectivity);
        return new PKFKRelationInfo(1, fkInfo, pkInfo, ndvScalingFactor, isPKSideSimpleTree);
    }
    return null;
}
Also used : ImmutableBitSet(org.apache.calcite.util.ImmutableBitSet) ArrayList(java.util.ArrayList) RelNode(org.apache.calcite.rel.RelNode) RexBuilder(org.apache.calcite.rex.RexBuilder) SemiJoin(org.apache.calcite.rel.core.SemiJoin) RexNode(org.apache.calcite.rex.RexNode)
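
The heuristic documented at the top of this method reduces a PK-FK join to rowCount(FK side) * selectivity(PK side). A worked arithmetic sketch of that estimate with hypothetical figures (all numbers below are made up for illustration):

public class PkFkCardinalitySketch {

    public static void main(String[] args) {
        // Hypothetical fact-dimension join: T1 is the fact/FK side, T2 the
        // dimension/PK side, with a filter on T2 keeping 10% of its rows.
        double factRowCount = 1_000_000;
        double pkSelectivity = 0.10;

        // Once T2.y is known to be a key, the join acts like a semijoin that
        // filters the fact side: cardinality = rowCount(T1) * selectivity(T2).
        double estimate = factRowCount * pkSelectivity;
        System.out.println("estimated join rows = " + estimate); // 100000.0
    }
}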

Aggregations

SemiJoin (org.apache.calcite.rel.core.SemiJoin)5 ArrayList (java.util.ArrayList)3 RexNode (org.apache.calcite.rex.RexNode)3 HiveJoin (org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin)3 ImmutableList (com.google.common.collect.ImmutableList)2 HashSet (java.util.HashSet)2 List (java.util.List)2 Join (org.apache.calcite.rel.core.Join)2 ImmutableBitSet (org.apache.calcite.util.ImmutableBitSet)2 FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)2 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)2 LimitOperator (org.apache.hadoop.hive.ql.exec.LimitOperator)2 Operator (org.apache.hadoop.hive.ql.exec.Operator)2 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)2 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)2 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)2 HiveMultiJoin (org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveMultiJoin)2 HiveSemiJoin (org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSemiJoin)2 SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException)2 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)2