
Example 11 with HiveTableScan

use of org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan in project hive by apache.

the class HiveRelMdUniqueKeys method getUniqueKeys.

/*
 * Infer uniqueness if:
 *   - rowCount(col) = ndv(col)
 *   - TBD for numerics: max(col) - min(col) = rowCount(col)
 *
 * Why are we intercepting Project and not TableScan? Because a handler for
 * TableScan would not know which columns to check, and inferring uniqueness
 * for all columns is very expensive right now. The flip side is that this
 * only works post field trimming.
 */
public Set<ImmutableBitSet> getUniqueKeys(Project rel, RelMetadataQuery mq, boolean ignoreNulls) {
    HiveTableScan tScan = getTableScan(rel.getInput(), false);
    if (tScan == null) {
        // If a HiveTableScan is not found (e.g., the input is not a sequence of
        // Project and Filter operators), fall back to the original getUniqueKeys logic.
        // LogicalProject maps a set of rows to a different set;
        // without knowledge of the mapping function (whether it
        // preserves uniqueness), it is only safe to derive uniqueness
        // info from the child of a project when the mapping is f(a) => a.
        //
        // Furthermore, the unique bitset coming from the child needs
        // to be mapped to match the output of the project.
        final Map<Integer, Integer> mapInToOutPos = new HashMap<>();
        final List<RexNode> projExprs = rel.getProjects();
        final Set<ImmutableBitSet> projUniqueKeySet = new HashSet<>();
        // Build an input to output position map.
        for (int i = 0; i < projExprs.size(); i++) {
            RexNode projExpr = projExprs.get(i);
            if (projExpr instanceof RexInputRef) {
                mapInToOutPos.put(((RexInputRef) projExpr).getIndex(), i);
            }
        }
        if (mapInToOutPos.isEmpty()) {
            // return empty set.
            return projUniqueKeySet;
        }
        Set<ImmutableBitSet> childUniqueKeySet = mq.getUniqueKeys(rel.getInput(), ignoreNulls);
        if (childUniqueKeySet != null) {
            // Remap each child key to output positions; keep it only if every
            // column of the key is projected.
            for (ImmutableBitSet colMask : childUniqueKeySet) {
                ImmutableBitSet.Builder tmpMask = ImmutableBitSet.builder();
                boolean completeKeyProjected = true;
                for (int bit : colMask) {
                    if (mapInToOutPos.containsKey(bit)) {
                        tmpMask.set(mapInToOutPos.get(bit));
                    } else {
                        // Skip the child unique key if part of it is not
                        // projected.
                        completeKeyProjected = false;
                        break;
                    }
                }
                if (completeKeyProjected) {
                    projUniqueKeySet.add(tmpMask.build());
                }
            }
        }
        return projUniqueKeySet;
    }
    Map<Integer, Integer> posMap = new HashMap<Integer, Integer>();
    int projectPos = 0;
    int colStatsPos = 0;
    BitSet projectedCols = new BitSet();
    for (RexNode r : rel.getProjects()) {
        if (r instanceof RexInputRef) {
            projectedCols.set(((RexInputRef) r).getIndex());
            posMap.put(colStatsPos, projectPos);
            colStatsPos++;
        }
        projectPos++;
    }
    double numRows = mq.getRowCount(tScan);
    List<ColStatistics> colStats = tScan.getColStat(BitSets.toList(projectedCols));
    Set<ImmutableBitSet> keys = new HashSet<ImmutableBitSet>();
    colStatsPos = 0;
    for (ColStatistics cStat : colStats) {
        boolean isKey = false;
        if (cStat.getCountDistint() >= numRows) {
            isKey = true;
        }
        if (!isKey && cStat.getRange() != null && cStat.getRange().maxValue != null && cStat.getRange().minValue != null) {
            double r = cStat.getRange().maxValue.doubleValue() - cStat.getRange().minValue.doubleValue() + 1;
            isKey = (Math.abs(numRows - r) < RelOptUtil.EPSILON);
        }
        if (isKey) {
            ImmutableBitSet key = ImmutableBitSet.of(posMap.get(colStatsPos));
            keys.add(key);
        }
        colStatsPos++;
    }
    return keys;
}
Also used : ImmutableBitSet(org.apache.calcite.util.ImmutableBitSet) HashMap(java.util.HashMap) BitSet(java.util.BitSet) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) RexInputRef(org.apache.calcite.rex.RexInputRef) HiveTableScan(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan) RexNode(org.apache.calcite.rex.RexNode) HashSet(java.util.HashSet)
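
The two statistics checks in the key-inference loop above reduce to simple arithmetic on per-column statistics. A minimal, self-contained sketch of just that arithmetic (plain Java with invented row-count, NDV, and range values; the real code reads these from ColStatistics and compares against RelOptUtil.EPSILON) might look like this:

// Hypothetical, simplified re-statement of the uniqueness checks above.
// The numbers are invented; real callers would read them from ColStatistics.
public class UniquenessCheckSketch {

    private static final double EPSILON = 1.0e-5; // stand-in for RelOptUtil.EPSILON

    // Check 1: the column is a key if its distinct-value count covers every row.
    static boolean isKeyByNdv(double rowCount, double ndv) {
        return ndv >= rowCount;
    }

    // Check 2 (numerics): the column is a key if its value range is exactly as
    // wide as the row count, i.e. max(col) - min(col) + 1 == rowCount.
    static boolean isKeyByRange(double rowCount, double min, double max) {
        double range = max - min + 1;
        return Math.abs(rowCount - range) < EPSILON;
    }

    public static void main(String[] args) {
        double rowCount = 1000d;
        System.out.println(isKeyByNdv(rowCount, 1000d));        // true
        System.out.println(isKeyByNdv(rowCount, 800d));         // false
        System.out.println(isKeyByRange(rowCount, 1d, 1000d));  // true: 1000 - 1 + 1 == 1000
        System.out.println(isKeyByRange(rowCount, 1d, 1500d));  // false: gaps in the range
    }
}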

Example 12 with HiveTableScan

use of org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan in project hive by apache.

the class HiveMaterializedViewsRegistry method createMaterializedViewScan.

private static RelNode createMaterializedViewScan(HiveConf conf, Table viewTable) {
    // 0. Recreate cluster
    final RelOptPlanner planner = CalcitePlanner.createPlanner(conf);
    final RexBuilder rexBuilder = new RexBuilder(new JavaTypeFactoryImpl(new HiveTypeSystemImpl()));
    final RelOptCluster cluster = RelOptCluster.create(planner, rexBuilder);
    // 1. Create column schema
    final RowResolver rr = new RowResolver();
    // 1.1 Add column info for non-partition cols (ObjectInspector fields)
    StructObjectInspector rowObjectInspector;
    try {
        rowObjectInspector = (StructObjectInspector) viewTable.getDeserializer().getObjectInspector();
    } catch (SerDeException e) {
        // Bail out
        return null;
    }
    List<? extends StructField> fields = rowObjectInspector.getAllStructFieldRefs();
    ColumnInfo colInfo;
    String colName;
    ArrayList<ColumnInfo> cInfoLst = new ArrayList<>();
    for (StructField structField : fields) {
        colName = structField.getFieldName();
        colInfo = new ColumnInfo(structField.getFieldName(), TypeInfoUtils.getTypeInfoFromObjectInspector(structField.getFieldObjectInspector()), null, false);
        rr.put(null, colName, colInfo);
        cInfoLst.add(colInfo);
    }
    ArrayList<ColumnInfo> nonPartitionColumns = new ArrayList<ColumnInfo>(cInfoLst);
    // 1.2 Add column info corresponding to partition columns
    ArrayList<ColumnInfo> partitionColumns = new ArrayList<ColumnInfo>();
    for (FieldSchema part_col : viewTable.getPartCols()) {
        colName = part_col.getName();
        colInfo = new ColumnInfo(colName, TypeInfoFactory.getPrimitiveTypeInfo(part_col.getType()), null, true);
        rr.put(null, colName, colInfo);
        cInfoLst.add(colInfo);
        partitionColumns.add(colInfo);
    }
    // 1.3 Build row type from field <type, name>
    RelDataType rowType;
    try {
        rowType = TypeConverter.getType(cluster, rr, null);
    } catch (CalciteSemanticException e) {
        // Bail out
        return null;
    }
    // 2. Build RelOptAbstractTable
    List<String> fullyQualifiedTabName = new ArrayList<>();
    if (viewTable.getDbName() != null && !viewTable.getDbName().isEmpty()) {
        fullyQualifiedTabName.add(viewTable.getDbName());
    }
    fullyQualifiedTabName.add(viewTable.getTableName());
    RelNode tableRel;
    // 3. Build operator
    if (obtainTableType(viewTable) == TableType.DRUID) {
        // Build Druid query
        String address = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
        String dataSource = viewTable.getParameters().get(Constants.DRUID_DATA_SOURCE);
        Set<String> metrics = new HashSet<>();
        List<RelDataType> druidColTypes = new ArrayList<>();
        List<String> druidColNames = new ArrayList<>();
        // @NOTE this code is very similar to the code at org/apache/hadoop/hive/ql/parse/CalcitePlanner.java:2362
        // @TODO it would be nice to refactor it
        RelDataTypeFactory dtFactory = cluster.getRexBuilder().getTypeFactory();
        for (RelDataTypeField field : rowType.getFieldList()) {
            if (DruidTable.DEFAULT_TIMESTAMP_COLUMN.equals(field.getName())) {
                // Druid's time column is always not null.
                druidColTypes.add(dtFactory.createTypeWithNullability(field.getType(), false));
            } else {
                druidColTypes.add(field.getType());
            }
            druidColNames.add(field.getName());
            if (field.getName().equals(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
                // timestamp
                continue;
            }
            if (field.getType().getSqlTypeName() == SqlTypeName.VARCHAR) {
                // dimension
                continue;
            }
            metrics.add(field.getName());
        }
        List<Interval> intervals = Collections.singletonList(DruidTable.DEFAULT_INTERVAL);
        rowType = dtFactory.createStructType(druidColTypes, druidColNames);
        // We can pass null for the Hive object because it is only used to retrieve tables
        // when a table object has constraints, and constraints cannot be defined
        // for materialized views.
        RelOptHiveTable optTable = new RelOptHiveTable(null, cluster.getTypeFactory(), fullyQualifiedTabName, rowType, viewTable, nonPartitionColumns, partitionColumns, new ArrayList<>(), conf, null, new QueryTables(true), new HashMap<>(), new HashMap<>(), new AtomicInteger());
        DruidTable druidTable = new DruidTable(new DruidSchema(address, address, false), dataSource, RelDataTypeImpl.proto(rowType), metrics, DruidTable.DEFAULT_TIMESTAMP_COLUMN, intervals, null, null);
        final TableScan scan = new HiveTableScan(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), optTable, viewTable.getTableName(), null, false, false);
        tableRel = DruidQuery.create(cluster, cluster.traitSetOf(BindableConvention.INSTANCE), optTable, druidTable, ImmutableList.<RelNode>of(scan), ImmutableMap.of());
    } else {
        // Build Hive Table Scan Rel.
        // We can pass null for the Hive object because it is only used to retrieve tables
        // when a table object has constraints, and constraints cannot be defined
        // for materialized views.
        RelOptHiveTable optTable = new RelOptHiveTable(null, cluster.getTypeFactory(), fullyQualifiedTabName, rowType, viewTable, nonPartitionColumns, partitionColumns, new ArrayList<>(), conf, null, new QueryTables(true), new HashMap<>(), new HashMap<>(), new AtomicInteger());
        tableRel = new HiveTableScan(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), optTable, viewTable.getTableName(), null, false, false);
    }
    return tableRel;
}
Also used : RelOptCluster(org.apache.calcite.plan.RelOptCluster) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) DruidTable(org.apache.calcite.adapter.druid.DruidTable) RelDataType(org.apache.calcite.rel.type.RelDataType) RowResolver(org.apache.hadoop.hive.ql.parse.RowResolver) QueryTables(org.apache.hadoop.hive.ql.parse.QueryTables) RelOptPlanner(org.apache.calcite.plan.RelOptPlanner) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) JavaTypeFactoryImpl(org.apache.calcite.jdbc.JavaTypeFactoryImpl) RelDataTypeFactory(org.apache.calcite.rel.type.RelDataTypeFactory) RexBuilder(org.apache.calcite.rex.RexBuilder) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) HashSet(java.util.HashSet) HiveTableScan(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan) TableScan(org.apache.calcite.rel.core.TableScan) DruidSchema(org.apache.calcite.adapter.druid.DruidSchema) RelDataTypeField(org.apache.calcite.rel.type.RelDataTypeField) RelOptHiveTable(org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable) HiveRelNode(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveRelNode) RelNode(org.apache.calcite.rel.RelNode) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) HiveTypeSystemImpl(org.apache.hadoop.hive.ql.optimizer.calcite.HiveTypeSystemImpl) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Interval(org.joda.time.Interval)
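
In the Druid branch above, every field of the recomputed row type is sorted into one of three buckets: the Druid timestamp column, VARCHAR dimensions, and metrics. A hedged, standalone sketch of just that classification (invented field names, a plain isVarchar flag instead of SqlTypeName checks, and a Java 16+ record for brevity; the real code walks RelDataTypeField objects) could look like this:

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Simplified, hypothetical model of the column classification in the Druid branch above.
public class DruidColumnClassifierSketch {

    static final String TIMESTAMP_COLUMN = "__time"; // value of DruidTable.DEFAULT_TIMESTAMP_COLUMN

    record Field(String name, boolean isVarchar) {}

    public static void main(String[] args) {
        List<Field> fields = List.of(
                new Field("__time", false),
                new Field("page", true),     // VARCHAR -> dimension
                new Field("added", false));  // numeric -> metric

        Set<String> metrics = new HashSet<>();
        List<String> dimensions = new ArrayList<>();
        for (Field f : fields) {
            if (TIMESTAMP_COLUMN.equals(f.name())) {
                continue;                    // timestamp column, handled separately
            }
            if (f.isVarchar()) {
                dimensions.add(f.name());    // VARCHAR columns become dimensions
                continue;
            }
            metrics.add(f.name());           // everything else is treated as a metric
        }
        System.out.println("dimensions=" + dimensions + ", metrics=" + metrics);
    }
}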

Example 13 with HiveTableScan

use of org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan in project hive by apache.

the class HiveMaterializedViewUtils method copyNodeScanNewCluster.

private static RelNode copyNodeScanNewCluster(RelOptCluster optCluster, RelNode scan) {
    final RelNode newScan;
    if (scan instanceof DruidQuery) {
        final DruidQuery dq = (DruidQuery) scan;
        // Ideally we would use the HiveRelNode convention. However, the Volcano planner
        // throws in that case because DruidQuery does not implement the interface, so we
        // set it to Bindable. Currently we do not use conventions in Hive, hence this
        // should be fine.
        // TODO: If we want to make use of conventions (e.g., while directly generating the
        // operator tree instead of an AST), this should be changed.
        newScan = DruidQuery.create(optCluster, optCluster.traitSetOf(BindableConvention.INSTANCE), scan.getTable(), dq.getDruidTable(), ImmutableList.of(dq.getTableScan()), DruidSqlOperatorConverter.getDefaultMap());
    } else {
        newScan = new HiveTableScan(optCluster, optCluster.traitSetOf(HiveRelNode.CONVENTION), (RelOptHiveTable) scan.getTable(), ((RelOptHiveTable) scan.getTable()).getName(), null, false, false);
    }
    return newScan;
}
Also used : DruidQuery(org.apache.calcite.adapter.druid.DruidQuery) RelOptHiveTable(org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable) HiveRelNode(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveRelNode) RelNode(org.apache.calcite.rel.RelNode) HiveTableScan(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan)

Example 14 with HiveTableScan

use of org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan in project hive by apache.

the class FilterSelectivityEstimator method visitCall.

@Override
public Double visitCall(RexCall call) {
    if (!deep) {
        return 1.0;
    }
    /*
     * Ignore any predicates on partition columns because we have already
     * accounted for these in the Table row count.
     */
    if (isPartitionPredicate(call, this.childRel)) {
        return 1.0;
    }
    Double selectivity = null;
    SqlKind op = getOp(call);
    switch(op) {
        case AND:
            {
                selectivity = computeConjunctionSelectivity(call);
                break;
            }
        case OR:
            {
                selectivity = computeDisjunctionSelectivity(call);
                break;
            }
        case NOT:
        case NOT_EQUALS:
            {
                selectivity = computeNotEqualitySelectivity(call);
                break;
            }
        case IS_NOT_NULL:
            {
                if (childRel instanceof HiveTableScan) {
                    double noOfNulls = getMaxNulls(call, (HiveTableScan) childRel);
                    double totalNoOfTuples = mq.getRowCount(childRel);
                    if (totalNoOfTuples >= noOfNulls) {
                        selectivity = (totalNoOfTuples - noOfNulls) / Math.max(totalNoOfTuples, 1);
                    } else {
                        // If we are running explain, we will print the warning in the console
                        // and the log files. Otherwise, we just print it in the log files.
                        HiveConfPlannerContext ctx = childRel.getCluster().getPlanner().getContext().unwrap(HiveConfPlannerContext.class);
                        String msg = "Invalid statistics: Number of null values > number of tuples. " + "Consider recomputing statistics for table: " + ((RelOptHiveTable) childRel.getTable()).getHiveTableMD().getFullyQualifiedName();
                        if (ctx.isExplainPlan()) {
                            SessionState.getConsole().printError("WARNING: " + msg);
                        }
                        LOG.warn(msg);
                        selectivity = ((double) 1 / (double) 3);
                    }
                } else {
                    selectivity = computeNotEqualitySelectivity(call);
                }
                break;
            }
        case LESS_THAN_OR_EQUAL:
        case GREATER_THAN_OR_EQUAL:
        case LESS_THAN:
        case GREATER_THAN:
            {
                selectivity = ((double) 1 / (double) 3);
                break;
            }
        case IN:
            {
                // TODO: 1) check for duplicates; 2) we assume the IN-clause values are
                // present in the NDV, which may not be correct (a range check could detect
                // this); 3) we assume the values in the NDV set are uniformly distributed
                // over the column values (account for skewness via a histogram).
                selectivity = computeFunctionSelectivity(call);
                if (selectivity != null) {
                    selectivity = selectivity * (call.operands.size() - 1);
                    if (selectivity <= 0.0) {
                        selectivity = 0.10;
                    } else if (selectivity >= 1.0) {
                        selectivity = 1.0;
                    }
                }
                break;
            }
        default:
            selectivity = computeFunctionSelectivity(call);
    }
    return selectivity;
}
Also used : HiveConfPlannerContext(org.apache.hadoop.hive.ql.optimizer.calcite.HiveConfPlannerContext) SqlKind(org.apache.calcite.sql.SqlKind) HiveTableScan(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan)
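
The IS_NOT_NULL and IN branches above come down to small arithmetic rules. A standalone sketch with invented statistics (row count, null count, per-value selectivity) shows the resulting numbers; the real estimator pulls these from column statistics and metadata queries:

// Hypothetical numbers; the real estimator reads these from column statistics.
public class SelectivitySketch {

    // IS_NOT_NULL on a table scan: fraction of rows that survive the filter.
    static double isNotNullSelectivity(double totalRows, double nulls) {
        if (totalRows >= nulls) {
            return (totalRows - nulls) / Math.max(totalRows, 1);
        }
        // Inconsistent stats (more nulls than rows): fall back to 1/3, as above.
        return 1d / 3d;
    }

    // IN list: per-value selectivity scaled by the number of values, clamped to [0.10, 1.0].
    static double inListSelectivity(double perValueSelectivity, int operandCount) {
        double s = perValueSelectivity * (operandCount - 1); // operand 0 is the column itself
        if (s <= 0.0) {
            return 0.10;
        }
        return Math.min(s, 1.0);
    }

    public static void main(String[] args) {
        System.out.println(isNotNullSelectivity(1000d, 50d)); // 0.95
        System.out.println(isNotNullSelectivity(100d, 500d)); // 0.333... (bad-stats fallback)
        System.out.println(inListSelectivity(0.01, 6));       // 0.05: column IN (v1, ..., v5)
    }
}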

Example 15 with HiveTableScan

use of org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan in project hive by apache.

the class EstimateUniqueKeys method getUniqueKeys.

private static Set<ImmutableBitSet> getUniqueKeys(HiveProject rel) {
    HiveTableScan tScan = getTableScan(rel.getInput(), false);
    if (tScan != null) {
        return generateKeysUsingStatsEstimation(rel, tScan);
    }
    // LogicalProject maps a set of rows to a different set;
    // without knowledge of the mapping function (whether it
    // preserves uniqueness), it is only safe to derive uniqueness
    // info from the child of a project when the mapping is f(a) => a.
    //
    // Furthermore, the unique bitset coming from the child needs
    // to be mapped to match the output of the project.
    final Map<Integer, Integer> mapInToOutPos = new HashMap<>();
    final List<RexNode> projExprs = rel.getProjects();
    final Set<ImmutableBitSet> projUniqueKeySet = new HashSet<>();
    // Build an input to output position map.
    for (int i = 0; i < projExprs.size(); i++) {
        RexNode projExpr = projExprs.get(i);
        if (projExpr instanceof RexInputRef) {
            mapInToOutPos.put(((RexInputRef) projExpr).getIndex(), i);
        }
    }
    if (mapInToOutPos.isEmpty()) {
        // return empty set.
        return projUniqueKeySet;
    }
    Set<ImmutableBitSet> childUniqueKeySet = getUniqueKeys(rel.getInput());
    if (childUniqueKeySet != null) {
        // Remap each child key to output positions; keep it only if every
        // column of the key is projected.
        for (ImmutableBitSet colMask : childUniqueKeySet) {
            ImmutableBitSet.Builder tmpMask = ImmutableBitSet.builder();
            boolean completeKeyProjected = true;
            for (int bit : colMask) {
                if (mapInToOutPos.containsKey(bit)) {
                    tmpMask.set(mapInToOutPos.get(bit));
                } else {
                    // Skip the child unique key if part of it is not
                    // projected.
                    completeKeyProjected = false;
                    break;
                }
            }
            if (completeKeyProjected) {
                projUniqueKeySet.add(tmpMask.build());
            }
        }
    }
    return projUniqueKeySet;
}
Also used : ImmutableBitSet(org.apache.calcite.util.ImmutableBitSet) HashMap(java.util.HashMap) RexInputRef(org.apache.calcite.rex.RexInputRef) HiveTableScan(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan) RexNode(org.apache.calcite.rex.RexNode) HashSet(java.util.HashSet)
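
To make the remapping concrete: suppose the child reports a unique key on input columns {0, 2}, and the project emits input 0 at output position 1 and input 2 at output position 0. A minimal sketch of the position remapping using plain java.util collections (instead of Calcite's ImmutableBitSet) would be:

import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;

// Simplified illustration of the key remapping above; the inputs are invented.
public class ProjectKeyRemapSketch {

    public static void main(String[] args) {
        // Input-column index -> output position, as built from the RexInputRef projections.
        Map<Integer, Integer> mapInToOutPos = new HashMap<>();
        mapInToOutPos.put(0, 1); // project emits input column 0 at output position 1
        mapInToOutPos.put(2, 0); // project emits input column 2 at output position 0

        // Child unique key on input columns {0, 2}.
        BitSet childKey = new BitSet();
        childKey.set(0);
        childKey.set(2);

        BitSet projectedKey = new BitSet();
        boolean completeKeyProjected = true;
        for (int bit = childKey.nextSetBit(0); bit >= 0; bit = childKey.nextSetBit(bit + 1)) {
            Integer outPos = mapInToOutPos.get(bit);
            if (outPos == null) {
                completeKeyProjected = false; // part of the key is not projected: drop it
                break;
            }
            projectedKey.set(outPos);
        }
        // Prints {0, 1}: the key survives, expressed in output positions.
        System.out.println(completeKeyProjected ? projectedKey : "key not fully projected");
    }
}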

Aggregations

HiveTableScan (org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan) - 18 usages
RelOptHiveTable (org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable) - 12 usages
RelNode (org.apache.calcite.rel.RelNode) - 7 usages
RexNode (org.apache.calcite.rex.RexNode) - 6 usages
ImmutableBitSet (org.apache.calcite.util.ImmutableBitSet) - 5 usages
ArrayList (java.util.ArrayList) - 4 usages
HashMap (java.util.HashMap) - 4 usages
HashSet (java.util.HashSet) - 4 usages
RelDataTypeField (org.apache.calcite.rel.type.RelDataTypeField) - 4 usages
RexBuilder (org.apache.calcite.rex.RexBuilder) - 4 usages
Table (org.apache.hadoop.hive.ql.metadata.Table) - 4 usages
DruidQuery (org.apache.calcite.adapter.druid.DruidQuery) - 3 usages
RelOptCluster (org.apache.calcite.plan.RelOptCluster) - 3 usages
Project (org.apache.calcite.rel.core.Project) - 3 usages
HiveRelNode (org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveRelNode) - 3 usages
BitSet (java.util.BitSet) - 2 usages
AtomicInteger (java.util.concurrent.atomic.AtomicInteger) - 2 usages
DruidSchema (org.apache.calcite.adapter.druid.DruidSchema) - 2 usages
DruidTable (org.apache.calcite.adapter.druid.DruidTable) - 2 usages
JavaTypeFactoryImpl (org.apache.calcite.jdbc.JavaTypeFactoryImpl) - 2 usages