Use of org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan in project hive by apache.
The class HiveRelMdUniqueKeys, method getUniqueKeys.
/*
 * Infer uniqueness if:
 *   - rowCount(col) = ndv(col), or
 *   - (TBD for numerics) max(col) - min(col) + 1 = rowCount(col)
 *
 * Why are we intercepting Project and not TableScan? Because a method for
 * TableScan would not know which columns to check; inferring uniqueness for
 * all columns is very expensive right now. The flip side of doing this is
 * that it only works post field trimming.
 */
public Set<ImmutableBitSet> getUniqueKeys(Project rel, RelMetadataQuery mq, boolean ignoreNulls) {
  HiveTableScan tScan = getTableScan(rel.getInput(), false);
  if (tScan == null) {
    // If a HiveTableScan is not found (e.g., the input is not a sequence of Project and
    // Filter operators), fall back to the original getUniqueKeys logic.
    // LogicalProject maps a set of rows to a different set;
    // without knowledge of the mapping function (whether it
    // preserves uniqueness), it is only safe to derive uniqueness
    // info from the child of a project when the mapping is f(a) => a.
    //
    // Furthermore, the unique bitset coming from the child needs
    // to be mapped to match the output of the project.
    final Map<Integer, Integer> mapInToOutPos = new HashMap<>();
    final List<RexNode> projExprs = rel.getProjects();
    final Set<ImmutableBitSet> projUniqueKeySet = new HashSet<>();
    // Build an input-to-output position map.
    for (int i = 0; i < projExprs.size(); i++) {
      RexNode projExpr = projExprs.get(i);
      if (projExpr instanceof RexInputRef) {
        mapInToOutPos.put(((RexInputRef) projExpr).getIndex(), i);
      }
    }
    if (mapInToOutPos.isEmpty()) {
      // Return an empty set.
      return projUniqueKeySet;
    }
    Set<ImmutableBitSet> childUniqueKeySet = mq.getUniqueKeys(rel.getInput(), ignoreNulls);
    if (childUniqueKeySet != null) {
      // Add the child keys that are fully projected, remapped to output positions.
      for (ImmutableBitSet colMask : childUniqueKeySet) {
        ImmutableBitSet.Builder tmpMask = ImmutableBitSet.builder();
        boolean completeKeyProjected = true;
        for (int bit : colMask) {
          if (mapInToOutPos.containsKey(bit)) {
            tmpMask.set(mapInToOutPos.get(bit));
          } else {
            // Skip the child unique key if part of it is not projected.
            completeKeyProjected = false;
            break;
          }
        }
        if (completeKeyProjected) {
          projUniqueKeySet.add(tmpMask.build());
        }
      }
    }
    return projUniqueKeySet;
  }
  Map<Integer, Integer> posMap = new HashMap<Integer, Integer>();
  int projectPos = 0;
  int colStatsPos = 0;
  BitSet projectedCols = new BitSet();
  for (RexNode r : rel.getProjects()) {
    if (r instanceof RexInputRef) {
      projectedCols.set(((RexInputRef) r).getIndex());
      posMap.put(colStatsPos, projectPos);
      colStatsPos++;
    }
    projectPos++;
  }
  double numRows = mq.getRowCount(tScan);
  List<ColStatistics> colStats = tScan.getColStat(BitSets.toList(projectedCols));
  Set<ImmutableBitSet> keys = new HashSet<ImmutableBitSet>();
  colStatsPos = 0;
  for (ColStatistics cStat : colStats) {
    boolean isKey = false;
    if (cStat.getCountDistint() >= numRows) {
      isKey = true;
    }
    if (!isKey && cStat.getRange() != null && cStat.getRange().maxValue != null
        && cStat.getRange().minValue != null) {
      double r = cStat.getRange().maxValue.doubleValue() - cStat.getRange().minValue.doubleValue() + 1;
      isKey = (Math.abs(numRows - r) < RelOptUtil.EPSILON);
    }
    if (isKey) {
      ImmutableBitSet key = ImmutableBitSet.of(posMap.get(colStatsPos));
      keys.add(key);
    }
    colStatsPos++;
  }
  return keys;
}
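The statistics-based branch applies two tests per projected column: the column is a candidate key when its distinct-value count reaches the row count, or when its numeric range covers exactly one value per row. A minimal standalone sketch of that check, using plain doubles instead of Hive's ColStatistics (the names ndv, min, max and the EPSILON constant below are illustrative, not Hive API):

// Illustrative restatement of the two uniqueness tests above; not Hive API.
public final class UniquenessCheckSketch {
  private static final double EPSILON = 1.0e-5; // stand-in for RelOptUtil.EPSILON

  /** A column is a candidate key if ndv >= rowCount, or if its numeric
   *  range [min, max] contains exactly rowCount integer values. */
  static boolean isCandidateKey(double rowCount, double ndv, Double min, Double max) {
    if (ndv >= rowCount) {
      return true;
    }
    if (min != null && max != null) {
      double rangeSize = max - min + 1;
      return Math.abs(rowCount - rangeSize) < EPSILON;
    }
    return false;
  }

  public static void main(String[] args) {
    // 100 rows, 100 distinct values -> key
    System.out.println(isCandidateKey(100, 100, null, null));  // true
    // 100 rows, 80 distinct values, but range spans exactly 1..100 -> key by range test
    System.out.println(isCandidateKey(100, 80, 1.0, 100.0));   // true
    // 100 rows, 80 distinct values, range 1..50 -> not a key
    System.out.println(isCandidateKey(100, 80, 1.0, 50.0));    // false
  }
}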
Use of org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan in project hive by apache.
The class HiveMaterializedViewsRegistry, method createMaterializedViewScan.
private static RelNode createMaterializedViewScan(HiveConf conf, Table viewTable) {
  // 0. Recreate cluster
  final RelOptPlanner planner = CalcitePlanner.createPlanner(conf);
  final RexBuilder rexBuilder = new RexBuilder(new JavaTypeFactoryImpl(new HiveTypeSystemImpl()));
  final RelOptCluster cluster = RelOptCluster.create(planner, rexBuilder);
  // 1. Create column schema
  final RowResolver rr = new RowResolver();
  // 1.1 Add column info for non-partition columns (ObjectInspector fields)
  StructObjectInspector rowObjectInspector;
  try {
    rowObjectInspector = (StructObjectInspector) viewTable.getDeserializer().getObjectInspector();
  } catch (SerDeException e) {
    // Bail out
    return null;
  }
  List<? extends StructField> fields = rowObjectInspector.getAllStructFieldRefs();
  ColumnInfo colInfo;
  String colName;
  ArrayList<ColumnInfo> cInfoLst = new ArrayList<>();
  for (StructField structField : fields) {
    colName = structField.getFieldName();
    colInfo = new ColumnInfo(structField.getFieldName(),
        TypeInfoUtils.getTypeInfoFromObjectInspector(structField.getFieldObjectInspector()), null, false);
    rr.put(null, colName, colInfo);
    cInfoLst.add(colInfo);
  }
  ArrayList<ColumnInfo> nonPartitionColumns = new ArrayList<ColumnInfo>(cInfoLst);
  // 1.2 Add column info corresponding to partition columns
  ArrayList<ColumnInfo> partitionColumns = new ArrayList<ColumnInfo>();
  for (FieldSchema part_col : viewTable.getPartCols()) {
    colName = part_col.getName();
    colInfo = new ColumnInfo(colName, TypeInfoFactory.getPrimitiveTypeInfo(part_col.getType()), null, true);
    rr.put(null, colName, colInfo);
    cInfoLst.add(colInfo);
    partitionColumns.add(colInfo);
  }
  // 1.3 Build row type from field <type, name>
  RelDataType rowType;
  try {
    rowType = TypeConverter.getType(cluster, rr, null);
  } catch (CalciteSemanticException e) {
    // Bail out
    return null;
  }
  // 2. Build RelOptAbstractTable
  List<String> fullyQualifiedTabName = new ArrayList<>();
  if (viewTable.getDbName() != null && !viewTable.getDbName().isEmpty()) {
    fullyQualifiedTabName.add(viewTable.getDbName());
  }
  fullyQualifiedTabName.add(viewTable.getTableName());
  RelNode tableRel;
  // 3. Build operator
  if (obtainTableType(viewTable) == TableType.DRUID) {
    // Build Druid query
    String address = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
    String dataSource = viewTable.getParameters().get(Constants.DRUID_DATA_SOURCE);
    Set<String> metrics = new HashSet<>();
    List<RelDataType> druidColTypes = new ArrayList<>();
    List<String> druidColNames = new ArrayList<>();
    // @NOTE this code is very similar to the code at org/apache/hadoop/hive/ql/parse/CalcitePlanner.java:2362
    // @TODO it would be nice to refactor it
    RelDataTypeFactory dtFactory = cluster.getRexBuilder().getTypeFactory();
    for (RelDataTypeField field : rowType.getFieldList()) {
      if (DruidTable.DEFAULT_TIMESTAMP_COLUMN.equals(field.getName())) {
        // Druid's time column is always non-null.
        druidColTypes.add(dtFactory.createTypeWithNullability(field.getType(), false));
      } else {
        druidColTypes.add(field.getType());
      }
      druidColNames.add(field.getName());
      if (field.getName().equals(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
        // timestamp
        continue;
      }
      if (field.getType().getSqlTypeName() == SqlTypeName.VARCHAR) {
        // dimension
        continue;
      }
      metrics.add(field.getName());
    }
    List<Interval> intervals = Collections.singletonList(DruidTable.DEFAULT_INTERVAL);
    rowType = dtFactory.createStructType(druidColTypes, druidColNames);
    // We can pass null for the Hive object because it is only used to retrieve tables
    // if constraints exist on a table object, and constraints cannot be defined
    // for materialized views.
    RelOptHiveTable optTable = new RelOptHiveTable(null, cluster.getTypeFactory(), fullyQualifiedTabName,
        rowType, viewTable, nonPartitionColumns, partitionColumns, new ArrayList<>(), conf, null,
        new QueryTables(true), new HashMap<>(), new HashMap<>(), new AtomicInteger());
    DruidTable druidTable = new DruidTable(new DruidSchema(address, address, false), dataSource,
        RelDataTypeImpl.proto(rowType), metrics, DruidTable.DEFAULT_TIMESTAMP_COLUMN, intervals, null, null);
    final TableScan scan = new HiveTableScan(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), optTable,
        viewTable.getTableName(), null, false, false);
    tableRel = DruidQuery.create(cluster, cluster.traitSetOf(BindableConvention.INSTANCE), optTable,
        druidTable, ImmutableList.<RelNode>of(scan), ImmutableMap.of());
  } else {
    // Build Hive TableScan rel.
    // We can pass null for the Hive object because it is only used to retrieve tables
    // if constraints exist on a table object, and constraints cannot be defined
    // for materialized views.
    RelOptHiveTable optTable = new RelOptHiveTable(null, cluster.getTypeFactory(), fullyQualifiedTabName,
        rowType, viewTable, nonPartitionColumns, partitionColumns, new ArrayList<>(), conf, null,
        new QueryTables(true), new HashMap<>(), new HashMap<>(), new AtomicInteger());
    tableRel = new HiveTableScan(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), optTable,
        viewTable.getTableName(), null, false, false);
  }
  return tableRel;
}
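The Druid branch classifies each field of the view's row type: the default timestamp column is skipped, VARCHAR fields become dimensions, and everything else is collected as a metric. A simplified, self-contained sketch of that classification follows (the Field record and isVarchar flag are placeholders for Calcite's RelDataTypeField and its SqlTypeName check, and "__time" mirrors DruidTable.DEFAULT_TIMESTAMP_COLUMN; none of this is Hive API):

import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Simplified sketch of the metric/dimension classification in the Druid branch above.
public final class DruidColumnClassifierSketch {
  // Placeholder for Calcite's RelDataTypeField; isVarchar stands in for the SqlTypeName check.
  record Field(String name, boolean isVarchar) { }

  static final String TIMESTAMP_COLUMN = "__time"; // mirrors DruidTable.DEFAULT_TIMESTAMP_COLUMN

  static Set<String> collectMetrics(List<Field> fields) {
    Set<String> metrics = new HashSet<>();
    for (Field field : fields) {
      if (TIMESTAMP_COLUMN.equals(field.name())) {
        continue; // the timestamp column is neither a dimension nor a metric
      }
      if (field.isVarchar()) {
        continue; // VARCHAR columns are treated as dimensions
      }
      metrics.add(field.name()); // everything else is a metric
    }
    return metrics;
  }

  public static void main(String[] args) {
    List<Field> fields = List.of(
        new Field("__time", false),
        new Field("page", true),
        new Field("clicks", false));
    System.out.println(collectMetrics(fields)); // [clicks]
  }
}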
Use of org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan in project hive by apache.
The class HiveMaterializedViewUtils, method copyNodeScanNewCluster.
private static RelNode copyNodeScanNewCluster(RelOptCluster optCluster, RelNode scan) {
  final RelNode newScan;
  if (scan instanceof DruidQuery) {
    final DruidQuery dq = (DruidQuery) scan;
    // Ideally we should use HiveRelNode convention. However, since Volcano planner
    // throws in that case because DruidQuery does not implement the interface,
    // we set it as Bindable. Currently, we do not use convention in Hive, hence that
    // should be fine.
    // TODO: If we want to make use of convention (e.g., while directly generating operator
    // tree instead of AST), this should be changed.
    newScan = DruidQuery.create(optCluster, optCluster.traitSetOf(BindableConvention.INSTANCE), scan.getTable(),
        dq.getDruidTable(), ImmutableList.of(dq.getTableScan()), DruidSqlOperatorConverter.getDefaultMap());
  } else {
    newScan = new HiveTableScan(optCluster, optCluster.traitSetOf(HiveRelNode.CONVENTION),
        (RelOptHiveTable) scan.getTable(), ((RelOptHiveTable) scan.getTable()).getName(), null, false, false);
  }
  return newScan;
}
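As a usage illustration only: the surrounding driver that walks the materialized view plan is not shown here, and the scan and newCluster variables below are hypothetical. The copy boils down to rebuilding the leaf against the new cluster while reusing the original RelOptHiveTable:

// Hypothetical caller; assumes 'scan' is a leaf HiveTableScan or DruidQuery from the
// materialized view plan and 'newCluster' is the RelOptCluster created for the current query.
RelNode rewrittenLeaf = copyNodeScanNewCluster(newCluster, scan);
// The rewritten leaf keeps the same RelOptHiveTable but carries the new cluster,
// so the current query's planner can operate on it.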
Use of org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan in project hive by apache.
The class FilterSelectivityEstimator, method visitCall.
@Override
public Double visitCall(RexCall call) {
  if (!deep) {
    return 1.0;
  }
  /*
   * Ignore any predicates on partition columns because we have already
   * accounted for these in the Table row count.
   */
  if (isPartitionPredicate(call, this.childRel)) {
    return 1.0;
  }
  Double selectivity = null;
  SqlKind op = getOp(call);
  switch (op) {
    case AND: {
      selectivity = computeConjunctionSelectivity(call);
      break;
    }
    case OR: {
      selectivity = computeDisjunctionSelectivity(call);
      break;
    }
    case NOT:
    case NOT_EQUALS: {
      selectivity = computeNotEqualitySelectivity(call);
      break;
    }
    case IS_NOT_NULL: {
      if (childRel instanceof HiveTableScan) {
        double noOfNulls = getMaxNulls(call, (HiveTableScan) childRel);
        double totalNoOfTuples = mq.getRowCount(childRel);
        if (totalNoOfTuples >= noOfNulls) {
          selectivity = (totalNoOfTuples - noOfNulls) / Math.max(totalNoOfTuples, 1);
        } else {
          // If we are running explain, print the warning to the console
          // and the log files. Otherwise, only print it to the log files.
          HiveConfPlannerContext ctx = childRel.getCluster().getPlanner().getContext().unwrap(HiveConfPlannerContext.class);
          String msg = "Invalid statistics: Number of null values > number of tuples. "
              + "Consider recomputing statistics for table: "
              + ((RelOptHiveTable) childRel.getTable()).getHiveTableMD().getFullyQualifiedName();
          if (ctx.isExplainPlan()) {
            SessionState.getConsole().printError("WARNING: " + msg);
          }
          LOG.warn(msg);
          selectivity = ((double) 1 / (double) 3);
        }
      } else {
        selectivity = computeNotEqualitySelectivity(call);
      }
      break;
    }
    case LESS_THAN_OR_EQUAL:
    case GREATER_THAN_OR_EQUAL:
    case LESS_THAN:
    case GREATER_THAN: {
      selectivity = ((double) 1 / (double) 3);
      break;
    }
    case IN: {
      // TODO: 1) check for duplicates; 2) we assume the IN-clause values are present
      // in the NDV, which may not be correct (a range check could detect it); 3) we
      // assume values in the NDV set are uniformly distributed over the column values
      // (account for skewness - histogram).
      selectivity = computeFunctionSelectivity(call);
      if (selectivity != null) {
        selectivity = selectivity * (call.operands.size() - 1);
        if (selectivity <= 0.0) {
          selectivity = 0.10;
        } else if (selectivity >= 1.0) {
          selectivity = 1.0;
        }
      }
      break;
    }
    default:
      selectivity = computeFunctionSelectivity(call);
  }
  return selectivity;
}
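The IS_NOT_NULL branch reduces to simple arithmetic on the table statistics: the fraction of non-null tuples, with a defensive 1/3 fallback when the null count exceeds the row count (i.e., the statistics are inconsistent). A standalone restatement of that formula (illustrative only; the real code pulls the null count and row count from Hive column statistics and the metadata query):

// Illustrative restatement of the IS_NOT_NULL selectivity computation above; not Hive API.
public final class IsNotNullSelectivitySketch {
  static double isNotNullSelectivity(double rowCount, double nullCount) {
    if (rowCount >= nullCount) {
      // Fraction of tuples that are not null; Math.max guards against a zero row count.
      return (rowCount - nullCount) / Math.max(rowCount, 1);
    }
    // Inconsistent statistics (more nulls than rows): fall back to the default 1/3.
    return 1.0 / 3.0;
  }

  public static void main(String[] args) {
    System.out.println(isNotNullSelectivity(1000, 250)); // 0.75
    System.out.println(isNotNullSelectivity(100, 500));  // 0.333... (fallback)
  }
}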
Use of org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan in project hive by apache.
The class EstimateUniqueKeys, method getUniqueKeys.
private static Set<ImmutableBitSet> getUniqueKeys(HiveProject rel) {
  HiveTableScan tScan = getTableScan(rel.getInput(), false);
  if (tScan != null) {
    return generateKeysUsingStatsEstimation(rel, tScan);
  }
  // LogicalProject maps a set of rows to a different set;
  // without knowledge of the mapping function (whether it
  // preserves uniqueness), it is only safe to derive uniqueness
  // info from the child of a project when the mapping is f(a) => a.
  //
  // Furthermore, the unique bitset coming from the child needs
  // to be mapped to match the output of the project.
  final Map<Integer, Integer> mapInToOutPos = new HashMap<>();
  final List<RexNode> projExprs = rel.getProjects();
  final Set<ImmutableBitSet> projUniqueKeySet = new HashSet<>();
  // Build an input-to-output position map.
  for (int i = 0; i < projExprs.size(); i++) {
    RexNode projExpr = projExprs.get(i);
    if (projExpr instanceof RexInputRef) {
      mapInToOutPos.put(((RexInputRef) projExpr).getIndex(), i);
    }
  }
  if (mapInToOutPos.isEmpty()) {
    // Return an empty set.
    return projUniqueKeySet;
  }
  Set<ImmutableBitSet> childUniqueKeySet = getUniqueKeys(rel.getInput());
  if (childUniqueKeySet != null) {
    // Add the child keys that are fully projected, remapped to output positions.
    for (ImmutableBitSet colMask : childUniqueKeySet) {
      ImmutableBitSet.Builder tmpMask = ImmutableBitSet.builder();
      boolean completeKeyProjected = true;
      for (int bit : colMask) {
        if (mapInToOutPos.containsKey(bit)) {
          tmpMask.set(mapInToOutPos.get(bit));
        } else {
          // Skip the child unique key if part of it is not projected.
          completeKeyProjected = false;
          break;
        }
      }
      if (completeKeyProjected) {
        projUniqueKeySet.add(tmpMask.build());
      }
    }
  }
  return projUniqueKeySet;
}
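The remapping loop shared by both getUniqueKeys implementations can be exercised in isolation: a child key survives the project only if every one of its columns is forwarded by a RexInputRef. A small sketch using Calcite's ImmutableBitSet directly (the position map below is hand-built instead of being derived from project expressions):

import java.util.HashMap;
import java.util.Map;
import org.apache.calcite.util.ImmutableBitSet;

// Isolated sketch of the child-key remapping performed by the Project handlers above.
public final class ProjectKeyRemapSketch {
  /** Returns the remapped key, or null if part of the key is not projected. */
  static ImmutableBitSet remap(ImmutableBitSet childKey, Map<Integer, Integer> mapInToOutPos) {
    ImmutableBitSet.Builder remapped = ImmutableBitSet.builder();
    for (int bit : childKey) {
      Integer outPos = mapInToOutPos.get(bit);
      if (outPos == null) {
        return null; // part of the key is not projected: drop the key
      }
      remapped.set(outPos);
    }
    return remapped.build();
  }

  public static void main(String[] args) {
    // Suppose the project forwards input column 0 to output 1 and input 2 to output 0.
    Map<Integer, Integer> mapInToOutPos = new HashMap<>();
    mapInToOutPos.put(0, 1);
    mapInToOutPos.put(2, 0);

    // Child key {0, 2}: fully projected, remaps to {0, 1}.
    System.out.println(remap(ImmutableBitSet.of(0, 2), mapInToOutPos)); // {0, 1}
    // Child key {0, 3}: column 3 is not projected, so the key is dropped.
    System.out.println(remap(ImmutableBitSet.of(0, 3), mapInToOutPos)); // null
  }
}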