Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class StatsRulesProcFactory, method updateStats.
static void updateStats(Statistics stats, long newNumRows, boolean useColStats,
    Operator<? extends OperatorDesc> op, boolean updateNDV) {
  if (newNumRows < 0) {
    LOG.debug("STATS-" + op.toString() + ": Overflow in number of rows. "
        + newNumRows + " rows will be set to Long.MAX_VALUE");
    newNumRows = StatsUtils.getMaxIfOverflow(newNumRows);
  }
  if (newNumRows == 0) {
    LOG.debug("STATS-" + op.toString() + ": Equals 0 in number of rows. "
        + newNumRows + " rows will be set to 1");
    newNumRows = 1;
  }
  long oldRowCount = stats.getNumRows();
  double ratio = (double) newNumRows / (double) oldRowCount;
  stats.setNumRows(newNumRows);
  if (useColStats) {
    List<ColStatistics> colStats = stats.getColumnStats();
    for (ColStatistics cs : colStats) {
      long oldNumNulls = cs.getNumNulls();
      long oldDV = cs.getCountDistint();
      long newNumNulls = Math.round(ratio * oldNumNulls);
      cs.setNumNulls(newNumNulls);
      if (updateNDV) {
        long newDV = oldDV;
        // Scale down the NDV only when the output row count is less than the
        // input row count; the NDV never grows.
        if (ratio <= 1.0) {
          newDV = (long) Math.ceil(ratio * oldDV);
        }
        cs.setCountDistint(newDV);
      }
    }
    stats.setColumnStats(colStats);
    long newDataSize = StatsUtils.getDataSizeFromColumnStats(newNumRows, colStats);
    stats.setDataSize(StatsUtils.getMaxIfOverflow(newDataSize));
  } else {
    long newDataSize = (long) (ratio * stats.getDataSize());
    stats.setDataSize(StatsUtils.getMaxIfOverflow(newDataSize));
  }
}
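To make the scaling rule concrete, here is a minimal, self-contained sketch of what updateStats does to per-column statistics. ColumnStat and ScaleStatsSketch are hypothetical stand-ins, not Hive's ColStatistics API.

public class ScaleStatsSketch {
  static final class ColumnStat {
    long numNulls;
    long ndv; // number of distinct values
    ColumnStat(long numNulls, long ndv) { this.numNulls = numNulls; this.ndv = ndv; }
  }

  // Scale per-column statistics to a new row count, mirroring the logic above.
  static void scale(ColumnStat cs, long oldNumRows, long newNumRows) {
    double ratio = (double) newNumRows / (double) oldNumRows;
    cs.numNulls = Math.round(ratio * cs.numNulls);
    // NDV can only shrink when rows are filtered out; it never grows.
    if (ratio <= 1.0) {
      cs.ndv = (long) Math.ceil(ratio * cs.ndv);
    }
  }

  public static void main(String[] args) {
    ColumnStat cs = new ColumnStat(100, 500);
    scale(cs, 10_000, 2_500); // a filter kept 25% of the rows
    System.out.println(cs.numNulls + " nulls, " + cs.ndv + " distinct"); // 25 nulls, 125 distinct
  }
}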
Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class HiveRelMdSize, method averageColumnSizes.
public List<Double> averageColumnSizes(HiveTableScan scan, RelMetadataQuery mq) {
  List<Integer> neededcolsLst = scan.getNeededColIndxsFrmReloptHT();
  // Obtain the list of column stats, or use a default if they are not available
  List<ColStatistics> columnStatistics =
      ((RelOptHiveTable) scan.getTable()).getColStat(neededcolsLst, true);
  final ImmutableList.Builder<Double> list = ImmutableList.builder();
  int indxRqdCol = 0;
  int nFields = scan.getRowType().getFieldCount();
  for (int i = 0; i < nFields; i++) {
    if (neededcolsLst.contains(i)) {
      ColStatistics columnStatistic = columnStatistics.get(indxRqdCol);
      indxRqdCol++;
      if (columnStatistic == null) {
        RelDataTypeField field = scan.getRowType().getFieldList().get(i);
        list.add(averageTypeValueSize(field.getType()));
      } else {
        list.add(columnStatistic.getAvgColLen());
      }
    } else {
      list.add(new Double(0));
    }
  }
  return list.build();
}
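The fallback pattern (use the recorded average column length when present, otherwise a per-type default) can be sketched in isolation. AvgColSizeSketch and defaultSizeForType are hypothetical; the latter stands in for Calcite's averageTypeValueSize.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class AvgColSizeSketch {
  static double defaultSizeForType(String typeName) {
    switch (typeName) {
      case "int":    return 4d;
      case "bigint": return 8d;
      default:       return 100d; // e.g., strings of unknown length
    }
  }

  // stats.get(i) == null means no column statistic was available for field i.
  static List<Double> averageSizes(List<String> fieldTypes, List<Double> stats) {
    List<Double> sizes = new ArrayList<>();
    for (int i = 0; i < fieldTypes.size(); i++) {
      Double avgLen = stats.get(i);
      sizes.add(avgLen != null ? avgLen : defaultSizeForType(fieldTypes.get(i)));
    }
    return sizes;
  }

  public static void main(String[] args) {
    System.out.println(averageSizes(
        Arrays.asList("int", "string"),
        Arrays.asList(null, 12.5))); // [4.0, 12.5]
  }
}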
Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class HiveRelMdUniqueKeys, method getUniqueKeys.
/*
 * Infer uniqueness if: rowCount(col) = ndv(col).
 * TBD for numerics: max(col) - min(col) = rowCount(col).
 *
 * Why are we intercepting Project and not TableScan? Because a method for
 * TableScan would not know which columns to check, and inferring uniqueness
 * for all columns is very expensive right now. The flip side of doing this
 * is that it only works post field trimming.
 */
public Set<ImmutableBitSet> getUniqueKeys(Project rel, RelMetadataQuery mq, boolean ignoreNulls) {
  HiveTableScan tScan = getTableScan(rel.getInput(), false);
  if (tScan == null) {
    // If no HiveTableScan is found (e.g., the input is not a sequence of
    // Project and Filter operators), execute the original getUniqueKeys logic.
    // A LogicalProject maps a set of rows to a different set; without
    // knowledge of the mapping function (whether it preserves uniqueness),
    // it is only safe to derive uniqueness info from the child of a project
    // when the mapping is f(a) => a.
    //
    // Furthermore, the unique bitset coming from the child needs to be
    // remapped to match the output of the project.
    final Map<Integer, Integer> mapInToOutPos = new HashMap<>();
    final List<RexNode> projExprs = rel.getProjects();
    final Set<ImmutableBitSet> projUniqueKeySet = new HashSet<>();
    // Build an input-to-output position map.
    for (int i = 0; i < projExprs.size(); i++) {
      RexNode projExpr = projExprs.get(i);
      if (projExpr instanceof RexInputRef) {
        mapInToOutPos.put(((RexInputRef) projExpr).getIndex(), i);
      }
    }
    if (mapInToOutPos.isEmpty()) {
      // No column is projected unchanged: return the empty set.
      return projUniqueKeySet;
    }
    Set<ImmutableBitSet> childUniqueKeySet = mq.getUniqueKeys(rel.getInput(), ignoreNulls);
    if (childUniqueKeySet != null) {
      // Keep the child keys that are fully projected.
      for (ImmutableBitSet colMask : childUniqueKeySet) {
        ImmutableBitSet.Builder tmpMask = ImmutableBitSet.builder();
        boolean completeKeyProjected = true;
        for (int bit : colMask) {
          if (mapInToOutPos.containsKey(bit)) {
            tmpMask.set(mapInToOutPos.get(bit));
          } else {
            // Skip the child unique key if part of it is not projected.
            completeKeyProjected = false;
            break;
          }
        }
        if (completeKeyProjected) {
          projUniqueKeySet.add(tmpMask.build());
        }
      }
    }
    return projUniqueKeySet;
  }
  Map<Integer, Integer> posMap = new HashMap<Integer, Integer>();
  int projectPos = 0;
  int colStatsPos = 0;
  BitSet projectedCols = new BitSet();
  for (RexNode r : rel.getProjects()) {
    if (r instanceof RexInputRef) {
      projectedCols.set(((RexInputRef) r).getIndex());
      posMap.put(colStatsPos, projectPos);
      colStatsPos++;
    }
    projectPos++;
  }
  double numRows = tScan.getRows();
  List<ColStatistics> colStats = tScan.getColStat(BitSets.toList(projectedCols));
  Set<ImmutableBitSet> keys = new HashSet<ImmutableBitSet>();
  colStatsPos = 0;
  for (ColStatistics cStat : colStats) {
    boolean isKey = false;
    // A column is a key if its NDV covers every row...
    if (cStat.getCountDistint() >= numRows) {
      isKey = true;
    }
    // ...or if its numeric range is dense and exactly as wide as the row count.
    if (!isKey && cStat.getRange() != null && cStat.getRange().maxValue != null
        && cStat.getRange().minValue != null) {
      double r = cStat.getRange().maxValue.doubleValue()
          - cStat.getRange().minValue.doubleValue() + 1;
      isKey = (Math.abs(numRows - r) < RelOptUtil.EPSILON);
    }
    if (isKey) {
      ImmutableBitSet key = ImmutableBitSet.of(posMap.get(colStatsPos));
      keys.add(key);
    }
    colStatsPos++;
  }
  return keys;
}
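The two uniqueness tests in the final loop can be distilled into plain values. UniqueKeySketch is hypothetical, and EPSILON here mirrors the role of RelOptUtil.EPSILON.

public class UniqueKeySketch {
  static final double EPSILON = 1.0e-5;

  static boolean isKey(double numRows, long ndv, Double min, Double max) {
    if (ndv >= numRows) {
      return true; // every row has a distinct value
    }
    if (min != null && max != null) {
      // A dense numeric range covering exactly numRows values also implies uniqueness.
      double range = max - min + 1;
      return Math.abs(numRows - range) < EPSILON;
    }
    return false;
  }

  public static void main(String[] args) {
    System.out.println(isKey(1000, 1000, null, null)); // true: ndv == rowCount
    System.out.println(isKey(1000, 900, 1d, 1000d));   // true: 1..1000 holds exactly 1000 values
    System.out.println(isKey(1000, 900, 1d, 500d));    // false
  }
}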
Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class HiveRelMdDistinctRowCount, method getDistinctRowCount.
private Double getDistinctRowCount(HiveTableScan htRel, RelMetadataQuery mq,
    ImmutableBitSet groupKey, RexNode predicate) {
  List<Integer> projIndxLst = HiveCalciteUtil.translateBitSetToProjIndx(groupKey);
  List<ColStatistics> colStats = htRel.getColStat(projIndxLst);
  Double noDistinctRows = 1.0;
  for (ColStatistics cStat : colStats) {
    // Assume independence: multiply the per-column NDVs.
    noDistinctRows *= cStat.getCountDistint();
  }
  // The estimate can never exceed the table's row count.
  return Math.min(noDistinctRows, htRel.getRows());
}
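The estimate reduces to a product of per-column NDVs capped at the row count. A minimal sketch, with plain longs standing in for ColStatistics and a hypothetical class name:

public class DistinctRowCountSketch {
  static double estimate(long[] ndvPerKeyColumn, double tableRows) {
    double distinct = 1.0;
    for (long ndv : ndvPerKeyColumn) {
      distinct *= ndv; // assumes the key columns are independent
    }
    return Math.min(distinct, tableRows);
  }

  public static void main(String[] args) {
    // GROUP BY (a, b) with ndv(a)=50 and ndv(b)=40 on a 1,000-row table:
    System.out.println(estimate(new long[] {50, 40}, 1000)); // 1000.0, capped at row count
  }
}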
Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class StatsUtils, method collectStatistics.
public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList,
    Table table, List<ColumnInfo> schema, List<String> neededColumns,
    List<String> referencedColumns, boolean fetchColStats, boolean fetchPartStats)
    throws HiveException {
  Statistics stats = new Statistics();
  float deserFactor =
      HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
  if (!table.isPartitioned()) {
    long ds = getDataSize(conf, table);
    long nr = getNumRows(conf, schema, neededColumns, table, ds);
    stats.setNumRows(nr);
    List<ColStatistics> colStats = Lists.newArrayList();
    if (fetchColStats) {
      colStats = getTableColumnStats(table, schema, neededColumns);
      long betterDS = getDataSizeFromColumnStats(nr, colStats);
      ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
    }
    stats.setDataSize(ds);
    // infer if any column can be a primary key based on column statistics
    inferAndSetPrimaryKey(stats.getNumRows(), colStats);
    stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
    stats.addToColumnStats(colStats);
  } else if (partList != null) {
    // For partitioned tables, get the size of all the partitions after pruning
    // the partitions that are not required.
    long nr = 0;
    long ds = 0;
    List<Long> rowCounts = Lists.newArrayList();
    List<Long> dataSizes = Lists.newArrayList();
    if (fetchPartStats) {
      rowCounts = getBasicStatForPartitions(table, partList.getNotDeniedPartns(),
          StatsSetupConst.ROW_COUNT);
      dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(),
          StatsSetupConst.RAW_DATA_SIZE);
      nr = getSumIgnoreNegatives(rowCounts);
      ds = getSumIgnoreNegatives(dataSizes);
      if (ds <= 0) {
        dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(),
            StatsSetupConst.TOTAL_SIZE);
        ds = getSumIgnoreNegatives(dataSizes);
      }
    }
    // if the data size is still unknown, fall back to the filesystem to get file sizes
    if (ds <= 0) {
      dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
    }
    ds = getSumIgnoreNegatives(dataSizes);
    ds = (long) (ds * deserFactor);
    int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
    if (avgRowSize > 0) {
      setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
      nr = getSumIgnoreNegatives(rowCounts);
      ds = getSumIgnoreNegatives(dataSizes);
      // a row count of -1 means the statistics from the metastore are not reliable
      if (nr <= 0) {
        nr = ds / avgRowSize;
      }
    }
    if (nr == 0) {
      nr = 1;
    }
    stats.addToNumRows(nr);
    stats.addToDataSize(ds);
    // if at least one partition does not contain a row count, mark the basic
    // stats state as PARTIAL
    if (containsNonPositives(rowCounts) && stats.getBasicStatsState().equals(State.COMPLETE)) {
      stats.setBasicStatsState(State.PARTIAL);
    }
    if (fetchColStats) {
      List<String> partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
      for (Partition part : partList.getNotDeniedPartns()) {
        partNames.add(part.getName());
      }
      neededColumns = processNeededColumns(schema, neededColumns);
      AggrStats aggrStats = null;
      // if no columns or no partitions are needed, skip the step that connects
      // to the metastore
      if (neededColumns.size() > 0 && partNames.size() > 0) {
        aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(),
            neededColumns, partNames);
      }
      if (null == aggrStats || null == aggrStats.getColStats()
          || aggrStats.getColStatsSize() == 0) {
        // There are some partitions with no state (or we didn't fetch any state).
        // Update the stats with an empty list to reflect that in the
        // state/initialize structures.
        List<ColStatistics> emptyStats = Lists.newArrayList();
        // add partition column stats
        addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table,
            partList, emptyStats);
        stats.addToColumnStats(emptyStats);
        stats.addToDataSize(getDataSizeFromColumnStats(nr, emptyStats));
        stats.updateColumnStatsState(deriveStatType(emptyStats, referencedColumns));
      } else {
        List<ColumnStatisticsObj> colStats = aggrStats.getColStats();
        if (colStats.size() != neededColumns.size()) {
          LOG.debug("Column stats requested for : " + neededColumns.size()
              + " columns. Able to retrieve for " + colStats.size() + " columns");
        }
        List<ColStatistics> columnStats = convertColStats(colStats, table.getTableName());
        addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table,
            partList, columnStats);
        long betterDS = getDataSizeFromColumnStats(nr, columnStats);
        stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
        // infer if any column can be a primary key based on column statistics
        inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
        stats.addToColumnStats(columnStats);
        State colState = deriveStatType(columnStats, referencedColumns);
        if (aggrStats.getPartsFound() != partNames.size() && colState != State.NONE) {
          LOG.debug("Column stats requested for : " + partNames.size() + " partitions. "
              + "Able to retrieve for " + aggrStats.getPartsFound() + " partitions");
          colState = State.PARTIAL;
        }
        stats.setColumnStatsState(colState);
      }
    }
  }
  return stats;
}
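The partition arithmetic above (negative basic stats treated as missing, with a size-based row-count fallback) can be sketched independently. PartitionStatsSketch and its hard-coded values are illustrative only; sumIgnoreNegatives mirrors the role of getSumIgnoreNegatives.

import java.util.Arrays;
import java.util.List;

public class PartitionStatsSketch {
  // A -1 entry means the metastore never collected that statistic.
  static long sumIgnoreNegatives(List<Long> vals) {
    long sum = 0;
    for (long v : vals) {
      if (v > 0) {
        sum += v;
      }
    }
    return sum;
  }

  public static void main(String[] args) {
    List<Long> rowCounts = Arrays.asList(-1L, -1L, -1L); // no partition had a row count
    List<Long> dataSizes = Arrays.asList(50_000L, 20_000L, 30_000L);
    long nr = sumIgnoreNegatives(rowCounts); // 0: all counts missing
    long ds = sumIgnoreNegatives(dataSizes); // 100000
    int avgRowSize = 100; // as estimated from the schema
    if (nr <= 0 && avgRowSize > 0) {
      nr = ds / avgRowSize; // fall back to a size-based estimate
    }
    System.out.println(nr + " rows, " + ds + " bytes"); // 1000 rows, 100000 bytes
  }
}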