Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class StatsUtils, method collectStatistics.
private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache, List<String> referencedColumns, boolean fetchColStats, boolean failIfCacheMiss) throws HiveException {
Statistics stats = null;
float deserFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
boolean shouldEstimateStats = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS);
if (!table.isPartitioned()) {
// getDataSize tries to estimate the stats from the file size if they don't exist;
// we would like to avoid file system calls if they are too expensive
long ds = shouldEstimateStats ? getDataSize(conf, table) : getRawDataSize(table);
long nr = getNumRows(conf, schema, table, ds);
List<ColStatistics> colStats = Lists.newArrayList();
if (fetchColStats) {
colStats = getTableColumnStats(table, schema, neededColumns, colStatsCache);
if (colStats == null) {
colStats = Lists.newArrayList();
}
estimateStatsForMissingCols(neededColumns, colStats, table, conf, nr, schema);
// we should have stats for all columns (estimated or actual)
assert (neededColumns.size() == colStats.size());
long betterDS = getDataSizeFromColumnStats(nr, colStats);
ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
}
stats = new Statistics(nr, ds);
// infer if any column can be primary key based on column statistics
inferAndSetPrimaryKey(stats.getNumRows(), colStats);
stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
stats.addToColumnStats(colStats);
} else if (partList != null) {
// For partitioned tables, get the size of all the partitions after pruning
// the partitions that are not required
long nr = 0;
long ds = 0;
List<Long> rowCounts = Lists.newArrayList();
List<Long> dataSizes = Lists.newArrayList();
rowCounts = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
nr = getSumIgnoreNegatives(rowCounts);
ds = getSumIgnoreNegatives(dataSizes);
if (ds <= 0) {
dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
dataSizes = safeMult(dataSizes, deserFactor);
ds = getSumIgnoreNegatives(dataSizes);
}
// if the data size could still not be determined, fall back to the file system to get the file sizes
if (ds <= 0 && shouldEstimateStats) {
dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
dataSizes = safeMult(dataSizes, deserFactor);
ds = getSumIgnoreNegatives(dataSizes);
}
int avgRowSize = estimateRowSizeFromSchema(conf, schema);
if (avgRowSize > 0) {
setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
nr = getSumIgnoreNegatives(rowCounts);
ds = getSumIgnoreNegatives(dataSizes);
// a row count of -1 means that the statistics from the metastore are not reliable
if (nr <= 0) {
nr = ds / avgRowSize;
}
}
// Minimum values
if (nr == 0) {
nr = 1;
}
stats = new Statistics(nr, ds);
// if at least one partition does not contain a row count, mark the basic stats state as PARTIAL
if (containsNonPositives(rowCounts) && stats.getBasicStatsState().equals(State.COMPLETE)) {
stats.setBasicStatsState(State.PARTIAL);
}
if (fetchColStats) {
List<String> partitionCols = getPartitionColumns(schema, neededColumns, referencedColumns);
// We will retrieve stats from the metastore only for columns that are not cached
List<String> neededColsToRetrieve;
List<String> partitionColsToRetrieve;
List<ColStatistics> columnStats = new ArrayList<>();
if (colStatsCache != null) {
neededColsToRetrieve = new ArrayList<String>(neededColumns.size());
for (String colName : neededColumns) {
ColStatistics colStats = colStatsCache.getColStats().get(colName);
if (colStats == null) {
neededColsToRetrieve.add(colName);
if (LOG.isDebugEnabled()) {
LOG.debug("Stats for column " + colName + " in table " + table.getCompleteName() + " could not be retrieved from cache");
}
} else {
columnStats.add(colStats);
if (LOG.isDebugEnabled()) {
LOG.debug("Stats for column " + colName + " in table " + table.getCompleteName() + " retrieved from cache");
}
}
}
partitionColsToRetrieve = new ArrayList<>(partitionCols.size());
for (String colName : partitionCols) {
ColStatistics colStats = colStatsCache.getColStats().get(colName);
if (colStats == null) {
partitionColsToRetrieve.add(colName);
if (LOG.isDebugEnabled()) {
LOG.debug("Stats for column " + colName + " in table " + table.getCompleteName() + " could not be retrieved from cache");
}
} else {
columnStats.add(colStats);
if (LOG.isDebugEnabled()) {
LOG.debug("Stats for column " + colName + " in table " + table.getCompleteName() + " retrieved from cache");
}
}
}
} else {
neededColsToRetrieve = neededColumns;
partitionColsToRetrieve = partitionCols;
}
// List of partitions
List<String> partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
for (Partition part : partList.getNotDeniedPartns()) {
partNames.add(part.getName());
}
AggrStats aggrStats = null;
// if there are no columns to retrieve or no partitions, we can skip the step to connect to the metastore.
if (neededColsToRetrieve.size() > 0 && partNames.size() > 0) {
aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColsToRetrieve, partNames);
}
boolean statsRetrieved = aggrStats != null && aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0;
if (neededColumns.size() == 0 || (neededColsToRetrieve.size() > 0 && !statsRetrieved)) {
estimateStatsForMissingCols(neededColsToRetrieve, columnStats, table, conf, nr, schema);
// There are some partitions with no state (or we didn't fetch any state).
// Update the stats with empty list to reflect that in the
// state/initialize structures.
// add partition column stats
addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
// FIXME: this add seems suspicious... 10 lines below, the value returned by this method is used as betterDS
stats.addToDataSize(getDataSizeFromColumnStats(nr, columnStats));
stats.updateColumnStatsState(deriveStatType(columnStats, referencedColumns));
stats.addToColumnStats(columnStats);
} else {
if (statsRetrieved) {
columnStats.addAll(convertColStats(aggrStats.getColStats(), table.getTableName()));
}
int colStatsAvailable = neededColumns.size() + partitionCols.size() - partitionColsToRetrieve.size();
if (columnStats.size() != colStatsAvailable) {
LOG.debug("Column stats requested for : {} columns. Able to retrieve for {} columns", columnStats.size(), colStatsAvailable);
}
addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
long betterDS = getDataSizeFromColumnStats(nr, columnStats);
stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
// infer if any column can be primary key based on column statistics
inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
stats.addToColumnStats(columnStats);
// Infer column stats state
stats.setColumnStatsState(deriveStatType(columnStats, referencedColumns));
if (neededColumns.size() != neededColsToRetrieve.size() || partitionCols.size() != partitionColsToRetrieve.size()) {
// Include state for cached columns
stats.updateColumnStatsState(colStatsCache.getState());
}
// Change if we could not retrieve for all partitions
if (aggrStats != null && aggrStats.getPartsFound() != partNames.size() && stats.getColumnStatsState() != State.NONE) {
stats.updateColumnStatsState(State.PARTIAL);
LOG.debug("Column stats requested for : {} partitions. Able to retrieve for {} partitions", partNames.size(), aggrStats.getPartsFound());
}
}
if (rowCounts.size() == 0) {
// all partitions are filtered by partition pruning
stats.setBasicStatsState(State.COMPLETE);
}
// this block exists for debugging purposes: we want to check that the column stats cache is working properly and that we retrieve stats from the metastore only once.
if (colStatsCache != null && failIfCacheMiss && stats.getColumnStatsState().equals(State.COMPLETE) && (!neededColsToRetrieve.isEmpty() || !partitionColsToRetrieve.isEmpty())) {
throw new HiveException("Cache has been loaded in logical planning phase for all columns; " + "however, stats for column some columns could not be retrieved from it " + "(see messages above)");
}
}
}
return stats;
}
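The non-partitioned branch above boils down to a small pattern: wrap a row count and data size in a Statistics object, attach the per-column ColStatistics, and set the column-stats state. Below is a minimal, self-contained sketch of that pattern using only calls that appear in the snippet (the two-argument Statistics constructor, addToColumnStats, setColumnStatsState); the helper name buildTableStats, the sample numbers, and the assumption that State is the nested Statistics.State enum are illustrative, not part of StatsUtils.

import java.util.List;
import org.apache.hadoop.hive.ql.plan.ColStatistics;
import org.apache.hadoop.hive.ql.plan.Statistics;
import org.apache.hadoop.hive.ql.plan.Statistics.State;
import com.google.common.collect.Lists;

public class StatisticsSketch {
    // Hypothetical helper mirroring the non-partitioned branch of collectStatistics:
    // wrap a row count and data size in a Statistics object and attach column stats.
    static Statistics buildTableStats(long numRows, long dataSize, List<ColStatistics> colStats) {
        Statistics stats = new Statistics(numRows, dataSize);   // basic stats: rows + size
        stats.addToColumnStats(colStats);                       // per-column statistics
        // mark column stats COMPLETE only if we actually have entries, otherwise NONE
        stats.setColumnStatsState(colStats.isEmpty() ? State.NONE : State.COMPLETE);
        return stats;
    }

    public static void main(String[] args) {
        Statistics s = buildTableStats(1000L, 128L * 1024L, Lists.<ColStatistics>newArrayList());
        System.out.println("rows=" + s.getNumRows() + ", dataSize=" + s.getDataSize());
    }
}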
Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class SparkMapJoinOptimizer, method getMapJoinConversionInfo.
/**
 * This method computes the map-join conversion info for a join. If the given join
 * cannot be converted to a map-join (this could happen for several reasons - one
 * of them being the presence of 2 or more big tables that cannot fit in memory),
 * the returned big table position is -1.
 *
 * Otherwise, the position is the index of the big table in the set
 * MapJoinProcessor.bigTableCandidateSet.
 *
 * @param joinOp the join operator being considered for conversion
 * @param context the Spark optimization context
 * @return an array of 3 long values: the first value is the big table position,
 * the second value is the connected map-join size, and the third is the combined data size of the small-table inputs.
 */
private long[] getMapJoinConversionInfo(JoinOperator joinOp, OptimizeSparkProcContext context) {
Set<Integer> bigTableCandidateSet = MapJoinProcessor.getBigTableCandidates(joinOp.getConf().getConds());
long maxSize = context.getConf().getLongVar(HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
int bigTablePosition = -1;
Statistics bigInputStat = null;
long totalSize = 0;
int pos = 0;
// bigTableFound means we've encountered a table that's bigger than the
// max. This table is either the big table or we cannot convert.
boolean bigTableFound = false;
boolean useTsStats = context.getConf().getBoolean(HiveConf.ConfVars.SPARK_USE_TS_STATS_FOR_MAPJOIN.varname, false);
// When using TS stats, first check whether any parent branch must be treated as the big table. If so, mark that branch as the big table branch.
if (useTsStats) {
LOG.debug("Checking map join optimization for operator {} using TS stats", joinOp);
for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
if (isBigTableBranch(parentOp)) {
if (bigTablePosition < 0 && bigTableCandidateSet.contains(pos) && !containUnionWithoutRS(parentOp.getParentOperators().get(0))) {
LOG.debug("Found a big table branch with parent operator {} and position {}", parentOp, pos);
bigTablePosition = pos;
bigTableFound = true;
bigInputStat = new Statistics(0, Long.MAX_VALUE);
} else {
// Either we've found multiple big table branches, or the current branch cannot
// be a big table branch. Disable mapjoin for these cases.
LOG.debug("Cannot enable map join optimization for operator {}", joinOp);
return new long[] { -1, 0, 0 };
}
}
pos++;
}
}
pos = 0;
for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
// Skip the potential big table identified above
if (pos == bigTablePosition) {
pos++;
continue;
}
Statistics currInputStat = null;
if (useTsStats) {
// Not adding other stats (e.g., # of rows, col stats) since only data size is used here
for (TableScanOperator root : OperatorUtils.findOperatorsUpstream(parentOp, TableScanOperator.class)) {
if (currInputStat == null) {
currInputStat = root.getStatistics().clone();
} else {
currInputStat.addBasicStats(root.getStatistics());
}
}
} else {
currInputStat = parentOp.getStatistics();
}
if (currInputStat == null) {
LOG.warn("Couldn't get statistics from: " + parentOp);
return new long[] { -1, 0, 0 };
}
// A union without a ReduceSink between it and the join cannot be handled by this conversion; handling it would require restructuring the plan. But, this is tricky to implement, and we'll leave it as future work for now.
if (containUnionWithoutRS(parentOp.getParentOperators().get(0))) {
return new long[] { -1, 0, 0 };
}
long inputSize = currInputStat.getDataSize();
if (bigInputStat == null || inputSize > bigInputStat.getDataSize()) {
if (bigTableFound) {
// we've already chosen a big table based on size and there's another one that's bigger, so we cannot convert.
return new long[] { -1, 0, 0 };
}
if (inputSize > maxSize) {
if (!bigTableCandidateSet.contains(pos)) {
// the current branch is not a big table candidate, yet it is too big for the map side.
return new long[] { -1, 0, 0 };
}
bigTableFound = true;
}
if (bigInputStat != null) {
// we're replacing the current big table with a new one. Need
// to count the current one as a map table then.
totalSize += bigInputStat.getDataSize();
}
if (totalSize > maxSize) {
// the combined small-table size exceeds the threshold, hence cannot convert.
return new long[] { -1, 0, 0 };
}
if (bigTableCandidateSet.contains(pos)) {
bigTablePosition = pos;
bigInputStat = currInputStat;
}
} else {
totalSize += currInputStat.getDataSize();
if (totalSize > maxSize) {
// cannot hold all map tables in memory. Cannot convert.
return new long[] { -1, 0, 0 };
}
}
pos++;
}
if (bigTablePosition == -1) {
// No big table candidates.
return new long[] { -1, 0, 0 };
}
// Final check, find size of already-calculated Mapjoin Operators in same work (spark-stage).
// We need to factor this in to prevent overwhelming Spark executor-memory.
long connectedMapJoinSize = getConnectedMapJoinSize(joinOp.getParentOperators().get(bigTablePosition), joinOp, context);
if ((connectedMapJoinSize + totalSize) > maxSize) {
return new long[] { -1, 0, 0 };
}
return new long[] { bigTablePosition, connectedMapJoinSize, totalSize };
}
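For context, the long[3] returned above is meant to be unpacked by the caller in SparkMapJoinOptimizer. The following is a minimal sketch of that unpacking with hypothetical sample values; it only shows how the three slots are interpreted, not the actual conversion logic.

public class MapJoinConversionInfoSketch {
    public static void main(String[] args) {
        // sample values standing in for the result of getMapJoinConversionInfo:
        // { big table position, connected map-join size, combined small-table data size }
        long[] info = new long[] { 1, 0L, 42_000_000L };
        int bigTablePosition = (int) info[0];
        if (bigTablePosition < 0) {
            System.out.println("join cannot be converted to a map join");
        } else {
            System.out.println("big table position=" + bigTablePosition
                + ", connected map-join size=" + info[1]
                + ", small-table data size=" + info[2]);
        }
    }
}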
Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class TezCompiler, method getBloomFilterCost.
private static double getBloomFilterCost(SelectOperator sel, FilterOperator fil) {
double cost = -1;
Statistics selStats = sel.getStatistics();
if (selStats != null) {
cost = selStats.getNumRows();
// Some other things that could be added here to model cost:
// Cost of computing/sending partial BloomFilter results? BloomFilterSize * # mappers
// For reduce-side join, add the cost of the semijoin table scan/dependent tablescans?
}
return cost;
}
Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class TezCompiler, method computeBloomFilterNetBenefit.
private static double computeBloomFilterNetBenefit(SelectOperator sel, ExprNodeDesc selExpr, FilterOperator fil, ExprNodeDesc tsExpr) {
double netBenefit = -1;
double benefit = getBloomFilterBenefit(sel, selExpr, fil, tsExpr);
Statistics filStats = fil.getStatistics();
if (benefit > 0 && filStats != null) {
double cost = getBloomFilterCost(sel, fil);
if (cost > 0) {
long filDataSize = filStats.getNumRows();
netBenefit = (benefit - cost) / filDataSize;
LOG.debug("BloomFilter benefit=" + benefit + ", cost=" + cost + ", tsDataSize=" + filDataSize + ", netBenefit=" + (benefit - cost));
}
}
LOG.debug("netBenefit=" + netBenefit);
return netBenefit;
}
Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class TezCompiler, method getBloomFilterBenefit.
private static double getBloomFilterBenefit(SelectOperator sel, ExprNodeDesc selExpr, FilterOperator fil, ExprNodeDesc tsExpr) {
double benefit = -1;
Statistics selStats = sel.getStatistics();
Statistics filStats = fil.getStatistics();
if (selStats == null || filStats == null) {
LOG.debug("No stats available to compute BloomFilter benefit");
return benefit;
}
// For cardinality values use numRows as default, try to use ColStats if available
long selKeyCardinality = selStats.getNumRows();
long tsKeyCardinality = filStats.getNumRows();
long tsRows = filStats.getNumRows();
long tsRowSize = filStats.getAvgRowSize();
long keyDomainCardinality = selKeyCardinality + tsKeyCardinality;
ExprNodeColumnDesc selCol = ExprNodeDescUtils.getColumnExpr(selExpr);
ExprNodeColumnDesc tsCol = ExprNodeDescUtils.getColumnExpr(tsExpr);
if (selCol != null && tsCol != null) {
// Check if there are column stats available for these columns
ColStatistics selColStat = selStats.getColumnStatisticsFromColName(selCol.getColumn());
ColStatistics filColStat = filStats.getColumnStatisticsFromColName(tsCol.getColumn());
if (canUseNDV(selColStat)) {
selKeyCardinality = selColStat.getCountDistint();
}
if (canUseNDV(filColStat)) {
tsKeyCardinality = filColStat.getCountDistint();
}
// Get colstats for the original table column for selCol if possible, this would have
// more accurate information about the original NDV of the column before any filtering.
ColStatistics selColSourceStat = null;
if (selColStat != null) {
ExprNodeDescUtils.ColumnOrigin selColSource = ExprNodeDescUtils.findColumnOrigin(selCol, sel);
if (selColSource != null && selColSource.op.getStatistics() != null) {
selColSourceStat = selColSource.op.getStatistics().getColumnStatisticsFromColName(selColSource.col.getColumn());
}
}
long domainCardinalityFromColStats = getCombinedKeyDomainCardinality(selColStat, selColSourceStat, filColStat);
if (domainCardinalityFromColStats >= 0) {
keyDomainCardinality = domainCardinalityFromColStats;
}
}
// Selectivity: key cardinality of semijoin / domain cardinality
// Benefit (rows filtered from ts): (1 - selectivity) * # ts rows
double selectivity = selKeyCardinality / (double) keyDomainCardinality;
selectivity = Math.min(selectivity, 1);
benefit = tsRows * (1 - selectivity);
if (LOG.isDebugEnabled()) {
LOG.debug("BloomFilter benefit for " + selCol + " to " + tsCol + ", selKeyCardinality=" + selKeyCardinality + ", tsKeyCardinality=" + tsKeyCardinality + ", tsRows=" + tsRows + ", keyDomainCardinality=" + keyDomainCardinality);
LOG.debug("SemiJoin key selectivity=" + selectivity + ", benefit=" + benefit);
}
return benefit;
}
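Putting the three TezCompiler helpers together: getBloomFilterCost returns the semijoin branch's row count, getBloomFilterBenefit returns the expected number of probe-side rows filtered out, and computeBloomFilterNetBenefit normalizes their difference by the probe-side row count. Below is a small arithmetic sketch of those formulas; the cardinalities and row counts are made-up values for illustration only.

public class BloomFilterBenefitSketch {
    public static void main(String[] args) {
        long selKeyCardinality = 10_000L;        // NDV of the semijoin key on the SEL side
        long keyDomainCardinality = 1_000_000L;  // combined key domain cardinality
        long tsRows = 50_000_000L;               // rows on the probe (TS/FIL) side
        double cost = 10_000d;                   // rows produced by the semijoin branch (getBloomFilterCost)

        // selectivity = key cardinality of semijoin / domain cardinality, capped at 1
        double selectivity = Math.min((double) selKeyCardinality / keyDomainCardinality, 1d);
        // benefit = rows expected to be filtered from the table scan
        double benefit = tsRows * (1 - selectivity);    // 49,500,000
        // net benefit = (benefit - cost) / probe-side rows
        double netBenefit = (benefit - cost) / tsRows;  // ~0.9898
        System.out.printf("selectivity=%.4f benefit=%.0f netBenefit=%.4f%n",
            selectivity, benefit, netBenefit);
    }
}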