Use of org.apache.hadoop.hive.ql.stats.BasicStats.Factory in project hive by apache.
The class StatsUtils, method collectStatistics.
/**
 * Collects basic statistics (row count, data size, total file size, number of
 * erasure coded files) and, when requested, column statistics for a table.
 *
 * <p>For a non-partitioned table the basic stats are built for the table itself.
 * For a partitioned table with a non-null {@code partList}, the basic stats of all
 * non-denied partitions are built and aggregated, and column stats are taken from
 * {@code colStatsCache} first and from the metastore only for the columns that the
 * cache could not serve.
 *
 * @param conf configuration; {@code HIVE_STATS_FETCH_COLUMN_STATS} and
 *          {@code HIVE_STATS_ESTIMATE_STATS} are read from it
 * @param partList pruned partition list; only consulted for partitioned tables
 * @param table table to collect statistics for
 * @param schema column infos used for row-size estimation and column stats
 * @param neededColumns columns for which column stats are wanted
 * @param colStatsCache cache of already known column stats
 * @param referencedColumns columns used when deriving the column stats state of a
 *          partitioned table
 * @param needColStats whether column stats should be collected at all
 * @param failIfCacheMiss whether to fail when the stats are complete although some
 *          columns had to be looked up outside of the cache
 * @return the collected statistics, or {@code null} when the table is partitioned
 *         and {@code partList} is {@code null}
 * @throws HiveException if {@code failIfCacheMiss} is set, a cache was supplied and
 *           complete column stats were obtained although some columns were missing
 *           from the cache (other causes may propagate from the helpers called here)
 */
private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache, List<String> referencedColumns, boolean needColStats, boolean failIfCacheMiss) throws HiveException {
  Statistics stats = null;
  boolean fetchColStats = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
  boolean estimateStats = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS);
  if (!table.isPartitioned()) {
    Factory basicStatsFactory = new BasicStats.Factory();
    if (estimateStats) {
      basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
    }
    // long ds = shouldEstimateStats? getDataSize(conf, table): getRawDataSize(table);
    basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
    basicStatsFactory.addEnhancer(new BasicStats.SetMinRowNumber01());
    BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table));
    // long nr = getNumRows(conf, schema, neededColumns, table, ds);
    long ds = basicStats.getDataSize();
    long nr = basicStats.getNumRows();
    long fs = basicStats.getTotalFileSize();
    List<ColStatistics> colStats = Collections.emptyList();
    long numErasureCodedFiles = getErasureCodedFiles(table);
    if (needColStats) {
      colStats = getTableColumnStats(table, schema, neededColumns, colStatsCache, fetchColStats);
      if (estimateStats) {
        estimateStatsForMissingCols(neededColumns, colStats, table, conf, nr, schema);
      }
      // we should have stats for all columns (estimated or actual)
      if (neededColumns.size() == colStats.size()) {
        // prefer the data size derived from column stats when it is usable
        long betterDS = getDataSizeFromColumnStats(nr, colStats);
        ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
      }
    }
    stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
    // infer if any column can be primary key based on column statistics
    inferAndSetPrimaryKey(stats.getNumRows(), colStats);
    stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
    stats.addToColumnStats(colStats);
  } else if (partList != null) {
    // For partitioned tables, get the size of all the partitions after pruning
    // the partitions that are not required
    Factory basicStatsFactory = new Factory();
    if (estimateStats) {
      // FIXME: misses parallel
      basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
    }
    basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
    List<BasicStats> partStats = new ArrayList<>();
    for (Partition p : partList.getNotDeniedPartns()) {
      BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table, p));
      partStats.add(basicStats);
    }
    BasicStats bbs = BasicStats.buildFrom(partStats);
    long nr = bbs.getNumRows();
    long ds = bbs.getDataSize();
    long fs = bbs.getTotalFileSize();
    List<Long> erasureCodedFiles = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.NUM_ERASURE_CODED_FILES);
    long numErasureCodedFiles = getSumIgnoreNegatives(erasureCodedFiles);
    // never report zero rows; use one row as the minimum
    if (nr == 0) {
      nr = 1;
    }
    stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
    stats.setBasicStatsState(bbs.getState());
    if (nr > 0) {
      // FIXME: this promotion process should be removed later
      if (State.PARTIAL.morePreciseThan(bbs.getState())) {
        stats.setBasicStatsState(State.PARTIAL);
      }
    }
    if (needColStats) {
      List<String> partitionCols = getPartitionColumns(schema, neededColumns, referencedColumns);
      // We will retrieve stats from the metastore only for columns that are not cached
      List<ColStatistics> columnStats = new ArrayList<>();
      List<String> neededColsToRetrieve = extractColumnStates(table, neededColumns, colStatsCache, columnStats);
      List<String> partitionColsToRetrieve = extractColumnStates(table, partitionCols, colStatsCache, columnStats);
      // List of partitions
      List<String> partNames = new ArrayList<>(partList.getNotDeniedPartns().size());
      for (Partition part : partList.getNotDeniedPartns()) {
        partNames.add(part.getName());
      }
      AggrStats aggrStats = null;
      // Only contact the metastore when fetching is enabled and there are both
      // columns and partitions to ask about; otherwise skip that step.
      if (fetchColStats && !neededColsToRetrieve.isEmpty() && !partNames.isEmpty()) {
        aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColsToRetrieve, partNames, false);
      }
      boolean statsRetrieved = aggrStats != null && aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0;
      if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() && !statsRetrieved)) {
        estimateStatsForMissingCols(neededColsToRetrieve, columnStats, table, conf, nr, schema);
        // There are some partitions with no state (or we didn't fetch any state).
        // Update the stats with empty list to reflect that in the
        // state/initialize structures.
        // add partition column stats
        addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
        // FIXME: this add seems suspicious...10 lines below the value returned by this method used as betterDS
        stats.addToDataSize(getDataSizeFromColumnStats(nr, columnStats));
        stats.updateColumnStatsState(deriveStatType(columnStats, referencedColumns));
        stats.addToColumnStats(columnStats);
      } else {
        if (statsRetrieved) {
          columnStats.addAll(convertColStats(aggrStats.getColStats(), table.getTableName()));
        }
        // Number of column stats expected at this point: all needed columns plus
        // the partition columns that were already served from the cache.
        int colStatsAvailable = neededColumns.size() + partitionCols.size() - partitionColsToRetrieve.size();
        if (columnStats.size() != colStatsAvailable) {
          // expected count first, obtained count second, matching the message text
          LOG.debug("Column stats requested for : {} columns. Able to retrieve for {} columns", colStatsAvailable, columnStats.size());
        }
        addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
        long betterDS = getDataSizeFromColumnStats(nr, columnStats);
        stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
        // infer if any column can be primary key based on column statistics
        inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
        stats.addToColumnStats(columnStats);
        // Infer column stats state
        stats.setColumnStatsState(deriveStatType(columnStats, referencedColumns));
        if (neededColumns.size() != neededColsToRetrieve.size() || partitionCols.size() != partitionColsToRetrieve.size()) {
          // Include state for cached columns
          stats.updateColumnStatsState(colStatsCache.getState());
        }
        // Change if we could not retrieve for all partitions
        if (aggrStats != null && aggrStats.getPartsFound() != partNames.size() && stats.getColumnStatsState() != State.NONE) {
          stats.updateColumnStatsState(State.PARTIAL);
          LOG.debug("Column stats requested for : {} partitions. Able to retrieve for {} partitions", partNames.size(), aggrStats.getPartsFound());
        }
      }
      if (partStats.isEmpty()) {
        // all partitions are filtered by partition pruning
        stats.setBasicStatsState(State.COMPLETE);
      }
      // Cache sanity check: when the caller asked for it, fail if complete stats were
      // produced although some columns still had to be looked up outside of the cache.
      if (colStatsCache != null && failIfCacheMiss && stats.getColumnStatsState().equals(State.COMPLETE) && (!neededColsToRetrieve.isEmpty() || !partitionColsToRetrieve.isEmpty())) {
        throw new HiveException("Cache has been loaded in logical planning phase for all columns; " + "however, stats for some columns could not be retrieved from it " + "(see messages above)");
      }
    }
  }
  return stats;
}
Use of org.apache.hadoop.hive.ql.stats.BasicStats.Factory in project hive by apache.
The class StatsUtils, method getNumRows.
/**
 * Computes the row count of a table, or of the non-denied partitions of a
 * partitioned table. When {@code HIVE_STATS_ESTIMATE_STATS} is enabled, data-size
 * and row-number estimators are registered so that a missing row count can be
 * estimated. RelOptHiveTable's getRowCount uses this.
 *
 * @param conf configuration consulted for the estimation flag and by the estimators
 * @param schema column infos used to estimate the size of a row
 * @param table table whose row count is wanted
 * @param partitionList pruned partitions; only read when the table is partitioned
 * @param noColsMissingStats counter incremented once for every table/partition
 *          whose directly built basic stats report a row count of zero or less
 * @return the aggregated row count after {@code SetMinRowNumber01} has been applied
 */
public static long getNumRows(HiveConf conf, List<ColumnInfo> schema, Table table, PrunedPartitionList partitionList, AtomicInteger noColsMissingStats) {
  // One Partish per unit to measure: the table itself, or each surviving partition.
  final List<Partish> targets = new ArrayList<>();
  if (!table.isPartitioned()) {
    targets.add(Partish.buildFor(table));
  } else {
    for (Partition partition : partitionList.getNotDeniedPartns()) {
      targets.add(Partish.buildFor(table, partition));
    }
  }

  final Factory statsFactory = new BasicStats.Factory();
  final boolean estimationEnabled = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS);
  if (estimationEnabled) {
    statsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
    statsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
  }

  // Count the inputs lacking a row count, using stats built without the factory.
  // FIXME: this point will be lost after the factory; check that it's really a warning....cleanup/etc
  for (Partish target : targets) {
    final long rawRowCount = new BasicStats(target).getNumRows();
    if (rawRowCount <= 0) {
      // log warning if row count is missing
      noColsMissingStats.getAndIncrement();
    }
  }

  final List<BasicStats> perTargetStats = statsFactory.buildAll(conf, targets);
  final BasicStats combined = BasicStats.buildFrom(perTargetStats);
  combined.apply(new BasicStats.SetMinRowNumber01());
  return combined.getNumRows();
}
Aggregations