Use of org.apache.hadoop.hive.ql.parse.ColumnStatsList in project hive by apache.
The class StatsUtils, method collectStatistics:
private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache, List<String> referencedColumns, boolean needColStats, boolean failIfCacheMiss) throws HiveException {
  Statistics stats = null;
  boolean fetchColStats = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
  boolean estimateStats = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS);
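  // Unpartitioned case: basic stats come from the table itself; the
  // ColumnStatsList cache is consulted inside getTableColumnStats below.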
  if (!table.isPartitioned()) {
    Factory basicStatsFactory = new BasicStats.Factory();
    if (estimateStats) {
      basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
    }
    // long ds = shouldEstimateStats? getDataSize(conf, table): getRawDataSize(table);
    basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
    basicStatsFactory.addEnhancer(new BasicStats.SetMinRowNumber01());
    BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table));
    // long nr = getNumRows(conf, schema, neededColumns, table, ds);
    long ds = basicStats.getDataSize();
    long nr = basicStats.getNumRows();
    long fs = basicStats.getTotalFileSize();
    List<ColStatistics> colStats = Collections.emptyList();
    long numErasureCodedFiles = getErasureCodedFiles(table);
    if (needColStats) {
      colStats = getTableColumnStats(table, schema, neededColumns, colStatsCache, fetchColStats);
      if (estimateStats) {
        estimateStatsForMissingCols(neededColumns, colStats, table, conf, nr, schema);
      }
      // we should have stats for all columns (estimated or actual)
      if (neededColumns.size() == colStats.size()) {
        long betterDS = getDataSizeFromColumnStats(nr, colStats);
        ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
      }
    }
    stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
    // infer if any column can be primary key based on column statistics
    inferAndSetPrimaryKey(stats.getNumRows(), colStats);
    stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
    stats.addToColumnStats(colStats);
  } else if (partList != null) {
    // For partitioned tables, get the size of all the partitions after pruning
    // the partitions that are not required
    Factory basicStatsFactory = new Factory();
    if (estimateStats) {
      // FIXME: misses parallel
      basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
    }
    basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
    List<BasicStats> partStats = new ArrayList<>();
    for (Partition p : partList.getNotDeniedPartns()) {
      BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table, p));
      partStats.add(basicStats);
    }
    BasicStats bbs = BasicStats.buildFrom(partStats);
    long nr = bbs.getNumRows();
    long ds = bbs.getDataSize();
    long fs = bbs.getTotalFileSize();
    List<Long> erasureCodedFiles = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.NUM_ERASURE_CODED_FILES);
    long numErasureCodedFiles = getSumIgnoreNegatives(erasureCodedFiles);
    if (nr == 0) {
      nr = 1;
    }
    stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
    stats.setBasicStatsState(bbs.getState());
    if (nr > 0) {
      // FIXME: this promotion process should be removed later
      if (State.PARTIAL.morePreciseThan(bbs.getState())) {
        stats.setBasicStatsState(State.PARTIAL);
      }
    }
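    // Column-level stats for the surviving partitions: cached columns are
    // served from colStatsCache; the rest are aggregated from the metastore.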
    if (needColStats) {
      List<String> partitionCols = getPartitionColumns(schema, neededColumns, referencedColumns);
      // We will retrieve stats from the metastore only for columns that are not cached
      List<ColStatistics> columnStats = new ArrayList<>();
      List<String> neededColsToRetrieve = extractColumnStates(table, neededColumns, colStatsCache, columnStats);
      List<String> partitionColsToRetrieve = extractColumnStates(table, partitionCols, colStatsCache, columnStats);
      // List of partitions
      List<String> partNames = new ArrayList<>(partList.getNotDeniedPartns().size());
      for (Partition part : partList.getNotDeniedPartns()) {
        partNames.add(part.getName());
      }
      AggrStats aggrStats = null;
      // If there is nothing to retrieve, we can skip the step to connect to the metastore.
      if (fetchColStats && !neededColsToRetrieve.isEmpty() && !partNames.isEmpty()) {
        aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColsToRetrieve, partNames, false);
      }
      boolean statsRetrieved = aggrStats != null && aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0;
      if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() && !statsRetrieved)) {
        estimateStatsForMissingCols(neededColsToRetrieve, columnStats, table, conf, nr, schema);
        // There are some partitions with no state (or we didn't fetch any state).
        // Update the stats with empty list to reflect that in the
        // state/initialize structures.
        // add partition column stats
        addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
        // FIXME: this add seems suspicious...10 lines below the value returned by this method used as betterDS
        stats.addToDataSize(getDataSizeFromColumnStats(nr, columnStats));
        stats.updateColumnStatsState(deriveStatType(columnStats, referencedColumns));
        stats.addToColumnStats(columnStats);
      } else {
        if (statsRetrieved) {
          columnStats.addAll(convertColStats(aggrStats.getColStats(), table.getTableName()));
        }
        int colStatsAvailable = neededColumns.size() + partitionCols.size() - partitionColsToRetrieve.size();
        if (columnStats.size() != colStatsAvailable) {
          LOG.debug("Column stats requested for : {} columns. Able to retrieve for {} columns", columnStats.size(), colStatsAvailable);
        }
        addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
        long betterDS = getDataSizeFromColumnStats(nr, columnStats);
        stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
        // infer if any column can be primary key based on column statistics
        inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
        stats.addToColumnStats(columnStats);
        // Infer column stats state
        stats.setColumnStatsState(deriveStatType(columnStats, referencedColumns));
        if (neededColumns.size() != neededColsToRetrieve.size() || partitionCols.size() != partitionColsToRetrieve.size()) {
          // Include state for cached columns
          stats.updateColumnStatsState(colStatsCache.getState());
        }
        // Change if we could not retrieve for all partitions
        if (aggrStats != null && aggrStats.getPartsFound() != partNames.size() && stats.getColumnStatsState() != State.NONE) {
          stats.updateColumnStatsState(State.PARTIAL);
          LOG.debug("Column stats requested for : {} partitions. Able to retrieve for {} partitions", partNames.size(), aggrStats.getPartsFound());
        }
      }
      if (partStats.isEmpty()) {
        // all partitions are filtered by partition pruning
        stats.setBasicStatsState(State.COMPLETE);
      }
      // Sanity check for the cache: when failIfCacheMiss is set, we expect to have
      // read stats from the metastore only once.
      if (colStatsCache != null && failIfCacheMiss && stats.getColumnStatsState().equals(State.COMPLETE) && (!neededColsToRetrieve.isEmpty() || !partitionColsToRetrieve.isEmpty())) {
        throw new HiveException("Cache has been loaded in logical planning phase for all columns; " + "however, stats for some columns could not be retrieved from it " + "(see messages above)");
      }
    }
  }
  return stats;
}
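In this method the ColumnStatsList argument acts as a plan-time cache: extractColumnStates serves cached columns into columnStats, so getAggrColStatsFor is only called for the names that remain. Below is a minimal sketch of that cache-first split, not Hive source; the helper name splitCachedAndMissing and the getColStats() map accessor on ColumnStatsList are assumptions for illustration.

// Minimal sketch (assumed helper, not part of StatsUtils): serve column stats
// from the ColumnStatsList cache and return the names that still need a
// metastore fetch. getColStats() as a map accessor is an assumption.
private static List<String> splitCachedAndMissing(List<String> neededColumns,
    ColumnStatsList colStatsCache, List<ColStatistics> cachedOut) {
  List<String> missing = new ArrayList<>();
  for (String col : neededColumns) {
    ColStatistics cs = (colStatsCache == null) ? null : colStatsCache.getColStats().get(col);
    if (cs != null) {
      cachedOut.add(cs); // cache hit: stats were collected earlier in planning
    } else {
      missing.add(col); // cache miss: this column must come from the metastore
    }
  }
  return missing;
}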
Use of org.apache.hadoop.hive.ql.parse.ColumnStatsList in project hive by apache.
The class RelOptHiveTable, method updateColStats:
private void updateColStats(Set<Integer> projIndxLst, boolean allowMissingStats) {
  List<String> nonPartColNamesThatRqrStats = new ArrayList<String>();
  List<Integer> nonPartColIndxsThatRqrStats = new ArrayList<Integer>();
  List<String> partColNamesThatRqrStats = new ArrayList<String>();
  List<Integer> partColIndxsThatRqrStats = new ArrayList<Integer>();
  Set<String> colNamesFailedStats = new HashSet<String>();
  // 1. Separate required columns into Non Partition and Partition Cols
  ColumnInfo tmp;
  for (Integer pi : projIndxLst) {
    if (hiveColStatsMap.get(pi) == null) {
      if ((tmp = hiveNonPartitionColsMap.get(pi)) != null) {
        nonPartColNamesThatRqrStats.add(tmp.getInternalName());
        nonPartColIndxsThatRqrStats.add(pi);
      } else if ((tmp = hivePartitionColsMap.get(pi)) != null) {
        partColNamesThatRqrStats.add(tmp.getInternalName());
        partColIndxsThatRqrStats.add(pi);
      } else {
        noColsMissingStats.getAndIncrement();
        String logMsg = "Unable to find Column Index: " + pi + ", in " + hiveTblMetadata.getCompleteName();
        LOG.error(logMsg);
        throw new RuntimeException(logMsg);
      }
    }
  }
  if (null == partitionList) {
    // We could be here either because it's an unpartitioned table or because
    // there are no pruning predicates on a partitioned table.
    computePartitionList(hiveConf, null, new HashSet<Integer>());
  }
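  // One ColumnStatsList per pruned-partition list: look it up by the partition
  // list key and create an empty cache entry on first use.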
  String partitionListKey = partitionList.getKey().orElse(null);
  ColumnStatsList colStatsCached = colStatsCache.get(partitionListKey);
  if (colStatsCached == null) {
    colStatsCached = new ColumnStatsList();
    colStatsCache.put(partitionListKey, colStatsCached);
  }
  // 2. Obtain Col Stats for Non Partition Cols
  if (nonPartColNamesThatRqrStats.size() > 0) {
    List<ColStatistics> hiveColStats = new ArrayList<ColStatistics>();
    if (!hiveTblMetadata.isPartitioned()) {
      // 2.1 Handle the case for unpartitioned table.
      try {
        Statistics stats = StatsUtils.collectStatistics(hiveConf, null, hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats, colStatsCached, nonPartColNamesThatRqrStats, true);
        rowCount = stats.getNumRows();
        for (String c : nonPartColNamesThatRqrStats) {
          ColStatistics cs = stats.getColumnStatisticsFromColName(c);
          if (cs != null) {
            hiveColStats.add(cs);
          }
        }
        colStatsCached.updateState(stats.getColumnStatsState());
        // 2.1.1 Record column names that we needed stats for but couldn't obtain
        if (hiveColStats.isEmpty()) {
          colNamesFailedStats.addAll(nonPartColNamesThatRqrStats);
        } else if (hiveColStats.size() != nonPartColNamesThatRqrStats.size()) {
          Set<String> setOfFiledCols = new HashSet<String>(nonPartColNamesThatRqrStats);
          Set<String> setOfObtainedColStats = new HashSet<String>();
          for (ColStatistics cs : hiveColStats) {
            setOfObtainedColStats.add(cs.getColumnName());
          }
          setOfFiledCols.removeAll(setOfObtainedColStats);
          colNamesFailedStats.addAll(setOfFiledCols);
        } else {
          // Column stats in hiveColStats might not be in the same order as the columns in
          // nonPartColNamesThatRqrStats. Reorder hiveColStats so we can build hiveColStatsMap
          // using nonPartColIndxsThatRqrStats as below
          Map<String, ColStatistics> columnStatsMap = new HashMap<String, ColStatistics>(hiveColStats.size());
          for (ColStatistics cs : hiveColStats) {
            columnStatsMap.put(cs.getColumnName(), cs);
            // estimated stats mean the real stats are not available
            if (cs.isEstimated()) {
              colNamesFailedStats.add(cs.getColumnName());
            }
          }
          hiveColStats.clear();
          for (String colName : nonPartColNamesThatRqrStats) {
            hiveColStats.add(columnStatsMap.get(colName));
          }
        }
      } catch (HiveException e) {
        String logMsg = "Collecting stats for table: " + hiveTblMetadata.getTableName() + " failed.";
        LOG.error(logMsg, e);
        throw new RuntimeException(logMsg, e);
      }
    } else {
      // 2.2 Obtain col stats for partitioned table.
      try {
        if (partitionList.getNotDeniedPartns().isEmpty()) {
          // no need to make a metastore call
          rowCount = 0;
          hiveColStats = new ArrayList<ColStatistics>();
          for (int i = 0; i < nonPartColNamesThatRqrStats.size(); i++) {
            // add empty stats object for each column
            hiveColStats.add(new ColStatistics(nonPartColNamesThatRqrStats.get(i), hiveNonPartitionColsMap.get(nonPartColIndxsThatRqrStats.get(i)).getTypeName()));
          }
          colNamesFailedStats.clear();
          colStatsCached.updateState(State.COMPLETE);
        } else {
          Statistics stats = StatsUtils.collectStatistics(hiveConf, partitionList, hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats, colStatsCached, nonPartColNamesThatRqrStats, true);
          rowCount = stats.getNumRows();
          hiveColStats = new ArrayList<ColStatistics>();
          for (String c : nonPartColNamesThatRqrStats) {
            ColStatistics cs = stats.getColumnStatisticsFromColName(c);
            if (cs != null) {
              hiveColStats.add(cs);
              if (cs.isEstimated()) {
                colNamesFailedStats.add(c);
              }
            } else {
              colNamesFailedStats.add(c);
            }
          }
          colStatsCached.updateState(stats.getColumnStatsState());
        }
      } catch (HiveException e) {
        String logMsg = "Collecting stats failed.";
        LOG.error(logMsg, e);
        throw new RuntimeException(logMsg, e);
      }
    }
    if (hiveColStats != null && hiveColStats.size() == nonPartColNamesThatRqrStats.size()) {
      for (int i = 0; i < hiveColStats.size(); i++) {
        // the columns in nonPartColIndxsThatRqrStats/nonPartColNamesThatRqrStats/hiveColStats
        // are in the same order
        hiveColStatsMap.put(nonPartColIndxsThatRqrStats.get(i), hiveColStats.get(i));
        colStatsCached.put(hiveColStats.get(i).getColumnName(), hiveColStats.get(i));
        if (LOG.isDebugEnabled()) {
          LOG.debug("Stats for column " + hiveColStats.get(i).getColumnName() + " in table " + hiveTblMetadata.getTableName() + " stored in cache");
          LOG.debug(hiveColStats.get(i).toString());
        }
      }
    }
  }
  // 3. Obtain Stats for Partition Cols
  if (colNamesFailedStats.isEmpty() && !partColNamesThatRqrStats.isEmpty()) {
    ColStatistics cStats = null;
    for (int i = 0; i < partColNamesThatRqrStats.size(); i++) {
      cStats = StatsUtils.getColStatsForPartCol(hivePartitionColsMap.get(partColIndxsThatRqrStats.get(i)), new PartitionIterable(partitionList.getNotDeniedPartns()), hiveConf);
      hiveColStatsMap.put(partColIndxsThatRqrStats.get(i), cStats);
      colStatsCached.put(cStats.getColumnName(), cStats);
      if (LOG.isDebugEnabled()) {
        LOG.debug("Stats for column " + cStats.getColumnName() + " in table " + hiveTblMetadata.getTableName() + " stored in cache");
        LOG.debug(cStats.toString());
      }
    }
  }
  // 4. Warn the user if we could not get stats for all required columns
  if (!colNamesFailedStats.isEmpty()) {
    String logMsg = "No Stats for " + hiveTblMetadata.getCompleteName() + ", Columns: " + getColNamesForLogging(colNamesFailedStats);
    noColsMissingStats.getAndAdd(colNamesFailedStats.size());
    if (allowMissingStats) {
      LOG.warn(logMsg);
      HiveConf conf = SessionState.getSessionConf();
      if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_SHOW_WARNINGS)) {
        LogHelper console = SessionState.getConsole();
        console.printInfo(logMsg);
      }
    } else {
      LOG.error(logMsg);
      throw new RuntimeException(logMsg);
    }
  }
}
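updateColStats is also where the cache gets created and filled: one ColumnStatsList per pruned-partition-list key, populated with every ColStatistics obtained so that later planning steps can skip the metastore. A condensed sketch of that lifecycle, restricted to calls visible in the snippet above (the neededCols variable is illustrative):

// Condensed sketch of the per-partition-list cache lifecycle shown above.
String key = partitionList.getKey().orElse(null);
ColumnStatsList cached = colStatsCache.get(key);
if (cached == null) {
  cached = new ColumnStatsList(); // first pruning with this key: start empty
  colStatsCache.put(key, cached);
}
Statistics stats = StatsUtils.collectStatistics(hiveConf, partitionList, hiveTblMetadata,
    hiveNonPartitionCols, neededCols, cached, neededCols, true);
for (String c : neededCols) {
  ColStatistics cs = stats.getColumnStatisticsFromColName(c);
  if (cs != null) {
    cached.put(cs.getColumnName(), cs); // store each column's stats for reuse
  }
}
cached.updateState(stats.getColumnStatsState());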