Use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.
From the class CachedStore, method prewarm:
/**
 * Initializes the caches in SharedCache by getting the objects from the Metastore DB via
 * ObjectStore and populating the respective caches.
 *
 * @param rawStore
 */
@VisibleForTesting
static void prewarm(RawStore rawStore) {
  if (isCachePrewarmed.get()) {
    return;
  }
  long startTime = System.nanoTime();
  LOG.info("Prewarming CachedStore");
  while (!isCachePrewarmed.get()) {
    // Prevents throwing exceptions in our raw store calls since we're not using RawStoreProxy
    Deadline.registerIfNot(1000000);
    List<String> dbNames;
    try {
      dbNames = rawStore.getAllDatabases();
    } catch (MetaException e) {
      // Try again
      continue;
    }
    LOG.info("Number of databases to prewarm: {}", dbNames.size());
    List<Database> databases = new ArrayList<>(dbNames.size());
    for (String dbName : dbNames) {
      try {
        databases.add(rawStore.getDatabase(dbName));
      } catch (NoSuchObjectException e) {
        // Continue with next database
        continue;
      }
    }
    sharedCache.populateDatabasesInCache(databases);
    LOG.debug("Databases cache is now prewarmed. Now adding tables, partitions and statistics to the cache");
    int numberOfDatabasesCachedSoFar = 0;
    for (String dbName : dbNames) {
      dbName = StringUtils.normalizeIdentifier(dbName);
      List<String> tblNames;
      try {
        tblNames = rawStore.getAllTables(dbName);
      } catch (MetaException e) {
        // Continue with next database
        continue;
      }
      int numberOfTablesCachedSoFar = 0;
      for (String tblName : tblNames) {
        tblName = StringUtils.normalizeIdentifier(tblName);
        if (!shouldCacheTable(dbName, tblName)) {
          continue;
        }
        Table table;
        try {
          table = rawStore.getTable(dbName, tblName);
        } catch (MetaException e) {
          // Continue with next table
          continue;
        }
        List<String> colNames = MetaStoreUtils.getColumnNamesForTable(table);
        try {
          ColumnStatistics tableColStats = null;
          List<Partition> partitions = null;
          List<ColumnStatistics> partitionColStats = null;
          AggrStats aggrStatsAllPartitions = null;
          AggrStats aggrStatsAllButDefaultPartition = null;
          if (table.isSetPartitionKeys()) {
            Deadline.startTimer("getPartitions");
            partitions = rawStore.getPartitions(dbName, tblName, Integer.MAX_VALUE);
            Deadline.stopTimer();
            List<String> partNames = new ArrayList<>(partitions.size());
            for (Partition p : partitions) {
              partNames.add(Warehouse.makePartName(table.getPartitionKeys(), p.getValues()));
            }
            if (!partNames.isEmpty()) {
              // Get partition column stats for this table
              Deadline.startTimer("getPartitionColumnStatistics");
              partitionColStats = rawStore.getPartitionColumnStatistics(dbName, tblName, partNames, colNames);
              Deadline.stopTimer();
              // Get aggregate stats for all partitions of the table, and for all but the default partition
              Deadline.startTimer("getAggrPartitionColumnStatistics");
              aggrStatsAllPartitions = rawStore.get_aggr_stats_for(dbName, tblName, partNames, colNames);
              Deadline.stopTimer();
              // Remove the default partition from the partition names and get aggregate stats again
              List<FieldSchema> partKeys = table.getPartitionKeys();
              String defaultPartitionValue = MetastoreConf.getVar(rawStore.getConf(), ConfVars.DEFAULTPARTITIONNAME);
              List<String> partCols = new ArrayList<>();
              List<String> partVals = new ArrayList<>();
              for (FieldSchema fs : partKeys) {
                partCols.add(fs.getName());
                partVals.add(defaultPartitionValue);
              }
              String defaultPartitionName = FileUtils.makePartName(partCols, partVals);
              partNames.remove(defaultPartitionName);
              Deadline.startTimer("getAggrPartitionColumnStatistics");
              aggrStatsAllButDefaultPartition = rawStore.get_aggr_stats_for(dbName, tblName, partNames, colNames);
              Deadline.stopTimer();
            }
          } else {
            Deadline.startTimer("getTableColumnStatistics");
            tableColStats = rawStore.getTableColumnStatistics(dbName, tblName, colNames);
            Deadline.stopTimer();
          }
          sharedCache.populateTableInCache(table, tableColStats, partitions, partitionColStats,
              aggrStatsAllPartitions, aggrStatsAllButDefaultPartition);
        } catch (MetaException | NoSuchObjectException e) {
          // Continue with next table
          continue;
        }
        LOG.debug("Processed database: {}'s table: {}. Cached {} / {} tables so far.", dbName, tblName,
            ++numberOfTablesCachedSoFar, tblNames.size());
      }
      LOG.debug("Processed database: {}. Cached {} / {} databases so far.", dbName,
          ++numberOfDatabasesCachedSoFar, dbNames.size());
    }
    isCachePrewarmed.set(true);
  }
  LOG.info("CachedStore initialized");
  long endTime = System.nanoTime();
  LOG.info("Time taken in prewarming = " + (endTime - startTime) / 1000000 + "ms");
  sharedCache.completeTableCachePrewarm();
}
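For reference, AggrStats is the Thrift-generated struct returned by get_aggr_stats_for above: it pairs the aggregated per-column statistics (getColStats()) with the number of partitions that contributed to the aggregation (getPartsFound()). Below is a minimal sketch of unpacking one; the dump helper and the bigint-only assumption are illustrative, not part of CachedStore.

import java.util.List;

import org.apache.hadoop.hive.metastore.api.AggrStats;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;

public class AggrStatsInspector {

  // Prints the aggregate statistics carried by an AggrStats result. Only the
  // longStats arm of the ColumnStatisticsData union is handled here; real
  // columns may instead set doubleStats, stringStats, decimalStats, etc.
  static void dump(AggrStats aggrStats) {
    System.out.println("Aggregated over " + aggrStats.getPartsFound() + " partitions");
    List<ColumnStatisticsObj> cols = aggrStats.getColStats();
    for (ColumnStatisticsObj cso : cols) {
      if (cso.getStatsData().isSetLongStats()) {
        LongColumnStatsData ls = cso.getStatsData().getLongStats();
        System.out.println(cso.getColName() + " (" + cso.getColType() + "): ndv=" + ls.getNumDVs()
            + ", nulls=" + ls.getNumNulls() + ", range=[" + ls.getLowValue() + ", " + ls.getHighValue() + "]");
      }
    }
  }
}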
Use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.
From the class StatsUtils, method collectStatistics:
private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table,
    List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache,
    List<String> referencedColumns, boolean needColStats, boolean failIfCacheMiss) throws HiveException {
  Statistics stats = null;
  boolean fetchColStats = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
  boolean estimateStats = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS);
  if (!table.isPartitioned()) {
    Factory basicStatsFactory = new BasicStats.Factory();
    if (estimateStats) {
      basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
    }
    // long ds = shouldEstimateStats ? getDataSize(conf, table) : getRawDataSize(table);
    basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
    basicStatsFactory.addEnhancer(new BasicStats.SetMinRowNumber01());
    BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table));
    // long nr = getNumRows(conf, schema, neededColumns, table, ds);
    long ds = basicStats.getDataSize();
    long nr = basicStats.getNumRows();
    long fs = basicStats.getTotalFileSize();
    List<ColStatistics> colStats = Collections.emptyList();
    long numErasureCodedFiles = getErasureCodedFiles(table);
    if (needColStats) {
      colStats = getTableColumnStats(table, schema, neededColumns, colStatsCache, fetchColStats);
      if (estimateStats) {
        estimateStatsForMissingCols(neededColumns, colStats, table, conf, nr, schema);
      }
      // We should have stats for all columns (estimated or actual)
      if (neededColumns.size() == colStats.size()) {
        long betterDS = getDataSizeFromColumnStats(nr, colStats);
        ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
      }
    }
    stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
    // Infer if any column can be a primary key based on column statistics
    inferAndSetPrimaryKey(stats.getNumRows(), colStats);
    stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
    stats.addToColumnStats(colStats);
  } else if (partList != null) {
    // For partitioned tables, get the size of all the partitions after pruning
    // the partitions that are not required
    Factory basicStatsFactory = new Factory();
    if (estimateStats) {
      // FIXME: misses parallel
      basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
    }
    basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
    List<BasicStats> partStats = new ArrayList<>();
    for (Partition p : partList.getNotDeniedPartns()) {
      BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table, p));
      partStats.add(basicStats);
    }
    BasicStats bbs = BasicStats.buildFrom(partStats);
    long nr = bbs.getNumRows();
    long ds = bbs.getDataSize();
    long fs = bbs.getTotalFileSize();
    List<Long> erasureCodedFiles = getBasicStatForPartitions(table, partList.getNotDeniedPartns(),
        StatsSetupConst.NUM_ERASURE_CODED_FILES);
    long numErasureCodedFiles = getSumIgnoreNegatives(erasureCodedFiles);
    if (nr == 0) {
      nr = 1;
    }
    stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
    stats.setBasicStatsState(bbs.getState());
    if (nr > 0) {
      // FIXME: this promotion process should be removed later
      if (State.PARTIAL.morePreciseThan(bbs.getState())) {
        stats.setBasicStatsState(State.PARTIAL);
      }
    }
    if (needColStats) {
      List<String> partitionCols = getPartitionColumns(schema, neededColumns, referencedColumns);
      // We will retrieve stats from the metastore only for columns that are not cached
      List<ColStatistics> columnStats = new ArrayList<>();
      List<String> neededColsToRetrieve = extractColumnStates(table, neededColumns, colStatsCache, columnStats);
      List<String> partitionColsToRetrieve = extractColumnStates(table, partitionCols, colStatsCache, columnStats);
      // List of partitions
      List<String> partNames = new ArrayList<>(partList.getNotDeniedPartns().size());
      for (Partition part : partList.getNotDeniedPartns()) {
        partNames.add(part.getName());
      }
      AggrStats aggrStats = null;
      // If there are no columns or no partitions to retrieve stats for, we can
      // skip the step to connect to the metastore.
      if (fetchColStats && !neededColsToRetrieve.isEmpty() && !partNames.isEmpty()) {
        aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(),
            neededColsToRetrieve, partNames, false);
      }
      boolean statsRetrieved = aggrStats != null && aggrStats.getColStats() != null
          && aggrStats.getColStatsSize() != 0;
      if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() && !statsRetrieved)) {
        estimateStatsForMissingCols(neededColsToRetrieve, columnStats, table, conf, nr, schema);
        // There are some partitions with no state (or we didn't fetch any state).
        // Update the stats with an empty list to reflect that in the
        // state/initialize structures.
        // Add partition column stats
        addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
        // FIXME: this add seems suspicious... the value returned by this method is used as betterDS 10 lines below
        stats.addToDataSize(getDataSizeFromColumnStats(nr, columnStats));
        stats.updateColumnStatsState(deriveStatType(columnStats, referencedColumns));
        stats.addToColumnStats(columnStats);
      } else {
        if (statsRetrieved) {
          columnStats.addAll(convertColStats(aggrStats.getColStats(), table.getTableName()));
        }
        int colStatsAvailable = neededColumns.size() + partitionCols.size() - partitionColsToRetrieve.size();
        if (columnStats.size() != colStatsAvailable) {
          LOG.debug("Column stats requested for : {} columns. Able to retrieve for {} columns",
              columnStats.size(), colStatsAvailable);
        }
        addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
        long betterDS = getDataSizeFromColumnStats(nr, columnStats);
        stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
        // Infer if any column can be a primary key based on column statistics
        inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
        stats.addToColumnStats(columnStats);
        // Infer column stats state
        stats.setColumnStatsState(deriveStatType(columnStats, referencedColumns));
        if (neededColumns.size() != neededColsToRetrieve.size()
            || partitionCols.size() != partitionColsToRetrieve.size()) {
          // Include state for cached columns
          stats.updateColumnStatsState(colStatsCache.getState());
        }
        // Change the state if we could not retrieve stats for all partitions
        if (aggrStats != null && aggrStats.getPartsFound() != partNames.size()
            && stats.getColumnStatsState() != State.NONE) {
          stats.updateColumnStatsState(State.PARTIAL);
          LOG.debug("Column stats requested for : {} partitions. Able to retrieve for {} partitions",
              partNames.size(), aggrStats.getPartsFound());
        }
      }
      if (partStats.isEmpty()) {
        // All partitions are filtered by partition pruning
        stats.setBasicStatsState(State.COMPLETE);
      }
      // Sanity check for the cache: with failIfCacheMiss set we expect to have
      // retrieved the stats from the metastore only once.
      if (colStatsCache != null && failIfCacheMiss && stats.getColumnStatsState().equals(State.COMPLETE)
          && (!neededColsToRetrieve.isEmpty() || !partitionColsToRetrieve.isEmpty())) {
        throw new HiveException("Cache has been loaded in logical planning phase for all columns; "
            + "however, stats for some columns could not be retrieved from it "
            + "(see messages above)");
      }
    }
  }
  return stats;
}
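The key contract in the partitioned branch above is the comparison between aggrStats.getPartsFound() and the number of requested partitions: when the metastore could aggregate statistics for only a subset of the partitions, the column-stats state is demoted to PARTIAL. A standalone sketch of that rule follows; the enum and helper are hypothetical stand-ins for Hive's Statistics.State handling, not the actual StatsUtils code.

import org.apache.hadoop.hive.metastore.api.AggrStats;

public class ColStatsStateCheck {

  // Simplified stand-in for Hive's Statistics.State.
  enum State { NONE, PARTIAL, COMPLETE }

  static State deriveState(AggrStats aggrStats, int requestedPartitions) {
    if (aggrStats == null || aggrStats.getColStatsSize() == 0) {
      return State.NONE;
    }
    // getPartsFound() reports how many partitions actually contributed to the aggregation.
    return aggrStats.getPartsFound() == requestedPartitions ? State.COMPLETE : State.PARTIAL;
  }
}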
Use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.
From the class HiveMetaStoreClientWithLocalCache, method getAggrStatsForInternal:
@Override
protected AggrStats getAggrStatsForInternal(PartitionsStatsRequest req) throws TException {
  if (isCacheEnabledAndInitialized()) {
    TableWatermark watermark = new TableWatermark(req.getValidWriteIdList(),
        getTable(req.getDbName(), req.getTblName()).getId());
    if (watermark.isValid()) {
      CacheKey cacheKey = new CacheKey(KeyType.AGGR_COL_STATS, watermark, req);
      AggrStats r = (AggrStats) mscLocalCache.getIfPresent(cacheKey);
      if (r == null) {
        r = super.getAggrStatsForInternal(req);
        mscLocalCache.put(cacheKey, r);
      } else {
        LOG.debug("HS2 level HMS cache: method=getAggrStatsForInternal, dbName={}, tblName={}, partNames={}",
            req.getDbName(), req.getTblName(), req.getPartNames());
      }
      if (LOG.isDebugEnabled() && recordStats) {
        LOG.debug(cacheObjName + ": " + mscLocalCache.stats().toString());
      }
      return r;
    }
  }
  return super.getAggrStatsForInternal(req);
}
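The HS2-level cache above is keyed by a (KeyType, TableWatermark, request) triple, so a cached entry is only reused while the table's write-id watermark is unchanged. As a rough illustration of how such a process-wide cache can be constructed, here is a sketch using Caffeine; the sizing policy and the recordStats toggle are assumptions, not the actual configuration of HiveMetaStoreClientWithLocalCache.

import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;

public class LocalCacheFactory {

  // Builds a bounded in-process cache. recordStats() is what makes the
  // cache.stats() call in the debug logging above return hit/miss counters.
  static Cache<Object, Object> newCache(long maxEntries, boolean recordStats) {
    Caffeine<Object, Object> builder = Caffeine.newBuilder().maximumSize(maxEntries);
    if (recordStats) {
      builder = builder.recordStats();
    }
    return builder.build();
  }
}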
Use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.
From the class SessionHiveMetaStoreClient, method getAggrStatsForInternal:
@Override
protected AggrStats getAggrStatsForInternal(PartitionsStatsRequest req) throws TException {
  Map<Object, Object> queryCache = getQueryCache();
  if (queryCache != null) {
    // Retrieve from or populate the cache
    CacheKey cacheKey = new CacheKey(KeyType.AGGR_COL_STATS, req);
    AggrStats v = (AggrStats) queryCache.get(cacheKey);
    if (v == null) {
      v = super.getAggrStatsForInternal(req);
      queryCache.put(cacheKey, v);
    } else {
      LOG.debug("Query level HMS cache: method=getAggrStatsForInternal, dbName={}, tblName={}, partNames={}",
          req.getDbName(), req.getTblName(), req.getPartNames());
    }
    return v;
  }
  return super.getAggrStatsForInternal(req);
}
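This query-level cache works because Thrift-generated request objects such as PartitionsStatsRequest implement value-based equals() and hashCode(), so an identical request issued twice during one query's compilation maps to the same CacheKey. Below is a generic sketch of the get-or-load pattern used above; the helper and its Callable loader are illustrative, not part of SessionHiveMetaStoreClient.

import java.util.Map;
import java.util.concurrent.Callable;

public class QueryCacheSupport {

  // Generic get-or-load against a query-scoped map. Not thread-safe on its
  // own; it assumes the cache is confined to a single query's compilation,
  // as in the method above.
  @SuppressWarnings("unchecked")
  static <T> T getOrLoad(Map<Object, Object> queryCache, Object key, Callable<T> loader) throws Exception {
    T value = (T) queryCache.get(key);
    if (value == null) {
      value = loader.call();
      queryCache.put(key, value);
    }
    return value;
  }
}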
Use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.
From the class TestStats, method compareStatsForPartitions:
private void compareStatsForPartitions(String catName, String dbName, String tableName,
    List<String> partNames, final Map<String, Column> colMap) throws TException {
  Map<String, List<ColumnStatisticsObj>> partObjs = catName.equals(NO_CAT)
      ? client.getPartitionColumnStatistics(dbName, tableName, partNames, new ArrayList<>(colMap.keySet()), ENGINE)
      : client.getPartitionColumnStatistics(catName, dbName, tableName, partNames, new ArrayList<>(colMap.keySet()),
          ENGINE);
  for (int i = 0; i < partNames.size(); i++) {
    compareStatsForOneTableOrPartition(partObjs.get(partNames.get(i)), i, colMap);
  }
  AggrStats aggr = catName.equals(NO_CAT)
      ? client.getAggrColStatsFor(dbName, tableName, new ArrayList<>(colMap.keySet()), partNames, ENGINE)
      : client.getAggrColStatsFor(catName, dbName, tableName, new ArrayList<>(colMap.keySet()), partNames, ENGINE);
  Assert.assertEquals(partNames.size(), aggr.getPartsFound());
  Assert.assertEquals(colMap.size(), aggr.getColStatsSize());
  aggr.getColStats().forEach(cso -> colMap.get(cso.getColName()).compareAggr(cso));
  // Test column stats obtained through the getPartitions call
  for (int i = 0; i < partNames.size(); i++) {
    String partName = partNames.get(i);
    List<Partition> partitions = catName.equals(NO_CAT)
        ? client.getPartitionsByNames(dbName, tableName, Collections.singletonList(partName), true, ENGINE)
        : client.getPartitionsByNames(catName, dbName, tableName, Collections.singletonList(partName), true, ENGINE);
    Partition partition = partitions.get(0);
    compareStatsForOneTableOrPartition(partition.getColStats().getStatsObj(), i, colMap);
    // Also test that we do not get statistics when not requested
    partitions = catName.equals(NO_CAT)
        ? client.getPartitionsByNames(dbName, tableName, Collections.singletonList(partName), false, ENGINE)
        : client.getPartitionsByNames(catName, dbName, tableName, Collections.singletonList(partName), false, ENGINE);
    partition = partitions.get(0);
    Assert.assertFalse(partition.isSetColStats());
  }
}
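A usage note on the first assertion: getPartsFound() is what distinguishes a complete aggregation from a partial one. If one of the requested partitions had no column statistics, partsFound would drop below partNames.size() even though colStats can still be returned for the analyzed partitions. Here is a hedged sketch of such a negative check; the database, table, and column names are made-up placeholders.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.AggrStats;
import org.junit.Assert;

public class PartialAggrCheck {

  // Assumes only "ds=2024-01-01" has been analyzed; "somedb", "sometable",
  // and "col1" are hypothetical names.
  static void assertPartialWhenOnePartitionUnanalyzed(IMetaStoreClient client, String engine) throws Exception {
    List<String> partNames = Arrays.asList("ds=2024-01-01", "ds=2024-01-02");
    AggrStats aggr = client.getAggrColStatsFor("somedb", "sometable",
        Arrays.asList("col1"), partNames, engine);
    // Only the analyzed partition contributes to the aggregation.
    Assert.assertTrue(aggr.getPartsFound() < partNames.size());
  }
}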