Use of org.apache.hadoop.hive.metastore.api.AggrStats in the Apache Hive project.
Class StatsUtils, method collectStatistics.
/**
 * Builds a {@link Statistics} object for a table scan.
 *
 * <p>Three cases are handled:
 * <ul>
 *   <li>Table is not partitioned: row count and data size come from {@code getNumRows} /
 *       {@code getDataSize}; when {@code fetchColStats} is set, table-level column statistics
 *       are loaded and may replace the data size estimate.</li>
 *   <li>Table is partitioned and {@code partList} is non-null: per-partition row counts and
 *       sizes are summed over {@code partList.getNotDeniedPartns()}; when {@code fetchColStats}
 *       is set, aggregated column statistics are requested from the metastore.</li>
 *   <li>Table is partitioned and {@code partList} is null: the freshly created, unpopulated
 *       {@code Statistics} instance is returned.</li>
 * </ul>
 *
 * @param conf              configuration used to read the deserialization factor and passed to helpers
 * @param partList          pruned partitions; may be null, in which case a partitioned table yields
 *                          untouched statistics
 * @param table             table being scanned
 * @param schema            column schema handed to the row-size / column-stats helpers
 * @param neededColumns     columns for which statistics are requested
 * @param referencedColumns columns used when deriving the column-stats state of a partitioned table
 * @param fetchColStats     whether column statistics should be fetched
 * @param fetchPartStats    whether basic per-partition statistics should be fetched
 * @return the collected statistics
 * @throws HiveException declared by the signature; presumably raised by the metastore helpers
 */
public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, List<ColumnInfo> schema, List<String> neededColumns, List<String> referencedColumns, boolean fetchColStats, boolean fetchPartStats) throws HiveException {
  Statistics stats = new Statistics();
  float deserFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);

  // ---- Unpartitioned table: a single set of table-level numbers. ----
  if (!table.isPartitioned()) {
    long tableDataSize = getDataSize(conf, table);
    long tableRowCount = getNumRows(conf, schema, neededColumns, table, tableDataSize);
    stats.setNumRows(tableRowCount);

    List<ColStatistics> tableColStats = Lists.newArrayList();
    if (fetchColStats) {
      tableColStats = getTableColumnStats(table, schema, neededColumns);
      long sizeFromColStats = getDataSizeFromColumnStats(tableRowCount, tableColStats);
      // Only trust the column-stats based size when it is positive and stats exist.
      if (sizeFromColStats >= 1 && !tableColStats.isEmpty()) {
        tableDataSize = sizeFromColStats;
      }
    }
    stats.setDataSize(tableDataSize);

    // Check whether any column qualifies as a primary key according to its statistics.
    inferAndSetPrimaryKey(stats.getNumRows(), tableColStats);
    stats.setColumnStatsState(deriveStatType(tableColStats, neededColumns));
    stats.addToColumnStats(tableColStats);
    return stats;
  }

  // ---- Partitioned table without a partition list: nothing to collect. ----
  if (partList == null) {
    return stats;
  }

  // ---- Partitioned table: aggregate over the partitions that survived pruning. ----
  long totalRows = 0;
  long totalSize = 0;
  List<Long> rowCounts = Lists.newArrayList();
  List<Long> dataSizes = Lists.newArrayList();

  if (fetchPartStats) {
    rowCounts = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
    dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
    totalRows = getSumIgnoreNegatives(rowCounts);
    totalSize = getSumIgnoreNegatives(dataSizes);
    if (totalSize <= 0) {
      // Raw data size unavailable; retry with the total size statistic.
      dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
      totalSize = getSumIgnoreNegatives(dataSizes);
    }
  }

  // Still no usable size: fall back to the per-partition file sizes.
  if (totalSize <= 0) {
    dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
  }
  totalSize = (long) (getSumIgnoreNegatives(dataSizes) * deserFactor);

  int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
  if (avgRowSize > 0) {
    setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
    totalRows = getSumIgnoreNegatives(rowCounts);
    // NOTE(review): this reassignment discards the deserFactor scaling applied just above
    // whenever avgRowSize > 0 -- confirm whether that is intended.
    totalSize = getSumIgnoreNegatives(dataSizes);

    // A non-positive row count means the metastore statistics are not reliable.
    if (totalRows <= 0) {
      totalRows = totalSize / avgRowSize;
    }
  }
  if (totalRows == 0) {
    totalRows = 1;
  }
  stats.addToNumRows(totalRows);
  stats.addToDataSize(totalSize);

  // If at least one partition lacks a row count, downgrade COMPLETE basic stats to PARTIAL.
  if (containsNonPositives(rowCounts) && stats.getBasicStatsState().equals(State.COMPLETE)) {
    stats.setBasicStatsState(State.PARTIAL);
  }

  if (!fetchColStats) {
    return stats;
  }

  List<String> partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
  for (Partition partition : partList.getNotDeniedPartns()) {
    partNames.add(partition.getName());
  }
  List<String> columnsToFetch = processNeededColumns(schema, neededColumns);

  // Only contact the metastore when there is something to ask for.
  AggrStats aggrStats = null;
  if (columnsToFetch.size() > 0 && partNames.size() > 0) {
    aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), columnsToFetch, partNames);
  }

  boolean noAggregatedStats = null == aggrStats || null == aggrStats.getColStats() || aggrStats.getColStatsSize() == 0;
  if (noAggregatedStats) {
    // Some partitions have no state (or nothing was fetched). Feed an initially empty list
    // through the same steps so the state/structures are still initialized.
    List<ColStatistics> partitionOnlyStats = Lists.newArrayList();
    // add partition column stats
    addParitionColumnStats(conf, columnsToFetch, referencedColumns, schema, table, partList, partitionOnlyStats);
    stats.addToColumnStats(partitionOnlyStats);
    stats.addToDataSize(getDataSizeFromColumnStats(totalRows, partitionOnlyStats));
    stats.updateColumnStatsState(deriveStatType(partitionOnlyStats, referencedColumns));
    return stats;
  }

  List<ColumnStatisticsObj> metastoreColStats = aggrStats.getColStats();
  if (metastoreColStats.size() != columnsToFetch.size()) {
    LOG.debug("Column stats requested for : " + columnsToFetch.size() + " columns. Able to retrieve for " + metastoreColStats.size() + " columns");
  }
  List<ColStatistics> convertedColStats = convertColStats(metastoreColStats, table.getTableName());
  addParitionColumnStats(conf, columnsToFetch, referencedColumns, schema, table, partList, convertedColStats);

  // Prefer the column-stats based size when it is positive and stats exist.
  long sizeFromColStats = getDataSizeFromColumnStats(totalRows, convertedColStats);
  long chosenSize = totalSize;
  if (sizeFromColStats >= 1 && !convertedColStats.isEmpty()) {
    chosenSize = sizeFromColStats;
  }
  stats.setDataSize(chosenSize);

  // Check whether any column qualifies as a primary key according to its statistics.
  inferAndSetPrimaryKey(stats.getNumRows(), convertedColStats);
  stats.addToColumnStats(convertedColStats);

  State columnState = deriveStatType(convertedColStats, referencedColumns);
  if (aggrStats.getPartsFound() != partNames.size() && columnState != State.NONE) {
    LOG.debug("Column stats requested for : " + partNames.size() + " partitions. Able to retrieve for " + aggrStats.getPartsFound() + " partitions");
    columnState = State.PARTIAL;
  }
  stats.setColumnStatsState(columnState);
  return stats;
}
Use of org.apache.hadoop.hive.metastore.api.AggrStats in the Apache Hive project.
Class TestHiveMetaStore, method testStatsFastTrivial.
/**
 * Tests that in the absence of stats for partitions, and/or absence of columns
 * to get stats for, the metastore does not break. See HIVE-12083 for motivation.
 *
 * <p>Creates a four-partition table, then requests aggregate column statistics with every
 * combination of empty / non-empty column and partition name lists. Each call must return a
 * non-null {@code AggrStats} with zero partitions found and a non-null, empty column-stats list.
 *
 * <p>Emptiness is checked with JUnit assertions rather than the Java {@code assert} keyword,
 * because {@code assert} statements are skipped when the JVM runs without {@code -ea}.
 *
 * @throws Throwable if any metastore call or assertion fails
 */
public void testStatsFastTrivial() throws Throwable {
  String dbName = "tstatsfast";
  String tblName = "t1";
  String typeName = "Person";

  cleanUp(dbName, tblName, typeName);

  List<List<String>> values = new ArrayList<List<String>>();
  values.add(makeVals("2008-07-01 14:13:12", "14"));
  values.add(makeVals("2008-07-01 14:13:12", "15"));
  values.add(makeVals("2008-07-02 14:13:12", "15"));
  values.add(makeVals("2008-07-03 14:13:12", "151"));

  createMultiPartitionTableSchema(dbName, tblName, typeName, values);

  List<String> emptyColNames = new ArrayList<String>();
  List<String> emptyPartNames = new ArrayList<String>();

  List<String> colNames = new ArrayList<String>();
  colNames.add("name");
  colNames.add("income");
  List<String> partNames = client.listPartitionNames(dbName, tblName, (short) -1);

  // Sanity-check the fixtures before exercising the API.
  assertEquals(0, emptyColNames.size());
  assertEquals(0, emptyPartNames.size());
  assertEquals(2, colNames.size());
  assertEquals(4, partNames.size());

  // Test for both colNames and partNames being empty:
  AggrStats aggrStatsEmpty = client.getAggrColStatsFor(dbName, tblName, emptyColNames, emptyPartNames);
  // short-circuited on client-side, verifying that it's an empty object, not null
  assertNotNull(aggrStatsEmpty);
  assertEquals(0, aggrStatsEmpty.getPartsFound());
  assertNotNull(aggrStatsEmpty.getColStats());
  assertEquals(0, aggrStatsEmpty.getColStats().size());

  // Test for only colNames being empty
  AggrStats aggrStatsOnlyParts = client.getAggrColStatsFor(dbName, tblName, emptyColNames, partNames);
  // short-circuited on client-side, verifying that it's an empty object, not null
  assertNotNull(aggrStatsOnlyParts);
  assertEquals(0, aggrStatsOnlyParts.getPartsFound());
  assertNotNull(aggrStatsOnlyParts.getColStats());
  assertEquals(0, aggrStatsOnlyParts.getColStats().size());

  // Test for only partNames being empty
  AggrStats aggrStatsOnlyCols = client.getAggrColStatsFor(dbName, tblName, colNames, emptyPartNames);
  // short-circuited on client-side, verifying that it's an empty object, not null
  assertNotNull(aggrStatsOnlyCols);
  assertEquals(0, aggrStatsOnlyCols.getPartsFound());
  assertNotNull(aggrStatsOnlyCols.getColStats());
  assertEquals(0, aggrStatsOnlyCols.getColStats().size());

  // Test for valid values for both.
  AggrStats aggrStatsFull = client.getAggrColStatsFor(dbName, tblName, colNames, partNames);
  assertNotNull(aggrStatsFull);
  // would still be empty, because no stats are actually populated.
  assertEquals(0, aggrStatsFull.getPartsFound());
  assertNotNull(aggrStatsFull.getColStats());
  assertEquals(0, aggrStatsFull.getColStats().size());
}
Aggregations