Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in the Apache Hive project.
Class MetaStoreUtils, method getMergableCols.
/**
 * Narrows the stats objects held by {@code csNew} down to the columns that may be merged.
 *
 * <p>Each stats object is kept only if {@code StatsSetupConst.canColumnStatsMerge} accepts its
 * column name for the supplied parameters; every other stats object is dropped. The filtered
 * list replaces the stats-object list of {@code csNew}.
 *
 * @param csNew column statistics whose stats-object list is filtered in place
 * @param parameters parameter map passed to {@code StatsSetupConst.canColumnStatsMerge}
 */
public static void getMergableCols(ColumnStatistics csNew, Map<String, String> parameters) {
  final List<ColumnStatisticsObj> mergeable = new ArrayList<>();
  for (ColumnStatisticsObj candidate : csNew.getStatsObj()) {
    // canColumnStatsMerge is the gatekeeper: only columns it approves are safe to merge.
    final boolean canMerge = StatsSetupConst.canColumnStatsMerge(parameters, candidate.getColName());
    if (!canMerge) {
      // Anything else cannot be merged, so it is left out of the result.
      continue;
    }
    mergeable.add(candidate);
  }
  csNew.setStatsObj(mergeable);
}
Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in the Apache Hive project.
Class MetaStoreUtils, method mergeColStats.
/**
 * Merges the column statistics of {@code csOld} into {@code csNew}.
 *
 * <p>Old stats objects are indexed by column name. For every stats object in {@code csNew} that
 * has an old counterpart with the same column name, a {@code ColumnStatsMerger} obtained from
 * {@code ColumnStatsMergerFactory} is asked to merge the pair. New stats objects without an old
 * counterpart are kept unchanged. Old stats objects whose column is absent from {@code csNew}
 * are not carried over.
 *
 * @param csNew new column statistics; its stats-object list is replaced with the merged result
 * @param csOld previously stored column statistics that are merged into {@code csNew}
 * @throws InvalidObjectException declared by the signature; not thrown directly by this method
 */
public static void mergeColStats(ColumnStatistics csNew, ColumnStatistics csOld) throws InvalidObjectException {
  final int newCount = csNew.getStatsObj().size();
  final int oldCount = csOld.getStatsObjSize();
  if (newCount != oldCount) {
    // A size mismatch means some columns' stats are missing, which implies the partition
    // schema has changed. Columns present on both sides are merged, stats for columns absent
    // in the metastore are overwritten, and columns missing from the stats task are left
    // alone. That last case may leave stale stats behind; it is to be addressed later.
    LOG.debug("New ColumnStats size is {}, but old ColumnStats size is {}", newCount, oldCount);
  }

  // Index the old stats by column name so each new column can find its counterpart.
  final Map<String, ColumnStatisticsObj> oldByColName = new HashMap<>();
  for (ColumnStatisticsObj oldObj : csOld.getStatsObj()) {
    oldByColName.put(oldObj.getColName(), oldObj);
  }

  final List<ColumnStatisticsObj> merged = new ArrayList<>();
  for (ColumnStatisticsObj newObj : csNew.getStatsObj()) {
    final ColumnStatisticsObj oldObj = oldByColName.get(newObj.getColName());
    if (oldObj != null) {
      // Both sides are expected to carry the same kind of stats data for a given column.
      assert (newObj.getStatsData().getSetField() == oldObj.getStatsData().getSetField());
      // An old counterpart exists, so the two can be merged.
      final ColumnStatsMerger merger = ColumnStatsMergerFactory.getColumnStatsMerger(newObj, oldObj);
      merger.merge(newObj, oldObj);
    }
    // Without an old counterpart the new stats are used as they are.
    merged.add(newObj);
  }
  csNew.setStatsObj(merged);
}
Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in the Apache Hive project.
Class MetaStoreUtils, method aggrPartitionStats.
/**
 * Aggregates per-partition column statistics into one stats object per aggregator.
 *
 * <p>Each map entry is submitted as a task to a fixed-size pool of daemon threads; the task
 * calls {@code aggregator.aggregate(...)} on the entry's list. Non-null results are collected
 * in the iteration order of {@code colStatsMap.entrySet()}.
 *
 * @param colStatsMap aggregator mapped to the stats objects it should aggregate; when empty, an
 *     empty list is returned and no thread pool is created
 * @param partNames partition names handed to every aggregator
 * @param areAllPartsFound flag handed to every aggregator
 * @param useDensityFunctionForNDVEstimation not read by this method
 * @param ndvTuner not read by this method
 * @return the non-null aggregation results, possibly empty
 * @throws MetaException if a task fails or the calling thread is interrupted while waiting;
 *     the original exception is attached as the cause
 */
public static List<ColumnStatisticsObj> aggrPartitionStats(Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap, final List<String> partNames, final boolean areAllPartsFound, final boolean useDensityFunctionForNDVEstimation, final double ndvTuner) throws MetaException {
  List<ColumnStatisticsObj> aggrColStatObjs = new ArrayList<ColumnStatisticsObj>();
  if (colStatsMap.isEmpty()) {
    // Executors.newFixedThreadPool rejects a pool size of 0 with IllegalArgumentException,
    // so there is nothing to do (and nothing to spin up) for an empty input.
    LOG.debug("No column stats to aggregate");
    return aggrColStatObjs;
  }
  // Never use more threads than there are aggregation tasks.
  final int numThreads = Math.min(colStatsMap.size(), Runtime.getRuntime().availableProcessors());
  final ExecutorService pool = Executors.newFixedThreadPool(numThreads, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("aggr-col-stats-%d").build());
  final List<Future<ColumnStatisticsObj>> futures = new ArrayList<Future<ColumnStatisticsObj>>(colStatsMap.size());
  LOG.debug("Aggregating column stats. Threads used: {}", numThreads);
  long start = System.currentTimeMillis();
  for (final Entry<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> entry : colStatsMap.entrySet()) {
    futures.add(pool.submit(new Callable<ColumnStatisticsObj>() {
      @Override
      public ColumnStatisticsObj call() throws MetaException {
        List<ColStatsObjWithSourceInfo> colStatWithSourceInfo = entry.getValue();
        ColumnStatsAggregator aggregator = entry.getKey();
        try {
          return aggregator.aggregate(colStatWithSourceInfo, partNames, areAllPartsFound);
        } catch (MetaException e) {
          LOG.debug(e.getMessage());
          throw e;
        }
      }
    }));
  }
  // No further tasks will be submitted; already-submitted tasks keep running.
  pool.shutdown();
  for (Future<ColumnStatisticsObj> future : futures) {
    try {
      // Fetch the result once instead of calling get() for the check and again for the add.
      ColumnStatisticsObj statsObj = future.get();
      if (statsObj != null) {
        aggrColStatObjs.add(statsObj);
      }
    } catch (InterruptedException | ExecutionException e) {
      if (e instanceof InterruptedException) {
        // Restore the interrupt flag so callers further up can still observe it.
        Thread.currentThread().interrupt();
      }
      LOG.debug(e.getMessage());
      // Stop the remaining tasks; their results would be discarded anyway.
      pool.shutdownNow();
      MetaException failure = new MetaException(e.toString());
      // Keep the original exception (and its stack trace) reachable for diagnosis.
      failure.initCause(e);
      throw failure;
    }
  }
  LOG.debug("Time for aggr col stats in seconds: {} Threads used: {}", ((System.currentTimeMillis() - (double) start)) / 1000, numThreads);
  return aggrColStatObjs;
}
Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in the Apache Hive project.
Class MetaStoreDirectSql, method makeColumnStats.
/**
 * Builds a {@code ColumnStatistics} object from raw query rows.
 *
 * <p>Every row is converted to a stats object via {@code prepareCSObj(row, offset)}. While
 * iterating, the last-analyzed value of {@code csd} is lowered to the smallest non-null value
 * found at index {@code offset + 15} of the rows. {@code Deadline.checkTimeout()} is invoked
 * after each row.
 *
 * @param list raw rows, one per column stats entry
 * @param csd descriptor attached to the result; its last-analyzed value may be updated
 * @param offset index offset applied when reading columns from a row
 * @return column statistics carrying {@code csd} and one stats object per row
 * @throws MetaException declared by the signature; presumably raised by the helpers called here
 */
private ColumnStatistics makeColumnStats(List<Object[]> list, ColumnStatisticsDesc csd, int offset) throws MetaException {
  final List<ColumnStatisticsObj> statsObjs = new ArrayList<ColumnStatisticsObj>(list.size());
  for (Object[] row : list) {
    // LastAnalyzed is stored per column, whereas thrift keeps a single value for several
    // columns; the lowest value is kept for now since nobody actually uses this field.
    final Object rawLastAnalyzed = row[offset + 15];
    if (rawLastAnalyzed != null) {
      final long lastAnalyzed = extractSqlLong(rawLastAnalyzed);
      if (!csd.isSetLastAnalyzed() || csd.getLastAnalyzed() > lastAnalyzed) {
        csd.setLastAnalyzed(lastAnalyzed);
      }
    }
    statsObjs.add(prepareCSObj(row, offset));
    Deadline.checkTimeout();
  }

  final ColumnStatistics result = new ColumnStatistics();
  result.setStatsDesc(csd);
  result.setStatsObj(statsObjs);
  return result;
}
Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in the Apache Hive project.
Class HiveAlterHandler, method alterTableUpdateTableColumnStats.
/**
 * Alters a table in the store and keeps its table-level column statistics consistent.
 *
 * <p>If neither the database name nor the table name changes and
 * {@code MetaStoreUtils.columnsIncludedByNameType(oldCols, newCols)} holds, only the table is
 * altered. Otherwise the existing column stats are fetched and handled per column:
 * <ul>
 *   <li>a column with no name/type match (case-insensitive) among the new columns has its
 *       stats deleted;</li>
 *   <li>a matching column of a renamed table has its stats deleted under the old name and
 *       re-added under the new name after the table is altered;</li>
 *   <li>a matching column of a table that keeps its name is left untouched.</li>
 * </ul>
 * The column-stats state of all deleted columns is removed from the new table's parameters
 * before the table is altered.
 *
 * @param msdb store used to read, delete and update statistics and to alter the table
 * @param oldTable table definition before the change
 * @param newTable table definition after the change
 * @throws MetaException declared by the signature; presumably raised by the store calls
 * @throws InvalidObjectException if the store reports an {@code InvalidInputException}
 */
@VisibleForTesting
void alterTableUpdateTableColumnStats(RawStore msdb, Table oldTable, Table newTable) throws MetaException, InvalidObjectException {
  final String dbName = oldTable.getDbName().toLowerCase();
  final String tableName = org.apache.hadoop.hive.metastore.utils.StringUtils.normalizeIdentifier(oldTable.getTableName());
  final String newDbName = newTable.getDbName().toLowerCase();
  final String newTableName = org.apache.hadoop.hive.metastore.utils.StringUtils.normalizeIdentifier(newTable.getTableName());
  try {
    final List<FieldSchema> oldCols = oldTable.getSd().getCols();
    final List<FieldSchema> newCols = newTable.getSd().getCols();
    final boolean renamed = !newDbName.equals(dbName) || !newTableName.equals(tableName);
    final List<ColumnStatisticsObj> statsToReAdd = new ArrayList<>();
    ColumnStatistics colStats = null;

    // Nothing to update if the names are unchanged and the columns are all still included.
    // The column comparison is only evaluated when the table has not been renamed.
    boolean updateColumnStats = renamed || !MetaStoreUtils.columnsIncludedByNameType(oldCols, newCols);

    if (updateColumnStats) {
      final List<String> oldColNames = new ArrayList<>(oldCols.size());
      for (FieldSchema col : oldCols) {
        oldColNames.add(col.getName());
      }

      // Collect the column stats that need to be rewritten and remove the old stats.
      colStats = msdb.getTableColumnStatistics(dbName, tableName, oldColNames);
      if (colStats == null) {
        updateColumnStats = false;
      } else {
        final List<ColumnStatisticsObj> existingStats = colStats.getStatsObj();
        if (existingStats != null) {
          final List<String> deletedCols = new ArrayList<>();
          for (ColumnStatisticsObj stats : existingStats) {
            // Does a column with the same name and type (ignoring case) still exist?
            boolean stillPresent = false;
            for (FieldSchema candidate : newCols) {
              if (stats.getColName().equalsIgnoreCase(candidate.getName()) && stats.getColType().equalsIgnoreCase(candidate.getType())) {
                stillPresent = true;
                break;
              }
            }

            if (stillPresent && !renamed) {
              // Same table, same column: the stored stats stay where they are.
              continue;
            }

            // Either the column is gone or the table was renamed: drop the old entry.
            msdb.deleteTableColumnStatistics(dbName, tableName, stats.getColName());
            if (stillPresent) {
              // Renamed table: these stats are written back under the new name below.
              statsToReAdd.add(stats);
            }
            deletedCols.add(stats.getColName());
          }
          StatsSetupConst.removeColumnStatsState(newTable.getParameters(), deletedCols);
        }
      }
    }

    // Switch to the new table, then append the carried-over stats to it.
    msdb.alterTable(dbName, tableName, newTable);
    if (updateColumnStats && !statsToReAdd.isEmpty()) {
      final ColumnStatisticsDesc statsDesc = colStats.getStatsDesc();
      statsDesc.setDbName(newDbName);
      statsDesc.setTableName(newTableName);
      colStats.setStatsObj(statsToReAdd);
      msdb.updateTableColumnStatistics(colStats);
    }
  } catch (NoSuchObjectException nsoe) {
    LOG.debug("Could not find db entry." + nsoe);
  } catch (InvalidInputException e) {
    // Should not happen, since the inputs were verified before being passed in.
    throw new InvalidObjectException("Invalid inputs to update table column stats: " + e);
  }
}
Aggregations