use of org.apache.hadoop.hive.ql.stats.StatsAggregator in project hive by apache.
the class StatsTask method aggregateStats.
private int aggregateStats(Hive db) {
StatsAggregator statsAggregator = null;
int ret = 0;
StatsCollectionContext scc = null;
EnvironmentContext environmentContext = null;
try {
// Stats setup:
final Warehouse wh = new Warehouse(conf);
if (!getWork().getNoStatsAggregator() && !getWork().isNoScanAnalyzeCommand()) {
try {
scc = getContext();
statsAggregator = createStatsAggregator(scc, conf);
} catch (HiveException e) {
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
throw e;
}
console.printError(ErrorMsg.STATS_SKIPPING_BY_ERROR.getErrorCodedMsg(e.toString()));
}
}
List<Partition> partitions = getPartitionsList(db);
boolean atomic = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC);
String tableFullName = table.getDbName() + "." + table.getTableName();
if (partitions == null) {
org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
Map<String, String> parameters = tTable.getParameters();
// acidTable will not have accurate stats unless it is set through analyze command.
if (work.getTableSpecs() == null && AcidUtils.isAcidTable(table)) {
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
} else if (work.getTableSpecs() != null || (work.getLoadTableDesc() != null && work.getLoadTableDesc().getReplace()) || (work.getLoadFileDesc() != null && !work.getLoadFileDesc().getDestinationCreateTable().isEmpty())) {
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE);
}
// non-partitioned tables:
if (!existStats(parameters) && atomic) {
return 0;
}
// For eg. if a file is being loaded, the old number of rows are not valid
if (work.isClearAggregatorStats()) {
// we choose to keep the invalid stats and only change the setting.
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
}
updateQuickStats(wh, parameters, tTable.getSd());
if (StatsSetupConst.areBasicStatsUptoDate(parameters)) {
if (statsAggregator != null) {
String prefix = getAggregationPrefix(table, null);
updateStats(statsAggregator, parameters, prefix, atomic);
}
// write table stats to metastore
if (!getWork().getNoStatsAggregator()) {
environmentContext = new EnvironmentContext();
environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
}
}
getHive().alterTable(tableFullName, new Table(tTable), environmentContext);
if (conf.getBoolVar(ConfVars.TEZ_EXEC_SUMMARY)) {
console.printInfo("Table " + tableFullName + " stats: [" + toString(parameters) + ']');
}
LOG.info("Table " + tableFullName + " stats: [" + toString(parameters) + ']');
} else {
// Partitioned table:
// Need to get the old stats of the partition
// and update the table stats based on the old and new stats.
List<Partition> updates = new ArrayList<Partition>();
//Get the file status up-front for all partitions. Beneficial in cases of blob storage systems
final Map<String, FileStatus[]> fileStatusMap = new ConcurrentHashMap<String, FileStatus[]>();
int poolSize = conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 1);
// In case thread count is set to 0, use single thread.
poolSize = Math.max(poolSize, 1);
final ExecutorService pool = Executors.newFixedThreadPool(poolSize, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("stats-updater-thread-%d").build());
final List<Future<Void>> futures = Lists.newLinkedList();
LOG.debug("Getting file stats of all partitions. threadpool size:" + poolSize);
try {
for (final Partition partn : partitions) {
final String partitionName = partn.getName();
final org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition();
Map<String, String> parameters = tPart.getParameters();
if (!existStats(parameters) && atomic) {
continue;
}
futures.add(pool.submit(new Callable<Void>() {
@Override
public Void call() throws Exception {
FileStatus[] partfileStatus = wh.getFileStatusesForSD(tPart.getSd());
fileStatusMap.put(partitionName, partfileStatus);
return null;
}
}));
}
pool.shutdown();
for (Future<Void> future : futures) {
future.get();
}
} catch (InterruptedException e) {
LOG.debug("Cancelling " + futures.size() + " file stats lookup tasks");
//cancel other futures
for (Future future : futures) {
future.cancel(true);
}
// Fail the query if the stats are supposed to be reliable
if (work.isStatsReliable()) {
ret = 1;
}
} finally {
if (pool != null) {
pool.shutdownNow();
}
LOG.debug("Finished getting file stats of all partitions");
}
for (Partition partn : partitions) {
//
// get the old partition stats
//
org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition();
Map<String, String> parameters = tPart.getParameters();
if (work.getTableSpecs() == null && AcidUtils.isAcidTable(table)) {
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
} else if (work.getTableSpecs() != null || (work.getLoadTableDesc() != null && work.getLoadTableDesc().getReplace()) || (work.getLoadFileDesc() != null && !work.getLoadFileDesc().getDestinationCreateTable().isEmpty())) {
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE);
}
//only when the stats exist, it is added to fileStatusMap
if (!fileStatusMap.containsKey(partn.getName())) {
continue;
}
// For eg. if a file is being loaded, the old number of rows are not valid
if (work.isClearAggregatorStats()) {
// we choose to keep the invalid stats and only change the setting.
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
}
updateQuickStats(parameters, fileStatusMap.get(partn.getName()));
if (StatsSetupConst.areBasicStatsUptoDate(parameters)) {
if (statsAggregator != null) {
String prefix = getAggregationPrefix(table, partn);
updateStats(statsAggregator, parameters, prefix, atomic);
}
if (!getWork().getNoStatsAggregator()) {
environmentContext = new EnvironmentContext();
environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
}
}
updates.add(new Partition(table, tPart));
if (conf.getBoolVar(ConfVars.TEZ_EXEC_SUMMARY)) {
console.printInfo("Partition " + tableFullName + partn.getSpec() + " stats: [" + toString(parameters) + ']');
}
LOG.info("Partition " + tableFullName + partn.getSpec() + " stats: [" + toString(parameters) + ']');
}
if (!updates.isEmpty()) {
db.alterPartitions(tableFullName, updates, environmentContext);
}
}
} catch (Exception e) {
console.printInfo("[Warning] could not update stats.", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
// Fail the query if the stats are supposed to be reliable
if (work.isStatsReliable()) {
ret = 1;
}
} finally {
if (statsAggregator != null) {
statsAggregator.closeConnection(scc);
}
}
// anything else indicates failure
return ret;
}
use of org.apache.hadoop.hive.ql.stats.StatsAggregator in project hive by apache.
the class ExplainSemanticAnalyzer method aggregateStats.
private Map<String, Long> aggregateStats(Path localTmpPath) {
Map<String, Long> opIdToRuntimeNumRows = new HashMap<String, Long>();
// localTmpPath is the root of all the stats.
// Under it, there will be SEL_1/statsfiles, SEL_2/statsfiles etc where SEL_1 and SEL_2 are the op ids.
FileSystem fs;
FileStatus[] statuses = null;
try {
fs = localTmpPath.getFileSystem(conf);
statuses = fs.listStatus(localTmpPath, FileUtils.HIDDEN_FILES_PATH_FILTER);
// statuses can be null if it is DDL, etc
} catch (IOException e) {
LOG.warn(e.toString());
}
if (statuses != null) {
for (FileStatus status : statuses) {
if (status.isDir()) {
StatsCollectionContext scc = new StatsCollectionContext(conf);
String[] names = status.getPath().toString().split(Path.SEPARATOR);
String opId = names[names.length - 1];
scc.setStatsTmpDir(status.getPath().toString());
StatsAggregator statsAggregator = new FSStatsAggregator();
if (!statsAggregator.connect(scc)) {
// -1 means that there is no stats
opIdToRuntimeNumRows.put(opId, -1L);
} else {
String value = statsAggregator.aggregateStats("", StatsSetupConst.RUN_TIME_ROW_COUNT);
opIdToRuntimeNumRows.put(opId, Long.parseLong(value));
}
if (statsAggregator != null) {
statsAggregator.closeConnection(scc);
}
}
}
}
return opIdToRuntimeNumRows;
}
use of org.apache.hadoop.hive.ql.stats.StatsAggregator in project hive by apache.
the class StatsTask method createStatsAggregator.
private StatsAggregator createStatsAggregator(StatsCollectionContext scc, HiveConf conf) throws HiveException {
String statsImpl = HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS);
StatsFactory factory = StatsFactory.newFactory(statsImpl, conf);
if (factory == null) {
throw new HiveException(ErrorMsg.STATSPUBLISHER_NOT_OBTAINED.getErrorCodedMsg());
}
// initialize stats publishing table for noscan which has only stats task
// the rest of MR task following stats task initializes it in ExecDriver.java
StatsPublisher statsPublisher = factory.getStatsPublisher();
if (!statsPublisher.init(scc)) {
// creating stats table if not exists
throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
}
// manufacture a StatsAggregator
StatsAggregator statsAggregator = factory.getStatsAggregator();
if (!statsAggregator.connect(scc)) {
throw new HiveException(ErrorMsg.STATSAGGREGATOR_CONNECTION_ERROR.getErrorCodedMsg(statsImpl));
}
return statsAggregator;
}
Aggregations