
Example 1 with StatsAggregator

Use of org.apache.hadoop.hive.ql.stats.StatsAggregator in the apache/hive project.

From class StatsTask, the method aggregateStats:

private int aggregateStats(Hive db) {
    StatsAggregator statsAggregator = null;
    int ret = 0;
    StatsCollectionContext scc = null;
    EnvironmentContext environmentContext = null;
    try {
        // Stats setup:
        final Warehouse wh = new Warehouse(conf);
        if (!getWork().getNoStatsAggregator() && !getWork().isNoScanAnalyzeCommand()) {
            try {
                scc = getContext();
                statsAggregator = createStatsAggregator(scc, conf);
            } catch (HiveException e) {
                if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
                    throw e;
                }
                console.printError(ErrorMsg.STATS_SKIPPING_BY_ERROR.getErrorCodedMsg(e.toString()));
            }
        }
        List<Partition> partitions = getPartitionsList(db);
        boolean atomic = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC);
        String tableFullName = table.getDbName() + "." + table.getTableName();
        if (partitions == null) {
            org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
            Map<String, String> parameters = tTable.getParameters();
            // An ACID table will not have accurate stats unless they are set through an analyze command.
            if (work.getTableSpecs() == null && AcidUtils.isAcidTable(table)) {
                StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
            } else if (work.getTableSpecs() != null || (work.getLoadTableDesc() != null && work.getLoadTableDesc().getReplace()) || (work.getLoadFileDesc() != null && !work.getLoadFileDesc().getDestinationCreateTable().isEmpty())) {
                StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE);
            }
            // non-partitioned tables:
            if (!existStats(parameters) && atomic) {
                return 0;
            }
            // For example, if a file is being loaded, the old number of rows is not valid
            if (work.isClearAggregatorStats()) {
                // we choose to keep the invalid stats and only change the setting.
                StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
            }
            updateQuickStats(wh, parameters, tTable.getSd());
            if (StatsSetupConst.areBasicStatsUptoDate(parameters)) {
                if (statsAggregator != null) {
                    String prefix = getAggregationPrefix(table, null);
                    updateStats(statsAggregator, parameters, prefix, atomic);
                }
                // write table stats to metastore
                if (!getWork().getNoStatsAggregator()) {
                    environmentContext = new EnvironmentContext();
                    environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
                }
            }
            getHive().alterTable(tableFullName, new Table(tTable), environmentContext);
            if (conf.getBoolVar(ConfVars.TEZ_EXEC_SUMMARY)) {
                console.printInfo("Table " + tableFullName + " stats: [" + toString(parameters) + ']');
            }
            LOG.info("Table " + tableFullName + " stats: [" + toString(parameters) + ']');
        } else {
            // Partitioned table:
            // Need to get the old stats of the partition
            // and update the table stats based on the old and new stats.
            List<Partition> updates = new ArrayList<Partition>();
            // Get the file statuses up-front for all partitions; beneficial on blob storage systems
            final Map<String, FileStatus[]> fileStatusMap = new ConcurrentHashMap<String, FileStatus[]>();
            int poolSize = conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 1);
            // If the thread count is set to 0, use a single thread.
            poolSize = Math.max(poolSize, 1);
            final ExecutorService pool = Executors.newFixedThreadPool(poolSize, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("stats-updater-thread-%d").build());
            final List<Future<Void>> futures = Lists.newLinkedList();
            LOG.debug("Getting file stats of all partitions. threadpool size:" + poolSize);
            try {
                for (final Partition partn : partitions) {
                    final String partitionName = partn.getName();
                    final org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition();
                    Map<String, String> parameters = tPart.getParameters();
                    if (!existStats(parameters) && atomic) {
                        continue;
                    }
                    futures.add(pool.submit(new Callable<Void>() {

                        @Override
                        public Void call() throws Exception {
                            FileStatus[] partfileStatus = wh.getFileStatusesForSD(tPart.getSd());
                            fileStatusMap.put(partitionName, partfileStatus);
                            return null;
                        }
                    }));
                }
                pool.shutdown();
                for (Future<Void> future : futures) {
                    future.get();
                }
            } catch (InterruptedException e) {
                LOG.debug("Cancelling " + futures.size() + " file stats lookup tasks");
                //cancel other futures
                for (Future future : futures) {
                    future.cancel(true);
                }
                // Fail the query if the stats are supposed to be reliable
                if (work.isStatsReliable()) {
                    ret = 1;
                }
            } finally {
                if (pool != null) {
                    pool.shutdownNow();
                }
                LOG.debug("Finished getting file stats of all partitions");
            }
            for (Partition partn : partitions) {
                //
                // get the old partition stats
                //
                org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition();
                Map<String, String> parameters = tPart.getParameters();
                if (work.getTableSpecs() == null && AcidUtils.isAcidTable(table)) {
                    StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
                } else if (work.getTableSpecs() != null || (work.getLoadTableDesc() != null && work.getLoadTableDesc().getReplace()) || (work.getLoadFileDesc() != null && !work.getLoadFileDesc().getDestinationCreateTable().isEmpty())) {
                    StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE);
                }
                // a partition is added to fileStatusMap only when its stats exist
                if (!fileStatusMap.containsKey(partn.getName())) {
                    continue;
                }
                // For example, if a file is being loaded, the old number of rows is not valid
                if (work.isClearAggregatorStats()) {
                    // we choose to keep the invalid stats and only change the setting.
                    StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
                }
                updateQuickStats(parameters, fileStatusMap.get(partn.getName()));
                if (StatsSetupConst.areBasicStatsUptoDate(parameters)) {
                    if (statsAggregator != null) {
                        String prefix = getAggregationPrefix(table, partn);
                        updateStats(statsAggregator, parameters, prefix, atomic);
                    }
                    if (!getWork().getNoStatsAggregator()) {
                        environmentContext = new EnvironmentContext();
                        environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
                    }
                }
                updates.add(new Partition(table, tPart));
                if (conf.getBoolVar(ConfVars.TEZ_EXEC_SUMMARY)) {
                    console.printInfo("Partition " + tableFullName + partn.getSpec() + " stats: [" + toString(parameters) + ']');
                }
                LOG.info("Partition " + tableFullName + partn.getSpec() + " stats: [" + toString(parameters) + ']');
            }
            if (!updates.isEmpty()) {
                db.alterPartitions(tableFullName, updates, environmentContext);
            }
        }
    } catch (Exception e) {
        console.printInfo("[Warning] could not update stats.", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
        // Fail the query if the stats are supposed to be reliable
        if (work.isStatsReliable()) {
            ret = 1;
        }
    } finally {
        if (statsAggregator != null) {
            statsAggregator.closeConnection(scc);
        }
    }
    // a non-zero return value indicates failure
    return ret;
}
Also used: Warehouse (org.apache.hadoop.hive.metastore.Warehouse), HiveException (org.apache.hadoop.hive.ql.metadata.HiveException), FileStatus (org.apache.hadoop.fs.FileStatus), ArrayList (java.util.ArrayList), Callable (java.util.concurrent.Callable), EnvironmentContext (org.apache.hadoop.hive.metastore.api.EnvironmentContext), ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), StatsCollectionContext (org.apache.hadoop.hive.ql.stats.StatsCollectionContext), Partition (org.apache.hadoop.hive.ql.metadata.Partition), Table (org.apache.hadoop.hive.ql.metadata.Table), StatsAggregator (org.apache.hadoop.hive.ql.stats.StatsAggregator), MetaException (org.apache.hadoop.hive.metastore.api.MetaException), ExecutorService (java.util.concurrent.ExecutorService), Future (java.util.concurrent.Future)
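
For orientation, here is a minimal sketch, not taken from the Hive sources, of the connect / aggregate / close lifecycle that the method above drives through updateStats. The class name, the helper fetchRowCount, and the key prefix are illustrative assumptions; the StatsAggregator calls themselves (connect, aggregateStats, closeConnection) are the same ones Example 1 uses.

import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.ql.stats.StatsAggregator;
import org.apache.hadoop.hive.ql.stats.StatsCollectionContext;

// Illustrative sketch; not part of the Hive sources.
public class AggregatorLifecycleSketch {

    // Reads the published row count for one table or partition prefix.
    // Returns null when nothing was published for this context.
    static Long fetchRowCount(StatsAggregator aggregator, StatsCollectionContext scc, String prefix) {
        if (!aggregator.connect(scc)) {
            return null;
        }
        try {
            // Published keys have the form <prefix><statType>; ROW_COUNT is
            // one of the basic statistics a StatsPublisher writes.
            String value = aggregator.aggregateStats(prefix, StatsSetupConst.ROW_COUNT);
            return value == null ? null : Long.parseLong(value);
        } finally {
            // Always close, mirroring the finally block in aggregateStats above.
            aggregator.closeConnection(scc);
        }
    }
}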

Example 2 with StatsAggregator

Use of org.apache.hadoop.hive.ql.stats.StatsAggregator in the apache/hive project.

From class ExplainSemanticAnalyzer, the method aggregateStats:

private Map<String, Long> aggregateStats(Path localTmpPath) {
    Map<String, Long> opIdToRuntimeNumRows = new HashMap<String, Long>();
    // localTmpPath is the root of all the stats.
    // Under it, there will be SEL_1/statsfiles, SEL_2/statsfiles, etc., where SEL_1 and SEL_2 are the op ids.
    FileSystem fs;
    FileStatus[] statuses = null;
    try {
        fs = localTmpPath.getFileSystem(conf);
        statuses = fs.listStatus(localTmpPath, FileUtils.HIDDEN_FILES_PATH_FILTER);
    // statuses can be null for DDL statements, etc.
    } catch (IOException e) {
        LOG.warn(e.toString());
    }
    if (statuses != null) {
        for (FileStatus status : statuses) {
            if (status.isDirectory()) {
                StatsCollectionContext scc = new StatsCollectionContext(conf);
                String[] names = status.getPath().toString().split(Path.SEPARATOR);
                String opId = names[names.length - 1];
                scc.setStatsTmpDir(status.getPath().toString());
                StatsAggregator statsAggregator = new FSStatsAggregator();
                if (!statsAggregator.connect(scc)) {
                    // -1 means that there are no stats
                    opIdToRuntimeNumRows.put(opId, -1L);
                } else {
                    String value = statsAggregator.aggregateStats("", StatsSetupConst.RUN_TIME_ROW_COUNT);
                    // aggregateStats may return null when the counter is missing
                    opIdToRuntimeNumRows.put(opId, value == null ? -1L : Long.parseLong(value));
                }
                if (statsAggregator != null) {
                    statsAggregator.closeConnection(scc);
                }
            }
        }
    }
    return opIdToRuntimeNumRows;
}
Also used: StatsCollectionContext (org.apache.hadoop.hive.ql.stats.StatsCollectionContext), FileStatus (org.apache.hadoop.fs.FileStatus), HashMap (java.util.HashMap), FileSystem (org.apache.hadoop.fs.FileSystem), FSStatsAggregator (org.apache.hadoop.hive.ql.stats.fs.FSStatsAggregator), IOException (java.io.IOException), StatsAggregator (org.apache.hadoop.hive.ql.stats.StatsAggregator)
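
One note on the op-id extraction above: the operator id is simply the last component of each stats subdirectory path. A hypothetical, slightly simpler variant (not the code the analyzer uses) leans on Path.getName() instead of splitting on Path.SEPARATOR by hand:

import org.apache.hadoop.fs.Path;

// Illustrative sketch; not part of the Hive sources.
public class OpIdSketch {

    // The final path component of a stats subdirectory is the operator id,
    // e.g. ".../localTmpPath/SEL_1" yields "SEL_1".
    static String opIdOf(Path statsDir) {
        return statsDir.getName();
    }

    public static void main(String[] args) {
        System.out.println(opIdOf(new Path("/tmp/hive-explain-stats/SEL_1"))); // prints SEL_1
    }
}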

Example 3 with StatsAggregator

Use of org.apache.hadoop.hive.ql.stats.StatsAggregator in the apache/hive project.

From class StatsTask, the method createStatsAggregator:

private StatsAggregator createStatsAggregator(StatsCollectionContext scc, HiveConf conf) throws HiveException {
    String statsImpl = HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS);
    StatsFactory factory = StatsFactory.newFactory(statsImpl, conf);
    if (factory == null) {
        throw new HiveException(ErrorMsg.STATSPUBLISHER_NOT_OBTAINED.getErrorCodedMsg());
    }
    // Initialize the stats publishing table for noscan, which has only a stats task;
    // in other cases the MR tasks initialize it in ExecDriver.java.
    StatsPublisher statsPublisher = factory.getStatsPublisher();
    if (!statsPublisher.init(scc)) {
        // init() creates the stats table if it does not already exist
        throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
    }
    // manufacture a StatsAggregator
    StatsAggregator statsAggregator = factory.getStatsAggregator();
    if (!statsAggregator.connect(scc)) {
        throw new HiveException(ErrorMsg.STATSAGGREGATOR_CONNECTION_ERROR.getErrorCodedMsg(statsImpl));
    }
    return statsAggregator;
}
Also used: StatsPublisher (org.apache.hadoop.hive.ql.stats.StatsPublisher), StatsFactory (org.apache.hadoop.hive.ql.stats.StatsFactory), HiveException (org.apache.hadoop.hive.ql.metadata.HiveException), StatsAggregator (org.apache.hadoop.hive.ql.stats.StatsAggregator)
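
Tying the examples together, the following is a hedged sketch of the full factory wiring that createStatsAggregator performs, assuming the filesystem-backed "fs" implementation and an illustrative temp directory; it is an outline of the call sequence, not production code.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.stats.StatsAggregator;
import org.apache.hadoop.hive.ql.stats.StatsCollectionContext;
import org.apache.hadoop.hive.ql.stats.StatsFactory;
import org.apache.hadoop.hive.ql.stats.StatsPublisher;

// Illustrative sketch; not part of the Hive sources.
public class StatsFactorySketch {

    public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        // "fs" names the filesystem-backed implementation
        // (FSStatsPublisher / FSStatsAggregator).
        StatsFactory factory = StatsFactory.newFactory("fs", conf);
        if (factory == null) {
            throw new IllegalStateException("unknown stats implementation");
        }

        StatsCollectionContext scc = new StatsCollectionContext(conf);
        scc.setStatsTmpDir("/tmp/hive-stats-sketch"); // illustrative directory

        // Same pairing as createStatsAggregator(): init the publisher first,
        // then connect the matching aggregator.
        StatsPublisher publisher = factory.getStatsPublisher();
        if (!publisher.init(scc)) {
            throw new IllegalStateException("stats publisher failed to initialize");
        }
        StatsAggregator aggregator = factory.getStatsAggregator();
        if (!aggregator.connect(scc)) {
            throw new IllegalStateException("stats aggregator failed to connect");
        }
        aggregator.closeConnection(scc);
    }
}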

Aggregations

StatsAggregator (org.apache.hadoop.hive.ql.stats.StatsAggregator): 3 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 2 usages
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 2 usages
StatsCollectionContext (org.apache.hadoop.hive.ql.stats.StatsCollectionContext): 2 usages
ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder): 1 usage
IOException (java.io.IOException): 1 usage
ArrayList (java.util.ArrayList): 1 usage
HashMap (java.util.HashMap): 1 usage
Callable (java.util.concurrent.Callable): 1 usage
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 1 usage
ExecutorService (java.util.concurrent.ExecutorService): 1 usage
Future (java.util.concurrent.Future): 1 usage
FileSystem (org.apache.hadoop.fs.FileSystem): 1 usage
Warehouse (org.apache.hadoop.hive.metastore.Warehouse): 1 usage
EnvironmentContext (org.apache.hadoop.hive.metastore.api.EnvironmentContext): 1 usage
MetaException (org.apache.hadoop.hive.metastore.api.MetaException): 1 usage
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 1 usage
Table (org.apache.hadoop.hive.ql.metadata.Table): 1 usage
StatsFactory (org.apache.hadoop.hive.ql.stats.StatsFactory): 1 usage
StatsPublisher (org.apache.hadoop.hive.ql.stats.StatsPublisher): 1 usage