Example 1 with StatsProvidingRecordReader

Use of org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader in project hive by apache.

From the class StatsNoJobTask, the method aggregateStats. It gathers basic table statistics (row count, raw data size, total size, number of files) straight from file footers, without launching a job:

private int aggregateStats(ExecutorService threadPool, Hive db) {
    int ret = 0;
    try {
        Collection<Partition> partitions = null;
        if (work.getPrunedPartitionList() == null) {
            partitions = getPartitionsList();
        } else {
            partitions = work.getPrunedPartitionList().getPartitions();
        }
        // non-partitioned table
        if (partitions == null) {
            org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
            Map<String, String> parameters = tTable.getParameters();
            try {
                Path dir = new Path(tTable.getSd().getLocation());
                long numRows = 0;
                long rawDataSize = 0;
                long fileSize = 0;
                long numFiles = 0;
                FileSystem fs = dir.getFileSystem(conf);
                // List files under the table location; a negative level means full recursion
                FileStatus[] fileList = HiveStatsUtils.getFileStatusRecurse(dir, -1, fs);
                boolean statsAvailable = false;
                for (FileStatus file : fileList) {
                    if (!file.isDir()) {
                        InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(table.getInputFormatClass(), jc);
                        // A zero-length split is sufficient: footer stats are read without scanning any records
                        InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { table.getDataLocation().toString() });
                        if (file.getLen() == 0) {
                            // Empty files have no footer to read but still count toward numFiles
                            numFiles += 1;
                            statsAvailable = true;
                        } else {
                            org.apache.hadoop.mapred.RecordReader<?, ?> recordReader = inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
                            StatsProvidingRecordReader statsRR;
                            // Formats such as ORC expose footer statistics through this interface
                            if (recordReader instanceof StatsProvidingRecordReader) {
                                statsRR = (StatsProvidingRecordReader) recordReader;
                                numRows += statsRR.getStats().getRowCount();
                                rawDataSize += statsRR.getStats().getRawDataSize();
                                fileSize += file.getLen();
                                numFiles += 1;
                                statsAvailable = true;
                            }
                            recordReader.close();
                        }
                    }
                }
                if (statsAvailable) {
                    parameters.put(StatsSetupConst.ROW_COUNT, String.valueOf(numRows));
                    parameters.put(StatsSetupConst.RAW_DATA_SIZE, String.valueOf(rawDataSize));
                    parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(fileSize));
                    parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(numFiles));
                    // Mark the stats as task-generated in the alter-table call
                    EnvironmentContext environmentContext = new EnvironmentContext();
                    environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
                    db.alterTable(tableFullName, new Table(tTable), environmentContext);
                    String msg = "Table " + tableFullName + " stats: [" + toString(parameters) + ']';
                    LOG.debug(msg);
                    console.printInfo(msg);
                } else {
                    String msg = "Table " + tableFullName + " does not provide stats.";
                    LOG.debug(msg);
                }
            } catch (Exception e) {
                console.printInfo("[Warning] could not update stats for " + tableFullName + ".", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
            }
        } else {
            // Partitioned table
            for (Partition partn : partitions) {
                threadPool.execute(new StatsCollection(partn));
            }
            LOG.debug("Stats collection waiting for threadpool to shutdown..");
            shutdownAndAwaitTermination(threadPool);
            LOG.debug("Stats collection threadpool shutdown successful.");
            ret = updatePartitions(db);
        }
    } catch (Exception e) {
        LOG.error("Failed to collect footer statistics.", e);
        // Fail the query if the stats are supposed to be reliable
        if (work.isStatsReliable()) {
            ret = -1;
        }
    }
    // 0 indicates success; anything else indicates failure
    return ret;
}
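For reference, here is the core pattern in isolation: open a record reader over a zero-length split and check whether it implements StatsProvidingRecordReader. This is a minimal sketch, not Hive source; the class and method names (FooterStatsProbe, printFooterStats) are illustrative.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class FooterStatsProbe {

    // Probe a single file for footer statistics using the same
    // zero-length-split trick as the example above.
    public static void printFooterStats(InputFormat<?, ?> inputFormat, JobConf jc, Path file) throws Exception {
        // The split length does not matter: stats come from the file
        // footer, not from iterating over records.
        FileSplit dummySplit = new FileSplit(file, 0, 0, new String[0]);
        RecordReader<?, ?> reader = inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
        try {
            if (reader instanceof StatsProvidingRecordReader) {
                SerDeStats stats = ((StatsProvidingRecordReader) reader).getStats();
                System.out.println("rows=" + stats.getRowCount()
                        + ", rawDataSize=" + stats.getRawDataSize());
            } else {
                System.out.println("This input format does not expose footer statistics.");
            }
        } finally {
            reader.close();
        }
    }
}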
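The partitioned branch hands each partition to a thread pool and then calls a private shutdownAndAwaitTermination helper that is not shown above. A minimal sketch, assuming the helper follows the standard ExecutorService shutdown idiom (the class name PoolShutdown and the 100-second timeout are illustrative):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;

public class PoolShutdown {

    // Assumed shape of the helper: orderly shutdown first, then a forced
    // shutdown if the queued tasks do not finish within the timeout.
    static void shutdownAndAwaitTermination(ExecutorService threadPool) {
        threadPool.shutdown();  // stop accepting new tasks
        try {
            // Wait for the queued StatsCollection tasks to complete
            if (!threadPool.awaitTermination(100, TimeUnit.SECONDS)) {
                threadPool.shutdownNow();  // cancel tasks that are still running
            }
        } catch (InterruptedException e) {
            threadPool.shutdownNow();
            Thread.currentThread().interrupt();  // preserve the interrupt status
        }
    }
}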
Also used:
FileStatus (org.apache.hadoop.fs.FileStatus)
StatsProvidingRecordReader (org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader)
FileSplit (org.apache.hadoop.mapred.FileSplit)
EnvironmentContext (org.apache.hadoop.hive.metastore.api.EnvironmentContext)
FileSystem (org.apache.hadoop.fs.FileSystem)
InputSplit (org.apache.hadoop.mapred.InputSplit)
Path (org.apache.hadoop.fs.Path)
Partition (org.apache.hadoop.hive.ql.metadata.Partition)
Table (org.apache.hadoop.hive.ql.metadata.Table)
InvalidOperationException (org.apache.hadoop.hive.metastore.api.InvalidOperationException)
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)

Aggregations

FileStatus (org.apache.hadoop.fs.FileStatus): 1
FileSystem (org.apache.hadoop.fs.FileSystem): 1
Path (org.apache.hadoop.fs.Path): 1
EnvironmentContext (org.apache.hadoop.hive.metastore.api.EnvironmentContext): 1
InvalidOperationException (org.apache.hadoop.hive.metastore.api.InvalidOperationException): 1
StatsProvidingRecordReader (org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader): 1
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 1
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 1
Table (org.apache.hadoop.hive.ql.metadata.Table): 1
FileSplit (org.apache.hadoop.mapred.FileSplit): 1
InputSplit (org.apache.hadoop.mapred.InputSplit): 1