Search in sources :

Example 46 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class MetaStoreUtils method getMergableCols.

/**
 * Narrows {@code csNew} down to the column statistics that may take part in a merge.
 *
 * <p>Each stats object currently held by {@code csNew} is kept only when
 * {@link StatsSetupConst#canColumnStatsMerge} accepts its column name for the given
 * {@code parameters}; every other stats object is dropped. The surviving objects are
 * written back to {@code csNew} in their original order, as a new list.
 *
 * @param csNew      column statistics whose stats-object list is replaced in place
 * @param parameters parameter map handed to {@code canColumnStatsMerge} for each column
 */
public static void getMergableCols(ColumnStatistics csNew, Map<String, String> parameters) {
    final List<ColumnStatisticsObj> mergeable = new ArrayList<>();
    for (ColumnStatisticsObj candidate : csNew.getStatsObj()) {
        final String colName = candidate.getColName();
        if (!StatsSetupConst.canColumnStatsMerge(parameters, colName)) {
            // Not mergeable: leave this column out of the result.
            continue;
        }
        mergeable.add(candidate);
    }
    csNew.setStatsObj(mergeable);
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ArrayList(java.util.ArrayList)

Example 47 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class MetaStoreUtils method mergeColStats.

/**
 * Merges the statistics in {@code csOld} into {@code csNew}, matching columns by name.
 *
 * <p>For every stats object in {@code csNew} that has a same-named counterpart in
 * {@code csOld}, a {@link ColumnStatsMerger} obtained from
 * {@link ColumnStatsMergerFactory} is asked to merge the old object into the new one.
 * Stats objects of {@code csNew} with no counterpart are kept untouched. Columns that
 * exist only in {@code csOld} are not carried over. The resulting list replaces the
 * stats-object list of {@code csNew}.
 *
 * <p>A size mismatch between the two inputs is only logged at debug level.
 *
 * @param csNew new column statistics; updated in place
 * @param csOld previously stored column statistics to merge from
 * @throws InvalidObjectException declared by the signature; nothing in this body throws
 *         it directly
 */
public static void mergeColStats(ColumnStatistics csNew, ColumnStatistics csOld) throws InvalidObjectException {
    final int newCount = csNew.getStatsObj().size();
    final int oldCount = csOld.getStatsObjSize();
    if (newCount != oldCount) {
        // Differing sizes: columns present on both sides are merged, columns only in
        // csNew are taken as-is, and columns only in csOld are left out of the result.
        LOG.debug("New ColumnStats size is {}, but old ColumnStats size is {}", newCount, oldCount);
    }

    // Index the old stats objects by column name for lookup while walking csNew.
    final Map<String, ColumnStatisticsObj> oldByColName = new HashMap<>();
    for (ColumnStatisticsObj oldObj : csOld.getStatsObj()) {
        oldByColName.put(oldObj.getColName(), oldObj);
    }

    final List<ColumnStatisticsObj> merged = new ArrayList<>();
    for (ColumnStatisticsObj newObj : csNew.getStatsObj()) {
        final ColumnStatisticsObj oldObj = oldByColName.get(newObj.getColName());
        if (oldObj != null) {
            // Both sides are expected to carry the same kind of stats data.
            assert (newObj.getStatsData().getSetField() == oldObj.getStatsData().getSetField());
            final ColumnStatsMerger merger = ColumnStatsMergerFactory.getColumnStatsMerger(newObj, oldObj);
            merger.merge(newObj, oldObj);
        }
        // Merged or not, the new stats object is always part of the result.
        merged.add(newObj);
    }
    csNew.setStatsObj(merged);
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ColumnStatsMerger(org.apache.hadoop.hive.metastore.columnstats.merge.ColumnStatsMerger)

Example 48 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class MetaStoreUtils method aggrPartitionStats.

/**
 * Runs every aggregator in {@code colStatsMap} over its list of per-partition column
 * statistics and collects the non-null results.
 *
 * <p>Aggregations are submitted to a fixed-size daemon thread pool sized to the smaller
 * of the number of map entries and the number of available processors. Results are
 * gathered in the iteration order of {@code colStatsMap.entrySet()}.
 *
 * @param colStatsMap      aggregator to the stats objects it should aggregate; an empty
 *                         map yields an empty result
 * @param partNames        partition names passed through to each aggregator
 * @param areAllPartsFound flag passed through to each aggregator
 * @param useDensityFunctionForNDVEstimation not read by this method
 * @param ndvTuner         not read by this method
 * @return the aggregated stats objects; aggregators that return {@code null} are skipped
 * @throws MetaException if an aggregation fails or the calling thread is interrupted
 *         while waiting; the original exception is attached as the cause
 */
public static List<ColumnStatisticsObj> aggrPartitionStats(Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap, final List<String> partNames, final boolean areAllPartsFound, final boolean useDensityFunctionForNDVEstimation, final double ndvTuner) throws MetaException {
    List<ColumnStatisticsObj> aggrColStatObjs = new ArrayList<ColumnStatisticsObj>();
    if (colStatsMap.isEmpty()) {
        // Executors.newFixedThreadPool rejects a pool size of 0 with
        // IllegalArgumentException, so there is nothing to do but return early.
        return aggrColStatObjs;
    }
    int numProcessors = Runtime.getRuntime().availableProcessors();
    final int numThreads = Math.min(colStatsMap.size(), numProcessors);
    final ExecutorService pool = Executors.newFixedThreadPool(numThreads, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("aggr-col-stats-%d").build());
    final List<Future<ColumnStatisticsObj>> futures = new ArrayList<Future<ColumnStatisticsObj>>(colStatsMap.size());
    LOG.debug("Aggregating column stats. Threads used: {}", numThreads);
    long start = System.currentTimeMillis();
    for (final Entry<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> entry : colStatsMap.entrySet()) {
        futures.add(pool.submit(new Callable<ColumnStatisticsObj>() {

            @Override
            public ColumnStatisticsObj call() throws MetaException {
                List<ColStatsObjWithSourceInfo> colStatWithSourceInfo = entry.getValue();
                ColumnStatsAggregator aggregator = entry.getKey();
                try {
                    return aggregator.aggregate(colStatWithSourceInfo, partNames, areAllPartsFound);
                } catch (MetaException e) {
                    LOG.debug(e.getMessage());
                    throw e;
                }
            }
        }));
    }
    // No further tasks will be submitted; already-submitted ones keep running.
    pool.shutdown();
    for (Future<ColumnStatisticsObj> future : futures) {
        try {
            ColumnStatisticsObj aggregated = future.get();
            if (aggregated != null) {
                aggrColStatObjs.add(aggregated);
            }
        } catch (InterruptedException | ExecutionException e) {
            LOG.debug(e.getMessage());
            // Abandon the remaining aggregations; the result is being discarded anyway.
            pool.shutdownNow();
            if (e instanceof InterruptedException) {
                // Restore the interrupt flag so callers further up can observe it.
                Thread.currentThread().interrupt();
            }
            MetaException wrapped = new MetaException(e.toString());
            wrapped.initCause(e);
            throw wrapped;
        }
    }
    LOG.debug("Time for aggr col stats in seconds: {} Threads used: {}", ((System.currentTimeMillis() - (double) start)) / 1000, numThreads);
    return aggrColStatObjs;
}
Also used : ArrayList(java.util.ArrayList) Callable(java.util.concurrent.Callable) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatsAggregator(org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator) ExecutorService(java.util.concurrent.ExecutorService) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) Future(java.util.concurrent.Future) MachineList(org.apache.hadoop.util.MachineList) List(java.util.List) ArrayList(java.util.ArrayList) ExecutionException(java.util.concurrent.ExecutionException) MetaException(org.apache.hadoop.hive.metastore.api.MetaException)

Example 49 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class MetaStoreDirectSql method makeColumnStats.

/**
 * Builds a {@link ColumnStatistics} from raw query rows.
 *
 * <p>Each row is converted to a {@link ColumnStatisticsObj} via {@code prepareCSObj}.
 * While doing so, the last-analyzed value of {@code csd} is lowered to the smallest
 * non-null value found at index {@code offset + 15} of the rows, because that value is
 * stored per column whereas the descriptor holds a single one. {@code csd} is therefore
 * mutated by this call.
 *
 * @param list   rows to convert, one per column statistics object
 * @param csd    descriptor attached to the result; its last-analyzed value may be updated
 * @param offset index shift applied when reading each row
 * @return statistics holding {@code csd} and one stats object per row, in row order
 * @throws MetaException declared by the signature; presumably raised by the helpers
 *         called here (their definitions are not visible in this block)
 */
private ColumnStatistics makeColumnStats(List<Object[]> list, ColumnStatisticsDesc csd, int offset) throws MetaException {
    final List<ColumnStatisticsObj> statsObjs = new ArrayList<ColumnStatisticsObj>(list.size());
    for (Object[] row : list) {
        final Object lastAnalyzedCell = row[offset + 15];
        if (lastAnalyzedCell != null) {
            final long rowLastAnalyzed = extractSqlLong(lastAnalyzedCell);
            // Keep the lowest value seen so far (or take the first one).
            if (!csd.isSetLastAnalyzed() || csd.getLastAnalyzed() > rowLastAnalyzed) {
                csd.setLastAnalyzed(rowLastAnalyzed);
            }
        }
        statsObjs.add(prepareCSObj(row, offset));
        Deadline.checkTimeout();
    }
    final ColumnStatistics result = new ColumnStatistics();
    result.setStatsDesc(csd);
    result.setStatsObj(statsObjs);
    return result;
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) MPartitionColumnStatistics(org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics) MTableColumnStatistics(org.apache.hadoop.hive.metastore.model.MTableColumnStatistics) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ArrayList(java.util.ArrayList)

Example 50 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class HiveAlterHandler method alterTableUpdateTableColumnStats.

/**
 * Alters a table in the store and carries its table-level column statistics along.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>If the database name, the table name and the columns (as judged by
 *       {@code MetaStoreUtils.columnsIncludedByNameType}) are all unchanged, column
 *       statistics are left alone.</li>
 *   <li>Otherwise the statistics of all old columns are fetched. A stats object whose
 *       column name and type (compared case-insensitively) still exist in the new table
 *       is deleted and queued for re-insertion only when the database or table name
 *       changed; a stats object with no matching new column is deleted.</li>
 *   <li>{@code msdb.alterTable} is called with the new table definition.</li>
 *   <li>Queued stats objects, if any, are written back under the new database and
 *       table name.</li>
 * </ol>
 *
 * @param msdb     store used for all reads and writes
 * @param oldTable table definition before the alter
 * @param newTable table definition after the alter; its parameters are passed to
 *                 {@code StatsSetupConst.removeColumnStatsState} for the deleted columns
 * @throws MetaException declared by the signature
 * @throws InvalidObjectException declared by the signature; also thrown here when an
 *         {@link InvalidInputException} is caught
 */
@VisibleForTesting
void alterTableUpdateTableColumnStats(RawStore msdb, Table oldTable, Table newTable) throws MetaException, InvalidObjectException {
    // Database names are lower-cased; table names go through normalizeIdentifier.
    String dbName = oldTable.getDbName().toLowerCase();
    String tableName = org.apache.hadoop.hive.metastore.utils.StringUtils.normalizeIdentifier(oldTable.getTableName());
    String newDbName = newTable.getDbName().toLowerCase();
    String newTableName = org.apache.hadoop.hive.metastore.utils.StringUtils.normalizeIdentifier(newTable.getTableName());
    try {
        List<FieldSchema> oldCols = oldTable.getSd().getCols();
        List<FieldSchema> newCols = newTable.getSd().getCols();
        // Stats objects that must be re-inserted under the new db/table name.
        List<ColumnStatisticsObj> newStatsObjs = new ArrayList<>();
        ColumnStatistics colStats = null;
        boolean updateColumnStats = true;
        // Nothing to update if everything is the same
        if (newDbName.equals(dbName) && newTableName.equals(tableName) && MetaStoreUtils.columnsIncludedByNameType(oldCols, newCols)) {
            updateColumnStats = false;
        }
        if (updateColumnStats) {
            List<String> oldColNames = new ArrayList<>(oldCols.size());
            for (FieldSchema oldCol : oldCols) {
                oldColNames.add(oldCol.getName());
            }
            // Collect column stats which need to be rewritten and remove old stats
            colStats = msdb.getTableColumnStatistics(dbName, tableName, oldColNames);
            if (colStats == null) {
                // No stats returned for the old table: nothing to move or delete.
                updateColumnStats = false;
            } else {
                List<ColumnStatisticsObj> statsObjs = colStats.getStatsObj();
                if (statsObjs != null) {
                    // Names of columns whose stats were deleted from the old table.
                    List<String> deletedCols = new ArrayList<>();
                    for (ColumnStatisticsObj statsObj : statsObjs) {
                        // A stats object survives only if a new column has the same
                        // name and type, ignoring case.
                        boolean found = false;
                        for (FieldSchema newCol : newCols) {
                            if (statsObj.getColName().equalsIgnoreCase(newCol.getName()) && statsObj.getColType().equalsIgnoreCase(newCol.getType())) {
                                found = true;
                                break;
                            }
                        }
                        if (found) {
                            // Column still exists. Stats only need to move when the
                            // table was renamed or moved to another database.
                            if (!newDbName.equals(dbName) || !newTableName.equals(tableName)) {
                                msdb.deleteTableColumnStatistics(dbName, tableName, statsObj.getColName());
                                newStatsObjs.add(statsObj);
                                deletedCols.add(statsObj.getColName());
                            }
                        } else {
                            // Column was dropped or changed: its stats are deleted and
                            // not re-inserted.
                            msdb.deleteTableColumnStatistics(dbName, tableName, statsObj.getColName());
                            deletedCols.add(statsObj.getColName());
                        }
                    }
                    // NOTE(review): presumably clears the per-column stats state
                    // recorded in the table parameters - confirm in StatsSetupConst.
                    StatsSetupConst.removeColumnStatsState(newTable.getParameters(), deletedCols);
                }
            }
        }
        // Change to new table and append stats for the new table
        msdb.alterTable(dbName, tableName, newTable);
        if (updateColumnStats && !newStatsObjs.isEmpty()) {
            // Re-point the fetched stats at the new db/table and store the queued objects.
            ColumnStatisticsDesc statsDesc = colStats.getStatsDesc();
            statsDesc.setDbName(newDbName);
            statsDesc.setTableName(newTableName);
            colStats.setStatsObj(newStatsObjs);
            msdb.updateTableColumnStatistics(colStats);
        }
    } catch (NoSuchObjectException nsoe) {
        // Logged at debug level only; the exception is not propagated to the caller.
        LOG.debug("Could not find db entry." + nsoe);
    } catch (InvalidInputException e) {
        // should not happen since the input were verified before passed in
        throw new InvalidObjectException("Invalid inputs to update table column stats: " + e);
    }
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) InvalidInputException(org.apache.hadoop.hive.metastore.api.InvalidInputException) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) NoSuchObjectException(org.apache.hadoop.hive.metastore.api.NoSuchObjectException) InvalidObjectException(org.apache.hadoop.hive.metastore.api.InvalidObjectException) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)219 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)104 ArrayList (java.util.ArrayList)98 ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics)82 Test (org.junit.Test)79 ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc)68 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)43 Table (org.apache.hadoop.hive.metastore.api.Table)43 LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData)35 Partition (org.apache.hadoop.hive.metastore.api.Partition)35 List (java.util.List)34 BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData)30 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)29 StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor)29 HashMap (java.util.HashMap)28 SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo)28 DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData)27 StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData)25 MetaException (org.apache.hadoop.hive.metastore.api.MetaException)23 BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)22