
Example 1 with ColStatistics

Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

From the class DDLTask, method describeTable.

/**
 * Write the description of a table to a file.
 *
 * @param db
 *          The database in question.
 * @param descTbl
 *          This is the table we're interested in.
 * @return 0 when execution succeeds and a value above 0 if it fails.
 * @throws HiveException
 *           Thrown if an unexpected error occurs.
 * @throws MetaException
 *           Thrown if a metastore error occurs.
 */
private int describeTable(Hive db, DescTableDesc descTbl) throws HiveException, MetaException {
    String colPath = descTbl.getColumnPath();
    String tableName = descTbl.getTableName();
    // describe the table - populate the output stream
    Table tbl = db.getTable(tableName, false);
    if (tbl == null) {
        throw new HiveException(ErrorMsg.INVALID_TABLE, tableName);
    }
    Partition part = null;
    if (descTbl.getPartSpec() != null) {
        part = db.getPartition(tbl, descTbl.getPartSpec(), false);
        if (part == null) {
            throw new HiveException(ErrorMsg.INVALID_PARTITION, StringUtils.join(descTbl.getPartSpec().keySet(), ','), tableName);
        }
        tbl = part.getTable();
    }
    DataOutputStream outStream = getOutputStream(descTbl.getResFile());
    try {
        LOG.debug("DDLTask: got data for {}", tableName);
        List<FieldSchema> cols = null;
        List<ColumnStatisticsObj> colStats = null;
        Deserializer deserializer = tbl.getDeserializer(true);
        if (deserializer instanceof AbstractSerDe) {
            String errorMsgs = ((AbstractSerDe) deserializer).getConfigurationErrors();
            if (errorMsgs != null && !errorMsgs.isEmpty()) {
                throw new SQLException(errorMsgs);
            }
        }
        if (colPath.equals(tableName)) {
            cols = (part == null || tbl.getTableType() == TableType.VIRTUAL_VIEW) ? tbl.getCols() : part.getCols();
            if (!descTbl.isFormatted()) {
                cols.addAll(tbl.getPartCols());
            }
            if (tbl.isPartitioned() && part == null) {
                // No partition specified for a partitioned table; aggregate basic stats across all partitions.
                Map<String, String> tblProps = tbl.getParameters() == null ? new HashMap<String, String>() : tbl.getParameters();
                Map<String, Long> valueMap = new HashMap<>();
                Map<String, Boolean> stateMap = new HashMap<>();
                for (String stat : StatsSetupConst.supportedStats) {
                    valueMap.put(stat, 0L);
                    stateMap.put(stat, true);
                }
                PartitionIterable parts = new PartitionIterable(db, tbl, null, conf.getIntVar(HiveConf.ConfVars.METASTORE_BATCH_RETRIEVE_MAX));
                int numParts = 0;
                for (Partition partition : parts) {
                    Map<String, String> props = partition.getParameters();
                    Boolean state = StatsSetupConst.areBasicStatsUptoDate(props);
                    for (String stat : StatsSetupConst.supportedStats) {
                        stateMap.put(stat, stateMap.get(stat) && state);
                        if (props != null && props.get(stat) != null) {
                            valueMap.put(stat, valueMap.get(stat) + Long.parseLong(props.get(stat)));
                        }
                    }
                    numParts++;
                }
                for (String stat : StatsSetupConst.supportedStats) {
                    StatsSetupConst.setBasicStatsState(tblProps, Boolean.toString(stateMap.get(stat)));
                    tblProps.put(stat, valueMap.get(stat).toString());
                }
                tblProps.put(StatsSetupConst.NUM_PARTITIONS, Integer.toString(numParts));
                tbl.setParameters(tblProps);
            }
        } else {
            if (descTbl.isFormatted()) {
                // when a column name is specified in the DESCRIBE TABLE DDL, colPath
                // will be table_name.column_name
                String colName = colPath.split("\\.")[1];
                String[] dbTab = Utilities.getDbTableName(tableName);
                List<String> colNames = new ArrayList<String>();
                colNames.add(colName.toLowerCase());
                if (null == part) {
                    if (tbl.isPartitioned()) {
                        Map<String, String> tblProps = tbl.getParameters() == null ? new HashMap<String, String>() : tbl.getParameters();
                        if (tbl.isPartitionKey(colNames.get(0))) {
                            FieldSchema partCol = tbl.getPartColByName(colNames.get(0));
                            cols = Collections.singletonList(partCol);
                            PartitionIterable parts = new PartitionIterable(db, tbl, null, conf.getIntVar(HiveConf.ConfVars.METASTORE_BATCH_RETRIEVE_MAX));
                            ColumnInfo ci = new ColumnInfo(partCol.getName(), TypeInfoUtils.getTypeInfoFromTypeString(partCol.getType()), null, false);
                            ColStatistics cs = StatsUtils.getColStatsForPartCol(ci, parts, conf);
                            ColumnStatisticsData data = new ColumnStatisticsData();
                            ColStatistics.Range r = cs.getRange();
                            StatObjectConverter.fillColumnStatisticsData(partCol.getType(), data,
                                    r == null ? null : r.minValue, r == null ? null : r.maxValue,
                                    r == null ? null : r.minValue, r == null ? null : r.maxValue,
                                    r == null ? null : r.minValue.toString(), r == null ? null : r.maxValue.toString(),
                                    cs.getNumNulls(), cs.getCountDistint(), null,
                                    cs.getAvgColLen(), cs.getAvgColLen(), cs.getNumTrues(), cs.getNumFalses());
                            ColumnStatisticsObj cso = new ColumnStatisticsObj(partCol.getName(), partCol.getType(), data);
                            colStats = Collections.singletonList(cso);
                            StatsSetupConst.setColumnStatsState(tblProps, colNames);
                        } else {
                            cols = Hive.getFieldsFromDeserializer(colPath, deserializer);
                            List<String> parts = db.getPartitionNames(dbTab[0].toLowerCase(), dbTab[1].toLowerCase(), (short) -1);
                            AggrStats aggrStats = db.getAggrColStatsFor(dbTab[0].toLowerCase(), dbTab[1].toLowerCase(), colNames, parts);
                            colStats = aggrStats.getColStats();
                            if (parts.size() == aggrStats.getPartsFound()) {
                                StatsSetupConst.setColumnStatsState(tblProps, colNames);
                            } else {
                                StatsSetupConst.removeColumnStatsState(tblProps, colNames);
                            }
                        }
                        tbl.setParameters(tblProps);
                    } else {
                        cols = Hive.getFieldsFromDeserializer(colPath, deserializer);
                        colStats = db.getTableColumnStatistics(dbTab[0].toLowerCase(), dbTab[1].toLowerCase(), colNames);
                    }
                } else {
                    List<String> partitions = new ArrayList<String>();
                    partitions.add(part.getName());
                    cols = Hive.getFieldsFromDeserializer(colPath, deserializer);
                    colStats = db.getPartitionColumnStatistics(dbTab[0].toLowerCase(), dbTab[1].toLowerCase(), partitions, colNames).get(part.getName());
                }
            } else {
                cols = Hive.getFieldsFromDeserializer(colPath, deserializer);
            }
        }
        PrimaryKeyInfo pkInfo = null;
        ForeignKeyInfo fkInfo = null;
        UniqueConstraint ukInfo = null;
        NotNullConstraint nnInfo = null;
        DefaultConstraint dInfo = null;
        CheckConstraint cInfo = null;
        if (descTbl.isExt() || descTbl.isFormatted()) {
            pkInfo = db.getPrimaryKeys(tbl.getDbName(), tbl.getTableName());
            fkInfo = db.getForeignKeys(tbl.getDbName(), tbl.getTableName());
            ukInfo = db.getUniqueConstraints(tbl.getDbName(), tbl.getTableName());
            nnInfo = db.getNotNullConstraints(tbl.getDbName(), tbl.getTableName());
            dInfo = db.getDefaultConstraints(tbl.getDbName(), tbl.getTableName());
            cInfo = db.getCheckConstraints(tbl.getDbName(), tbl.getTableName());
        }
        fixDecimalColumnTypeName(cols);
        // In case the query is served by HiveServer2, don't pad it with spaces,
        // as HiveServer2 output is consumed by JDBC/ODBC clients.
        boolean isOutputPadded = !SessionState.get().isHiveServerQuery();
        formatter.describeTable(outStream, colPath, tableName, tbl, part, cols, descTbl.isFormatted(), descTbl.isExt(), isOutputPadded, colStats, pkInfo, fkInfo, ukInfo, nnInfo, dInfo, cInfo);
        LOG.debug("DDLTask: written data for {}", tableName);
    } catch (SQLException e) {
        throw new HiveException(e, ErrorMsg.GENERIC_ERROR, tableName);
    } finally {
        IOUtils.closeStream(outStream);
    }
    return 0;
}
Also used : HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) SQLException(java.sql.SQLException) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) DataOutputStream(java.io.DataOutputStream) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) SQLUniqueConstraint(org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint) UniqueConstraint(org.apache.hadoop.hive.ql.metadata.UniqueConstraint) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) SQLDefaultConstraint(org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint) DefaultConstraint(org.apache.hadoop.hive.ql.metadata.DefaultConstraint) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) PrimaryKeyInfo(org.apache.hadoop.hive.ql.metadata.PrimaryKeyInfo) ForeignKeyInfo(org.apache.hadoop.hive.ql.metadata.ForeignKeyInfo) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) NotNullConstraint(org.apache.hadoop.hive.ql.metadata.NotNullConstraint) SQLNotNullConstraint(org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint) CheckConstraint(org.apache.hadoop.hive.ql.metadata.CheckConstraint) SQLCheckConstraint(org.apache.hadoop.hive.metastore.api.SQLCheckConstraint) Partition(org.apache.hadoop.hive.ql.metadata.Partition) AlterTableExchangePartition(org.apache.hadoop.hive.ql.plan.AlterTableExchangePartition) TextMetaDataTable(org.apache.hadoop.hive.ql.metadata.formatting.TextMetaDataTable) Table(org.apache.hadoop.hive.ql.metadata.Table) PartitionIterable(org.apache.hadoop.hive.ql.metadata.PartitionIterable) Deserializer(org.apache.hadoop.hive.serde2.Deserializer) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)
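
The null-guarded handling of ColStatistics.Range in the fillColumnStatisticsData call above is easy to get wrong, since a column may have no recorded range at all. A minimal sketch of the same pattern, pulled out as a standalone helper (the helper itself is illustrative, not part of Hive; ColStatistics.Range exposes its bounds as the public Number fields minValue and maxValue, as Example 3 below also shows):

// Illustrative helper, not part of Hive: mirrors the null-guarded Range
// access used in describeTable above.
private static String describeRange(ColStatistics cs) {
    ColStatistics.Range r = cs.getRange();
    if (r == null) {
        // No range recorded for this column, so every min/max argument
        // passed to StatObjectConverter.fillColumnStatisticsData must be null.
        return cs.getColumnName() + ": range unknown";
    }
    return cs.getColumnName() + ": [" + r.minValue + ", " + r.maxValue + "]";
}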

Example 2 with ColStatistics

Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

From the class RelOptHiveTable, method getColStat.

/**
 * Note: DOES NOT CHECK txn stats.
 */
public List<ColStatistics> getColStat(List<Integer> projIndxLst, boolean allowMissingStats) {
    List<ColStatistics> colStatsBldr = Lists.newArrayList();
    Set<Integer> projIndxSet = new HashSet<>(projIndxLst);
    for (Integer i : projIndxLst) {
        if (i >= noOfNonVirtualCols) {
            projIndxSet.remove(i);
        } else if (hiveColStatsMap.get(i) != null) {
            colStatsBldr.add(hiveColStatsMap.get(i));
            projIndxSet.remove(i);
        }
    }
    if (!projIndxSet.isEmpty()) {
        LOG.info("Calculating column statistics for {}, projIndxSet: {}, allowMissingStats: {}", name, projIndxLst, allowMissingStats);
        updateColStats(projIndxSet, allowMissingStats);
        for (Integer i : projIndxSet) {
            colStatsBldr.add(hiveColStatsMap.get(i));
        }
    }
    return colStatsBldr;
}
Also used : AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) HashSet(java.util.HashSet)
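
The method serves cached entries from hiveColStatsMap first, drops virtual-column indexes, and only computes statistics for the remaining misses via updateColStats. Example 3 below shows the typical call shape, requesting statistics for every non-virtual column at once:

// Call shape taken from HiveRelJsonImpl.explain_ (Example 3): request stats
// for all non-virtual columns, allowing missing stats rather than failing.
List<ColStatistics> colStats =
    table.getColStat(ImmutableBitSet.range(0, table.getNoOfNonVirtualCols()).asList(), true);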

Example 3 with ColStatistics

Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

From the class HiveRelJsonImpl, method explain_.

// ~ Methods ------------------------------------------------------------------
@Override
protected void explain_(RelNode rel, List<Pair<String, Object>> values) {
    super.explain_(rel, values);
    RelMetadataQuery mq = rel.getCluster().getMetadataQuery();
    Map<String, Object> map = (Map<String, Object>) relList.get(relList.size() - 1);
    map.put("rowCount", mq.getRowCount(rel));
    if (rel.getInputs().size() == 0) {
        // This is a leaf, we will print the average row size and schema
        map.put("avgRowSize", mq.getAverageRowSize(rel));
        map.put("rowType", relJson.toJson(rel.getRowType()));
        // We also include partition columns information
        RelOptHiveTable table = (RelOptHiveTable) rel.getTable();
        List<Object> list = jsonBuilder.list();
        list.addAll(table.getHiveTableMD().getPartColNames());
        if (!list.isEmpty()) {
            map.put("partitionColumns", list);
        }
        // We also include column stats
        List<ColStatistics> colStats = table.getColStat(ImmutableBitSet.range(0, table.getNoOfNonVirtualCols()).asList(), true);
        list = jsonBuilder.list();
        for (ColStatistics cs : colStats) {
            final Map<String, Object> csMap = jsonBuilder.map();
            csMap.put("name", cs.getColumnName());
            csMap.put("ndv", cs.getCountDistint());
            if (cs.getRange() != null) {
                csMap.put("minValue", cs.getRange().minValue);
                csMap.put("maxValue", cs.getRange().maxValue);
            }
            list.add(csMap);
        }
        if (!list.isEmpty()) {
            map.put("colStats", list);
        }
    }
}
Also used : RelMetadataQuery(org.apache.calcite.rel.metadata.RelMetadataQuery) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) Map(java.util.Map)
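
Each entry appended to the "colStats" array above carries the keys name, ndv and, when a range is known, minValue and maxValue. For an integer column the emitted JSON fragment therefore looks roughly like this (the column name and all values are illustrative):

{"name": "ss_sold_date_sk", "ndv": 1823, "minValue": 2450816, "maxValue": 2452642}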

Example 4 with ColStatistics

Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

From the class TezCompiler, method markSemiJoinForDPP.

private void markSemiJoinForDPP(OptimizeTezProcContext procCtx) throws SemanticException {
    // Stores the Tablescan operators processed to avoid redoing them.
    Map<ReduceSinkOperator, SemiJoinBranchInfo> map = procCtx.parseContext.getRsToSemiJoinBranchInfo();
    for (ReduceSinkOperator rs : map.keySet()) {
        SemiJoinBranchInfo sjInfo = map.get(rs);
        TableScanOperator ts = sjInfo.getTsOp();
        if (sjInfo.getIsHint() || !sjInfo.getShouldRemove()) {
            continue;
        }
        // A TS can have multiple branches due to DPP or semijoin optimization.
        // Use DFS to traverse all the branches until RS or DPP is hit.
        Deque<Operator<?>> deque = new LinkedList<>();
        deque.add(ts);
        while (!deque.isEmpty()) {
            Operator<?> op = deque.pollLast();
            if (op instanceof AppMasterEventOperator && ((AppMasterEventOperator) op).getConf() instanceof DynamicPruningEventDesc) {
                // DPP. Now look up nDVs on both sides to see the selectivity.
                // <Parent Ops>-SEL-GB1-RS1-GB2-RS2
                SelectOperator selOp = OperatorUtils.ancestor(rs, SelectOperator.class, 0, 0, 0, 0);
                try {
                    // Get nDVs on Semijoin edge side
                    Statistics stats = selOp.getStatistics();
                    if (stats == null) {
                        // No stats found on semijoin edge, do nothing
                        break;
                    }
                    String selCol = ExprNodeDescUtils.extractColName(selOp.getConf().getColList().get(0));
                    ColStatistics colStatisticsSJ = stats.getColumnStatisticsFromColName(selCol);
                    if (colStatisticsSJ == null) {
                        // No column stats found for semijoin edge
                        break;
                    }
                    long nDVs = colStatisticsSJ.getCountDistint();
                    if (nDVs > 0) {
                        // Lookup nDVs on TS side.
                        RuntimeValuesInfo rti = procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
                        // TODO Handle multi column semi-joins as part of HIVE-23934
                        ExprNodeDesc tsExpr = rti.getTargetColumns().get(0);
                        FilterOperator fil = (FilterOperator) (ts.getChildOperators().get(0));
                        Statistics filStats = fil.getStatistics();
                        if (filStats == null) {
                            // No stats found on target, do nothing
                            break;
                        }
                        String colName = ExprNodeDescUtils.extractColName(tsExpr);
                        ColStatistics colStatisticsTarget = filStats.getColumnStatisticsFromColName(colName);
                        if (colStatisticsTarget == null) {
                            // No column stats found on target
                            break;
                        }
                        long nDVsOfTS = colStatisticsTarget.getCountDistint();
                        double nDVsOfTSFactored = nDVsOfTS * procCtx.conf.getFloatVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_FOR_DPP_FACTOR);
                        if ((long) nDVsOfTSFactored > nDVs) {
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("nDVs = " + nDVs + ", nDVsOfTS = " + nDVsOfTS + " and nDVsOfTSFactored = " + nDVsOfTSFactored + "Adding semijoin branch from ReduceSink " + rs + " to TS " + sjInfo.getTsOp());
                            }
                            sjInfo.setShouldRemove(false);
                        }
                    }
                } catch (NullPointerException e) {
                    // Do nothing
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Caught NPE in markSemiJoinForDPP from ReduceSink " + rs + " to TS " + sjInfo.getTsOp());
                    }
                }
                break;
            }
            if (op instanceof TerminalOperator) {
                // Done with this branch
                continue;
            }
            deque.addAll(op.getChildOperators());
        }
    }
}
Also used : CommonMergeJoinOperator(org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) TezDummyStoreOperator(org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) TopNKeyOperator(org.apache.hadoop.hive.ql.exec.TopNKeyOperator) TerminalOperator(org.apache.hadoop.hive.ql.exec.TerminalOperator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) AnnotateWithStatistics(org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) LinkedList(java.util.LinkedList) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc)
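
The decision above reduces to a single comparison: the semijoin branch is kept (setShouldRemove(false)) only when the target column's nDV count, scaled by TEZ_DYNAMIC_SEMIJOIN_REDUCTION_FOR_DPP_FACTOR, still exceeds the nDV count on the semijoin edge, that is, when the semijoin is expected to filter noticeably beyond what DPP already provides. A standalone restatement of that arithmetic (the method is illustrative, not part of Hive):

// Illustrative restatement of the selectivity test in markSemiJoinForDPP.
// Mirrors: nDVsOfTSFactored = nDVsOfTS * factor; keep the semijoin when
// the factored target nDVs still exceed the nDVs on the semijoin edge.
private static boolean keepSemiJoinBesideDPP(long nDVsSemiJoinEdge, long nDVsTableScan, float dppFactor) {
    double nDVsOfTSFactored = nDVsTableScan * dppFactor;
    return (long) nDVsOfTSFactored > nDVsSemiJoinEdge;
}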

Example 5 with ColStatistics

Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

From the class StatsUtils, method extractColumnStates.

private static List<String> extractColumnStates(Table table, List<String> columns, ColumnStatsList colStatsCache, List<ColStatistics> columnStats) {
    if (colStatsCache == null) {
        return columns;
    }
    List<String> neededColsToRetrieve = new ArrayList<>(columns.size());
    for (String colName : columns) {
        ColStatistics colStats = colStatsCache.getColStats().get(colName);
        if (colStats == null) {
            neededColsToRetrieve.add(colName);
            LOG.debug("Stats for column {} in table {} could not be retrieved from cache", colName, table.getCompleteName());
        } else {
            columnStats.add(colStats);
            LOG.debug("Stats for column {} in table {} retrieved from cache", colName, table.getCompleteName());
        }
    }
    return neededColsToRetrieve;
}
Also used : ArrayList(java.util.ArrayList) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)
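
The method is a small cache-partitioning helper: statistics already present in colStatsCache are appended to the caller-supplied columnStats list, while the returned list names the columns that still require a metastore lookup. A sketch of the call shape (extractColumnStates is private to StatsUtils, so this is illustrative only):

// Cached stats accumulate in columnStats; the return value lists the
// columns whose stats must still be fetched from the metastore.
List<ColStatistics> columnStats = new ArrayList<>();
List<String> colsToFetch = extractColumnStates(table, requestedColumns, colStatsCache, columnStats);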

Aggregations

ColStatistics (org.apache.hadoop.hive.ql.plan.ColStatistics): 42 uses
ArrayList (java.util.ArrayList): 14 uses
Statistics (org.apache.hadoop.hive.ql.plan.Statistics): 8 uses
HashSet (java.util.HashSet): 5 uses
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 5 uses
HashMap (java.util.HashMap): 4 uses
ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj): 4 uses
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 4 uses
ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc): 4 uses
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 4 uses
List (java.util.List): 3 uses
ImmutableBitSet (org.apache.calcite.util.ImmutableBitSet): 3 uses
AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats): 3 uses
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 3 uses
RelMetadataQuery (org.apache.calcite.rel.metadata.RelMetadataQuery): 2 uses
ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData): 2 uses
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 2 uses
PartitionIterable (org.apache.hadoop.hive.ql.metadata.PartitionIterable): 2 uses
AnnotateWithStatistics (org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics): 2 uses
ColumnStatsList (org.apache.hadoop.hive.ql.parse.ColumnStatsList): 2 uses