Search in sources :

Example 16 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

In the class StatsUtils: method getColStatisticsFromExpression.

/**
   * Get column statistics for a single expression node, derived from the parent
   * operator's statistics.
   * @param conf
   *          - hive conf
   * @param parentStats
   *          - parent statistics
   * @param end
   *          - expression node
   * @return column statistics, or null if no statistics can be derived
   */
public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statistics parentStats, ExprNodeDesc end) {
    if (end == null) {
        return null;
    }
    String colName = null;
    String colType = null;
    double avgColSize = 0;
    long countDistincts = 0;
    long numNulls = 0;
    ObjectInspector oi = end.getWritableObjectInspector();
    long numRows = parentStats.getNumRows();
    if (end instanceof ExprNodeColumnDesc) {
        // column projection
        ExprNodeColumnDesc encd = (ExprNodeColumnDesc) end;
        colName = encd.getColumn();
        if (encd.getIsPartitionColOrVirtualCol()) {
            ColStatistics colStats = parentStats.getColumnStatisticsFromColName(colName);
            if (colStats != null) {
                /* If statistics for the column already exist use it. */
                return cloneColStats(colStats);
            }
            // virtual columns: no stored stats; assume every row is distinct
            colType = encd.getTypeInfo().getTypeName();
            countDistincts = numRows;
        } else {
            // plain column reference: clone the parent's column stats and return
            ColStatistics result = parentStats.getColumnStatisticsFromColName(colName);
            if (result != null) {
                return cloneColStats(result);
            }
            return null;
        }
    } else if (end instanceof ExprNodeConstantDesc) {
        // constant projection
        ExprNodeConstantDesc encd = (ExprNodeConstantDesc) end;
        colName = encd.getName();
        colType = encd.getTypeString();
        if (encd.getValue() == null) {
            // null projection: every row carries a null
            numNulls = numRows;
        } else {
            // a non-null constant has exactly one distinct value
            countDistincts = 1;
        }
    } else if (end instanceof ExprNodeGenericFuncDesc) {
        ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc) end;
        colName = engfd.getName();
        colType = engfd.getTypeString();
        // If it is a widening cast, we do not change NDV, min, max
        if (isWideningCast(engfd) && engfd.getChildren().get(0) instanceof ExprNodeColumnDesc) {
            // cast on single column
            ColStatistics stats = parentStats.getColumnStatisticsFromColName(engfd.getCols().get(0));
            if (stats != null) {
                ColStatistics newStats = cloneColStats(stats);
                if (newStats == null) {
                    return null;
                }
                newStats.setColumnName(colName);
                colType = colType.toLowerCase();
                newStats.setColumnType(colType);
                newStats.setAvgColLen(getAvgColLenOf(conf, oi, colType));
                return newStats;
            }
        }
        // fallback: estimate NDV from the function's inputs
        countDistincts = getNDVFor(engfd, numRows, parentStats);
    } else if (end instanceof ExprNodeColumnListDesc) {
        // column list
        ExprNodeColumnListDesc encd = (ExprNodeColumnListDesc) end;
        colName = Joiner.on(",").join(encd.getCols());
        colType = serdeConstants.LIST_TYPE_NAME;
        countDistincts = numRows;
    } else if (end instanceof ExprNodeFieldDesc) {
        // field within complex type
        ExprNodeFieldDesc enfd = (ExprNodeFieldDesc) end;
        colName = enfd.getFieldName();
        colType = enfd.getTypeString();
        countDistincts = numRows;
    } else {
        throw new IllegalArgumentException("not supported expr type " + end.getClass());
    }
    colType = colType.toLowerCase();
    avgColSize = getAvgColLenOf(conf, oi, colType);
    ColStatistics colStats = new ColStatistics(colName, colType);
    colStats.setAvgColLen(avgColSize);
    colStats.setCountDistint(countDistincts);
    colStats.setNumNulls(numNulls);
    return colStats;
}

/**
 * Clone the given column statistics. Returns null (after logging a warning)
 * if cloning is unsupported; callers treat that as "no statistics available".
 */
private static ColStatistics cloneColStats(ColStatistics stats) {
    try {
        return stats.clone();
    } catch (CloneNotSupportedException e) {
        LOG.warn("error cloning stats, this should not happen");
        return null;
    }
}
Also used : WritableIntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector) WritableByteObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableByteObjectInspector) UnionObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector) WritableDoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StandardListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) StandardConstantListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StandardConstantListObjectInspector) StandardConstantMapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StandardConstantMapObjectInspector) WritableStringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector) HiveVarcharObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector) HiveCharObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector) WritableBooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBooleanObjectInspector) WritableBinaryObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBinaryObjectInspector) WritableTimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector) StandardConstantStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StandardConstantStructObjectInspector) WritableShortObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableShortObjectInspector) 
StandardMapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector) BinaryObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector) WritableFloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableFloatObjectInspector) WritableLongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector) WritableDateObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDateObjectInspector) ConstantObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector) WritableHiveDecimalObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveDecimalObjectInspector) ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) ExprNodeColumnListDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc) ExprNodeFieldDesc(org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc)

Example 17 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

In the class StatsUtils: method getDataSizeFromColumnStats.

/**
   * Compute raw data size from column statistics
   * @param numRows
   *          - number of rows
   * @param colStats
   *          - column statistics
   * @return raw data size
   */
public static long getDataSizeFromColumnStats(long numRows, List<ColStatistics> colStats) {
    if (numRows <= 0 || colStats == null) {
        return 0;
    }
    if (colStats.isEmpty()) {
        // in such a case we estimate empty row to be of size of empty java object.
        return numRows * JavaDataModel.JAVA64_REF;
    }
    long total = 0;
    for (ColStatistics stat : colStats) {
        if (stat == null) {
            continue;
        }
        String typeLower = stat.getColumnType().toLowerCase();
        // all null cells together are counted as one extra value, hence the +1
        long valueCount = stat.getNumNulls() > 0 ? numRows - stat.getNumNulls() + 1 : numRows;
        double perValueSize;
        if (typeLower.equals(serdeConstants.STRING_TYPE_NAME) || typeLower.startsWith(serdeConstants.VARCHAR_TYPE_NAME) || typeLower.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
            // character data: java object overhead on top of the average length
            perValueSize = JavaDataModel.get().lengthForStringOfLength((int) Math.round(stat.getAvgColLen()));
        } else if (typeLower.equals(serdeConstants.BINARY_TYPE_NAME)) {
            perValueSize = JavaDataModel.get().lengthForByteArrayOfSize((int) Math.round(stat.getAvgColLen()));
        } else if (typeLower.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) {
            perValueSize = JavaDataModel.get().lengthOfTimestamp();
        } else if (typeLower.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
            perValueSize = JavaDataModel.get().lengthOfDecimal();
        } else if (typeLower.equals(serdeConstants.DATE_TYPE_NAME)) {
            perValueSize = JavaDataModel.get().lengthOfDate();
        } else {
            // fixed-width primitives and any other type: use the recorded average length
            perValueSize = stat.getAvgColLen();
        }
        total = safeAdd(total, safeMult(valueCount, perValueSize));
    }
    return total;
}
Also used : ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Example 18 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

In the class StatsUtils: method getColStatisticsFromExprMap.

/**
   * Get column statistics from parent statistics.
   * @param conf
   *          - hive conf
   * @param parentStats
   *          - parent statistics
   * @param colExprMap
   *          - column expression map
   * @param rowSchema
   *          - row schema
   * @return column statistics
   */
public static List<ColStatistics> getColStatisticsFromExprMap(HiveConf conf, Statistics parentStats, Map<String, ExprNodeDesc> colExprMap, RowSchema rowSchema) {
    List<ColStatistics> cs = Lists.newArrayList();
    if (colExprMap == null || rowSchema == null) {
        // no expression mapping available: pass the parent's column stats
        // through unchanged, keeping their internal names
        if (parentStats.getColumnStats() != null) {
            cs.addAll(parentStats.getColumnStats());
        }
        return cs;
    }
    for (ColumnInfo ci : rowSchema.getSignature()) {
        String outColName = ci.getInternalName();
        ExprNodeDesc end = colExprMap.get(outColName);
        ColStatistics colStat = getColStatisticsFromExpression(conf, parentStats, end);
        if (colStat != null) {
            colStat.setColumnName(outColName);
            cs.add(colStat);
        }
    }
    // sometimes RowSchema is empty, so also fetch stats for columns that only
    // appear in the expression map
    for (Entry<String, ExprNodeDesc> pair : colExprMap.entrySet()) {
        if (rowSchema.getColumnInfo(pair.getKey()) == null) {
            ColStatistics colStat = getColStatisticsFromExpression(conf, parentStats, pair.getValue());
            if (colStat != null) {
                colStat.setColumnName(pair.getKey());
                cs.add(colStat);
            }
        }
    }
    return cs;
}
Also used : ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc)

Example 19 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

In the class StatsUtils: method getNDVFor.

/**
 * Estimate the number of distinct values produced by a generic UDF expression.
 * The result is capped by the row count and by the UDF's declared maximum NDV
 * (via the {@code @NDV} annotation), when present.
 */
private static long getNDVFor(ExprNodeGenericFuncDesc engfd, long numRows, Statistics parentStats) {
    GenericUDF udf = engfd.getGenericUDF();
    if (!FunctionRegistry.isDeterministic(udf)) {
        // a non-deterministic function may yield a distinct value per row
        return numRows;
    }
    // for bridged UDFs the annotation lives on the wrapped implementation class
    Class<?> implClass = udf instanceof GenericUDFBridge ? ((GenericUDFBridge) udf).getUdfClass() : udf.getClass();
    NDV annotation = AnnotationUtils.getAnnotation(implClass, NDV.class);
    long udfMaxNdv = Long.MAX_VALUE;
    List<Long> childNdvs = Lists.newArrayList();
    if (annotation == null) {
        // no declared bound: collect the NDVs of the input columns
        for (String col : engfd.getCols()) {
            ColStatistics colStat = parentStats.getColumnStatisticsFromColName(col);
            if (colStat != null) {
                childNdvs.add(colStat.getCountDistint());
            }
        }
    } else {
        udfMaxNdv = annotation.maxNdv();
    }
    long combined = childNdvs.isEmpty() ? numRows : addWithExpDecay(childNdvs);
    return Collections.min(Lists.newArrayList(combined, udfMaxNdv, numRows));
}
Also used : NDV(org.apache.hadoop.hive.ql.udf.generic.NDV) GenericUDF(org.apache.hadoop.hive.ql.udf.generic.GenericUDF) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) GenericUDFBridge(org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge)

Example 20 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

In the class RelOptHiveTable: method updateColStats.

/**
 * Fetches and caches column statistics (into hiveColStatsMap) for the
 * projected column indexes that do not yet have cached stats. Also sets
 * rowCount as a side effect when stats are collected for a partitioned or
 * unpartitioned table.
 *
 * @param projIndxLst
 *          column indexes whose stats are required
 * @param allowNullColumnForMissingStats
 *          if true, missing stats only produce a warning; otherwise a
 *          RuntimeException is thrown
 */
private void updateColStats(Set<Integer> projIndxLst, boolean allowNullColumnForMissingStats) {
    List<String> nonPartColNamesThatRqrStats = new ArrayList<String>();
    List<Integer> nonPartColIndxsThatRqrStats = new ArrayList<Integer>();
    List<String> partColNamesThatRqrStats = new ArrayList<String>();
    List<Integer> partColIndxsThatRqrStats = new ArrayList<Integer>();
    Set<String> colNamesFailedStats = new HashSet<String>();
    // 1. Separate required columns to Non Partition and Partition Cols
    ColumnInfo tmp;
    for (Integer pi : projIndxLst) {
        // only fetch stats for columns not already cached in hiveColStatsMap
        if (hiveColStatsMap.get(pi) == null) {
            if ((tmp = hiveNonPartitionColsMap.get(pi)) != null) {
                nonPartColNamesThatRqrStats.add(tmp.getInternalName());
                nonPartColIndxsThatRqrStats.add(pi);
            } else if ((tmp = hivePartitionColsMap.get(pi)) != null) {
                partColNamesThatRqrStats.add(tmp.getInternalName());
                partColIndxsThatRqrStats.add(pi);
            } else {
                // index belongs to neither map — metadata is inconsistent
                noColsMissingStats.getAndIncrement();
                String logMsg = "Unable to find Column Index: " + pi + ", in " + hiveTblMetadata.getCompleteName();
                LOG.error(logMsg);
                throw new RuntimeException(logMsg);
            }
        }
    }
    if (null == partitionList) {
        // We could be here either because its an unpartitioned table or because
        // there are no pruning predicates on a partitioned table.
        computePartitionList(hiveConf, null, new HashSet<Integer>());
    }
    // 2. Obtain Col Stats for Non Partition Cols
    if (nonPartColNamesThatRqrStats.size() > 0) {
        List<ColStatistics> hiveColStats;
        if (!hiveTblMetadata.isPartitioned()) {
            // 2.1 Handle the case for unpartitioned table.
            hiveColStats = StatsUtils.getTableColumnStats(hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats);
            // 2.1.1 Record Column Names that we needed stats for but couldn't
            if (hiveColStats == null) {
                colNamesFailedStats.addAll(nonPartColNamesThatRqrStats);
            } else if (hiveColStats.size() != nonPartColNamesThatRqrStats.size()) {
                // partial result: record exactly which columns came back missing
                Set<String> setOfFiledCols = new HashSet<String>(nonPartColNamesThatRqrStats);
                Set<String> setOfObtainedColStats = new HashSet<String>();
                for (ColStatistics cs : hiveColStats) {
                    setOfObtainedColStats.add(cs.getColumnName());
                }
                setOfFiledCols.removeAll(setOfObtainedColStats);
                colNamesFailedStats.addAll(setOfFiledCols);
            } else {
                // Column stats in hiveColStats might not be in the same order as the columns in
                // nonPartColNamesThatRqrStats. reorder hiveColStats so we can build hiveColStatsMap
                // using nonPartColIndxsThatRqrStats as below
                Map<String, ColStatistics> columnStatsMap = new HashMap<String, ColStatistics>(hiveColStats.size());
                for (ColStatistics cs : hiveColStats) {
                    columnStatsMap.put(cs.getColumnName(), cs);
                }
                hiveColStats.clear();
                for (String colName : nonPartColNamesThatRqrStats) {
                    hiveColStats.add(columnStatsMap.get(colName));
                }
            }
        } else {
            // 2.2 Obtain col stats for partitioned table.
            try {
                if (partitionList.getNotDeniedPartns().isEmpty()) {
                    // no need to make a metastore call
                    rowCount = 0;
                    hiveColStats = new ArrayList<ColStatistics>();
                    for (String c : nonPartColNamesThatRqrStats) {
                        // add empty stats object for each column
                        hiveColStats.add(new ColStatistics(c, null));
                    }
                    colNamesFailedStats.clear();
                } else {
                    Statistics stats = StatsUtils.collectStatistics(hiveConf, partitionList, hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats, nonPartColNamesThatRqrStats, true, true);
                    rowCount = stats.getNumRows();
                    hiveColStats = new ArrayList<ColStatistics>();
                    for (String c : nonPartColNamesThatRqrStats) {
                        ColStatistics cs = stats.getColumnStatisticsFromColName(c);
                        if (cs != null) {
                            hiveColStats.add(cs);
                        } else {
                            colNamesFailedStats.add(c);
                        }
                    }
                }
            } catch (HiveException e) {
                String logMsg = "Collecting stats failed.";
                LOG.error(logMsg, e);
                throw new RuntimeException(logMsg, e);
            }
        }
        if (hiveColStats != null && hiveColStats.size() == nonPartColNamesThatRqrStats.size()) {
            for (int i = 0; i < hiveColStats.size(); i++) {
                // the columns in nonPartColIndxsThatRqrStats/nonPartColNamesThatRqrStats/hiveColStats
                // are in same order
                hiveColStatsMap.put(nonPartColIndxsThatRqrStats.get(i), hiveColStats.get(i));
            }
        }
    }
    // 3. Obtain Stats for Partition Cols
    // (skipped if any non-partition column already failed, since the overall
    // stats would be unusable anyway)
    if (colNamesFailedStats.isEmpty() && !partColNamesThatRqrStats.isEmpty()) {
        ColStatistics cStats = null;
        for (int i = 0; i < partColNamesThatRqrStats.size(); i++) {
            cStats = new ColStatistics(partColNamesThatRqrStats.get(i), hivePartitionColsMap.get(partColIndxsThatRqrStats.get(i)).getTypeName());
            cStats.setCountDistint(getDistinctCount(partitionList.getPartitions(), partColNamesThatRqrStats.get(i)));
            hiveColStatsMap.put(partColIndxsThatRqrStats.get(i), cStats);
        }
    }
    // 4. Warn user if we could not get stats for required columns
    if (!colNamesFailedStats.isEmpty()) {
        String logMsg = "No Stats for " + hiveTblMetadata.getCompleteName() + ", Columns: " + getColNamesForLogging(colNamesFailedStats);
        noColsMissingStats.getAndAdd(colNamesFailedStats.size());
        if (allowNullColumnForMissingStats) {
            LOG.warn(logMsg);
            HiveConf conf = SessionState.getSessionConf();
            if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_SHOW_WARNINGS)) {
                LogHelper console = SessionState.getConsole();
                console.printInfoNoLog(logMsg);
            }
        } else {
            LOG.error(logMsg);
            throw new RuntimeException(logMsg);
        }
    }
}
Also used : HashSet(java.util.HashSet) ImmutableBitSet(org.apache.calcite.util.ImmutableBitSet) Set(java.util.Set) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) LogHelper(org.apache.hadoop.hive.ql.session.SessionState.LogHelper) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) HiveConf(org.apache.hadoop.hive.conf.HiveConf) HashMap(java.util.HashMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) HashSet(java.util.HashSet)

Aggregations

ColStatistics (org.apache.hadoop.hive.ql.plan.ColStatistics)20 ArrayList (java.util.ArrayList)6 Statistics (org.apache.hadoop.hive.ql.plan.Statistics)4 HashSet (java.util.HashSet)3 HashMap (java.util.HashMap)2 List (java.util.List)2 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)2 ImmutableBitSet (org.apache.calcite.util.ImmutableBitSet)2 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)2 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)2 ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)2 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)2 HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)2 Partition (org.apache.hadoop.hive.ql.metadata.Partition)2 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)2 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 DataOutputStream (java.io.DataOutputStream)1 BigDecimal (java.math.BigDecimal)1 BigInteger (java.math.BigInteger)1