use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
the class DescTableOperation method getColumnDataForPartitionKeyColumn.
/**
 * Collects schema and statistics for a partition key column and appends them to the
 * caller-supplied output collections.
 *
 * <p>Only the first entry of {@code colNames} is used to look up the partition column.
 * The statistics are derived from the table's partitions via
 * {@code StatsUtils.getColStatsForPartCol} and converted into a metastore
 * {@code ColumnStatisticsObj}.
 *
 * @param table      table whose partition column is resolved by name
 * @param cols       output list; the resolved partition column schema is appended
 * @param colStats   output list; the statistics object built for the column is appended
 * @param colNames   column names; element 0 names the partition column
 * @param tableProps table properties handed to {@code StatsSetupConst.setColumnStatsState}
 *                   together with {@code colNames}
 */
private void getColumnDataForPartitionKeyColumn(Table table, List<FieldSchema> cols, List<ColumnStatisticsObj> colStats, List<String> colNames, Map<String, String> tableProps) throws HiveException, MetaException {
  FieldSchema partitionColumn = table.getPartColByName(colNames.get(0));
  cols.add(partitionColumn);

  PartitionIterable partitions = new PartitionIterable(
      context.getDb(),
      table,
      null,
      MetastoreConf.getIntVar(context.getConf(), MetastoreConf.ConfVars.BATCH_RETRIEVE_MAX));

  String columnName = partitionColumn.getName();
  String columnType = partitionColumn.getType();
  ColumnInfo columnInfo = new ColumnInfo(columnName, TypeInfoUtils.getTypeInfoFromTypeString(columnType), null, false);
  ColStatistics partitionStats = StatsUtils.getColStatsForPartCol(columnInfo, partitions, context.getConf());

  ColumnStatisticsData statsData = new ColumnStatisticsData();
  ColStatistics.Range range = partitionStats.getRange();
  // The range bounds are passed three times: twice as the raw values and once in
  // string form. Every bound is null when no range is available.
  StatObjectConverter.fillColumnStatisticsData(
      columnType,
      statsData,
      range != null ? range.minValue : null,
      range != null ? range.maxValue : null,
      range != null ? range.minValue : null,
      range != null ? range.maxValue : null,
      range != null ? range.minValue.toString() : null,
      range != null ? range.maxValue.toString() : null,
      partitionStats.getNumNulls(),
      partitionStats.getCountDistint(),
      null,
      // The average column length is supplied for both length arguments.
      partitionStats.getAvgColLen(),
      partitionStats.getAvgColLen(),
      partitionStats.getNumTrues(),
      partitionStats.getNumFalses());

  colStats.add(new ColumnStatisticsObj(columnName, columnType, statsData));
  StatsSetupConst.setColumnStatsState(tableProps, colNames);
}
use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
the class ConvertJoinMapJoin method estimateNDV.
/**
 * Estimates the number of distinct key combinations across the given columns.
 *
 * <p>With exactly one column, that column's distinct-value count is returned directly.
 * Otherwise the columns are treated as independent, uniformly distributed attributes:
 * attributes A1 ... Am with N1 ... Nm distinct values behave like a single attribute
 * with N1 * ... * Nm distinct values, and the expected number of distinct values when
 * choosing p values with replacement from n is n * (1 - ((n - 1) / n) ^ p).
 *
 * @param numRows     number of rows drawn (p in the formula above)
 * @param columnStats statistics of the columns forming the key
 * @return the estimated number of distinct values; capped at {@code numRows} on the
 *         multi-column path
 */
private static long estimateNDV(long numRows, List<ColStatistics> columnStats) {
  // Single column: its distinct-value count is the answer.
  if (columnStats.size() == 1) {
    return columnStats.get(0).getCountDistint();
  }

  // Size of the combined domain: product of the per-column distinct counts.
  // Columns reporting a count of 1 or less do not contribute to the product.
  long combinedDomain = 1L;
  for (ColStatistics columnStat : columnStats) {
    final long columnNdv = columnStat.getCountDistint();
    if (columnNdv <= 1) {
      continue;
    }
    combinedDomain = StatsUtils.safeMult(combinedDomain, columnNdv);
  }

  final double domainSize = combinedDomain;
  final double ratio = (domainSize - 1d) / domainSize;
  if (ratio == 1d) {
    // For a large domain the ratio is indistinguishable from 1 in double precision,
    // so the formula cannot be evaluated; fall back to the row count.
    return numRows;
  }

  final double expectedDistinct = domainSize * (1d - Math.pow(ratio, numRows));
  // Cap at the row count: floating-point artifacts can push the estimate slightly
  // above it.
  return Math.min(Math.round(expectedDistinct), numRows);
}
use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
the class ConvertJoinMapJoin method checkNumberOfEntriesForHashTable.
/**
 * Checks whether the estimated number of distinct keys of one join input stays within
 * the configured maximum number of hash table entries.
 *
 * @param joinOp   join operator whose parent at {@code position} is inspected
 * @param position index of the parent (expected to be a ReduceSinkOperator) to evaluate
 * @param context  optimizer context supplying the configuration
 * @return {@code true} if the check passes: the limit is disabled (below 1), statistics
 *         for some key column are missing, or the estimate does not exceed the limit;
 *         {@code false} otherwise
 */
private boolean checkNumberOfEntriesForHashTable(JoinOperator joinOp, int position, OptimizeTezProcContext context) {
  final long maxEntries = HiveConf.getLongVar(context.parseContext.getConf(), HiveConf.ConfVars.HIVECONVERTJOINMAXENTRIESHASHTABLE);
  if (maxEntries < 1) {
    // A limit below 1 means the check is switched off.
    return true;
  }

  // Gather the statistics of every key column of the selected input.
  final ReduceSinkOperator parentSink = (ReduceSinkOperator) joinOp.getParentOperators().get(position);
  final List<String> keyNames = StatsUtils.getQualifedReducerKeyNames(parentSink.getConf().getOutputKeyColumnNames());
  final Statistics parentStats = parentSink.getStatistics();
  final List<ColStatistics> keyStats = new ArrayList<>();
  for (String keyName : keyNames) {
    final ColStatistics keyStat = parentStats.getColumnStatisticsFromColName(keyName);
    if (keyStat == null) {
      // Without statistics for a key column no estimate is made; let the check pass.
      return true;
    }
    keyStats.add(keyStat);
  }

  final long estimatedNdv = estimateNDV(parentStats.getNumRows(), keyStats);
  LOG.debug("Estimated NDV for input {}: {}; Max NDV for MapJoin conversion: {}", position, estimatedNdv, maxEntries);

  final boolean withinLimit = estimatedNdv <= maxEntries;
  if (!withinLimit) {
    LOG.debug("Number of different entries for HashTable is greater than the max; " + "we do not convert to MapJoin");
  }
  return withinLimit;
}
use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
the class ConvertJoinMapJoin method hashTableDataSizeAdjustment.
/**
 * In data calculation logic, we include some overhead due to java object refs, etc.
 * However, this overhead may be different when storing values in hashtable for mapjoin.
 * Hence, we calculate a size adjustment to the original data size for a given input.
 *
 * <p>For every non-null column statistic a per-value overhead is chosen from the column
 * type (string/varchar/char, binary, timestamp/timestamp with local time zone/decimal/date;
 * zero for every other type) and multiplied by the column's non-null value count. The
 * per-column products are summed using the overflow-safe helpers in {@code StatsUtils}.
 *
 * @param numRows  number of rows of the input
 * @param colStats column statistics of the input; may be {@code null} or contain
 *                 {@code null} entries, which are skipped
 * @return the accumulated adjustment, or 0 when {@code numRows} is not positive or
 *         {@code colStats} is {@code null} or empty
 */
private static long hashTableDataSizeAdjustment(long numRows, List<ColStatistics> colStats) {
  long result = 0;
  if (numRows <= 0 || colStats == null || colStats.isEmpty()) {
    return result;
  }
  for (ColStatistics cs : colStats) {
    if (cs == null) {
      continue;
    }
    // Fold case with a fixed locale: the default-locale variant is locale-sensitive
    // (e.g. 'I' lower-cases to a dotless i under a Turkish locale), which would make
    // the type-name comparisons below silently fail.
    String colTypeLowerCase = cs.getColumnType().toLowerCase(java.util.Locale.ROOT);
    long numNulls = cs.getNumNulls();
    // NOTE(review): the "+ 1" presumably keeps one entry for the null values - confirm.
    long nonNullCount = numNulls > 0 ? numRows - numNulls + 1 : numRows;
    double overhead = 0;
    if (colTypeLowerCase.equals(serdeConstants.STRING_TYPE_NAME)
        || colTypeLowerCase.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
        || colTypeLowerCase.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
      overhead = JavaDataModel.get().lengthForStringOfLength(0);
    } else if (colTypeLowerCase.equals(serdeConstants.BINARY_TYPE_NAME)) {
      overhead = JavaDataModel.get().lengthForByteArrayOfSize(0);
    } else if (colTypeLowerCase.equals(serdeConstants.TIMESTAMP_TYPE_NAME)
        || colTypeLowerCase.equals(serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME)
        || colTypeLowerCase.startsWith(serdeConstants.DECIMAL_TYPE_NAME)
        || colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) {
      overhead = JavaDataModel.get().object();
    }
    result = StatsUtils.safeAdd(StatsUtils.safeMult(nonNullCount, overhead), result);
  }
  return result;
}
use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
the class HiveRelMdSize method averageColumnSizes.
// ~ Methods ----------------------------------------------------------------
/**
 * Returns the average size of each field of a Hive table scan.
 *
 * <p>The result has one entry per column index visited. Columns that are not among
 * the scan's needed columns get 0. Needed non-virtual columns use the average column
 * length from their column statistics, falling back to a type-based default when the
 * statistics entry is {@code null}. Needed columns at or beyond the non-virtual column
 * count always use the type-based default.
 *
 * @param scan table scan whose columns are sized
 * @param mq   metadata query (not used by this method)
 * @return immutable list of average column sizes
 */
public List<Double> averageColumnSizes(HiveTableScan scan, RelMetadataQuery mq) {
  final List<Integer> neededColumns = scan.getNeededColIndxsFrmReloptHT();
  final RelOptHiveTable hiveTable = (RelOptHiveTable) scan.getTable();
  final List<ColStatistics> neededStats = hiveTable.getColStat(neededColumns, true);

  final ImmutableList.Builder<Double> sizes = ImmutableList.builder();
  final int nonVirtualCount = hiveTable.getNoOfNonVirtualCols();
  final int fieldCount = scan.getRowType().getFieldCount();

  // Non-virtual columns: statistics are consumed in order, one per needed column.
  int nextStat = 0;
  for (int col = 0; col < nonVirtualCount; col++) {
    if (!neededColumns.contains(col)) {
      sizes.add(Double.valueOf(0));
      continue;
    }
    final ColStatistics stat = neededStats.get(nextStat);
    nextStat++;
    if (stat != null) {
      sizes.add(stat.getAvgColLen());
    } else {
      // No statistics available: use the default size for the field's type.
      final RelDataTypeField field = scan.getRowType().getFieldList().get(col);
      sizes.add(averageTypeValueSize(field.getType()));
    }
  }

  // Remaining columns: no statistics are looked up, only type-based defaults.
  for (int col = nonVirtualCount; col < fieldCount; col++) {
    if (!neededColumns.contains(col)) {
      sizes.add(Double.valueOf(0));
      continue;
    }
    final RelDataTypeField field = scan.getRowType().getFieldList().get(col);
    sizes.add(averageTypeValueSize(field.getType()));
  }
  return sizes.build();
}
Aggregations