Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class StatsUtils, method scaleColStatistics.
public static void scaleColStatistics(List<ColStatistics> colStats, double factor) {
  for (ColStatistics cs : colStats) {
    cs.setNumFalses(StatsUtils.safeMult(cs.getNumFalses(), factor));
    cs.setNumTrues(StatsUtils.safeMult(cs.getNumTrues(), factor));
    cs.setNumNulls(StatsUtils.safeMult(cs.getNumNulls(), factor));
    if (factor < 1.0) {
      final double newNDV = Math.ceil(cs.getCountDistint() * factor);
      cs.setCountDistint(newNDV > Long.MAX_VALUE ? Long.MAX_VALUE : (long) newNDV);
    }
  }
}
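A minimal standalone sketch of the same scaling arithmetic, not a call into the Hive API: safeMultiply below is a hypothetical stand-in for StatsUtils.safeMult, and the input numbers are made up.

// Standalone sketch (not the Hive API). safeMultiply is a hypothetical
// stand-in for StatsUtils.safeMult, which saturates instead of overflowing.
public class ScaleColStatsSketch {
  static long safeMultiply(long a, double b) {
    double r = a * b;
    return r > Long.MAX_VALUE ? Long.MAX_VALUE : (long) r;
  }

  public static void main(String[] args) {
    long numNulls = 1_000L;
    long ndv = 500L;
    double factor = 0.25; // e.g. a filter expected to keep ~25% of the rows

    long scaledNulls = safeMultiply(numNulls, factor); // 250
    // NDV is only reduced when factor < 1: adding rows cannot create new
    // distinct values, but filtering rows can remove some of them.
    long scaledNdv = factor < 1.0
        ? (long) Math.min(Math.ceil(ndv * factor), (double) Long.MAX_VALUE)
        : ndv;

    System.out.println(scaledNulls + " " + scaledNdv); // prints: 250 125
  }
}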
Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class StatsUtils, method extractNDVGroupingColumns.
private static List<Long> extractNDVGroupingColumns(List<ColStatistics> colStats, Statistics parentStats) {
  List<Long> ndvValues = new ArrayList<>(colStats.size());
  // compute product of distinct values of grouping columns
  for (ColStatistics cs : colStats) {
    if (cs != null) {
      long ndv = cs.getCountDistint();
      if (cs.getNumNulls() > 0) {
        ndv = StatsUtils.safeAdd(ndv, 1);
      }
      ndvValues.add(ndv);
    } else {
      if (parentStats.getColumnStatsState().equals(Statistics.State.COMPLETE)) {
        // column statistics are COMPLETE, so this missing column does not
        // need to be accounted for when computing the product of NDVs
        continue;
      } else {
        // partial column statistics on grouping attributes case.
        // if column statistics on grouping attribute is missing, then
        // assume worst case.
        ndvValues = null;
      }
      break;
    }
  }
  return ndvValues;
}
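A short sketch of how a caller might combine the returned per-column NDVs. The product-with-cap step below is an assumption based on the method's comment, not a quote of the actual Hive caller; the saturating multiply only mirrors the intent of StatsUtils.safeMult.

// Sketch: combining the per-column NDVs returned above into one
// grouping-key NDV estimate (assumed combination step, not the Hive caller).
static long combineGroupingNdvs(java.util.List<Long> ndvValues, long numRows) {
  if (ndvValues == null) {
    // Missing stats on some grouping column: assume the worst case,
    // i.e. every row forms its own group.
    return numRows;
  }
  long product = 1L;
  for (long ndv : ndvValues) {
    if (ndv > 0 && product > Long.MAX_VALUE / ndv) {
      return numRows; // product would overflow; more groups than rows is impossible anyway
    }
    product *= ndv;
  }
  return Math.min(product, numRows);
}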
Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class ConvertJoinMapJoin, method hashTableDataSizeAdjustment.
/**
 * In data calculation logic, we include some overhead due to Java object refs, etc.
 * However, this overhead may be different when storing values in hashtable for mapjoin.
 * Hence, we calculate a size adjustment to the original data size for a given input.
 */
private static long hashTableDataSizeAdjustment(long numRows, List<ColStatistics> colStats) {
  long result = 0;
  if (numRows <= 0 || colStats == null || colStats.isEmpty()) {
    return result;
  }
  for (ColStatistics cs : colStats) {
    if (cs != null) {
      String colTypeLowerCase = cs.getColumnType().toLowerCase();
      long nonNullCount = cs.getNumNulls() > 0 ? numRows - cs.getNumNulls() + 1 : numRows;
      double overhead = 0;
      if (colTypeLowerCase.equals(serdeConstants.STRING_TYPE_NAME)
          || colTypeLowerCase.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
          || colTypeLowerCase.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
        overhead = JavaDataModel.get().lengthForStringOfLength(0);
      } else if (colTypeLowerCase.equals(serdeConstants.BINARY_TYPE_NAME)) {
        overhead = JavaDataModel.get().lengthForByteArrayOfSize(0);
      } else if (colTypeLowerCase.equals(serdeConstants.TIMESTAMP_TYPE_NAME)
          || colTypeLowerCase.equals(serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME)
          || colTypeLowerCase.startsWith(serdeConstants.DECIMAL_TYPE_NAME)
          || colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) {
        overhead = JavaDataModel.get().object();
      }
      result = StatsUtils.safeAdd(StatsUtils.safeMult(nonNullCount, overhead), result);
    }
  }
  return result;
}
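A standalone sketch of the adjustment arithmetic, using placeholder per-type overheads rather than the real JavaDataModel constants, and a made-up two-column key.

// Standalone sketch of the adjustment above. The overheads here are
// placeholders, not the real JavaDataModel constants (in Hive they come
// from lengthForStringOfLength(0), lengthForByteArrayOfSize(0) and object()).
static long hashTableAdjustmentSketch(long numRows) {
  String[] keyTypes = { "string", "int" }; // hypothetical two-column join key
  long[] numNulls   = { 100L, 0L };

  long result = 0;
  for (int i = 0; i < keyTypes.length; i++) {
    long nonNullCount = numNulls[i] > 0 ? numRows - numNulls[i] + 1 : numRows;
    double overhead;
    switch (keyTypes[i]) {
      case "string": overhead = 64d; break; // placeholder per-value overhead
      case "binary": overhead = 48d; break; // placeholder per-value overhead
      default:       overhead = 0d;         // other types fall through with no adjustment in this sketch
    }
    result += (long) (nonNullCount * overhead);
  }
  return result; // total adjustment applied to the original data-size estimate
}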
Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class ConvertJoinMapJoin, method checkNumberOfEntriesForHashTable.
/* Returns true if it passes the test, false otherwise. */
private boolean checkNumberOfEntriesForHashTable(JoinOperator joinOp, int position, OptimizeTezProcContext context) {
  long max = HiveConf.getLongVar(context.parseContext.getConf(),
      HiveConf.ConfVars.HIVECONVERTJOINMAXENTRIESHASHTABLE);
  if (max < 1) {
    // Max is disabled, we can safely return true
    return true;
  }
  // Calculate number of different entries and evaluate
  ReduceSinkOperator rsOp = (ReduceSinkOperator) joinOp.getParentOperators().get(position);
  List<String> keys = StatsUtils.getQualifedReducerKeyNames(rsOp.getConf().getOutputKeyColumnNames());
  Statistics inputStats = rsOp.getStatistics();
  List<ColStatistics> columnStats = new ArrayList<>();
  for (String key : keys) {
    ColStatistics cs = inputStats.getColumnStatisticsFromColName(key);
    if (cs == null) {
      return true;
    }
    columnStats.add(cs);
  }
  long numRows = inputStats.getNumRows();
  long estimation = estimateNDV(numRows, columnStats);
  LOG.debug("Estimated NDV for input {}: {}; Max NDV for MapJoin conversion: {}",
      position, estimation, max);
  if (estimation > max) {
    // Estimation larger than max
    LOG.debug("Number of different entries for HashTable is greater than the max; "
        + "we do not convert to MapJoin");
    return false;
  }
  // We can proceed with the conversion
  return true;
}
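The threshold behind this check is HiveConf.ConfVars.HIVECONVERTJOINMAXENTRIESHASHTABLE, as read in the first line of the method. A hedged sketch of how it could be tuned or disabled; the "below 1 disables it" behaviour follows the max < 1 branch above, and the example values are arbitrary.

import org.apache.hadoop.hive.conf.HiveConf;

// Sketch: tuning the threshold behind checkNumberOfEntriesForHashTable.
// The ConfVars constant is the one used above; the values are arbitrary.
public class MapJoinEntriesConfigSketch {
  public static void main(String[] args) {
    HiveConf conf = new HiveConf();

    // Any value below 1 disables the check (the method returns true early).
    conf.setLongVar(HiveConf.ConfVars.HIVECONVERTJOINMAXENTRIESHASHTABLE, -1L);

    // Or cap map-join conversion at an estimated 10M distinct hash-table keys.
    conf.setLongVar(HiveConf.ConfVars.HIVECONVERTJOINMAXENTRIESHASHTABLE, 10_000_000L);

    System.out.println(conf.getLongVar(HiveConf.ConfVars.HIVECONVERTJOINMAXENTRIESHASHTABLE));
  }
}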
Use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
The class ConvertJoinMapJoin, method estimateNDV.
private static long estimateNDV(long numRows, List<ColStatistics> columnStats) {
  // If there is a single column, return the number of distinct values
  if (columnStats.size() == 1) {
    return columnStats.get(0).getCountDistint();
  }
  // The expected number of distinct values when choosing p values
  // with replacement from n integers is n * (1 - ((n - 1) / n) ^ p).
  //
  // If we have several uniformly distributed attributes A1 ... Am
  // with N1 ... Nm distinct values, they behave as one uniformly
  // distributed attribute with N1 * ... * Nm distinct values.
  long n = 1L;
  for (ColStatistics cs : columnStats) {
    final long ndv = cs.getCountDistint();
    if (ndv > 1) {
      n = StatsUtils.safeMult(n, ndv);
    }
  }
  final double nn = n;
  final double a = (nn - 1d) / nn;
  if (a == 1d) {
    // a rounds to 1 when nn is very large, so the formula would yield 0;
    // fall back to the row count.
    return numRows;
  }
  final double v = nn * (1d - Math.pow(a, numRows));
  // Cap at numRows, since numerical artifacts can cause the estimate
  // to go a few % over.
  return Math.min(Math.round(v), numRows);
}
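A standalone worked example of the formula from the comments, n * (1 - ((n - 1) / n) ^ p), using made-up NDVs and row count.

// Worked example of the expected-distinct-values formula above.
// The column NDVs and row count are made-up numbers.
public class EstimateNdvSketch {
  public static void main(String[] args) {
    long[] columnNdvs = { 1_000L, 50L }; // two join-key columns
    long numRows = 20_000L;              // rows on the map-join input

    // The combined key behaves like one attribute with the product of the NDVs.
    long n = 1L;
    for (long ndv : columnNdvs) {
      if (ndv > 1) {
        n *= ndv;                        // StatsUtils.safeMult would saturate here
      }
    }                                    // n = 50,000 possible key values

    double nn = n;
    double a = (nn - 1d) / nn;
    double v = nn * (1d - Math.pow(a, numRows));
    long estimate = Math.min(Math.round(v), numRows);

    System.out.println(estimate);        // ~16,484 expected hash-table entries
  }
}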