Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class RelOptHiveTable, method updateColStats.
private void updateColStats(Set<Integer> projIndxLst, boolean allowMissingStats) {
List<String> nonPartColNamesThatRqrStats = new ArrayList<String>();
List<Integer> nonPartColIndxsThatRqrStats = new ArrayList<Integer>();
List<String> partColNamesThatRqrStats = new ArrayList<String>();
List<Integer> partColIndxsThatRqrStats = new ArrayList<Integer>();
Set<String> colNamesFailedStats = new HashSet<String>();
// 1. Separate required columns into non-partition and partition columns
ColumnInfo tmp;
for (Integer pi : projIndxLst) {
if (hiveColStatsMap.get(pi) == null) {
if ((tmp = hiveNonPartitionColsMap.get(pi)) != null) {
nonPartColNamesThatRqrStats.add(tmp.getInternalName());
nonPartColIndxsThatRqrStats.add(pi);
} else if ((tmp = hivePartitionColsMap.get(pi)) != null) {
partColNamesThatRqrStats.add(tmp.getInternalName());
partColIndxsThatRqrStats.add(pi);
} else {
noColsMissingStats.getAndIncrement();
String logMsg = "Unable to find Column Index: " + pi + ", in " + hiveTblMetadata.getCompleteName();
LOG.error(logMsg);
throw new RuntimeException(logMsg);
}
}
}
if (null == partitionList) {
// We could be here either because it's an unpartitioned table or because
// there are no pruning predicates on a partitioned table.
computePartitionList(hiveConf, null, new HashSet<Integer>());
}
String partitionListKey = partitionList.getKey().orElse(null);
ColumnStatsList colStatsCached = colStatsCache.get(partitionListKey);
if (colStatsCached == null) {
colStatsCached = new ColumnStatsList();
colStatsCache.put(partitionListKey, colStatsCached);
}
// 2. Obtain Col Stats for Non Partition Cols
if (nonPartColNamesThatRqrStats.size() > 0) {
List<ColStatistics> hiveColStats = new ArrayList<ColStatistics>();
if (!hiveTblMetadata.isPartitioned()) {
// 2.1 Handle the case for unpartitioned table.
try {
Statistics stats = StatsUtils.collectStatistics(hiveConf, null, hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats, colStatsCached, nonPartColNamesThatRqrStats, true);
rowCount = stats.getNumRows();
for (String c : nonPartColNamesThatRqrStats) {
ColStatistics cs = stats.getColumnStatisticsFromColName(c);
if (cs != null) {
hiveColStats.add(cs);
}
}
colStatsCached.updateState(stats.getColumnStatsState());
// 2.1.1 Record column names that we needed stats for but couldn't obtain
if (hiveColStats.isEmpty()) {
colNamesFailedStats.addAll(nonPartColNamesThatRqrStats);
} else if (hiveColStats.size() != nonPartColNamesThatRqrStats.size()) {
Set<String> setOfFiledCols = new HashSet<String>(nonPartColNamesThatRqrStats);
Set<String> setOfObtainedColStats = new HashSet<String>();
for (ColStatistics cs : hiveColStats) {
setOfObtainedColStats.add(cs.getColumnName());
}
setOfFiledCols.removeAll(setOfObtainedColStats);
colNamesFailedStats.addAll(setOfFiledCols);
} else {
// Column stats in hiveColStats might not be in the same order as the columns in
// nonPartColNamesThatRqrStats. reorder hiveColStats so we can build hiveColStatsMap
// using nonPartColIndxsThatRqrStats as below
Map<String, ColStatistics> columnStatsMap = new HashMap<String, ColStatistics>(hiveColStats.size());
for (ColStatistics cs : hiveColStats) {
columnStatsMap.put(cs.getColumnName(), cs);
// if the stats were only estimated, treat them as not available
if (cs.isEstimated()) {
colNamesFailedStats.add(cs.getColumnName());
}
}
hiveColStats.clear();
for (String colName : nonPartColNamesThatRqrStats) {
hiveColStats.add(columnStatsMap.get(colName));
}
}
} catch (HiveException e) {
String logMsg = "Collecting stats for table: " + hiveTblMetadata.getTableName() + " failed.";
LOG.error(logMsg, e);
throw new RuntimeException(logMsg, e);
}
} else {
// 2.2 Obtain col stats for partitioned table.
try {
if (partitionList.getNotDeniedPartns().isEmpty()) {
// no need to make a metastore call
rowCount = 0;
hiveColStats = new ArrayList<ColStatistics>();
for (int i = 0; i < nonPartColNamesThatRqrStats.size(); i++) {
// add empty stats object for each column
hiveColStats.add(new ColStatistics(nonPartColNamesThatRqrStats.get(i), hiveNonPartitionColsMap.get(nonPartColIndxsThatRqrStats.get(i)).getTypeName()));
}
colNamesFailedStats.clear();
colStatsCached.updateState(State.COMPLETE);
} else {
Statistics stats = StatsUtils.collectStatistics(hiveConf, partitionList, hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats, colStatsCached, nonPartColNamesThatRqrStats, true);
rowCount = stats.getNumRows();
hiveColStats = new ArrayList<ColStatistics>();
for (String c : nonPartColNamesThatRqrStats) {
ColStatistics cs = stats.getColumnStatisticsFromColName(c);
if (cs != null) {
hiveColStats.add(cs);
if (cs.isEstimated()) {
colNamesFailedStats.add(c);
}
} else {
colNamesFailedStats.add(c);
}
}
colStatsCached.updateState(stats.getColumnStatsState());
}
} catch (HiveException e) {
String logMsg = "Collecting stats failed.";
LOG.error(logMsg, e);
throw new RuntimeException(logMsg, e);
}
}
if (hiveColStats != null && hiveColStats.size() == nonPartColNamesThatRqrStats.size()) {
for (int i = 0; i < hiveColStats.size(); i++) {
// the columns in nonPartColIndxsThatRqrStats/nonPartColNamesThatRqrStats/hiveColStats
// are in same order
hiveColStatsMap.put(nonPartColIndxsThatRqrStats.get(i), hiveColStats.get(i));
colStatsCached.put(hiveColStats.get(i).getColumnName(), hiveColStats.get(i));
if (LOG.isDebugEnabled()) {
LOG.debug("Stats for column " + hiveColStats.get(i).getColumnName() + " in table " + hiveTblMetadata.getTableName() + " stored in cache");
LOG.debug(hiveColStats.get(i).toString());
}
}
}
}
// 3. Obtain Stats for Partition Cols
if (colNamesFailedStats.isEmpty() && !partColNamesThatRqrStats.isEmpty()) {
ColStatistics cStats = null;
for (int i = 0; i < partColNamesThatRqrStats.size(); i++) {
cStats = StatsUtils.getColStatsForPartCol(hivePartitionColsMap.get(partColIndxsThatRqrStats.get(i)), new PartitionIterable(partitionList.getNotDeniedPartns()), hiveConf);
hiveColStatsMap.put(partColIndxsThatRqrStats.get(i), cStats);
colStatsCached.put(cStats.getColumnName(), cStats);
if (LOG.isDebugEnabled()) {
LOG.debug("Stats for column " + cStats.getColumnName() + " in table " + hiveTblMetadata.getTableName() + " stored in cache");
LOG.debug(cStats.toString());
}
}
}
// 4. Warn user if we couldn't get stats for required columns
if (!colNamesFailedStats.isEmpty()) {
String logMsg = "No Stats for " + hiveTblMetadata.getCompleteName() + ", Columns: " + getColNamesForLogging(colNamesFailedStats);
noColsMissingStats.getAndAdd(colNamesFailedStats.size());
if (allowMissingStats) {
LOG.warn(logMsg);
HiveConf conf = SessionState.getSessionConf();
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_SHOW_WARNINGS)) {
LogHelper console = SessionState.getConsole();
console.printInfo(logMsg);
}
} else {
LOG.error(logMsg);
throw new RuntimeException(logMsg);
}
}
}
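The method above repeatedly pulls per-column ColStatistics out of a Statistics object by name, re-aligns them with the requested column order, and treats estimated stats as missing. Below is a minimal sketch of that lookup-and-align pattern in isolation, assuming a Statistics object has already been produced (for example by StatsUtils.collectStatistics); the helper name alignColStats and its enclosing class are hypothetical.
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.plan.ColStatistics;
import org.apache.hadoop.hive.ql.plan.Statistics;

public final class StatsLookupSketch {
  // Hypothetical helper: fetch column stats from a Statistics object in the same
  // order as the requested column names, recording names whose stats are missing
  // or merely estimated so the caller can warn or fail, as updateColStats does.
  static List<ColStatistics> alignColStats(Statistics stats, List<String> requestedCols, List<String> failedCols) {
    List<ColStatistics> aligned = new ArrayList<ColStatistics>();
    for (String col : requestedCols) {
      ColStatistics cs = stats.getColumnStatisticsFromColName(col);
      if (cs != null) {
        aligned.add(cs);
        if (cs.isEstimated()) {
          failedCols.add(col);
        }
      } else {
        failedCols.add(col);
      }
    }
    return aligned;
  }
}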
Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class SparkMapJoinOptimizer, method getMapJoinConversionInfo.
/**
 * This method returns the big table position in a map-join. If the given join
 * cannot be converted to a map-join (this could happen for several reasons, one
 * of them being the presence of 2 or more big tables that cannot fit in memory),
 * the position is -1.
 *
 * Otherwise, the position is the index of the big table in the set
 * MapJoinProcessor.bigTableCandidateSet.
 *
 * @param joinOp the join operator being considered for conversion
 * @param context the Spark optimization context
 * @return an array of 3 long values: the first value is the big table position,
 *         the second is the connected map-join size, and the third is the total
 *         data size of the small (hashed) tables.
 */
private long[] getMapJoinConversionInfo(JoinOperator joinOp, OptimizeSparkProcContext context) {
Set<Integer> bigTableCandidateSet = MapJoinProcessor.getBigTableCandidates(joinOp.getConf().getConds());
long maxSize = context.getConf().getLongVar(HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
int bigTablePosition = -1;
Statistics bigInputStat = null;
long totalSize = 0;
int pos = 0;
// bigTableFound means we've encountered a table that's bigger than the
// max. This table is either the big table or we cannot convert.
boolean bigTableFound = false;
boolean useTsStats = context.getConf().getBoolean(HiveConf.ConfVars.SPARK_USE_TS_STATS_FOR_MAPJOIN.varname, false);
// Check whether any parent branch has to be the big table branch; if so, mark that branch as the big table branch.
if (useTsStats) {
LOG.debug("Checking map join optimization for operator {} using TS stats", joinOp);
for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
if (isBigTableBranch(parentOp)) {
if (bigTablePosition < 0 && bigTableCandidateSet.contains(pos) && !containUnionWithoutRS(parentOp.getParentOperators().get(0))) {
LOG.debug("Found a big table branch with parent operator {} and position {}", parentOp, pos);
bigTablePosition = pos;
bigTableFound = true;
bigInputStat = new Statistics(0, Long.MAX_VALUE, Long.MAX_VALUE, 0);
} else {
// Either we've found multiple big table branches, or the current branch cannot
// be a big table branch. Disable mapjoin for these cases.
LOG.debug("Cannot enable map join optimization for operator {}", joinOp);
return new long[] { -1, 0, 0 };
}
}
pos++;
}
}
pos = 0;
for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
// Skip the potential big table identified above
if (pos == bigTablePosition) {
pos++;
continue;
}
Statistics currInputStat = null;
if (useTsStats) {
// Not adding other stats (e.g., # of rows, col stats) since only data size is used here
for (TableScanOperator root : OperatorUtils.findOperatorsUpstream(parentOp, TableScanOperator.class)) {
if (currInputStat == null) {
currInputStat = root.getStatistics().clone();
} else {
currInputStat.addBasicStats(root.getStatistics());
}
}
} else {
currInputStat = parentOp.getStatistics();
}
if (currInputStat == null) {
LOG.warn("Couldn't get statistics from: " + parentOp);
return new long[] { -1, 0, 0 };
}
// Converting a branch that contains a union with no reduce sink between it and the join
// would require creating a separate map-join per union branch.
// But this is tricky to implement, and we'll leave it as future work for now.
if (containUnionWithoutRS(parentOp.getParentOperators().get(0))) {
return new long[] { -1, 0, 0 };
}
long inputSize = currInputStat.getDataSize();
if (bigInputStat == null || inputSize > bigInputStat.getDataSize()) {
if (bigTableFound) {
// We've already chosen a big table based on size and there's another one that's bigger; cannot convert.
return new long[] { -1, 0, 0 };
}
if (inputSize > maxSize) {
if (!bigTableCandidateSet.contains(pos)) {
// This input is not a big table candidate, yet it's too big for the map side; cannot convert.
return new long[] { -1, 0, 0 };
}
bigTableFound = true;
}
if (bigInputStat != null) {
// we're replacing the current big table with a new one. Need
// to count the current one as a map table then.
totalSize += bigInputStat.getDataSize();
}
if (totalSize > maxSize) {
// The combined size of the small tables exceeds the threshold, hence cannot convert.
return new long[] { -1, 0, 0 };
}
if (bigTableCandidateSet.contains(pos)) {
bigTablePosition = pos;
bigInputStat = currInputStat;
}
} else {
totalSize += currInputStat.getDataSize();
if (totalSize > maxSize) {
// cannot hold all map tables in memory. Cannot convert.
return new long[] { -1, 0, 0 };
}
}
pos++;
}
if (bigTablePosition == -1) {
// No big table candidates.
return new long[] { -1, 0, 0 };
}
// Final check, find size of already-calculated Mapjoin Operators in same work (spark-stage).
// We need to factor this in to prevent overwhelming Spark executor-memory.
long connectedMapJoinSize = getConnectedMapJoinSize(joinOp.getParentOperators().get(bigTablePosition), joinOp, context);
if ((connectedMapJoinSize + totalSize) > maxSize) {
return new long[] { -1, 0, 0 };
}
return new long[] { bigTablePosition, connectedMapJoinSize, totalSize };
}
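The caller only needs the three slots of the returned array. The following is a minimal sketch of how a caller inside the same class might interpret the result, assuming only what the Javadoc above states; the helper name chooseBigTablePosition is hypothetical.
// Hypothetical helper within SparkMapJoinOptimizer: decide whether to convert,
// based solely on the array returned by getMapJoinConversionInfo.
private int chooseBigTablePosition(JoinOperator joinOp, OptimizeSparkProcContext context) {
  long[] info = getMapJoinConversionInfo(joinOp, context);
  if (info[0] < 0) {
    // Conversion is not possible (e.g. two or more oversized inputs); keep the shuffle join.
    return -1;
  }
  long connectedMapJoinSize = info[1]; // size of map-joins already placed in the same Spark work
  long smallTableTotalSize = info[2];  // combined data size of the hashed (small) inputs
  LOG.debug("Map join conversion possible; connected size {}, small tables size {}", connectedMapJoinSize, smallTableTotalSize);
  return (int) info[0];                // index of the big (streamed) table branch
}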
Use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.
The class TestSharedWorkOptimizer, method testTSCmpOrdersByDataSizeDesc.
@Test
public void testTSCmpOrdersByDataSizeDesc() {
TableScanOperator ts1 = getTsOp();
TableScanOperator ts2 = getTsOp();
TableScanOperator ts3 = getTsOp();
ts1.setStatistics(new Statistics(100, 100, 1, 1));
ts2.setStatistics(new Statistics(1000, 1000, 1, 1));
ts3.setStatistics(new Statistics(10, 10, 1, 1));
ArrayList<TableScanOperator> li1 = Lists.newArrayList(ts1, ts3, ts2);
li1.sort(new TSComparator());
assertTrue(li1.get(0).getStatistics().getDataSize() == 1000);
assertTrue(li1.get(1).getStatistics().getDataSize() == 100);
assertTrue(li1.get(2).getStatistics().getDataSize() == 10);
}
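The test expects TSComparator to order table scans by descending data size. The comparator itself is not shown here; below is a minimal sketch of a comparator with that ordering, assuming only the Statistics.getDataSize() accessor used in the test (the real TSComparator in SharedWorkOptimizer may apply additional tie-breakers).
import java.util.Comparator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;

// Sketch: order table scans so the one with the largest data size comes first.
class DataSizeDescComparator implements Comparator<TableScanOperator> {
  @Override
  public int compare(TableScanOperator a, TableScanOperator b) {
    long sizeA = a.getStatistics() == null ? 0L : a.getStatistics().getDataSize();
    long sizeB = b.getStatistics() == null ? 0L : b.getStatistics().getDataSize();
    return Long.compare(sizeB, sizeA); // descending
  }
}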