Use of org.apache.hadoop.hive.ql.parse.PrunedPartitionList in project hive by apache.
The class AbstractSMBJoinProc, method isEligibleForBucketSortMergeJoin.
/**
 * Whether this table is eligible for a sort-merge join.
 *
 * @param smbJoinContext the sort-bucket join processing context
 * @param keys the join key expressions for this table
 * @param aliasToOpInfo map from table alias to the top operator for that alias
 * @param aliases the table aliases in the join tree
 * @param pos position of the table being checked
 * @param sortColumnsFirstTable the names and order of the sorted columns for the first table;
 *        it is empty when pos = 0 and is populated by this call for that position
 * @return true if the table at the given position can participate in a sort-merge join
 * @throws SemanticException
 */
private boolean isEligibleForBucketSortMergeJoin(
    SortBucketJoinProcCtx smbJoinContext,
    List<ExprNodeDesc> keys,
    Map<String, Operator<? extends OperatorDesc>> aliasToOpInfo,
    String[] aliases,
    int pos,
    List<Order> sortColumnsFirstTable) throws SemanticException {
  String alias = aliases[pos];
  /*
   * Consider a query like:
   *
   * select -- mapjoin(subq1) -- * from
   *   (select a.key, a.value from tbl1 a) subq1
   * join
   *   (select a.key, a.value from tbl2 a) subq2
   * on subq1.key = subq2.key;
   *
   * aliasToOpInfo contains the SelectOperator for subq1 and subq2.
   * We need to traverse the tree (using TableAccessAnalyzer) to get to the base
   * table. If the object being map-joined is a base table, then aliasToOpInfo
   * contains the TableScanOperator, and TableAccessAnalyzer is a no-op.
   */
  Operator<? extends OperatorDesc> topOp = aliasToOpInfo.get(alias);
  if (topOp == null) {
    return false;
  }
  // get all join columns from join keys
  List<String> joinCols = toColumns(keys);
  if (joinCols == null || joinCols.isEmpty()) {
    return false;
  }
  TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, joinCols);
  if (tso == null) {
    return false;
  }
  /*
   * Consider a query like:
   *
   * select count(*) from
   * (
   *   select key, count(*) from
   *   (
   *     select --mapjoin(a)-- a.key as key, a.value as val1, b.value as val2
   *     from tbl1 a join tbl2 b on a.key = b.key
   *   ) subq1
   *   group by key
   * ) subq2;
   *
   * The table alias should be subq2:subq1:a which needs to be fetched from topOps.
   */
  if (pGraphContext.getTopOps().containsValue(tso)) {
    for (Map.Entry<String, TableScanOperator> topOpEntry : this.pGraphContext.getTopOps().entrySet()) {
      if (topOpEntry.getValue() == tso) {
        alias = topOpEntry.getKey();
        aliases[pos] = alias;
        break;
      }
    }
  } else {
    // Ideally, this should never happen, and this should be an assert.
    return false;
  }
  Table tbl = tso.getConf().getTableMetadata();
  if (tbl.isPartitioned()) {
    PrunedPartitionList prunedParts = pGraphContext.getPrunedPartitions(alias, tso);
    List<Partition> partitions = prunedParts.getNotDeniedPartns();
    // first table
    if ((pos == 0) && (partitions != null) && (!partitions.isEmpty())) {
      Partition firstPartition = partitions.get(0);
      sortColumnsFirstTable.addAll(firstPartition.getSortCols());
    }
    for (Partition partition : prunedParts.getNotDeniedPartns()) {
      if (!checkSortColsAndJoinCols(partition.getSortCols(), joinCols, sortColumnsFirstTable)) {
        return false;
      }
    }
    return true;
  }
  // Populate the names and order of columns for the first table
  if (pos == 0) {
    sortColumnsFirstTable.addAll(tbl.getSortCols());
  }
  return checkSortColsAndJoinCols(tbl.getSortCols(), joinCols, sortColumnsFirstTable);
}
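
The eligibility test ultimately reduces to comparing each table's sort specification against the join keys: checkSortColsAndJoinCols (whose body is not shown above) must accept every pruned partition, or the table itself, for the alias. As a rough illustration of the kind of prefix check a sort-merge join needs, consider the sketch below; the method name joinColsArePrefixOfSortCols and its exact semantics are assumptions for this illustration, not the actual Hive implementation.

import java.util.List;
import org.apache.hadoop.hive.metastore.api.Order;

public class SortColumnCheckSketch {

  /**
   * Hypothetical sketch: the join columns must form a prefix of the table's
   * sort columns, and each column's sort direction must agree with the
   * specification recorded for the first table. This illustrates what a
   * sort-merge join needs; it is not the actual checkSortColsAndJoinCols body.
   */
  static boolean joinColsArePrefixOfSortCols(List<Order> sortCols, List<String> joinCols,
      List<Order> sortColumnsFirstTable) {
    if (sortCols == null || sortCols.size() < joinCols.size()) {
      return false;
    }
    for (int i = 0; i < joinCols.size(); i++) {
      Order order = sortCols.get(i);
      // the i-th join column must be the i-th sort column
      if (!order.getCol().equals(joinCols.get(i))) {
        return false;
      }
      // the sort direction must match the first table's specification
      if (i < sortColumnsFirstTable.size()
          && order.getOrder() != sortColumnsFirstTable.get(i).getOrder()) {
        return false;
      }
    }
    return true;
  }
}

In practice Hive also has to reconcile the order of the join keys with the order in which those columns appear in each table's sort specification; the sketch only captures the basic prefix-and-direction requirement.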
Use of org.apache.hadoop.hive.ql.parse.PrunedPartitionList in project hive by apache.
The class ListBucketingPruner, method transform.
/*
 * (non-Javadoc)
 *
 * @see org.apache.hadoop.hive.ql.optimizer.Transform#transform(org.apache.hadoop.hive.ql.parse.ParseContext)
 */
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
  // create the context for walking operators
  NodeProcessorCtx opPartWalkerCtx = new LBOpPartitionWalkerCtx(pctx);
  // Retrieve all partitions generated by the partition pruner and the partition column pruner
  PrunerUtils.walkOperatorTree(pctx, opPartWalkerCtx, LBPartitionProcFactory.getFilterProc(), LBPartitionProcFactory.getDefaultProc());
  PrunedPartitionList partsList = ((LBOpPartitionWalkerCtx) opPartWalkerCtx).getPartitions();
  if (partsList != null) {
    Set<Partition> parts = partsList.getPartitions();
    if ((parts != null) && (parts.size() > 0)) {
      for (Partition part : parts) {
        // only process partitions that are skewed and list bucketed
        if (ListBucketingPrunerUtils.isListBucketingPart(part)) {
          // create the context for walking operators
          NodeProcessorCtx opWalkerCtx = new LBOpWalkerCtx(pctx.getOpToPartToSkewedPruner(), part);
          // walk the operator tree to create the expression tree for list bucketing
          PrunerUtils.walkOperatorTree(pctx, opWalkerCtx, LBProcFactory.getFilterProc(), LBProcFactory.getDefaultProc());
        }
      }
    }
  }
  return pctx;
}
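
The second walk is only triggered for partitions that ListBucketingPrunerUtils.isListBucketingPart accepts. Conceptually, a list-bucketed partition is one that declares skewed columns and stores each skewed value in its own sub-directory. A hedged sketch of such a test, an illustration of the idea rather than the real ListBucketingPrunerUtils code, might look like this:

import org.apache.hadoop.hive.metastore.api.SkewedInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.ql.metadata.Partition;

public class ListBucketingCheckSketch {

  /**
   * Illustrative only: a partition is treated as list bucketed when it
   * declares skewed columns and stores its data in per-value sub-directories.
   * The real test lives in ListBucketingPrunerUtils.isListBucketingPart.
   */
  static boolean looksListBucketed(Partition part) {
    StorageDescriptor sd = part.getTPartition().getSd();
    SkewedInfo skewedInfo = sd.getSkewedInfo();
    boolean hasSkewedCols = skewedInfo != null
        && skewedInfo.getSkewedColNames() != null
        && !skewedInfo.getSkewedColNames().isEmpty();
    return hasSkewedCols && sd.isStoredAsSubDirectories();
  }
}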
Use of org.apache.hadoop.hive.ql.parse.PrunedPartitionList in project hive by apache.
The class StatsUtils, method collectStatistics.
public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList,
    Table table, List<ColumnInfo> schema, List<String> neededColumns,
    List<String> referencedColumns, boolean fetchColStats, boolean fetchPartStats)
    throws HiveException {
  Statistics stats = new Statistics();
  float deserFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
  if (!table.isPartitioned()) {
    long ds = getDataSize(conf, table);
    long nr = getNumRows(conf, schema, neededColumns, table, ds);
    stats.setNumRows(nr);
    List<ColStatistics> colStats = Lists.newArrayList();
    if (fetchColStats) {
      colStats = getTableColumnStats(table, schema, neededColumns);
      long betterDS = getDataSizeFromColumnStats(nr, colStats);
      ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
    }
    stats.setDataSize(ds);
    // infer whether any column can be a primary key based on column statistics
    inferAndSetPrimaryKey(stats.getNumRows(), colStats);
    stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
    stats.addToColumnStats(colStats);
  } else if (partList != null) {
    // For partitioned tables, get the size of all the partitions after pruning
    // the partitions that are not required
    long nr = 0;
    long ds = 0;
    List<Long> rowCounts = Lists.newArrayList();
    List<Long> dataSizes = Lists.newArrayList();
    if (fetchPartStats) {
      rowCounts = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
      dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
      nr = getSumIgnoreNegatives(rowCounts);
      ds = getSumIgnoreNegatives(dataSizes);
      if (ds <= 0) {
        dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
        ds = getSumIgnoreNegatives(dataSizes);
      }
    }
    // if the metastore did not provide a usable data size, fall back to the file sizes on disk
    if (ds <= 0) {
      dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
    }
    ds = getSumIgnoreNegatives(dataSizes);
    ds = (long) (ds * deserFactor);
    int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
    if (avgRowSize > 0) {
      setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
      nr = getSumIgnoreNegatives(rowCounts);
      ds = getSumIgnoreNegatives(dataSizes);
      // a row count of -1 means the statistics from the metastore are not reliable
      if (nr <= 0) {
        nr = ds / avgRowSize;
      }
    }
    if (nr == 0) {
      nr = 1;
    }
    stats.addToNumRows(nr);
    stats.addToDataSize(ds);
    // if at least one partition does not have a row count, mark the basic stats state as PARTIAL
    if (containsNonPositives(rowCounts) && stats.getBasicStatsState().equals(State.COMPLETE)) {
      stats.setBasicStatsState(State.PARTIAL);
    }
    if (fetchColStats) {
      List<String> partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
      for (Partition part : partList.getNotDeniedPartns()) {
        partNames.add(part.getName());
      }
      neededColumns = processNeededColumns(schema, neededColumns);
      AggrStats aggrStats = null;
      // only connect to the metastore when there are columns and partitions to fetch stats for
      if (neededColumns.size() > 0 && partNames.size() > 0) {
        aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColumns, partNames);
      }
      if (null == aggrStats || null == aggrStats.getColStats() || aggrStats.getColStatsSize() == 0) {
        // There are some partitions with no state (or we didn't fetch any state).
        // Update the stats with an empty list to reflect that in the
        // state/initialize structures.
        List<ColStatistics> emptyStats = Lists.newArrayList();
        // add partition column stats
        addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList, emptyStats);
        stats.addToColumnStats(emptyStats);
        stats.addToDataSize(getDataSizeFromColumnStats(nr, emptyStats));
        stats.updateColumnStatsState(deriveStatType(emptyStats, referencedColumns));
      } else {
        List<ColumnStatisticsObj> colStats = aggrStats.getColStats();
        if (colStats.size() != neededColumns.size()) {
          LOG.debug("Column stats requested for : " + neededColumns.size() + " columns. Able to" + " retrieve for " + colStats.size() + " columns");
        }
        List<ColStatistics> columnStats = convertColStats(colStats, table.getTableName());
        addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList, columnStats);
        long betterDS = getDataSizeFromColumnStats(nr, columnStats);
        stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
        // infer whether any column can be a primary key based on column statistics
        inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
        stats.addToColumnStats(columnStats);
        State colState = deriveStatType(columnStats, referencedColumns);
        if (aggrStats.getPartsFound() != partNames.size() && colState != State.NONE) {
          LOG.debug("Column stats requested for : " + partNames.size() + " partitions. " + "Able to retrieve for " + aggrStats.getPartsFound() + " partitions");
          colState = State.PARTIAL;
        }
        stats.setColumnStatsState(colState);
      }
    }
  }
  return stats;
}
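
Several helpers used above, such as getSumIgnoreNegatives, aggregate per-partition basic stats while treating negative values as "unknown" (as the comment notes, a row count of -1 signals unreliable metastore statistics), so a missing stat never drags the total down. A minimal sketch of that idea, written here as an illustration rather than the actual StatsUtils code, could be:

import java.util.Arrays;
import java.util.List;

public class StatsSumSketch {

  /**
   * Illustrative sketch: sum per-partition values, skipping negative entries,
   * which the metastore uses to signal a missing or unreliable statistic.
   */
  static long sumIgnoreNegatives(List<Long> values) {
    long total = 0;
    for (Long value : values) {
      if (value != null && value >= 0) {
        total += value;
      }
    }
    return total;
  }

  public static void main(String[] args) {
    // -1 marks a partition with an unknown row count; it does not reduce the sum
    System.out.println(sumIgnoreNegatives(Arrays.asList(100L, -1L, 250L))); // prints 350
  }
}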