Example 16 with PrunedPartitionList

use of org.apache.hadoop.hive.ql.parse.PrunedPartitionList in project hive by apache.

the class AbstractSMBJoinProc method isEligibleForBucketSortMergeJoin.

/**
   * Whether the table at the given position is eligible for a sort-merge join.
   *
   * @param smbJoinContext        the sort-merge-bucket join processor context
   * @param keys                  the join key expressions for the table being checked
   * @param aliasToOpInfo         map from table alias to its top operator
   * @param aliases               the table aliases participating in the join
   * @param pos                   position of the table being checked
   * @param sortColumnsFirstTable the names and order of the sorted columns for the first table;
   *                              not yet populated when pos = 0
   * @return true if the table at the given position can take part in a sort-merge join
   * @throws SemanticException
   */
private boolean isEligibleForBucketSortMergeJoin(SortBucketJoinProcCtx smbJoinContext,
        List<ExprNodeDesc> keys,
        Map<String, Operator<? extends OperatorDesc>> aliasToOpInfo,
        String[] aliases,
        int pos,
        List<Order> sortColumnsFirstTable) throws SemanticException {
    String alias = aliases[pos];
    /*
     * Consider a query like:
     *
     * select -- mapjoin(subq1) --  * from
     * (select a.key, a.value from tbl1 a) subq1
     *   join
     * (select a.key, a.value from tbl2 a) subq2
     * on subq1.key = subq2.key;
     *
     * aliasToOpInfo contains the SelectOperator for subq1 and subq2.
     * We need to traverse the tree (using TableAccessAnalyzer) to get to the base
     * table. If the object being map-joined is a base table, then aliasToOpInfo
     * contains the TableScanOperator, and TableAccessAnalyzer is a no-op.
     */
    Operator<? extends OperatorDesc> topOp = aliasToOpInfo.get(alias);
    if (topOp == null) {
        return false;
    }
    // get all join columns from join keys
    List<String> joinCols = toColumns(keys);
    if (joinCols == null || joinCols.isEmpty()) {
        return false;
    }
    TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, joinCols);
    if (tso == null) {
        return false;
    }
    /*
     * Consider a query like:
     *
     * select count(*) from
     *   (
     *     select key, count(*) from
     *       (
     *         select --mapjoin(a)-- a.key as key, a.value as val1, b.value as val2
     *         from tbl1 a join tbl2 b on a.key = b.key
     *       ) subq1
     *     group by key
     *   ) subq2;
     *
     * The table alias should be subq2:subq1:a which needs to be fetched from topOps.
     */
    if (pGraphContext.getTopOps().containsValue(tso)) {
        for (Map.Entry<String, TableScanOperator> topOpEntry : this.pGraphContext.getTopOps().entrySet()) {
            if (topOpEntry.getValue() == tso) {
                alias = topOpEntry.getKey();
                aliases[pos] = alias;
                break;
            }
        }
    } else {
        // Ideally, this should never happen, and this should be an assert.
        return false;
    }
    Table tbl = tso.getConf().getTableMetadata();
    if (tbl.isPartitioned()) {
        PrunedPartitionList prunedParts = pGraphContext.getPrunedPartitions(alias, tso);
        List<Partition> partitions = prunedParts.getNotDeniedPartns();
        // first table
        if ((pos == 0) && (partitions != null) && (!partitions.isEmpty())) {
            Partition firstPartition = partitions.get(0);
            sortColumnsFirstTable.addAll(firstPartition.getSortCols());
        }
        for (Partition partition : prunedParts.getNotDeniedPartns()) {
            if (!checkSortColsAndJoinCols(partition.getSortCols(), joinCols, sortColumnsFirstTable)) {
                return false;
            }
        }
        return true;
    }
    // Populate the names and order of columns for the first table
    if (pos == 0) {
        sortColumnsFirstTable.addAll(tbl.getSortCols());
    }
    return checkSortColsAndJoinCols(tbl.getSortCols(), joinCols, sortColumnsFirstTable);
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) HashMap(java.util.HashMap) Map(java.util.Map)
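The helper checkSortColsAndJoinCols called above is not shown on this page. The sketch below illustrates the kind of prefix check such a helper performs: the join keys must cover the leading sort columns, and the sort direction must agree with the first table. The method name joinColsMatchSortCols and the exact matching rules are assumptions for illustration, not the actual Hive implementation; Order is org.apache.hadoop.hive.metastore.api.Order and List is java.util.List.

static boolean joinColsMatchSortCols(List<Order> sortCols, List<String> joinCols, List<Order> sortColsFirstTable) {
    if (sortCols == null || sortCols.size() < joinCols.size()) {
        // not enough sorted columns to cover all join keys
        return false;
    }
    for (int i = 0; i < joinCols.size(); i++) {
        Order sortCol = sortCols.get(i);
        if (!sortCol.getCol().equals(joinCols.get(i))) {
            // join keys are not a prefix of the table's sort specification
            return false;
        }
        if (!sortColsFirstTable.isEmpty() && sortCol.getOrder() != sortColsFirstTable.get(i).getOrder()) {
            // ascending/descending order differs from the first table in the join
            return false;
        }
    }
    return true;
}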

Example 17 with PrunedPartitionList

use of org.apache.hadoop.hive.ql.parse.PrunedPartitionList in project hive by apache.

the class ListBucketingPruner method transform.

/*
   * (non-Javadoc)
   *
   * @see org.apache.hadoop.hive.ql.optimizer.Transform#transform(org.apache.hadoop.hive.ql.parse.
   * ParseContext)
   */
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
    // create the context for walking operators
    NodeProcessorCtx opPartWalkerCtx = new LBOpPartitionWalkerCtx(pctx);
    // Retrieve all partitions generated from partition pruner and partition column pruner
    PrunerUtils.walkOperatorTree(pctx, opPartWalkerCtx, LBPartitionProcFactory.getFilterProc(), LBPartitionProcFactory.getDefaultProc());
    PrunedPartitionList partsList = ((LBOpPartitionWalkerCtx) opPartWalkerCtx).getPartitions();
    if (partsList != null) {
        Set<Partition> parts = partsList.getPartitions();
        if ((parts != null) && (parts.size() > 0)) {
            for (Partition part : parts) {
                // only process partitions that are skewed and list bucketed
                if (ListBucketingPrunerUtils.isListBucketingPart(part)) {
                    // create the context for walking operators
                    NodeProcessorCtx opWalkerCtx = new LBOpWalkerCtx(pctx.getOpToPartToSkewedPruner(), part);
                    // walk operator tree to create expression tree for list bucketing
                    PrunerUtils.walkOperatorTree(pctx, opWalkerCtx, LBProcFactory.getFilterProc(), LBProcFactory.getDefaultProc());
                }
            }
        }
    }
    return pctx;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) NodeProcessorCtx(org.apache.hadoop.hive.ql.lib.NodeProcessorCtx) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList)
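PrunerUtils.walkOperatorTree dispatches the supplied processors over the operator DAG; the filter processor fires on FilterOperator nodes and records its findings on the walker context. The actual processors built by LBProcFactory and LBPartitionProcFactory are not shown here; the class ExampleLBFilterProc below is a hypothetical sketch of the general shape of such a processor (NodeProcessor, Node, and NodeProcessorCtx come from org.apache.hadoop.hive.ql.lib, FilterOperator from org.apache.hadoop.hive.ql.exec, ExprNodeDesc from org.apache.hadoop.hive.ql.plan, Stack from java.util).

public class ExampleLBFilterProc implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
        // the walker hands over the matched node; for a filter rule this is a FilterOperator
        FilterOperator fop = (FilterOperator) nd;
        ExprNodeDesc predicate = fop.getConf().getPredicate();
        // a real processor would derive a pruning expression from the predicate
        // and store it on the walker context (procCtx) for later use
        return null;
    }
}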

Example 18 with PrunedPartitionList

use of org.apache.hadoop.hive.ql.parse.PrunedPartitionList in project hive by apache.

the class StatsUtils method collectStatistics.

public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table,
        List<ColumnInfo> schema, List<String> neededColumns, List<String> referencedColumns,
        boolean fetchColStats, boolean fetchPartStats) throws HiveException {
    Statistics stats = new Statistics();
    float deserFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
    if (!table.isPartitioned()) {
        long ds = getDataSize(conf, table);
        long nr = getNumRows(conf, schema, neededColumns, table, ds);
        stats.setNumRows(nr);
        List<ColStatistics> colStats = Lists.newArrayList();
        if (fetchColStats) {
            colStats = getTableColumnStats(table, schema, neededColumns);
            long betterDS = getDataSizeFromColumnStats(nr, colStats);
            ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
        }
        stats.setDataSize(ds);
        // infer if any column can be primary key based on column statistics
        inferAndSetPrimaryKey(stats.getNumRows(), colStats);
        stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
        stats.addToColumnStats(colStats);
    } else if (partList != null) {
        // For partitioned tables, get the size of all the partitions after pruning
        // the partitions that are not required
        long nr = 0;
        long ds = 0;
        List<Long> rowCounts = Lists.newArrayList();
        List<Long> dataSizes = Lists.newArrayList();
        if (fetchPartStats) {
            rowCounts = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
            dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
            nr = getSumIgnoreNegatives(rowCounts);
            ds = getSumIgnoreNegatives(dataSizes);
            if (ds <= 0) {
                dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
                ds = getSumIgnoreNegatives(dataSizes);
            }
        }
        // if the data size is not available, fall back to the filesystem to get file sizes
        if (ds <= 0) {
            dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
        }
        ds = getSumIgnoreNegatives(dataSizes);
        ds = (long) (ds * deserFactor);
        int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
        if (avgRowSize > 0) {
            setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
            nr = getSumIgnoreNegatives(rowCounts);
            ds = getSumIgnoreNegatives(dataSizes);
            // a row count of -1 means the statistics from the metastore are not reliable
            if (nr <= 0) {
                nr = ds / avgRowSize;
            }
        }
        if (nr == 0) {
            nr = 1;
        }
        stats.addToNumRows(nr);
        stats.addToDataSize(ds);
        // if at least one partition is missing a row count, mark the basic stats state as PARTIAL
        if (containsNonPositives(rowCounts) && stats.getBasicStatsState().equals(State.COMPLETE)) {
            stats.setBasicStatsState(State.PARTIAL);
        }
        if (fetchColStats) {
            List<String> partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
            for (Partition part : partList.getNotDeniedPartns()) {
                partNames.add(part.getName());
            }
            neededColumns = processNeededColumns(schema, neededColumns);
            AggrStats aggrStats = null;
            // only contact the metastore when there are needed columns and partitions;
            // otherwise skip the call entirely
            if (neededColumns.size() > 0 && partNames.size() > 0) {
                aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColumns, partNames);
            }
            if (null == aggrStats || null == aggrStats.getColStats() || aggrStats.getColStatsSize() == 0) {
                // Some partitions have no column-stats state (or none was fetched).
                // Add an empty list so the column-stats state is still derived and the
                // underlying structures are initialized.
                List<ColStatistics> emptyStats = Lists.newArrayList();
                // add partition column stats
                addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList, emptyStats);
                stats.addToColumnStats(emptyStats);
                stats.addToDataSize(getDataSizeFromColumnStats(nr, emptyStats));
                stats.updateColumnStatsState(deriveStatType(emptyStats, referencedColumns));
            } else {
                List<ColumnStatisticsObj> colStats = aggrStats.getColStats();
                if (colStats.size() != neededColumns.size()) {
                    LOG.debug("Column stats requested for : " + neededColumns.size() + " columns. Able to" + " retrieve for " + colStats.size() + " columns");
                }
                List<ColStatistics> columnStats = convertColStats(colStats, table.getTableName());
                addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList, columnStats);
                long betterDS = getDataSizeFromColumnStats(nr, columnStats);
                stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
                // infer if any column can be primary key based on column statistics
                inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
                stats.addToColumnStats(columnStats);
                State colState = deriveStatType(columnStats, referencedColumns);
                if (aggrStats.getPartsFound() != partNames.size() && colState != State.NONE) {
                    LOG.debug("Column stats requested for : " + partNames.size() + " partitions. " + "Able to retrieve for " + aggrStats.getPartsFound() + " partitions");
                    colState = State.PARTIAL;
                }
                stats.setColumnStatsState(colState);
            }
        }
    }
    return stats;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) State(org.apache.hadoop.hive.ql.plan.Statistics.State) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) List(java.util.List) ArrayList(java.util.ArrayList) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)
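The row-count fallback in the partitioned branch above is easy to miss: basic stats are summed while treating -1 as unknown, the data size is rescaled by the deserialization factor, and the row count is derived from the average row size only when the metastore counts are unusable. The following is a condensed sketch of that arithmetic; the helper names are hypothetical and the logic is simplified from the method above.

static long sumIgnoreNegatives(List<Long> values) {
    // metastore basic stats use -1 for "unknown"; those entries are skipped
    long sum = 0;
    for (Long v : values) {
        if (v != null && v > 0) {
            sum += v;
        }
    }
    return sum;
}

static long estimateRowCount(List<Long> rowCounts, List<Long> fileSizes, float deserFactor, int avgRowSize) {
    long nr = sumIgnoreNegatives(rowCounts);
    if (nr <= 0 && avgRowSize > 0) {
        // no usable row counts: estimate rows from the deserialized data size
        long ds = (long) (sumIgnoreNegatives(fileSizes) * deserFactor);
        nr = ds / avgRowSize;
    }
    // the method above never reports zero rows
    return nr == 0 ? 1 : nr;
}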

Aggregations

PrunedPartitionList (org.apache.hadoop.hive.ql.parse.PrunedPartitionList): 18 usages
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 14 usages
Table (org.apache.hadoop.hive.ql.metadata.Table): 10 usages
ArrayList (java.util.ArrayList): 9 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 8 usages
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 6 usages
ParseContext (org.apache.hadoop.hive.ql.parse.ParseContext): 5 usages
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 4 usages
HashMap (java.util.HashMap): 3 usages
Map (java.util.Map): 3 usages
LinkedHashMap (java.util.LinkedHashMap): 2 usages
List (java.util.List): 2 usages
HiveConf (org.apache.hadoop.hive.conf.HiveConf): 2 usages
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 2 usages
OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat): 2 usages
ImmutableMap (com.google.common.collect.ImmutableMap): 1 usage
HashSet (java.util.HashSet): 1 usage
LinkedHashSet (java.util.LinkedHashSet): 1 usage
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 1 usage
DruidSchema (org.apache.calcite.adapter.druid.DruidSchema): 1 usage