
Example 56 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class GenMapRedUtils method createTemporaryTableScanOperator.

public static TableScanOperator createTemporaryTableScanOperator(CompilationOpContext ctx, RowSchema rowSchema) {
    TableScanOperator tableScanOp = (TableScanOperator) OperatorFactory.get(ctx, new TableScanDesc(null), rowSchema);
    // Set needed columns for this dummy TableScanOperator
    List<Integer> neededColumnIds = new ArrayList<Integer>();
    List<String> neededColumnNames = new ArrayList<String>();
    List<ColumnInfo> parentColumnInfos = rowSchema.getSignature();
    for (int i = 0; i < parentColumnInfos.size(); i++) {
        neededColumnIds.add(i);
        neededColumnNames.add(parentColumnInfos.get(i).getInternalName());
    }
    tableScanOp.setNeededColumnIDs(neededColumnIds);
    tableScanOp.setNeededColumns(neededColumnNames);
    tableScanOp.setReferencedColumns(neededColumnNames);
    return tableScanOp;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo)
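
For illustration, here is a minimal, hypothetical caller of the factory method above. The one-column schema, the "_col0" name, and the final assertion are invented for this sketch; it assumes the usual Hive classes (CompilationOpContext, RowSchema, ColumnInfo, TypeInfoFactory) plus java.util are imported.

// Hypothetical usage sketch (not part of the Hive source): build a one-column schema
// and wrap it in a dummy scan that reports every column as needed.
CompilationOpContext ctx = new CompilationOpContext();
ColumnInfo col = new ColumnInfo("_col0", TypeInfoFactory.stringTypeInfo, null, false);
RowSchema schema = new RowSchema(new ArrayList<>(Collections.singletonList(col)));
TableScanOperator dummyScan = GenMapRedUtils.createTemporaryTableScanOperator(ctx, schema);
// The factory marks every schema column as needed and referenced.
assert dummyScan.getNeededColumnIDs().size() == schema.getSignature().size();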

Example 57 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class GenMRFileSink1 method processLinkedFileDesc.

/*
   * Multiple file sink descriptors are linked.
   * Use the task created by the first linked file descriptor
   */
private void processLinkedFileDesc(GenMRProcContext ctx, Task<? extends Serializable> childTask) throws SemanticException {
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    TableScanOperator currTopOp = ctx.getCurrTopOp();
    if (currTopOp != null && !ctx.isSeenOp(currTask, currTopOp)) {
        String currAliasId = ctx.getCurrAliasId();
        GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, currTask, false, ctx);
    }
    if (childTask != null) {
        currTask.addDependentTask(childTask);
    }
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator)
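
The method above follows a simple shape: wire a top operator into the current task's plan at most once, then always chain the child task after it. The sketch below is plain Java, not Hive API; TaskNode, planAliasOnce, and plannedAliases are hypothetical names used only to illustrate that shape (assumes java.util imports).

// Illustrative sketch only: "plan each alias at most once, then always chain the child".
final class TaskNode {
    private final List<TaskNode> dependents = new ArrayList<>();
    private final Set<String> plannedAliases = new HashSet<>();

    void planAliasOnce(String aliasId) {
        // mirrors the ctx.isSeenOp(...) guard before GenMapRedUtils.setTaskPlan(...)
        if (plannedAliases.add(aliasId)) {
            // build the plan for this alias here
        }
    }

    void addDependentTask(TaskNode child) {
        if (child != null && !dependents.contains(child)) {
            dependents.add(child);
        }
    }
}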

Example 58 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class SimpleFetchOptimizer method transform.

@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
    Map<String, TableScanOperator> topOps = pctx.getTopOps();
    if (pctx.getQueryProperties().isQuery() && !pctx.getQueryProperties().isAnalyzeCommand() && topOps.size() == 1) {
        // no join, no groupby, no distinct, no lateral view, no subq,
        // no CTAS or insert, not analyze command, and single sourced.
        String alias = (String) pctx.getTopOps().keySet().toArray()[0];
        TableScanOperator topOp = pctx.getTopOps().values().iterator().next();
        try {
            FetchTask fetchTask = optimize(pctx, alias, topOp);
            if (fetchTask != null) {
                pctx.setFetchTask(fetchTask);
            }
        } catch (Exception e) {
            // Has to use full name to make sure it does not conflict with
            // org.apache.commons.lang.StringUtils
            LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
            if (e instanceof SemanticException) {
                throw (SemanticException) e;
            }
            throw new SemanticException(e.getMessage(), e);
        }
    }
    return pctx;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) IOException(java.io.IOException) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask)
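
The transform above reads the single alias and its operator through two separate calls (keySet().toArray()[0] and values().iterator().next()). A small sketch of the same single-source lookup through one entrySet() entry, assuming the same ParseContext pctx as in the example:

Map<String, TableScanOperator> topOps = pctx.getTopOps();
if (topOps.size() == 1) {
    Map.Entry<String, TableScanOperator> only = topOps.entrySet().iterator().next();
    String alias = only.getKey();
    TableScanOperator topOp = only.getValue();
    // hand (alias, topOp) to optimize(pctx, alias, topOp) exactly as in the example
}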

Example 59 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class AbstractBucketJoinProc method checkConvertBucketMapJoin.

/*
   * Can this mapjoin be converted to a bucketed mapjoin ?
   * The following checks are performed:
   * a. The join columns contains all the bucket columns.
   * b. The join keys are not transformed in the sub-query.
   * c. All partitions contain the expected number of files (number of buckets).
   * d. The number of buckets in the big table can be divided by no of buckets in small tables.
   */
protected boolean checkConvertBucketMapJoin(BucketJoinProcCtx context, Map<String, Operator<? extends OperatorDesc>> aliasToOpInfo, Map<Byte, List<ExprNodeDesc>> keysMap, String baseBigAlias, List<String> joinAliases) throws SemanticException {
    LinkedHashMap<String, List<Integer>> tblAliasToNumberOfBucketsInEachPartition = new LinkedHashMap<String, List<Integer>>();
    LinkedHashMap<String, List<List<String>>> tblAliasToBucketedFilePathsInEachPartition = new LinkedHashMap<String, List<List<String>>>();
    HashMap<String, TableScanOperator> topOps = pGraphContext.getTopOps();
    HashMap<String, String> aliasToNewAliasMap = new HashMap<String, String>();
    // (partition to bucket file names) and (partition to bucket number) for
    // the big table;
    LinkedHashMap<Partition, List<String>> bigTblPartsToBucketFileNames = new LinkedHashMap<Partition, List<String>>();
    LinkedHashMap<Partition, Integer> bigTblPartsToBucketNumber = new LinkedHashMap<Partition, Integer>();
    // accessing order of join cols to bucket cols, should be same
    Integer[] joinKeyOrder = null;
    boolean bigTablePartitioned = true;
    for (int index = 0; index < joinAliases.size(); index++) {
        String alias = joinAliases.get(index);
        Operator<? extends OperatorDesc> topOp = aliasToOpInfo.get(alias);
        // The alias may not be present in case of a sub-query
        if (topOp == null) {
            return false;
        }
        List<String> keys = toColumns(keysMap.get((byte) index));
        if (keys == null || keys.isEmpty()) {
            return false;
        }
        int oldKeySize = keys.size();
        TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, keys);
        if (tso == null) {
            // The join keys could not be traced back to a root TableScan, likely because some operator sits
            // between topOp and root TableScan operator. We don't handle that case, and simply return
            return false;
        }
        // For nested sub-queries, the alias mapping is not maintained in QB currently.
        if (topOps.containsValue(tso)) {
            for (Map.Entry<String, TableScanOperator> topOpEntry : topOps.entrySet()) {
                if (topOpEntry.getValue() == tso) {
                    String newAlias = topOpEntry.getKey();
                    if (!newAlias.equals(alias)) {
                        joinAliases.set(index, newAlias);
                        if (baseBigAlias.equals(alias)) {
                            baseBigAlias = newAlias;
                        }
                        aliasToNewAliasMap.put(alias, newAlias);
                        alias = newAlias;
                    }
                    break;
                }
            }
        } else {
            // Ideally, this should never happen, and this should be an assert.
            return false;
        }
        // If a join key is a constant rather than a column of the base table, it would
        // be removed, and the size before and after the genRootTableScan will be different.
        if (keys.size() != oldKeySize) {
            return false;
        }
        if (joinKeyOrder == null) {
            joinKeyOrder = new Integer[keys.size()];
        }
        Table tbl = tso.getConf().getTableMetadata();
        if (AcidUtils.isInsertOnlyTable(tbl.getParameters())) {
            Utilities.FILE_OP_LOGGER.debug("No bucketed join on MM table " + tbl.getTableName());
            return false;
        }
        if (tbl.isPartitioned()) {
            PrunedPartitionList prunedParts = pGraphContext.getPrunedPartitions(alias, tso);
            List<Partition> partitions = prunedParts.getNotDeniedPartns();
            // construct a mapping of (Partition->bucket file names) and (Partition -> bucket number)
            if (partitions.isEmpty()) {
                if (!alias.equals(baseBigAlias)) {
                    tblAliasToNumberOfBucketsInEachPartition.put(alias, Arrays.<Integer>asList());
                    tblAliasToBucketedFilePathsInEachPartition.put(alias, new ArrayList<List<String>>());
                }
            } else {
                List<Integer> buckets = new ArrayList<Integer>();
                List<List<String>> files = new ArrayList<List<String>>();
                for (Partition p : partitions) {
                    if (!checkBucketColumns(p.getBucketCols(), keys, joinKeyOrder)) {
                        return false;
                    }
                    List<String> fileNames = getBucketFilePathsOfPartition(p.getDataLocation(), pGraphContext);
                    // The number of files for the table should be same as number of buckets.
                    int bucketCount = p.getBucketCount();
                    if (fileNames.size() != 0 && fileNames.size() != bucketCount) {
                        String msg = "The number of buckets for table " + tbl.getTableName() + " partition " + p.getName() + " is " + p.getBucketCount() + ", whereas the number of files is " + fileNames.size();
                        throw new SemanticException(ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
                    }
                    if (alias.equals(baseBigAlias)) {
                        bigTblPartsToBucketFileNames.put(p, fileNames);
                        bigTblPartsToBucketNumber.put(p, bucketCount);
                    } else {
                        files.add(fileNames);
                        buckets.add(bucketCount);
                    }
                }
                if (!alias.equals(baseBigAlias)) {
                    tblAliasToNumberOfBucketsInEachPartition.put(alias, buckets);
                    tblAliasToBucketedFilePathsInEachPartition.put(alias, files);
                }
            }
        } else {
            if (!checkBucketColumns(tbl.getBucketCols(), keys, joinKeyOrder)) {
                return false;
            }
            List<String> fileNames = getBucketFilePathsOfPartition(tbl.getDataLocation(), pGraphContext);
            Integer num = Integer.valueOf(tbl.getNumBuckets());
            // The number of files for the table should be same as number of buckets.
            if (fileNames.size() != 0 && fileNames.size() != num) {
                String msg = "The number of buckets for table " + tbl.getTableName() + " is " + tbl.getNumBuckets() + ", whereas the number of files is " + fileNames.size();
                throw new SemanticException(ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
            }
            if (alias.equals(baseBigAlias)) {
                bigTblPartsToBucketFileNames.put(null, fileNames);
                bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets());
                bigTablePartitioned = false;
            } else {
                tblAliasToNumberOfBucketsInEachPartition.put(alias, Arrays.asList(num));
                tblAliasToBucketedFilePathsInEachPartition.put(alias, Arrays.asList(fileNames));
            }
        }
    }
    // the big table can be divided by no of buckets in small tables.
    for (Integer numBucketsInPartitionOfBigTable : bigTblPartsToBucketNumber.values()) {
        if (!checkNumberOfBucketsAgainstBigTable(tblAliasToNumberOfBucketsInEachPartition, numBucketsInPartitionOfBigTable)) {
            return false;
        }
    }
    context.setTblAliasToNumberOfBucketsInEachPartition(tblAliasToNumberOfBucketsInEachPartition);
    context.setTblAliasToBucketedFilePathsInEachPartition(tblAliasToBucketedFilePathsInEachPartition);
    context.setBigTblPartsToBucketFileNames(bigTblPartsToBucketFileNames);
    context.setBigTblPartsToBucketNumber(bigTblPartsToBucketNumber);
    context.setJoinAliases(joinAliases);
    context.setBaseBigAlias(baseBigAlias);
    context.setBigTablePartitioned(bigTablePartitioned);
    if (!aliasToNewAliasMap.isEmpty()) {
        context.setAliasToNewAliasMap(aliasToNewAliasMap);
    }
    return true;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) List(java.util.List) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) Partition(org.apache.hadoop.hive.ql.metadata.Partition) Table(org.apache.hadoop.hive.ql.metadata.Table) Map(java.util.Map)
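
The header comment lists checks (a) through (d). Below is a simplified, standalone sketch of checks (c) and (d) only; the helper names are invented here, and the real logic (getBucketFilePathsOfPartition, checkNumberOfBucketsAgainstBigTable) carries more context than these two methods.

// Check (c), simplified: a partition either has no files yet or exactly one file per bucket.
static boolean bucketFileCountMatches(List<String> bucketFiles, int declaredBucketCount) {
    return bucketFiles.isEmpty() || bucketFiles.size() == declaredBucketCount;
}

// Check (d), simplified: the big table's bucket count must be divisible by each small table's.
static boolean bucketCountsCompatible(int bigTableBuckets, int smallTableBuckets) {
    return smallTableBuckets > 0 && bigTableBuckets % smallTableBuckets == 0;
}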

Example 60 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class AbstractSMBJoinProc method isEligibleForBucketSortMergeJoin.

/**
 * Whether the table at the given position is eligible for a bucketed sort-merge join.
 *
 * @param smbJoinContext        sort-merge bucket join processing context
 * @param keys                  join key expressions for the table being checked
 * @param aliasToOpInfo         map from table alias to its top operator in the join
 * @param aliases               table aliases participating in the join
 * @param pos                   position of the table among the aliases
 * @param sortColumnsFirstTable the names and order of the sorted columns for the first table;
 *                              it is not yet populated when pos = 0
 * @return true if the table's sort columns are compatible with the join columns
 * @throws SemanticException if the pruned partitions for the alias cannot be obtained
 */
private boolean isEligibleForBucketSortMergeJoin(SortBucketJoinProcCtx smbJoinContext, List<ExprNodeDesc> keys, Map<String, Operator<? extends OperatorDesc>> aliasToOpInfo, String[] aliases, int pos, List<Order> sortColumnsFirstTable) throws SemanticException {
    String alias = aliases[pos];
    /*
     * Consider a query like:
     *
     * select -- mapjoin(subq1) --  * from
     * (select a.key, a.value from tbl1 a) subq1
     *   join
     * (select a.key, a.value from tbl2 a) subq2
     * on subq1.key = subq2.key;
     *
     * aliasToOpInfo contains the SelectOperator for subq1 and subq2.
     * We need to traverse the tree (using TableAccessAnalyzer) to get to the base
     * table. If the object being map-joined is a base table, then aliasToOpInfo
     * contains the TableScanOperator, and TableAccessAnalyzer is a no-op.
     */
    Operator<? extends OperatorDesc> topOp = aliasToOpInfo.get(alias);
    if (topOp == null) {
        return false;
    }
    // get all join columns from join keys
    List<String> joinCols = toColumns(keys);
    if (joinCols == null || joinCols.isEmpty()) {
        return false;
    }
    TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, joinCols);
    if (tso == null) {
        return false;
    }
    /*
     * Consider a query like:
     *
     * select count(*) from
     *   (
     *     select key, count(*) from
     *       (
     *         select --mapjoin(a)-- a.key as key, a.value as val1, b.value as val2
     *         from tbl1 a join tbl2 b on a.key = b.key
     *       ) subq1
     *     group by key
     *   ) subq2;
     *
     * The table alias should be subq2:subq1:a which needs to be fetched from topOps.
     */
    if (pGraphContext.getTopOps().containsValue(tso)) {
        for (Map.Entry<String, TableScanOperator> topOpEntry : this.pGraphContext.getTopOps().entrySet()) {
            if (topOpEntry.getValue() == tso) {
                alias = topOpEntry.getKey();
                aliases[pos] = alias;
                break;
            }
        }
    } else {
        // Ideally, this should never happen, and this should be an assert.
        return false;
    }
    Table tbl = tso.getConf().getTableMetadata();
    if (tbl.isPartitioned()) {
        PrunedPartitionList prunedParts = pGraphContext.getPrunedPartitions(alias, tso);
        List<Partition> partitions = prunedParts.getNotDeniedPartns();
        // For the first table, remember the sort columns of its first partition
        if ((pos == 0) && (partitions != null) && (!partitions.isEmpty())) {
            Partition firstPartition = partitions.get(0);
            sortColumnsFirstTable.addAll(firstPartition.getSortCols());
        }
        for (Partition partition : prunedParts.getNotDeniedPartns()) {
            if (!checkSortColsAndJoinCols(partition.getSortCols(), joinCols, sortColumnsFirstTable)) {
                return false;
            }
        }
        return true;
    }
    // Populate the names and order of columns for the first table
    if (pos == 0) {
        sortColumnsFirstTable.addAll(tbl.getSortCols());
    }
    return checkSortColsAndJoinCols(tbl.getSortCols(), joinCols, sortColumnsFirstTable);
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) HashMap(java.util.HashMap) Map(java.util.Map)
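
A simplified sketch of the sort-column requirement the example enforces: the join columns must be covered, in order, by the table's (or partition's) sort columns. This is only an approximation of checkSortColsAndJoinCols, which additionally compares the sort order against sortColumnsFirstTable; Order here is org.apache.hadoop.hive.metastore.api.Order, and the method name is invented for this sketch.

// Simplified stand-in for checkSortColsAndJoinCols (illustration only): the join columns
// must appear as a prefix of the table's sort columns, in the same order.
static boolean sortColsCoverJoinCols(List<Order> sortCols, List<String> joinCols) {
    if (sortCols == null || sortCols.size() < joinCols.size()) {
        return false;
    }
    for (int i = 0; i < joinCols.size(); i++) {
        if (!sortCols.get(i).getCol().equals(joinCols.get(i))) {
            return false;
        }
    }
    return true;
}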

Aggregations

TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator) 88
Operator (org.apache.hadoop.hive.ql.exec.Operator) 35
ArrayList (java.util.ArrayList) 33
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) 28
Table (org.apache.hadoop.hive.ql.metadata.Table) 21
HashMap (java.util.HashMap) 20
Path (org.apache.hadoop.fs.Path) 20
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator) 20
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator) 19
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator) 19
LinkedHashMap (java.util.LinkedHashMap) 18
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator) 18
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator) 18
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator) 15
MapWork (org.apache.hadoop.hive.ql.plan.MapWork) 15
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc) 15
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc) 14
Map (java.util.Map) 13
AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) 12
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator) 12