
Example 66 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.

the class DynamicPartitionPruningOptimization method createFinalRsForSemiJoinOp.

private void createFinalRsForSemiJoinOp(ParseContext parseContext, TableScanOperator ts, GroupByOperator gb, ExprNodeDesc key, String keyBaseAlias, ExprNodeDesc colExpr, boolean isHint) throws SemanticException {
    ArrayList<String> gbOutputNames = new ArrayList<>();
    // One each for min, max and bloom filter
    gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(0));
    gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(1));
    gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(2));
    int colPos = 0;
    ArrayList<ExprNodeDesc> rsValueCols = new ArrayList<ExprNodeDesc>();
    for (int i = 0; i < gbOutputNames.size() - 1; i++) {
        ExprNodeColumnDesc expr = new ExprNodeColumnDesc(key.getTypeInfo(), gbOutputNames.get(colPos++), "", false);
        rsValueCols.add(expr);
    }
    // Bloom Filter uses binary
    ExprNodeColumnDesc colBFExpr = new ExprNodeColumnDesc(TypeInfoFactory.binaryTypeInfo, gbOutputNames.get(colPos++), "", false);
    rsValueCols.add(colBFExpr);
    // Create the final Reduce Sink Operator
    ReduceSinkDesc rsDescFinal = PlanUtils.getReduceSinkDesc(new ArrayList<ExprNodeDesc>(), rsValueCols, gbOutputNames, false, -1, 0, 1, Operation.NOT_ACID);
    ReduceSinkOperator rsOpFinal = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(rsDescFinal, new RowSchema(gb.getSchema()), gb);
    Map<String, ExprNodeDesc> columnExprMap = new HashMap<>();
    rsOpFinal.setColumnExprMap(columnExprMap);
    LOG.debug("DynamicSemiJoinPushdown: Saving RS to TS mapping: " + rsOpFinal + ": " + ts);
    SemiJoinBranchInfo sjInfo = new SemiJoinBranchInfo(ts, isHint);
    parseContext.getRsToSemiJoinBranchInfo().put(rsOpFinal, sjInfo);
    // Save the info that is required at query time to resolve dynamic/runtime values.
    RuntimeValuesInfo runtimeValuesInfo = new RuntimeValuesInfo();
    TableDesc rsFinalTableDesc = PlanUtils.getReduceValueTableDesc(PlanUtils.getFieldSchemasFromColumnList(rsValueCols, "_col"));
    List<String> dynamicValueIDs = new ArrayList<String>();
    dynamicValueIDs.add(keyBaseAlias + "_min");
    dynamicValueIDs.add(keyBaseAlias + "_max");
    dynamicValueIDs.add(keyBaseAlias + "_bloom_filter");
    runtimeValuesInfo.setTableDesc(rsFinalTableDesc);
    runtimeValuesInfo.setDynamicValueIDs(dynamicValueIDs);
    runtimeValuesInfo.setColExprs(rsValueCols);
    runtimeValuesInfo.setTsColExpr(colExpr);
    parseContext.getRsToRuntimeValuesInfoMap().put(rsOpFinal, runtimeValuesInfo);
    parseContext.getColExprToGBMap().put(key, gb);
}
Also used : RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) SemiJoinBranchInfo(org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) SemiJoinHint(org.apache.hadoop.hive.ql.parse.SemiJoinHint) RuntimeValuesInfo(org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)
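
To isolate the TableDesc portion of the example above, here is a minimal standalone sketch (the class, method name, and assumed bigint key type are illustrative only, not part of Hive) that builds the value-side table descriptor for the min/max/bloom-filter columns with the same PlanUtils helpers:

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class SemiJoinValueTableDescSketch {

    public static TableDesc buildValueTableDesc() {
        ArrayList<ExprNodeDesc> valueCols = new ArrayList<>();
        // min and max carry the semi-join key type (bigint assumed here for illustration)
        valueCols.add(new ExprNodeColumnDesc(TypeInfoFactory.longTypeInfo,
            SemanticAnalyzer.getColumnInternalName(0), "", false));
        valueCols.add(new ExprNodeColumnDesc(TypeInfoFactory.longTypeInfo,
            SemanticAnalyzer.getColumnInternalName(1), "", false));
        // the bloom filter is serialized as binary
        valueCols.add(new ExprNodeColumnDesc(TypeInfoFactory.binaryTypeInfo,
            SemanticAnalyzer.getColumnInternalName(2), "", false));
        // same helper chain as createFinalRsForSemiJoinOp above
        List<FieldSchema> schemas = PlanUtils.getFieldSchemasFromColumnList(valueCols, "_col");
        return PlanUtils.getReduceValueTableDesc(schemas);
    }
}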

Example 67 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.

the class GenMRUnion1 method processSubQueryUnionCreateIntermediate.

/**
 * Process the union when the parent is a map-reduce job. Create a temporary
 * output, and let the union task read from the temporary output.
 *
 * The files created for all the inputs are collected in the union context and
 * later used to initialize the union plan.
 *
 * @param parent
 *          the operator whose output is materialized into the temporary file.
 * @param child
 *          the operator that will read the temporary output through the new table scan.
 * @param uTask
 *          the task that executes the union plan.
 * @param ctx
 *          the GenMR processing context.
 * @param uCtxTask
 *          the union context that accumulates the temporary directories, table
 *          descriptors and table scan operators.
 */
private void processSubQueryUnionCreateIntermediate(Operator<? extends OperatorDesc> parent, Operator<? extends OperatorDesc> child, Task<? extends Serializable> uTask, GenMRProcContext ctx, GenMRUnionCtx uCtxTask) {
    ParseContext parseCtx = ctx.getParseCtx();
    TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));
    // generate the temporary file
    Context baseCtx = parseCtx.getContext();
    Path taskTmpDir = baseCtx.getMRTmpPath();
    // Create the temporary file, its corresponding FileSinkOperator, and
    // its corresponding TableScanOperator.
    TableScanOperator tableScanOp = GenMapRedUtils.createTemporaryFile(parent, child, taskTmpDir, tt_desc, parseCtx);
    // Add the path to alias mapping
    uCtxTask.addTaskTmpDir(taskTmpDir.toUri().toString());
    uCtxTask.addTTDesc(tt_desc);
    uCtxTask.addListTopOperators(tableScanOp);
    // The union task is empty. The files created for all the inputs are
    // assembled in the union context and later used to initialize the union
    // plan
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    currTask.addDependentTask(uTask);
    if (ctx.getRootTasks().contains(uTask)) {
        ctx.getRootTasks().remove(uTask);
        if (!ctx.getRootTasks().contains(currTask) && shouldBeRootTask(currTask)) {
            ctx.getRootTasks().add(currTask);
        }
    }
}
Also used : ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Context(org.apache.hadoop.hive.ql.Context) UnionProcContext(org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext) UnionParseContext(org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext.UnionParseContext) Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) UnionParseContext(org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext.UnionParseContext) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
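
As a complement, a minimal sketch (class and method names are hypothetical, and the two string/int columns are placeholders) of how an intermediate-file TableDesc can be derived from a row schema with the same helpers used above:

import java.util.ArrayList;

import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class IntermediateTableDescSketch {

    public static TableDesc forPlaceholderSchema() {
        ArrayList<ColumnInfo> cols = new ArrayList<>();
        cols.add(new ColumnInfo("_col0", TypeInfoFactory.stringTypeInfo, null, false));
        cols.add(new ColumnInfo("_col1", TypeInfoFactory.intTypeInfo, null, false));
        RowSchema schema = new RowSchema(cols);
        // same pattern as processSubQueryUnionCreateIntermediate above: field schemas are
        // extracted from the row schema and wrapped in an intermediate-file TableDesc
        return PlanUtils.getIntermediateFileTableDesc(
            PlanUtils.getFieldSchemasFromRowSchema(schema, "temporarycol"));
    }
}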

Example 68 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.

the class GenMapRedUtils method createMRWorkForMergingFiles.

/**
 * Create the MapWork of a file-merge task based on the input path, the top
 * operator and the input table descriptor.
 *
 * @param conf
 *          the current Hive configuration.
 * @param topOp
 *          the table scan operator that is the root of the MapReduce task.
 * @param fsDesc
 *          the file sink descriptor that serves as the input to this merge task.
 * @return the MapWork of the merge task.
 */
private static MapWork createMRWorkForMergingFiles(HiveConf conf, TableScanOperator topOp, FileSinkDesc fsDesc) {
    ArrayList<String> aliases = new ArrayList<String>();
    Path inputDir = StringInternUtils.internUriStringsInPath(fsDesc.getMergeInputDirName());
    String inputDirStr = inputDir.toString().intern();
    TableDesc tblDesc = fsDesc.getTableInfo();
    // dummy alias: just use the input path
    aliases.add(inputDirStr);
    // constructing the default MapredWork
    MapredWork cMrPlan = GenMapRedUtils.getMapRedWorkFromConf(conf);
    MapWork cplan = cMrPlan.getMapWork();
    cplan.addPathToAlias(inputDir, aliases);
    cplan.addPathToPartitionInfo(inputDir, new PartitionDesc(tblDesc, null));
    cplan.getAliasToWork().put(inputDirStr, topOp);
    cplan.setMapperCannotSpanPartns(true);
    return cplan;
}
Also used : Path(org.apache.hadoop.fs.Path) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ArrayList(java.util.ArrayList) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
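
The TableDesc-related wiring in this example can be summarized in a small sketch (class and method names are hypothetical; the real method obtains its MapWork from GenMapRedUtils.getMapRedWorkFromConf rather than a bare constructor):

import java.util.ArrayList;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;

public class MergeMapWorkSketch {

    public static MapWork wire(Path inputDir, TableDesc tblDesc, TableScanOperator topOp) {
        MapWork work = new MapWork();
        // dummy alias: the input path itself
        String alias = inputDir.toString();
        ArrayList<String> aliases = new ArrayList<>();
        aliases.add(alias);
        // path -> alias, path -> partition info (carrying the TableDesc), alias -> operator tree
        work.addPathToAlias(inputDir, aliases);
        work.addPathToPartitionInfo(inputDir, new PartitionDesc(tblDesc, null));
        work.getAliasToWork().put(alias, topOp);
        work.setMapperCannotSpanPartns(true);
        return work;
    }
}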

Example 69 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.

the class GenMapRedUtils method setUnionPlan.

private static void setUnionPlan(GenMRProcContext opProcCtx, boolean local, Task<? extends Serializable> currTask, GenMRUnionCtx uCtx, boolean mergeTask) throws SemanticException {
    TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
    if (currTopOp != null) {
        String currAliasId = opProcCtx.getCurrAliasId();
        if (mergeTask || !opProcCtx.isSeenOp(currTask, currTopOp)) {
            setTaskPlan(currAliasId, currTopOp, currTask, local, opProcCtx);
        }
        currTopOp = null;
        opProcCtx.setCurrTopOp(currTopOp);
    } else {
        List<String> taskTmpDirLst = uCtx.getTaskTmpDir();
        if ((taskTmpDirLst != null) && !(taskTmpDirLst.isEmpty())) {
            List<TableDesc> tt_descLst = uCtx.getTTDesc();
            assert !taskTmpDirLst.isEmpty() && !tt_descLst.isEmpty();
            assert taskTmpDirLst.size() == tt_descLst.size();
            int size = taskTmpDirLst.size();
            assert local == false;
            List<TableScanOperator> topOperators = uCtx.getListTopOperators();
            MapredWork plan = (MapredWork) currTask.getWork();
            for (int pos = 0; pos < size; pos++) {
                String taskTmpDir = taskTmpDirLst.get(pos);
                Path taskTmpDirPath = new Path(taskTmpDir);
                MapWork mWork = plan.getMapWork();
                if (!mWork.getPathToAliases().containsKey(taskTmpDirPath)) {
                    taskTmpDir = taskTmpDir.intern();
                    StringInternUtils.internUriStringsInPath(taskTmpDirPath);
                    TableDesc tt_desc = tt_descLst.get(pos);
                    mWork.addPathToAlias(taskTmpDirPath, taskTmpDir);
                    mWork.addPathToPartitionInfo(taskTmpDirPath, new PartitionDesc(tt_desc, null));
                    mWork.getAliasToWork().put(taskTmpDir, topOperators.get(pos));
                }
            }
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
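
The per-branch wiring loop above follows one pattern, shown in this small sketch (hypothetical helper): each union input contributes a temporary directory, a TableDesc and a TableScanOperator at the same position, and a triple is only added if its path is not already registered:

import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;

public class UnionBranchWiringSketch {

    public static void wire(MapWork mWork, List<String> tmpDirs,
            List<TableDesc> descs, List<TableScanOperator> topOps) {
        for (int pos = 0; pos < tmpDirs.size(); pos++) {
            Path dir = new Path(tmpDirs.get(pos));
            if (mWork.getPathToAliases().containsKey(dir)) {
                // this branch is already wired into the plan
                continue;
            }
            // the temporary directory string doubles as the alias
            mWork.addPathToAlias(dir, tmpDirs.get(pos));
            mWork.addPathToPartitionInfo(dir, new PartitionDesc(descs.get(pos), null));
            mWork.getAliasToWork().put(tmpDirs.get(pos), topOps.get(pos));
        }
    }
}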

Example 70 with TableDesc

use of org.apache.hadoop.hive.ql.plan.TableDesc in project hive by apache.

the class MapJoinProcessor method genMapJoinLocalWork.

/**
 * Generate the MapRed local work for the given map-join operator.
 *
 * @param newWork
 *          the MapredWork whose small-table aliases are moved into local work.
 * @param mapJoinOp
 *          map-join operator for which local work needs to be generated.
 * @param bigTablePos
 *          position of the big table among the join inputs; its alias is skipped.
 * @throws SemanticException
 */
private static void genMapJoinLocalWork(MapredWork newWork, MapJoinOperator mapJoinOp, int bigTablePos) throws SemanticException {
    // keep the small table alias to avoid concurrent modification exception
    ArrayList<String> smallTableAliasList = new ArrayList<String>();
    // create a new  MapredLocalWork
    MapredLocalWork newLocalWork = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
    for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : newWork.getMapWork().getAliasToWork().entrySet()) {
        String alias = entry.getKey();
        Operator<? extends OperatorDesc> op = entry.getValue();
        // if the table scan is for the big table, then skip it
        // tracing down the operator tree from the table scan operator
        Operator<? extends OperatorDesc> parentOp = op;
        Operator<? extends OperatorDesc> childOp = op.getChildOperators().get(0);
        while ((childOp != null) && (!childOp.equals(mapJoinOp))) {
            parentOp = childOp;
            assert parentOp.getChildOperators().size() == 1;
            childOp = parentOp.getChildOperators().get(0);
        }
        if (childOp == null) {
            throw new SemanticException("Cannot find join op by tracing down the table scan operator tree");
        }
        // skip the big table pos
        int i = childOp.getParentOperators().indexOf(parentOp);
        if (i == bigTablePos) {
            continue;
        }
        // set alias to work and put into smallTableAliasList
        newLocalWork.getAliasToWork().put(alias, op);
        smallTableAliasList.add(alias);
        // get input path and remove this alias from pathToAlias
        // because this file will be fetched by fetch operator
        LinkedHashMap<Path, ArrayList<String>> pathToAliases = newWork.getMapWork().getPathToAliases();
        // keep a record of all the input paths for this alias
        HashSet<Path> pathSet = new HashSet<>();
        HashSet<Path> emptyPath = new HashSet<>();
        for (Map.Entry<Path, ArrayList<String>> entry2 : pathToAliases.entrySet()) {
            Path path = entry2.getKey();
            ArrayList<String> list = entry2.getValue();
            if (list.contains(alias)) {
                // add to path set
                pathSet.add(path);
                // remove this alias from the alias list
                list.remove(alias);
                if (list.size() == 0) {
                    emptyPath.add(path);
                }
            }
        }
        // remove the paths that no longer have any alias associated with them
        for (Path path : emptyPath) {
            newWork.getMapWork().removePathToAlias(path);
        }
        // create fetch work
        FetchWork fetchWork = null;
        List<Path> partDir = new ArrayList<Path>();
        List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
        for (Path tablePath : pathSet) {
            PartitionDesc partitionDesc = newWork.getMapWork().getPathToPartitionInfo().get(tablePath);
            // create fetchwork for non partitioned table
            if (partitionDesc.getPartSpec() == null || partitionDesc.getPartSpec().size() == 0) {
                fetchWork = new FetchWork(tablePath, partitionDesc.getTableDesc());
                break;
            }
            // if the table is partitioned, add partDir and partitionDesc
            partDir.add(tablePath);
            partDesc.add(partitionDesc);
        }
        // create fetchwork for partitioned table
        if (fetchWork == null) {
            TableDesc table = newWork.getMapWork().getAliasToPartnInfo().get(alias).getTableDesc();
            fetchWork = new FetchWork(partDir, partDesc, table);
        }
        // set alias to fetch work
        newLocalWork.getAliasToFetchWork().put(alias, fetchWork);
    }
    // remove small table aliases from aliasToWork to avoid concurrent modification
    for (String alias : smallTableAliasList) {
        newWork.getMapWork().getAliasToWork().remove(alias);
    }
    // set up local work
    newWork.getMapWork().setMapRedLocalWork(newLocalWork);
    // remove reducer
    newWork.setReduceWork(null);
}
Also used : LateralViewJoinOperator(org.apache.hadoop.hive.ql.exec.LateralViewJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) ScriptOperator(org.apache.hadoop.hive.ql.exec.ScriptOperator) Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) Map(java.util.Map) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) HashSet(java.util.HashSet)
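
The two FetchWork constructors exercised above can be isolated in a short sketch (hypothetical helper class): a non-partitioned small table only needs its path and TableDesc, while a partitioned one also needs the partition directories and PartitionDescs:

import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;

public class SmallTableFetchWorkSketch {

    // non-partitioned small table: a single path plus the table-level TableDesc
    public static FetchWork forUnpartitioned(Path tablePath, TableDesc tableDesc) {
        return new FetchWork(tablePath, tableDesc);
    }

    // partitioned small table: partition directories and PartitionDescs plus the TableDesc
    public static FetchWork forPartitioned(List<Path> partDirs,
            List<PartitionDesc> partDescs, TableDesc tableDesc) {
        return new FetchWork(partDirs, partDescs, tableDesc);
    }
}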

Aggregations

TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 93
ArrayList (java.util.ArrayList): 47
Path (org.apache.hadoop.fs.Path): 34
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 29
HashMap (java.util.HashMap): 26
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 26
LinkedHashMap (java.util.LinkedHashMap): 23
Properties (java.util.Properties): 19
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 19
LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc): 18
Operator (org.apache.hadoop.hive.ql.exec.Operator): 16
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 16
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 16
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 16
JobConf (org.apache.hadoop.mapred.JobConf): 15
List (java.util.List): 14
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 14
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 14
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 11
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 11