Search in sources :

Example 41 with Operator

use of org.apache.hadoop.hive.ql.exec.Operator in project hive by apache.

the class GenMRFileSink1 method process.

/**
 * Handles a file sink operator reached while generating the map-reduce plan.
 *
 * <p>The task recorded for the sink's first parent becomes the current task and is flagged as a
 * final map-reduce task. When the context already carries a linked-file-sink task map, the sink
 * is handed to {@code processLinkedFileDesc} and the method returns right away. Otherwise the
 * sink goes through {@code processFS}, a merge job is created if
 * {@code GenMapRedUtils.isMergeRequired} requests one, linked sink descriptors are registered
 * against the task's single child, and the task is marked as fetch source when the fetch task
 * reads from this sink.
 *
 * @param nd
 *          the file sink operator encountered
 * @param stack
 *          operator stack, forwarded unchanged to {@code processFS}
 * @param opProcCtx
 *          context
 * @param nodeOutputs
 *          not read by this method
 * @return always {@code true}
 * @throws SemanticException
 *           declared by this method; presumably raised by the planning helpers it calls
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    GenMRProcContext genCtx = (GenMRProcContext) opProcCtx;
    ParseContext pctx = genCtx.getParseCtx();
    FileSinkOperator sink = (FileSinkOperator) nd;

    // The task associated with the sink's first parent is taken as the current task.
    GenMapRedCtx parentCtx = genCtx.getMapCurrCtx().get(sink.getParentOperators().get(0));
    Task<?> task = parentCtx.getCurrTask();
    genCtx.setCurrTask(task);
    genCtx.addRootIfPossible(task);

    // Result of GenMapRedUtils.isInsertInto; it feeds the merge decision further down.
    boolean insertInto = GenMapRedUtils.isInsertInto(pctx, sink);
    HiveConf conf = pctx.getConf();

    // Mark this task as a final map reduce task (ignoring the optional merge task).
    MapredWork mrWork = (MapredWork) task.getWork();
    mrWork.setFinalMapRed(true);

    // If this file sink desc has been processed due to a linked file sink desc,
    // use that task.
    Map<FileSinkDesc, Task<?>> knownLinkedTasks = genCtx.getLinkedFileDescTasks();
    if (knownLinkedTasks != null) {
        processLinkedFileDesc(genCtx, knownLinkedTasks.get(sink.getConf()));
        return true;
    }

    // A sink already present in the seen set is not considered for merging again.
    boolean alreadySeen = genCtx.getSeenFileSinkOps() != null && genCtx.getSeenFileSinkOps().contains(nd);
    boolean mergeNeeded = !alreadySeen
        && GenMapRedUtils.isMergeRequired(genCtx.getMvTask(), conf, sink, task, insertInto);

    Path finalName = processFS(sink, stack, opProcCtx, mergeNeeded);
    if (mergeNeeded) {
        // Merge the files in the destination table/partitions by creating Map-only merge job
        // If underlying data is RCFile or OrcFile, RCFileBlockMerge task or
        // OrcFileStripeMerge task would be created.
        LOG.info("using CombineHiveInputformat for the merge job");
        GenMapRedUtils.createMRWorkForMergingFiles(sink, finalName, genCtx.getDependencyTaskForMultiInsert(), genCtx.getMvTask(), conf, task, pctx.getQueryState().getLineageState());
    }

    FileSinkDesc sinkDesc = sink.getConf();
    // Linked file sinks with exactly one child task: remember that child for every linked desc.
    if (sinkDesc.isLinkedFileSink() && task.getChildTasks() != null && task.getChildTasks().size() == 1) {
        Map<FileSinkDesc, Task<?>> linkedTasks = genCtx.getLinkedFileDescTasks();
        if (linkedTasks == null) {
            linkedTasks = new HashMap<>();
            genCtx.setLinkedFileDescTasks(linkedTasks);
        }
        for (FileSinkDesc linkedDesc : sinkDesc.getLinkedFileSinkDesc()) {
            linkedTasks.put(linkedDesc, task.getChildTasks().get(0));
        }
    }

    // A childless task whose sink is read by the fetch task becomes the fetch source.
    FetchTask fetch = pctx.getFetchTask();
    if (fetch != null && task.getNumChild() == 0 && fetch.isFetchFrom(sinkDesc)) {
        task.setFetchSource(true);
    }
    return true;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Path(org.apache.hadoop.fs.Path) Task(org.apache.hadoop.hive.ql.exec.Task) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FileSinkDesc(org.apache.hadoop.hive.ql.plan.FileSinkDesc) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) HiveConf(org.apache.hadoop.hive.conf.HiveConf) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 42 with Operator

use of org.apache.hadoop.hive.ql.exec.Operator in project hive by apache.

the class GenMRRedSink2 method process.

/**
 * Handles a reduce sink operator reached while generating the map-reduce plan.
 *
 * <p>The task and alias id recorded for the operator's first parent are made current on the
 * context. The operator-to-task map is then consulted for the reduce sink's first child: with no
 * task registered there, the two-argument {@code GenMapRedUtils.splitPlan} is used; otherwise
 * the four-argument overload is called and the registered task becomes the current one. Finally
 * the context's current task and alias id are recorded for this operator.
 *
 * @param nd
 *          the reduce sink operator encountered
 * @param stack
 *          not read by this method
 * @param opProcCtx
 *          context
 * @param nodeOutputs
 *          passed to {@code GenMapRedUtils.hasBranchFinished}
 * @return {@code false} when {@code hasBranchFinished} reports true (the task is then offered
 *         as a root task), {@code true} otherwise
 * @throws SemanticException
 *           declared by this method; presumably raised by the plan-splitting helpers
 */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    ReduceSinkOperator reduceSink = (ReduceSinkOperator) nd;
    GenMRProcContext genCtx = (GenMRProcContext) opProcCtx;

    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> ctxByOp = genCtx.getMapCurrCtx();
    GenMapRedCtx parentCtx = ctxByOp.get(reduceSink.getParentOperators().get(0));
    Task<?> parentTask = parentCtx.getCurrTask();
    String aliasId = parentCtx.getCurrAliasId();

    // Look up whether the operator below the reduce sink already belongs to a task.
    Operator<? extends OperatorDesc> reducerOp = reduceSink.getChildOperators().get(0);
    Task<?> existingTask = genCtx.getOpTaskMap().get(reducerOp);

    genCtx.setCurrAliasId(aliasId);
    genCtx.setCurrTask(parentTask);

    final Task<?> resultTask;
    if (existingTask != null) {
        GenMapRedUtils.splitPlan(reduceSink, parentTask, existingTask, genCtx);
        // From here on the previously registered task is the current one.
        resultTask = existingTask;
        genCtx.setCurrTask(resultTask);
    } else {
        GenMapRedUtils.splitPlan(reduceSink, genCtx);
        resultTask = parentTask;
    }

    // Read task and alias back from the context: splitPlan may have updated them.
    ctxByOp.put(reduceSink, new GenMapRedCtx(genCtx.getCurrTask(), genCtx.getCurrAliasId()));

    boolean branchFinished = GenMapRedUtils.hasBranchFinished(nodeOutputs);
    if (branchFinished) {
        genCtx.addRootIfPossible(resultTask);
    }
    return !branchFinished;
}
Also used : Operator(org.apache.hadoop.hive.ql.exec.Operator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) Task(org.apache.hadoop.hive.ql.exec.Task) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 43 with Operator

use of org.apache.hadoop.hive.ql.exec.Operator in project hive by apache.

the class GenMapRedUtils method setKeyAndValueDesc.

/**
 * Sets key and value descriptors on the plan by walking the operator tree depth-first from
 * {@code topOp}. Descent along a path stops at the first reduce sink operator, which is handed
 * to the {@code ReduceSinkOperator} overload of this method.
 *
 * @param plan
 *          current plan
 * @param topOp
 *          current top operator in the path; {@code null} is accepted and ignored
 */
public static void setKeyAndValueDesc(ReduceWork plan, Operator<? extends OperatorDesc> topOp) {
    if (topOp == null) {
        return;
    }
    if (topOp instanceof ReduceSinkOperator) {
        // The cast selects the ReduceSinkOperator overload rather than recursing into this one.
        setKeyAndValueDesc(plan, (ReduceSinkOperator) topOp);
        return;
    }
    List<Operator<? extends OperatorDesc>> childOps = topOp.getChildOperators();
    if (childOps == null) {
        return;
    }
    for (Operator<? extends OperatorDesc> childOp : childOps) {
        setKeyAndValueDesc(plan, childOp);
    }
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 44 with Operator

use of org.apache.hadoop.hive.ql.exec.Operator in project hive by apache.

the class GenMapRedUtils method splitTasks.

/**
 * Splits two tasks by placing a temporary file between them: the parent task writes it and the
 * child task reads it back through a table scan operator that is added to the child's map work.
 *
 * @param op reduce sink operator being processed; must have exactly one parent
 * @param parentTask the parent task
 * @param childTask the child task; made dependent on {@code parentTask} and removed from the
 *          root tasks if present there
 * @param opProcCtx context; on return its current top operator and alias id are cleared and its
 *          current task is {@code childTask}
 * @throws IllegalStateException if {@code op} does not have exactly one parent
 * @throws SemanticException declared by this method; presumably raised by the helpers it calls
 */
@SuppressWarnings("nls")
private static void splitTasks(ReduceSinkOperator op, Task<?> parentTask, Task<?> childTask, GenMRProcContext opProcCtx) throws SemanticException {
    if (op.getNumParent() != 1) {
        throw new IllegalStateException("Expecting operator " + op + " to have one parent. " + "But found multiple parents : " + op.getParentOperators());
    }
    ParseContext pctx = opProcCtx.getParseCtx();
    parentTask.addDependentTask(childTask);

    // Root Task cannot depend on any other task, therefore childTask cannot be
    // a root Task
    List<Task<?>> roots = opProcCtx.getRootTasks();
    if (roots.contains(childTask)) {
        roots.remove(childTask);
    }

    // Generate the temporary file name
    Path tmpDir = pctx.getContext().getMRTmpPath();
    Operator<? extends OperatorDesc> parentOp = op.getParentOperators().get(0);
    TableDesc tmpTableDesc = PlanUtils.getIntermediateFileTableDesc(
        PlanUtils.getFieldSchemasFromRowSchema(parentOp.getSchema(), "temporarycol"));

    // Create the temporary file, its corresponding FileSinkOperator, and
    // its corresponding TableScanOperator.
    TableScanOperator tmpScan = createTemporaryFile(parentOp, op, tmpDir, tmpTableDesc, pctx);
    opProcCtx.getMapCurrCtx().put(tmpScan, new GenMapRedCtx(childTask, null));

    // Default alias is the temporary path itself; replaced below when tagging is needed.
    String streamDesc = tmpDir.toUri().toString();
    MapredWork childPlan = (MapredWork) childTask.getWork();
    if (needsTagging(childPlan.getReduceWork())) {
        Operator<? extends OperatorDesc> reducerOp = childPlan.getReduceWork().getReducer();

        // Use the join's id as alias prefix, but only when the parse context knows this join.
        String joinId = null;
        if (reducerOp instanceof JoinOperator) {
            joinId = pctx.getJoinOps().contains(reducerOp)
                ? ((JoinOperator) reducerOp).getConf().getId() : null;
        } else if (reducerOp instanceof MapJoinOperator) {
            joinId = pctx.getMapJoinOps().contains(reducerOp)
                ? ((MapJoinOperator) reducerOp).getConf().getId() : null;
        } else if (reducerOp instanceof SMBMapJoinOperator) {
            joinId = pctx.getSmbMapJoinOps().contains(reducerOp)
                ? ((SMBMapJoinOperator) reducerOp).getConf().getId() : null;
        }

        String baseDesc = (joinId == null) ? "$INTNAME" : joinId + ":$INTNAME";
        streamDesc = baseDesc;
        // Append 1, 2, 3, ... until the alias is not yet taken in the child's map work.
        for (int suffix = 1; childPlan.getMapWork().getAliasToWork().get(streamDesc) != null; suffix++) {
            streamDesc = baseDesc.concat(String.valueOf(suffix));
        }
        // TODO: Allocate work to remove the temporary files and make that
        // dependent on the redTask
        childPlan.getReduceWork().setNeedsTagging(true);
    }

    // Add the path to alias mapping
    setTaskPlan(tmpDir, streamDesc, tmpScan, childPlan.getMapWork(), false, tmpTableDesc);
    opProcCtx.setCurrTopOp(null);
    opProcCtx.setCurrAliasId(null);
    opProcCtx.setCurrTask(childTask);
    opProcCtx.addRootIfPossible(parentTask);
}
Also used : ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Context(org.apache.hadoop.hive.ql.Context) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) Path(org.apache.hadoop.fs.Path) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) TezTask(org.apache.hadoop.hive.ql.exec.tez.TezTask) Task(org.apache.hadoop.hive.ql.exec.Task) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) DependencyCollectionTask(org.apache.hadoop.hive.ql.exec.DependencyCollectionTask) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) 
OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 45 with Operator

use of org.apache.hadoop.hive.ql.exec.Operator in project hive by apache.

the class GenMapRedUtils method createMergeTask.

/**
 * Create a block level merge task for RCFiles or stripe level merge task for
 * ORCFiles.
 *
 * @param fsInputDesc
 *          file sink descriptor supplying the merge input directory, table info, list
 *          bucketing context, dynamic partition context, write id and statement id
 * @param finalName
 *          path given to the merge work and set as the merge descriptor's output path
 * @param hasDynamicPartitions
 *          forwarded to the merge work; when true the input directory is not added to the
 *          work's input path list
 * @param ctx
 *          compilation context passed to {@code OperatorFactory.get} for the merge operator
 * @return the configured merge work for a table stored as RCFile or ORCFile
 * @throws SemanticException
 *           if the table's input format is neither RCFile nor ORCFile
 */
public static MapWork createMergeTask(FileSinkDesc fsInputDesc, Path finalName, boolean hasDynamicPartitions, CompilationOpContext ctx) throws SemanticException {
    Path mergeInputDir = fsInputDesc.getMergeInputDirName();
    TableDesc tableDesc = fsInputDesc.getTableInfo();

    // The input directory is listed only when there is neither dynamic partitioning
    // nor skewed-stored-as-directories (list bucketing).
    List<Path> mergeInputDirs = new ArrayList<>(1);
    if (!(hasDynamicPartitions || GenMapRedUtils.isSkewedStoredAsDirs(fsInputDesc))) {
        mergeInputDirs.add(mergeInputDir);
    }
    List<String> aliases = new ArrayList<>(1);
    aliases.add(mergeInputDir.toString());

    // internal input format class for CombineHiveInputFormat
    final Class<? extends InputFormat> mergeInputFormat;
    if (tableDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) {
        mergeInputFormat = RCFileBlockMergeInputFormat.class;
    } else if (tableDesc.getInputFileFormatClass().equals(OrcInputFormat.class)) {
        mergeInputFormat = OrcFileStripeMergeInputFormat.class;
    } else {
        throw new SemanticException("createMergeTask called on a table with file" + " format other than RCFile or ORCFile");
    }

    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
        Utilities.FILE_OP_LOGGER.trace("creating mergefilework from " + mergeInputDirs + " to " + finalName);
    }

    // create the merge file work
    MergeFileWork mergeWork = new MergeFileWork(mergeInputDirs, finalName, hasDynamicPartitions, tableDesc.getInputFileFormatClass().getName(), tableDesc);
    Map<Path, List<String>> pathToAliases = new LinkedHashMap<>();
    pathToAliases.put(mergeInputDir, aliases);
    mergeWork.setMapperCannotSpanPartns(true);
    mergeWork.setPathToAliases(pathToAliases);

    // The partition descriptor reuses the table descriptor but reads through the merge format.
    PartitionDesc partDesc = new PartitionDesc(tableDesc, null);
    partDesc.setInputFileFormatClass(mergeInputFormat);
    mergeWork.addPathToPartitionInfo(mergeInputDir, partDesc);
    mergeWork.setListBucketingCtx(fsInputDesc.getLbCtx());

    // Anything that is not RCFile is ORC here: other formats were rejected above.
    final FileMergeDesc mergeDesc = tableDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)
        ? new RCFileMergeDesc()
        : new OrcFileMergeDesc();
    mergeDesc.setIsMmTable(fsInputDesc.isMmTable());
    mergeDesc.setIsCompactionTable(AcidUtils.isCompactionTable(tableDesc.getProperties()));
    mergeDesc.setWriteId(fsInputDesc.getTableWriteId());

    // A statement id of -1 is replaced by 0.
    int statementId = fsInputDesc.getStatementId();
    if (statementId == -1) {
        statementId = 0;
    }
    mergeDesc.setStmtId(statementId);
    mergeDesc.setDpCtx(fsInputDesc.getDynPartCtx());
    mergeDesc.setOutputPath(finalName);
    mergeDesc.setHasDynamicPartitions(mergeWork.hasDynamicPartitions());
    mergeDesc.setListBucketingAlterTableConcatenate(mergeWork.isListBucketingAlterTableConcatenate());

    // Depth is 0 without a list bucketing context.
    int bucketingDepth = 0;
    if (mergeWork.getListBucketingCtx() != null) {
        bucketingDepth = mergeWork.getListBucketingCtx().calculateListBucketingLevel();
    }
    mergeDesc.setListBucketingDepth(bucketingDepth);

    // create alias to work which contains the merge operator
    Operator<? extends OperatorDesc> mergeOp = OperatorFactory.get(ctx, mergeDesc);
    LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = new LinkedHashMap<>();
    aliasToWork.put(mergeInputDir.toString(), mergeOp);
    mergeWork.setAliasToWork(aliasToWork);
    return mergeWork;
}
Also used : Path(org.apache.hadoop.fs.Path) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) RCFileMergeDesc(org.apache.hadoop.hive.ql.plan.RCFileMergeDesc) OrcFileMergeDesc(org.apache.hadoop.hive.ql.plan.OrcFileMergeDesc) FileMergeDesc(org.apache.hadoop.hive.ql.plan.FileMergeDesc) RCFileMergeDesc(org.apache.hadoop.hive.ql.plan.RCFileMergeDesc) OrcFileMergeDesc(org.apache.hadoop.hive.ql.plan.OrcFileMergeDesc) ArrayList(java.util.ArrayList) OrcFileStripeMergeInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcFileStripeMergeInputFormat) LinkedHashMap(java.util.LinkedHashMap) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) ArrayList(java.util.ArrayList) List(java.util.List) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)

Aggregations

Operator (org.apache.hadoop.hive.ql.exec.Operator)215 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)167 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)156 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)134 ArrayList (java.util.ArrayList)123 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)119 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)118 FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)107 GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator)103 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)97 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)85 OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)85 SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator)79 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)71 HashMap (java.util.HashMap)65 LinkedHashMap (java.util.LinkedHashMap)64 RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema)62 LimitOperator (org.apache.hadoop.hive.ql.exec.LimitOperator)60 AbstractMapJoinOperator (org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator)59 DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator)52