Examples with MapWork - org.apache.hadoop.hive.ql.plan.MapWork

Example 16 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

the class GenericUDTFGetSplits method getSplits.

public InputSplit[] getSplits(JobConf job, int numSplits, TezWork work, Schema schema) throws IOException {
    DAG dag = DAG.create(work.getName());
    dag.setCredentials(job.getCredentials());
    DagUtils utils = DagUtils.getInstance();
    Context ctx = new Context(job);
    MapWork mapWork = (MapWork) work.getAllWork().get(0);
    // bunch of things get setup in the context based on conf but we need only the MR tmp directory
    // for the following method.
    JobConf wxConf = utils.initializeVertexConf(job, ctx, mapWork);
    // TODO: should we also whitelist input formats here? from mapred.input.format.class
    Path scratchDir = utils.createTezDir(ctx.getMRScratchDir(), job);
    FileSystem fs = scratchDir.getFileSystem(job);
    try {
        LocalResource appJarLr = createJarLocalResource(utils.getExecJarPathLocal(), utils, job);
        Vertex wx = utils.createVertex(wxConf, mapWork, scratchDir, appJarLr, new ArrayList<LocalResource>(), fs, ctx, false, work, work.getVertexType(mapWork));
        String vertexName = wx.getName();
        dag.addVertex(wx);
        utils.addCredentials(mapWork, dag);
        // we have the dag now proceed to get the splits:
        Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS));
        Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS));
        HiveSplitGenerator splitGenerator = new HiveSplitGenerator(wxConf, mapWork);
        List<Event> eventList = splitGenerator.initialize();
        InputSplit[] result = new InputSplit[eventList.size() - 1];
        InputConfigureVertexTasksEvent configureEvent = (InputConfigureVertexTasksEvent) eventList.get(0);
        List<TaskLocationHint> hints = configureEvent.getLocationHint().getTaskLocationHints();
        Preconditions.checkState(hints.size() == eventList.size() - 1);
        if (LOG.isDebugEnabled()) {
            LOG.debug("NumEvents=" + eventList.size() + ", NumSplits=" + result.length);
        }
        LlapCoordinator coordinator = LlapCoordinator.getInstance();
        if (coordinator == null) {
            throw new IOException("LLAP coordinator is not initialized; must be running in HS2 with " + ConfVars.LLAP_HS2_ENABLE_COORDINATOR.varname + " enabled");
        }
        // See the discussion in the implementation as to why we generate app ID.
        ApplicationId applicationId = coordinator.createExtClientAppId();
        // This assumes LLAP cluster owner is always the HS2 user.
        String llapUser = UserGroupInformation.getLoginUser().getShortUserName();
        String queryUser = null;
        byte[] tokenBytes = null;
        LlapSigner signer = null;
        if (UserGroupInformation.isSecurityEnabled()) {
            signer = coordinator.getLlapSigner(job);
            // 1. Generate the token for query user (applies to all splits).
            queryUser = SessionState.getUserFromAuthenticator();
            if (queryUser == null) {
                queryUser = UserGroupInformation.getCurrentUser().getUserName();
                LOG.warn("Cannot determine the session user; using " + queryUser + " instead");
            }
            LlapTokenLocalClient tokenClient = coordinator.getLocalTokenClient(job, llapUser);
            // We put the query user, not LLAP user, into the message and token.
            Token<LlapTokenIdentifier> token = tokenClient.createToken(applicationId.toString(), queryUser, true);
            LOG.info("Created the token for remote user: {}", token);
            bos.reset();
            token.write(dos);
            tokenBytes = bos.toByteArray();
        } else {
            queryUser = UserGroupInformation.getCurrentUser().getUserName();
        }
        LOG.info("Number of splits: " + (eventList.size() - 1));
        SignedMessage signedSvs = null;
        for (int i = 0; i < eventList.size() - 1; i++) {
            TaskSpec taskSpec = new TaskSpecBuilder().constructTaskSpec(dag, vertexName, eventList.size() - 1, applicationId, i);
            // 2. Generate the vertex/submit information for all events.
            if (i == 0) {
                // The queryId could either be picked up from the current request being processed, or
                // generated. The current request isn't exactly correct since the query is 'done' once we
                // return the results. Generating a new one has the added benefit of working once this
                // is moved out of a UDTF into a proper API.
                // Setting this to the generated AppId which is unique.
                // Despite the differences in TaskSpec, the vertex spec should be the same.
                signedSvs = createSignedVertexSpec(signer, taskSpec, applicationId, queryUser, applicationId.toString());
            }
            SubmitWorkInfo submitWorkInfo = new SubmitWorkInfo(applicationId, System.currentTimeMillis(), taskSpec.getVertexParallelism(), signedSvs.message, signedSvs.signature);
            byte[] submitWorkBytes = SubmitWorkInfo.toBytes(submitWorkInfo);
            // 3. Generate input event.
            SignedMessage eventBytes = makeEventBytes(wx, vertexName, eventList.get(i + 1), signer);
            // 4. Make location hints.
            SplitLocationInfo[] locations = makeLocationHints(hints.get(i));
            result[i] = new LlapInputSplit(i, submitWorkBytes, eventBytes.message, eventBytes.signature, locations, schema, llapUser, tokenBytes);
        }
        return result;
    } catch (Exception e) {
        throw new IOException(e);
    }
}

Also used : Vertex(org.apache.tez.dag.api.Vertex) SubmitWorkInfo(org.apache.hadoop.hive.llap.SubmitWorkInfo) LlapTokenIdentifier(org.apache.hadoop.hive.llap.security.LlapTokenIdentifier) SplitLocationInfo(org.apache.hadoop.mapred.SplitLocationInfo) HiveSplitGenerator(org.apache.hadoop.hive.ql.exec.tez.HiveSplitGenerator) TaskSpecBuilder(org.apache.tez.dag.api.TaskSpecBuilder) LlapSigner(org.apache.hadoop.hive.llap.security.LlapSigner) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) LlapTokenLocalClient(org.apache.hadoop.hive.llap.security.LlapTokenLocalClient) DagUtils(org.apache.hadoop.hive.ql.exec.tez.DagUtils) LlapInputSplit(org.apache.hadoop.hive.llap.LlapInputSplit) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf) LlapInputSplit(org.apache.hadoop.hive.llap.LlapInputSplit) InputSplit(org.apache.hadoop.mapred.InputSplit) Context(org.apache.hadoop.hive.ql.Context) Path(org.apache.hadoop.fs.Path) TaskSpec(org.apache.tez.runtime.api.impl.TaskSpec) SignedMessage(org.apache.hadoop.hive.llap.security.LlapSigner.SignedMessage) DAG(org.apache.tez.dag.api.DAG) IOException(java.io.IOException) LlapCoordinator(org.apache.hadoop.hive.llap.coordinator.LlapCoordinator) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) LoginException(javax.security.auth.login.LoginException) URISyntaxException(java.net.URISyntaxException) UDFArgumentLengthException(org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) UDFArgumentException(org.apache.hadoop.hive.ql.exec.UDFArgumentException) UDFArgumentTypeException(org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException) CommandNeedRetryException(org.apache.hadoop.hive.ql.CommandNeedRetryException) IOException(java.io.IOException) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) Event(org.apache.tez.runtime.api.Event) InputConfigureVertexTasksEvent(org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent) InputConfigureVertexTasksEvent(org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId)

Example 17 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project haivvreo by jghoman.

the class AvroGenericRecordReader method getSchema.

/**
   * Attempt to retrieve the reader schema.  Haivvreo has a couple opportunities
   * to provide this, depending on whether or not we're just selecting data
   * or running with a MR job.
   * @return  Reader schema for the Avro object, or null if it has not been provided.
   * @throws HaivvreoException
   */
private Schema getSchema(JobConf job, FileSplit split) throws HaivvreoException, IOException {
    FileSystem fs = split.getPath().getFileSystem(job);
    // Inside of a MR job, we can pull out the actual properties
    if (HaivvreoUtils.insideMRJob(job)) {
        MapWork mapWork = Utilities.getMapWork(job);
        // that matches our input split.
        for (Map.Entry<String, PartitionDesc> pathsAndParts : mapWork.getPathToPartitionInfo().entrySet()) {
            String partitionPath = pathsAndParts.getKey();
            if (pathIsInPartition(split.getPath().makeQualified(fs), partitionPath)) {
                if (LOG.isInfoEnabled())
                    LOG.info("Matching partition " + partitionPath + " with input split " + split);
                Properties props = pathsAndParts.getValue().getProperties();
                if (props.containsKey(HaivvreoUtils.SCHEMA_LITERAL) || props.containsKey(HaivvreoUtils.SCHEMA_URL)) {
                    return HaivvreoUtils.determineSchemaOrThrowException(job, props);
                } else
                    // If it's not in this property, it won't be in any others
                    return null;
            }
        }
        if (LOG.isInfoEnabled())
            LOG.info("Unable to match filesplit " + split + " with a partition.");
    }
    // In "select * from table" situations (non-MR), Haivvreo can add things to the job
    // It's safe to add this to the job since it's not *actually* a mapred job.
    // Here the global state is confined to just this process.
    String s = job.get(AvroSerDe.HAIVVREO_SCHEMA);
    if (s != null) {
        LOG.info("Found the avro schema in the job: " + s);
        return Schema.parse(s);
    }
    // No more places to get the schema from. Give up.  May have to re-encode later.
    return null;
}

Also used : MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FileSystem(org.apache.hadoop.fs.FileSystem) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) Properties(java.util.Properties) Map(java.util.Map)

Example 18 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

the class TezCompiler method setInputFormat.

@Override
protected void setInputFormat(Task<? extends Serializable> task) {
    if (task instanceof TezTask) {
        TezWork work = ((TezTask) task).getWork();
        List<BaseWork> all = work.getAllWork();
        for (BaseWork w : all) {
            if (w instanceof MapWork) {
                MapWork mapWork = (MapWork) w;
                HashMap<String, Operator<? extends OperatorDesc>> opMap = mapWork.getAliasToWork();
                if (!opMap.isEmpty()) {
                    for (Operator<? extends OperatorDesc> op : opMap.values()) {
                        setInputFormat(mapWork, op);
                    }
                }
            }
        }
    } else if (task instanceof ConditionalTask) {
        List<Task<? extends Serializable>> listTasks = ((ConditionalTask) task).getListTasks();
        for (Task<? extends Serializable> tsk : listTasks) {
            setInputFormat(tsk);
        }
    }
    if (task.getChildTasks() != null) {
        for (Task<? extends Serializable> childTask : task.getChildTasks()) {
            setInputFormat(childTask);
        }
    }
}

Also used : CommonMergeJoinOperator(org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) TezDummyStoreOperator(org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) TezTask(org.apache.hadoop.hive.ql.exec.tez.TezTask) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) TezTask(org.apache.hadoop.hive.ql.exec.tez.TezTask) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) List(java.util.List) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) TezWork(org.apache.hadoop.hive.ql.plan.TezWork)

Example 19 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

the class SortMergeJoinTaskDispatcher method processCurrentTask.

@Override
public Task<? extends Serializable> processCurrentTask(MapRedTask currTask, ConditionalTask conditionalTask, Context context) throws SemanticException {
    // whether it contains a sort merge join operator
    MapredWork currWork = currTask.getWork();
    SMBMapJoinOperator originalSMBJoinOp = getSMBMapJoinOp(currWork);
    if (!isEligibleForOptimization(originalSMBJoinOp)) {
        return null;
    }
    currTask.setTaskTag(Task.CONVERTED_SORTMERGEJOIN);
    // Convert the work containing to sort-merge join into a work, as if it had a regular join.
    // Note that the operator tree is not changed - is still contains the SMB join, but the
    // plan is changed (aliasToWork etc.) to contain all the paths as if it was a regular join.
    // This is used to convert the plan to a map-join, and then the original SMB join plan is used
    // as a backup task.
    MapredWork currJoinWork = convertSMBWorkToJoinWork(currWork, originalSMBJoinOp);
    SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(currJoinWork);
    currWork.getMapWork().setLeftInputJoin(originalSMBJoinOp.getConf().isLeftInputJoin());
    currWork.getMapWork().setBaseSrc(originalSMBJoinOp.getConf().getBaseSrc());
    currWork.getMapWork().setMapAliases(originalSMBJoinOp.getConf().getMapAliases());
    currJoinWork.getMapWork().setLeftInputJoin(originalSMBJoinOp.getConf().isLeftInputJoin());
    currJoinWork.getMapWork().setBaseSrc(originalSMBJoinOp.getConf().getBaseSrc());
    currJoinWork.getMapWork().setMapAliases(originalSMBJoinOp.getConf().getMapAliases());
    // create conditional work list and task list
    List<Serializable> listWorks = new ArrayList<Serializable>();
    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
    // create task to aliases mapping and alias to input file mapping for resolver
    // Must be deterministic order map for consistent q-test output across Java versions
    HashMap<Task<? extends Serializable>, Set<String>> taskToAliases = new LinkedHashMap<Task<? extends Serializable>, Set<String>>();
    // Note that pathToAlias will behave as if the original plan was a join plan
    HashMap<Path, ArrayList<String>> pathToAliases = currJoinWork.getMapWork().getPathToAliases();
    // generate a map join task for the big table
    SMBJoinDesc originalSMBJoinDesc = originalSMBJoinOp.getConf();
    Byte[] order = originalSMBJoinDesc.getTagOrder();
    int numAliases = order.length;
    Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(originalSMBJoinDesc.getConds());
    HashMap<String, Long> aliasToSize = new HashMap<String, Long>();
    Configuration conf = context.getConf();
    try {
        long aliasTotalKnownInputSize = getTotalKnownInputSize(context, currJoinWork.getMapWork(), pathToAliases, aliasToSize);
        long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
        for (int bigTablePosition = 0; bigTablePosition < numAliases; bigTablePosition++) {
            // this table cannot be big table
            if (!bigTableCandidates.contains(bigTablePosition)) {
                continue;
            }
            // create map join task for the given big table position
            MapRedTask newTask = convertSMBTaskToMapJoinTask(currJoinWork, bigTablePosition, newSMBJoinOp);
            MapWork mapWork = newTask.getWork().getMapWork();
            Operator<?> parentOp = originalSMBJoinOp.getParentOperators().get(bigTablePosition);
            Set<String> aliases = GenMapRedUtils.findAliases(mapWork, parentOp);
            long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
            if (aliasKnownSize > 0) {
                long smallTblTotalKnownSize = aliasTotalKnownInputSize - aliasKnownSize;
                if (smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) {
                    // this table is not good to be a big table.
                    continue;
                }
            }
            // add into conditional task
            listWorks.add(newTask.getWork());
            listTasks.add(newTask);
            newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
            newTask.setFetchSource(currTask.isFetchSource());
            // set up backup task
            newTask.setBackupTask(currTask);
            newTask.setBackupChildrenTasks(currTask.getChildTasks());
            // put the mapping task to aliases
            taskToAliases.put(newTask, aliases);
        }
    } catch (Exception e) {
        e.printStackTrace();
        throw new SemanticException("Generate Map Join Task Error: ", e);
    }
    // insert current common join task to conditional task
    listWorks.add(currTask.getWork());
    listTasks.add(currTask);
    // clear JoinTree and OP Parse Context
    currWork.getMapWork().setLeftInputJoin(false);
    currWork.getMapWork().setBaseSrc(null);
    currWork.getMapWork().setMapAliases(null);
    // create conditional task and insert conditional task into task tree
    ConditionalWork cndWork = new ConditionalWork(listWorks);
    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
    cndTsk.setListTasks(listTasks);
    // set resolver and resolver context
    cndTsk.setResolver(new ConditionalResolverCommonJoin());
    ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
    resolverCtx.setPathToAliases(pathToAliases);
    resolverCtx.setAliasToKnownSize(aliasToSize);
    resolverCtx.setTaskToAliases(taskToAliases);
    resolverCtx.setCommonJoinTask(currTask);
    resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false));
    resolverCtx.setHdfsTmpDir(context.getMRScratchDir());
    cndTsk.setResolverCtx(resolverCtx);
    // replace the current task with the new generated conditional task
    replaceTaskWithConditionalTask(currTask, cndTsk);
    return cndTsk;
}

Also used : Serializable(java.io.Serializable) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) Set(java.util.Set) SMBJoinDesc(org.apache.hadoop.hive.ql.plan.SMBJoinDesc) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) ConditionalWork(org.apache.hadoop.hive.ql.plan.ConditionalWork) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) LinkedHashMap(java.util.LinkedHashMap) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) ConditionalResolverCommonJoinCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) ConditionalResolverCommonJoin(org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) Path(org.apache.hadoop.fs.Path) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) MapWork(org.apache.hadoop.hive.ql.plan.MapWork)

Example 20 with MapWork

use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

the class SparkSkewJoinProcFactory method splitTask.

/**
 * If the join is not in a leaf ReduceWork, the spark task has to be split into 2 tasks.
 */
private static void splitTask(SparkTask currentTask, ReduceWork reduceWork, ParseContext parseContext) throws SemanticException {
    SparkWork currentWork = currentTask.getWork();
    Set<Operator<?>> reduceSinkSet = OperatorUtils.getOp(reduceWork, ReduceSinkOperator.class);
    if (currentWork.getChildren(reduceWork).size() == 1 && canSplit(currentWork) && reduceSinkSet.size() == 1) {
        ReduceSinkOperator reduceSink = (ReduceSinkOperator) reduceSinkSet.iterator().next();
        BaseWork childWork = currentWork.getChildren(reduceWork).get(0);
        SparkEdgeProperty originEdge = currentWork.getEdgeProperty(reduceWork, childWork);
        // disconnect the reduce work from its child. this should produce two isolated sub graphs
        currentWork.disconnect(reduceWork, childWork);
        // move works following the current reduce work into a new spark work
        SparkWork newWork = new SparkWork(parseContext.getConf().getVar(HiveConf.ConfVars.HIVEQUERYID));
        newWork.add(childWork);
        copyWorkGraph(currentWork, newWork, childWork);
        // remove them from current spark work
        for (BaseWork baseWork : newWork.getAllWorkUnsorted()) {
            currentWork.remove(baseWork);
            currentWork.getCloneToWork().remove(baseWork);
        }
        // create TS to read intermediate data
        Context baseCtx = parseContext.getContext();
        Path taskTmpDir = baseCtx.getMRTmpPath();
        Operator<? extends OperatorDesc> rsParent = reduceSink.getParentOperators().get(0);
        TableDesc tableDesc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(rsParent.getSchema(), "temporarycol"));
        // this will insert FS and TS between the RS and its parent
        TableScanOperator tableScanOp = GenMapRedUtils.createTemporaryFile(rsParent, reduceSink, taskTmpDir, tableDesc, parseContext);
        // create new MapWork
        MapWork mapWork = PlanUtils.getMapRedWork().getMapWork();
        mapWork.setName("Map " + GenSparkUtils.getUtils().getNextSeqNumber());
        newWork.add(mapWork);
        newWork.connect(mapWork, childWork, originEdge);
        // setup the new map work
        String streamDesc = taskTmpDir.toUri().toString();
        if (GenMapRedUtils.needsTagging((ReduceWork) childWork)) {
            Operator<? extends OperatorDesc> childReducer = ((ReduceWork) childWork).getReducer();
            String id = null;
            if (childReducer instanceof JoinOperator) {
                if (parseContext.getJoinOps().contains(childReducer)) {
                    id = ((JoinOperator) childReducer).getConf().getId();
                }
            } else if (childReducer instanceof MapJoinOperator) {
                if (parseContext.getMapJoinOps().contains(childReducer)) {
                    id = ((MapJoinOperator) childReducer).getConf().getId();
                }
            } else if (childReducer instanceof SMBMapJoinOperator) {
                if (parseContext.getSmbMapJoinOps().contains(childReducer)) {
                    id = ((SMBMapJoinOperator) childReducer).getConf().getId();
                }
            }
            if (id != null) {
                streamDesc = id + ":$INTNAME";
            } else {
                streamDesc = "$INTNAME";
            }
            String origStreamDesc = streamDesc;
            int pos = 0;
            while (mapWork.getAliasToWork().get(streamDesc) != null) {
                streamDesc = origStreamDesc.concat(String.valueOf(++pos));
            }
        }
        GenMapRedUtils.setTaskPlan(taskTmpDir, streamDesc, tableScanOp, mapWork, false, tableDesc);
        // insert the new task between current task and its child
        @SuppressWarnings("unchecked") Task<? extends Serializable> newTask = TaskFactory.get(newWork);
        List<Task<? extends Serializable>> childTasks = currentTask.getChildTasks();
        // must have at most one child
        if (childTasks != null && childTasks.size() > 0) {
            Task<? extends Serializable> childTask = childTasks.get(0);
            currentTask.removeDependentTask(childTask);
            newTask.addDependentTask(childTask);
        }
        currentTask.addDependentTask(newTask);
        newTask.setFetchSource(currentTask.isFetchSource());
    }
}

Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) CommonJoinOperator(org.apache.hadoop.hive.ql.exec.CommonJoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Context(org.apache.hadoop.hive.ql.Context) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Path(org.apache.hadoop.fs.Path) CommonJoinOperator(org.apache.hadoop.hive.ql.exec.CommonJoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) SparkEdgeProperty(org.apache.hadoop.hive.ql.plan.SparkEdgeProperty) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)

Aggregations

MapWork (org.apache.hadoop.hive.ql.plan.MapWork)79 ArrayList (java.util.ArrayList)25 Path (org.apache.hadoop.fs.Path)24 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)23 Operator (org.apache.hadoop.hive.ql.exec.Operator)21 OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)17 TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)16 JobConf (org.apache.hadoop.mapred.JobConf)15 Test (org.junit.Test)15 BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork)14 ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork)14 MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork)13 Serializable (java.io.Serializable)12 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)12 Task (org.apache.hadoop.hive.ql.exec.Task)12 PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc)12 Context (org.apache.hadoop.hive.ql.Context)11 LinkedHashMap (java.util.LinkedHashMap)10 FileSystem (org.apache.hadoop.fs.FileSystem)10 ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask)10