
Example 56 with MapWork

Use of org.apache.hadoop.hive.ql.plan.MapWork in the Apache Hive project.

From the class SparkCompiler, method setInputFormat.

@Override
protected void setInputFormat(Task<? extends Serializable> task) {
    if (task instanceof SparkTask) {
        SparkWork work = ((SparkTask) task).getWork();
        List<BaseWork> all = work.getAllWork();
        for (BaseWork w : all) {
            if (w instanceof MapWork) {
                MapWork mapWork = (MapWork) w;
                HashMap<String, Operator<? extends OperatorDesc>> opMap = mapWork.getAliasToWork();
                if (!opMap.isEmpty()) {
                    for (Operator<? extends OperatorDesc> op : opMap.values()) {
                        setInputFormat(mapWork, op);
                    }
                }
            }
        }
    } else if (task instanceof ConditionalTask) {
        List<Task<? extends Serializable>> listTasks = ((ConditionalTask) task).getListTasks();
        for (Task<? extends Serializable> tsk : listTasks) {
            setInputFormat(tsk);
        }
    }
    if (task.getChildTasks() != null) {
        for (Task<? extends Serializable> childTask : task.getChildTasks()) {
            setInputFormat(childTask);
        }
    }
}
Also used: ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator), MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator), UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator), FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator), FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator), SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator), JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator), TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), Operator (org.apache.hadoop.hive.ql.exec.Operator), DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator), SparkTask (org.apache.hadoop.hive.ql.exec.spark.SparkTask), ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask), Task (org.apache.hadoop.hive.ql.exec.Task), Serializable (java.io.Serializable), SparkWork (org.apache.hadoop.hive.ql.plan.SparkWork), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), List (java.util.List), ArrayList (java.util.ArrayList), BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)
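The traversal above ultimately works through MapWork.getAliasToWork(), which maps each table alias (or dummy path alias) to the root operator of that alias's operator tree. As a minimal sketch of consuming that map outside the compiler, the walker below simply prints each alias and its operator tree; the class and its methods are illustrative, only getAliasToWork() comes from the snippet, and getName()/getChildOperators() are standard Operator accessors.

import java.util.Map;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;

// Illustrative only: dump the operator trees registered in a MapWork.
public class MapWorkWalker {

    static void walk(MapWork mapWork) {
        // Each entry maps a table alias (or dummy path alias) to the root operator of its tree.
        for (Map.Entry<String, Operator<? extends OperatorDesc>> e : mapWork.getAliasToWork().entrySet()) {
            System.out.println("alias: " + e.getKey());
            printTree(e.getValue(), 1);
        }
    }

    static void printTree(Operator<? extends OperatorDesc> op, int depth) {
        StringBuilder indent = new StringBuilder();
        for (int i = 0; i < depth; i++) {
            indent.append("  ");
        }
        System.out.println(indent + op.getName());
        if (op.getChildOperators() != null) {
            for (Operator<? extends OperatorDesc> child : op.getChildOperators()) {
                printTree(child, depth + 1);
            }
        }
    }
}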

Example 57 with MapWork

Use of org.apache.hadoop.hive.ql.plan.MapWork in the Apache Hive project.

From the class GenMapRedUtils, method createMRWorkForMergingFiles.

/**
 * Create a MapWork based on the input path, the top operator and the input
 * table descriptor.
 *
 * @param conf
 *          the current Hive configuration.
 * @param topOp
 *          the table scan operator that is the root of the merge task.
 * @param fsDesc
 *          the file sink descriptor that serves as the input to this merge task.
 * @return the MapWork for the merge task.
 */
private static MapWork createMRWorkForMergingFiles(HiveConf conf, TableScanOperator topOp, FileSinkDesc fsDesc) {
    ArrayList<String> aliases = new ArrayList<String>();
    Path inputDir = StringInternUtils.internUriStringsInPath(fsDesc.getMergeInputDirName());
    String inputDirStr = inputDir.toString().intern();
    TableDesc tblDesc = fsDesc.getTableInfo();
    // dummy alias: just use the input path
    aliases.add(inputDirStr);
    // constructing the default MapredWork
    MapredWork cMrPlan = GenMapRedUtils.getMapRedWorkFromConf(conf);
    MapWork cplan = cMrPlan.getMapWork();
    cplan.addPathToAlias(inputDir, aliases);
    cplan.addPathToPartitionInfo(inputDir, new PartitionDesc(tblDesc, null));
    cplan.getAliasToWork().put(inputDirStr, topOp);
    cplan.setMapperCannotSpanPartns(true);
    return cplan;
}
Also used: Path (org.apache.hadoop.fs.Path), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), ArrayList (java.util.ArrayList), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)
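The method above shows the complete wiring a merge-only MapWork needs: a dummy alias (the input path itself), a pathToAlias entry, a pathToPartitionInfo entry, the top operator registered in aliasToWork, and mapperCannotSpanPartns set so a mapper never crosses input directories. A hedged sketch of the same wiring against an already constructed MapWork follows; the helper name and parameters are illustrative, while the MapWork and PartitionDesc calls are the ones used in the snippet.

import java.util.ArrayList;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;

// Illustrative only: register one input directory and its operator tree in a MapWork.
public class MergeWorkWiring {

    static void wireSinglePath(MapWork work, Path inputDir, TableDesc tblDesc, TableScanOperator topOp) {
        // dummy alias: just the input path string
        String alias = inputDir.toString();
        ArrayList<String> aliases = new ArrayList<String>();
        aliases.add(alias);
        // path -> aliases that read it
        work.addPathToAlias(inputDir, aliases);
        // path -> partition descriptor (no partition spec for a plain directory)
        work.addPathToPartitionInfo(inputDir, new PartitionDesc(tblDesc, null));
        // alias -> root operator of the operator tree that processes it
        work.getAliasToWork().put(alias, topOp);
        // keep each mapper within a single input directory
        work.setMapperCannotSpanPartns(true);
    }
}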

Example 58 with MapWork

Use of org.apache.hadoop.hive.ql.plan.MapWork in the Apache Hive project.

From the class GenMapRedUtils, method setUnionPlan.

private static void setUnionPlan(GenMRProcContext opProcCtx, boolean local, Task<? extends Serializable> currTask, GenMRUnionCtx uCtx, boolean mergeTask) throws SemanticException {
    TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
    if (currTopOp != null) {
        String currAliasId = opProcCtx.getCurrAliasId();
        if (mergeTask || !opProcCtx.isSeenOp(currTask, currTopOp)) {
            setTaskPlan(currAliasId, currTopOp, currTask, local, opProcCtx);
        }
        currTopOp = null;
        opProcCtx.setCurrTopOp(currTopOp);
    } else {
        List<String> taskTmpDirLst = uCtx.getTaskTmpDir();
        if ((taskTmpDirLst != null) && !(taskTmpDirLst.isEmpty())) {
            List<TableDesc> tt_descLst = uCtx.getTTDesc();
            assert !taskTmpDirLst.isEmpty() && !tt_descLst.isEmpty();
            assert taskTmpDirLst.size() == tt_descLst.size();
            int size = taskTmpDirLst.size();
            assert local == false;
            List<TableScanOperator> topOperators = uCtx.getListTopOperators();
            MapredWork plan = (MapredWork) currTask.getWork();
            for (int pos = 0; pos < size; pos++) {
                String taskTmpDir = taskTmpDirLst.get(pos);
                Path taskTmpDirPath = new Path(taskTmpDir);
                MapWork mWork = plan.getMapWork();
                if (!mWork.getPathToAliases().containsKey(taskTmpDirPath)) {
                    taskTmpDir = taskTmpDir.intern();
                    StringInternUtils.internUriStringsInPath(taskTmpDirPath);
                    TableDesc tt_desc = tt_descLst.get(pos);
                    mWork.addPathToAlias(taskTmpDirPath, taskTmpDir);
                    mWork.addPathToPartitionInfo(taskTmpDirPath, new PartitionDesc(tt_desc, null));
                    mWork.getAliasToWork().put(taskTmpDir, topOperators.get(pos));
                }
            }
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)

Example 59 with MapWork

Use of org.apache.hadoop.hive.ql.plan.MapWork in the Apache Hive project.

From the class SparkPlanGenerator, method generateParentTran.

// Generate (possibly get from a cached result) parent SparkTran
private SparkTran generateParentTran(SparkPlan sparkPlan, SparkWork sparkWork, BaseWork work) throws Exception {
    if (cloneToWork.containsKey(work)) {
        BaseWork originalWork = cloneToWork.get(work);
        if (workToParentWorkTranMap.containsKey(originalWork)) {
            return workToParentWorkTranMap.get(originalWork);
        }
    }
    SparkTran result;
    if (work instanceof MapWork) {
        result = generateMapInput(sparkPlan, (MapWork) work);
        sparkPlan.addTran(result);
    } else if (work instanceof ReduceWork) {
        boolean toCache = cloneToWork.containsKey(work);
        List<BaseWork> parentWorks = sparkWork.getParents(work);
        SparkEdgeProperty sparkEdgeProperty = sparkWork.getEdgeProperty(parentWorks.get(0), work);
        result = generate(sparkPlan, sparkEdgeProperty, toCache, work.getName());
        sparkPlan.addTran(result);
        for (BaseWork parentWork : parentWorks) {
            sparkPlan.connect(workToTranMap.get(parentWork), result);
        }
    } else {
        throw new IllegalStateException("AssertionError: expected either MapWork or ReduceWork, " + "but found " + work.getClass().getName());
    }
    if (cloneToWork.containsKey(work)) {
        workToParentWorkTranMap.put(cloneToWork.get(work), result);
    }
    return result;
}
Also used: MapWork (org.apache.hadoop.hive.ql.plan.MapWork), SparkEdgeProperty (org.apache.hadoop.hive.ql.plan.SparkEdgeProperty), List (java.util.List), ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork), BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork)

Example 60 with MapWork

Use of org.apache.hadoop.hive.ql.plan.MapWork in the Apache Hive project.

From the class DagUtils, method createVertex.

private Vertex createVertex(JobConf conf, MergeJoinWork mergeJoinWork, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType, Map<String, LocalResource> localResources) throws Exception {
    Utilities.setMergeWork(conf, mergeJoinWork, mrScratchDir, false);
    if (mergeJoinWork.getMainWork() instanceof MapWork) {
        List<BaseWork> mapWorkList = mergeJoinWork.getBaseWorkList();
        MapWork mapWork = (MapWork) (mergeJoinWork.getMainWork());
        Vertex mergeVx = createVertex(conf, mapWork, fs, mrScratchDir, ctx, vertexType, localResources);
        conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
        // mapreduce.tez.input.initializer.serialize.event.payload should be set
        // to false when using this plug-in to avoid getting a serialized event at run-time.
        conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
        for (int i = 0; i < mapWorkList.size(); i++) {
            mapWork = (MapWork) (mapWorkList.get(i));
            conf.set(TEZ_MERGE_CURRENT_MERGE_FILE_PREFIX, mapWork.getName());
            conf.set(Utilities.INPUT_NAME, mapWork.getName());
            LOG.info("Going through each work and adding MultiMRInput");
            mergeVx.addDataSource(mapWork.getName(), MultiMRInput.createConfigBuilder(conf, HiveInputFormat.class).build());
        }
        // To be populated for SMB joins only for all the small tables
        Map<String, Integer> inputToBucketMap = new HashMap<>();
        if (mergeJoinWork.getMergeJoinOperator().getParentOperators().size() == 1 && mergeJoinWork.getMergeJoinOperator().getOpTraits() != null) {
            // This is an SMB join.
            for (BaseWork work : mapWorkList) {
                MapWork mw = (MapWork) work;
                Map<String, Operator<?>> aliasToWork = mw.getAliasToWork();
                Preconditions.checkState(aliasToWork.size() == 1, "More than 1 alias in SMB mapwork");
                inputToBucketMap.put(mw.getName(), mw.getWorks().get(0).getOpTraits().getNumBuckets());
            }
        }
        VertexManagerPluginDescriptor desc = VertexManagerPluginDescriptor.create(CustomPartitionVertex.class.getName());
        // the +1 to the size is because of the main work.
        CustomVertexConfiguration vertexConf = new CustomVertexConfiguration(mergeJoinWork.getMergeJoinOperator().getConf().getNumBuckets(), vertexType, mergeJoinWork.getBigTableAlias(), mapWorkList.size() + 1, inputToBucketMap);
        DataOutputBuffer dob = new DataOutputBuffer();
        vertexConf.write(dob);
        byte[] userPayload = dob.getData();
        desc.setUserPayload(UserPayload.create(ByteBuffer.wrap(userPayload)));
        mergeVx.setVertexManagerPlugin(desc);
        return mergeVx;
    } else {
        return createVertex(conf, (ReduceWork) mergeJoinWork.getMainWork(), fs, mrScratchDir, ctx, localResources);
    }
}
Also used: Operator (org.apache.hadoop.hive.ql.exec.Operator), Vertex (org.apache.tez.dag.api.Vertex), PreWarmVertex (org.apache.tez.dag.api.PreWarmVertex), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), HashMap (java.util.HashMap), VertexManagerPluginDescriptor (org.apache.tez.dag.api.VertexManagerPluginDescriptor), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), DataOutputBuffer (org.apache.hadoop.io.DataOutputBuffer), BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork)
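A reusable detail in the vertex setup above is how the CustomVertexConfiguration reaches the VertexManagerPlugin: the configuration is written into a DataOutputBuffer and its bytes are wrapped as a Tez UserPayload. A self-contained sketch of that step is below; the toUserPayload helper is illustrative and not part of Hive, only DataOutputBuffer and UserPayload.create come from the snippet.

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;
import org.apache.tez.dag.api.UserPayload;

// Illustrative only: serialize any Hadoop Writable into a Tez UserPayload.
public class PayloadSketch {

    static UserPayload toUserPayload(Writable w) throws IOException {
        DataOutputBuffer dob = new DataOutputBuffer();
        w.write(dob);
        // getData() returns the whole backing array; bounding by getLength() keeps only the bytes written.
        return UserPayload.create(ByteBuffer.wrap(dob.getData(), 0, dob.getLength()));
    }
}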

Aggregations

MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 79 uses
ArrayList (java.util.ArrayList): 25 uses
Path (org.apache.hadoop.fs.Path): 24 uses
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 23 uses
Operator (org.apache.hadoop.hive.ql.exec.Operator): 21 uses
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 17 uses
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 16 uses
JobConf (org.apache.hadoop.mapred.JobConf): 15 uses
Test (org.junit.Test): 15 uses
BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork): 14 uses
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 14 uses
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 13 uses
Serializable (java.io.Serializable): 12 uses
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 12 uses
Task (org.apache.hadoop.hive.ql.exec.Task): 12 uses
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 12 uses
Context (org.apache.hadoop.hive.ql.Context): 11 uses
LinkedHashMap (java.util.LinkedHashMap): 10 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 10 uses
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask): 10 uses