
Example 41 with BaseWork

Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.

From the class VectorPTFOperator, method initializeOp:

@Override
protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    if (LOG.isDebugEnabled()) {
        // Determine the name of our map or reduce task for debug tracing.
        BaseWork work = Utilities.getMapWork(hconf);
        if (work == null) {
            work = Utilities.getReduceWork(hconf);
        }
        taskName = work.getName();
    }
    if (!isPartitionOrderBy) {
        currentPartitionIsNull = null;
        currentPartitionLongs = null;
        currentPartitionDoubles = null;
        currentPartitionByteArrays = null;
        currentPartitionByteLengths = null;
        currentPartitionDecimals = null;
        currentPartitionTimestamps = null;
        currentPartitionIntervalDayTimes = null;
    } else {
        final int partitionKeyCount = vectorDesc.getPartitionExprNodeDescs().length;
        currentPartitionIsNull = new boolean[partitionKeyCount];
        currentPartitionLongs = new long[partitionKeyCount];
        currentPartitionDoubles = new double[partitionKeyCount];
        currentPartitionByteArrays = new byte[partitionKeyCount][];
        currentPartitionByteLengths = new int[partitionKeyCount];
        currentPartitionDecimals = new HiveDecimalWritable[partitionKeyCount];
        currentPartitionTimestamps = new Timestamp[partitionKeyCount];
        currentPartitionIntervalDayTimes = new HiveIntervalDayTime[partitionKeyCount];
    }
    evaluators = VectorPTFDesc.getEvaluators(vectorDesc, vectorPTFInfo);
    streamingEvaluatorNums = VectorPTFDesc.getStreamingEvaluatorNums(evaluators);
    allEvaluatorsAreStreaming = (streamingEvaluatorNums.length == evaluatorCount);
    /*
     * Setup the overflow batch.
     */
    overflowBatch = setupOverflowBatch();
    groupBatches = new VectorPTFGroupBatches(hconf, vectorDesc.getVectorizedPTFMaxMemoryBufferingBatchCount());
    groupBatches.init(reducerBatchTypeInfos, evaluators, outputProjectionColumnMap, outputTypeInfos, keyInputColumnMap, nonKeyInputColumnMap, streamingEvaluatorNums, overflowBatch);
    isFirstPartition = true;
    batchCounter = 0;
}
Also used: BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork)
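
The debug block above resolves the enclosing vertex's BaseWork only to obtain a task name for tracing; Utilities keeps the deserialized plan in a per-task work map (see Utilities.clearWorkMap in Example 45), so getMapWork/getReduceWork is a cheap lookup rather than a deserialization. Below is a minimal sketch of the same lookup, assuming hive-exec on the classpath. TaskNameResolver and the "<unknown>" fallback are illustrative, not Hive code; unlike the snippet above, the sketch also guards the case where neither a MapWork nor a ReduceWork is registered.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.BaseWork;

public final class TaskNameResolver {
    private TaskNameResolver() {}

    // Resolve the current vertex's BaseWork, falling back from map work to
    // reduce work, and return its name; "<unknown>" is a hypothetical fallback.
    public static String resolveTaskName(Configuration hconf) {
        BaseWork work = Utilities.getMapWork(hconf);
        if (work == null) {
            work = Utilities.getReduceWork(hconf);
        }
        return work != null ? work.getName() : "<unknown>";
    }
}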

Example 42 with BaseWork

Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.

From the class SparkPlanGenerator, method generateParentTran:

// Generate (possibly get from a cached result) parent SparkTran
private SparkTran generateParentTran(SparkPlan sparkPlan, SparkWork sparkWork, BaseWork work) throws Exception {
    if (cloneToWork.containsKey(work)) {
        BaseWork originalWork = cloneToWork.get(work);
        if (workToParentWorkTranMap.containsKey(originalWork)) {
            return workToParentWorkTranMap.get(originalWork);
        }
    }
    SparkTran result;
    if (work instanceof MapWork) {
        result = generateMapInput(sparkPlan, (MapWork) work);
        sparkPlan.addTran(result);
    } else if (work instanceof ReduceWork) {
        boolean toCache = cloneToWork.containsKey(work);
        List<BaseWork> parentWorks = sparkWork.getParents(work);
        SparkEdgeProperty sparkEdgeProperty = sparkWork.getEdgeProperty(parentWorks.get(0), work);
        result = generate(sparkPlan, sparkEdgeProperty, toCache, work.getName());
        sparkPlan.addTran(result);
        for (BaseWork parentWork : parentWorks) {
            sparkPlan.connect(workToTranMap.get(parentWork), result);
        }
    } else {
        throw new IllegalStateException("AssertionError: expected either MapWork or ReduceWork, " + "but found " + work.getClass().getName());
    }
    if (cloneToWork.containsKey(work)) {
        workToParentWorkTranMap.put(cloneToWork.get(work), result);
    }
    return result;
}
Also used: MapWork (org.apache.hadoop.hive.ql.plan.MapWork), SparkEdgeProperty (org.apache.hadoop.hive.ql.plan.SparkEdgeProperty), List (java.util.List), ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork), BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork)
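
The first and last blocks of generateParentTran form a clone-aware cache: the SparkTran built for a cloned work is stored under the original work, so the clone and its source resolve to the same parent transformation. A generic sketch of that memoization pattern follows; CloneAwareCache is hypothetical and not part of Hive.

import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;

public final class CloneAwareCache<W, T> {
    private final Map<W, W> cloneToOriginal = new HashMap<>();  // clone -> original work
    private final Map<W, T> valueByOriginal = new HashMap<>();  // original work -> result

    public void registerClone(W clone, W original) {
        cloneToOriginal.put(clone, original);
    }

    // Compute the value at most once per ORIGINAL work; a clone and its
    // source therefore share the same cached result.
    public T computeIfAbsent(W work, Function<W, T> generator) {
        W key = cloneToOriginal.getOrDefault(work, work);
        return valueByOriginal.computeIfAbsent(key, k -> generator.apply(work));
    }
}

For instance, after registerClone(cloneWork, originalWork), both computeIfAbsent(cloneWork, gen) and computeIfAbsent(originalWork, gen) return the same instance, with gen invoked only once.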

Example 43 with BaseWork

Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.

From the class SparkTask, method getOperatorCounters:

private Map<String, List<String>> getOperatorCounters() {
    String groupName = HiveConf.getVar(conf, HiveConf.ConfVars.HIVECOUNTERGROUP);
    Map<String, List<String>> counters = new HashMap<String, List<String>>();
    List<String> hiveCounters = new LinkedList<String>();
    counters.put(groupName, hiveCounters);
    hiveCounters.add(Operator.HIVE_COUNTER_CREATED_FILES);
    // MapOperator lives outside of SparkWork; SparkMapRecordHandler uses it to
    // bridge Spark transformations and the Hive operators in SparkWork, so its
    // counters are registered here as well.
    for (MapOperator.Counter counter : MapOperator.Counter.values()) {
        hiveCounters.add(counter.toString());
    }
    SparkWork sparkWork = this.getWork();
    for (BaseWork work : sparkWork.getAllWork()) {
        for (Operator<? extends OperatorDesc> operator : work.getAllOperators()) {
            if (operator instanceof FileSinkOperator) {
                for (FileSinkOperator.Counter counter : FileSinkOperator.Counter.values()) {
                    hiveCounters.add(((FileSinkOperator) operator).getCounterName(counter));
                }
            } else if (operator instanceof ReduceSinkOperator) {
                final String contextName = conf.get(Operator.CONTEXT_NAME_KEY, "");
                for (ReduceSinkOperator.Counter counter : ReduceSinkOperator.Counter.values()) {
                    hiveCounters.add(Utilities.getVertexCounterName(counter.name(), contextName));
                }
            } else if (operator instanceof ScriptOperator) {
                for (ScriptOperator.Counter counter : ScriptOperator.Counter.values()) {
                    hiveCounters.add(counter.toString());
                }
            } else if (operator instanceof JoinOperator) {
                for (JoinOperator.SkewkeyTableCounter counter : JoinOperator.SkewkeyTableCounter.values()) {
                    hiveCounters.add(counter.toString());
                }
            }
        }
    }
    return counters;
}
Also used: JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator), FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator), HashMap (java.util.HashMap), ScriptOperator (org.apache.hadoop.hive.ql.exec.ScriptOperator), SparkWork (org.apache.hadoop.hive.ql.plan.SparkWork), LinkedList (java.util.LinkedList), MapOperator (org.apache.hadoop.hive.ql.exec.MapOperator), ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator), List (java.util.List), ArrayList (java.util.ArrayList), BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork)
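
The nested loop is the standard way to visit every operator in a Spark plan: SparkWork.getAllWork() yields each BaseWork, and BaseWork.getAllOperators() yields the operators inside it. A hedged sketch of the same traversal, collecting operator identifiers instead of counter names (OperatorNames is illustrative, not Hive code):

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.SparkWork;

public final class OperatorNames {
    // Visit every operator in every BaseWork of the plan and record
    // "workName:operatorId" for each one.
    public static List<String> collectOperatorIds(SparkWork sparkWork) {
        List<String> ids = new ArrayList<>();
        for (BaseWork work : sparkWork.getAllWork()) {
            for (Operator<? extends OperatorDesc> op : work.getAllOperators()) {
                ids.add(work.getName() + ":" + op.getOperatorId());
            }
        }
        return ids;
    }
}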

Example 44 with BaseWork

Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.

From the class DagUtils, method createVertex:

private Vertex createVertex(JobConf conf, MergeJoinWork mergeJoinWork, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType, Map<String, LocalResource> localResources) throws Exception {
    Utilities.setMergeWork(conf, mergeJoinWork, mrScratchDir, false);
    if (mergeJoinWork.getMainWork() instanceof MapWork) {
        List<BaseWork> mapWorkList = mergeJoinWork.getBaseWorkList();
        MapWork mapWork = (MapWork) (mergeJoinWork.getMainWork());
        Vertex mergeVx = createVertex(conf, mapWork, fs, mrScratchDir, ctx, vertexType, localResources);
        conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
        // mapreduce.tez.input.initializer.serialize.event.payload should be set
        // to false when using this plug-in to avoid getting a serialized event at run-time.
        conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
        for (int i = 0; i < mapWorkList.size(); i++) {
            mapWork = (MapWork) (mapWorkList.get(i));
            conf.set(TEZ_MERGE_CURRENT_MERGE_FILE_PREFIX, mapWork.getName());
            conf.set(Utilities.INPUT_NAME, mapWork.getName());
            LOG.info("Going through each work and adding MultiMRInput");
            mergeVx.addDataSource(mapWork.getName(), MultiMRInput.createConfigBuilder(conf, HiveInputFormat.class).build());
        }
        // To be populated for SMB joins only for all the small tables
        Map<String, Integer> inputToBucketMap = new HashMap<>();
        if (mergeJoinWork.getMergeJoinOperator().getParentOperators().size() == 1 && mergeJoinWork.getMergeJoinOperator().getOpTraits() != null) {
            // This is an SMB join.
            for (BaseWork work : mapWorkList) {
                MapWork mw = (MapWork) work;
                Map<String, Operator<?>> aliasToWork = mw.getAliasToWork();
                Preconditions.checkState(aliasToWork.size() == 1, "More than 1 alias in SMB mapwork");
                inputToBucketMap.put(mw.getName(), mw.getWorks().get(0).getOpTraits().getNumBuckets());
            }
        }
        VertexManagerPluginDescriptor desc = VertexManagerPluginDescriptor.create(CustomPartitionVertex.class.getName());
        // the +1 to the size is because of the main work.
        CustomVertexConfiguration vertexConf = new CustomVertexConfiguration(mergeJoinWork.getMergeJoinOperator().getConf().getNumBuckets(), vertexType, mergeJoinWork.getBigTableAlias(), mapWorkList.size() + 1, inputToBucketMap);
        DataOutputBuffer dob = new DataOutputBuffer();
        vertexConf.write(dob);
        byte[] userPayload = dob.getData();
        desc.setUserPayload(UserPayload.create(ByteBuffer.wrap(userPayload)));
        mergeVx.setVertexManagerPlugin(desc);
        return mergeVx;
    } else {
        return createVertex(conf, (ReduceWork) mergeJoinWork.getMainWork(), fs, mrScratchDir, ctx, localResources);
    }
}
Also used: Operator (org.apache.hadoop.hive.ql.exec.Operator), Vertex (org.apache.tez.dag.api.Vertex), PreWarmVertex (org.apache.tez.dag.api.PreWarmVertex), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), HashMap (java.util.HashMap), VertexManagerPluginDescriptor (org.apache.tez.dag.api.VertexManagerPluginDescriptor), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), DataOutputBuffer (org.apache.hadoop.io.DataOutputBuffer), BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork)
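
The vertex manager payload here is plain Writable serialization: CustomVertexConfiguration writes itself into a DataOutputBuffer whose bytes are wrapped into a Tez UserPayload. A hedged sketch of that plumbing for an arbitrary Writable follows; Payloads is illustrative, and unlike the snippet above it wraps only the first getLength() bytes, since getData() exposes the whole backing array.

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;
import org.apache.tez.dag.api.UserPayload;

public final class Payloads {
    // Serialize any Writable into a Tez UserPayload. getData() returns the
    // whole backing array, so wrap only the first getLength() valid bytes.
    public static UserPayload toUserPayload(Writable conf) throws IOException {
        DataOutputBuffer dob = new DataOutputBuffer();
        conf.write(dob);
        return UserPayload.create(ByteBuffer.wrap(dob.getData(), 0, dob.getLength()));
    }
}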

Example 45 with BaseWork

Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.

From the class TezTask, method build:

DAG build(JobConf conf, TezWork work, Path scratchDir, Context ctx, Map<String, LocalResource> vertexResources) throws Exception {
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_BUILD_DAG);
    // getAllWork returns a topologically sorted list, which we use to make
    // sure that vertices are created before they are used in edges.
    List<BaseWork> ws = work.getAllWork();
    Collections.reverse(ws);
    FileSystem fs = scratchDir.getFileSystem(conf);
    // the name of the dag is what is displayed in the AM/Job UI
    String dagName = utils.createDagName(conf, queryPlan);
    LOG.info("Dag name: " + dagName);
    DAG dag = DAG.create(dagName);
    // set some info for the query
    JSONObject json = new JSONObject(new LinkedHashMap<>()).put("context", "Hive").put("description", ctx.getCmd());
    String dagInfo = json.toString();
    if (LOG.isDebugEnabled()) {
        LOG.debug("DagInfo: " + dagInfo);
    }
    dag.setDAGInfo(dagInfo);
    dag.setCredentials(conf.getCredentials());
    setAccessControlsForCurrentUser(dag, queryPlan.getQueryId(), conf);
    for (BaseWork w : ws) {
        boolean isFinal = work.getLeaves().contains(w);
        // translate work to vertex
        perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_CREATE_VERTEX + w.getName());
        if (w instanceof UnionWork) {
            // Special case for unions. These items translate to VertexGroups
            List<BaseWork> unionWorkItems = new LinkedList<BaseWork>();
            List<BaseWork> children = new LinkedList<BaseWork>();
            // proper children of the union
            for (BaseWork v : work.getChildren(w)) {
                EdgeType type = work.getEdgeProperty(w, v).getEdgeType();
                if (type == EdgeType.CONTAINS) {
                    unionWorkItems.add(v);
                } else {
                    children.add(v);
                }
            }
            JobConf parentConf = workToConf.get(unionWorkItems.get(0));
            checkOutputSpec(w, parentConf);
            // create VertexGroup
            Vertex[] vertexArray = new Vertex[unionWorkItems.size()];
            int i = 0;
            for (BaseWork v : unionWorkItems) {
                vertexArray[i++] = workToVertex.get(v);
            }
            VertexGroup group = dag.createVertexGroup(w.getName(), vertexArray);
            // now hook up the children
            for (BaseWork v : children) {
                // finally we can create the grouped edge
                GroupInputEdge e = utils.createEdge(group, parentConf, workToVertex.get(v), work.getEdgeProperty(w, v), v, work);
                dag.addEdge(e);
            }
        } else {
            // Regular vertices
            JobConf wxConf = utils.initializeVertexConf(conf, ctx, w);
            checkOutputSpec(w, wxConf);
            Vertex wx = utils.createVertex(wxConf, w, scratchDir, fs, ctx, !isFinal, work, work.getVertexType(w), vertexResources);
            if (w.getReservedMemoryMB() > 0) {
                // If reservedMemoryMB is set, adjust the memory allocation fraction as needed
                double frac = DagUtils.adjustMemoryReserveFraction(w.getReservedMemoryMB(), super.conf);
                LOG.info("Setting " + TEZ_MEMORY_RESERVE_FRACTION + " to " + frac);
                wx.setConf(TEZ_MEMORY_RESERVE_FRACTION, Double.toString(frac));
            }
            // Otherwise just leave it up to Tez to decide how much memory to allocate
            dag.addVertex(wx);
            utils.addCredentials(w, dag);
            perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_CREATE_VERTEX + w.getName());
            workToVertex.put(w, wx);
            workToConf.put(w, wxConf);
            // add all dependencies (i.e.: edges) to the graph
            for (BaseWork v : work.getChildren(w)) {
                assert workToVertex.containsKey(v);
                Edge e = null;
                TezEdgeProperty edgeProp = work.getEdgeProperty(w, v);
                e = utils.createEdge(wxConf, wx, workToVertex.get(v), edgeProp, v, work);
                dag.addEdge(e);
            }
        }
    }
    // Clear the work map after build. TODO: remove caching instead?
    Utilities.clearWorkMap(conf);
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_BUILD_DAG);
    return dag;
}
Also used: Vertex (org.apache.tez.dag.api.Vertex), TezEdgeProperty (org.apache.hadoop.hive.ql.plan.TezEdgeProperty), UnionWork (org.apache.hadoop.hive.ql.plan.UnionWork), DAG (org.apache.tez.dag.api.DAG), EdgeType (org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType), LinkedList (java.util.LinkedList), VertexGroup (org.apache.tez.dag.api.VertexGroup), JSONObject (org.json.JSONObject), FileSystem (org.apache.hadoop.fs.FileSystem), GroupInputEdge (org.apache.tez.dag.api.GroupInputEdge), BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork), JobConf (org.apache.hadoop.mapred.JobConf), Edge (org.apache.tez.dag.api.Edge)
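
build() relies on one ordering invariant: TezWork.getAllWork() returns the works with parents before children, and reversing the list means each child's vertex is created (and entered into workToVertex) before any parent wires an edge to it, which is exactly what the assert in the edge loop checks. A generic, self-contained sketch of producing such an order follows; TopoOrder is illustrative, not Hive code.

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public final class TopoOrder {
    // Kahn's algorithm over an adjacency map (vertex -> children): parents
    // always precede their children in the returned order. Reversing the
    // result, as build() does, yields children before parents.
    public static <V> List<V> sort(Map<V, List<V>> children) {
        Map<V, Integer> indegree = new HashMap<>();
        for (V v : children.keySet()) {
            indegree.putIfAbsent(v, 0);
        }
        for (List<V> cs : children.values()) {
            for (V c : cs) {
                indegree.merge(c, 1, Integer::sum);
            }
        }
        Deque<V> ready = new ArrayDeque<>();
        indegree.forEach((v, d) -> { if (d == 0) ready.add(v); });
        List<V> order = new ArrayList<>();
        while (!ready.isEmpty()) {
            V v = ready.poll();
            order.add(v);
            for (V c : children.getOrDefault(v, Collections.emptyList())) {
                if (indegree.merge(c, -1, Integer::sum) == 0) {
                    ready.add(c);
                }
            }
        }
        return order;
    }
}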

Aggregations

Classes most often used alongside BaseWork, with co-occurrence counts:

BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork): 54
ArrayList (java.util.ArrayList): 16
Operator (org.apache.hadoop.hive.ql.exec.Operator): 14
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 14
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 11
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 11
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 10
LinkedList (java.util.LinkedList): 9
HashTableDummyOperator (org.apache.hadoop.hive.ql.exec.HashTableDummyOperator): 9
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 9
TezWork (org.apache.hadoop.hive.ql.plan.TezWork): 9
List (java.util.List): 8
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 8
JobConf (org.apache.hadoop.mapred.JobConf): 8
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 7
SparkEdgeProperty (org.apache.hadoop.hive.ql.plan.SparkEdgeProperty): 7
SparkWork (org.apache.hadoop.hive.ql.plan.SparkWork): 7
CommonMergeJoinOperator (org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator): 6
DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator): 6
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 6