Example 6 with GroupInputEdge

Use of org.apache.tez.dag.api.GroupInputEdge in project hive by apache.

From the class TezTask, method build.

DAG build(JobConf conf, TezWork work, Path scratchDir, LocalResource appJarLr, List<LocalResource> additionalLr, Context ctx) throws Exception {
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_BUILD_DAG);
    // getAllWork returns a topologically sorted list, which we use to make
    // sure that vertices are created before they are used in edges.
    List<BaseWork> ws = work.getAllWork();
    Collections.reverse(ws);
    FileSystem fs = scratchDir.getFileSystem(conf);
    // the name of the dag is what is displayed in the AM/Job UI
    String dagName = utils.createDagName(conf, queryPlan);
    LOG.info("Dag name: " + dagName);
    DAG dag = DAG.create(dagName);
    // set some info for the query
    JSONObject json = new JSONObject(new LinkedHashMap()).put("context", "Hive").put("description", ctx.getCmd());
    String dagInfo = json.toString();
    if (LOG.isDebugEnabled()) {
        LOG.debug("DagInfo: " + dagInfo);
    }
    dag.setDAGInfo(dagInfo);
    dag.setCredentials(conf.getCredentials());
    setAccessControlsForCurrentUser(dag, queryPlan.getQueryId(), conf);
    for (BaseWork w : ws) {
        boolean isFinal = work.getLeaves().contains(w);
        // translate work to vertex
        perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_CREATE_VERTEX + w.getName());
        if (w instanceof UnionWork) {
            // Special case for unions. These items translate to VertexGroups
            List<BaseWork> unionWorkItems = new LinkedList<BaseWork>();
            List<BaseWork> children = new LinkedList<BaseWork>();
            // proper children of the union
            for (BaseWork v : work.getChildren(w)) {
                EdgeType type = work.getEdgeProperty(w, v).getEdgeType();
                if (type == EdgeType.CONTAINS) {
                    unionWorkItems.add(v);
                } else {
                    children.add(v);
                }
            }
            // create VertexGroup
            Vertex[] vertexArray = new Vertex[unionWorkItems.size()];
            int i = 0;
            for (BaseWork v : unionWorkItems) {
                vertexArray[i++] = workToVertex.get(v);
            }
            VertexGroup group = dag.createVertexGroup(w.getName(), vertexArray);
            // For a vertex group, all Outputs use the same Key-class, Val-class and partitioner.
            // Pick any one source vertex to figure out the Edge configuration.
            JobConf parentConf = workToConf.get(unionWorkItems.get(0));
            // now hook up the children
            for (BaseWork v : children) {
                // finally we can create the grouped edge
                GroupInputEdge e = utils.createEdge(group, parentConf, workToVertex.get(v), work.getEdgeProperty(w, v), work.getVertexType(v));
                dag.addEdge(e);
            }
        } else {
            // Regular vertices
            JobConf wxConf = utils.initializeVertexConf(conf, ctx, w);
            Vertex wx = utils.createVertex(wxConf, w, scratchDir, appJarLr, additionalLr, fs, ctx, !isFinal, work, work.getVertexType(w));
            if (w.getReservedMemoryMB() > 0) {
                // If reservedMemoryMB is set, adjust the memory reserve fraction accordingly
                double frac = DagUtils.adjustMemoryReserveFraction(w.getReservedMemoryMB(), super.conf);
                LOG.info("Setting " + TEZ_MEMORY_RESERVE_FRACTION + " to " + frac);
                wx.setConf(TEZ_MEMORY_RESERVE_FRACTION, Double.toString(frac));
            }
            // Otherwise just leave it up to Tez to decide how much memory to allocate
            dag.addVertex(wx);
            utils.addCredentials(w, dag);
            perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_CREATE_VERTEX + w.getName());
            workToVertex.put(w, wx);
            workToConf.put(w, wxConf);
            // add all dependencies (i.e.: edges) to the graph
            for (BaseWork v : work.getChildren(w)) {
                assert workToVertex.containsKey(v);
                Edge e = null;
                TezEdgeProperty edgeProp = work.getEdgeProperty(w, v);
                e = utils.createEdge(wxConf, wx, workToVertex.get(v), edgeProp, work.getVertexType(v));
                dag.addEdge(e);
            }
        }
    }
    // Clear the work map after build. TODO: remove caching instead?
    Utilities.clearWorkMap(conf);
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_BUILD_DAG);
    return dag;
}
Also used: Vertex (org.apache.tez.dag.api.Vertex), TezEdgeProperty (org.apache.hadoop.hive.ql.plan.TezEdgeProperty), UnionWork (org.apache.hadoop.hive.ql.plan.UnionWork), DAG (org.apache.tez.dag.api.DAG), EdgeType (org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType), LinkedList (java.util.LinkedList), LinkedHashMap (java.util.LinkedHashMap), VertexGroup (org.apache.tez.dag.api.VertexGroup), JSONObject (org.json.JSONObject), FileSystem (org.apache.hadoop.fs.FileSystem), GroupInputEdge (org.apache.tez.dag.api.GroupInputEdge), BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork), JobConf (org.apache.hadoop.mapred.JobConf), Edge (org.apache.tez.dag.api.Edge)
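
For orientation, the utils.createEdge call above is Hive's wrapper around the plain Tez group-edge API, the same calls that Example 7 below uses directly. A minimal sketch of that underlying wiring, assuming a dag and vertices mapVertex1, mapVertex2 and reduceVertex already exist; the vertex names and descriptor class strings are placeholders for illustration, not Hive's actual classes:

// Sketch only: all classes come from org.apache.tez.dag.api.
VertexGroup unionGroup = dag.createVertexGroup("union_1", mapVertex1, mapVertex2);
EdgeProperty edgeProp = EdgeProperty.create(
        EdgeProperty.DataMovementType.SCATTER_GATHER,
        EdgeProperty.DataSourceType.PERSISTED,
        EdgeProperty.SchedulingType.SEQUENTIAL,
        OutputDescriptor.create("example.SortedOutput"),      // placeholder output class
        InputDescriptor.create("example.ShuffledInput"));     // placeholder input class
GroupInputEdge groupedEdge = GroupInputEdge.create(unionGroup, reduceVertex, edgeProp,
        InputDescriptor.create("example.MergedInput"));       // input the child vertex sees for the whole group
dag.addEdge(groupedEdge);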

Example 7 with GroupInputEdge

Use of org.apache.tez.dag.api.GroupInputEdge in project tez by apache.

From the class TestDAGImpl, method createGroupDAGPlan.

// Create a plan with 3 vertices: A, B, C. Group(A,B)->C
static DAGPlan createGroupDAGPlan() {
    LOG.info("Setting up group dag plan");
    int dummyTaskCount = 1;
    Resource dummyTaskResource = Resource.newInstance(1, 1);
    org.apache.tez.dag.api.Vertex v1 = org.apache.tez.dag.api.Vertex.create("vertex1", ProcessorDescriptor.create("Processor"), dummyTaskCount, dummyTaskResource);
    org.apache.tez.dag.api.Vertex v2 = org.apache.tez.dag.api.Vertex.create("vertex2", ProcessorDescriptor.create("Processor"), dummyTaskCount, dummyTaskResource);
    org.apache.tez.dag.api.Vertex v3 = org.apache.tez.dag.api.Vertex.create("vertex3", ProcessorDescriptor.create("Processor"), dummyTaskCount, dummyTaskResource);
    DAG dag = DAG.create("testDag");
    String groupName1 = "uv12";
    OutputCommitterDescriptor ocd = OutputCommitterDescriptor.create(TotalCountingOutputCommitter.class.getName());
    org.apache.tez.dag.api.VertexGroup uv12 = dag.createVertexGroup(groupName1, v1, v2);
    OutputDescriptor outDesc = OutputDescriptor.create("output.class");
    uv12.addDataSink("uvOut", DataSinkDescriptor.create(outDesc, ocd, null));
    v3.addDataSink("uvOut", DataSinkDescriptor.create(outDesc, ocd, null));
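    // The GroupInputEdge below connects the whole group uv12 to v3 as a single logical input;
    // "merge.class" names the merged input that v3 sees for the combined vertex1/vertex2 output.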
    GroupInputEdge e1 = GroupInputEdge.create(uv12, v3, EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL, OutputDescriptor.create("dummy output class"), InputDescriptor.create("dummy input class")), InputDescriptor.create("merge.class"));
    dag.addVertex(v1);
    dag.addVertex(v2);
    dag.addVertex(v3);
    dag.addEdge(e1);
    return dag.createDag(conf, null, null, null, true);
}
Also used: OutputCommitterDescriptor (org.apache.tez.dag.api.OutputCommitterDescriptor), Resource (org.apache.hadoop.yarn.api.records.Resource), DAG (org.apache.tez.dag.api.DAG), ByteString (com.google.protobuf.ByteString), PlanTaskLocationHint (org.apache.tez.dag.api.records.DAGProtos.PlanTaskLocationHint), OutputDescriptor (org.apache.tez.dag.api.OutputDescriptor), GroupInputEdge (org.apache.tez.dag.api.GroupInputEdge)
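
Note that createGroupDAGPlan only converts the logical DAG into a DAGPlan protobuf via dag.createDag(...), which is all the test needs. Outside of tests, a DAG wired with a GroupInputEdge is submitted through the standard Tez client API like any other DAG; a minimal sketch, assuming tezConf and a fully built dag are available (the application name is illustrative):

// Sketch only: standard Tez submission path; nothing about GroupInputEdge changes it.
static void submit(org.apache.tez.dag.api.TezConfiguration tezConf, DAG dag) throws Exception {
    org.apache.tez.client.TezClient tezClient = org.apache.tez.client.TezClient.create("groupEdgeExample", tezConf);
    tezClient.start();
    try {
        org.apache.tez.dag.api.client.DAGClient dagClient = tezClient.submitDAG(dag);
        // Block until the DAG reaches a terminal state.
        dagClient.waitForCompletion();
    } finally {
        tezClient.stop();
    }
}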

Example 8 with GroupInputEdge

Use of org.apache.tez.dag.api.GroupInputEdge in project hive by apache.

From the class TezTask, method build.

DAG build(JobConf conf, TezWork tezWork, Path scratchDir, Context ctx, Map<String, LocalResource> vertexResources) throws Exception {
    perfLogger.perfLogBegin(CLASS_NAME, PerfLogger.TEZ_BUILD_DAG);
    // getAllWork returns a topologically sorted list, which we use to make
    // sure that vertices are created before they are used in edges.
    List<BaseWork> topologicalWorkList = tezWork.getAllWork();
    Collections.reverse(topologicalWorkList);
    // the name of the dag is what is displayed in the AM/Job UI
    String dagName = utils.createDagName(conf, queryPlan);
    LOG.info("Dag name: {}", dagName);
    DAG dag = DAG.create(dagName);
    // set some info for the query
    JSONObject json = new JSONObject(new LinkedHashMap<>()).put("context", "Hive").put("description", ctx.getCmd());
    String dagInfo = json.toString();
    String queryId = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYID);
    dag.setConf(HiveConf.ConfVars.HIVEQUERYID.varname, queryId);
    LOG.debug("DagInfo: {}", dagInfo);
    dag.setDAGInfo(dagInfo);
    dag.setCredentials(conf.getCredentials());
    setAccessControlsForCurrentUser(dag, queryPlan.getQueryId(), conf);
    for (BaseWork workUnit : topologicalWorkList) {
        // translate work to vertex
        perfLogger.perfLogBegin(CLASS_NAME, PerfLogger.TEZ_CREATE_VERTEX + workUnit.getName());
        if (workUnit instanceof UnionWork) {
            // Special case for unions. These items translate to VertexGroups
            List<BaseWork> unionWorkItems = new LinkedList<BaseWork>();
            List<BaseWork> children = new LinkedList<BaseWork>();
            // proper children of the union
            for (BaseWork v : tezWork.getChildren(workUnit)) {
                EdgeType type = tezWork.getEdgeProperty(workUnit, v).getEdgeType();
                if (type == EdgeType.CONTAINS) {
                    unionWorkItems.add(v);
                } else {
                    children.add(v);
                }
            }
            JobConf parentConf = workToConf.get(unionWorkItems.get(0));
            checkOutputSpec(workUnit, parentConf);
            // create VertexGroup
            Vertex[] vertexArray = new Vertex[unionWorkItems.size()];
            int i = 0;
            for (BaseWork v : unionWorkItems) {
                vertexArray[i++] = workToVertex.get(v);
            }
            VertexGroup group = dag.createVertexGroup(workUnit.getName(), vertexArray);
            // now hook up the children
            for (BaseWork v : children) {
                // finally we can create the grouped edge
                GroupInputEdge e = utils.createEdge(group, parentConf, workToVertex.get(v), tezWork.getEdgeProperty(workUnit, v), v, tezWork);
                dag.addEdge(e);
            }
        } else {
            // Regular vertices
            JobConf wxConf = utils.initializeVertexConf(conf, ctx, workUnit);
            checkOutputSpec(workUnit, wxConf);
            Vertex wx = utils.createVertex(wxConf, workUnit, scratchDir, tezWork, vertexResources);
            if (tezWork.getChildren(workUnit).size() > 1) {
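                // A vertex with multiple children writes one sorted output per outgoing edge, so the
                // configured tez.runtime.io.sort.mb is split across the children to bound total sort-buffer memory.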
                String tezRuntimeSortMb = wxConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB);
                int originalValue = 0;
                if (tezRuntimeSortMb == null) {
                    originalValue = TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB_DEFAULT;
                } else {
                    originalValue = Integer.valueOf(tezRuntimeSortMb);
                }
                int newValue = originalValue / tezWork.getChildren(workUnit).size();
                wxConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB, Integer.toString(newValue));
                LOG.info("Modified " + TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB + " to " + newValue);
            }
            if (workUnit.getReservedMemoryMB() > 0) {
                // If reservedMemoryMB is set, adjust the memory reserve fraction accordingly
                double frac = DagUtils.adjustMemoryReserveFraction(workUnit.getReservedMemoryMB(), super.conf);
                LOG.info("Setting " + TEZ_MEMORY_RESERVE_FRACTION + " to " + frac);
                wx.setConf(TEZ_MEMORY_RESERVE_FRACTION, Double.toString(frac));
            }
            // Otherwise just leave it up to Tez to decide how much memory to allocate
            dag.addVertex(wx);
            utils.addCredentials(workUnit, dag, conf);
            perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.TEZ_CREATE_VERTEX + workUnit.getName());
            workToVertex.put(workUnit, wx);
            workToConf.put(workUnit, wxConf);
            // add all dependencies (i.e.: edges) to the graph
            for (BaseWork v : tezWork.getChildren(workUnit)) {
                assert workToVertex.containsKey(v);
                Edge e = null;
                TezEdgeProperty edgeProp = tezWork.getEdgeProperty(workUnit, v);
                e = utils.createEdge(wxConf, wx, workToVertex.get(v), edgeProp, v, tezWork);
                dag.addEdge(e);
            }
        }
    }
    // Clear the work map after build. TODO: remove caching instead?
    Utilities.clearWorkMap(conf);
    perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.TEZ_BUILD_DAG);
    return dag;
}
Also used: Vertex (org.apache.tez.dag.api.Vertex), TezEdgeProperty (org.apache.hadoop.hive.ql.plan.TezEdgeProperty), UnionWork (org.apache.hadoop.hive.ql.plan.UnionWork), DAG (org.apache.tez.dag.api.DAG), EdgeType (org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType), LinkedList (java.util.LinkedList), VertexGroup (org.apache.tez.dag.api.VertexGroup), JSONObject (org.json.JSONObject), GroupInputEdge (org.apache.tez.dag.api.GroupInputEdge), BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork), JobConf (org.apache.hadoop.mapred.JobConf), Edge (org.apache.tez.dag.api.Edge)
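
One detail specific to this version of build: when a vertex has more than one child, the configured tez.runtime.io.sort.mb is divided by the fan-out before the vertex is created. A tiny standalone illustration of that arithmetic (the helper name and the example figures are hypothetical; only the division mirrors the code above):

// Hypothetical helper mirroring the fan-out scaling in the build method above.
static int scaledSortMb(String configuredMb, int defaultMb, int childCount) {
    int original = (configuredMb == null) ? defaultMb : Integer.parseInt(configuredMb);
    return original / childCount;   // e.g. configured 512 MB with 4 children -> 128 MB per sorted output
}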

Aggregations

DAG (org.apache.tez.dag.api.DAG)8 GroupInputEdge (org.apache.tez.dag.api.GroupInputEdge)8 Vertex (org.apache.tez.dag.api.Vertex)5 Resource (org.apache.hadoop.yarn.api.records.Resource)4 OutputCommitterDescriptor (org.apache.tez.dag.api.OutputCommitterDescriptor)4 OutputDescriptor (org.apache.tez.dag.api.OutputDescriptor)4 VertexGroup (org.apache.tez.dag.api.VertexGroup)4 LinkedList (java.util.LinkedList)2 Configuration (org.apache.hadoop.conf.Configuration)2 BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork)2 TezEdgeProperty (org.apache.hadoop.hive.ql.plan.TezEdgeProperty)2 EdgeType (org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType)2 UnionWork (org.apache.hadoop.hive.ql.plan.UnionWork)2 JobConf (org.apache.hadoop.mapred.JobConf)2 DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor)2 Edge (org.apache.tez.dag.api.Edge)2 JSONObject (org.json.JSONObject)2 ByteString (com.google.protobuf.ByteString)1 LinkedHashMap (java.util.LinkedHashMap)1 FileSystem (org.apache.hadoop.fs.FileSystem)1