Example 6 with Vertex

Use of org.apache.tez.dag.api.Vertex in project hive by apache.

From class TestTezTask, method testBuildDag.

@Test
public void testBuildDag() throws Exception {
    DAG dag = task.build(conf, work, path, appLr, null, new Context(conf));
    for (BaseWork w : work.getAllWork()) {
        Vertex v = dag.getVertex(w.getName());
        assertNotNull(v);
        List<Vertex> outs = v.getOutputVertices();
        for (BaseWork x : work.getChildren(w)) {
            boolean found = false;
            for (Vertex u : outs) {
                if (u.getName().equals(x.getName())) {
                    found = true;
                    break;
                }
            }
            assertTrue(found);
        }
    }
}
Also used : Context(org.apache.hadoop.hive.ql.Context) Vertex(org.apache.tez.dag.api.Vertex) DAG(org.apache.tez.dag.api.DAG) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) Test(org.junit.Test)
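
The Tez DAG API this test exercises can also be driven standalone. Below is a minimal sketch that builds a two-vertex DAG and performs the same connectivity check as testBuildDag; the processor and shuffle I/O class names (com.example.*) are placeholders, not real Hive or Tez classes.

import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.Edge;
import org.apache.tez.dag.api.EdgeProperty;
import org.apache.tez.dag.api.EdgeProperty.DataMovementType;
import org.apache.tez.dag.api.EdgeProperty.DataSourceType;
import org.apache.tez.dag.api.EdgeProperty.SchedulingType;
import org.apache.tez.dag.api.InputDescriptor;
import org.apache.tez.dag.api.OutputDescriptor;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.Vertex;

public class DagConnectivitySketch {
    public static DAG buildTwoVertexDag() {
        // Placeholder processor class names; any LogicalIOProcessor implementation works here.
        Vertex map = Vertex.create("map_1",
                ProcessorDescriptor.create("com.example.MapProcessor"), 2);
        Vertex reduce = Vertex.create("reduce_1",
                ProcessorDescriptor.create("com.example.ReduceProcessor"), 1);
        // A classic shuffle edge between the two vertices.
        EdgeProperty shuffle = EdgeProperty.create(
                DataMovementType.SCATTER_GATHER,
                DataSourceType.PERSISTED,
                SchedulingType.SEQUENTIAL,
                OutputDescriptor.create("com.example.ShuffleOutput"),
                InputDescriptor.create("com.example.ShuffleInput"));
        DAG dag = DAG.create("sketch_dag");
        dag.addVertex(map).addVertex(reduce).addEdge(Edge.create(map, reduce, shuffle));
        // Same check the test performs: the edge is visible via getOutputVertices().
        assert dag.getVertex("map_1").getOutputVertices().contains(dag.getVertex("reduce_1"));
        return dag;
    }
}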

Example 7 with Vertex

Use of org.apache.tez.dag.api.Vertex in project hive by apache.

From class DagUtils, method createVertex.

/**
   * Create a vertex from a given work object.
   *
   * @param conf JobConf to be used for this execution unit
   * @param work The instance of BaseWork representing the actual work to be performed
   * by this vertex.
   * @param scratchDir HDFS scratch dir for this execution unit.
   * @param appJarLr Local resource for hive-exec.
   * @param additionalLr Additional local resources to be made available to this vertex
   * @param fileSystem FS corresponding to scratchDir and LocalResources
   * @param ctx This query's context
   * @param hasChildren Whether this vertex has downstream consumers; if not, a data sink is attached
   * @param tezWork The TezWork this BaseWork belongs to
   * @param vertexType The type of vertex to create
   * @return Vertex
   */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork work, Path scratchDir, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fileSystem, Context ctx, boolean hasChildren, TezWork tezWork, VertexType vertexType) throws Exception {
    Vertex v = null;
    // simply dispatch the call to the right createVertex overload for the
    // actual (sub-)type of BaseWork.
    if (work instanceof MapWork) {
        v = createVertex(conf, (MapWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
    } else if (work instanceof ReduceWork) {
        v = createVertex(conf, (ReduceWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx);
    } else if (work instanceof MergeJoinWork) {
        v = createVertex(conf, (MergeJoinWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
    } else {
        // something is seriously wrong if this is happening
        throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
    }
    // initialize stats publisher if necessary
    if (work.isGatheringStats()) {
        StatsPublisher statsPublisher;
        StatsFactory factory = StatsFactory.newFactory(conf);
        if (factory != null) {
            StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
            sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(work, conf));
            statsPublisher = factory.getStatsPublisher();
            if (!statsPublisher.init(sCntxt)) {
                // creating stats table if not exists
                if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
                    throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
                }
            }
        }
    }
    // final vertices need to have at least one output
    if (!hasChildren) {
        v.addDataSink("out_" + work.getName(), new DataSinkDescriptor(OutputDescriptor.create(MROutput.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(conf)), null, null));
    }
    return v;
}
Also used : StatsPublisher(org.apache.hadoop.hive.ql.stats.StatsPublisher) StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) Vertex(org.apache.tez.dag.api.Vertex) PreWarmVertex(org.apache.tez.dag.api.PreWarmVertex) StatsFactory(org.apache.hadoop.hive.ql.stats.StatsFactory) MergeJoinWork(org.apache.hadoop.hive.ql.plan.MergeJoinWork) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor)
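
Note that the data sink above is assembled by hand from an OutputDescriptor and a serialized conf. When writing through a standard Hadoop OutputFormat, Tez also offers a config builder on MROutput that produces an equivalent DataSinkDescriptor. A minimal sketch, assuming an illustrative output path:

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.mapreduce.output.MROutput;

public class DataSinkSketch {
    // Attaches a TextOutputFormat-backed sink to a vertex; the path is illustrative.
    static void addSink(Vertex v, JobConf conf) {
        DataSinkDescriptor sink = MROutput
                .createConfigBuilder(conf, TextOutputFormat.class, "/tmp/sketch_out")
                .build();
        v.addDataSink("out_sketch", sink);
    }
}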

Example 8 with Vertex

Use of org.apache.tez.dag.api.Vertex in project hive by apache.

From class DagUtils, method createVertex.

/*
   * Helper function to create Vertex from MapWork.
   */
private Vertex createVertex(JobConf conf, MapWork mapWork, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType) throws Exception {
    Path tezDir = getTezDir(mrScratchDir);
    // set up the operator plan
    Utilities.cacheMapWork(conf, mapWork, mrScratchDir);
    // create the directories FileSinkOperators need
    Utilities.createTmpDirs(conf, mapWork);
    // finally create the vertex
    Vertex map = null;
    // use tez to combine splits
    boolean groupSplitsInInputInitializer;
    DataSourceDescriptor dataSource;
    int numTasks = -1;
    @SuppressWarnings("rawtypes") Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);
    boolean vertexHasCustomInput = VertexType.isCustomInputType(vertexType);
    LOG.info("Vertex has custom input? " + vertexHasCustomInput);
    if (vertexHasCustomInput) {
        groupSplitsInInputInitializer = false;
        // grouping happens in execution phase. The input payload should not enable grouping here,
        // it will be enabled in the CustomVertex.
        inputFormatClass = HiveInputFormat.class;
        conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
        // mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
        // this plug-in to avoid getting a serialized event at run-time.
        conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
    } else {
        // we'll set up tez to combine splits for us iff the input
        // format is HiveInputFormat
        if (inputFormatClass == HiveInputFormat.class) {
            groupSplitsInInputInitializer = true;
        } else {
            groupSplitsInInputInitializer = false;
        }
    }
    if (mapWork instanceof MergeFileWork) {
        Path outputPath = ((MergeFileWork) mapWork).getOutputDir();
        // prepare the tmp output directory. The output tmp directory should
        // exist before jobClose (before renaming after job completion)
        Path tempOutPath = Utilities.toTempPath(outputPath);
        try {
            FileSystem tmpOutFS = tempOutPath.getFileSystem(conf);
            if (!tmpOutFS.exists(tempOutPath)) {
                tmpOutFS.mkdirs(tempOutPath);
            }
        } catch (IOException e) {
            throw new RuntimeException("Can't make path " + tempOutPath + " : " + e.getMessage(), e);
        }
    }
    // remember mapping of plan to input
    conf.set(Utilities.INPUT_NAME, mapWork.getName());
    if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
        // set up the operator plan. (before setting up splits on the AM)
        Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
        // if we're generating the splits in the AM, we just need
        // to set the correct plugin.
        if (groupSplitsInInputInitializer) {
            // Not setting a payload, since the MRInput payload is the same and can be accessed.
            InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
            dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(true).setCustomInitializerDescriptor(descriptor).build();
        } else {
            // Not HiveInputFormat, or a custom VertexManager will take care of grouping splits
            if (vertexHasCustomInput) {
                dataSource = MultiMRInput.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
            } else {
                dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
            }
        }
    } else {
        // Setup client side split generation.
        // we need to set this, because with HS2 and client side split
        // generation we end up not finding the map work. This is
        // because of thread local madness (tez split generation is
        // multi-threaded - HS2 plan cache uses thread locals). Setting
        // VECTOR_MODE/USE_VECTORIZED_INPUT_FILE_FORMAT causes the split gen code to use the conf instead
        // of the map work.
        conf.setBoolean(Utilities.VECTOR_MODE, mapWork.getVectorMode());
        conf.setBoolean(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, mapWork.getUseVectorizedInputFileFormat());
        dataSource = MRInputHelpers.configureMRInputWithLegacySplitGeneration(conf, new Path(tezDir, "split_" + mapWork.getName().replaceAll(" ", "_")), true);
        numTasks = dataSource.getNumberOfShards();
        // set up the operator plan. (after generating splits - that changes configs)
        Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
    }
    UserPayload serializedConf = TezUtils.createUserPayloadFromConf(conf);
    String procClassName = MapTezProcessor.class.getName();
    if (mapWork instanceof MergeFileWork) {
        procClassName = MergeFileTezProcessor.class.getName();
    }
    VertexExecutionContext executionContext = createVertexExecutionContext(mapWork);
    map = Vertex.create(mapWork.getName(), ProcessorDescriptor.create(procClassName).setUserPayload(serializedConf), numTasks, getContainerResource(conf));
    map.setTaskEnvironment(getContainerEnvironment(conf, true));
    map.setExecutionContext(executionContext);
    map.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
    assert mapWork.getAliasToWork().keySet().size() == 1;
    // Add the actual source input
    String alias = mapWork.getAliasToWork().keySet().iterator().next();
    map.addDataSource(alias, dataSource);
    Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
    localResources.put(getBaseName(appJarLr), appJarLr);
    // additionalLr is optional, so guard against null before iterating
    if (additionalLr != null) {
        for (LocalResource lr : additionalLr) {
            localResources.put(getBaseName(lr), lr);
        }
    }
    map.addTaskLocalFiles(localResources);
    return map;
}
Also used : Path(org.apache.hadoop.fs.Path) Vertex(org.apache.tez.dag.api.Vertex) PreWarmVertex(org.apache.tez.dag.api.PreWarmVertex) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) UserPayload(org.apache.tez.dag.api.UserPayload) VertexExecutionContext(org.apache.tez.dag.api.Vertex.VertexExecutionContext) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) IOException(java.io.IOException) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) FileSystem(org.apache.hadoop.fs.FileSystem) InputInitializerDescriptor(org.apache.tez.dag.api.InputInitializerDescriptor) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor)
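
The split-grouping decision above determines which data source descriptor gets built (MRInputLegacy, MultiMRInput, or legacy client-side generation). Outside Hive, the same wiring is usually done with the plain MRInput config builder. A minimal sketch, assuming an illustrative input path:

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.tez.dag.api.DataSourceDescriptor;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.mapreduce.input.MRInput;

public class DataSourceSketch {
    // Wires a file-backed input to a vertex, letting Tez group splits
    // the way the HiveInputFormat branch above does.
    static void addSource(Vertex v, JobConf conf) {
        DataSourceDescriptor source = MRInput
                .createConfigBuilder(conf, TextInputFormat.class, "/tmp/sketch_in")
                .groupSplits(true)
                .build();
        v.addDataSource("alias_1", source);
    }
}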

Example 9 with Vertex

Use of org.apache.tez.dag.api.Vertex in project hive by apache.

From class DagUtils, method createVertex.

private Vertex createVertex(JobConf conf, MergeJoinWork mergeJoinWork, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType) throws Exception {
    Utilities.setMergeWork(conf, mergeJoinWork, mrScratchDir, false);
    if (mergeJoinWork.getMainWork() instanceof MapWork) {
        List<BaseWork> mapWorkList = mergeJoinWork.getBaseWorkList();
        MapWork mapWork = (MapWork) (mergeJoinWork.getMainWork());
        Vertex mergeVx = createVertex(conf, mapWork, appJarLr, additionalLr, fs, mrScratchDir, ctx, vertexType);
        conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
        // mapreduce.tez.input.initializer.serialize.event.payload should be set
        // to false when using this plug-in to avoid getting a serialized event at run-time.
        conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
        for (int i = 0; i < mapWorkList.size(); i++) {
            mapWork = (MapWork) (mapWorkList.get(i));
            conf.set(TEZ_MERGE_CURRENT_MERGE_FILE_PREFIX, mapWork.getName());
            conf.set(Utilities.INPUT_NAME, mapWork.getName());
            LOG.info("Going through each work and adding MultiMRInput");
            mergeVx.addDataSource(mapWork.getName(), MultiMRInput.createConfigBuilder(conf, HiveInputFormat.class).build());
        }
        VertexManagerPluginDescriptor desc = VertexManagerPluginDescriptor.create(CustomPartitionVertex.class.getName());
        // the +1 to the size is because of the main work.
        CustomVertexConfiguration vertexConf = new CustomVertexConfiguration(mergeJoinWork.getMergeJoinOperator().getConf().getNumBuckets(), vertexType, mergeJoinWork.getBigTableAlias(), mapWorkList.size() + 1);
        DataOutputBuffer dob = new DataOutputBuffer();
        vertexConf.write(dob);
        byte[] userPayload = dob.getData();
        desc.setUserPayload(UserPayload.create(ByteBuffer.wrap(userPayload)));
        mergeVx.setVertexManagerPlugin(desc);
        return mergeVx;
    } else {
        Vertex mergeVx = createVertex(conf, (ReduceWork) mergeJoinWork.getMainWork(), appJarLr, additionalLr, fs, mrScratchDir, ctx);
        return mergeVx;
    }
}
Also used : Vertex(org.apache.tez.dag.api.Vertex) PreWarmVertex(org.apache.tez.dag.api.PreWarmVertex) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) DataOutputBuffer(org.apache.hadoop.io.DataOutputBuffer) VertexManagerPluginDescriptor(org.apache.tez.dag.api.VertexManagerPluginDescriptor) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)
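
The Writable-to-UserPayload step at the end (DataOutputBuffer, then ByteBuffer.wrap) is a reusable pattern for any VertexManagerPlugin configuration. A minimal sketch; IntWritable stands in for CustomVertexConfiguration here, and wrapping only the valid prefix of getData() avoids shipping the buffer's unused tail bytes:

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.tez.dag.api.UserPayload;

public class PayloadSketch {
    // Serializes a Writable into a UserPayload, as done for CustomVertexConfiguration above.
    static UserPayload toPayload(IntWritable conf) throws IOException {
        DataOutputBuffer dob = new DataOutputBuffer();
        conf.write(dob);
        // getData() can be longer than the bytes actually written; wrap only getLength() of it.
        return UserPayload.create(ByteBuffer.wrap(dob.getData(), 0, dob.getLength()));
    }
}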

Aggregations

Vertex (org.apache.tez.dag.api.Vertex): 9
FileSystem (org.apache.hadoop.fs.FileSystem): 4
BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork): 4
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 4
LocalResource (org.apache.hadoop.yarn.api.records.LocalResource): 4
Path (org.apache.hadoop.fs.Path): 3
Context (org.apache.hadoop.hive.ql.Context): 3
JobConf (org.apache.hadoop.mapred.JobConf): 3
PreWarmVertex (org.apache.tez.dag.api.PreWarmVertex): 3
IOException (java.io.IOException): 2
HashMap (java.util.HashMap): 2
LinkedHashMap (java.util.LinkedHashMap): 2
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 2
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 2
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 2
TezEdgeProperty (org.apache.hadoop.hive.ql.plan.TezEdgeProperty): 2
DAG (org.apache.tez.dag.api.DAG): 2
FileNotFoundException (java.io.FileNotFoundException): 1
URISyntaxException (java.net.URISyntaxException): 1
ArrayList (java.util.ArrayList): 1