
Example 11 with DataSourceDescriptor

Use of org.apache.tez.dag.api.DataSourceDescriptor in project tez by apache.

The class TestMRRJobsDAGApi, method testMRRSleepJobDagSubmitCore.

public State testMRRSleepJobDagSubmitCore(boolean dagViaRPC, boolean killDagWhileRunning, boolean closeSessionBeforeSubmit, TezClient reUseTezSession, boolean genSplitsInAM, Class<? extends InputInitializer> initializerClass, Map<String, LocalResource> additionalLocalResources) throws IOException, InterruptedException, TezException, ClassNotFoundException, YarnException {
    LOG.info("\n\n\nStarting testMRRSleepJobDagSubmit().");
    JobConf stage1Conf = new JobConf(mrrTezCluster.getConfig());
    JobConf stage2Conf = new JobConf(mrrTezCluster.getConfig());
    JobConf stage3Conf = new JobConf(mrrTezCluster.getConfig());
    stage1Conf.setLong(MRRSleepJob.MAP_SLEEP_TIME, 1);
    stage1Conf.setInt(MRRSleepJob.MAP_SLEEP_COUNT, 1);
    stage1Conf.setInt(MRJobConfig.NUM_MAPS, 1);
    stage1Conf.set(MRJobConfig.MAP_CLASS_ATTR, SleepMapper.class.getName());
    stage1Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
    stage1Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
    stage1Conf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, SleepInputFormat.class.getName());
    stage1Conf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());
    stage2Conf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, 1);
    stage2Conf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, 1);
    stage2Conf.setInt(MRJobConfig.NUM_REDUCES, 1);
    stage2Conf.set(MRJobConfig.REDUCE_CLASS_ATTR, ISleepReducer.class.getName());
    stage2Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
    stage2Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
    stage2Conf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());
    stage3Conf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, 1);
    stage3Conf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, 1);
    stage3Conf.setInt(MRJobConfig.NUM_REDUCES, 1);
    stage3Conf.set(MRJobConfig.REDUCE_CLASS_ATTR, SleepReducer.class.getName());
    stage3Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
    stage3Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
    MRHelpers.translateMRConfToTez(stage1Conf);
    MRHelpers.translateMRConfToTez(stage2Conf);
    MRHelpers.translateMRConfToTez(stage3Conf);
    MRHelpers.configureMRApiUsage(stage1Conf);
    MRHelpers.configureMRApiUsage(stage2Conf);
    MRHelpers.configureMRApiUsage(stage3Conf);
    Path remoteStagingDir = remoteFs.makeQualified(new Path("/tmp", String.valueOf(new Random().nextInt(100000))));
    TezClientUtils.ensureStagingDirExists(conf, remoteStagingDir);
    UserPayload stage1Payload = TezUtils.createUserPayloadFromConf(stage1Conf);
    UserPayload stage2Payload = TezUtils.createUserPayloadFromConf(stage2Conf);
    UserPayload stage3Payload = TezUtils.createUserPayloadFromConf(stage3Conf);
    DAG dag = DAG.create("testMRRSleepJobDagSubmit-" + random.nextInt(1000));
    Class<? extends InputInitializer> inputInitializerClazz = genSplitsInAM ? (initializerClass == null ? MRInputAMSplitGenerator.class : initializerClass) : null;
    LOG.info("Using initializer class: " + initializerClass);
    DataSourceDescriptor dsd;
    if (!genSplitsInAM) {
        dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(stage1Conf, remoteStagingDir, true);
    } else {
        if (initializerClass == null) {
            dsd = MRInputLegacy.createConfigBuilder(stage1Conf, SleepInputFormat.class).build();
        } else {
            InputInitializerDescriptor iid = InputInitializerDescriptor.create(inputInitializerClazz.getName());
            dsd = MRInputLegacy.createConfigBuilder(stage1Conf, SleepInputFormat.class).setCustomInitializerDescriptor(iid).build();
        }
    }
    Vertex stage1Vertex = Vertex.create("map", ProcessorDescriptor.create(MapProcessor.class.getName()).setUserPayload(stage1Payload), dsd.getNumberOfShards(), Resource.newInstance(256, 1));
    stage1Vertex.addDataSource("MRInput", dsd);
    Vertex stage2Vertex = Vertex.create("ireduce", ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(stage2Payload), 1, Resource.newInstance(256, 1));
    Vertex stage3Vertex = Vertex.create("reduce", ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(stage3Payload), 1, Resource.newInstance(256, 1));
    stage3Conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT, true);
    DataSinkDescriptor dataSinkDescriptor = MROutputLegacy.createConfigBuilder(stage3Conf, NullOutputFormat.class).build();
    Assert.assertFalse(dataSinkDescriptor.getOutputDescriptor().getHistoryText().isEmpty());
    stage3Vertex.addDataSink("MROutput", dataSinkDescriptor);
    // TODO env, resources
    dag.addVertex(stage1Vertex);
    dag.addVertex(stage2Vertex);
    dag.addVertex(stage3Vertex);
    Edge edge1 = Edge.create(stage1Vertex, stage2Vertex, EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL, OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(stage2Payload), InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(stage2Payload)));
    Edge edge2 = Edge.create(stage2Vertex, stage3Vertex, EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL, OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(stage3Payload), InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(stage3Payload)));
    dag.addEdge(edge1);
    dag.addEdge(edge2);
    TezConfiguration tezConf = new TezConfiguration(mrrTezCluster.getConfig());
    tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, remoteStagingDir.toString());
    DAGClient dagClient = null;
    boolean reuseSession = reUseTezSession != null;
    TezClient tezSession = null;
    if (!dagViaRPC) {
        Preconditions.checkArgument(!reuseSession);
    }
    if (!reuseSession) {
        TezConfiguration tempTezconf = new TezConfiguration(tezConf);
        if (!dagViaRPC) {
            tempTezconf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, false);
        } else {
            tempTezconf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, true);
        }
        tezSession = TezClient.create("testsession", tempTezconf);
        tezSession.start();
    } else {
        tezSession = reUseTezSession;
    }
    if (!dagViaRPC) {
        // TODO Use utility method post TEZ-205 to figure out AM arguments etc.
        dagClient = tezSession.submitDAG(dag);
    }
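    // Optionally stop the session before any DAG is submitted and verify the AM shuts down cleanly.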
    if (dagViaRPC && closeSessionBeforeSubmit) {
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(mrrTezCluster.getConfig());
        yarnClient.start();
        boolean sentKillSession = false;
        while (true) {
            Thread.sleep(500L);
            ApplicationReport appReport = yarnClient.getApplicationReport(tezSession.getAppMasterApplicationId());
            if (appReport == null) {
                continue;
            }
            YarnApplicationState appState = appReport.getYarnApplicationState();
            if (!sentKillSession) {
                if (appState == YarnApplicationState.RUNNING) {
                    tezSession.stop();
                    sentKillSession = true;
                }
            } else {
                if (appState == YarnApplicationState.FINISHED || appState == YarnApplicationState.KILLED || appState == YarnApplicationState.FAILED) {
                    LOG.info("Application completed after sending session shutdown" + ", yarnApplicationState=" + appState + ", finalAppStatus=" + appReport.getFinalApplicationStatus());
                    Assert.assertEquals(YarnApplicationState.FINISHED, appState);
                    Assert.assertEquals(FinalApplicationStatus.SUCCEEDED, appReport.getFinalApplicationStatus());
                    break;
                }
            }
        }
        yarnClient.stop();
        return null;
    }
    if (dagViaRPC) {
        LOG.info("Submitting dag to tez session with appId=" + tezSession.getAppMasterApplicationId() + " and Dag Name=" + dag.getName());
        if (additionalLocalResources != null) {
            tezSession.addAppMasterLocalFiles(additionalLocalResources);
        }
        dagClient = tezSession.submitDAG(dag);
        Assert.assertEquals(TezAppMasterStatus.RUNNING, tezSession.getAppMasterStatus());
    }
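    // Poll until the DAG completes, optionally killing it mid-run (via session shutdown when submitted over RPC).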
    DAGStatus dagStatus = dagClient.getDAGStatus(null);
    while (!dagStatus.isCompleted()) {
        LOG.info("Waiting for job to complete. Sleeping for 500ms." + " Current state: " + dagStatus.getState());
        Thread.sleep(500L);
        if (killDagWhileRunning && dagStatus.getState() == DAGStatus.State.RUNNING) {
            LOG.info("Killing running dag/session");
            if (dagViaRPC) {
                tezSession.stop();
            } else {
                dagClient.tryKillDAG();
            }
        }
        dagStatus = dagClient.getDAGStatus(null);
    }
    if (!reuseSession) {
        tezSession.stop();
    }
    return dagStatus.getState();
}
Also used: Vertex (org.apache.tez.dag.api.Vertex), YarnApplicationState (org.apache.hadoop.yarn.api.records.YarnApplicationState), TezClient (org.apache.tez.client.TezClient), Random (java.util.Random), SleepInputFormat (org.apache.tez.mapreduce.examples.MRRSleepJob.SleepInputFormat), DAGStatus (org.apache.tez.dag.api.client.DAGStatus), JobConf (org.apache.hadoop.mapred.JobConf), IntWritable (org.apache.hadoop.io.IntWritable), DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor), MapProcessor (org.apache.tez.mapreduce.processor.map.MapProcessor), TezConfiguration (org.apache.tez.dag.api.TezConfiguration), ISleepReducer (org.apache.tez.mapreduce.examples.MRRSleepJob.ISleepReducer), Path (org.apache.hadoop.fs.Path), UserPayload (org.apache.tez.dag.api.UserPayload), DAG (org.apache.tez.dag.api.DAG), DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor), YarnClient (org.apache.hadoop.yarn.client.api.YarnClient), ReduceProcessor (org.apache.tez.mapreduce.processor.reduce.ReduceProcessor), ApplicationReport (org.apache.hadoop.yarn.api.records.ApplicationReport), SleepReducer (org.apache.tez.mapreduce.examples.MRRSleepJob.SleepReducer), InputInitializerDescriptor (org.apache.tez.dag.api.InputInitializerDescriptor), DAGClient (org.apache.tez.dag.api.client.DAGClient), SleepMapper (org.apache.tez.mapreduce.examples.MRRSleepJob.SleepMapper), MRRSleepJobPartitioner (org.apache.tez.mapreduce.examples.MRRSleepJob.MRRSleepJobPartitioner), Edge (org.apache.tez.dag.api.Edge), NullOutputFormat (org.apache.hadoop.mapreduce.lib.output.NullOutputFormat)
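
The test exercises several submission modes at once; stripped to the core API flow, a condensed sketch may help: one map vertex backed by a DataSourceDescriptor, submitted through a session. This assumes AM-side split generation; the method, session, and DAG names are illustrative, and the 256 MB task resource is arbitrary.

public DAGStatus.State runSleepDagSketch(TezConfiguration tezConf, JobConf stageConf) throws Exception {
    MRHelpers.translateMRConfToTez(stageConf);
    UserPayload payload = TezUtils.createUserPayloadFromConf(stageConf);
    // Splits are generated in the AM (by default through MRInputAMSplitGenerator),
    // so getNumberOfShards() is -1 and the initializer sets the vertex parallelism.
    DataSourceDescriptor dsd = MRInputLegacy.createConfigBuilder(stageConf, SleepInputFormat.class).build();
    Vertex map = Vertex.create("map", ProcessorDescriptor.create(MapProcessor.class.getName()).setUserPayload(payload), dsd.getNumberOfShards(), Resource.newInstance(256, 1));
    map.addDataSource("MRInput", dsd);
    DAG dag = DAG.create("sleep-sketch").addVertex(map);
    TezClient tezClient = TezClient.create("sketch-session", tezConf);
    tezClient.start();
    try {
        DAGClient dagClient = tezClient.submitDAG(dag);
        // Blocks until the DAG reaches a terminal state, then reports it.
        return dagClient.waitForCompletion().getState();
    } finally {
        tezClient.stop();
    }
}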

Example 12 with DataSourceDescriptor

Use of org.apache.tez.dag.api.DataSourceDescriptor in project tez by apache.

The class CartesianProduct, method createDAG.

private DAG createDAG(TezConfiguration tezConf) throws IOException {
    InputDescriptor inputDescriptor = InputDescriptor.create(FakeInput.class.getName());
    InputInitializerDescriptor inputInitializerDescriptor = InputInitializerDescriptor.create(FakeInputInitializer.class.getName());
    DataSourceDescriptor dataSourceDescriptor = DataSourceDescriptor.create(inputDescriptor, inputInitializerDescriptor, null);
    Vertex v1 = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName()));
    v1.addDataSource(INPUT, dataSourceDescriptor);
    Vertex v2 = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName()));
    v2.addDataSource(INPUT, dataSourceDescriptor);
    OutputDescriptor outputDescriptor = OutputDescriptor.create(FakeOutput.class.getName());
    OutputCommitterDescriptor outputCommitterDescriptor = OutputCommitterDescriptor.create(FakeOutputCommitter.class.getName());
    DataSinkDescriptor dataSinkDescriptor = DataSinkDescriptor.create(outputDescriptor, outputCommitterDescriptor, null);
    CartesianProductConfig cartesianProductConfig = new CartesianProductConfig(Arrays.asList(sourceVertices));
    UserPayload userPayload = cartesianProductConfig.toUserPayload(tezConf);
    Vertex v3 = Vertex.create(VERTEX3, ProcessorDescriptor.create(JoinProcessor.class.getName()));
    v3.addDataSink(OUTPUT, dataSinkDescriptor);
    v3.setVertexManagerPlugin(VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName()).setUserPayload(userPayload));
    EdgeManagerPluginDescriptor edgeManagerDescriptor = EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
    edgeManagerDescriptor.setUserPayload(userPayload);
    UnorderedPartitionedKVEdgeConfig edgeConf = UnorderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName(), RoundRobinPartitioner.class.getName()).build();
    EdgeProperty edgeProperty = edgeConf.createDefaultCustomEdgeProperty(edgeManagerDescriptor);
    return DAG.create("CrossProduct").addVertex(v1).addVertex(v2).addVertex(v3).addEdge(Edge.create(v1, v3, edgeProperty)).addEdge(Edge.create(v2, v3, edgeProperty));
}
Also used: InputDescriptor (org.apache.tez.dag.api.InputDescriptor), Vertex (org.apache.tez.dag.api.Vertex), UserPayload (org.apache.tez.dag.api.UserPayload), OutputCommitterDescriptor (org.apache.tez.dag.api.OutputCommitterDescriptor), CartesianProductVertexManager (org.apache.tez.runtime.library.cartesianproduct.CartesianProductVertexManager), DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor), EdgeManagerPluginDescriptor (org.apache.tez.dag.api.EdgeManagerPluginDescriptor), CartesianProductEdgeManager (org.apache.tez.runtime.library.cartesianproduct.CartesianProductEdgeManager), OutputDescriptor (org.apache.tez.dag.api.OutputDescriptor), InputInitializerDescriptor (org.apache.tez.dag.api.InputInitializerDescriptor), UnorderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfig), EdgeProperty (org.apache.tez.dag.api.EdgeProperty), CartesianProductConfig (org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig), DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor)
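
When no config builder fits, a DataSourceDescriptor can be assembled directly from its parts, as the example does with FakeInput. A minimal sketch, where MyInput, MyInputInitializer, and MyProcessor are hypothetical classes; the third argument to create is the Credentials the source needs, and null means none.

InputDescriptor input = InputDescriptor.create(MyInput.class.getName());
InputInitializerDescriptor initializer = InputInitializerDescriptor.create(MyInputInitializer.class.getName());
// null: this hypothetical source carries no extra security credentials.
DataSourceDescriptor source = DataSourceDescriptor.create(input, initializer, null);
Vertex reader = Vertex.create("reader", ProcessorDescriptor.create(MyProcessor.class.getName()));
reader.addDataSource("Input", source);

As in the example, the same descriptor instance can be attached to several vertices; each vertex then reads the source independently.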

Example 13 with DataSourceDescriptor

Use of org.apache.tez.dag.api.DataSourceDescriptor in project tez by apache.

The class UnionExample, method createDAG.

private DAG createDAG(FileSystem fs, TezConfiguration tezConf, Map<String, LocalResource> localResources, Path stagingDir, String inputPath, String outputPath) throws IOException {
    DAG dag = DAG.create("UnionExample");
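    // Parallelism of -1 defers the task count to the splits generated for the data source.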
    int numMaps = -1;
    Configuration inputConf = new Configuration(tezConf);
    inputConf.setBoolean("mapred.mapper.new-api", false);
    inputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
    inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
    MRInput.MRInputConfigBuilder configurer = MRInput.createConfigBuilder(inputConf, null);
    DataSourceDescriptor dataSource = configurer.generateSplitsInAM(false).build();
    Vertex mapVertex1 = Vertex.create("map1", ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);
    Vertex mapVertex2 = Vertex.create("map2", ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);
    Vertex mapVertex3 = Vertex.create("map3", ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);
    Vertex checkerVertex = Vertex.create("checker", ProcessorDescriptor.create(UnionProcessor.class.getName()), 1);
    Configuration outputConf = new Configuration(tezConf);
    outputConf.setBoolean("mapred.reducer.new-api", false);
    outputConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
    outputConf.set(FileOutputFormat.OUTDIR, outputPath);
    DataSinkDescriptor od = MROutput.createConfigBuilder(outputConf, null).build();
    checkerVertex.addDataSink("union", od);
    Configuration allPartsConf = new Configuration(tezConf);
    DataSinkDescriptor od2 = MROutput.createConfigBuilder(allPartsConf, TextOutputFormat.class, outputPath + "-all-parts").build();
    checkerVertex.addDataSink("all-parts", od2);
    Configuration partsConf = new Configuration(tezConf);
    DataSinkDescriptor od1 = MROutput.createConfigBuilder(partsConf, TextOutputFormat.class, outputPath + "-parts").build();
    VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
    unionVertex.addDataSink("parts", od1);
    OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName()).build();
    dag.addVertex(mapVertex1).addVertex(mapVertex2).addVertex(mapVertex3).addVertex(checkerVertex).addEdge(Edge.create(mapVertex3, checkerVertex, edgeConf.createDefaultEdgeProperty())).addEdge(GroupInputEdge.create(unionVertex, checkerVertex, edgeConf.createDefaultEdgeProperty(), InputDescriptor.create(ConcatenatedMergedKeyValuesInput.class.getName())));
    return dag;
}
Also used: MRInput (org.apache.tez.mapreduce.input.MRInput), OrderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig), Vertex (org.apache.tez.dag.api.Vertex), Configuration (org.apache.hadoop.conf.Configuration), TezConfiguration (org.apache.tez.dag.api.TezConfiguration), DAG (org.apache.tez.dag.api.DAG), DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor), VertexGroup (org.apache.tez.dag.api.VertexGroup), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), TextOutputFormat (org.apache.hadoop.mapred.TextOutputFormat), ConcatenatedMergedKeyValuesInput (org.apache.tez.runtime.library.input.ConcatenatedMergedKeyValuesInput), DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor)
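
Note that all three map vertices share a single DataSourceDescriptor, so each one scans the same input independently. A short sketch of that sharing pattern, reusing the example's inputConf and TokenProcessor:

DataSourceDescriptor shared = MRInput.createConfigBuilder(inputConf, null).generateSplitsInAM(false).build();
// The -1 parallelism is resolved from the client-generated splits at submission time.
Vertex map1 = Vertex.create("map1", ProcessorDescriptor.create(TokenProcessor.class.getName()), -1).addDataSource("MRInput", shared);
Vertex map2 = Vertex.create("map2", ProcessorDescriptor.create(TokenProcessor.class.getName()), -1).addDataSource("MRInput", shared);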

Example 14 with DataSourceDescriptor

Use of org.apache.tez.dag.api.DataSourceDescriptor in project hive by apache.

The class DagUtils, method createVertex.

/*
   * Helper function to create Vertex from MapWork.
   */
private Vertex createVertex(JobConf conf, MapWork mapWork, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType) throws Exception {
    Path tezDir = getTezDir(mrScratchDir);
    // set up the operator plan
    Utilities.cacheMapWork(conf, mapWork, mrScratchDir);
    // create the directories FileSinkOperators need
    Utilities.createTmpDirs(conf, mapWork);
    // finally create the vertex
    Vertex map = null;
    // use tez to combine splits
    boolean groupSplitsInInputInitializer;
    DataSourceDescriptor dataSource;
    int numTasks = -1;
    @SuppressWarnings("rawtypes") Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);
    boolean vertexHasCustomInput = VertexType.isCustomInputType(vertexType);
    LOG.info("Vertex has custom input? " + vertexHasCustomInput);
    if (vertexHasCustomInput) {
        groupSplitsInInputInitializer = false;
        // grouping happens in execution phase. The input payload should not enable grouping here,
        // it will be enabled in the CustomVertex.
        inputFormatClass = HiveInputFormat.class;
        conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
        // mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
        // this plug-in to avoid getting a serialized event at run-time.
        conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
    } else {
        // Group splits in the input initializer only when using HiveInputFormat.
        groupSplitsInInputInitializer = (inputFormatClass == HiveInputFormat.class);
    }
    if (mapWork instanceof MergeFileWork) {
        Path outputPath = ((MergeFileWork) mapWork).getOutputDir();
        // prepare the tmp output directory. The output tmp directory should
        // exist before jobClose (before renaming after job completion)
        Path tempOutPath = Utilities.toTempPath(outputPath);
        try {
            FileSystem tmpOutFS = tempOutPath.getFileSystem(conf);
            if (!tmpOutFS.exists(tempOutPath)) {
                tmpOutFS.mkdirs(tempOutPath);
            }
        } catch (IOException e) {
            throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage(), e);
        }
    }
    // remember mapping of plan to input
    conf.set(Utilities.INPUT_NAME, mapWork.getName());
    if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
        // set up the operator plan. (before setting up splits on the AM)
        Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
        // If we're generating the splits in the AM, we just need to set the correct plugin.
        if (groupSplitsInInputInitializer) {
            // Not setting a payload, since the MRInput payload is the same and can be accessed.
            InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
            dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(true).setCustomInitializerDescriptor(descriptor).build();
        } else {
            // Not HiveInputFormat, or a custom VertexManager will take care of grouping splits
            if (vertexHasCustomInput) {
                dataSource = MultiMRInput.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
            } else {
                dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
            }
        }
    } else {
        // Setup client side split generation.
        // we need to set this, because with HS2 and client side split
        // generation we end up not finding the map work. This is
        // because of thread local madness (tez split generation is
        // multi-threaded - HS2 plan cache uses thread locals). Setting
        // VECTOR_MODE/USE_VECTORIZED_INPUT_FILE_FORMAT causes the split gen code to use the conf instead
        // of the map work.
        conf.setBoolean(Utilities.VECTOR_MODE, mapWork.getVectorMode());
        conf.setBoolean(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, mapWork.getUseVectorizedInputFileFormat());
        dataSource = MRInputHelpers.configureMRInputWithLegacySplitGeneration(conf, new Path(tezDir, "split_" + mapWork.getName().replaceAll(" ", "_")), true);
        numTasks = dataSource.getNumberOfShards();
        // set up the operator plan. (after generating splits - that changes configs)
        Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
    }
    UserPayload serializedConf = TezUtils.createUserPayloadFromConf(conf);
    String procClassName = MapTezProcessor.class.getName();
    if (mapWork instanceof MergeFileWork) {
        procClassName = MergeFileTezProcessor.class.getName();
    }
    VertexExecutionContext executionContext = createVertexExecutionContext(mapWork);
    map = Vertex.create(mapWork.getName(), ProcessorDescriptor.create(procClassName).setUserPayload(serializedConf), numTasks, getContainerResource(conf));
    map.setTaskEnvironment(getContainerEnvironment(conf, true));
    map.setExecutionContext(executionContext);
    map.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
    assert mapWork.getAliasToWork().keySet().size() == 1;
    // Add the actual source input
    String alias = mapWork.getAliasToWork().keySet().iterator().next();
    map.addDataSource(alias, dataSource);
    Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
    localResources.put(getBaseName(appJarLr), appJarLr);
    for (LocalResource lr : additionalLr) {
        localResources.put(getBaseName(lr), lr);
    }
    map.addTaskLocalFiles(localResources);
    return map;
}
Also used: Path (org.apache.hadoop.fs.Path), Vertex (org.apache.tez.dag.api.Vertex), PreWarmVertex (org.apache.tez.dag.api.PreWarmVertex), MergeFileWork (org.apache.hadoop.hive.ql.io.merge.MergeFileWork), UserPayload (org.apache.tez.dag.api.UserPayload), VertexExecutionContext (org.apache.tez.dag.api.Vertex.VertexExecutionContext), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), HashMap (java.util.HashMap), IOException (java.io.IOException), LocalResource (org.apache.hadoop.yarn.api.records.LocalResource), FileSystem (org.apache.hadoop.fs.FileSystem), InputInitializerDescriptor (org.apache.tez.dag.api.InputInitializerDescriptor), DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor)
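
The client-side fallback at the end of the method is easy to miss among the branches; isolated, it is just two calls. A minimal sketch, where splitsDir is an assumed staging Path:

// Generate splits up front under splitsDir; the shard count then pins the vertex parallelism.
DataSourceDescriptor dataSource = MRInputHelpers.configureMRInputWithLegacySplitGeneration(conf, splitsDir, true);
int numTasks = dataSource.getNumberOfShards();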

Example 15 with DataSourceDescriptor

Use of org.apache.tez.dag.api.DataSourceDescriptor in project hive by apache.

The class DagUtils, method createVertex.

/*
   * Helper function to create Vertex from MapWork.
   */
private Vertex createVertex(JobConf conf, MapWork mapWork, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType, Map<String, LocalResource> localResources) throws Exception {
    // set up the operator plan
    Utilities.cacheMapWork(conf, mapWork, mrScratchDir);
    // create the directories FileSinkOperators need
    Utilities.createTmpDirs(conf, mapWork);
    // finally create the vertex
    Vertex map = null;
    // use tez to combine splits
    boolean groupSplitsInInputInitializer;
    DataSourceDescriptor dataSource;
    int numTasks = -1;
    @SuppressWarnings("rawtypes") Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);
    boolean vertexHasCustomInput = VertexType.isCustomInputType(vertexType);
    LOG.info("Vertex has custom input? " + vertexHasCustomInput);
    if (vertexHasCustomInput) {
        groupSplitsInInputInitializer = false;
        // grouping happens in execution phase. The input payload should not enable grouping here,
        // it will be enabled in the CustomVertex.
        inputFormatClass = HiveInputFormat.class;
        conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
        // mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
        // this plug-in to avoid getting a serialized event at run-time.
        conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
    } else {
        // Group splits in the input initializer only when using HiveInputFormat.
        groupSplitsInInputInitializer = (inputFormatClass == HiveInputFormat.class);
    }
    if (mapWork instanceof MergeFileWork) {
        Path outputPath = ((MergeFileWork) mapWork).getOutputDir();
        // prepare the tmp output directory. The output tmp directory should
        // exist before jobClose (before renaming after job completion)
        Path tempOutPath = Utilities.toTempPath(outputPath);
        try {
            FileSystem tmpOutFS = tempOutPath.getFileSystem(conf);
            if (!tmpOutFS.exists(tempOutPath)) {
                tmpOutFS.mkdirs(tempOutPath);
            }
        } catch (IOException e) {
            throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage(), e);
        }
    }
    // remember mapping of plan to input
    conf.set(Utilities.INPUT_NAME, mapWork.getName());
    if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
        // set up the operator plan. (before setting up splits on the AM)
        Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
        // If we're generating the splits in the AM, we just need to set the correct plugin.
        if (groupSplitsInInputInitializer) {
            // Not setting a payload, since the MRInput payload is the same and can be accessed.
            InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
            dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(true).setCustomInitializerDescriptor(descriptor).build();
        } else {
            // Not HiveInputFormat, or a custom VertexManager will take care of grouping splits
            if (vertexHasCustomInput && vertexType == VertexType.MULTI_INPUT_UNINITIALIZED_EDGES) {
                // SMB Join.
                dataSource = MultiMRInput.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
            } else {
                dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
            }
        }
    } else {
        // Setup client side split generation.
        // we need to set this, because with HS2 and client side split
        // generation we end up not finding the map work. This is
        // because of thread local madness (tez split generation is
        // multi-threaded - HS2 plan cache uses thread locals). Setting
        // VECTOR_MODE/USE_VECTORIZED_INPUT_FILE_FORMAT causes the split gen code to use the conf instead
        // of the map work.
        conf.setBoolean(Utilities.VECTOR_MODE, mapWork.getVectorMode());
        conf.setBoolean(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, mapWork.getUseVectorizedInputFileFormat());
        InputSplitInfo inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(conf, false, 0);
        InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName());
        InputDescriptor inputDescriptor = InputDescriptor.create(MRInputLegacy.class.getName()).setUserPayload(UserPayload.create(MRRuntimeProtos.MRInputUserPayloadProto.newBuilder().setConfigurationBytes(TezUtils.createByteStringFromConf(conf)).setSplits(inputSplitInfo.getSplitsProto()).build().toByteString().asReadOnlyByteBuffer()));
        dataSource = DataSourceDescriptor.create(inputDescriptor, descriptor, null);
        numTasks = inputSplitInfo.getNumTasks();
        // set up the operator plan. (after generating splits - that changes configs)
        Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
    }
    UserPayload serializedConf = TezUtils.createUserPayloadFromConf(conf);
    String procClassName = MapTezProcessor.class.getName();
    if (mapWork instanceof MergeFileWork) {
        procClassName = MergeFileTezProcessor.class.getName();
    }
    VertexExecutionContext executionContext = createVertexExecutionContext(mapWork);
    map = Vertex.create(mapWork.getName(), ProcessorDescriptor.create(procClassName).setUserPayload(serializedConf), numTasks, getContainerResource(conf));
    map.setTaskEnvironment(getContainerEnvironment(conf, true));
    map.setExecutionContext(executionContext);
    map.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
    assert mapWork.getAliasToWork().keySet().size() == 1;
    // Add the actual source input
    String alias = mapWork.getAliasToWork().keySet().iterator().next();
    map.addDataSource(alias, dataSource);
    map.addTaskLocalFiles(localResources);
    return map;
}
Also used: Path (org.apache.hadoop.fs.Path), InputDescriptor (org.apache.tez.dag.api.InputDescriptor), Vertex (org.apache.tez.dag.api.Vertex), PreWarmVertex (org.apache.tez.dag.api.PreWarmVertex), MergeFileWork (org.apache.hadoop.hive.ql.io.merge.MergeFileWork), UserPayload (org.apache.tez.dag.api.UserPayload), VertexExecutionContext (org.apache.tez.dag.api.Vertex.VertexExecutionContext), InputSplitInfo (org.apache.tez.mapreduce.hadoop.InputSplitInfo), IOException (java.io.IOException), MRInputSplitDistributor (org.apache.tez.mapreduce.common.MRInputSplitDistributor), FileSystem (org.apache.hadoop.fs.FileSystem), InputInitializerDescriptor (org.apache.tez.dag.api.InputInitializerDescriptor), DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor)
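
Unlike the previous example, which delegates to configureMRInputWithLegacySplitGeneration, this version builds the legacy input by hand. Restated for readability (conf is the vertex's JobConf, as above):

// Compute splits in client memory and serialize them into the MRInputLegacy payload;
// MRInputSplitDistributor then hands each task its split at runtime.
InputSplitInfo splitInfo = MRInputHelpers.generateInputSplitsToMem(conf, false, 0);
MRRuntimeProtos.MRInputUserPayloadProto payloadProto = MRRuntimeProtos.MRInputUserPayloadProto.newBuilder().setConfigurationBytes(TezUtils.createByteStringFromConf(conf)).setSplits(splitInfo.getSplitsProto()).build();
InputDescriptor input = InputDescriptor.create(MRInputLegacy.class.getName()).setUserPayload(UserPayload.create(payloadProto.toByteString().asReadOnlyByteBuffer()));
InputInitializerDescriptor distributor = InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName());
DataSourceDescriptor dataSource = DataSourceDescriptor.create(input, distributor, null);
int numTasks = splitInfo.getNumTasks();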

Aggregations

DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor) 24
Vertex (org.apache.tez.dag.api.Vertex) 14
Configuration (org.apache.hadoop.conf.Configuration) 10
Path (org.apache.hadoop.fs.Path) 10
DAG (org.apache.tez.dag.api.DAG) 10
UserPayload (org.apache.tez.dag.api.UserPayload) 10
LocalResource (org.apache.hadoop.yarn.api.records.LocalResource) 8
IOException (java.io.IOException) 7
FileSystem (org.apache.hadoop.fs.FileSystem) 7
DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor) 7
TezConfiguration (org.apache.tez.dag.api.TezConfiguration) 7
Test (org.junit.Test) 7
IntWritable (org.apache.hadoop.io.IntWritable) 5
Text (org.apache.hadoop.io.Text) 5
JobConf (org.apache.hadoop.mapred.JobConf) 5
InputDescriptor (org.apache.tez.dag.api.InputDescriptor) 5
InputInitializerDescriptor (org.apache.tez.dag.api.InputInitializerDescriptor) 5
TezUncheckedException (org.apache.tez.dag.api.TezUncheckedException) 5
OrderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig) 5
TezClient (org.apache.tez.client.TezClient) 4