Use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.
The class TestMRRJobsDAGApi, method testMRRSleepJobDagSubmitCore.
public State testMRRSleepJobDagSubmitCore(boolean dagViaRPC, boolean killDagWhileRunning,
    boolean closeSessionBeforeSubmit, TezClient reUseTezSession, boolean genSplitsInAM,
    Class<? extends InputInitializer> initializerClass,
    Map<String, LocalResource> additionalLocalResources) throws IOException,
    InterruptedException, TezException, ClassNotFoundException, YarnException {
  LOG.info("\n\n\nStarting testMRRSleepJobDagSubmit().");
  JobConf stage1Conf = new JobConf(mrrTezCluster.getConfig());
  JobConf stage2Conf = new JobConf(mrrTezCluster.getConfig());
  JobConf stage3Conf = new JobConf(mrrTezCluster.getConfig());
  stage1Conf.setLong(MRRSleepJob.MAP_SLEEP_TIME, 1);
  stage1Conf.setInt(MRRSleepJob.MAP_SLEEP_COUNT, 1);
  stage1Conf.setInt(MRJobConfig.NUM_MAPS, 1);
  stage1Conf.set(MRJobConfig.MAP_CLASS_ATTR, SleepMapper.class.getName());
  stage1Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
  stage1Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
  stage1Conf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, SleepInputFormat.class.getName());
  stage1Conf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());
  stage2Conf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, 1);
  stage2Conf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, 1);
  stage2Conf.setInt(MRJobConfig.NUM_REDUCES, 1);
  stage2Conf.set(MRJobConfig.REDUCE_CLASS_ATTR, ISleepReducer.class.getName());
  stage2Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
  stage2Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
  stage2Conf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());
  stage3Conf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, 1);
  stage3Conf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, 1);
  stage3Conf.setInt(MRJobConfig.NUM_REDUCES, 1);
  stage3Conf.set(MRJobConfig.REDUCE_CLASS_ATTR, SleepReducer.class.getName());
  stage3Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
  stage3Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
  MRHelpers.translateMRConfToTez(stage1Conf);
  MRHelpers.translateMRConfToTez(stage2Conf);
  MRHelpers.translateMRConfToTez(stage3Conf);
  MRHelpers.configureMRApiUsage(stage1Conf);
  MRHelpers.configureMRApiUsage(stage2Conf);
  MRHelpers.configureMRApiUsage(stage3Conf);
  Path remoteStagingDir =
      remoteFs.makeQualified(new Path("/tmp", String.valueOf(new Random().nextInt(100000))));
  TezClientUtils.ensureStagingDirExists(conf, remoteStagingDir);
  UserPayload stage1Payload = TezUtils.createUserPayloadFromConf(stage1Conf);
  UserPayload stage2Payload = TezUtils.createUserPayloadFromConf(stage2Conf);
  UserPayload stage3Payload = TezUtils.createUserPayloadFromConf(stage3Conf);
  DAG dag = DAG.create("testMRRSleepJobDagSubmit-" + random.nextInt(1000));
  Class<? extends InputInitializer> inputInitializerClazz = genSplitsInAM
      ? (initializerClass == null ? MRInputAMSplitGenerator.class : initializerClass)
      : null;
  LOG.info("Using initializer class: " + initializerClass);
  DataSourceDescriptor dsd;
  if (!genSplitsInAM) {
    dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(stage1Conf, remoteStagingDir, true);
  } else {
    if (initializerClass == null) {
      dsd = MRInputLegacy.createConfigBuilder(stage1Conf, SleepInputFormat.class).build();
    } else {
      InputInitializerDescriptor iid = InputInitializerDescriptor.create(inputInitializerClazz.getName());
      dsd = MRInputLegacy.createConfigBuilder(stage1Conf, SleepInputFormat.class)
          .setCustomInitializerDescriptor(iid).build();
    }
  }
  Vertex stage1Vertex = Vertex.create("map",
      ProcessorDescriptor.create(MapProcessor.class.getName()).setUserPayload(stage1Payload),
      dsd.getNumberOfShards(), Resource.newInstance(256, 1));
  stage1Vertex.addDataSource("MRInput", dsd);
  Vertex stage2Vertex = Vertex.create("ireduce",
      ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(stage2Payload),
      1, Resource.newInstance(256, 1));
  Vertex stage3Vertex = Vertex.create("reduce",
      ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(stage3Payload),
      1, Resource.newInstance(256, 1));
  stage3Conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT, true);
  DataSinkDescriptor dataSinkDescriptor =
      MROutputLegacy.createConfigBuilder(stage3Conf, NullOutputFormat.class).build();
  Assert.assertFalse(dataSinkDescriptor.getOutputDescriptor().getHistoryText().isEmpty());
  stage3Vertex.addDataSink("MROutput", dataSinkDescriptor);
  // TODO env, resources
  dag.addVertex(stage1Vertex);
  dag.addVertex(stage2Vertex);
  dag.addVertex(stage3Vertex);
  Edge edge1 = Edge.create(stage1Vertex, stage2Vertex,
      EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED,
          SchedulingType.SEQUENTIAL,
          OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(stage2Payload),
          InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(stage2Payload)));
  Edge edge2 = Edge.create(stage2Vertex, stage3Vertex,
      EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED,
          SchedulingType.SEQUENTIAL,
          OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(stage3Payload),
          InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(stage3Payload)));
  dag.addEdge(edge1);
  dag.addEdge(edge2);
  TezConfiguration tezConf = new TezConfiguration(mrrTezCluster.getConfig());
  tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, remoteStagingDir.toString());
  DAGClient dagClient = null;
  boolean reuseSession = reUseTezSession != null;
  TezClient tezSession = null;
  if (!dagViaRPC) {
    Preconditions.checkArgument(reuseSession == false);
  }
  if (!reuseSession) {
    TezConfiguration tempTezconf = new TezConfiguration(tezConf);
    if (!dagViaRPC) {
      tempTezconf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, false);
    } else {
      tempTezconf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, true);
    }
    tezSession = TezClient.create("testsession", tempTezconf);
    tezSession.start();
  } else {
    tezSession = reUseTezSession;
  }
  if (!dagViaRPC) {
    // TODO Use utility method post TEZ-205 to figure out AM arguments etc.
    dagClient = tezSession.submitDAG(dag);
  }
  if (dagViaRPC && closeSessionBeforeSubmit) {
    YarnClient yarnClient = YarnClient.createYarnClient();
    yarnClient.init(mrrTezCluster.getConfig());
    yarnClient.start();
    boolean sentKillSession = false;
    while (true) {
      Thread.sleep(500L);
      ApplicationReport appReport =
          yarnClient.getApplicationReport(tezSession.getAppMasterApplicationId());
      if (appReport == null) {
        continue;
      }
      YarnApplicationState appState = appReport.getYarnApplicationState();
      if (!sentKillSession) {
        if (appState == YarnApplicationState.RUNNING) {
          tezSession.stop();
          sentKillSession = true;
        }
      } else {
        if (appState == YarnApplicationState.FINISHED || appState == YarnApplicationState.KILLED
            || appState == YarnApplicationState.FAILED) {
          LOG.info("Application completed after sending session shutdown"
              + ", yarnApplicationState=" + appState
              + ", finalAppStatus=" + appReport.getFinalApplicationStatus());
          Assert.assertEquals(YarnApplicationState.FINISHED, appState);
          Assert.assertEquals(FinalApplicationStatus.SUCCEEDED, appReport.getFinalApplicationStatus());
          break;
        }
      }
    }
    yarnClient.stop();
    return null;
  }
  if (dagViaRPC) {
    LOG.info("Submitting dag to tez session with appId=" + tezSession.getAppMasterApplicationId()
        + " and Dag Name=" + dag.getName());
    if (additionalLocalResources != null) {
      tezSession.addAppMasterLocalFiles(additionalLocalResources);
    }
    dagClient = tezSession.submitDAG(dag);
    Assert.assertEquals(TezAppMasterStatus.RUNNING, tezSession.getAppMasterStatus());
  }
  DAGStatus dagStatus = dagClient.getDAGStatus(null);
  while (!dagStatus.isCompleted()) {
    LOG.info("Waiting for job to complete. Sleeping for 500ms." + " Current state: "
        + dagStatus.getState());
    Thread.sleep(500L);
    if (killDagWhileRunning && dagStatus.getState() == DAGStatus.State.RUNNING) {
      LOG.info("Killing running dag/session");
      if (dagViaRPC) {
        tezSession.stop();
      } else {
        dagClient.tryKillDAG();
      }
    }
    dagStatus = dagClient.getDAGStatus(null);
  }
  if (!reuseSession) {
    tezSession.stop();
  }
  return dagStatus.getState();
}
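The DataSinkDescriptor-specific part of this long test is small. A minimal sketch of just that pattern, assuming an existing JobConf stageConf and Vertex v (illustrative names, not from the test):

// Enable history text so the sink's user payload is stored human-readable in
// the DAG history; the test's assertion on getHistoryText() relies on this.
stageConf.setBoolean(
    TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT, true);
// MROutputLegacy wraps an MR OutputFormat as a Tez output and returns a
// ready-to-attach DataSinkDescriptor.
DataSinkDescriptor sink =
    MROutputLegacy.createConfigBuilder(stageConf, NullOutputFormat.class).build();
v.addDataSink("MROutput", sink);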
Use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.
The class CartesianProduct, method createDAG.
private DAG createDAG(TezConfiguration tezConf) throws IOException {
  InputDescriptor inputDescriptor = InputDescriptor.create(FakeInput.class.getName());
  InputInitializerDescriptor inputInitializerDescriptor =
      InputInitializerDescriptor.create(FakeInputInitializer.class.getName());
  DataSourceDescriptor dataSourceDescriptor =
      DataSourceDescriptor.create(inputDescriptor, inputInitializerDescriptor, null);
  Vertex v1 = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v1.addDataSource(INPUT, dataSourceDescriptor);
  Vertex v2 = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v2.addDataSource(INPUT, dataSourceDescriptor);
  OutputDescriptor outputDescriptor = OutputDescriptor.create(FakeOutput.class.getName());
  OutputCommitterDescriptor outputCommitterDescriptor =
      OutputCommitterDescriptor.create(FakeOutputCommitter.class.getName());
  DataSinkDescriptor dataSinkDescriptor =
      DataSinkDescriptor.create(outputDescriptor, outputCommitterDescriptor, null);
  CartesianProductConfig cartesianProductConfig =
      new CartesianProductConfig(Arrays.asList(sourceVertices));
  UserPayload userPayload = cartesianProductConfig.toUserPayload(tezConf);
  Vertex v3 = Vertex.create(VERTEX3, ProcessorDescriptor.create(JoinProcessor.class.getName()));
  v3.addDataSink(OUTPUT, dataSinkDescriptor);
  v3.setVertexManagerPlugin(VertexManagerPluginDescriptor
      .create(CartesianProductVertexManager.class.getName()).setUserPayload(userPayload));
  EdgeManagerPluginDescriptor edgeManagerDescriptor =
      EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
  edgeManagerDescriptor.setUserPayload(userPayload);
  UnorderedPartitionedKVEdgeConfig edgeConf = UnorderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          RoundRobinPartitioner.class.getName()).build();
  EdgeProperty edgeProperty = edgeConf.createDefaultCustomEdgeProperty(edgeManagerDescriptor);
  return DAG.create("CrossProduct").addVertex(v1).addVertex(v2).addVertex(v3)
      .addEdge(Edge.create(v1, v3, edgeProperty)).addEdge(Edge.create(v2, v3, edgeProperty));
}
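Unlike the MR-flavored examples, this one builds the DataSinkDescriptor directly from its parts rather than through a config builder. A condensed sketch of just that step (FakeOutput and FakeOutputCommitter are the example's own stand-in classes); the third argument to create() is the Credentials needed to write to the sink, and null means none are required:

DataSinkDescriptor sink = DataSinkDescriptor.create(
    OutputDescriptor.create(FakeOutput.class.getName()),
    OutputCommitterDescriptor.create(FakeOutputCommitter.class.getName()),
    null); // credentials: none needed for this fake output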
Use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.
The class UnionExample, method createDAG.
private DAG createDAG(FileSystem fs, TezConfiguration tezConf,
    Map<String, LocalResource> localResources, Path stagingDir, String inputPath,
    String outputPath) throws IOException {
  DAG dag = DAG.create("UnionExample");
  int numMaps = -1;
  Configuration inputConf = new Configuration(tezConf);
  inputConf.setBoolean("mapred.mapper.new-api", false);
  inputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
  inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
  MRInput.MRInputConfigBuilder configurer = MRInput.createConfigBuilder(inputConf, null);
  DataSourceDescriptor dataSource = configurer.generateSplitsInAM(false).build();
  Vertex mapVertex1 = Vertex.create("map1",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex mapVertex2 = Vertex.create("map2",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex mapVertex3 = Vertex.create("map3",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex checkerVertex = Vertex.create("checker",
      ProcessorDescriptor.create(UnionProcessor.class.getName()), 1);
  Configuration outputConf = new Configuration(tezConf);
  outputConf.setBoolean("mapred.reducer.new-api", false);
  outputConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
  outputConf.set(FileOutputFormat.OUTDIR, outputPath);
  DataSinkDescriptor od = MROutput.createConfigBuilder(outputConf, null).build();
  checkerVertex.addDataSink("union", od);
  Configuration allPartsConf = new Configuration(tezConf);
  DataSinkDescriptor od2 = MROutput.createConfigBuilder(allPartsConf,
      TextOutputFormat.class, outputPath + "-all-parts").build();
  checkerVertex.addDataSink("all-parts", od2);
  Configuration partsConf = new Configuration(tezConf);
  DataSinkDescriptor od1 = MROutput.createConfigBuilder(partsConf,
      TextOutputFormat.class, outputPath + "-parts").build();
  VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
  unionVertex.addDataSink("parts", od1);
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName()).build();
  dag.addVertex(mapVertex1).addVertex(mapVertex2).addVertex(mapVertex3).addVertex(checkerVertex)
      .addEdge(Edge.create(mapVertex3, checkerVertex, edgeConf.createDefaultEdgeProperty()))
      .addEdge(GroupInputEdge.create(unionVertex, checkerVertex,
          edgeConf.createDefaultEdgeProperty(),
          InputDescriptor.create(ConcatenatedMergedKeyValuesInput.class.getName())));
  return dag;
}
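Note the two ways a sink is attached here: od and od2 go to a single vertex, while od1 goes to a VertexGroup. A sketch of the group variant in isolation, under the same assumptions as the example (dag, mapVertex1, mapVertex2, tezConf, outputPath as above):

// A DataSinkDescriptor attached to a VertexGroup is shared by the member
// vertices, so their combined output lands in one committed location instead
// of one output per vertex.
VertexGroup group = dag.createVertexGroup("union", mapVertex1, mapVertex2);
group.addDataSink("parts",
    MROutput.createConfigBuilder(new Configuration(tezConf),
        TextOutputFormat.class, outputPath + "-parts").build());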
Use of org.apache.tez.dag.api.DataSinkDescriptor in project hive by apache.
The class DagUtils, method createVertex.
/**
 * Create a vertex from a given work object.
 *
 * @param conf JobConf to be used for this execution unit
 * @param workUnit The instance of BaseWork representing the actual work to be performed
 * by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @param tezWork The TezWork this unit belongs to; used to look up the vertex type and parent edges.
 * @param localResources Local resources to attach to the vertex's tasks.
 * @return Vertex
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork workUnit, Path scratchDir, TezWork tezWork,
    Map<String, LocalResource> localResources) throws Exception {
  Vertex vertex;
  // simply dispatch the call to the right method for the actual (sub-) type of
  // BaseWork.
  VertexType vertexType = tezWork.getVertexType(workUnit);
  if (workUnit instanceof MapWork) {
    vertex = createVertexFromMapWork(conf, (MapWork) workUnit, scratchDir, vertexType);
  } else if (workUnit instanceof ReduceWork) {
    vertex = createVertexFromReduceWork(conf, (ReduceWork) workUnit, scratchDir);
  } else if (workUnit instanceof MergeJoinWork) {
    vertex = createVertexFromMergeWork(conf, (MergeJoinWork) workUnit, scratchDir, vertexType);
    // set VertexManagerPlugin if it's a cross product destination vertex
    List<String> crossProductSources = new ArrayList<>();
    for (BaseWork parentWork : tezWork.getParents(workUnit)) {
      if (tezWork.getEdgeType(parentWork, workUnit) == EdgeType.XPROD_EDGE) {
        crossProductSources.add(parentWork.getName());
      }
    }
    if (!crossProductSources.isEmpty()) {
      CartesianProductConfig cpConfig = new CartesianProductConfig(crossProductSources);
      vertex.setVertexManagerPlugin(
          VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName())
              .setUserPayload(cpConfig.toUserPayload(new TezConfiguration(conf))));
      // parallelism shouldn't be set for cartesian product vertex
    }
  } else {
    // something is seriously wrong if this is happening
    throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
  }
  VertexExecutionContext vertexExecutionContext = createVertexExecutionContext(workUnit);
  vertex.addTaskLocalFiles(localResources);
  vertex.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
  vertex.setExecutionContext(vertexExecutionContext);
  // initialize stats publisher if necessary
  if (workUnit.isGatheringStats()) {
    StatsPublisher statsPublisher;
    StatsFactory factory = StatsFactory.newFactory(conf);
    if (factory != null) {
      StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
      sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(workUnit, conf));
      statsPublisher = factory.getStatsPublisher();
      if (!statsPublisher.init(sCntxt)) {
        // creating stats table if not exists
        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
          throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
        }
      }
    }
  }
  final Class outputKlass;
  if (HiveOutputFormatImpl.class.getName().equals(conf.get("mapred.output.format.class"))) {
    // Hive uses this output format when it is going to write all its data through the FS operator
    outputKlass = NullMROutput.class;
  } else {
    outputKlass = MROutput.class;
  }
  // If there is a fileSink, add a DataSink to the vertex
  boolean hasFileSink = workUnit.getAllOperators().stream()
      .anyMatch(o -> o instanceof FileSinkOperator);
  // final vertices need to have at least one output
  boolean endVertex = tezWork.getLeaves().contains(workUnit);
  if (endVertex || hasFileSink) {
    OutputCommitterDescriptor ocd = null;
    String committer = HiveConf.getVar(conf, ConfVars.TEZ_MAPREDUCE_OUTPUT_COMMITTER);
    if (committer != null && !committer.isEmpty()) {
      ocd = OutputCommitterDescriptor.create(committer);
    }
    vertex.addDataSink("out_" + workUnit.getName(),
        new DataSinkDescriptor(
            OutputDescriptor.create(outputKlass.getName())
                .setUserPayload(vertex.getProcessorDescriptor().getUserPayload()),
            ocd, null));
  }
  return vertex;
}
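This snippet constructs the descriptor with new DataSinkDescriptor(...), which is presumably why the method carries @SuppressWarnings("deprecation"). A sketch of the factory-method equivalent, assuming the same outputKlass, vertex, workUnit, and ocd as above:

// DataSinkDescriptor.create(output, committer, credentials) is the
// non-deprecated counterpart of the public constructor used in the snippet.
vertex.addDataSink("out_" + workUnit.getName(),
    DataSinkDescriptor.create(
        OutputDescriptor.create(outputKlass.getName())
            .setUserPayload(vertex.getProcessorDescriptor().getUserPayload()),
        ocd, null));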
Use of org.apache.tez.dag.api.DataSinkDescriptor in project hive by apache.
The class DagUtils, method createVertex (overload).
/**
 * Create a vertex from a given work object.
 *
 * @param conf JobConf to be used for this execution unit
 * @param work The instance of BaseWork representing the actual work to be performed
 * by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @param appJarLr Local resource for hive-exec.
 * @param additionalLr Additional local resources for this vertex.
 * @param fileSystem FS corresponding to scratchDir and LocalResources
 * @param ctx This query's context
 * @return Vertex
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork work, Path scratchDir, LocalResource appJarLr,
    List<LocalResource> additionalLr, FileSystem fileSystem, Context ctx, boolean hasChildren,
    TezWork tezWork, VertexType vertexType) throws Exception {
  Vertex v = null;
  // simply dispatch the call to the right method for the actual (sub-) type of
  // BaseWork.
  if (work instanceof MapWork) {
    v = createVertex(conf, (MapWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
  } else if (work instanceof ReduceWork) {
    v = createVertex(conf, (ReduceWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx);
  } else if (work instanceof MergeJoinWork) {
    v = createVertex(conf, (MergeJoinWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
  } else {
    // something is seriously wrong if this is happening
    throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
  }
  // initialize stats publisher if necessary
  if (work.isGatheringStats()) {
    StatsPublisher statsPublisher;
    StatsFactory factory = StatsFactory.newFactory(conf);
    if (factory != null) {
      StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
      sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(work, conf));
      statsPublisher = factory.getStatsPublisher();
      if (!statsPublisher.init(sCntxt)) {
        // creating stats table if not exists
        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
          throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
        }
      }
    }
  }
  // final vertices need to have at least one output
  if (!hasChildren) {
    v.addDataSink("out_" + work.getName(),
        new DataSinkDescriptor(
            OutputDescriptor.create(MROutput.class.getName())
                .setUserPayload(TezUtils.createUserPayloadFromConf(conf)),
            null, null));
  }
  return v;
}
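Both Hive variants serialize the JobConf into the output's UserPayload themselves rather than going through an MROutput config builder. A sketch of that step in isolation, assuming a populated JobConf conf:

// TezUtils.createUserPayloadFromConf serializes the Configuration into the
// UserPayload carried by the descriptor; the output (here MROutput) reads it
// back at task runtime.
UserPayload payload = TezUtils.createUserPayloadFromConf(conf);
DataSinkDescriptor sink = DataSinkDescriptor.create(
    OutputDescriptor.create(MROutput.class.getName()).setUserPayload(payload),
    null /* committer */, null /* credentials */);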