Search in sources :

Example 1 with OrderedPartitionedKVEdgeConfig

use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project hive by apache.

the class DagUtils method createEdgeProperty.

/*
   * Helper function to create an edge property from an edge type.
   */
private EdgeProperty createEdgeProperty(TezEdgeProperty edgeProp, Configuration conf) throws IOException {
    MRHelpers.translateMRConfToTez(conf);
    String keyClass = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS);
    String valClass = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS);
    String partitionerClassName = conf.get("mapred.partitioner.class");
    Map<String, String> partitionerConf;
    EdgeType edgeType = edgeProp.getEdgeType();
    switch(edgeType) {
        case BROADCAST_EDGE:
            UnorderedKVEdgeConfig et1Conf = UnorderedKVEdgeConfig.newBuilder(keyClass, valClass).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
            return et1Conf.createDefaultBroadcastEdgeProperty();
        case CUSTOM_EDGE:
            assert partitionerClassName != null;
            partitionerConf = createPartitionerConf(partitionerClassName, conf);
            UnorderedPartitionedKVEdgeConfig et2Conf = UnorderedPartitionedKVEdgeConfig.newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
            EdgeManagerPluginDescriptor edgeDesc = EdgeManagerPluginDescriptor.create(CustomPartitionEdge.class.getName());
            CustomEdgeConfiguration edgeConf = new CustomEdgeConfiguration(edgeProp.getNumBuckets(), null);
            DataOutputBuffer dob = new DataOutputBuffer();
            edgeConf.write(dob);
            byte[] userPayload = dob.getData();
            edgeDesc.setUserPayload(UserPayload.create(ByteBuffer.wrap(userPayload)));
            return et2Conf.createDefaultCustomEdgeProperty(edgeDesc);
        case CUSTOM_SIMPLE_EDGE:
            assert partitionerClassName != null;
            partitionerConf = createPartitionerConf(partitionerClassName, conf);
            UnorderedPartitionedKVEdgeConfig et3Conf = UnorderedPartitionedKVEdgeConfig.newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
            return et3Conf.createDefaultEdgeProperty();
        case SIMPLE_EDGE:
        default:
            assert partitionerClassName != null;
            partitionerConf = createPartitionerConf(partitionerClassName, conf);
            OrderedPartitionedKVEdgeConfig et4Conf = OrderedPartitionedKVEdgeConfig.newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), TezBytesComparator.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
            return et4Conf.createDefaultEdgeProperty();
    }
}
Also used : OrderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig) MRPartitioner(org.apache.tez.mapreduce.partition.MRPartitioner) EdgeType(org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType) TezBytesComparator(org.apache.tez.runtime.library.common.comparator.TezBytesComparator) UnorderedKVEdgeConfig(org.apache.tez.runtime.library.conf.UnorderedKVEdgeConfig) EdgeManagerPluginDescriptor(org.apache.tez.dag.api.EdgeManagerPluginDescriptor) DataOutputBuffer(org.apache.hadoop.io.DataOutputBuffer) UnorderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfig) TezBytesWritableSerialization(org.apache.tez.runtime.library.common.serializer.TezBytesWritableSerialization)

Example 2 with OrderedPartitionedKVEdgeConfig

use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project hive by apache.

the class DagUtils method createEdgeProperty.

/*
   * Helper function to create an edge property from an edge type.
   */
private EdgeProperty createEdgeProperty(Vertex w, TezEdgeProperty edgeProp, Configuration conf, BaseWork work, TezWork tezWork) throws IOException {
    MRHelpers.translateMRConfToTez(conf);
    String keyClass = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS);
    String valClass = conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS);
    String partitionerClassName = conf.get("mapred.partitioner.class");
    Map<String, String> partitionerConf;
    EdgeType edgeType = edgeProp.getEdgeType();
    switch(edgeType) {
        case BROADCAST_EDGE:
            UnorderedKVEdgeConfig et1Conf = UnorderedKVEdgeConfig.newBuilder(keyClass, valClass).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
            return et1Conf.createDefaultBroadcastEdgeProperty();
        case CUSTOM_EDGE:
            assert partitionerClassName != null;
            partitionerConf = createPartitionerConf(partitionerClassName, conf);
            UnorderedPartitionedKVEdgeConfig et2Conf = UnorderedPartitionedKVEdgeConfig.newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
            EdgeManagerPluginDescriptor edgeDesc = EdgeManagerPluginDescriptor.create(CustomPartitionEdge.class.getName());
            CustomEdgeConfiguration edgeConf = new CustomEdgeConfiguration(edgeProp.getNumBuckets(), null);
            DataOutputBuffer dob = new DataOutputBuffer();
            edgeConf.write(dob);
            byte[] userPayload = dob.getData();
            edgeDesc.setUserPayload(UserPayload.create(ByteBuffer.wrap(userPayload)));
            return et2Conf.createDefaultCustomEdgeProperty(edgeDesc);
        case CUSTOM_SIMPLE_EDGE:
            assert partitionerClassName != null;
            partitionerConf = createPartitionerConf(partitionerClassName, conf);
            UnorderedPartitionedKVEdgeConfig et3Conf = UnorderedPartitionedKVEdgeConfig.newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
            return et3Conf.createDefaultEdgeProperty();
        case ONE_TO_ONE_EDGE:
            UnorderedKVEdgeConfig et4Conf = UnorderedKVEdgeConfig.newBuilder(keyClass, valClass).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
            return et4Conf.createDefaultOneToOneEdgeProperty();
        case XPROD_EDGE:
            EdgeManagerPluginDescriptor edgeManagerDescriptor = EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
            List<String> crossProductSources = new ArrayList<>();
            for (BaseWork parentWork : tezWork.getParents(work)) {
                if (EdgeType.XPROD_EDGE == tezWork.getEdgeType(parentWork, work)) {
                    crossProductSources.add(parentWork.getName());
                }
            }
            CartesianProductConfig cpConfig = new CartesianProductConfig(crossProductSources);
            edgeManagerDescriptor.setUserPayload(cpConfig.toUserPayload(new TezConfiguration(conf)));
            UnorderedPartitionedKVEdgeConfig cpEdgeConf = UnorderedPartitionedKVEdgeConfig.newBuilder(keyClass, valClass, ValueHashPartitioner.class.getName()).build();
            return cpEdgeConf.createDefaultCustomEdgeProperty(edgeManagerDescriptor);
        case SIMPLE_EDGE:
        // fallthrough
        default:
            assert partitionerClassName != null;
            partitionerConf = createPartitionerConf(partitionerClassName, conf);
            OrderedPartitionedKVEdgeConfig et5Conf = OrderedPartitionedKVEdgeConfig.newBuilder(keyClass, valClass, MRPartitioner.class.getName(), partitionerConf).setFromConfiguration(conf).setKeySerializationClass(TezBytesWritableSerialization.class.getName(), TezBytesComparator.class.getName(), null).setValueSerializationClass(TezBytesWritableSerialization.class.getName(), null).build();
            return et5Conf.createDefaultEdgeProperty();
    }
}
Also used : OrderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig) ArrayList(java.util.ArrayList) MRPartitioner(org.apache.tez.mapreduce.partition.MRPartitioner) EdgeType(org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType) TezBytesComparator(org.apache.tez.runtime.library.common.comparator.TezBytesComparator) UnorderedKVEdgeConfig(org.apache.tez.runtime.library.conf.UnorderedKVEdgeConfig) EdgeManagerPluginDescriptor(org.apache.tez.dag.api.EdgeManagerPluginDescriptor) CartesianProductEdgeManager(org.apache.tez.runtime.library.cartesianproduct.CartesianProductEdgeManager) DataOutputBuffer(org.apache.hadoop.io.DataOutputBuffer) UnorderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfig) TezBytesWritableSerialization(org.apache.tez.runtime.library.common.serializer.TezBytesWritableSerialization) CartesianProductConfig(org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) TezConfiguration(org.apache.tez.dag.api.TezConfiguration)

Example 3 with OrderedPartitionedKVEdgeConfig

use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.

the class MRRSleepJob method createDAG.

public DAG createDAG(Configuration conf, Path stagingDir, int numMapper, int numReducer, int iReduceStagesCount, int numIReducer, long mapSleepTime, int mapSleepCount, long reduceSleepTime, int reduceSleepCount, long iReduceSleepTime, int iReduceSleepCount, boolean writeSplitsToDFS, boolean generateSplitsInAM) throws IOException, YarnException {
    Configuration mapStageConf = new JobConf(conf);
    mapStageConf.setInt(MRJobConfig.NUM_MAPS, numMapper);
    mapStageConf.setLong(MAP_SLEEP_TIME, mapSleepTime);
    mapStageConf.setLong(REDUCE_SLEEP_TIME, reduceSleepTime);
    mapStageConf.setLong(IREDUCE_SLEEP_TIME, iReduceSleepTime);
    mapStageConf.setInt(MAP_SLEEP_COUNT, mapSleepCount);
    mapStageConf.setInt(REDUCE_SLEEP_COUNT, reduceSleepCount);
    mapStageConf.setInt(IREDUCE_SLEEP_COUNT, iReduceSleepCount);
    mapStageConf.setInt(IREDUCE_STAGES_COUNT, iReduceStagesCount);
    mapStageConf.setInt(IREDUCE_TASKS_COUNT, numIReducer);
    mapStageConf.set(MRJobConfig.MAP_CLASS_ATTR, SleepMapper.class.getName());
    mapStageConf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, SleepInputFormat.class.getName());
    if (numIReducer == 0 && numReducer == 0) {
        mapStageConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, NullOutputFormat.class.getName());
    }
    MRHelpers.translateMRConfToTez(mapStageConf, false);
    Configuration[] intermediateReduceStageConfs = null;
    if (iReduceStagesCount > 0 && numIReducer > 0) {
        intermediateReduceStageConfs = new JobConf[iReduceStagesCount];
        for (int i = 1; i <= iReduceStagesCount; ++i) {
            JobConf iReduceStageConf = new JobConf(conf);
            iReduceStageConf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, iReduceSleepTime);
            iReduceStageConf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, iReduceSleepCount);
            iReduceStageConf.setInt(MRJobConfig.NUM_REDUCES, numIReducer);
            iReduceStageConf.set(MRJobConfig.REDUCE_CLASS_ATTR, ISleepReducer.class.getName());
            iReduceStageConf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
            iReduceStageConf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
            iReduceStageConf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());
            MRHelpers.translateMRConfToTez(iReduceStageConf, false);
            intermediateReduceStageConfs[i - 1] = iReduceStageConf;
        }
    }
    Configuration finalReduceConf = null;
    if (numReducer > 0) {
        finalReduceConf = new JobConf(conf);
        finalReduceConf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, reduceSleepTime);
        finalReduceConf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, reduceSleepCount);
        finalReduceConf.setInt(MRJobConfig.NUM_REDUCES, numReducer);
        finalReduceConf.set(MRJobConfig.REDUCE_CLASS_ATTR, SleepReducer.class.getName());
        finalReduceConf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
        finalReduceConf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
        finalReduceConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, NullOutputFormat.class.getName());
        MRHelpers.translateMRConfToTez(finalReduceConf, false);
    }
    MRHelpers.configureMRApiUsage(mapStageConf);
    if (iReduceStagesCount > 0 && numIReducer > 0) {
        for (int i = 0; i < iReduceStagesCount; ++i) {
            MRHelpers.configureMRApiUsage(intermediateReduceStageConfs[i]);
        }
    }
    if (numReducer > 0) {
        MRHelpers.configureMRApiUsage(finalReduceConf);
    }
    DataSourceDescriptor dataSource = null;
    if (!generateSplitsInAM && writeSplitsToDFS) {
        LOG.info("Writing splits to DFS");
        dataSource = MRInputHelpers.configureMRInputWithLegacySplitGeneration(mapStageConf, stagingDir, true);
    } else {
        dataSource = MRInputLegacy.createConfigBuilder(mapStageConf, SleepInputFormat.class).generateSplitsInAM(generateSplitsInAM).build();
    }
    DAG dag = DAG.create("MRRSleepJob");
    String jarPath = ClassUtil.findContainingJar(getClass());
    if (jarPath == null) {
        throw new TezUncheckedException("Could not find any jar containing" + " MRRSleepJob.class in the classpath");
    }
    FileSystem stagingFs = stagingDir.getFileSystem(conf);
    Path remoteJarPath = new Path(stagingDir, "dag_job.jar");
    stagingFs.copyFromLocalFile(new Path(jarPath), remoteJarPath);
    FileStatus jarFileStatus = stagingFs.getFileStatus(remoteJarPath);
    TokenCache.obtainTokensForNamenodes(this.credentials, new Path[] { remoteJarPath }, mapStageConf);
    Map<String, LocalResource> commonLocalResources = new HashMap<String, LocalResource>();
    LocalResource dagJarLocalRsrc = LocalResource.newInstance(ConverterUtils.getYarnUrlFromPath(remoteJarPath), LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, jarFileStatus.getLen(), jarFileStatus.getModificationTime());
    commonLocalResources.put("dag_job.jar", dagJarLocalRsrc);
    List<Vertex> vertices = new ArrayList<Vertex>();
    UserPayload mapUserPayload = TezUtils.createUserPayloadFromConf(mapStageConf);
    int numTasks = generateSplitsInAM ? -1 : numMapper;
    Map<String, String> mapEnv = Maps.newHashMap();
    MRHelpers.updateEnvBasedOnMRTaskEnv(mapStageConf, mapEnv, true);
    Map<String, String> reduceEnv = Maps.newHashMap();
    MRHelpers.updateEnvBasedOnMRTaskEnv(mapStageConf, reduceEnv, false);
    Vertex mapVertex = Vertex.create("map", ProcessorDescriptor.create(MapProcessor.class.getName()).setUserPayload(mapUserPayload), numTasks, MRHelpers.getResourceForMRMapper(mapStageConf));
    mapVertex.addTaskLocalFiles(commonLocalResources).addDataSource("MRInput", dataSource).setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRMapper(mapStageConf)).setTaskEnvironment(mapEnv);
    vertices.add(mapVertex);
    if (iReduceStagesCount > 0 && numIReducer > 0) {
        for (int i = 0; i < iReduceStagesCount; ++i) {
            Configuration iconf = intermediateReduceStageConfs[i];
            UserPayload iReduceUserPayload = TezUtils.createUserPayloadFromConf(iconf);
            Vertex ivertex = Vertex.create("ireduce" + (i + 1), ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(iReduceUserPayload), numIReducer, MRHelpers.getResourceForMRReducer(intermediateReduceStageConfs[i]));
            ivertex.addTaskLocalFiles(commonLocalResources).setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRReducer(intermediateReduceStageConfs[i])).setTaskEnvironment(reduceEnv);
            vertices.add(ivertex);
        }
    }
    Vertex finalReduceVertex = null;
    if (numReducer > 0) {
        UserPayload reducePayload = TezUtils.createUserPayloadFromConf(finalReduceConf);
        finalReduceVertex = Vertex.create("reduce", ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(reducePayload), numReducer, MRHelpers.getResourceForMRReducer(finalReduceConf));
        finalReduceVertex.addTaskLocalFiles(commonLocalResources).addDataSink("MROutput", MROutputLegacy.createConfigBuilder(finalReduceConf, NullOutputFormat.class).build()).setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRReducer(finalReduceConf)).setTaskEnvironment(reduceEnv);
        vertices.add(finalReduceVertex);
    } else {
        // Map only job
        mapVertex.addDataSink("MROutput", MROutputLegacy.createConfigBuilder(mapStageConf, NullOutputFormat.class).build());
    }
    for (int i = 0; i < vertices.size(); ++i) {
        dag.addVertex(vertices.get(i));
        if (i != 0) {
            Map<String, String> partitionerConf = Maps.newHashMap();
            partitionerConf.put(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());
            Configuration edgeConfiguration = ((i + 1) == vertices.size()) ? finalReduceConf : intermediateReduceStageConfs[i - 1];
            OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig.newBuilder(IntWritable.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName(), partitionerConf).configureInput().useLegacyInput().done().setFromConfiguration(edgeConfiguration).build();
            dag.addEdge(Edge.create(vertices.get(i - 1), vertices.get(i), edgeConf.createDefaultEdgeProperty()));
        }
    }
    return dag;
}
Also used : OrderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig) Vertex(org.apache.tez.dag.api.Vertex) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf) IntWritable(org.apache.hadoop.io.IntWritable) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor) MapProcessor(org.apache.tez.mapreduce.processor.map.MapProcessor) Path(org.apache.hadoop.fs.Path) TezUncheckedException(org.apache.tez.dag.api.TezUncheckedException) UserPayload(org.apache.tez.dag.api.UserPayload) DAG(org.apache.tez.dag.api.DAG) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) ReduceProcessor(org.apache.tez.mapreduce.processor.reduce.ReduceProcessor) NullOutputFormat(org.apache.hadoop.mapreduce.lib.output.NullOutputFormat)

Example 4 with OrderedPartitionedKVEdgeConfig

use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.

the class MultipleCommitsExample method createDAG.

private DAG createDAG(TezConfiguration tezConf, String v1OutputPathPrefix, int v1OutputNum, String v2OutputPathPrefix, int v2OutputNum, String uv12OutputPathPrefix, int uv12OutputNum, String v3OutputPathPrefix, int v3OutputNum, boolean commitOnVertexSuccess) throws IOException {
    DAG dag = DAG.create("multipleCommitsDAG");
    dag.setConf(TezConfiguration.TEZ_AM_COMMIT_ALL_OUTPUTS_ON_DAG_SUCCESS, !commitOnVertexSuccess + "");
    Vertex v1 = Vertex.create("v1", ProcessorDescriptor.create(MultipleOutputProcessor.class.getName()).setUserPayload(new MultipleOutputProcessor.MultipleOutputProcessorConfig(V1OutputNamePrefix, v1OutputNum, UV12OutputNamePrefix, uv12OutputNum).toUserPayload()), 2);
    Vertex v2 = Vertex.create("v2", ProcessorDescriptor.create(MultipleOutputProcessor.class.getName()).setUserPayload(new MultipleOutputProcessor.MultipleOutputProcessorConfig(V2OutputNamePrefix, v2OutputNum, UV12OutputNamePrefix, uv12OutputNum).toUserPayload()), 2);
    // add data sinks for v1
    for (int i = 0; i < v1OutputNum; ++i) {
        DataSinkDescriptor sink = MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, v1OutputPathPrefix + "_" + i).build();
        v1.addDataSink(V1OutputNamePrefix + "_" + i, sink);
    }
    // add data sinks for v2
    for (int i = 0; i < v2OutputNum; ++i) {
        DataSinkDescriptor sink = MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, v2OutputPathPrefix + "_" + i).build();
        v2.addDataSink(V2OutputNamePrefix + "_" + i, sink);
    }
    // add data sinks for (v1,v2)
    VertexGroup uv12 = dag.createVertexGroup("uv12", v1, v2);
    for (int i = 0; i < uv12OutputNum; ++i) {
        DataSinkDescriptor sink = MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, uv12OutputPathPrefix + "_" + i).build();
        uv12.addDataSink(UV12OutputNamePrefix + "_" + i, sink);
    }
    Vertex v3 = Vertex.create("v3", ProcessorDescriptor.create(MultipleOutputProcessor.class.getName()).setUserPayload(new MultipleOutputProcessor.MultipleOutputProcessorConfig(V3OutputNamePrefix, v3OutputNum).toUserPayload()), 2);
    // add data sinks for v3
    for (int i = 0; i < v3OutputNum; ++i) {
        DataSinkDescriptor sink = MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, v3OutputPathPrefix + "_" + i).build();
        v3.addDataSink(V3OutputNamePrefix + "_" + i, sink);
    }
    OrderedPartitionedKVEdgeConfig edgeConfig = OrderedPartitionedKVEdgeConfig.newBuilder(NullWritable.class.getName(), Text.class.getName(), HashPartitioner.class.getName()).setFromConfiguration(tezConf).build();
    GroupInputEdge edge = GroupInputEdge.create(uv12, v3, edgeConfig.createDefaultEdgeProperty(), InputDescriptor.create(ConcatenatedMergedKeyValuesInput.class.getName()));
    dag.addVertex(v1).addVertex(v2).addVertex(v3).addEdge(edge);
    return dag;
}
Also used : OrderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig) Vertex(org.apache.tez.dag.api.Vertex) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) Text(org.apache.hadoop.io.Text) DAG(org.apache.tez.dag.api.DAG) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) NullWritable(org.apache.hadoop.io.NullWritable) VertexGroup(org.apache.tez.dag.api.VertexGroup) TextOutputFormat(org.apache.hadoop.mapred.TextOutputFormat) HashPartitioner(org.apache.tez.runtime.library.partitioner.HashPartitioner) GroupInputEdge(org.apache.tez.dag.api.GroupInputEdge)

Example 5 with OrderedPartitionedKVEdgeConfig

use of org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig in project tez by apache.

the class TestOrderedWordCount method createDAG.

@VisibleForTesting
public DAG createDAG(FileSystem fs, Configuration conf, Map<String, LocalResource> commonLocalResources, Path stagingDir, int dagIndex, String inputPath, String outputPath, boolean generateSplitsInClient, boolean useMRSettings, int intermediateNumReduceTasks, int maxDataLengthThroughIPC, int exceedDataLimit) throws Exception {
    Configuration mapStageConf = new JobConf(conf);
    mapStageConf.set(MRJobConfig.MAP_CLASS_ATTR, TokenizerMapper.class.getName());
    MRHelpers.translateMRConfToTez(mapStageConf, !useMRSettings);
    Configuration iReduceStageConf = new JobConf(conf);
    // TODO replace with auto-reduce parallelism
    iReduceStageConf.setInt(MRJobConfig.NUM_REDUCES, 2);
    iReduceStageConf.set(MRJobConfig.REDUCE_CLASS_ATTR, IntSumReducer.class.getName());
    iReduceStageConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, Text.class.getName());
    iReduceStageConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS, IntWritable.class.getName());
    iReduceStageConf.setBoolean("mapred.mapper.new-api", true);
    MRHelpers.translateMRConfToTez(iReduceStageConf, !useMRSettings);
    Configuration finalReduceConf = new JobConf(conf);
    finalReduceConf.setInt(MRJobConfig.NUM_REDUCES, 1);
    finalReduceConf.set(MRJobConfig.REDUCE_CLASS_ATTR, MyOrderByNoOpReducer.class.getName());
    finalReduceConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, IntWritable.class.getName());
    finalReduceConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS, Text.class.getName());
    MRHelpers.translateMRConfToTez(finalReduceConf, !useMRSettings);
    MRHelpers.configureMRApiUsage(mapStageConf);
    MRHelpers.configureMRApiUsage(iReduceStageConf);
    MRHelpers.configureMRApiUsage(finalReduceConf);
    List<Vertex> vertices = new ArrayList<Vertex>();
    String mapStageHistoryText = TezUtils.convertToHistoryText("Initial Tokenizer Vertex", mapStageConf);
    DataSourceDescriptor dsd;
    if (generateSplitsInClient) {
        mapStageConf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class.getName());
        mapStageConf.set(FileInputFormat.INPUT_DIR, inputPath);
        mapStageConf.setBoolean("mapred.mapper.new-api", true);
        dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(mapStageConf, stagingDir, true);
    } else {
        dsd = MRInputLegacy.createConfigBuilder(mapStageConf, TextInputFormat.class, inputPath).build();
    }
    dsd.getInputDescriptor().setHistoryText(TezUtils.convertToHistoryText("HDFS Input " + inputPath, mapStageConf));
    Map<String, String> mapEnv = Maps.newHashMap();
    MRHelpers.updateEnvBasedOnMRTaskEnv(mapStageConf, mapEnv, true);
    Map<String, String> reduceEnv = Maps.newHashMap();
    MRHelpers.updateEnvBasedOnMRTaskEnv(mapStageConf, reduceEnv, false);
    Configuration copyMapStageConf = new Configuration(mapStageConf);
    setMaxDataLengthConf(copyMapStageConf, maxDataLengthThroughIPC, exceedDataLimit);
    Vertex mapVertex;
    ProcessorDescriptor mapProcessorDescriptor = ProcessorDescriptor.create(MapProcessor.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(copyMapStageConf)).setHistoryText(mapStageHistoryText);
    if (!useMRSettings) {
        mapVertex = Vertex.create("initialmap", mapProcessorDescriptor);
    } else {
        mapVertex = Vertex.create("initialmap", mapProcessorDescriptor, -1, MRHelpers.getResourceForMRMapper(mapStageConf));
        mapVertex.setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRMapper(mapStageConf));
        mapVertex.setTaskEnvironment(mapEnv);
    }
    mapVertex.addTaskLocalFiles(commonLocalResources).addDataSource("MRInput", dsd);
    vertices.add(mapVertex);
    Configuration copyiReduceStageConf = new Configuration(iReduceStageConf);
    setMaxDataLengthConf(copyiReduceStageConf, maxDataLengthThroughIPC, exceedDataLimit);
    String iReduceStageHistoryText = TezUtils.convertToHistoryText("Intermediate Summation Vertex", iReduceStageConf);
    ProcessorDescriptor iReduceProcessorDescriptor = ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(copyiReduceStageConf)).setHistoryText(iReduceStageHistoryText);
    Vertex intermediateVertex;
    if (!useMRSettings) {
        intermediateVertex = Vertex.create("intermediate_reducer", iReduceProcessorDescriptor, intermediateNumReduceTasks);
    } else {
        intermediateVertex = Vertex.create("intermediate_reducer", iReduceProcessorDescriptor, intermediateNumReduceTasks, MRHelpers.getResourceForMRReducer(iReduceStageConf));
        intermediateVertex.setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRReducer(iReduceStageConf));
        intermediateVertex.setTaskEnvironment(reduceEnv);
    }
    intermediateVertex.addTaskLocalFiles(commonLocalResources);
    vertices.add(intermediateVertex);
    Configuration copyFinalReduceConf = new Configuration(finalReduceConf);
    setMaxDataLengthConf(copyFinalReduceConf, maxDataLengthThroughIPC, exceedDataLimit);
    String finalReduceStageHistoryText = TezUtils.convertToHistoryText("Final Sorter Vertex", finalReduceConf);
    UserPayload finalReducePayload = TezUtils.createUserPayloadFromConf(copyFinalReduceConf);
    Vertex finalReduceVertex;
    ProcessorDescriptor finalReduceProcessorDescriptor = ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(finalReducePayload).setHistoryText(finalReduceStageHistoryText);
    if (!useMRSettings) {
        finalReduceVertex = Vertex.create("finalreduce", finalReduceProcessorDescriptor, 1);
    } else {
        finalReduceVertex = Vertex.create("finalreduce", finalReduceProcessorDescriptor, 1, MRHelpers.getResourceForMRReducer(finalReduceConf));
        finalReduceVertex.setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRReducer(finalReduceConf));
        finalReduceVertex.setTaskEnvironment(reduceEnv);
    }
    finalReduceVertex.addTaskLocalFiles(commonLocalResources);
    finalReduceVertex.addDataSink("MROutput", MROutputLegacy.createConfigBuilder(finalReduceConf, TextOutputFormat.class, outputPath).build());
    finalReduceVertex.getDataSinks().get(0).getOutputDescriptor().setHistoryText(TezUtils.convertToHistoryText("HDFS Output " + outputPath, finalReduceConf));
    vertices.add(finalReduceVertex);
    DAG dag = DAG.create("OrderedWordCount" + dagIndex);
    for (int i = 0; i < vertices.size(); ++i) {
        dag.addVertex(vertices.get(i));
    }
    OrderedPartitionedKVEdgeConfig edgeConf1 = OrderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName()).setFromConfiguration(iReduceStageConf).configureInput().useLegacyInput().done().build();
    dag.addEdge(Edge.create(dag.getVertex("initialmap"), dag.getVertex("intermediate_reducer"), edgeConf1.createDefaultEdgeProperty()));
    OrderedPartitionedKVEdgeConfig edgeConf2 = OrderedPartitionedKVEdgeConfig.newBuilder(IntWritable.class.getName(), Text.class.getName(), HashPartitioner.class.getName()).setFromConfiguration(finalReduceConf).configureInput().useLegacyInput().done().build();
    dag.addEdge(Edge.create(dag.getVertex("intermediate_reducer"), dag.getVertex("finalreduce"), edgeConf2.createDefaultEdgeProperty()));
    updateDAGACls(conf, dag, dagIndex);
    return dag;
}
Also used : OrderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig) Vertex(org.apache.tez.dag.api.Vertex) PreWarmVertex(org.apache.tez.dag.api.PreWarmVertex) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TezRuntimeConfiguration(org.apache.tez.runtime.library.api.TezRuntimeConfiguration) UserPayload(org.apache.tez.dag.api.UserPayload) ArrayList(java.util.ArrayList) ProcessorDescriptor(org.apache.tez.dag.api.ProcessorDescriptor) Text(org.apache.hadoop.io.Text) DAG(org.apache.tez.dag.api.DAG) ReduceProcessor(org.apache.tez.mapreduce.processor.reduce.ReduceProcessor) TextInputFormat(org.apache.hadoop.mapreduce.lib.input.TextInputFormat) JobConf(org.apache.hadoop.mapred.JobConf) IntWritable(org.apache.hadoop.io.IntWritable) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor) MapProcessor(org.apache.tez.mapreduce.processor.map.MapProcessor) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

OrderedPartitionedKVEdgeConfig (org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig)13 DAG (org.apache.tez.dag.api.DAG)11 Vertex (org.apache.tez.dag.api.Vertex)11 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)10 Configuration (org.apache.hadoop.conf.Configuration)8 Text (org.apache.hadoop.io.Text)7 DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor)6 HashPartitioner (org.apache.tez.runtime.library.partitioner.HashPartitioner)6 IntWritable (org.apache.hadoop.io.IntWritable)5 DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor)5 TextInputFormat (org.apache.hadoop.mapreduce.lib.input.TextInputFormat)4 ArrayList (java.util.ArrayList)3 NullWritable (org.apache.hadoop.io.NullWritable)3 TextOutputFormat (org.apache.hadoop.mapreduce.lib.output.TextOutputFormat)3 Edge (org.apache.tez.dag.api.Edge)3 VisibleForTesting (com.google.common.annotations.VisibleForTesting)2 HashMap (java.util.HashMap)2 Path (org.apache.hadoop.fs.Path)2 EdgeType (org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType)2 DataOutputBuffer (org.apache.hadoop.io.DataOutputBuffer)2