Search in sources :

Example 1 with KeyValueWriter

use of org.apache.tez.runtime.library.api.KeyValueWriter in project tez by apache.

the class TestOnFileUnorderedKVOutput method testWithPipelinedShuffle.

@Test(timeout = 30000)
@SuppressWarnings("unchecked")
public void testWithPipelinedShuffle() throws Exception {
    Configuration conf = new Configuration();
    conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, Text.class.getName());
    conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS, IntWritable.class.getName());
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_PIPELINED_SHUFFLE_ENABLED, true);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_ENABLE_FINAL_MERGE_IN_OUTPUT, false);
    conf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_UNORDERED_OUTPUT_BUFFER_SIZE_MB, 1);
    TezSharedExecutor sharedExecutor = new TezSharedExecutor(conf);
    OutputContext outputContext = createOutputContext(conf, sharedExecutor);
    UnorderedKVOutput kvOutput = new UnorderedKVOutput(outputContext, 1);
    List<Event> events = null;
    events = kvOutput.initialize();
    kvOutput.start();
    assertTrue(events != null && events.size() == 0);
    KeyValueWriter kvWriter = kvOutput.getWriter();
    for (int i = 0; i < 500; i++) {
        kvWriter.write(new Text(RandomStringUtils.randomAscii(10000)), new IntWritable(i));
    }
    events = kvOutput.close();
    // When pipelining is on, all events are sent out within writer itself.
    assertTrue(events != null && events.size() == 0);
    // ensure that data is sent via outputContext.
    ArgumentCaptor<List> eventsCaptor = ArgumentCaptor.forClass(List.class);
    verify(outputContext, atLeast(1)).sendEvents(eventsCaptor.capture());
    events = eventsCaptor.getValue();
    CompositeDataMovementEvent dmEvent = (CompositeDataMovementEvent) events.get(1);
    assertEquals("Invalid source index", 0, dmEvent.getSourceIndexStart());
    DataMovementEventPayloadProto shufflePayload = DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(dmEvent.getUserPayload()));
    assertTrue(shufflePayload.hasLastEvent());
    assertFalse(shufflePayload.hasEmptyPartitions());
    assertEquals(shufflePort, shufflePayload.getPort());
    assertEquals("localhost", shufflePayload.getHost());
    sharedExecutor.shutdownNow();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TezRuntimeConfiguration(org.apache.tez.runtime.library.api.TezRuntimeConfiguration) Text(org.apache.hadoop.io.Text) OutputContext(org.apache.tez.runtime.api.OutputContext) KeyValueWriter(org.apache.tez.runtime.library.api.KeyValueWriter) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) TezSharedExecutor(org.apache.tez.common.TezSharedExecutor) Event(org.apache.tez.runtime.api.Event) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) List(java.util.List) DataMovementEventPayloadProto(org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.DataMovementEventPayloadProto) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)

Example 2 with KeyValueWriter

use of org.apache.tez.runtime.library.api.KeyValueWriter in project tez by apache.

the class ReduceProcessor method run.

@Override
public void run(Map<String, LogicalInput> _inputs, Map<String, LogicalOutput> _outputs) throws Exception {
    this.inputs = _inputs;
    this.outputs = _outputs;
    progressHelper = new ProgressHelper(this.inputs, processorContext, this.getClass().getSimpleName());
    LOG.info("Running reduce: " + processorContext.getUniqueIdentifier());
    if (_outputs.size() <= 0 || _outputs.size() > 1) {
        throw new IOException("Invalid number of _outputs" + ", outputCount=" + _outputs.size());
    }
    if (_inputs.size() <= 0 || _inputs.size() > 1) {
        throw new IOException("Invalid number of _inputs" + ", inputCount=" + _inputs.size());
    }
    LogicalInput in = _inputs.values().iterator().next();
    in.start();
    List<Input> pendingInputs = new LinkedList<Input>();
    pendingInputs.add(in);
    processorContext.waitForAllInputsReady(pendingInputs);
    LOG.info("Input is ready for consumption. Starting Output");
    LogicalOutput out = _outputs.values().iterator().next();
    out.start();
    initTask(out);
    progressHelper.scheduleProgressTaskService(0, 100);
    this.statusUpdate();
    Class keyClass = ConfigUtils.getIntermediateInputKeyClass(jobConf);
    Class valueClass = ConfigUtils.getIntermediateInputValueClass(jobConf);
    LOG.info("Using keyClass: " + keyClass);
    LOG.info("Using valueClass: " + valueClass);
    RawComparator comparator = ConfigUtils.getInputKeySecondaryGroupingComparator(jobConf);
    LOG.info("Using comparator: " + comparator);
    reduceInputKeyCounter = mrReporter.getCounter(TaskCounter.REDUCE_INPUT_GROUPS);
    reduceInputValueCounter = mrReporter.getCounter(TaskCounter.REDUCE_INPUT_RECORDS);
    // Sanity check
    if (!(in instanceof OrderedGroupedInputLegacy)) {
        throw new IOException("Illegal input to reduce: " + in.getClass());
    }
    OrderedGroupedInputLegacy shuffleInput = (OrderedGroupedInputLegacy) in;
    KeyValuesReader kvReader = shuffleInput.getReader();
    KeyValueWriter kvWriter = null;
    if ((out instanceof MROutputLegacy)) {
        kvWriter = ((MROutputLegacy) out).getWriter();
    } else if ((out instanceof OrderedPartitionedKVOutput)) {
        kvWriter = ((OrderedPartitionedKVOutput) out).getWriter();
    } else {
        throw new IOException("Illegal output to reduce: " + in.getClass());
    }
    if (useNewApi) {
        try {
            runNewReducer(jobConf, mrReporter, shuffleInput, comparator, keyClass, valueClass, kvWriter);
        } catch (ClassNotFoundException cnfe) {
            throw new IOException(cnfe);
        }
    } else {
        runOldReducer(jobConf, mrReporter, kvReader, comparator, keyClass, valueClass, kvWriter);
    }
    done();
}
Also used : OrderedGroupedInputLegacy(org.apache.tez.runtime.library.input.OrderedGroupedInputLegacy) ProgressHelper(org.apache.tez.common.ProgressHelper) LogicalOutput(org.apache.tez.runtime.api.LogicalOutput) MROutputLegacy(org.apache.tez.mapreduce.output.MROutputLegacy) OrderedPartitionedKVOutput(org.apache.tez.runtime.library.output.OrderedPartitionedKVOutput) IOException(java.io.IOException) LinkedList(java.util.LinkedList) KeyValueWriter(org.apache.tez.runtime.library.api.KeyValueWriter) RawComparator(org.apache.hadoop.io.RawComparator) LogicalInput(org.apache.tez.runtime.api.LogicalInput) Input(org.apache.tez.runtime.api.Input) LogicalInput(org.apache.tez.runtime.api.LogicalInput) KeyValuesReader(org.apache.tez.runtime.library.api.KeyValuesReader)

Example 3 with KeyValueWriter

use of org.apache.tez.runtime.library.api.KeyValueWriter in project tez by apache.

the class FilterByWordInputProcessor method run.

@Override
public void run(Map<String, LogicalInput> _inputs, Map<String, LogicalOutput> _outputs) throws Exception {
    this.inputs = _inputs;
    this.outputs = _outputs;
    this.progressHelper = new ProgressHelper(this.inputs, getContext(), this.getClass().getSimpleName());
    if (_inputs.size() != 1) {
        throw new IllegalStateException("FilterByWordInputProcessor processor can only work with a single input");
    }
    if (_outputs.size() != 1) {
        throw new IllegalStateException("FilterByWordInputProcessor processor can only work with a single output");
    }
    for (LogicalInput input : _inputs.values()) {
        input.start();
    }
    for (LogicalOutput output : _outputs.values()) {
        output.start();
    }
    LogicalInput li = _inputs.values().iterator().next();
    if (!(li instanceof MRInput)) {
        throw new IllegalStateException("FilterByWordInputProcessor processor can only work with MRInput");
    }
    LogicalOutput lo = _outputs.values().iterator().next();
    if (!(lo instanceof UnorderedKVOutput)) {
        throw new IllegalStateException("FilterByWordInputProcessor processor can only work with OnFileUnorderedKVOutput");
    }
    progressHelper.scheduleProgressTaskService(0, 100);
    MRInputLegacy mrInput = (MRInputLegacy) li;
    mrInput.init();
    UnorderedKVOutput kvOutput = (UnorderedKVOutput) lo;
    Configuration updatedConf = mrInput.getConfigUpdates();
    Text srcFile = new Text();
    srcFile.set("UNKNOWN_FILENAME_IN_PROCESSOR");
    if (updatedConf != null) {
        String fileName = updatedConf.get(MRJobConfig.MAP_INPUT_FILE);
        if (fileName != null) {
            LOG.info("Processing file: " + fileName);
            srcFile.set(fileName);
        }
    }
    KeyValueReader kvReader = mrInput.getReader();
    KeyValueWriter kvWriter = kvOutput.getWriter();
    while (kvReader.next()) {
        Object key = kvReader.getCurrentKey();
        Object val = kvReader.getCurrentValue();
        Text valText = (Text) val;
        String readVal = valText.toString();
        if (readVal.contains(filterWord)) {
            LongWritable lineNum = (LongWritable) key;
            TextLongPair outVal = new TextLongPair(srcFile, lineNum);
            kvWriter.write(valText, outVal);
        }
    }
}
Also used : MRInput(org.apache.tez.mapreduce.input.MRInput) ProgressHelper(org.apache.tez.common.ProgressHelper) Configuration(org.apache.hadoop.conf.Configuration) TextLongPair(org.apache.tez.mapreduce.examples.FilterLinesByWord.TextLongPair) LogicalOutput(org.apache.tez.runtime.api.LogicalOutput) KeyValueReader(org.apache.tez.runtime.library.api.KeyValueReader) Text(org.apache.hadoop.io.Text) KeyValueWriter(org.apache.tez.runtime.library.api.KeyValueWriter) UnorderedKVOutput(org.apache.tez.runtime.library.output.UnorderedKVOutput) LogicalInput(org.apache.tez.runtime.api.LogicalInput) LongWritable(org.apache.hadoop.io.LongWritable) MRInputLegacy(org.apache.tez.mapreduce.input.MRInputLegacy)

Example 4 with KeyValueWriter

use of org.apache.tez.runtime.library.api.KeyValueWriter in project tez by apache.

the class FilterByWordOutputProcessor method run.

@Override
public void run() throws Exception {
    if (inputs.size() != 1) {
        throw new IllegalStateException("FilterByWordOutputProcessor processor can only work with a single input");
    }
    if (outputs.size() != 1) {
        throw new IllegalStateException("FilterByWordOutputProcessor processor can only work with a single output");
    }
    for (LogicalInput input : inputs.values()) {
        input.start();
    }
    for (LogicalOutput output : outputs.values()) {
        output.start();
    }
    LogicalInput li = inputs.values().iterator().next();
    if (!(li instanceof UnorderedKVInput)) {
        throw new IllegalStateException("FilterByWordOutputProcessor processor can only work with ShuffledUnorderedKVInput");
    }
    LogicalOutput lo = outputs.values().iterator().next();
    if (!(lo instanceof MROutput)) {
        throw new IllegalStateException("FilterByWordOutputProcessor processor can only work with MROutput");
    }
    UnorderedKVInput kvInput = (UnorderedKVInput) li;
    MROutput mrOutput = (MROutput) lo;
    KeyValueReader kvReader = kvInput.getReader();
    KeyValueWriter kvWriter = mrOutput.getWriter();
    while (kvReader.next()) {
        Object key = kvReader.getCurrentKey();
        Object value = kvReader.getCurrentValue();
        kvWriter.write(key, value);
    }
}
Also used : KeyValueWriter(org.apache.tez.runtime.library.api.KeyValueWriter) LogicalOutput(org.apache.tez.runtime.api.LogicalOutput) UnorderedKVInput(org.apache.tez.runtime.library.input.UnorderedKVInput) KeyValueReader(org.apache.tez.runtime.library.api.KeyValueReader) LogicalInput(org.apache.tez.runtime.api.LogicalInput) MROutput(org.apache.tez.mapreduce.output.MROutput)

Example 5 with KeyValueWriter

use of org.apache.tez.runtime.library.api.KeyValueWriter in project tez by apache.

the class TestOnFileUnorderedKVOutput method testGeneratedDataMovementEvent.

@Test(timeout = 5000)
public void testGeneratedDataMovementEvent() throws Exception {
    Configuration conf = new Configuration();
    conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, Text.class.getName());
    conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS, IntWritable.class.getName());
    TezSharedExecutor sharedExecutor = new TezSharedExecutor(conf);
    OutputContext outputContext = createOutputContext(conf, sharedExecutor);
    UnorderedKVOutput kvOutput = new UnorderedKVOutput(outputContext, 1);
    List<Event> events = null;
    events = kvOutput.initialize();
    kvOutput.start();
    assertTrue(events != null && events.size() == 0);
    KeyValueWriter kvWriter = kvOutput.getWriter();
    List<KVPair> data = KVDataGen.generateTestData(true, 0);
    for (KVPair kvp : data) {
        kvWriter.write(kvp.getKey(), kvp.getvalue());
    }
    events = kvOutput.close();
    assertEquals(45, task.getTaskStatistics().getIOStatistics().values().iterator().next().getDataSize());
    assertEquals(5, task.getTaskStatistics().getIOStatistics().values().iterator().next().getItemsProcessed());
    assertTrue(events != null && events.size() == 2);
    CompositeDataMovementEvent dmEvent = (CompositeDataMovementEvent) events.get(1);
    assertEquals("Invalid source index", 0, dmEvent.getSourceIndexStart());
    DataMovementEventPayloadProto shufflePayload = DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(dmEvent.getUserPayload()));
    assertFalse(shufflePayload.hasEmptyPartitions());
    assertEquals(outputContext.getUniqueIdentifier(), shufflePayload.getPathComponent());
    assertEquals(shufflePort, shufflePayload.getPort());
    assertEquals("localhost", shufflePayload.getHost());
    sharedExecutor.shutdownNow();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TezRuntimeConfiguration(org.apache.tez.runtime.library.api.TezRuntimeConfiguration) Text(org.apache.hadoop.io.Text) OutputContext(org.apache.tez.runtime.api.OutputContext) KeyValueWriter(org.apache.tez.runtime.library.api.KeyValueWriter) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) TezSharedExecutor(org.apache.tez.common.TezSharedExecutor) KVPair(org.apache.tez.runtime.library.testutils.KVDataGen.KVPair) Event(org.apache.tez.runtime.api.Event) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) DataMovementEventPayloadProto(org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.DataMovementEventPayloadProto) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)

Aggregations

KeyValueWriter (org.apache.tez.runtime.library.api.KeyValueWriter)7 Configuration (org.apache.hadoop.conf.Configuration)4 LogicalInput (org.apache.tez.runtime.api.LogicalInput)4 LogicalOutput (org.apache.tez.runtime.api.LogicalOutput)4 Text (org.apache.hadoop.io.Text)3 ProgressHelper (org.apache.tez.common.ProgressHelper)3 IOException (java.io.IOException)2 IntWritable (org.apache.hadoop.io.IntWritable)2 TezSharedExecutor (org.apache.tez.common.TezSharedExecutor)2 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)2 MRInputLegacy (org.apache.tez.mapreduce.input.MRInputLegacy)2 MROutputLegacy (org.apache.tez.mapreduce.output.MROutputLegacy)2 Event (org.apache.tez.runtime.api.Event)2 OutputContext (org.apache.tez.runtime.api.OutputContext)2 CompositeDataMovementEvent (org.apache.tez.runtime.api.events.CompositeDataMovementEvent)2 KeyValueReader (org.apache.tez.runtime.library.api.KeyValueReader)2 TezRuntimeConfiguration (org.apache.tez.runtime.library.api.TezRuntimeConfiguration)2 OrderedPartitionedKVOutput (org.apache.tez.runtime.library.output.OrderedPartitionedKVOutput)2 DataMovementEventPayloadProto (org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.DataMovementEventPayloadProto)2 Test (org.junit.Test)2