
Example 96 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project cdap by caskdata.

The class StreamInputFormatTest, method testStreamRecordReader.

@Test
public void testStreamRecordReader() throws Exception {
    File inputDir = tmpFolder.newFolder();
    File partition = new File(inputDir, "1.1000");
    partition.mkdirs();
    File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
    File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());
    // write 1 event
    StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile), Files.newOutputStreamSupplier(indexFile), 100L);
    writer.append(StreamFileTestUtils.createEvent(1000, "test"));
    writer.flush();
    // Get splits from the input format. Expect to get 2 splits:
    // one from 0 to some offset, and one from that offset to Long.MAX_VALUE.
    Configuration conf = new Configuration();
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    AbstractStreamInputFormat.setStreamId(conf, DUMMY_ID);
    AbstractStreamInputFormat.setStreamPath(conf, inputDir.toURI());
    AbstractStreamInputFormat format = new AbstractStreamInputFormat() {

        @Override
        public AuthorizationEnforcer getAuthorizationEnforcer(TaskAttemptContext context) {
            return new NoOpAuthorizer();
        }

        @Override
        public AuthenticationContext getAuthenticationContext(TaskAttemptContext context) {
            return new AuthenticationTestContext();
        }
    };
    List<InputSplit> splits = format.getSplits(new JobContextImpl(new JobConf(conf), new JobID()));
    Assert.assertEquals(2, splits.size());
    // write another event so that the 2nd split has something to read
    writer.append(StreamFileTestUtils.createEvent(1001, "test"));
    writer.close();
    // create a record reader for the 2nd split
    StreamRecordReader<LongWritable, StreamEvent> recordReader = new StreamRecordReader<>(new IdentityStreamEventDecoder(), new NoOpAuthorizer(), new AuthenticationTestContext(), DUMMY_ID);
    recordReader.initialize(splits.get(1), context);
    // check that we read the 2nd stream event
    Assert.assertTrue(recordReader.nextKeyValue());
    StreamEvent output = recordReader.getCurrentValue();
    Assert.assertEquals(1001, output.getTimestamp());
    Assert.assertEquals("test", Bytes.toString(output.getBody()));
    // check that there is nothing more to read
    Assert.assertFalse(recordReader.nextKeyValue());
}
Also used : JobContextImpl(org.apache.hadoop.mapred.JobContextImpl) Configuration(org.apache.hadoop.conf.Configuration) TaskAttemptID(org.apache.hadoop.mapred.TaskAttemptID) StreamEvent(co.cask.cdap.api.flow.flowlet.StreamEvent) AuthenticationTestContext(co.cask.cdap.security.auth.context.AuthenticationTestContext) NoOpAuthorizer(co.cask.cdap.security.spi.authorization.NoOpAuthorizer) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) TaskAttemptContextImpl(org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) IdentityStreamEventDecoder(co.cask.cdap.data.stream.decoder.IdentityStreamEventDecoder) LongWritable(org.apache.hadoop.io.LongWritable) File(java.io.File) InputSplit(org.apache.hadoop.mapreduce.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf) JobID(org.apache.hadoop.mapred.JobID) Test(org.junit.Test)
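
The nextKeyValue()/getCurrentValue() calls above follow the standard Hadoop RecordReader contract. For comparison, here is a minimal standalone sketch (not part of the CDAP test) that reads a local text file with Hadoop's built-in LineRecordReader, whose keys are LongWritable byte offsets; the class name and file path are illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class LineOffsetDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path(args[0]);
        // One split covering the whole file.
        long len = FileSystem.getLocal(conf).getFileStatus(file).getLen();
        FileSplit split = new FileSplit(file, 0, len, new String[0]);
        LineRecordReader reader = new LineRecordReader();
        reader.initialize(split, new TaskAttemptContextImpl(conf, new TaskAttemptID()));
        while (reader.nextKeyValue()) {
            // The key is the byte offset of the line within the file.
            LongWritable offset = reader.getCurrentKey();
            Text line = reader.getCurrentValue();
            System.out.println(offset.get() + "\t" + line);
        }
        reader.close();
    }
}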

Example 97 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project cdap by caskdata.

The class StreamInputFormatTest, method testStringStreamEventDecoder.

@Test
public void testStringStreamEventDecoder() {
    String body = "Testing";
    StreamEvent event = new StreamEvent(ImmutableMap.<String, String>of(), Charsets.UTF_8.encode(body));
    StreamEventDecoder<LongWritable, String> decoder = new StringStreamEventDecoder();
    StreamEventDecoder.DecodeResult<LongWritable, String> result = new StreamEventDecoder.DecodeResult<>();
    result = decoder.decode(event, result);
    Assert.assertEquals(event.getTimestamp(), result.getKey().get());
    Assert.assertEquals(body, result.getValue());
}
Also used : IdentityStreamEventDecoder(co.cask.cdap.data.stream.decoder.IdentityStreamEventDecoder) StringStreamEventDecoder(co.cask.cdap.data.stream.decoder.StringStreamEventDecoder) StreamEventDecoder(co.cask.cdap.api.stream.StreamEventDecoder) BytesStreamEventDecoder(co.cask.cdap.data.stream.decoder.BytesStreamEventDecoder) TextStreamEventDecoder(co.cask.cdap.data.stream.decoder.TextStreamEventDecoder) StreamEvent(co.cask.cdap.api.flow.flowlet.StreamEvent) LongWritable(org.apache.hadoop.io.LongWritable) StringStreamEventDecoder(co.cask.cdap.data.stream.decoder.StringStreamEventDecoder) Test(org.junit.Test)
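
Because decode(event, result) returns the (possibly reused) DecodeResult, the same holder can be recycled across events, which is how record readers avoid per-record allocation. Below is a minimal sketch that reuses only the calls exercised in the test above; the loop, class name, and printed output are illustrative additions:

import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.api.stream.StreamEventDecoder;
import co.cask.cdap.data.stream.decoder.StringStreamEventDecoder;
import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.io.LongWritable;

import java.util.Arrays;

public class DecoderReuseSketch {
    public static void main(String[] args) {
        StreamEventDecoder<LongWritable, String> decoder = new StringStreamEventDecoder();
        StreamEventDecoder.DecodeResult<LongWritable, String> result =
            new StreamEventDecoder.DecodeResult<>();
        for (String body : Arrays.asList("first", "second", "third")) {
            StreamEvent event = new StreamEvent(ImmutableMap.<String, String>of(), Charsets.UTF_8.encode(body));
            // decode() fills and returns the reusable result holder.
            result = decoder.decode(event, result);
            System.out.println(result.getKey().get() + " -> " + result.getValue());
        }
    }
}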

Example 98 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project tez by apache.

The class TestTezMerger, method createDataForIFile.

/**
 * Generate a data set for an IFile. Creates repeated keys if needed.
 *
 * @param keyCount    approximate number of keys to be created
 * @param repeatCount number of times a key should be repeated
 * @return a TreeMultimap of generated keys to values
 */
static TreeMultimap<Integer, Long> createDataForIFile(int keyCount, int repeatCount) {
    TreeMultimap<Integer, Long> dataSet = TreeMultimap.create();
    Random rnd = new Random();
    for (int i = 0; i < keyCount; i++) {
        if (repeatCount > 0 && (rnd.nextInt(keyCount) % 2 == 0)) {
            // repeat this key
            for (int j = 0; j < repeatCount; j++) {
                IntWritable key = new IntWritable(rnd.nextInt(keyCount));
                LongWritable value = new LongWritable(System.nanoTime());
                dataSet.put(key.get(), value.get());
            }
            i += repeatCount;
            LOG.info("Repeated key count=" + (repeatCount));
        } else {
            IntWritable key = new IntWritable(rnd.nextInt(keyCount));
            LongWritable value = new LongWritable(System.nanoTime());
            dataSet.put(key.get(), value.get());
        }
    }
    for (Integer key : dataSet.keySet()) {
        for (Long value : dataSet.get(key)) {
            LOG.info("Key=" + key + ", val=" + value);
        }
    }
    LOG.info("=============");
    return dataSet;
}
Also used : Random(java.util.Random) LongWritable(org.apache.hadoop.io.LongWritable) IntWritable(org.apache.hadoop.io.IntWritable)
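
createDataForIFile only builds the in-memory multimap; the actual IFile writing happens elsewhere in TestTezMerger. As a standalone illustration of persisting such IntWritable/LongWritable pairs with plain Hadoop APIs (a SequenceFile here, not the Tez IFile API the real test uses), one could write something like the following; the helper class is hypothetical:

import com.google.common.collect.TreeMultimap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;

import java.io.IOException;
import java.util.Map;

public class MultimapToSequenceFile {
    // Hypothetical helper, not part of TestTezMerger: writes the generated
    // key/value pairs in sorted-key order, the order a merge would expect.
    static void write(TreeMultimap<Integer, Long> dataSet, Path out, Configuration conf) throws IOException {
        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(out),
                SequenceFile.Writer.keyClass(IntWritable.class),
                SequenceFile.Writer.valueClass(LongWritable.class))) {
            for (Map.Entry<Integer, Long> entry : dataSet.entries()) {
                writer.append(new IntWritable(entry.getKey()), new LongWritable(entry.getValue()));
            }
        }
    }
}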

Example 99 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project tez by apache.

The class TestTezMerger, method createInMemorySegments.

private List<TezMerger.Segment> createInMemorySegments(int segmentCount, int keysPerSegment) throws IOException {
    List<TezMerger.Segment> segmentList = Lists.newLinkedList();
    Random rnd = new Random();
    DataInputBuffer key = new DataInputBuffer();
    DataInputBuffer value = new DataInputBuffer();
    for (int i = 0; i < segmentCount; i++) {
        BoundedByteArrayOutputStream stream = new BoundedByteArrayOutputStream(10000);
        InMemoryWriter writer = new InMemoryWriter(stream);
        for (int j = 0; j < keysPerSegment; j++) {
            populateData(new IntWritable(rnd.nextInt()), new LongWritable(rnd.nextLong()), key, value);
            writer.append(key, value);
        }
        writer.close();
        InMemoryReader reader = new InMemoryReader(merger, null, stream.getBuffer(), 0, stream.getLimit());
        segmentList.add(new TezMerger.Segment(reader, null));
    }
    return segmentList;
}
Also used : InMemoryReader(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryReader) InMemoryWriter(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter) DataInputBuffer(org.apache.hadoop.io.DataInputBuffer) Random(java.util.Random) BoundedByteArrayOutputStream(org.apache.hadoop.io.BoundedByteArrayOutputStream) LongWritable(org.apache.hadoop.io.LongWritable) IntWritable(org.apache.hadoop.io.IntWritable)
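
populateData is not shown in this excerpt; a plausible minimal version (an assumption, not the actual TestTezMerger helper) serializes the two Writables and points the reusable DataInputBuffers at the resulting raw bytes, which is the raw key/value form that writer.append consumes above:

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;

import java.io.IOException;

final class PopulateDataSketch {
    // Hypothetical stand-in for TestTezMerger.populateData: serialize the
    // Writables and reset the DataInputBuffers over the serialized bytes.
    static void populateData(IntWritable intKey, LongWritable longVal,
                             DataInputBuffer key, DataInputBuffer value) throws IOException {
        DataOutputBuffer k = new DataOutputBuffer();
        DataOutputBuffer v = new DataOutputBuffer();
        intKey.write(k);
        longVal.write(v);
        key.reset(k.getData(), 0, k.getLength());
        value.reset(v.getData(), 0, v.getLength());
    }
}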

Example 100 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project tez by apache.

The class TestUnorderedPartitionedKVWriter, method baseTest.

private void baseTest(int numRecords, int numPartitions, Set<Integer> skippedPartitions, boolean shouldCompress, int maxSingleBufferSizeBytes, int bufferMergePercent, int availableMemory) throws IOException, InterruptedException {
    PartitionerForTest partitioner = new PartitionerForTest();
    ApplicationId appId = ApplicationId.newInstance(10000000, 1);
    TezCounters counters = new TezCounters();
    String uniqueId = UUID.randomUUID().toString();
    int dagId = 1;
    String auxiliaryService = defaultConf.get(TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID, TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID_DEFAULT);
    OutputContext outputContext = createMockOutputContext(counters, appId, uniqueId, auxiliaryService);
    Configuration conf = createConfiguration(outputContext, IntWritable.class, LongWritable.class, shouldCompress, maxSingleBufferSizeBytes);
    conf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_UNORDERED_PARTITIONED_KVWRITER_BUFFER_MERGE_PERCENT, bufferMergePercent);
    CompressionCodec codec = null;
    if (shouldCompress) {
        codec = new DefaultCodec();
        ((Configurable) codec).setConf(conf);
    }
    int numOutputs = numPartitions;
    int numRecordsWritten = 0;
    Map<Integer, Multimap<Integer, Long>> expectedValues = new HashMap<Integer, Multimap<Integer, Long>>();
    for (int i = 0; i < numOutputs; i++) {
        expectedValues.put(i, LinkedListMultimap.<Integer, Long>create());
    }
    UnorderedPartitionedKVWriter kvWriter = new UnorderedPartitionedKVWriterForTest(outputContext, conf, numOutputs, availableMemory);
    int sizePerBuffer = kvWriter.sizePerBuffer;
    // IntWritable (4 bytes) + LongWritable (8 bytes)
    int sizePerRecord = 4 + 8;
    // record + META_OVERHEAD (12 bytes)
    int sizePerRecordWithOverhead = sizePerRecord + 12;
    IntWritable intWritable = new IntWritable();
    LongWritable longWritable = new LongWritable();
    BitSet partitionsWithData = new BitSet(numPartitions);
    for (int i = 0; i < numRecords; i++) {
        intWritable.set(i);
        longWritable.set(i);
        int partition = partitioner.getPartition(intWritable, longWritable, numOutputs);
        if (skippedPartitions != null && skippedPartitions.contains(partition)) {
            continue;
        }
        partitionsWithData.set(partition);
        expectedValues.get(partition).put(intWritable.get(), longWritable.get());
        kvWriter.write(intWritable, longWritable);
        numRecordsWritten++;
    }
    List<Event> events = kvWriter.close();
    if (numPartitions == 1) {
        assertEquals(true, kvWriter.skipBuffers);
    }
    int recordsPerBuffer = sizePerBuffer / sizePerRecordWithOverhead;
    int numExpectedSpills = numRecordsWritten / recordsPerBuffer / kvWriter.spillLimit;
    verify(outputContext, never()).reportFailure(any(TaskFailureType.class), any(Throwable.class), any(String.class));
    assertNull(kvWriter.currentBuffer);
    assertEquals(0, kvWriter.availableBuffers.size());
    // Verify the counters
    TezCounter outputRecordBytesCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES);
    TezCounter outputRecordsCounter = counters.findCounter(TaskCounter.OUTPUT_RECORDS);
    TezCounter outputBytesWithOverheadCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES_WITH_OVERHEAD);
    TezCounter fileOutputBytesCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES_PHYSICAL);
    TezCounter spilledRecordsCounter = counters.findCounter(TaskCounter.SPILLED_RECORDS);
    TezCounter additionalSpillBytesWritternCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_WRITTEN);
    TezCounter additionalSpillBytesReadCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_READ);
    TezCounter numAdditionalSpillsCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILL_COUNT);
    assertEquals(numRecordsWritten * sizePerRecord, outputRecordBytesCounter.getValue());
    if (numPartitions > 1) {
        assertEquals(numRecordsWritten * sizePerRecordWithOverhead, outputBytesWithOverheadCounter.getValue());
    }
    assertEquals(numRecordsWritten, outputRecordsCounter.getValue());
    long fileOutputBytes = fileOutputBytesCounter.getValue();
    if (numRecordsWritten > 0) {
        assertTrue(fileOutputBytes > 0);
        if (!shouldCompress) {
            assertTrue(fileOutputBytes > outputRecordBytesCounter.getValue());
        }
    } else {
        assertEquals(0, fileOutputBytes);
    }
    assertEquals(recordsPerBuffer * numExpectedSpills, spilledRecordsCounter.getValue());
    long additionalSpillBytesWritten = additionalSpillBytesWritternCounter.getValue();
    long additionalSpillBytesRead = additionalSpillBytesReadCounter.getValue();
    if (numExpectedSpills == 0) {
        assertEquals(0, additionalSpillBytesWritten);
        assertEquals(0, additionalSpillBytesRead);
    } else {
        assertTrue(additionalSpillBytesWritten > 0);
        assertTrue(additionalSpillBytesRead > 0);
        if (!shouldCompress) {
            assertTrue(additionalSpillBytesWritten > (recordsPerBuffer * numExpectedSpills * sizePerRecord));
            assertTrue(additionalSpillBytesRead > (recordsPerBuffer * numExpectedSpills * sizePerRecord));
        }
    }
    assertEquals(additionalSpillBytesWritten, additionalSpillBytesRead);
    // due to multiple threads, buffers could be merged in chunks in scheduleSpill.
    assertTrue(numExpectedSpills >= numAdditionalSpillsCounter.getValue());
    BitSet emptyPartitionBits = null;
    // Verify the events returned
    assertEquals(2, events.size());
    assertTrue(events.get(0) instanceof VertexManagerEvent);
    VertexManagerEvent vme = (VertexManagerEvent) events.get(0);
    verifyPartitionStats(vme, partitionsWithData);
    assertTrue(events.get(1) instanceof CompositeDataMovementEvent);
    CompositeDataMovementEvent cdme = (CompositeDataMovementEvent) events.get(1);
    assertEquals(0, cdme.getSourceIndexStart());
    assertEquals(numOutputs, cdme.getCount());
    DataMovementEventPayloadProto eventProto = DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(cdme.getUserPayload()));
    if (skippedPartitions == null && numRecordsWritten > 0) {
        assertFalse(eventProto.hasEmptyPartitions());
        emptyPartitionBits = new BitSet(numPartitions);
    } else {
        assertTrue(eventProto.hasEmptyPartitions());
        byte[] emptyPartitions = TezCommonUtils.decompressByteStringToByteArray(eventProto.getEmptyPartitions());
        emptyPartitionBits = TezUtilsInternal.fromByteArray(emptyPartitions);
        if (numRecordsWritten == 0) {
            assertEquals(numPartitions, emptyPartitionBits.cardinality());
        } else {
            for (Integer e : skippedPartitions) {
                assertTrue(emptyPartitionBits.get(e));
            }
            assertEquals(skippedPartitions.size(), emptyPartitionBits.cardinality());
        }
    }
    if (emptyPartitionBits.cardinality() != numPartitions) {
        assertEquals(HOST_STRING, eventProto.getHost());
        assertEquals(SHUFFLE_PORT, eventProto.getPort());
        assertEquals(uniqueId, eventProto.getPathComponent());
    } else {
        assertFalse(eventProto.hasHost());
        assertFalse(eventProto.hasPort());
        assertFalse(eventProto.hasPathComponent());
    }
    // Verify the actual data
    TezTaskOutput taskOutput = new TezTaskOutputFiles(conf, uniqueId, dagId);
    Path outputFilePath = kvWriter.finalOutPath;
    Path spillFilePath = kvWriter.finalIndexPath;
    if (numRecordsWritten <= 0) {
        return;
    }
    assertTrue(localFs.exists(outputFilePath));
    assertTrue(localFs.exists(spillFilePath));
    // verify no intermediate spill files have been left around
    synchronized (kvWriter.spillInfoList) {
        for (SpillInfo spill : kvWriter.spillInfoList) {
            assertFalse("lingering intermediate spill file " + spill.outPath, localFs.exists(spill.outPath));
        }
    }
    // The zero-record special case returned above; validate the spill record here.
    TezSpillRecord spillRecord = new TezSpillRecord(spillFilePath, conf);
    DataInputBuffer keyBuffer = new DataInputBuffer();
    DataInputBuffer valBuffer = new DataInputBuffer();
    IntWritable keyDeser = new IntWritable();
    LongWritable valDeser = new LongWritable();
    for (int i = 0; i < numOutputs; i++) {
        TezIndexRecord indexRecord = spillRecord.getIndex(i);
        if (skippedPartitions != null && skippedPartitions.contains(i)) {
            assertFalse("The Index Record for partition " + i + " should not have any data", indexRecord.hasData());
            continue;
        }
        FSDataInputStream inStream = FileSystem.getLocal(conf).open(outputFilePath);
        inStream.seek(indexRecord.getStartOffset());
        IFile.Reader reader = new IFile.Reader(inStream, indexRecord.getPartLength(), codec, null, null, false, 0, -1);
        while (reader.nextRawKey(keyBuffer)) {
            reader.nextRawValue(valBuffer);
            keyDeser.readFields(keyBuffer);
            valDeser.readFields(valBuffer);
            int partition = partitioner.getPartition(keyDeser, valDeser, numOutputs);
            assertTrue(expectedValues.get(partition).remove(keyDeser.get(), valDeser.get()));
        }
        inStream.close();
    }
    for (int i = 0; i < numOutputs; i++) {
        assertEquals(0, expectedValues.get(i).size());
        expectedValues.remove(i);
    }
    assertEquals(0, expectedValues.size());
    verify(outputContext, atLeast(1)).notifyProgress();
}
Also used : TezTaskOutputFiles(org.apache.tez.runtime.library.common.task.local.output.TezTaskOutputFiles) IFile(org.apache.tez.runtime.library.common.sort.impl.IFile) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TezRuntimeConfiguration(org.apache.tez.runtime.library.api.TezRuntimeConfiguration) HashMap(java.util.HashMap) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) ByteString(com.google.protobuf.ByteString) Configurable(org.apache.hadoop.conf.Configurable) TezCounter(org.apache.tez.common.counters.TezCounter) TezSpillRecord(org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord) SpillInfo(org.apache.tez.runtime.library.common.writers.UnorderedPartitionedKVWriter.SpillInfo) DataMovementEventPayloadProto(org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.DataMovementEventPayloadProto) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) LongWritable(org.apache.hadoop.io.LongWritable) IntWritable(org.apache.hadoop.io.IntWritable) Path(org.apache.hadoop.fs.Path) BitSet(java.util.BitSet) TezCounters(org.apache.tez.common.counters.TezCounters) OutputContext(org.apache.tez.runtime.api.OutputContext) LinkedListMultimap(com.google.common.collect.LinkedListMultimap) Multimap(com.google.common.collect.Multimap) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) DataInputBuffer(org.apache.hadoop.io.DataInputBuffer) TaskFailureType(org.apache.tez.runtime.api.TaskFailureType) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) TezIndexRecord(org.apache.tez.runtime.library.common.sort.impl.TezIndexRecord) Event(org.apache.tez.runtime.api.Event) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) TezTaskOutput(org.apache.tez.runtime.library.common.task.local.output.TezTaskOutput)
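
PartitionerForTest is not shown in this excerpt. A plausible minimal stand-in (an assumption; the real test class may implement Tez's own partitioner interface rather than the Hadoop one used here) spreads IntWritable keys across partitions by value modulo the partition count:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical stand-in for PartitionerForTest: partition by key value.
public class ModuloIntPartitioner extends Partitioner<IntWritable, LongWritable> {
    @Override
    public int getPartition(IntWritable key, LongWritable value, int numPartitions) {
        // Keys in baseTest are non-negative loop indices, so plain modulo suffices.
        return key.get() % numPartitions;
    }
}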

Aggregations

LongWritable (org.apache.hadoop.io.LongWritable): 445 uses
Text (org.apache.hadoop.io.Text): 220 uses
Test (org.junit.Test): 171 uses
IntWritable (org.apache.hadoop.io.IntWritable): 102 uses
Path (org.apache.hadoop.fs.Path): 99 uses
BytesWritable (org.apache.hadoop.io.BytesWritable): 70 uses
FloatWritable (org.apache.hadoop.io.FloatWritable): 68 uses
Configuration (org.apache.hadoop.conf.Configuration): 62 uses
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable): 62 uses
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 60 uses
ArrayList (java.util.ArrayList): 59 uses
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 57 uses
ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable): 53 uses
IOException (java.io.IOException): 49 uses
ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable): 48 uses
SequenceFile (org.apache.hadoop.io.SequenceFile): 42 uses
HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable): 40 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 37 uses
JobConf (org.apache.hadoop.mapred.JobConf): 37 uses
DeferredObject (org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject): 35 uses