Example 41 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project tez by apache.

the class TestUnorderedPartitionedKVWriter method baseTestWithFinalMergeDisabled.

@SuppressWarnings("unchecked")
private void baseTestWithFinalMergeDisabled(int numRecords, int numPartitions, Set<Integer> skippedPartitions, boolean shouldCompress) throws IOException, InterruptedException {
    PartitionerForTest partitioner = new PartitionerForTest();
    ApplicationId appId = ApplicationId.newInstance(10000000, 1);
    TezCounters counters = new TezCounters();
    String uniqueId = UUID.randomUUID().toString();
    int dagId = 1;
    String auxiliaryService = defaultConf.get(TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID, TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID_DEFAULT);
    OutputContext outputContext = createMockOutputContext(counters, appId, uniqueId, auxiliaryService);
    Configuration conf = createConfiguration(outputContext, IntWritable.class, LongWritable.class, shouldCompress, -1);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_ENABLE_FINAL_MERGE_IN_OUTPUT, false);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_PIPELINED_SHUFFLE_ENABLED, false);
    CompressionCodec codec = null;
    if (shouldCompress) {
        codec = new DefaultCodec();
        ((Configurable) codec).setConf(conf);
    }
    int numOutputs = numPartitions;
    long availableMemory = 2048;
    int numRecordsWritten = 0;
    UnorderedPartitionedKVWriter kvWriter = new UnorderedPartitionedKVWriterForTest(outputContext, conf, numOutputs, availableMemory);
    int sizePerBuffer = kvWriter.sizePerBuffer;
    // IntW + LongW
    int sizePerRecord = 4 + 8;
    // Record + META_OVERHEAD
    int sizePerRecordWithOverhead = sizePerRecord + 12;
    BitSet partitionsWithData = new BitSet(numPartitions);
    IntWritable intWritable = new IntWritable();
    LongWritable longWritable = new LongWritable();
    for (int i = 0; i < numRecords; i++) {
        intWritable.set(i);
        longWritable.set(i);
        int partition = partitioner.getPartition(intWritable, longWritable, numOutputs);
        if (skippedPartitions != null && skippedPartitions.contains(partition)) {
            continue;
        }
        partitionsWithData.set(partition);
        kvWriter.write(intWritable, longWritable);
        numRecordsWritten++;
    }
    int recordsPerBuffer = sizePerBuffer / sizePerRecordWithOverhead;
    int numExpectedSpills = numRecordsWritten / recordsPerBuffer;
    ArgumentCaptor<List> eventCaptor = ArgumentCaptor.forClass(List.class);
    List<Event> lastEvents = kvWriter.close();
    if (numPartitions == 1) {
        assertEquals(true, kvWriter.skipBuffers);
    }
    // Max events sent are spills + one VM event. If there are no spills, at least an
    // event reporting the empty partitions is still sent at the end.
    int spills = Math.max(1, kvWriter.numSpills.get());
    // spills + VMEvent
    assertEquals((spills + 1), lastEvents.size());
    verify(outputContext, atMost(0)).sendEvents(eventCaptor.capture());
    for (int i = 0; i < lastEvents.size(); i++) {
        Event event = lastEvents.get(i);
        if (event instanceof VertexManagerEvent) {
            // Verify the partition stats reported to the AM via the VertexManagerEvent.
            if (numRecordsWritten > 0) {
                verifyPartitionStats(((VertexManagerEvent) event), partitionsWithData);
            }
        }
    }
    verify(outputContext, never()).reportFailure(any(TaskFailureType.class), any(Throwable.class), any(String.class));
    assertNull(kvWriter.currentBuffer);
    assertEquals(0, kvWriter.availableBuffers.size());
    // Verify the counters
    TezCounter outputRecordBytesCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES);
    TezCounter outputRecordsCounter = counters.findCounter(TaskCounter.OUTPUT_RECORDS);
    TezCounter outputBytesWithOverheadCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES_WITH_OVERHEAD);
    TezCounter fileOutputBytesCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES_PHYSICAL);
    TezCounter spilledRecordsCounter = counters.findCounter(TaskCounter.SPILLED_RECORDS);
    TezCounter additionalSpillBytesWritternCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_WRITTEN);
    TezCounter additionalSpillBytesReadCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_READ);
    TezCounter numAdditionalSpillsCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILL_COUNT);
    assertEquals(numRecordsWritten * sizePerRecord, outputRecordBytesCounter.getValue());
    assertEquals(numRecordsWritten, outputRecordsCounter.getValue());
    if (outputRecordsCounter.getValue() > 0) {
        assertEquals(numRecordsWritten * sizePerRecordWithOverhead, outputBytesWithOverheadCounter.getValue());
    } else {
        assertEquals(0, outputBytesWithOverheadCounter.getValue());
    }
    long fileOutputBytes = fileOutputBytesCounter.getValue();
    if (numRecordsWritten > 0) {
        assertTrue(fileOutputBytes > 0);
        if (!shouldCompress) {
            assertTrue("fileOutputBytes=" + fileOutputBytes + ", outputRecordBytes=" + outputRecordBytesCounter.getValue(), fileOutputBytes > outputRecordBytesCounter.getValue());
        }
    } else {
        assertEquals(0, fileOutputBytes);
    }
    // due to multiple threads, buffers could be merged in chunks in scheduleSpill.
    assertTrue(recordsPerBuffer * numExpectedSpills >= spilledRecordsCounter.getValue());
    long additionalSpillBytesWritten = additionalSpillBytesWritternCounter.getValue();
    long additionalSpillBytesRead = additionalSpillBytesReadCounter.getValue();
    // No additional spill bytes written when final merge is disabled.
    assertEquals(additionalSpillBytesWritten, 0);
    // Spill bytes read should match spill bytes written (both zero) when final merge is disabled.
    assertTrue(additionalSpillBytesWritten == additionalSpillBytesRead);
    // No additional spills when final merge is disabled.
    assertEquals(numAdditionalSpillsCounter.getValue(), 0);
    assertTrue(lastEvents.size() > 0);
    // Get the last event
    int index = lastEvents.size() - 1;
    assertTrue(lastEvents.get(index) instanceof CompositeDataMovementEvent);
    CompositeDataMovementEvent cdme = (CompositeDataMovementEvent) lastEvents.get(index);
    assertEquals(0, cdme.getSourceIndexStart());
    assertEquals(numOutputs, cdme.getCount());
    DataMovementEventPayloadProto eventProto = DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(cdme.getUserPayload()));
    verifyEmptyPartitions(eventProto, numRecordsWritten, numPartitions, skippedPartitions);
    if (outputRecordsCounter.getValue() > 0) {
        // Ensure that this is the last event
        assertTrue(eventProto.getLastEvent());
    }
    // Verify if all path components have spillIds when final merge is disabled
    Pattern mergePathComponentPattern = Pattern.compile("(.*)(_\\d+)");
    for (Event event : lastEvents) {
        if (!(event instanceof CompositeDataMovementEvent)) {
            continue;
        }
        cdme = (CompositeDataMovementEvent) event;
        eventProto = DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(cdme.getUserPayload()));
        assertEquals(false, eventProto.getPipelined());
        if (eventProto.hasPathComponent()) {
            // for final merge disabled cases, it should have _spillId
            Matcher matcher = mergePathComponentPattern.matcher(eventProto.getPathComponent());
            assertTrue("spill id should be present in path component " + eventProto.getPathComponent(), matcher.matches());
            assertEquals(2, matcher.groupCount());
            assertEquals(uniqueId, matcher.group(1));
            assertTrue("spill id should be present in path component", matcher.group(2) != null);
            Path outputPath = new Path(outputContext.getWorkDirs()[0], "output/" + eventProto.getPathComponent() + "/" + Constants.TEZ_RUNTIME_TASK_OUTPUT_FILENAME_STRING);
            Path indexPath = outputPath.suffix(Constants.TEZ_RUNTIME_TASK_OUTPUT_INDEX_SUFFIX_STRING);
            assertEquals("Incorrect output permissions", (short) 0640, localFs.getFileStatus(outputPath).getPermission().toShort());
            assertEquals("Incorrect index permissions", (short) 0640, localFs.getFileStatus(indexPath).getPermission().toShort());
        } else {
            assertEquals(0, eventProto.getSpillId());
            if (outputRecordsCounter.getValue() > 0) {
                assertEquals(true, eventProto.getLastEvent());
            } else {
                byte[] emptyPartitions = TezCommonUtils.decompressByteStringToByteArray(eventProto.getEmptyPartitions());
                BitSet emptyPartitionBits = TezUtilsInternal.fromByteArray(emptyPartitions);
                assertEquals(numPartitions, emptyPartitionBits.cardinality());
            }
        }
    }
    verify(outputContext, atLeast(1)).notifyProgress();
    // Verify if all spill files are available.
    TezTaskOutput taskOutput = new TezTaskOutputFiles(conf, uniqueId, dagId);
    if (numRecordsWritten > 0) {
        int numSpills = kvWriter.numSpills.get();
        for (int i = 0; i < numSpills; i++) {
            assertTrue(localFs.exists(taskOutput.getSpillFileForWrite(i, 10)));
            assertTrue(localFs.exists(taskOutput.getSpillIndexFileForWrite(i, 10)));
        }
    } else {
        return;
    }
}
Also used : TezTaskOutputFiles(org.apache.tez.runtime.library.common.task.local.output.TezTaskOutputFiles) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TezRuntimeConfiguration(org.apache.tez.runtime.library.api.TezRuntimeConfiguration) Matcher(java.util.regex.Matcher) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) ByteString(com.google.protobuf.ByteString) Configurable(org.apache.hadoop.conf.Configurable) TezCounter(org.apache.tez.common.counters.TezCounter) List(java.util.List) DataMovementEventPayloadProto(org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.DataMovementEventPayloadProto) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) LongWritable(org.apache.hadoop.io.LongWritable) IntWritable(org.apache.hadoop.io.IntWritable) Path(org.apache.hadoop.fs.Path) Pattern(java.util.regex.Pattern) BitSet(java.util.BitSet) TezCounters(org.apache.tez.common.counters.TezCounters) OutputContext(org.apache.tez.runtime.api.OutputContext) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) TaskFailureType(org.apache.tez.runtime.api.TaskFailureType) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) Event(org.apache.tez.runtime.api.Event) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) TezTaskOutput(org.apache.tez.runtime.library.common.task.local.output.TezTaskOutput)
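
The shouldCompress branch above uses the standard Hadoop pattern of constructing a DefaultCodec and handing it the Configuration before use. A minimal standalone sketch of that same pattern (the class name and the in-memory buffer are illustrative, not part of the Tez test):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.DefaultCodec;

public class CodecSetupSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Same setup as the test: DefaultCodec implements Configurable, so it must
        // be given the Configuration before it can create streams.
        CompressionCodec codec = new DefaultCodec();
        ((Configurable) codec).setConf(conf);
        // Wrap any OutputStream to produce compressed bytes (an in-memory buffer here).
        ByteArrayOutputStream raw = new ByteArrayOutputStream();
        CompressionOutputStream out = codec.createOutputStream(raw);
        out.write("hello compression".getBytes(StandardCharsets.UTF_8));
        out.close();
        System.out.println("compressed size: " + raw.size());
    }
}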

Example 42 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project tez by apache.

the class UnorderedKVInput method start.

@Override
public synchronized void start() throws IOException {
    if (!isStarted.get()) {
        // //// Initial configuration
        memoryUpdateCallbackHandler.validateUpdateReceived();
        CompressionCodec codec;
        if (ConfigUtils.isIntermediateInputCompressed(conf)) {
            Class<? extends CompressionCodec> codecClass = ConfigUtils.getIntermediateInputCompressorClass(conf, DefaultCodec.class);
            codec = ReflectionUtils.newInstance(codecClass, conf);
        } else {
            codec = null;
        }
        boolean compositeFetch = ShuffleUtils.isTezShuffleHandler(conf);
        boolean ifileReadAhead = conf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD, TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD_DEFAULT);
        int ifileReadAheadLength = 0;
        int ifileBufferSize = 0;
        if (ifileReadAhead) {
            ifileReadAheadLength = conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD_BYTES, TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD_BYTES_DEFAULT);
        }
        ifileBufferSize = conf.getInt("io.file.buffer.size", TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_BUFFER_SIZE_DEFAULT);
        this.inputManager = new SimpleFetchedInputAllocator(TezUtilsInternal.cleanVertexName(getContext().getSourceVertexName()), getContext().getUniqueIdentifier(), getContext().getDagIdentifier(), conf, getContext().getTotalMemoryAvailableToTask(), memoryUpdateCallbackHandler.getMemoryAssigned());
        this.shuffleManager = new ShuffleManager(getContext(), conf, getNumPhysicalInputs(), ifileBufferSize, ifileReadAhead, ifileReadAheadLength, codec, inputManager);
        this.inputEventHandler = new ShuffleInputEventHandlerImpl(getContext(), shuffleManager, inputManager, codec, ifileReadAhead, ifileReadAheadLength, compositeFetch);
        // //// End of Initial configuration
        this.shuffleManager.run();
        this.kvReader = createReader(inputRecordCounter, codec, ifileBufferSize, ifileReadAhead, ifileReadAheadLength);
        List<Event> pending = new LinkedList<Event>();
        pendingEvents.drainTo(pending);
        if (pending.size() > 0) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(getContext().getSourceVertexName() + ": " + "NoAutoStart delay in processing first event: " + (System.currentTimeMillis() - firstEventReceivedTime));
            }
            inputEventHandler.handleEvents(pending);
        }
        isStarted.set(true);
    }
}
Also used : ShuffleInputEventHandlerImpl(org.apache.tez.runtime.library.common.shuffle.impl.ShuffleInputEventHandlerImpl) ShuffleManager(org.apache.tez.runtime.library.common.shuffle.impl.ShuffleManager) Event(org.apache.tez.runtime.api.Event) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) SimpleFetchedInputAllocator(org.apache.tez.runtime.library.common.shuffle.impl.SimpleFetchedInputAllocator)
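
The compression lookup in start() goes through Tez's ConfigUtils helpers. As a hedged sketch of only the underlying mechanism (the configuration key below is invented for illustration; the real key names live inside ConfigUtils), resolving a codec class from a Configuration and instantiating it with ReflectionUtils typically looks like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class CodecFromConfSketch {
    // Hypothetical key used only for this sketch.
    private static final String CODEC_CLASS_KEY = "example.intermediate.input.codec.class";

    static CompressionCodec resolveCodec(Configuration conf, boolean compressed) {
        if (!compressed) {
            // Uncompressed intermediate data: downstream readers receive a null codec.
            return null;
        }
        Class<? extends CompressionCodec> codecClass =
            conf.getClass(CODEC_CLASS_KEY, DefaultCodec.class, CompressionCodec.class);
        // ReflectionUtils.newInstance also calls setConf() when the codec is Configurable.
        return ReflectionUtils.newInstance(codecClass, conf);
    }
}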

Example 43 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project tez by apache.

the class TestGroupedSplits method testGzip.

/**
 * Test using the gzip codec for reading
 */
@Test(timeout = 10000)
public void testGzip() throws IOException {
    JobConf job = new JobConf(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, job);
    localFs.delete(workDir, true);
    writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "is\ngzip\n");
    writeFile(localFs, new Path(workDir, "part3.txt.gz"), gzip, "one\nmore\nsplit\n");
    FileInputFormat.setInputPaths(job, workDir);
    TextInputFormat wrappedFormat = new TextInputFormat();
    wrappedFormat.configure(job);
    TezGroupedSplitsInputFormat<LongWritable, Text> format = new TezGroupedSplitsInputFormat<LongWritable, Text>();
    format.setConf(job);
    format.setInputFormat(wrappedFormat);
    // TextInputFormat will produce 3 splits
    for (int j = 1; j <= 3; ++j) {
        format.setDesiredNumberOfSplits(j);
        InputSplit[] splits = format.getSplits(job, 100);
        if (j == 1) {
            // j==1 covers single split corner case
            // and does not do grouping
            assertEquals("compressed splits == " + j, j, splits.length);
        }
        List<Text> results = new ArrayList<Text>();
        for (int i = 0; i < splits.length; ++i) {
            List<Text> read = readSplit(format, splits[i], job);
            results.addAll(read);
        }
        assertEquals("splits length", 11, results.size());
        final String[] firstList = { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
        final String[] secondList = { "is", "gzip" };
        final String[] thirdList = { "one", "more", "split" };
        String first = results.get(0).toString();
        int start = 0;
        switch(first.charAt(0)) {
            case 't':
                start = testResults(results, firstList, start);
                break;
            case 'i':
                start = testResults(results, secondList, start);
                break;
            case 'o':
                start = testResults(results, thirdList, start);
                break;
            default:
                Assert.fail("unexpected first token - " + first);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) GzipCodec(org.apache.hadoop.io.compress.GzipCodec) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Test(org.junit.Test)
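
The writeFile helper called above is not part of this snippet. A minimal sketch of what such a helper plausibly looks like, wrapping the raw stream with codec.createOutputStream so that the .gz parts contain valid gzip data (the class and method placement are illustrative):

import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;

final class WriteFileSketch {
    static void writeFile(FileSystem fs, Path name, CompressionCodec codec, String contents)
            throws IOException {
        OutputStream stm = fs.create(name);
        if (codec != null) {
            // Compress on the way out; a null codec falls back to plain text.
            stm = codec.createOutputStream(stm);
        }
        Writer writer = new OutputStreamWriter(stm);
        try {
            writer.write(contents);
        } finally {
            writer.close();
        }
    }
}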

Example 44 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project mongo-hadoop by mongodb.

the class BSONFileRecordReader method init.

public void init(final InputSplit inputSplit, final Configuration configuration) throws IOException, InterruptedException {
    this.configuration = configuration;
    fileSplit = (FileSplit) inputSplit;
    if (LOG.isDebugEnabled()) {
        LOG.debug("reading split " + fileSplit);
    }
    Path file = fileSplit.getPath();
    FileSystem fs = file.getFileSystem(configuration);
    CompressionCodec codec = new CompressionCodecFactory(configuration).getCodec(fileSplit.getPath());
    inRaw = fs.open(file, 16 * 1024 * 1024);
    inRaw.seek(startingPosition == BSON_RR_POSITION_NOT_GIVEN ? fileSplit.getStart() : startingPosition);
    if (codec != null) {
        decompressor = CodecPool.getDecompressor(codec);
        in = codec.createInputStream(inRaw, decompressor);
    } else {
        in = inRaw;
    }
    if (MongoConfigUtil.getLazyBSON(configuration)) {
        callback = new LazyBSONCallback();
        decoder = new LazyBSONDecoder();
    } else {
        callback = new BasicBSONCallback();
        decoder = new BasicBSONDecoder();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BasicBSONCallback(org.bson.BasicBSONCallback) CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) FileSystem(org.apache.hadoop.fs.FileSystem) LazyBSONCallback(org.bson.LazyBSONCallback) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) LazyBSONDecoder(org.bson.LazyBSONDecoder) BasicBSONDecoder(org.bson.BasicBSONDecoder)
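
init() borrows a Decompressor from CodecPool, and the matching return happens outside this snippet. A self-contained sketch of the same factory/pool pattern, including the cleanup expected to pair with CodecPool.getDecompressor (the class and method names are illustrative, not from mongo-hadoop):

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

final class PooledDecompressorSketch {
    static long countBytes(Configuration conf, Path file) throws IOException {
        FileSystem fs = file.getFileSystem(conf);
        // Codec is inferred from the file extension, exactly as in init() above.
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        Decompressor decompressor = null;
        InputStream in = fs.open(file);
        try {
            if (codec != null) {
                decompressor = CodecPool.getDecompressor(codec);
                in = codec.createInputStream(in, decompressor);
            }
            long total = 0;
            byte[] buf = new byte[4096];
            int n;
            while ((n = in.read(buf)) != -1) {
                total += n;
            }
            return total;
        } finally {
            in.close();
            if (decompressor != null) {
                // Returning the decompressor lets CodecPool reuse it instead of
                // allocating a new one (and its native buffers) per file.
                CodecPool.returnDecompressor(decompressor);
            }
        }
    }
}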

Example 45 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project drill by apache.

the class DrillFileSystem method openPossiblyCompressedStream.

/**
 * Returns an InputStream from a Hadoop path. If the data is compressed, this method will return a compressed
 * InputStream depending on the codec.
 * @param path Input file path
 * @return InputStream of opened file path
 * @throws IOException If the file is unreachable, unavailable or otherwise unreadable
 */
public InputStream openPossiblyCompressedStream(Path path) throws IOException {
    // infers from file ext.
    CompressionCodec codec = getCodec(path);
    InputStream inputStream = open(path);
    if (codec != null) {
        inputStream = codec.createInputStream(inputStream);
    }
    return inputStream;
}
Also used : FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) InputStream(java.io.InputStream) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec)
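
A possible usage sketch for openPossiblyCompressedStream, counting lines in a file that may or may not be compressed (the helper class and method are illustrative, and a DrillFileSystem instance is assumed to be supplied by the caller):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.drill.exec.store.dfs.DrillFileSystem;
import org.apache.hadoop.fs.Path;

final class ReadCompressedSketch {
    static long countLines(DrillFileSystem fs, Path path) throws IOException {
        // openPossiblyCompressedStream decompresses transparently based on the file extension.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.openPossiblyCompressedStream(path), StandardCharsets.UTF_8))) {
            long lines = 0;
            while (reader.readLine() != null) {
                lines++;
            }
            return lines;
        }
    }
}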

Aggregations

CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 110
Path (org.apache.hadoop.fs.Path): 53
FileSystem (org.apache.hadoop.fs.FileSystem): 41
Configuration (org.apache.hadoop.conf.Configuration): 37
CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory): 36
InputStream (java.io.InputStream): 17
Test (org.junit.Test): 17
IOException (java.io.IOException): 16
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 14
Text (org.apache.hadoop.io.Text): 14
Configurable (org.apache.hadoop.conf.Configurable): 10
GzipCodec (org.apache.hadoop.io.compress.GzipCodec): 10
JobConf (org.apache.hadoop.mapred.JobConf): 10
SequenceFile (org.apache.hadoop.io.SequenceFile): 9
OutputStream (java.io.OutputStream): 8
DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec): 8
FileInputStream (java.io.FileInputStream): 7
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 6
CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream): 6
ByteString (com.google.protobuf.ByteString): 5