Use of org.apache.hadoop.io.compress.CompressionCodec in project tez by apache.
The class TestUnorderedPartitionedKVWriter, method baseTestWithFinalMergeDisabled.
@SuppressWarnings("unchecked")
private void baseTestWithFinalMergeDisabled(int numRecords, int numPartitions, Set<Integer> skippedPartitions, boolean shouldCompress) throws IOException, InterruptedException {
  PartitionerForTest partitioner = new PartitionerForTest();
  ApplicationId appId = ApplicationId.newInstance(10000000, 1);
  TezCounters counters = new TezCounters();
  String uniqueId = UUID.randomUUID().toString();
  int dagId = 1;
  String auxiliaryService = defaultConf.get(TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID, TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID_DEFAULT);
  OutputContext outputContext = createMockOutputContext(counters, appId, uniqueId, auxiliaryService);
  Configuration conf = createConfiguration(outputContext, IntWritable.class, LongWritable.class, shouldCompress, -1);
  conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_ENABLE_FINAL_MERGE_IN_OUTPUT, false);
  conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_PIPELINED_SHUFFLE_ENABLED, false);
  CompressionCodec codec = null;
  if (shouldCompress) {
    codec = new DefaultCodec();
    ((Configurable) codec).setConf(conf);
  }
  int numOutputs = numPartitions;
  long availableMemory = 2048;
  int numRecordsWritten = 0;
  UnorderedPartitionedKVWriter kvWriter = new UnorderedPartitionedKVWriterForTest(outputContext, conf, numOutputs, availableMemory);
  int sizePerBuffer = kvWriter.sizePerBuffer;
  // IntW + LongW
  int sizePerRecord = 4 + 8;
  // Record + META_OVERHEAD
  int sizePerRecordWithOverhead = sizePerRecord + 12;
  BitSet partitionsWithData = new BitSet(numPartitions);
  IntWritable intWritable = new IntWritable();
  LongWritable longWritable = new LongWritable();
  for (int i = 0; i < numRecords; i++) {
    intWritable.set(i);
    longWritable.set(i);
    int partition = partitioner.getPartition(intWritable, longWritable, numOutputs);
    if (skippedPartitions != null && skippedPartitions.contains(partition)) {
      continue;
    }
    partitionsWithData.set(partition);
    kvWriter.write(intWritable, longWritable);
    numRecordsWritten++;
  }
  int recordsPerBuffer = sizePerBuffer / sizePerRecordWithOverhead;
  int numExpectedSpills = numRecordsWritten / recordsPerBuffer;
  ArgumentCaptor<List> eventCaptor = ArgumentCaptor.forClass(List.class);
  List<Event> lastEvents = kvWriter.close();
  if (numPartitions == 1) {
    assertEquals(true, kvWriter.skipBuffers);
  }
  // Max events sent are spills + one VM event. If there are no spills, at least the empty
  // partitions would be sent out finally.
  int spills = Math.max(1, kvWriter.numSpills.get());
  // spills + VMEvent
  assertEquals((spills + 1), lastEvents.size());
  verify(outputContext, atMost(0)).sendEvents(eventCaptor.capture());
  for (int i = 0; i < lastEvents.size(); i++) {
    Event event = lastEvents.get(i);
    if (event instanceof VertexManagerEvent) {
      // Verify the partition stats reported via the VertexManagerEvent.
      if (numRecordsWritten > 0) {
        verifyPartitionStats(((VertexManagerEvent) event), partitionsWithData);
      }
    }
  }
  verify(outputContext, never()).reportFailure(any(TaskFailureType.class), any(Throwable.class), any(String.class));
  assertNull(kvWriter.currentBuffer);
  assertEquals(0, kvWriter.availableBuffers.size());
  // Verify the counters
  TezCounter outputRecordBytesCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES);
  TezCounter outputRecordsCounter = counters.findCounter(TaskCounter.OUTPUT_RECORDS);
  TezCounter outputBytesWithOverheadCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES_WITH_OVERHEAD);
  TezCounter fileOutputBytesCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES_PHYSICAL);
  TezCounter spilledRecordsCounter = counters.findCounter(TaskCounter.SPILLED_RECORDS);
  TezCounter additionalSpillBytesWritternCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_WRITTEN);
  TezCounter additionalSpillBytesReadCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_READ);
  TezCounter numAdditionalSpillsCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILL_COUNT);
  assertEquals(numRecordsWritten * sizePerRecord, outputRecordBytesCounter.getValue());
  assertEquals(numRecordsWritten, outputRecordsCounter.getValue());
  if (outputRecordsCounter.getValue() > 0) {
    assertEquals(numRecordsWritten * sizePerRecordWithOverhead, outputBytesWithOverheadCounter.getValue());
  } else {
    assertEquals(0, outputBytesWithOverheadCounter.getValue());
  }
  long fileOutputBytes = fileOutputBytesCounter.getValue();
  if (numRecordsWritten > 0) {
    assertTrue(fileOutputBytes > 0);
    if (!shouldCompress) {
      assertTrue("fileOutputBytes=" + fileOutputBytes + ", outputRecordBytes=" + outputRecordBytesCounter.getValue(), fileOutputBytes > outputRecordBytesCounter.getValue());
    }
  } else {
    assertEquals(0, fileOutputBytes);
  }
  // Due to multiple threads, buffers could be merged in chunks in scheduleSpill.
  assertTrue(recordsPerBuffer * numExpectedSpills >= spilledRecordsCounter.getValue());
  long additionalSpillBytesWritten = additionalSpillBytesWritternCounter.getValue();
  long additionalSpillBytesRead = additionalSpillBytesReadCounter.getValue();
  // No additional spill bytes should be written when final merge is disabled.
  assertEquals(additionalSpillBytesWritten, 0);
  // Consequently, no additional spill bytes are read either.
  assertTrue(additionalSpillBytesWritten == additionalSpillBytesRead);
  // No additional spills when final merge is disabled.
  assertEquals(numAdditionalSpillsCounter.getValue(), 0);
  assertTrue(lastEvents.size() > 0);
  // Get the last event
  int index = lastEvents.size() - 1;
  assertTrue(lastEvents.get(index) instanceof CompositeDataMovementEvent);
  CompositeDataMovementEvent cdme = (CompositeDataMovementEvent) lastEvents.get(index);
  assertEquals(0, cdme.getSourceIndexStart());
  assertEquals(numOutputs, cdme.getCount());
  DataMovementEventPayloadProto eventProto = DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(cdme.getUserPayload()));
  verifyEmptyPartitions(eventProto, numRecordsWritten, numPartitions, skippedPartitions);
  if (outputRecordsCounter.getValue() > 0) {
    // Ensure that this is the last event
    assertTrue(eventProto.getLastEvent());
  }
  // Verify that all path components carry spill ids when final merge is disabled.
  Pattern mergePathComponentPattern = Pattern.compile("(.*)(_\\d+)");
  for (Event event : lastEvents) {
    if (!(event instanceof CompositeDataMovementEvent)) {
      continue;
    }
    cdme = (CompositeDataMovementEvent) event;
    eventProto = DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(cdme.getUserPayload()));
    assertEquals(false, eventProto.getPipelined());
    if (eventProto.hasPathComponent()) {
      // With final merge disabled, the path component should end in a _spillId suffix.
      Matcher matcher = mergePathComponentPattern.matcher(eventProto.getPathComponent());
      assertTrue("spill id should be present in path component " + eventProto.getPathComponent(), matcher.matches());
      assertEquals(2, matcher.groupCount());
      assertEquals(uniqueId, matcher.group(1));
      assertTrue("spill id should be present in path component", matcher.group(2) != null);
      Path outputPath = new Path(outputContext.getWorkDirs()[0], "output/" + eventProto.getPathComponent() + "/" + Constants.TEZ_RUNTIME_TASK_OUTPUT_FILENAME_STRING);
      Path indexPath = outputPath.suffix(Constants.TEZ_RUNTIME_TASK_OUTPUT_INDEX_SUFFIX_STRING);
      assertEquals("Incorrect output permissions", (short) 0640, localFs.getFileStatus(outputPath).getPermission().toShort());
      assertEquals("Incorrect index permissions", (short) 0640, localFs.getFileStatus(indexPath).getPermission().toShort());
    } else {
      assertEquals(0, eventProto.getSpillId());
      if (outputRecordsCounter.getValue() > 0) {
        assertEquals(true, eventProto.getLastEvent());
      } else {
        byte[] emptyPartitions = TezCommonUtils.decompressByteStringToByteArray(eventProto.getEmptyPartitions());
        BitSet emptyPartitionBits = TezUtilsInternal.fromByteArray(emptyPartitions);
        assertEquals(numPartitions, emptyPartitionBits.cardinality());
      }
    }
  }
  verify(outputContext, atLeast(1)).notifyProgress();
  // Verify that all spill files are available.
  TezTaskOutput taskOutput = new TezTaskOutputFiles(conf, uniqueId, dagId);
  if (numRecordsWritten > 0) {
    int numSpills = kvWriter.numSpills.get();
    for (int i = 0; i < numSpills; i++) {
      assertTrue(localFs.exists(taskOutput.getSpillFileForWrite(i, 10)));
      assertTrue(localFs.exists(taskOutput.getSpillIndexFileForWrite(i, 10)));
    }
  } else {
    return;
  }
}
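
The codec-specific part of this test is the setup near the top: instantiate DefaultCodec and hand it the Configuration through the Configurable interface before the writer uses it. Below is a minimal, self-contained sketch of that same pattern, not taken from the Tez test, showing the configured codec wrapping an ordinary output stream.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;

public class CodecSetupSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    CompressionCodec codec = new DefaultCodec();
    // DefaultCodec implements Configurable, so it needs the Configuration before use,
    // just as the test sets it before handing the codec to the writer.
    ((Configurable) codec).setConf(conf);
    ByteArrayOutputStream raw = new ByteArrayOutputStream();
    try (OutputStream compressed = codec.createOutputStream(raw)) {
      compressed.write("hello".getBytes("UTF-8"));
    }
    System.out.println("compressed size: " + raw.size());
  }
}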
Use of org.apache.hadoop.io.compress.CompressionCodec in project tez by apache.
The class UnorderedKVInput, method start.
@Override
public synchronized void start() throws IOException {
  if (!isStarted.get()) {
    // //// Initial configuration
    memoryUpdateCallbackHandler.validateUpdateReceived();
    CompressionCodec codec;
    if (ConfigUtils.isIntermediateInputCompressed(conf)) {
      Class<? extends CompressionCodec> codecClass = ConfigUtils.getIntermediateInputCompressorClass(conf, DefaultCodec.class);
      codec = ReflectionUtils.newInstance(codecClass, conf);
    } else {
      codec = null;
    }
    boolean compositeFetch = ShuffleUtils.isTezShuffleHandler(conf);
    boolean ifileReadAhead = conf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD, TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD_DEFAULT);
    int ifileReadAheadLength = 0;
    int ifileBufferSize = 0;
    if (ifileReadAhead) {
      ifileReadAheadLength = conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD_BYTES, TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD_BYTES_DEFAULT);
    }
    ifileBufferSize = conf.getInt("io.file.buffer.size", TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_BUFFER_SIZE_DEFAULT);
    this.inputManager = new SimpleFetchedInputAllocator(TezUtilsInternal.cleanVertexName(getContext().getSourceVertexName()), getContext().getUniqueIdentifier(), getContext().getDagIdentifier(), conf, getContext().getTotalMemoryAvailableToTask(), memoryUpdateCallbackHandler.getMemoryAssigned());
    this.shuffleManager = new ShuffleManager(getContext(), conf, getNumPhysicalInputs(), ifileBufferSize, ifileReadAhead, ifileReadAheadLength, codec, inputManager);
    this.inputEventHandler = new ShuffleInputEventHandlerImpl(getContext(), shuffleManager, inputManager, codec, ifileReadAhead, ifileReadAheadLength, compositeFetch);
    // //// End of Initial configuration
    this.shuffleManager.run();
    this.kvReader = createReader(inputRecordCounter, codec, ifileBufferSize, ifileReadAhead, ifileReadAheadLength);
    List<Event> pending = new LinkedList<Event>();
    pendingEvents.drainTo(pending);
    if (pending.size() > 0) {
      if (LOG.isDebugEnabled()) {
        LOG.debug(getContext().getSourceVertexName() + ": " + "NoAutoStart delay in processing first event: " + (System.currentTimeMillis() - firstEventReceivedTime));
      }
      inputEventHandler.handleEvents(pending);
    }
    isStarted.set(true);
  }
}
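
Here the codec is resolved from the job configuration through Tez's ConfigUtils helpers and instantiated with ReflectionUtils. Below is a minimal sketch of the generic Hadoop pattern such helpers typically wrap; the property name "example.compress.codec" is made up for illustration and is not a real Tez or Hadoop key.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class CodecFromConfSketch {
  public static CompressionCodec resolveCodec(Configuration conf) {
    // Read the codec class from the configuration, falling back to DefaultCodec.
    Class<? extends CompressionCodec> codecClass =
        conf.getClass("example.compress.codec", DefaultCodec.class, CompressionCodec.class);
    // ReflectionUtils also injects the Configuration into Configurable codecs.
    return ReflectionUtils.newInstance(codecClass, conf);
  }

  public static void main(String[] args) {
    Configuration conf = new Configuration();
    System.out.println(resolveCodec(conf).getClass().getName());
  }
}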
Use of org.apache.hadoop.io.compress.CompressionCodec in project tez by apache.
The class TestGroupedSplits, method testGzip.
/**
 * Test using the gzip codec for reading.
 */
@Test(timeout = 10000)
public void testGzip() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "is\ngzip\n");
  writeFile(localFs, new Path(workDir, "part3.txt.gz"), gzip, "one\nmore\nsplit\n");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat wrappedFormat = new TextInputFormat();
  wrappedFormat.configure(job);
  TezGroupedSplitsInputFormat<LongWritable, Text> format = new TezGroupedSplitsInputFormat<LongWritable, Text>();
  format.setConf(job);
  format.setInputFormat(wrappedFormat);
  // TextInputFormat will produce 3 splits
  for (int j = 1; j <= 3; ++j) {
    format.setDesiredNumberOfSplits(j);
    InputSplit[] splits = format.getSplits(job, 100);
    if (j == 1) {
      // j==1 covers single split corner case
      // and does not do grouping
      assertEquals("compressed splits == " + j, j, splits.length);
    }
    List<Text> results = new ArrayList<Text>();
    for (int i = 0; i < splits.length; ++i) {
      List<Text> read = readSplit(format, splits[i], job);
      results.addAll(read);
    }
    assertEquals("splits length", 11, results.size());
    final String[] firstList = { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
    final String[] secondList = { "is", "gzip" };
    final String[] thirdList = { "one", "more", "split" };
    String first = results.get(0).toString();
    int start = 0;
    switch (first.charAt(0)) {
      case 't':
        start = testResults(results, firstList, start);
        break;
      case 'i':
        start = testResults(results, secondList, start);
        break;
      case 'o':
        start = testResults(results, thirdList, start);
        break;
      default:
        Assert.fail("unexpected first token - " + first);
    }
  }
}
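
The test produces its gzip inputs through a writeFile(...) helper defined elsewhere in the test class and not shown in this snippet. A minimal sketch of what such a helper could look like, assuming it simply wraps the FileSystem output stream with the codec, is below; the method name writeCompressed is hypothetical.

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;

public class CompressedWriteSketch {
  static void writeCompressed(FileSystem fs, Path path, CompressionCodec codec, String contents) throws IOException {
    // createOutputStream compresses everything written to the underlying file stream,
    // so the text lands on disk gzip-compressed when a GzipCodec is passed in.
    try (Writer writer = new OutputStreamWriter(codec.createOutputStream(fs.create(path)), "UTF-8")) {
      writer.write(contents);
    }
  }
}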
Use of org.apache.hadoop.io.compress.CompressionCodec in project mongo-hadoop by mongodb.
The class BSONFileRecordReader, method init.
public void init(final InputSplit inputSplit, final Configuration configuration) throws IOException, InterruptedException {
  this.configuration = configuration;
  fileSplit = (FileSplit) inputSplit;
  if (LOG.isDebugEnabled()) {
    LOG.debug("reading split " + fileSplit);
  }
  Path file = fileSplit.getPath();
  FileSystem fs = file.getFileSystem(configuration);
  CompressionCodec codec = new CompressionCodecFactory(configuration).getCodec(fileSplit.getPath());
  inRaw = fs.open(file, 16 * 1024 * 1024);
  inRaw.seek(startingPosition == BSON_RR_POSITION_NOT_GIVEN ? fileSplit.getStart() : startingPosition);
  if (codec != null) {
    decompressor = CodecPool.getDecompressor(codec);
    in = codec.createInputStream(inRaw, decompressor);
  } else {
    in = inRaw;
  }
  if (MongoConfigUtil.getLazyBSON(configuration)) {
    callback = new LazyBSONCallback();
    decoder = new LazyBSONDecoder();
  } else {
    callback = new BasicBSONCallback();
    decoder = new BasicBSONDecoder();
  }
}
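
The init(...) method above uses the common codec-factory read pattern: pick the codec by file extension, borrow a pooled decompressor, and wrap the raw stream. Below is a minimal, self-contained sketch of that pattern; the file path is illustrative, and returning the decompressor is shown explicitly since the reader's close() method is not part of this snippet.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CodecFactoryReadSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/example.bson.gz");
    FileSystem fs = path.getFileSystem(conf);
    // The factory matches the codec by the file name extension (null if none matches).
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    Decompressor decompressor = null;
    InputStream in = fs.open(path);
    try {
      if (codec != null) {
        decompressor = CodecPool.getDecompressor(codec);
        in = codec.createInputStream(in, decompressor);
      }
      System.out.println("first byte: " + in.read());
    } finally {
      in.close();
      if (decompressor != null) {
        // Pooled decompressors must be returned so they can be reused.
        CodecPool.returnDecompressor(decompressor);
      }
    }
  }
}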
Use of org.apache.hadoop.io.compress.CompressionCodec in project drill by apache.
The class DrillFileSystem, method openPossiblyCompressedStream.
/**
 * Returns an InputStream for a Hadoop path. If the data is compressed, this method returns an
 * InputStream that decompresses it using the codec inferred from the file extension.
 * @param path Input file path
 * @return InputStream of the opened file path
 * @throws IOException If the file is unreachable, unavailable or otherwise unreadable
 */
public InputStream openPossiblyCompressedStream(Path path) throws IOException {
  // The codec is inferred from the file extension.
  CompressionCodec codec = getCodec(path);
  InputStream inputStream = open(path);
  if (codec != null) {
    inputStream = codec.createInputStream(inputStream);
  }
  return inputStream;
}
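
A short usage sketch follows. It is not taken from the Drill sources; it assumes an already-constructed DrillFileSystem instance and an illustrative path, and simply shows that compressed and uncompressed files can be read through the same call because the codec lookup happens inside the method.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.drill.exec.store.dfs.DrillFileSystem;
import org.apache.hadoop.fs.Path;

class OpenCompressedUsageSketch {
  // "dfs" is assumed to be an already-constructed DrillFileSystem; the path is illustrative.
  static String readFirstLine(DrillFileSystem dfs) throws IOException {
    Path path = new Path("/data/example.csv.gz");
    try (BufferedReader reader = new BufferedReader(
        new InputStreamReader(dfs.openPossiblyCompressedStream(path), "UTF-8"))) {
      return reader.readLine();
    }
  }
}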