Use of org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor in project parquet-mr by apache.
The class ParquetFileReader, method readDictionary.
/**
 * Reads and decompresses a dictionary page for the given column chunk.
 *
 * Returns null if the given column chunk has no dictionary page.
 *
 * @param meta a column's ColumnChunkMetaData to read the dictionary from
 * @return an uncompressed DictionaryPage or null
 * @throws IOException
 */
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
  if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY)
      && !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) {
    return null;
  }

  // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
  if (f.getPos() != meta.getStartingPos()) {
    f.seek(meta.getStartingPos());
  }

  PageHeader pageHeader = Util.readPageHeader(f);
  if (!pageHeader.isSetDictionary_page_header()) {
    // TODO: should this complain?
    return null;
  }

  DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f);
  BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec());

  return new DictionaryPage(
      decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
      compressedPage.getDictionarySize(),
      compressedPage.getEncoding());
}
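For context, a minimal sketch of the BytesInput-based decompression step that readDictionary performs, assuming a CompressionCodecFactory instance is already available (for example the one returned by the reader options above). The class and method names DictionaryDecompressSketch and decompressDictionary are illustrative only, not part of parquet-mr.

import java.io.IOException;

import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class DictionaryDecompressSketch {

  // Looks up the decompressor for the chunk's codec and decompresses the dictionary
  // payload to its known uncompressed size, mirroring the call in readDictionary above.
  static BytesInput decompressDictionary(CompressionCodecFactory codecs,
      CompressionCodecName codec, BytesInput compressedBytes, int uncompressedSize)
      throws IOException {
    BytesInputDecompressor decompressor = codecs.getDecompressor(codec);
    return decompressor.decompress(compressedBytes, uncompressedSize);
  }
}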
Use of org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor in project drill by apache.
The class PageReader, method readCompressedPageV2.
/**
 * Reads a compressed v2 data page, in which the repetition and definition level
 * sections are stored uncompressed and excluded from compression.
 * @return decompressed Parquet page data
 * @throws IOException
 */
protected DrillBuf readCompressedPageV2() throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();

  int inputSize = pageHeader.getCompressed_page_size();
  int repLevelSize = pageHeader.data_page_header_v2.getRepetition_levels_byte_length();
  int defLevelSize = pageHeader.data_page_header_v2.getDefinition_levels_byte_length();
  int compDataOffset = repLevelSize + defLevelSize;
  int outputSize = pageHeader.uncompressed_page_size;

  long start = dataReader.getPos();
  long timeToRead;

  DrillBuf inputPageData = null;
  DrillBuf outputPageData = this.allocator.buffer(outputSize);

  try {
    timer.start();
    // Read in both the uncompressed and compressed sections
    inputPageData = dataReader.getNext(inputSize);
    timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);

    this.updateStats(pageHeader, "Page Read", start, timeToRead, inputSize, inputSize);
    timer.reset();
    timer.start();
    start = dataReader.getPos();

    // Write out the uncompressed section
    // Note that the following setBytes call to read the repetition and definition level sections
    // advances readerIndex in inputPageData but not writerIndex in outputPageData.
    outputPageData.setBytes(0, inputPageData, compDataOffset);

    // Decompress from the start of compressed data to the end of the input buffer
    CompressionCodecName codecName = columnChunkMetaData.getCodec();
    BytesInputDecompressor decomp = codecFactory.getDecompressor(codecName);
    ByteBuffer input = inputPageData.nioBuffer(compDataOffset, inputSize - compDataOffset);
    ByteBuffer output = outputPageData.nioBuffer(compDataOffset, outputSize - compDataOffset);

    decomp.decompress(input, inputSize - compDataOffset, output, outputSize - compDataOffset);
    outputPageData.writerIndex(outputSize);
    timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);

    if (logger.isTraceEnabled()) {
      logger.trace("Col: {} readPos: {} Uncompressed_size: {} pageData: {}",
          columnChunkMetaData.toString(), dataReader.getPos(), outputSize,
          ByteBufUtil.hexDump(outputPageData));
    }
    this.updateStats(pageHeader, "Decompress", start, timeToRead, inputSize, outputSize);
  } finally {
    if (inputPageData != null) {
      inputPageData.release();
    }
  }

  return outputPageData;
}
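A minimal sketch of the v2 page layout handled above, using plain java.nio buffers instead of DrillBuf: the level sections at the front of the page are copied verbatim and only the trailing data section goes through the decompressor. The names PageV2DecompressSketch, decompressV2, and levelsLength are illustrative, not part of the Drill or Parquet APIs, and exact buffer-position behavior of decompress can vary by codec implementation.

import java.io.IOException;
import java.nio.ByteBuffer;

import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor;

public class PageV2DecompressSketch {

  // The first levelsLength bytes (repetition + definition levels) of a v2 page are stored
  // uncompressed and are copied as-is; only the remainder is decompressed.
  static ByteBuffer decompressV2(BytesInputDecompressor decomp, ByteBuffer compressedPage,
      int levelsLength, int uncompressedSize) throws IOException {
    int inputSize = compressedPage.remaining();
    ByteBuffer out = ByteBuffer.allocate(uncompressedSize);

    // Copy the uncompressed level sections verbatim.
    ByteBuffer levels = compressedPage.duplicate();
    levels.limit(levels.position() + levelsLength);
    out.put(levels);

    // Decompress the rest of the page into the remaining output space.
    ByteBuffer input = compressedPage.duplicate();
    input.position(input.position() + levelsLength);
    ByteBuffer output = out.duplicate(); // position is already levelsLength after the put above
    decomp.decompress(input, inputSize - levelsLength, output, uncompressedSize - levelsLength);

    out.clear(); // position 0, limit == capacity == uncompressedSize
    return out;
  }
}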
Use of org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor in project drill by apache.
The class PageReader, method readCompressedPageV1.
/**
 * Reads a compressed v1 data page or a dictionary page, both of which are compressed
 * in their entirety.
 * @return decompressed Parquet page data
 * @throws IOException
 */
protected DrillBuf readCompressedPageV1() throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();

  int inputSize = pageHeader.getCompressed_page_size();
  int outputSize = pageHeader.getUncompressed_page_size();

  long start = dataReader.getPos();
  long timeToRead;

  DrillBuf inputPageData = null;
  DrillBuf outputPageData = this.allocator.buffer(outputSize);

  try {
    timer.start();
    inputPageData = dataReader.getNext(inputSize);
    timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);

    this.updateStats(pageHeader, "Page Read", start, timeToRead, inputSize, inputSize);
    timer.reset();
    timer.start();
    start = dataReader.getPos();

    CompressionCodecName codecName = columnChunkMetaData.getCodec();
    BytesInputDecompressor decomp = codecFactory.getDecompressor(codecName);
    ByteBuffer input = inputPageData.nioBuffer(0, inputSize);
    ByteBuffer output = outputPageData.nioBuffer(0, outputSize);

    decomp.decompress(input, inputSize, output, outputSize);
    outputPageData.writerIndex(outputSize);
    timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);

    if (logger.isTraceEnabled()) {
      logger.trace("Col: {} readPos: {} Uncompressed_size: {} pageData: {}",
          columnChunkMetaData.toString(), dataReader.getPos(), outputSize,
          ByteBufUtil.hexDump(outputPageData));
    }
    this.updateStats(pageHeader, "Decompress", start, timeToRead, inputSize, outputSize);
  } finally {
    if (inputPageData != null) {
      inputPageData.release();
    }
  }

  return outputPageData;
}
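For comparison with the v2 path, a minimal sketch of whole-page decompression using the same ByteBuffer overload of BytesInputDecompressor. PageV1DecompressSketch and decompressV1 are illustrative names, not part of the Drill codebase, and a plain heap buffer stands in for DrillBuf.

import java.io.IOException;
import java.nio.ByteBuffer;

import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor;

public class PageV1DecompressSketch {

  // A v1 data page (or a dictionary page) is compressed in its entirety, so the whole
  // input buffer is handed to the decompressor in a single call.
  static ByteBuffer decompressV1(BytesInputDecompressor decomp, ByteBuffer compressedPage,
      int uncompressedSize) throws IOException {
    ByteBuffer out = ByteBuffer.allocate(uncompressedSize);
    decomp.decompress(compressedPage, compressedPage.remaining(), out, uncompressedSize);
    out.clear(); // position 0, limit == capacity == uncompressedSize
    return out;
  }
}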