
Example 1 with BytesInputDecompressor

Use of org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor in project parquet-mr by apache.

In class ParquetFileReader, method readDictionary:

/**
 * Reads and decompresses a dictionary page for the given column chunk.
 *
 * Returns null if the given column chunk has no dictionary page.
 *
 * @param meta a column's ColumnChunkMetaData to read the dictionary from
 * @return an uncompressed DictionaryPage or null
 * @throws IOException if there is an error while reading or decompressing the dictionary page
 */
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
    if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) && !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) {
        return null;
    }
    // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
    if (f.getPos() != meta.getStartingPos()) {
        f.seek(meta.getStartingPos());
    }
    PageHeader pageHeader = Util.readPageHeader(f);
    if (!pageHeader.isSetDictionary_page_header()) {
        // TODO: should this complain?
        return null;
    }
    DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f);
    BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec());
    return new DictionaryPage(
            decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
            compressedPage.getDictionarySize(),
            compressedPage.getEncoding());
}
Also used: PageHeader(org.apache.parquet.format.PageHeader) DictionaryPageHeader(org.apache.parquet.format.DictionaryPageHeader) DataPageHeader(org.apache.parquet.format.DataPageHeader) BytesInputDecompressor(org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor) DictionaryPage(org.apache.parquet.column.page.DictionaryPage)
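
The two calls at the end of readDictionary can be lifted into a small standalone helper, sketched below. The class and method names are hypothetical; getDecompressor, the BytesInput-based decompress overload, and the DictionaryPage constructor are the calls used in the example above.

import java.io.IOException;

import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class DictionaryDecompress {

    // Hypothetical helper mirroring the last two lines of readDictionary:
    // fetch the decompressor for the chunk's codec and rebuild an uncompressed page.
    static DictionaryPage decompressDictionary(CompressionCodecFactory codecFactory,
                                               CompressionCodecName codec,
                                               DictionaryPage compressedPage) throws IOException {
        BytesInputDecompressor decompressor = codecFactory.getDecompressor(codec);
        // decompress(BytesInput, int) returns a new BytesInput holding the uncompressed bytes.
        return new DictionaryPage(
                decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
                compressedPage.getDictionarySize(),
                compressedPage.getEncoding());
    }
}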

Example 2 with BytesInputDecompressor

Use of org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor in project drill by apache.

In class PageReader, method readCompressedPageV2:

/**
 * Reads a compressed v2 data page, whose repetition and definition level sections
 * are excluded from compression and precede the compressed values.
 * @return decompressed Parquet page data
 * @throws IOException if the page cannot be read or decompressed
 */
protected DrillBuf readCompressedPageV2() throws IOException {
    Stopwatch timer = Stopwatch.createUnstarted();
    int inputSize = pageHeader.getCompressed_page_size();
    int repLevelSize = pageHeader.data_page_header_v2.getRepetition_levels_byte_length();
    int defLevelSize = pageHeader.data_page_header_v2.getDefinition_levels_byte_length();
    int compDataOffset = repLevelSize + defLevelSize;
    int outputSize = pageHeader.uncompressed_page_size;
    long start = dataReader.getPos();
    long timeToRead;
    DrillBuf inputPageData = null;
    DrillBuf outputPageData = this.allocator.buffer(outputSize);
    try {
        timer.start();
        // Read in both the uncompressed and compressed sections
        inputPageData = dataReader.getNext(inputSize);
        timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
        this.updateStats(pageHeader, "Page Read", start, timeToRead, inputSize, inputSize);
        timer.reset();
        timer.start();
        start = dataReader.getPos();
        // Write out the uncompressed section
        // Note that the following setBytes call to read the repetition and definition level sections
        // advances readerIndex in inputPageData but not writerIndex in outputPageData.
        outputPageData.setBytes(0, inputPageData, compDataOffset);
        // decompress from the start of compressed data to the end of the input buffer
        CompressionCodecName codecName = columnChunkMetaData.getCodec();
        BytesInputDecompressor decomp = codecFactory.getDecompressor(codecName);
        ByteBuffer input = inputPageData.nioBuffer(compDataOffset, inputSize - compDataOffset);
        ByteBuffer output = outputPageData.nioBuffer(compDataOffset, outputSize - compDataOffset);
        decomp.decompress(input, inputSize - compDataOffset, output, outputSize - compDataOffset);
        outputPageData.writerIndex(outputSize);
        timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
        if (logger.isTraceEnabled()) {
            logger.trace("Col: {}  readPos: {}  Uncompressed_size: {}  pageData: {}", columnChunkMetaData.toString(), dataReader.getPos(), outputSize, ByteBufUtil.hexDump(outputPageData));
        }
        this.updateStats(pageHeader, "Decompress", start, timeToRead, inputSize, outputSize);
    } finally {
        if (inputPageData != null) {
            inputPageData.release();
        }
    }
    return outputPageData;
}
Also used: CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) Stopwatch(org.apache.drill.shaded.guava.com.google.common.base.Stopwatch) BytesInputDecompressor(org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor) ByteBuffer(java.nio.ByteBuffer) DrillBuf(io.netty.buffer.DrillBuf)
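
For contrast with the DrillBuf plumbing above, the following is a minimal standalone sketch of the same v2 layout using plain heap ByteBuffers. The class and method names are hypothetical, and it assumes the decompressor accepts heap buffers (a direct codec factory may require direct buffers); the ByteBuffer-based decompress overload is the one used in the example.

import java.io.IOException;
import java.nio.ByteBuffer;

import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor;

public class PageV2Decompress {

    // Hypothetical helper. 'compressedPage' holds the full page body as read from
    // the file: the uncompressed repetition/definition level bytes followed by the
    // compressed values section.
    static ByteBuffer decompressV2Page(BytesInputDecompressor decomp,
                                       ByteBuffer compressedPage,
                                       int levelsLength,
                                       int uncompressedPageSize) throws IOException {
        int compressedPageSize = compressedPage.remaining();
        ByteBuffer out = ByteBuffer.allocate(uncompressedPageSize);

        // Copy the level sections verbatim; they were written without compression.
        ByteBuffer levels = compressedPage.duplicate();
        levels.limit(levels.position() + levelsLength);
        out.put(levels);

        // Hand only the values section to the decompressor and write past the levels
        // in the output buffer (mirrors the nioBuffer(offset, length) slices above).
        ByteBuffer values = compressedPage.duplicate();
        values.position(values.position() + levelsLength);
        ByteBuffer outValues = out.slice(); // starts at position levelsLength
        decomp.decompress(values, compressedPageSize - levelsLength,
                          outValues, uncompressedPageSize - levelsLength);

        out.rewind();
        return out;
    }
}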

Example 3 with BytesInputDecompressor

Use of org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor in project drill by apache.

In class PageReader, method readCompressedPageV1:

/**
 * Reads a compressed v1 data page or a dictionary page, both of which are compressed
 * in their entirety.
 * @return decompressed Parquet page data
 * @throws IOException if the page cannot be read or decompressed
 */
protected DrillBuf readCompressedPageV1() throws IOException {
    Stopwatch timer = Stopwatch.createUnstarted();
    int inputSize = pageHeader.getCompressed_page_size();
    int outputSize = pageHeader.getUncompressed_page_size();
    long start = dataReader.getPos();
    long timeToRead;
    DrillBuf inputPageData = null;
    DrillBuf outputPageData = this.allocator.buffer(outputSize);
    try {
        timer.start();
        inputPageData = dataReader.getNext(inputSize);
        timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
        this.updateStats(pageHeader, "Page Read", start, timeToRead, inputSize, inputSize);
        timer.reset();
        timer.start();
        start = dataReader.getPos();
        CompressionCodecName codecName = columnChunkMetaData.getCodec();
        BytesInputDecompressor decomp = codecFactory.getDecompressor(codecName);
        ByteBuffer input = inputPageData.nioBuffer(0, inputSize);
        ByteBuffer output = outputPageData.nioBuffer(0, outputSize);
        decomp.decompress(input, inputSize, output, outputSize);
        outputPageData.writerIndex(outputSize);
        timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
        if (logger.isTraceEnabled()) {
            logger.trace("Col: {}  readPos: {}  Uncompressed_size: {}  pageData: {}", columnChunkMetaData.toString(), dataReader.getPos(), outputSize, ByteBufUtil.hexDump(outputPageData));
        }
        this.updateStats(pageHeader, "Decompress", start, timeToRead, inputSize, outputSize);
    } finally {
        if (inputPageData != null) {
            inputPageData.release();
        }
    }
    return outputPageData;
}
Also used: CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) Stopwatch(org.apache.drill.shaded.guava.com.google.common.base.Stopwatch) BytesInputDecompressor(org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor) ByteBuffer(java.nio.ByteBuffer) DrillBuf(io.netty.buffer.DrillBuf)
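
The v1 case is simpler because the whole page body is compressed, so a single decompress call covers it. The sketch below uses the same assumptions as the one above (hypothetical names, heap buffers).

import java.io.IOException;
import java.nio.ByteBuffer;

import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor;

public class PageV1Decompress {

    // Hypothetical helper: a v1 data page or dictionary page is compressed in its
    // entirety, so the full body goes to the decompressor in one call.
    static ByteBuffer decompressV1Page(BytesInputDecompressor decomp,
                                       ByteBuffer compressedPage,
                                       int uncompressedPageSize) throws IOException {
        ByteBuffer out = ByteBuffer.allocate(uncompressedPageSize);
        decomp.decompress(compressedPage, compressedPage.remaining(), out, uncompressedPageSize);
        out.rewind();
        return out;
    }
}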

Aggregations

BytesInputDecompressor (org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor): 3 uses
DrillBuf (io.netty.buffer.DrillBuf): 2 uses
ByteBuffer (java.nio.ByteBuffer): 2 uses
Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch): 2 uses
CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName): 2 uses
DictionaryPage (org.apache.parquet.column.page.DictionaryPage): 1 use
DataPageHeader (org.apache.parquet.format.DataPageHeader): 1 use
DictionaryPageHeader (org.apache.parquet.format.DictionaryPageHeader): 1 use
PageHeader (org.apache.parquet.format.PageHeader): 1 use