
Example 6 with Encoding

Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.

From the class TestColumnChunkPageWriteStore, method test().

@Test
public void test() throws Exception {
    Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
    Path root = file.getParent();
    FileSystem fs = file.getFileSystem(conf);
    if (fs.exists(root)) {
        fs.delete(root, true);
    }
    fs.mkdirs(root);
    MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
    ColumnDescriptor col = schema.getColumns().get(0);
    Encoding dataEncoding = PLAIN;
    int valueCount = 10;
    int d = 1;
    int r = 2;
    int v = 3;
    BytesInput definitionLevels = BytesInput.fromInt(d);
    BytesInput repetitionLevels = BytesInput.fromInt(r);
    Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
    BytesInput data = BytesInput.fromInt(v);
    int rowCount = 5;
    int nullCount = 1;
    {
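        // Write phase: create the file and emit a single v2 data page into one row group.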
        ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
        writer.start();
        writer.startBlock(rowCount);
        {
            ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator());
            PageWriter pageWriter = store.getPageWriter(col);
            pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics);
            store.flushToFileWriter(writer);
        }
        writer.endBlock();
        writer.end(new HashMap<String, String>());
    }
    {
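        // Read phase: re-open the file and verify the page round-trips intact.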
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
        ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
        PageReadStore rowGroup = reader.readNextRowGroup();
        PageReader pageReader = rowGroup.getPageReader(col);
        DataPageV2 page = (DataPageV2) pageReader.readPage();
        assertEquals(rowCount, page.getRowCount());
        assertEquals(nullCount, page.getNullCount());
        assertEquals(valueCount, page.getValueCount());
        assertEquals(d, intValue(page.getDefinitionLevels()));
        assertEquals(r, intValue(page.getRepetitionLevels()));
        assertEquals(dataEncoding, page.getDataEncoding());
        assertEquals(v, intValue(page.getData()));
        assertEquals(statistics.toString(), page.getStatistics().toString());
        reader.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BytesInput(org.apache.parquet.bytes.BytesInput) HashMap(java.util.HashMap) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PageReader(org.apache.parquet.column.page.PageReader) Encoding(org.apache.parquet.column.Encoding) DataPageV2(org.apache.parquet.column.page.DataPageV2) HeapByteBufferAllocator(org.apache.parquet.bytes.HeapByteBufferAllocator) PageReadStore(org.apache.parquet.column.page.PageReadStore) FileSystem(org.apache.hadoop.fs.FileSystem) MessageType(org.apache.parquet.schema.MessageType) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)
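
The test relies on two helpers defined elsewhere in the class and not shown here: compressor(GZIP), which supplies the page compressor, and intValue, which decodes the single int that each BytesInput in this test carries. A minimal sketch of such an intValue helper, assuming the little-endian layout that BytesInput.fromInt writes:

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import org.apache.parquet.bytes.BytesInput;

// Hypothetical stand-in for the test's intValue helper: decode the single
// little-endian int that BytesInput.fromInt(v) produced.
private static int intValue(BytesInput in) throws IOException {
    byte[] bytes = in.toByteArray();
    return ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).getInt();
}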

Example 7 with Encoding

Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.

From the class TestInputFormat, method newBlock().

private BlockMetaData newBlock(long start, long compressedBlockSize) {
    BlockMetaData blockMetaData = new BlockMetaData();
    // assuming the compression ratio is 2
    long uncompressedSize = compressedBlockSize * 2;
    ColumnChunkMetaData column = ColumnChunkMetaData.get(
        ColumnPath.get("foo"), PrimitiveTypeName.BINARY, CompressionCodecName.GZIP,
        new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)), new BinaryStatistics(),
        start, 0L, 0L, compressedBlockSize, uncompressedSize);
    blockMetaData.addColumn(column);
    blockMetaData.setTotalByteSize(uncompressedSize);
    return blockMetaData;
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) Encoding(org.apache.parquet.column.Encoding) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics)
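
To make the sizing convention concrete, a hypothetical call site (names assumed for illustration): a block whose compressed footprint is 1 MiB reports an uncompressed total of 2 MiB, per the ratio-of-2 assumption in the comment above.

// Hypothetical usage of the helper above.
BlockMetaData block = newBlock(0L, 1024L * 1024L);
// getTotalByteSize() returns the uncompressed estimate set by the helper.
assert block.getTotalByteSize() == 2L * 1024L * 1024L;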

Example 8 with Encoding

Use of org.apache.parquet.column.Encoding in project drill by axbaretto.

From the class PageReader, method next().

/**
 * Grab the next page.
 *
 * @return true if another page was present and was read
 * @throws IOException
 */
public boolean next() throws IOException {
    Stopwatch timer = Stopwatch.createUnstarted();
    currentPageCount = -1;
    valuesRead = 0;
    valuesReadyToRead = 0;
    // TODO - the metadata for total size appears to be incorrect for impala-generated files; need to find the cause
    // and submit a bug report
    long totalValueCount = parentColumnReader.columnChunkMetaData.getValueCount();
    if (parentColumnReader.totalValuesRead >= totalValueCount) {
        return false;
    }
    clearBuffers();
    nextInternal();
    if (pageData == null || pageHeader == null) {
        // TODO: Is this an error condition or a normal condition??
        return false;
    }
    timer.start();
    currentPageCount = pageHeader.data_page_header.num_values;
    final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);
    final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);
    final Encoding valueEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding);
    byteLength = pageHeader.uncompressed_page_size;
    final ByteBuffer pageDataBuffer = pageData.nioBuffer(0, pageData.capacity());
    readPosInBytes = 0;
    if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
        repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
        repetitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
        // The first repetition level is always 0, and each new record starts with another 0,
        // although we don't know a record's length until we hit that next 0 (this is a one-way
        // stream of integers). Read the leading zero here so every subsequent value is read the
        // same way: effectively we 'read' the non-existent value in front of the first record,
        // giving direct access to the first list of repetition levels.
        readPosInBytes = repetitionLevels.getNextOffset();
        repetitionLevels.readInteger();
    }
    if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0) {
        parentColumnReader.currDefLevel = -1;
        definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
        definitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
        readPosInBytes = definitionLevels.getNextOffset();
        if (!valueEncoding.usesDictionary()) {
            valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
            valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
        }
    }
    if (parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
        valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
        valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    }
    if (valueEncoding.usesDictionary()) {
        // initialize two of the dictionary readers, one is for determining the lengths of each value, the second is for
        // actually copying the values out into the vectors
        dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary);
        dictionaryLengthDeterminingReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
        dictionaryValueReader = new DictionaryValuesReader(dictionary);
        dictionaryValueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
        parentColumnReader.usingDictionary = true;
    } else {
        parentColumnReader.usingDictionary = false;
    }
    // readPosInBytes is used for actually reading the values once we determine how many will fit
    // in the vector. readyToReadPosInBytes serves a similar purpose for vector types where values
    // must be counted one record at a time, such as variable-length data. Both operations must
    // start at the same location: just past the definition and repetition level data that is
    // stored alongside the page data itself.
    readyToReadPosInBytes = readPosInBytes;
    long timeDecode = timer.elapsed(TimeUnit.NANOSECONDS);
    stats.numDataPagesDecoded.incrementAndGet();
    stats.timeDataPageDecode.addAndGet(timeDecode);
    return true;
}
Also used : Stopwatch(com.google.common.base.Stopwatch) Encoding(org.apache.parquet.column.Encoding) DictionaryValuesReader(org.apache.parquet.column.values.dictionary.DictionaryValuesReader) ByteBuffer(java.nio.ByteBuffer)
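
The recurring pattern in this method is worth isolating: an Encoding acts as a factory for the appropriate ValuesReader for each stream (repetition levels, definition levels, values), each reader is positioned with initFromPage before use, and getNextOffset reports where the next section of the page begins. A minimal sketch of that pattern, assuming the same parquet-mr version the Drill code above compiles against:

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.ValuesType;
import org.apache.parquet.column.values.ValuesReader;

// Sketch: decode the repetition-level section at the front of a data page
// and return the offset at which the next section starts.
static int readRepetitionLevelSection(Encoding rlEncoding, ColumnDescriptor desc,
        int valueCount, ByteBuffer pageData) throws IOException {
    ValuesReader levels = rlEncoding.getValuesReader(desc, ValuesType.REPETITION_LEVEL);
    levels.initFromPage(valueCount, pageData, 0); // levels sit at offset 0 of the page
    return levels.getNextOffset();                // definition levels begin here
}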

Example 9 with Encoding

Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.

From the class ParquetMetadataCommand, method printColumnChunk().

private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
    String[] path = column.getPath().toArray();
    PrimitiveType type = primitive(schema, path);
    Preconditions.checkNotNull(type);
    ColumnDescriptor desc = schema.getColumnDescription(path);
    long size = column.getTotalSize();
    long count = column.getValueCount();
    float perValue = ((float) size) / count;
    CompressionCodecName codec = column.getCodec();
    Set<Encoding> encodings = column.getEncodings();
    EncodingStats encodingStats = column.getEncodingStats();
    String encodingSummary = encodingStats == null ? encodingsAsString(encodings, desc) : encodingStatsAsString(encodingStats);
    Statistics stats = column.getStatistics();
    String name = column.getPath().toDotString();
    PrimitiveType.PrimitiveTypeName typeName = type.getPrimitiveTypeName();
    if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
        console.info(String.format("%-" + width + "s  FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
            name, type.getTypeLength(), shortCodec(codec), encodingSummary, count,
            humanReadable(perValue),
            stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
            minMaxAsString(stats, type.getOriginalType())));
    } else {
        console.info(String.format("%-" + width + "s  %-9s %s %-7s %-9d %-10s %-7s %s",
            name, typeName, shortCodec(codec), encodingSummary, count,
            humanReadable(perValue),
            stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
            minMaxAsString(stats, type.getOriginalType())));
    }
}
Also used : EncodingStats(org.apache.parquet.column.EncodingStats) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PrimitiveType(org.apache.parquet.schema.PrimitiveType) Encoding(org.apache.parquet.column.Encoding) Util.minMaxAsString(org.apache.parquet.cli.Util.minMaxAsString) Util.encodingsAsString(org.apache.parquet.cli.Util.encodingsAsString) Util.encodingStatsAsString(org.apache.parquet.cli.Util.encodingStatsAsString) Statistics(org.apache.parquet.column.statistics.Statistics)
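
When EncodingStats is absent (files written before it was recorded), the command falls back to encodingsAsString, which only has the flat Set<Encoding> to work with. The real implementation lives in org.apache.parquet.cli.Util; a hypothetical, reduced version using the same one-letter convention as Example 10 below:

import java.util.Set;
import org.apache.parquet.column.Encoding;

// Hypothetical fallback (not the real Util.encodingsAsString): summarize a raw
// Set<Encoding> with the one-letter flags used in Example 10.
static String summarizeEncodings(Set<Encoding> encodings) {
    StringBuilder sb = new StringBuilder();
    if (encodings.contains(Encoding.RLE_DICTIONARY)
            || encodings.contains(Encoding.PLAIN_DICTIONARY)) {
        sb.append("R"); // dictionary-encoded data pages
    }
    if (encodings.contains(Encoding.PLAIN)) {
        sb.append("_"); // plain-encoded pages
    }
    if (encodings.contains(Encoding.DELTA_BYTE_ARRAY)
            || encodings.contains(Encoding.DELTA_BINARY_PACKED)
            || encodings.contains(Encoding.DELTA_LENGTH_BYTE_ARRAY)) {
        sb.append("D"); // delta-encoded pages
    }
    return sb.toString();
}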

Example 10 with Encoding

Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.

From the class Util, method encodingStatsAsString().

public static String encodingStatsAsString(EncodingStats encodingStats) {
    StringBuilder sb = new StringBuilder();
    if (encodingStats.hasDictionaryPages()) {
        for (Encoding encoding : encodingStats.getDictionaryEncodings()) {
            sb.append(encodingAsString(encoding, true));
        }
        sb.append(" ");
    } else {
        sb.append("  ");
    }
    Set<Encoding> encodings = encodingStats.getDataEncodings();
    if (encodings.contains(RLE_DICTIONARY) || encodings.contains(PLAIN_DICTIONARY)) {
        sb.append("R");
    }
    if (encodings.contains(PLAIN)) {
        sb.append("_");
    }
    if (encodings.contains(DELTA_BYTE_ARRAY) || encodings.contains(DELTA_BINARY_PACKED) || encodings.contains(DELTA_LENGTH_BYTE_ARRAY)) {
        sb.append("D");
    }
    // Check for fallback and add a flag
    if (encodingStats.hasDictionaryEncodedPages() && encodingStats.hasNonDictionaryEncodedPages()) {
        sb.append(" F");
    }
    return sb.toString();
}
Also used : Encoding(org.apache.parquet.column.Encoding)
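
A usage sketch tying this back to the footer-reading code from Example 6: iterate a footer's column chunks and print each one's flag string, guarding against the null EncodingStats that Example 9 also checks for.

import org.apache.parquet.cli.Util;
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

// Sketch: print the compact encoding summary for every column chunk of a
// footer that was already read (e.g. via ParquetFileReader.readFooter).
static void printEncodingFlags(ParquetMetadata footer) {
    for (BlockMetaData block : footer.getBlocks()) {
        for (ColumnChunkMetaData column : block.getColumns()) {
            EncodingStats stats = column.getEncodingStats();
            if (stats != null) { // null for files written by older writers
                System.out.println(column.getPath().toDotString()
                    + " -> " + Util.encodingStatsAsString(stats));
            }
        }
    }
}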

Aggregations

Encoding (org.apache.parquet.column.Encoding): 16
Path (org.apache.hadoop.fs.Path): 6
Test (org.junit.Test): 6
Configuration (org.apache.hadoop.conf.Configuration): 5
FileSystem (org.apache.hadoop.fs.FileSystem): 4
PageReadStore (org.apache.parquet.column.page.PageReadStore): 4
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 4
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 4
File (java.io.File): 3
HashMap (java.util.HashMap): 3
EncodingStats (org.apache.parquet.column.EncodingStats): 3
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 3
MessageType (org.apache.parquet.schema.MessageType): 3
Stopwatch (com.google.common.base.Stopwatch): 2
ByteBuffer (java.nio.ByteBuffer): 2
HashSet (java.util.HashSet): 2
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 2
BytesInput (org.apache.parquet.bytes.BytesInput): 2
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 2
WriterVersion (org.apache.parquet.column.ParquetProperties.WriterVersion): 2