Search in sources :

Example 1 with DataPageV2

Use of org.apache.parquet.column.page.DataPageV2 in the Apache parquet-mr project.

Class TestColumnChunkPageWriteStore, method test.

@Test
public void test() throws Exception {
    // Writes a single V2 data page into a fresh parquet file, then reads it
    // back and checks that every page attribute round-trips unchanged.
    Path outputFile = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
    Path outputDir = outputFile.getParent();
    FileSystem fileSystem = outputFile.getFileSystem(conf);
    if (fileSystem.exists(outputDir)) {
        fileSystem.delete(outputDir, true);
    }
    fileSystem.mkdirs(outputDir);
    MessageType messageType = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
    ColumnDescriptor column = messageType.getColumns().get(0);
    Encoding encoding = PLAIN;
    // Page metadata and payload used both when writing and when verifying.
    int totalValues = 10;
    int defLevelValue = 1;
    int repLevelValue = 2;
    int dataValue = 3;
    int totalRows = 5;
    int totalNulls = 1;
    BytesInput defLevels = BytesInput.fromInt(defLevelValue);
    BytesInput repLevels = BytesInput.fromInt(repLevelValue);
    BytesInput payload = BytesInput.fromInt(dataValue);
    Statistics<?> stats = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
    {
        // Write phase: one row group containing one GZIP-compressed V2 page.
        ParquetFileWriter fileWriter = new ParquetFileWriter(conf, messageType, outputFile);
        fileWriter.start();
        fileWriter.startBlock(totalRows);
        {
            ColumnChunkPageWriteStore pageStore = new ColumnChunkPageWriteStore(compressor(GZIP), messageType, new HeapByteBufferAllocator());
            PageWriter columnPageWriter = pageStore.getPageWriter(column);
            columnPageWriter.writePageV2(totalRows, totalNulls, totalValues, repLevels, defLevels, encoding, payload, stats);
            pageStore.flushToFileWriter(fileWriter);
        }
        fileWriter.endBlock();
        fileWriter.end(new HashMap<String, String>());
    }
    {
        // Read phase: re-open the file and verify the page survived intact.
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, outputFile, NO_FILTER);
        ParquetFileReader fileReader = new ParquetFileReader(conf, footer.getFileMetaData(), outputFile, footer.getBlocks(), messageType.getColumns());
        PageReadStore rowGroup = fileReader.readNextRowGroup();
        PageReader columnPageReader = rowGroup.getPageReader(column);
        DataPageV2 readBack = (DataPageV2) columnPageReader.readPage();
        assertEquals(totalRows, readBack.getRowCount());
        assertEquals(totalNulls, readBack.getNullCount());
        assertEquals(totalValues, readBack.getValueCount());
        assertEquals(defLevelValue, intValue(readBack.getDefinitionLevels()));
        assertEquals(repLevelValue, intValue(readBack.getRepetitionLevels()));
        assertEquals(encoding, readBack.getDataEncoding());
        assertEquals(dataValue, intValue(readBack.getData()));
        assertEquals(stats.toString(), readBack.getStatistics().toString());
        fileReader.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BytesInput(org.apache.parquet.bytes.BytesInput) HashMap(java.util.HashMap) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PageReader(org.apache.parquet.column.page.PageReader) Encoding(org.apache.parquet.column.Encoding) DataPageV2(org.apache.parquet.column.page.DataPageV2) HeapByteBufferAllocator(org.apache.parquet.bytes.HeapByteBufferAllocator) PageReadStore(org.apache.parquet.column.page.PageReadStore) FileSystem(org.apache.hadoop.fs.FileSystem) MessageType(org.apache.parquet.schema.MessageType) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)

Example 2 with DataPageV2

Use of org.apache.parquet.column.page.DataPageV2 in the Apache parquet-mr project.

Class DumpCommand, method dump.

public static void dump(final PrettyPrintWriter out, PageReadStore store, ColumnDescriptor column) throws IOException {
    // Prints a one-line summary for the column chunk (path, totals, max
    // levels, optional dictionary info), then one line per data page.
    PageReader pageReader = store.getPageReader(column);
    long totalValues = pageReader.getTotalValueCount();
    int maxRep = column.getMaxRepetitionLevel();
    int maxDef = column.getMaxDefinitionLevel();
    out.format("%s TV=%d RL=%d DL=%d", Joiner.on('.').skipNulls().join(column.getPath()), totalValues, maxRep, maxDef);
    DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
    if (dictionaryPage != null) {
        out.format(" DS:%d", dictionaryPage.getDictionarySize());
        out.format(" DE:%s", dictionaryPage.getEncoding());
    }
    out.println();
    out.rule('-');
    long pageIndex = 0;
    DataPage currentPage = pageReader.readPage();
    while (currentPage != null) {
        out.format("page %d:", pageIndex);
        // The visitor distinguishes V1 pages (per-page level encodings) from
        // V2 pages (levels are always RLE-encoded in the V2 format).
        currentPage.accept(new Visitor<Void>() {

            @Override
            public Void visit(DataPageV1 pageV1) {
                out.format(" DLE:%s", pageV1.getDlEncoding());
                out.format(" RLE:%s", pageV1.getRlEncoding());
                out.format(" VLE:%s", pageV1.getValueEncoding());
                Statistics<?> pageStats = pageV1.getStatistics();
                if (pageStats == null) {
                    out.format(" ST:[none]");
                } else {
                    out.format(" ST:[%s]", pageStats);
                }
                return null;
            }

            @Override
            public Void visit(DataPageV2 pageV2) {
                out.format(" DLE:RLE");
                out.format(" RLE:RLE");
                out.format(" VLE:%s", pageV2.getDataEncoding());
                Statistics<?> pageStats = pageV2.getStatistics();
                if (pageStats == null) {
                    out.format(" ST:[none]");
                } else {
                    out.format(" ST:[%s]", pageStats);
                }
                return null;
            }
        });
        out.format(" SZ:%d", currentPage.getUncompressedSize());
        out.format(" VC:%d", currentPage.getValueCount());
        out.println();
        pageIndex++;
        currentPage = pageReader.readPage();
    }
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) PageReader(org.apache.parquet.column.page.PageReader) DataPageV2(org.apache.parquet.column.page.DataPageV2) DataPageV1(org.apache.parquet.column.page.DataPageV1) Statistics(org.apache.parquet.column.statistics.Statistics) DictionaryPage(org.apache.parquet.column.page.DictionaryPage)

Example 3 with DataPageV2

Use of org.apache.parquet.column.page.DataPageV2 in the Apache Flink project.

Class AbstractColumnReader, method readToVector.

/**
 * Reads {@code readNumber} values from this column reader into {@code vector},
 * advancing through data pages as each one is exhausted.
 *
 * <p>Dictionary-encoded pages are decoded lazily where possible: the dictionary
 * is attached to the vector and only the ids are stored, unless plain-encoded
 * values have already been written to this batch, in which case ids are decoded
 * eagerly to keep the batch homogeneous.
 */
@Override
public final void readToVector(int readNumber, VECTOR vector) throws IOException {
    // Next write position in the output vector.
    int rowId = 0;
    WritableIntVector dictionaryIds = null;
    if (dictionary != null) {
        // Reserve space for dictionary ids up front; only used for
        // dictionary-encoded pages.
        dictionaryIds = vector.reserveDictionaryIds(readNumber);
    }
    while (readNumber > 0) {
        // Compute the number of values we want to read in this page.
        int leftInPage = (int) (endOfPageValueCount - valuesRead);
        if (leftInPage == 0) {
            // Current page exhausted: fetch and initialize the next one.
            // readPageV1/readPageV2 update endOfPageValueCount and the
            // decoder state as a side effect.
            DataPage page = pageReader.readPage();
            if (page instanceof DataPageV1) {
                readPageV1((DataPageV1) page);
            } else if (page instanceof DataPageV2) {
                readPageV2((DataPageV2) page);
            } else {
                throw new RuntimeException("Unsupported page type: " + page.getClass());
            }
            leftInPage = (int) (endOfPageValueCount - valuesRead);
        }
        // Read at most what remains in this page this iteration.
        int num = Math.min(readNumber, leftInPage);
        if (isCurrentPageDictionaryEncoded) {
            // Read and decode dictionary ids.
            runLenDecoder.readDictionaryIds(num, dictionaryIds, vector, rowId, maxDefLevel, this.dictionaryIdsDecoder);
            if (vector.hasDictionary() || (rowId == 0 && supportLazyDecode())) {
                // Column vector supports lazy decoding of dictionary values so just set the
                // dictionary.
                // We can't do this if rowId != 0 AND the column doesn't have a dictionary (i.e.
                // some
                // non-dictionary encoded values have already been added).
                vector.setDictionary(new ParquetDictionary(dictionary));
            } else {
                // Eagerly materialize values from the ids just read.
                readBatchFromDictionaryIds(rowId, num, vector, dictionaryIds);
            }
        } else {
            if (vector.hasDictionary() && rowId != 0) {
                // This batch already has dictionary encoded values but this new page is not.
                // The batch
                // does not support a mix of dictionary and not so we will decode the
                // dictionary.
                readBatchFromDictionaryIds(0, rowId, vector, vector.getDictionaryIds());
            }
            vector.setDictionary(null);
            readBatch(rowId, num, vector);
        }
        // Advance global and per-call counters by what was consumed.
        valuesRead += num;
        rowId += num;
        readNumber -= num;
    }
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) DataPageV2(org.apache.parquet.column.page.DataPageV2) DataPageV1(org.apache.parquet.column.page.DataPageV1) ParquetDictionary(org.apache.flink.formats.parquet.vector.ParquetDictionary) WritableIntVector(org.apache.flink.table.data.columnar.vector.writable.WritableIntVector)

Example 4 with DataPageV2

Use of org.apache.parquet.column.page.DataPageV2 in the Apache Hive project.

Class VectorizedPrimitiveColumnReader, method readPage.

private void readPage() throws IOException {
    // Fetch the next data page and route it to the matching decoder via
    // double dispatch. (TODO upstream: why a visitor rather than instanceof?)
    DataPage.Visitor<Void> pageDispatcher = new DataPage.Visitor<Void>() {

        @Override
        public Void visit(DataPageV1 v1) {
            readPageV1(v1);
            return null;
        }

        @Override
        public Void visit(DataPageV2 v2) {
            readPageV2(v2);
            return null;
        }
    };
    pageReader.readPage().accept(pageDispatcher);
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) DataPageV2(org.apache.parquet.column.page.DataPageV2) DataPageV1(org.apache.parquet.column.page.DataPageV1)

Example 5 with DataPageV2

Use of org.apache.parquet.column.page.DataPageV2 in the Apache parquet-mr project.

Class ColumnReaderImpl, method readPage.

private void readPage() {
    LOG.debug("loading page");
    // Delegate to the format-version-specific reader via double dispatch.
    DataPage nextPage = pageReader.readPage();
    nextPage.accept(new DataPage.Visitor<Void>() {

        @Override
        public Void visit(DataPageV1 v1) {
            readPageV1(v1);
            return null;
        }

        @Override
        public Void visit(DataPageV2 v2) {
            readPageV2(v2);
            return null;
        }
    });
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) DataPageV2(org.apache.parquet.column.page.DataPageV2) DataPageV1(org.apache.parquet.column.page.DataPageV1)

Aggregations

DataPageV2 (org.apache.parquet.column.page.DataPageV2)6 DataPage (org.apache.parquet.column.page.DataPage)5 DataPageV1 (org.apache.parquet.column.page.DataPageV1)5 PageReader (org.apache.parquet.column.page.PageReader)2 HashMap (java.util.HashMap)1 ParquetDictionary (org.apache.flink.formats.parquet.vector.ParquetDictionary)1 WritableIntVector (org.apache.flink.table.data.columnar.vector.writable.WritableIntVector)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1 BytesInput (org.apache.parquet.bytes.BytesInput)1 HeapByteBufferAllocator (org.apache.parquet.bytes.HeapByteBufferAllocator)1 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)1 Encoding (org.apache.parquet.column.Encoding)1 DictionaryPage (org.apache.parquet.column.page.DictionaryPage)1 PageReadStore (org.apache.parquet.column.page.PageReadStore)1 PageWriter (org.apache.parquet.column.page.PageWriter)1 Statistics (org.apache.parquet.column.statistics.Statistics)1 ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata)1 MessageType (org.apache.parquet.schema.MessageType)1 Test (org.junit.Test)1