Search in sources:

Example 1 with PageReader

use of org.apache.parquet.column.page.PageReader in project parquet-mr by apache.

The class TestParquetFileWriter, method validateContains.

/**
 * Asserts that the first page of the given column holds the expected value count
 * and the exact raw page bytes.
 */
private void validateContains(MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes) throws IOException {
    final PageReader columnPages = pages.getPageReader(schema.getColumnDescription(path));
    final DataPage firstPage = columnPages.readPage();
    assertEquals(values, firstPage.getValueCount());
    // V1 pages expose their raw payload via getBytes(); compare byte-for-byte
    assertArrayEquals(bytes.toByteArray(), ((DataPageV1) firstPage).getBytes().toByteArray());
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) PageReader(org.apache.parquet.column.page.PageReader) DataPageV1(org.apache.parquet.column.page.DataPageV1)

Example 2 with PageReader

use of org.apache.parquet.column.page.PageReader in project parquet-mr by apache.

The class ShowPagesCommand, method run.

/**
 * Prints a formatted summary of every dictionary and data page in the target
 * Parquet file, grouped by column.
 *
 * @return 0 on success
 * @throws IOException if the file cannot be opened or read
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
    String source = targets.get(0);
    // try-with-resources: the original never closed the reader, leaking the
    // underlying file handle on both the success and the exception path.
    try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
        MessageType schema = reader.getFileMetaData().getSchema();
        // Resolve the columns to dump: all columns unless specific ones were requested.
        Map<ColumnDescriptor, PrimitiveType> columns = Maps.newLinkedHashMap();
        if (this.columns == null || this.columns.isEmpty()) {
            for (ColumnDescriptor descriptor : schema.getColumns()) {
                columns.put(descriptor, primitive(schema, descriptor.getPath()));
            }
        } else {
            for (String column : this.columns) {
                columns.put(descriptor(column, schema), primitive(column, schema));
            }
        }
        // NOTE(review): assumes one codec for the whole file — taken from the first
        // chunk of the first row group; confirm this holds for mixed-codec files.
        CompressionCodecName codec = reader.getRowGroups().get(0).getColumns().get(0).getCodec();
        // accumulate formatted lines to print by column
        Map<String, List<String>> formatted = Maps.newLinkedHashMap();
        PageFormatter formatter = new PageFormatter();
        PageReadStore pageStore;
        int rowGroupNum = 0;
        while ((pageStore = reader.readNextRowGroup()) != null) {
            for (ColumnDescriptor descriptor : columns.keySet()) {
                // computeIfAbsent replaces the original get/null-check/put idiom
                List<String> lines = formatted.computeIfAbsent(columnName(descriptor), k -> Lists.newArrayList());
                formatter.setContext(rowGroupNum, columns.get(descriptor), codec);
                PageReader pages = pageStore.getPageReader(descriptor);
                // the dictionary page, if any, is listed before the data pages
                DictionaryPage dict = pages.readDictionaryPage();
                if (dict != null) {
                    lines.add(formatter.format(dict));
                }
                DataPage page;
                while ((page = pages.readPage()) != null) {
                    lines.add(formatter.format(page));
                }
            }
            rowGroupNum += 1;
        }
        // TODO: Show total column size and overall size per value in the column summary line
        for (String columnName : formatted.keySet()) {
            console.info(String.format("\nColumn: %s\n%s", columnName, StringUtils.leftPad("", 80, '-')));
            console.info(formatter.getHeader());
            for (String line : formatted.get(columnName)) {
                console.info(line);
            }
            console.info("");
        }
    }
    return 0;
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PageReader(org.apache.parquet.column.page.PageReader) Util.minMaxAsString(org.apache.parquet.cli.Util.minMaxAsString) Util.encodingAsString(org.apache.parquet.cli.Util.encodingAsString) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) PageReadStore(org.apache.parquet.column.page.PageReadStore) PrimitiveType(org.apache.parquet.schema.PrimitiveType) List(java.util.List) MessageType(org.apache.parquet.schema.MessageType) DictionaryPage(org.apache.parquet.column.page.DictionaryPage)

Example 3 with PageReader

use of org.apache.parquet.column.page.PageReader in project parquet-mr by apache.

The class TestColumnChunkPageWriteStore, method test.

/**
 * Round-trip test: writes a single v2 data page through
 * {@code ColumnChunkPageWriteStore} and verifies every field (row/null/value
 * counts, levels, encoding, data, statistics) survives a read back.
 */
@Test
public void test() throws Exception {
    Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
    Path root = file.getParent();
    FileSystem fs = file.getFileSystem(conf);
    // start from a clean output directory
    if (fs.exists(root)) {
        fs.delete(root, true);
    }
    fs.mkdirs(root);
    MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
    ColumnDescriptor col = schema.getColumns().get(0);
    Encoding dataEncoding = PLAIN;
    int valueCount = 10;
    // sentinel ints encoded as the level/data payloads so we can verify them on read
    int d = 1;
    int r = 2;
    int v = 3;
    BytesInput definitionLevels = BytesInput.fromInt(d);
    BytesInput repetitionLevels = BytesInput.fromInt(r);
    Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
    BytesInput data = BytesInput.fromInt(v);
    int rowCount = 5;
    int nullCount = 1;
    {
        // write phase: one block containing one GZIP-compressed v2 page
        ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
        writer.start();
        writer.startBlock(rowCount);
        {
            ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator());
            PageWriter pageWriter = store.getPageWriter(col);
            pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics);
            store.flushToFileWriter(writer);
        }
        writer.endBlock();
        writer.end(new HashMap<String, String>());
    }
    {
        // read phase: try-with-resources closes the reader even when an assertion
        // fails (the original's explicit close() was skipped on any failure).
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
        try (ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns())) {
            PageReadStore rowGroup = reader.readNextRowGroup();
            PageReader pageReader = rowGroup.getPageReader(col);
            DataPageV2 page = (DataPageV2) pageReader.readPage();
            assertEquals(rowCount, page.getRowCount());
            assertEquals(nullCount, page.getNullCount());
            assertEquals(valueCount, page.getValueCount());
            assertEquals(d, intValue(page.getDefinitionLevels()));
            assertEquals(r, intValue(page.getRepetitionLevels()));
            assertEquals(dataEncoding, page.getDataEncoding());
            assertEquals(v, intValue(page.getData()));
            assertEquals(statistics.toString(), page.getStatistics().toString());
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BytesInput(org.apache.parquet.bytes.BytesInput) HashMap(java.util.HashMap) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PageReader(org.apache.parquet.column.page.PageReader) Encoding(org.apache.parquet.column.Encoding) DataPageV2(org.apache.parquet.column.page.DataPageV2) HeapByteBufferAllocator(org.apache.parquet.bytes.HeapByteBufferAllocator) PageReadStore(org.apache.parquet.column.page.PageReadStore) FileSystem(org.apache.hadoop.fs.FileSystem) MessageType(org.apache.parquet.schema.MessageType) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)

Example 4 with PageReader

use of org.apache.parquet.column.page.PageReader in project parquet-mr by apache.

The class DumpCommand, method dump.

/**
 * Writes a one-line header for the column (path, total values, max levels),
 * dictionary-page details if present, and then one line per data page.
 */
public static void dump(final PrettyPrintWriter out, PageReadStore store, ColumnDescriptor column) throws IOException {
    final PageReader reader = store.getPageReader(column);
    final long totalValues = reader.getTotalValueCount();
    final int maxRepetition = column.getMaxRepetitionLevel();
    final int maxDefinition = column.getMaxDefinitionLevel();
    out.format("%s TV=%d RL=%d DL=%d", Joiner.on('.').skipNulls().join(column.getPath()), totalValues, maxRepetition, maxDefinition);
    final DictionaryPage dictionary = reader.readDictionaryPage();
    if (dictionary != null) {
        out.format(" DS:%d", dictionary.getDictionarySize());
        out.format(" DE:%s", dictionary.getEncoding());
    }
    out.println();
    out.rule('-');
    long pageIndex = 0;
    for (DataPage current = reader.readPage(); current != null; current = reader.readPage()) {
        out.format("page %d:", pageIndex++);
        // the visitor prints the version-specific encoding fields
        current.accept(new Visitor<Void>() {

            @Override
            public Void visit(DataPageV1 pageV1) {
                out.format(" DLE:%s", pageV1.getDlEncoding());
                out.format(" RLE:%s", pageV1.getRlEncoding());
                out.format(" VLE:%s", pageV1.getValueEncoding());
                appendStatistics(out, pageV1.getStatistics());
                return null;
            }

            @Override
            public Void visit(DataPageV2 pageV2) {
                // v2 pages always RLE-encode both level streams
                out.format(" DLE:RLE");
                out.format(" RLE:RLE");
                out.format(" VLE:%s", pageV2.getDataEncoding());
                appendStatistics(out, pageV2.getStatistics());
                return null;
            }
        });
        out.format(" SZ:%d", current.getUncompressedSize());
        out.format(" VC:%d", current.getValueCount());
        out.println();
    }
}

/** Writes the " ST:[...]" statistics field shared by both page-version branches. */
private static void appendStatistics(PrettyPrintWriter out, Statistics<?> statistics) {
    if (statistics != null) {
        out.format(" ST:[%s]", statistics);
    } else {
        out.format(" ST:[none]");
    }
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) PageReader(org.apache.parquet.column.page.PageReader) DataPageV2(org.apache.parquet.column.page.DataPageV2) DataPageV1(org.apache.parquet.column.page.DataPageV1) Statistics(org.apache.parquet.column.statistics.Statistics) DictionaryPage(org.apache.parquet.column.page.DictionaryPage)

Example 5 with PageReader

use of org.apache.parquet.column.page.PageReader in project drill by apache.

The class ParquetRecordReaderTest, method validateContains.

/**
 * Asserts that the first (v1) page of the given column carries the expected
 * value count and exact raw bytes.
 */
private void validateContains(MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes) throws IOException {
    final PageReader columnPages = pages.getPageReader(schema.getColumnDescription(path));
    final DataPageV1 firstPage = (DataPageV1) columnPages.readPage();
    assertEquals(values, firstPage.getValueCount());
    assertArrayEquals(bytes.toByteArray(), firstPage.getBytes().toByteArray());
}
Also used : PageReader(org.apache.parquet.column.page.PageReader) DataPageV1(org.apache.parquet.column.page.DataPageV1)

Aggregations

PageReader (org.apache.parquet.column.page.PageReader)7 DataPage (org.apache.parquet.column.page.DataPage)4 DataPageV1 (org.apache.parquet.column.page.DataPageV1)4 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)3 DataPageV2 (org.apache.parquet.column.page.DataPageV2)2 DictionaryPage (org.apache.parquet.column.page.DictionaryPage)2 PageReadStore (org.apache.parquet.column.page.PageReadStore)2 PageWriter (org.apache.parquet.column.page.PageWriter)2 MessageType (org.apache.parquet.schema.MessageType)2 Test (org.junit.Test)2 HashMap (java.util.HashMap)1 List (java.util.List)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1 BytesInput (org.apache.parquet.bytes.BytesInput)1 HeapByteBufferAllocator (org.apache.parquet.bytes.HeapByteBufferAllocator)1 Util.encodingAsString (org.apache.parquet.cli.Util.encodingAsString)1 Util.minMaxAsString (org.apache.parquet.cli.Util.minMaxAsString)1 Encoding (org.apache.parquet.column.Encoding)1 MemPageStore (org.apache.parquet.column.page.mem.MemPageStore)1