Search in sources:

Example 6 with DataPage

use of org.apache.parquet.column.page.DataPage in project parquet-mr by apache.

the class TestColumnReaderImpl method testOptional.

/**
 * Writes {@code rows} nulls to an optional binary column through {@link ColumnWriterV2},
 * flushing a page after every 1000th write, then checks that the written pages account for
 * every row and value and that reading the column back passes nothing to the converter.
 */
@Test
public void testOptional() throws Exception {
    MessageType messageType = MessageTypeParser.parseMessageType("message test { optional binary foo; }");
    ColumnDescriptor column = messageType.getColumns().get(0);
    MemPageWriter memWriter = new MemPageWriter();
    ParquetProperties properties = ParquetProperties.builder()
        .withDictionaryPageSize(1024)
        .withWriterVersion(PARQUET_2_0)
        .withPageSize(2048)
        .build();
    ColumnWriterV2 writer = new ColumnWriterV2(column, memWriter, properties);

    // Write phase: one null per row, with an explicit page flush after every 1000th write.
    for (int row = 0; row < rows; row++) {
        writer.writeNull(0, 0);
        boolean atPageBoundary = (row + 1) % 1000 == 0;
        if (atPageBoundary) {
            writer.writePage(row);
        }
    }
    writer.writePage(rows);
    writer.finalizeColumnChunk();

    // Every page is cast to DataPageV2 to read its row count.
    List<DataPage> writtenPages = memWriter.getPages();
    int totalValues = 0;
    int totalRows = 0;
    for (DataPage current : writtenPages) {
        totalValues += current.getValueCount();
        DataPageV2 v2Page = (DataPageV2) current;
        totalRows += v2Page.getRowCount();
    }
    assertEquals(rows, totalRows);
    assertEquals(rows, totalValues);

    // Read phase: both levels must be 0 for every entry; no value is written to the converter.
    MemPageReader memReader = new MemPageReader((long) rows, writtenPages.iterator(), memWriter.getDictionaryPage());
    ValidatingConverter validating = new ValidatingConverter();
    ColumnReader reader = new ColumnReaderImpl(column, memReader, validating, VersionParser.parse(Version.FULL_VERSION));
    int consumed = 0;
    while (consumed < rows) {
        assertEquals(0, reader.getCurrentRepetitionLevel());
        assertEquals(0, reader.getCurrentDefinitionLevel());
        reader.consume();
        consumed++;
    }
    assertEquals(0, validating.count);
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) MemPageWriter(org.apache.parquet.column.page.mem.MemPageWriter) ColumnReader(org.apache.parquet.column.ColumnReader) MessageType(org.apache.parquet.schema.MessageType) MemPageReader(org.apache.parquet.column.page.mem.MemPageReader) Test(org.junit.Test)

Example 7 with DataPage

use of org.apache.parquet.column.page.DataPage in project parquet-mr by apache.

the class TestColumnReaderImpl method test.

/**
 * Writes {@code rows} values ("bar0" through "bar9", cycling) to a required binary column
 * through {@link ColumnWriterV2}, flushing a page after every 1000th write, then checks that
 * the written pages account for every row and value and that reading the column back passes
 * one value per row to the converter.
 */
@Test
public void test() throws Exception {
    MessageType messageType = MessageTypeParser.parseMessageType("message test { required binary foo; }");
    ColumnDescriptor column = messageType.getColumns().get(0);
    MemPageWriter memWriter = new MemPageWriter();
    ParquetProperties properties = ParquetProperties.builder()
        .withDictionaryPageSize(1024)
        .withWriterVersion(PARQUET_2_0)
        .withPageSize(2048)
        .build();
    ColumnWriterV2 writer = new ColumnWriterV2(column, memWriter, properties);

    // Write phase: ten distinct strings repeated, with a page flush after every 1000th write.
    for (int row = 0; row < rows; row++) {
        Binary value = Binary.fromString("bar" + row % 10);
        writer.write(value, 0, 0);
        boolean atPageBoundary = (row + 1) % 1000 == 0;
        if (atPageBoundary) {
            writer.writePage(row);
        }
    }
    writer.writePage(rows);
    writer.finalizeColumnChunk();

    // Every page is cast to DataPageV2 to read its row count.
    List<DataPage> writtenPages = memWriter.getPages();
    int totalValues = 0;
    int totalRows = 0;
    for (DataPage current : writtenPages) {
        totalValues += current.getValueCount();
        DataPageV2 v2Page = (DataPageV2) current;
        totalRows += v2Page.getRowCount();
    }
    assertEquals(rows, totalRows);
    assertEquals(rows, totalValues);

    // Read phase: both levels must be 0 for every entry; each value is pushed to the converter.
    MemPageReader memReader = new MemPageReader((long) rows, writtenPages.iterator(), memWriter.getDictionaryPage());
    ValidatingConverter validating = new ValidatingConverter();
    ColumnReader reader = new ColumnReaderImpl(column, memReader, validating, VersionParser.parse(Version.FULL_VERSION));
    int consumed = 0;
    while (consumed < rows) {
        assertEquals(0, reader.getCurrentRepetitionLevel());
        assertEquals(0, reader.getCurrentDefinitionLevel());
        reader.writeCurrentValueToConverter();
        reader.consume();
        consumed++;
    }
    assertEquals(rows, validating.count);
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) MemPageWriter(org.apache.parquet.column.page.mem.MemPageWriter) ColumnReader(org.apache.parquet.column.ColumnReader) MessageType(org.apache.parquet.schema.MessageType) MemPageReader(org.apache.parquet.column.page.mem.MemPageReader) Test(org.junit.Test)

Example 8 with DataPage

use of org.apache.parquet.column.page.DataPage in project parquet-mr by apache.

the class MemPageReader method readPage.

/**
 * Returns the next page from {@code pages}, logging it at debug level.
 *
 * @return the next data page
 * @throws ParquetDecodingException if {@code pages} has no further element
 */
@Override
public DataPage readPage() {
    // Guard first: asking for a page once the iterator is exhausted is an error.
    if (!pages.hasNext()) {
        throw new ParquetDecodingException("after last page");
    }
    DataPage page = pages.next();
    LOG.debug("read page {}", page);
    return page;
}
Also used : ParquetDecodingException(org.apache.parquet.io.ParquetDecodingException) DataPage(org.apache.parquet.column.page.DataPage)

Example 9 with DataPage

use of org.apache.parquet.column.page.DataPage in project parquet-mr by apache.

the class TestMemPageStore method test.

/**
 * Writes four pages of 209 values each to a {@link MemPageStore} column, then reads pages back
 * until the running value count reaches the reader's reported total, printing the total and
 * each page along the way. No assertions are made yet.
 */
@Test
public void test() throws IOException {
    MemPageStore store = new MemPageStore(10);
    ColumnDescriptor column = new ColumnDescriptor(path, PrimitiveTypeName.INT64, 2, 2);
    LongStatistics statistics = new LongStatistics();
    PageWriter writer = store.getPageWriter(column);

    // Four zero-filled payloads; only the byte length differs between pages.
    int[] payloadSizes = { 735, 743, 743, 735 };
    for (int payloadSize : payloadSizes) {
        writer.writePage(BytesInput.from(new byte[payloadSize]), 209, statistics, BIT_PACKED, BIT_PACKED, PLAIN);
    }

    PageReader reader = store.getPageReader(column);
    long expectedValues = reader.getTotalValueCount();
    System.out.println(expectedValues);

    int valuesSeen = 0;
    do {
        DataPage page = reader.readPage();
        valuesSeen += page.getValueCount();
        System.out.println(page);
    // TODO: assert
    } while (valuesSeen < expectedValues);
}
Also used : LongStatistics(org.apache.parquet.column.statistics.LongStatistics) DataPage(org.apache.parquet.column.page.DataPage) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PageReader(org.apache.parquet.column.page.PageReader) MemPageStore(org.apache.parquet.column.page.mem.MemPageStore) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)

Example 10 with DataPage

use of org.apache.parquet.column.page.DataPage in project parquet-mr by apache.

the class ColumnReaderImpl method readPage.

/**
 * Fetches the next page from {@code pageReader} and hands it to {@code readPageV1} or
 * {@code readPageV2}, selected through the page's visitor according to its concrete type.
 */
private void readPage() {
    LOG.debug("loading page");
    DataPage nextPage = pageReader.readPage();

    // The visitor only routes to the matching reader; it produces no result, hence Void/null.
    DataPage.Visitor<Void> versionDispatch = new DataPage.Visitor<Void>() {

        @Override
        public Void visit(DataPageV1 v1Page) {
            readPageV1(v1Page);
            return null;
        }

        @Override
        public Void visit(DataPageV2 v2Page) {
            readPageV2(v2Page);
            return null;
        }
    };
    nextPage.accept(versionDispatch);
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) DataPageV2(org.apache.parquet.column.page.DataPageV2) DataPageV1(org.apache.parquet.column.page.DataPageV1)

Aggregations

DataPage (org.apache.parquet.column.page.DataPage)11 DataPageV1 (org.apache.parquet.column.page.DataPageV1)6 DataPageV2 (org.apache.parquet.column.page.DataPageV2)5 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)4 PageReader (org.apache.parquet.column.page.PageReader)4 MessageType (org.apache.parquet.schema.MessageType)3 Test (org.junit.Test)3 ColumnReader (org.apache.parquet.column.ColumnReader)2 DictionaryPage (org.apache.parquet.column.page.DictionaryPage)2 MemPageReader (org.apache.parquet.column.page.mem.MemPageReader)2 MemPageWriter (org.apache.parquet.column.page.mem.MemPageWriter)2 List (java.util.List)1 ParquetDictionary (org.apache.flink.formats.parquet.vector.ParquetDictionary)1 WritableIntVector (org.apache.flink.table.data.columnar.vector.writable.WritableIntVector)1 Util.encodingAsString (org.apache.parquet.cli.Util.encodingAsString)1 Util.minMaxAsString (org.apache.parquet.cli.Util.minMaxAsString)1 PageReadStore (org.apache.parquet.column.page.PageReadStore)1 PageWriter (org.apache.parquet.column.page.PageWriter)1 MemPageStore (org.apache.parquet.column.page.mem.MemPageStore)1 LongStatistics (org.apache.parquet.column.statistics.LongStatistics)1