Example 1 with PageWriter

Use of org.apache.parquet.column.page.PageWriter in project parquet-mr by apache.

From the class TestCorruptDeltaByteArrays, method testColumnReaderImplWithCorruptPage.

@Test
public void testColumnReaderImplWithCorruptPage() throws Exception {
    ColumnDescriptor column = new ColumnDescriptor(new String[] { "s" }, PrimitiveType.PrimitiveTypeName.BINARY, 0, 0);
    MemPageStore pages = new MemPageStore(0);
    PageWriter memWriter = pages.getPageWriter(column);
    ParquetProperties parquetProps = ParquetProperties.builder().withDictionaryEncoding(false).build();
    // get generic repetition and definition level bytes to use for pages
    ValuesWriter rdValues = parquetProps.newDefinitionLevelWriter(column);
    for (int i = 0; i < 10; i += 1) {
        rdValues.writeInteger(0);
    }
    // use a byte array backed BytesInput because it is reused
    BytesInput rd = BytesInput.from(rdValues.getBytes().toByteArray());
    DeltaByteArrayWriter writer = getDeltaByteArrayWriter();
    String lastValue = null;
    List<String> values = new ArrayList<String>();
    for (int i = 0; i < 10; i += 1) {
        lastValue = str(i);
        writer.writeBytes(Binary.fromString(lastValue));
        values.add(lastValue);
    }
    memWriter.writePage(BytesInput.concat(rd, rd, writer.getBytes()), 10, // number of values in the page
            new BinaryStatistics(), rdValues.getEncoding(), rdValues.getEncoding(), writer.getEncoding());
    pages.addRowCount(10);
    // sets previous to new byte[0]
    writer.reset();
    corruptWriter(writer, lastValue);
    for (int i = 10; i < 20; i += 1) {
        String value = str(i);
        writer.writeBytes(Binary.fromString(value));
        values.add(value);
    }
    memWriter.writePage(BytesInput.concat(rd, rd, writer.getBytes()), 10, // number of values in the page
            new BinaryStatistics(), rdValues.getEncoding(), rdValues.getEncoding(), writer.getEncoding());
    pages.addRowCount(10);
    final List<String> actualValues = new ArrayList<String>();
    PrimitiveConverter converter = new PrimitiveConverter() {

        @Override
        public void addBinary(Binary value) {
            actualValues.add(value.toStringUsingUTF8());
        }
    };
    ColumnReaderImpl columnReader = new ColumnReaderImpl(column, pages.getPageReader(column), converter, new ParsedVersion("parquet-mr", "1.6.0", "abcd"));
    while (actualValues.size() < columnReader.getTotalValueCount()) {
        columnReader.writeCurrentValueToConverter();
        columnReader.consume();
    }
    Assert.assertEquals(values, actualValues);
}
Also used: BytesInput (org.apache.parquet.bytes.BytesInput), DeltaByteArrayWriter (org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ArrayList (java.util.ArrayList), ParquetProperties (org.apache.parquet.column.ParquetProperties), BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics), PrimitiveConverter (org.apache.parquet.io.api.PrimitiveConverter), MemPageStore (org.apache.parquet.column.page.mem.MemPageStore), Binary (org.apache.parquet.io.api.Binary), ValuesWriter (org.apache.parquet.column.values.ValuesWriter), ParsedVersion (org.apache.parquet.VersionParser.ParsedVersion), PageWriter (org.apache.parquet.column.page.PageWriter), Test (org.junit.Test)
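
The test references three helpers defined elsewhere in TestCorruptDeltaByteArrays: getDeltaByteArrayWriter(), str(int), and corruptWriter(...). The sketches below are hedged reconstructions, not the project's exact code; the constructor arguments and the "previous" field name are assumptions. Note also that passing ParsedVersion("parquet-mr", "1.6.0", ...) matters: it marks the data as written by a release affected by the delta-byte-array corruption bug, so the reader knows to carry decoder state across pages, which is why the corrupted second page still decodes to the expected values.

// Hedged reconstructions of the helpers used above; the real test class may differ.
private DeltaByteArrayWriter getDeltaByteArrayWriter() {
    // initial slab size and page size are placeholder values
    return new DeltaByteArrayWriter(64 * 1024, 64 * 1024, new HeapByteBufferAllocator());
}

private String str(int i) {
    // produce values with a shared prefix so delta encoding has prefixes to reuse
    return "value_" + i;
}

private void corruptWriter(DeltaByteArrayWriter writer, String data) throws Exception {
    // simulate the corruption bug: after reset(), restore the stale "previous"
    // value via reflection so the next page's first delta is computed against
    // data from the prior page ("previous" is an assumed private field name)
    java.lang.reflect.Field previous = DeltaByteArrayWriter.class.getDeclaredField("previous");
    previous.setAccessible(true);
    previous.set(writer, Binary.fromString(data).getBytesUnsafe());
}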

Example 2 with PageWriter

Use of org.apache.parquet.column.page.PageWriter in project parquet-mr by apache.

From the class TestColumnChunkPageWriteStore, method testColumnOrderV1.

@Test
public void testColumnOrderV1() throws IOException {
    ParquetFileWriter mockFileWriter = Mockito.mock(ParquetFileWriter.class);
    InOrder inOrder = inOrder(mockFileWriter);
    MessageType schema = Types.buildMessage()
            .required(BINARY).as(UTF8).named("a_string")
            .required(INT32).named("an_int")
            .required(INT64).named("a_long")
            .required(FLOAT).named("a_float")
            .required(DOUBLE).named("a_double")
            .named("order_test");
    BytesInput fakeData = BytesInput.fromInt(34);
    int fakeCount = 3;
    BinaryStatistics fakeStats = new BinaryStatistics();
    // TODO - look back at this, an allocator was being passed here in the ByteBuffer changes
    // see comment at this constructor
    ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(UNCOMPRESSED), schema, new HeapByteBufferAllocator());
    for (ColumnDescriptor col : schema.getColumns()) {
        PageWriter pageWriter = store.getPageWriter(col);
        pageWriter.writePage(fakeData, fakeCount, fakeStats, RLE, RLE, PLAIN);
    }
    // flush to the mock writer
    store.flushToFileWriter(mockFileWriter);
    for (ColumnDescriptor col : schema.getColumns()) {
        inOrder.verify(mockFileWriter).startColumn(eq(col), eq((long) fakeCount), eq(UNCOMPRESSED));
    }
}
Also used: InOrder (org.mockito.InOrder), BytesInput (org.apache.parquet.bytes.BytesInput), HeapByteBufferAllocator (org.apache.parquet.bytes.HeapByteBufferAllocator), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics), MessageType (org.apache.parquet.schema.MessageType), PageWriter (org.apache.parquet.column.page.PageWriter), Test (org.junit.Test)
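
The compressor(...) helper is not shown in the snippet. A plausible reconstruction follows; CodecFactory's API has changed across parquet-mr releases, so the exact constructor and method signatures here are assumptions to verify against your version:

// Assumed helper; older parquet-mr releases exposed
// CodecFactory.getCompressor(codec, pageSize). Verify against your version.
private CodecFactory.BytesCompressor compressor(CompressionCodecName codec) {
    Configuration conf = new Configuration(); // assumption: the test may reuse a shared conf field
    int pageSize = 64 * 1024;                 // placeholder page size
    return new CodecFactory(conf).getCompressor(codec, pageSize);
}

The InOrder verification then pins down the contract under test: flushToFileWriter must call startColumn once per column, in schema order.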

Example 3 with PageWriter

Use of org.apache.parquet.column.page.PageWriter in project parquet-mr by apache.

From the class TestColumnChunkPageWriteStore, method test.

@Test
public void test() throws Exception {
    Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
    Path root = file.getParent();
    FileSystem fs = file.getFileSystem(conf);
    if (fs.exists(root)) {
        fs.delete(root, true);
    }
    fs.mkdirs(root);
    MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
    ColumnDescriptor col = schema.getColumns().get(0);
    Encoding dataEncoding = PLAIN;
    int valueCount = 10;
    int d = 1;
    int r = 2;
    int v = 3;
    BytesInput definitionLevels = BytesInput.fromInt(d);
    BytesInput repetitionLevels = BytesInput.fromInt(r);
    Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
    BytesInput data = BytesInput.fromInt(v);
    int rowCount = 5;
    int nullCount = 1;
    {
        ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
        writer.start();
        writer.startBlock(rowCount);
        {
            ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator());
            PageWriter pageWriter = store.getPageWriter(col);
            pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics);
            store.flushToFileWriter(writer);
        }
        writer.endBlock();
        writer.end(new HashMap<String, String>());
    }
    {
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
        ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
        PageReadStore rowGroup = reader.readNextRowGroup();
        PageReader pageReader = rowGroup.getPageReader(col);
        DataPageV2 page = (DataPageV2) pageReader.readPage();
        assertEquals(rowCount, page.getRowCount());
        assertEquals(nullCount, page.getNullCount());
        assertEquals(valueCount, page.getValueCount());
        assertEquals(d, intValue(page.getDefinitionLevels()));
        assertEquals(r, intValue(page.getRepetitionLevels()));
        assertEquals(dataEncoding, page.getDataEncoding());
        assertEquals(v, intValue(page.getData()));
        assertEquals(statistics.toString(), page.getStatistics().toString());
        reader.close();
    }
}
Also used: Path (org.apache.hadoop.fs.Path), BytesInput (org.apache.parquet.bytes.BytesInput), HashMap (java.util.HashMap), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), PageReader (org.apache.parquet.column.page.PageReader), Encoding (org.apache.parquet.column.Encoding), DataPageV2 (org.apache.parquet.column.page.DataPageV2), HeapByteBufferAllocator (org.apache.parquet.bytes.HeapByteBufferAllocator), PageReadStore (org.apache.parquet.column.page.PageReadStore), FileSystem (org.apache.hadoop.fs.FileSystem), MessageType (org.apache.parquet.schema.MessageType), PageWriter (org.apache.parquet.column.page.PageWriter), Test (org.junit.Test)
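
Besides compressor(...) (see Example 2), this test uses an intValue(...) helper to read back the ints written via BytesInput.fromInt. A plausible sketch, assuming fromInt serializes a 4-byte little-endian int (BytesUtils here is org.apache.parquet.bytes.BytesUtils):

// Hedged sketch: decode the int that BytesInput.fromInt wrote
private int intValue(BytesInput in) throws IOException {
    byte[] bytes = in.toByteArray();
    return BytesUtils.readIntLittleEndian(new ByteArrayInputStream(bytes));
}

Because the page is written through compressor(GZIP), the assertions also verify that compression and decompression round-trip the levels, data, encoding, and statistics intact.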

Example 4 with PageWriter

Use of org.apache.parquet.column.page.PageWriter in project parquet-mr by apache.

From the class TestMemPageStore, method test.

@Test
public void test() throws IOException {
    MemPageStore memPageStore = new MemPageStore(10);
    ColumnDescriptor col = new ColumnDescriptor(path, PrimitiveTypeName.INT64, 2, 2);
    LongStatistics stats = new LongStatistics();
    PageWriter pageWriter = memPageStore.getPageWriter(col);
    pageWriter.writePage(BytesInput.from(new byte[735]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN);
    pageWriter.writePage(BytesInput.from(new byte[743]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN);
    pageWriter.writePage(BytesInput.from(new byte[743]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN);
    pageWriter.writePage(BytesInput.from(new byte[735]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN);
    PageReader pageReader = memPageStore.getPageReader(col);
    long totalValueCount = pageReader.getTotalValueCount();
    System.out.println(totalValueCount);
    int total = 0;
    do {
        DataPage readPage = pageReader.readPage();
        total += readPage.getValueCount();
        System.out.println(readPage);
        // TODO: assert
    } while (total < totalValueCount);
}
Also used: LongStatistics (org.apache.parquet.column.statistics.LongStatistics), DataPage (org.apache.parquet.column.page.DataPage), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), PageReader (org.apache.parquet.column.page.PageReader), MemPageStore (org.apache.parquet.column.page.mem.MemPageStore), PageWriter (org.apache.parquet.column.page.PageWriter), Test (org.junit.Test)
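
Note two gaps in this snippet: path is a field defined elsewhere on the test class (presumably a String[] of path components), and the loop prints each page instead of asserting, as the TODO admits. One way to tighten the loop, given the four pages of 209 values written above (a sketch, not the project's fix):

    // Sketch: replace the printlns with assertions
    int total = 0;
    do {
        DataPage readPage = pageReader.readPage();
        assertEquals(209, readPage.getValueCount()); // every page above was written with 209 values
        total += readPage.getValueCount();
    } while (total < totalValueCount);
    assertEquals(4 * 209, total); // all four pages were read back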

Aggregations

ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 4 uses
PageWriter (org.apache.parquet.column.page.PageWriter): 4 uses
Test (org.junit.Test): 4 uses
BytesInput (org.apache.parquet.bytes.BytesInput): 3 uses
HeapByteBufferAllocator (org.apache.parquet.bytes.HeapByteBufferAllocator): 2 uses
PageReader (org.apache.parquet.column.page.PageReader): 2 uses
MemPageStore (org.apache.parquet.column.page.mem.MemPageStore): 2 uses
BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics): 2 uses
MessageType (org.apache.parquet.schema.MessageType): 2 uses
ArrayList (java.util.ArrayList): 1 use
HashMap (java.util.HashMap): 1 use
FileSystem (org.apache.hadoop.fs.FileSystem): 1 use
Path (org.apache.hadoop.fs.Path): 1 use
ParsedVersion (org.apache.parquet.VersionParser.ParsedVersion): 1 use
Encoding (org.apache.parquet.column.Encoding): 1 use
ParquetProperties (org.apache.parquet.column.ParquetProperties): 1 use
DataPage (org.apache.parquet.column.page.DataPage): 1 use
DataPageV2 (org.apache.parquet.column.page.DataPageV2): 1 use
PageReadStore (org.apache.parquet.column.page.PageReadStore): 1 use
LongStatistics (org.apache.parquet.column.statistics.LongStatistics): 1 use
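
Taken together, the four tests exercise the two write paths on PageWriter: writePage for v1 data pages (Examples 1, 2, and 4) and writePageV2 for v2 pages (Example 3). The interface sketch below is inferred from those calls, not copied from the parquet-mr source, so treat it as an approximation:

// Inferred from the calls above; the real org.apache.parquet.column.page.PageWriter
// declares more (e.g. memory accounting), and signatures may differ by version.
public interface PageWriter {
    // v1 page: one BytesInput carrying repetition levels, definition levels,
    // and encoded values back to back
    void writePage(BytesInput bytesInput, int valueCount, Statistics<?> statistics,
            Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding) throws IOException;

    // v2 page: levels and data passed separately, plus row and null counts
    void writePageV2(int rowCount, int nullCount, int valueCount,
            BytesInput repetitionLevels, BytesInput definitionLevels,
            Encoding dataEncoding, BytesInput data, Statistics<?> statistics) throws IOException;
}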