Search in sources:

Example 1 with ColumnWriteStore

Use of org.apache.parquet.column.ColumnWriteStore in project parquet-mr by apache.

From the class TestMemColumn, method testPageSize. The test writes two nested list columns through a ColumnWriteStoreV2 and checks that the configured page size (1024 bytes) and page row count limit (10 rows) are honored.

@Test
public void testPageSize() {
    MessageType schema = Types.buildMessage()
            .requiredList().requiredElement(BINARY).named("binary_col")
            .requiredList().requiredElement(INT32).named("int32_col")
            .named("msg");
    System.out.println(schema);
    MemPageStore memPageStore = new MemPageStore(123);
    // Using V2 pages so we have rowCount info
    ColumnWriteStore writeStore = new ColumnWriteStoreV2(schema, memPageStore, ParquetProperties.builder()
            .withPageSize(1024) // Less than 10 records for binary_col
            .withMinRowCountForPageSizeCheck(1) // Enforce having precise page sizing
            .withPageRowCountLimit(10)
            .withDictionaryEncoding(false) // Enforce having large binary_col pages
            .build());
    ColumnDescriptor binaryCol = schema.getColumnDescription(new String[] { "binary_col", "list", "element" });
    ColumnWriter binaryColWriter = writeStore.getColumnWriter(binaryCol);
    ColumnDescriptor int32Col = schema.getColumnDescription(new String[] { "int32_col", "list", "element" });
    ColumnWriter int32ColWriter = writeStore.getColumnWriter(int32Col);
    // Writing 123 records
    for (int i = 0; i < 123; ++i) {
        // Writing 10 values per record
        for (int j = 0; j < 10; ++j) {
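            // ColumnWriter.write(value, repetitionLevel, definitionLevel): the first
            // value of each record opens a new row (repetition level 0); the remaining
            // values continue the innermost list of the current row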
            binaryColWriter.write(Binary.fromString("aaaaaaaaaaaa"), j == 0 ? 0 : 2, 2);
            int32ColWriter.write(42, j == 0 ? 0 : 2, 2);
        }
        writeStore.endRecord();
    }
    writeStore.flush();
    // Check that all the binary_col pages are <= 1024 bytes
    {
        PageReader binaryColPageReader = memPageStore.getPageReader(binaryCol);
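        // 123 records * 10 values per record = 1230 values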
        assertEquals(1230, binaryColPageReader.getTotalValueCount());
        int pageCnt = 0;
        int valueCnt = 0;
        while (valueCnt < binaryColPageReader.getTotalValueCount()) {
            DataPage page = binaryColPageReader.readPage();
            ++pageCnt;
            valueCnt += page.getValueCount();
            LOG.info("binary_col page-{}: {} bytes, {} rows", pageCnt, page.getCompressedSize(), page.getIndexRowCount().get());
            assertTrue("Compressed size should be less than 1024", page.getCompressedSize() <= 1024);
        }
    }
    // Check that all the int32_col pages contain <= 10 rows
    {
        PageReader int32ColPageReader = memPageStore.getPageReader(int32Col);
        assertEquals(1230, int32ColPageReader.getTotalValueCount());
        int pageCnt = 0;
        int valueCnt = 0;
        while (valueCnt < int32ColPageReader.getTotalValueCount()) {
            DataPage page = int32ColPageReader.readPage();
            ++pageCnt;
            valueCnt += page.getValueCount();
            LOG.info("int32_col page-{}: {} bytes, {} rows", pageCnt, page.getCompressedSize(), page.getIndexRowCount().get());
            assertTrue("Row count should be less than 10", page.getIndexRowCount().get() <= 10);
        }
    }
}
Also used: ColumnWriteStore(org.apache.parquet.column.ColumnWriteStore) DataPage(org.apache.parquet.column.page.DataPage) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PageReader(org.apache.parquet.column.page.PageReader) MemPageStore(org.apache.parquet.column.page.mem.MemPageStore) ColumnWriter(org.apache.parquet.column.ColumnWriter) MessageType(org.apache.parquet.schema.MessageType) ColumnWriteStoreV2(org.apache.parquet.column.impl.ColumnWriteStoreV2) Test(org.junit.Test)
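
For orientation, here is a minimal sketch of the same write/endRecord/flush cycle for a single required INT32 column, using only the parquet-mr column APIs already shown above (the schema and row count are illustrative):

MessageType schema = Types.buildMessage().required(INT32).named("id").named("msg");
MemPageStore pageStore = new MemPageStore(10);
ColumnWriteStore store = new ColumnWriteStoreV2(schema, pageStore, ParquetProperties.builder().build());
ColumnWriter writer = store.getColumnWriter(schema.getColumns().get(0));
// Writing 10 single-value records
for (int i = 0; i < 10; ++i) {
    // A required top-level field always has repetition level 0 and definition level 0
    writer.write(i, 0, 0);
    store.endRecord();
}
// Force any remaining buffered values into pages held by pageStore
store.flush();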

Example 2 with ColumnWriteStore

Use of org.apache.parquet.column.ColumnWriteStore in project parquet-mr by apache.

From the class ColumnMasker, method nullifyColumn. This helper rewrites a column chunk so that every value becomes null while preserving the record structure; for list columns it writes a single null for the whole list rather than one per element.

private void nullifyColumn(ColumnDescriptor descriptor, ColumnChunkMetaData chunk, ColumnReadStoreImpl crStore, ParquetFileWriter writer, MessageType schema) throws IOException {
    long totalChunkValues = chunk.getValueCount();
    int dMax = descriptor.getMaxDefinitionLevel();
    ColumnReader cReader = crStore.getColumnReader(descriptor);
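    // Keep the original chunk's page version (V1 vs V2) so the rewritten pages match the file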
    WriterVersion writerVersion = chunk.getEncodingStats().usesV2Pages() ? WriterVersion.PARQUET_2_0 : WriterVersion.PARQUET_1_0;
    ParquetProperties props = ParquetProperties.builder().withWriterVersion(writerVersion).build();
    CodecFactory codecFactory = new CodecFactory(new Configuration(), props.getPageSizeThreshold());
    CodecFactory.BytesCompressor compressor = codecFactory.getCompressor(chunk.getCodec());
    // Create new schema that only has the current column
    MessageType newSchema = newSchema(schema, descriptor);
    ColumnChunkPageWriteStore cPageStore = new ColumnChunkPageWriteStore(compressor, newSchema, props.getAllocator(), props.getColumnIndexTruncateLength());
    ColumnWriteStore cStore = props.newColumnWriteStore(newSchema, cPageStore);
    ColumnWriter cWriter = cStore.getColumnWriter(descriptor);
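    // Replay the original column's repetition/definition levels, emitting a null for each value instead of copying it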
    for (int i = 0; i < totalChunkValues; i++) {
        int rlvl = cReader.getCurrentRepetitionLevel();
        int dlvl = cReader.getCurrentDefinitionLevel();
        if (dlvl == dMax) {
            // since the column was checked to be either optional or repeated, dlvl should be > 0
            if (dlvl == 0) {
                throw new IOException("definition level is detected to be 0 for column " + chunk.getPath().toDotString() + " to be nullified");
            }
            // we just write one null for the whole list at the top level, instead of nullifying the elements in the list one by one
            if (rlvl == 0) {
                cWriter.writeNull(rlvl, dlvl - 1);
            }
        } else {
            cWriter.writeNull(rlvl, dlvl);
        }
        cStore.endRecord();
    }
    cStore.flush();
    cPageStore.flushToFileWriter(writer);
    cStore.close();
    cWriter.close();
}
Also used: Configuration(org.apache.hadoop.conf.Configuration) ColumnChunkPageWriteStore(org.apache.parquet.hadoop.ColumnChunkPageWriteStore) ParquetProperties(org.apache.parquet.column.ParquetProperties) IOException(java.io.IOException) ColumnWriter(org.apache.parquet.column.ColumnWriter) WriterVersion(org.apache.parquet.column.ParquetProperties.WriterVersion) ColumnWriteStore(org.apache.parquet.column.ColumnWriteStore) ColumnReader(org.apache.parquet.column.ColumnReader) CodecFactory(org.apache.parquet.hadoop.CodecFactory) MessageType(org.apache.parquet.schema.MessageType)
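
The private newSchema helper is not included in this snippet. As a rough illustration only, a hypothetical simplification for a top-level column could project the schema like this (the real helper in ColumnMasker also handles columns nested inside group types):

// Hypothetical sketch; not the actual ColumnMasker.newSchema
private static MessageType projectToColumn(MessageType schema, ColumnDescriptor descriptor) {
    // For a top-level column, the first path element is the field name
    Type field = schema.getType(descriptor.getPath()[0]);
    return new MessageType(schema.getName(), field);
}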

Aggregations

ColumnWriteStore (org.apache.parquet.column.ColumnWriteStore): 2
ColumnWriter (org.apache.parquet.column.ColumnWriter): 2
MessageType (org.apache.parquet.schema.MessageType): 2
IOException (java.io.IOException): 1
Configuration (org.apache.hadoop.conf.Configuration): 1
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 1
ColumnReader (org.apache.parquet.column.ColumnReader): 1
ParquetProperties (org.apache.parquet.column.ParquetProperties): 1
WriterVersion (org.apache.parquet.column.ParquetProperties.WriterVersion): 1
ColumnWriteStoreV2 (org.apache.parquet.column.impl.ColumnWriteStoreV2): 1
DataPage (org.apache.parquet.column.page.DataPage): 1
PageReader (org.apache.parquet.column.page.PageReader): 1
MemPageStore (org.apache.parquet.column.page.mem.MemPageStore): 1
CodecFactory (org.apache.parquet.hadoop.CodecFactory): 1
ColumnChunkPageWriteStore (org.apache.parquet.hadoop.ColumnChunkPageWriteStore): 1
Test (org.junit.Test): 1