Use of org.apache.parquet.column.ColumnWriteStore in project parquet-mr by apache.
In the class TestMemColumn, the method testPageSize:
@Test
public void testPageSize() {
  MessageType schema = Types.buildMessage()
      .requiredList().requiredElement(BINARY).named("binary_col")
      .requiredList().requiredElement(INT32).named("int32_col")
      .named("msg");
  System.out.println(schema);
  MemPageStore memPageStore = new MemPageStore(123);

  // Using V2 pages so we have rowCount info
  ColumnWriteStore writeStore = new ColumnWriteStoreV2(schema, memPageStore, ParquetProperties.builder()
      .withPageSize(1024) // Less than 10 records for binary_col
      .withMinRowCountForPageSizeCheck(1) // Enforce having precise page sizing
      .withPageRowCountLimit(10)
      .withDictionaryEncoding(false) // Enforce having large binary_col pages
      .build());

  ColumnDescriptor binaryCol = schema.getColumnDescription(new String[] { "binary_col", "list", "element" });
  ColumnWriter binaryColWriter = writeStore.getColumnWriter(binaryCol);
  ColumnDescriptor int32Col = schema.getColumnDescription(new String[] { "int32_col", "list", "element" });
  ColumnWriter int32ColWriter = writeStore.getColumnWriter(int32Col);

  // Writing 123 records
  for (int i = 0; i < 123; ++i) {
    // Writing 10 values per record
    for (int j = 0; j < 10; ++j) {
      binaryColWriter.write(Binary.fromString("aaaaaaaaaaaa"), j == 0 ? 0 : 2, 2);
      int32ColWriter.write(42, j == 0 ? 0 : 2, 2);
    }
    writeStore.endRecord();
  }
  writeStore.flush();

  // Check that all the binary_col pages are <= 1024 bytes
  {
    PageReader binaryColPageReader = memPageStore.getPageReader(binaryCol);
    assertEquals(1230, binaryColPageReader.getTotalValueCount());
    int pageCnt = 0;
    int valueCnt = 0;
    while (valueCnt < binaryColPageReader.getTotalValueCount()) {
      DataPage page = binaryColPageReader.readPage();
      ++pageCnt;
      valueCnt += page.getValueCount();
      LOG.info("binary_col page-{}: {} bytes, {} rows", pageCnt, page.getCompressedSize(), page.getIndexRowCount().get());
      assertTrue("Compressed size should be less than 1024", page.getCompressedSize() <= 1024);
    }
  }

  // Check that all the int32_col pages contain <= 10 rows
  {
    PageReader int32ColPageReader = memPageStore.getPageReader(int32Col);
    assertEquals(1230, int32ColPageReader.getTotalValueCount());
    int pageCnt = 0;
    int valueCnt = 0;
    while (valueCnt < int32ColPageReader.getTotalValueCount()) {
      DataPage page = int32ColPageReader.readPage();
      ++pageCnt;
      valueCnt += page.getValueCount();
      LOG.info("int32_col page-{}: {} bytes, {} rows", pageCnt, page.getCompressedSize(), page.getIndexRowCount().get());
      assertTrue("Row count should be less than 10", page.getIndexRowCount().get() <= 10);
    }
  }
}
Use of org.apache.parquet.column.ColumnWriteStore in project parquet-mr by apache.
In the class ColumnMasker, the method nullifyColumn:
private void nullifyColumn(ColumnDescriptor descriptor, ColumnChunkMetaData chunk, ColumnReadStoreImpl crStore,
    ParquetFileWriter writer, MessageType schema) throws IOException {
  long totalChunkValues = chunk.getValueCount();
  int dMax = descriptor.getMaxDefinitionLevel();
  ColumnReader cReader = crStore.getColumnReader(descriptor);

  WriterVersion writerVersion = chunk.getEncodingStats().usesV2Pages() ? WriterVersion.PARQUET_2_0 : WriterVersion.PARQUET_1_0;
  ParquetProperties props = ParquetProperties.builder().withWriterVersion(writerVersion).build();
  CodecFactory codecFactory = new CodecFactory(new Configuration(), props.getPageSizeThreshold());
  CodecFactory.BytesCompressor compressor = codecFactory.getCompressor(chunk.getCodec());

  // Create new schema that only has the current column
  MessageType newSchema = newSchema(schema, descriptor);
  ColumnChunkPageWriteStore cPageStore = new ColumnChunkPageWriteStore(compressor, newSchema, props.getAllocator(),
      props.getColumnIndexTruncateLength());
  ColumnWriteStore cStore = props.newColumnWriteStore(newSchema, cPageStore);
  ColumnWriter cWriter = cStore.getColumnWriter(descriptor);

  for (int i = 0; i < totalChunkValues; i++) {
    int rlvl = cReader.getCurrentRepetitionLevel();
    int dlvl = cReader.getCurrentDefinitionLevel();
    if (dlvl == dMax) {
      // since we checked the column is either optional or repeated, dlvl should be > 0
      if (dlvl == 0) {
        throw new IOException("definition level is detected to be 0 for column "
            + chunk.getPath().toDotString() + " to be nullified");
      }
      // we just write one null for the whole list at the top level,
      // instead of nullifying the elements in the list one by one
      if (rlvl == 0) {
        cWriter.writeNull(rlvl, dlvl - 1);
      }
    } else {
      cWriter.writeNull(rlvl, dlvl);
    }
    cStore.endRecord();
  }

  cStore.flush();
  cPageStore.flushToFileWriter(writer);

  cStore.close();
  cWriter.close();
}
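The snippet above calls a newSchema(schema, descriptor) helper whose body is not shown here. A minimal sketch of what such a helper could look like, assuming it only needs to project the original message type down to the top-level field that contains the column being nullified (the actual helper in ColumnMasker may handle nested projections differently):

private MessageType newSchema(MessageType schema, ColumnDescriptor descriptor) {
  // The first element of the column path names the top-level field to keep
  String topLevelField = descriptor.getPath()[0];
  // Wrap that field's whole subtree in a new single-field message type
  return new MessageType(schema.getName(), schema.getType(topLevelField));
}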