Use of org.apache.parquet.hadoop.CodecFactory in project parquet-mr by Apache.
The example below is the nullifyColumn method of the ColumnMasker class. It rewrites one column chunk so that every value becomes null, obtaining a compressor from CodecFactory that matches the chunk's original codec.
private void nullifyColumn(ColumnDescriptor descriptor, ColumnChunkMetaData chunk,
    ColumnReadStoreImpl crStore, ParquetFileWriter writer, MessageType schema) throws IOException {
  long totalChunkValues = chunk.getValueCount();
  int dMax = descriptor.getMaxDefinitionLevel();
  ColumnReader cReader = crStore.getColumnReader(descriptor);

  // Match the writer version of the original pages so the rewritten chunk stays compatible.
  WriterVersion writerVersion = chunk.getEncodingStats().usesV2Pages()
      ? WriterVersion.PARQUET_2_0 : WriterVersion.PARQUET_1_0;
  ParquetProperties props = ParquetProperties.builder()
      .withWriterVersion(writerVersion)
      .build();
  CodecFactory codecFactory = new CodecFactory(new Configuration(), props.getPageSizeThreshold());
  CodecFactory.BytesCompressor compressor = codecFactory.getCompressor(chunk.getCodec());

  // Create a new schema that only has the current column
  MessageType newSchema = newSchema(schema, descriptor);
  ColumnChunkPageWriteStore cPageStore = new ColumnChunkPageWriteStore(
      compressor, newSchema, props.getAllocator(), props.getColumnIndexTruncateLength());
  ColumnWriteStore cStore = props.newColumnWriteStore(newSchema, cPageStore);
  ColumnWriter cWriter = cStore.getColumnWriter(descriptor);

  for (int i = 0; i < totalChunkValues; i++) {
    int rlvl = cReader.getCurrentRepetitionLevel();
    int dlvl = cReader.getCurrentDefinitionLevel();
    if (dlvl == dMax) {
      // Since we checked that the column is either optional or repeated, dlvl should be > 0.
      if (dlvl == 0) {
        throw new IOException("definition level is detected to be 0 for column "
            + chunk.getPath().toDotString() + " to be nullified");
      }
      // Write one null for the whole list at the top level, instead of
      // nullifying the elements of the list one by one.
      if (rlvl == 0) {
        cWriter.writeNull(rlvl, dlvl - 1);
      }
    } else {
      cWriter.writeNull(rlvl, dlvl);
    }
    cStore.endRecord();
  }

  cStore.flush();
  cPageStore.flushToFileWriter(writer);

  cStore.close();
  cWriter.close();
}
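
The newSchema(schema, descriptor) helper called above belongs to ColumnMasker and is not shown on this page. As a rough illustration only, a minimal sketch of such a projection for top-level columns might look like the following (projectSingleColumn is a hypothetical name; a real helper would also have to rebuild the enclosing group types for nested column paths):

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;

class SchemaProjection {
  // Hypothetical stand-in for ColumnMasker.newSchema, top-level columns only:
  // resolve the column's Type in the full file schema and wrap it in a
  // single-field MessageType that keeps the original schema name.
  static MessageType projectSingleColumn(MessageType schema, ColumnDescriptor descriptor) {
    String[] path = descriptor.getPath(); // e.g. {"ssn"} for a top-level field
    Type column = schema.getType(path);   // throws if the path is not in the schema
    return new MessageType(schema.getName(), column);
  }
}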
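CodecFactory itself can also be exercised outside of a file rewrite. The following is a minimal, self-contained sketch (not from the project) that compresses and decompresses a buffer through the same factory API used above. It assumes the Snappy codec's dependencies are on the classpath, and the 1 MiB page size threshold is an arbitrary choice:

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.hadoop.CodecFactory;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecFactoryRoundTrip {
  public static void main(String[] args) throws Exception {
    // The page size threshold sizes the factory's internal buffers; 1 MiB is arbitrary.
    CodecFactory codecFactory = new CodecFactory(new Configuration(), 1024 * 1024);
    try {
      CodecFactory.BytesCompressor compressor =
          codecFactory.getCompressor(CompressionCodecName.SNAPPY);
      byte[] raw = new byte[8192]; // all zeros, so highly compressible
      byte[] compressed = compressor.compress(BytesInput.from(raw)).toByteArray();
      System.out.println("raw=" + raw.length + " compressed=" + compressed.length);

      CodecFactory.BytesDecompressor decompressor =
          codecFactory.getDecompressor(CompressionCodecName.SNAPPY);
      byte[] restored =
          decompressor.decompress(BytesInput.from(compressed), raw.length).toByteArray();
      System.out.println("round-trip ok=" + (restored.length == raw.length));
    } finally {
      codecFactory.release(); // return pooled (possibly native) codec resources
    }
  }
}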