Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in project parquet-mr by apache.
From the class ColumnEncryptor, method processBlocks.
private void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta,
                           MessageType schema, List<String> encryptPaths) throws IOException {
  Set<ColumnPath> encryptColumnsPath = convertToColumnPaths(encryptPaths);
  int blockId = 0;
  PageReadStore store = reader.readNextRowGroup();
  // Copy the file row group by row group, rewriting each column chunk and
  // encrypting the chunks whose paths were requested.
  while (store != null) {
    writer.startBlock(store.getRowCount());
    List<ColumnChunkMetaData> columnsInOrder = meta.getBlocks().get(blockId).getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream()
        .collect(Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);
      // Later we can add a feature to trans-encrypt it with different keys
      if (chunk.isEncrypted()) {
        throw new IOException("Column " + chunk.getPath().toDotString() + " is already encrypted");
      }
      ColumnDescriptor descriptor = descriptorsMap.get(chunk.getPath());
      processChunk(descriptor, chunk, reader, writer, encryptColumnsPath, blockId, i,
          meta.getFileMetaData().getCreatedBy());
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockId++;
  }
}
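The convertToColumnPaths helper used above turns the caller-supplied dot-separated column names into ColumnPath objects so they can be matched against the chunk metadata. A minimal sketch of such a helper, assuming dot-separated input names (the real implementation in parquet-mr may differ slightly):

  // Uses org.apache.parquet.hadoop.metadata.ColumnPath and java.util.stream.Collectors.
  public static Set<ColumnPath> convertToColumnPaths(List<String> cols) {
    // An entry such as "Links.Backward" becomes a ColumnPath of its dot-separated parts.
    return cols.stream()
        .map(ColumnPath::fromDotString)
        .collect(Collectors.toSet());
  }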
Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in project parquet-mr by apache.
From the class ColumnMasker, method processBlocks.
/**
 * @param reader Reader of the source file
 * @param writer Writer of the destination file
 * @param meta Metadata of the source file
 * @param schema Schema of the source file
 * @param paths Column paths that need to be masked
 * @param maskMode Mask mode to apply
 * @throws IOException if reading or writing a row group fails
 */
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta,
                          MessageType schema, List<String> paths, MaskMode maskMode) throws IOException {
  Set<ColumnPath> nullifyColumns = convertToColumnPaths(paths);
  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();
  // Walk the row groups of the source file and rewrite each column chunk,
  // masking the chunks whose paths were requested.
  while (store != null) {
    writer.startBlock(store.getRowCount());
    List<ColumnChunkMetaData> columnsInOrder = meta.getBlocks().get(blockIndex).getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream()
        .collect(Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
    ColumnReadStoreImpl crStore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema,
        meta.getFileMetaData().getCreatedBy());
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);
      ColumnDescriptor descriptor = descriptorsMap.get(chunk.getPath());
      processChunk(descriptor, chunk, crStore, reader, writer, schema, nullifyColumns, maskMode);
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}
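The ColumnReadStoreImpl above is built with a DummyGroupConverter, a no-op converter that lets the column values be replayed without assembling any records. A rough sketch of what such a converter could look like, assuming the standard GroupConverter/PrimitiveConverter API from org.apache.parquet.io.api (names and structure are illustrative, not the exact parquet-mr source):

  private static final class DummyGroupConverter extends GroupConverter {
    @Override public void start() { }
    @Override public void end() { }
    @Override public Converter getConverter(int fieldIndex) { return new DummyConverter(); }
  }

  private static final class DummyConverter extends PrimitiveConverter {
    // Nested groups are handled by handing back another no-op group converter.
    @Override public GroupConverter asGroupConverter() { return new DummyGroupConverter(); }
  }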
Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in project parquet-mr by apache.
From the class ColumnMaskerTest, method nullifyColumns.
private void nullifyColumns(Configuration conf, String inputFile, String outputFile) throws IOException {
  Path inPath = new Path(inputFile);
  Path outPath = new Path(outputFile);
  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.OVERWRITE);
  writer.start();
  List<String> paths = new ArrayList<>();
  paths.add("DocId");
  paths.add("Gender");
  paths.add("Links.Backward");
  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    columnMasker.processBlocks(reader, writer, metaData, schema, paths, ColumnMasker.MaskMode.NULLIFY);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
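A hypothetical invocation of the helper above (the file paths are placeholders); the rewritten file keeps the original schema and row count, with only DocId, Gender, and Links.Backward nullified:

  Configuration conf = new Configuration();
  nullifyColumns(conf, "/tmp/input.parquet", "/tmp/nullified.parquet");
  // The output footer can be read back the same way as the input footer.
  ParquetMetadata masked = ParquetFileReader.readFooter(conf, new Path("/tmp/nullified.parquet"), NO_FILTER);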
Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in project parquet-mr by apache.
From the class ColumnMaskingCommand, method run.
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(mode != null && (mode.equals("nullify")),
      "mask mode cannot be null and can only be nullify");
  Preconditions.checkArgument(input != null && output != null,
      "Both input and output parquet file paths are required.");
  Preconditions.checkArgument(cols != null && cols.size() > 0,
      "columns cannot be null or empty");
  MaskMode maskMode = MaskMode.fromString(mode);
  Path inPath = new Path(input);
  Path outPath = new Path(output);
  ParquetMetadata metaData = ParquetFileReader.readFooter(getConf(), inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(getConf(), schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();
  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, getConf()), HadoopReadOptions.builder(getConf()).build())) {
    masker.processBlocks(reader, writer, metaData, schema, cols, maskMode);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
  return 0;
}
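MaskMode.fromString converts the user-supplied mode string into the ColumnMasker.MaskMode enum. A sketch of how such a lookup could be written (assumed for illustration; the real enum is defined inside ColumnMasker and may differ):

  public enum MaskMode {
    NULLIFY("nullify"), HASH("hash"), REDACT("redact");

    private final String mode;

    MaskMode(String mode) {
      this.mode = mode;
    }

    public static MaskMode fromString(String mode) {
      // Case-insensitive match against the textual name of each mode.
      for (MaskMode value : MaskMode.values()) {
        if (value.mode.equalsIgnoreCase(mode)) {
          return value;
        }
      }
      return null;
    }
  }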
Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in project parquet-mr by apache.
From the class CompressionConveterTest, method convertCompression.
private void convertCompression(Configuration conf, String inputFile, String outputFile, String codec) throws IOException {
  Path inPath = new Path(inputFile);
  Path outPath = new Path(outputFile);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);
  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();
  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
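A hypothetical call to the helper above (the paths are placeholders); the codec string must name a CompressionCodecName constant such as GZIP, SNAPPY, or ZSTD:

  Configuration conf = new Configuration();
  // Rewrite every column chunk of the input with GZIP compression.
  convertCompression(conf, "/tmp/input.parquet", "/tmp/output-gzip.parquet", "GZIP");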