
Example 1 with TransParquetFileReader

Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in the Apache parquet-mr project.

From the processBlocks method of the ColumnEncryptor class:

private void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, MessageType schema, List<String> encryptPaths) throws IOException {
    Set<ColumnPath> encryptColumnsPath = convertToColumnPaths(encryptPaths);
    int blockId = 0;
    PageReadStore store = reader.readNextRowGroup();
    while (store != null) {
        writer.startBlock(store.getRowCount());
        List<ColumnChunkMetaData> columnsInOrder = meta.getBlocks().get(blockId).getColumns();
        Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
        for (int i = 0; i < columnsInOrder.size(); i += 1) {
            ColumnChunkMetaData chunk = columnsInOrder.get(i);
            // Later we can add a feature to trans-encrypt it with different keys
            if (chunk.isEncrypted()) {
                throw new IOException("Column " + chunk.getPath().toDotString() + " is already encrypted");
            }
            ColumnDescriptor descriptor = descriptorsMap.get(chunk.getPath());
            processChunk(descriptor, chunk, reader, writer, encryptColumnsPath, blockId, i, meta.getFileMetaData().getCreatedBy());
        }
        writer.endBlock();
        store = reader.readNextRowGroup();
        blockId++;
    }
}
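For orientation, the plumbing around this private processBlocks mirrors the driver code in Examples 3-5 below: read the source footer, start a ParquetFileWriter, open a TransParquetFileReader, and close both out in a finally block. The sketch below assumes it lives inside ColumnEncryptor where processBlocks is visible; the helper name encryptColumnsInFile is hypothetical, and the encryption wiring (FileEncryptionProperties / InternalFileEncryptor) that the real class performs is omitted.

// Hypothetical driver inside ColumnEncryptor; the real class additionally wires the
// file encryptor into the writer, which this sketch leaves out.
private void encryptColumnsInFile(Configuration conf, String inputFile, String outputFile, List<String> encryptPaths) throws IOException {
    Path inPath = new Path(inputFile);
    Path outPath = new Path(outputFile);
    // Read the source footer to get the schema and per-block column metadata.
    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
    MessageType schema = metaData.getFileMetaData().getSchema();
    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.OVERWRITE);
    writer.start();
    try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
        processBlocks(reader, writer, metaData, schema, encryptPaths);
    } finally {
        writer.end(metaData.getFileMetaData().getKeyValueMetaData());
    }
}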

Example 2 with TransParquetFileReader

Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in the Apache parquet-mr project.

From the processBlocks method of the ColumnMasker class:

/**
 * @param reader Reader for the source file
 * @param writer Writer for the destination file
 * @param meta Metadata of the source file
 * @param schema Schema of the source file
 * @param paths Column paths that need to be masked
 * @param maskMode Masking mode to apply
 * @throws IOException if a row group cannot be read or written
 */
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, MessageType schema, List<String> paths, MaskMode maskMode) throws IOException {
    Set<ColumnPath> nullifyColumns = convertToColumnPaths(paths);
    int blockIndex = 0;
    PageReadStore store = reader.readNextRowGroup();
    while (store != null) {
        writer.startBlock(store.getRowCount());
        List<ColumnChunkMetaData> columnsInOrder = meta.getBlocks().get(blockIndex).getColumns();
        Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
        ColumnReadStoreImpl crStore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, meta.getFileMetaData().getCreatedBy());
        for (int i = 0; i < columnsInOrder.size(); i += 1) {
            ColumnChunkMetaData chunk = columnsInOrder.get(i);
            ColumnDescriptor descriptor = descriptorsMap.get(chunk.getPath());
            processChunk(descriptor, chunk, crStore, reader, writer, schema, nullifyColumns, maskMode);
        }
        writer.endBlock();
        store = reader.readNextRowGroup();
        blockIndex++;
    }
}
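The DummyGroupConverter handed to ColumnReadStoreImpl above is not part of this snippet. ColumnReadStoreImpl only needs a converter to drive its column readers, and the masker never materializes records, so a no-op implementation along the following lines is enough; this is a sketch of what such a converter looks like, not necessarily the project's exact code.

// No-op converters: every callback can do nothing because no records are assembled.
private static final class DummyGroupConverter extends GroupConverter {
    @Override
    public void start() {}
    @Override
    public void end() {}
    @Override
    public Converter getConverter(int fieldIndex) {
        return new DummyConverter();
    }
}

private static final class DummyConverter extends PrimitiveConverter {
    // Nested groups reuse the same no-op group converter.
    @Override
    public GroupConverter asGroupConverter() {
        return new DummyGroupConverter();
    }
}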

Example 3 with TransParquetFileReader

Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in the Apache parquet-mr project.

From the nullifyColumns method of the ColumnMaskerTest class:

private void nullifyColumns(Configuration conf, String inputFile, String outputFile) throws IOException {
    Path inPath = new Path(inputFile);
    Path outPath = new Path(outputFile);
    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
    MessageType schema = metaData.getFileMetaData().getSchema();
    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.OVERWRITE);
    writer.start();
    List<String> paths = new ArrayList<>();
    paths.add("DocId");
    paths.add("Gender");
    paths.add("Links.Backward");
    try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
        columnMasker.processBlocks(reader, writer, metaData, schema, paths, ColumnMasker.MaskMode.NULLIFY);
    } finally {
        writer.end(metaData.getFileMetaData().getKeyValueMetaData());
    }
}
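To confirm the masking took effect, the output file can be read back with the example Group API (org.apache.parquet.hadoop.ParquetReader, org.apache.parquet.hadoop.example.GroupReadSupport, org.apache.parquet.example.data.Group); after NULLIFY a masked column should carry no values. A rough sketch, with the assertion and record shape assumed for illustration:

// Read the masked file back and check that a nullified column has no values.
try (ParquetReader<Group> outReader = ParquetReader.builder(new GroupReadSupport(), outPath).withConf(conf).build()) {
    Group group;
    while ((group = outReader.read()) != null) {
        // DocId was nullified, so no record should carry a value for it.
        assertEquals(0, group.getFieldRepetitionCount("DocId"));
    }
}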

Example 4 with TransParquetFileReader

Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in the Apache parquet-mr project.

From the run method of the ColumnMaskingCommand class:

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(mode != null && (mode.equals("nullify")), "mask mode cannot be null and can be only nullify");
    Preconditions.checkArgument(input != null && output != null, "Both input and output parquet file paths are required.");
    Preconditions.checkArgument(cols != null && cols.size() > 0, "columns cannot be null or empty");
    MaskMode maskMode = MaskMode.fromString(mode);
    Path inPath = new Path(input);
    Path outPath = new Path(output);
    ParquetMetadata metaData = ParquetFileReader.readFooter(getConf(), inPath, NO_FILTER);
    MessageType schema = metaData.getFileMetaData().getSchema();
    ParquetFileWriter writer = new ParquetFileWriter(getConf(), schema, outPath, ParquetFileWriter.Mode.CREATE);
    writer.start();
    try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, getConf()), HadoopReadOptions.builder(getConf()).build())) {
        masker.processBlocks(reader, writer, metaData, schema, cols, maskMode);
    } finally {
        writer.end(metaData.getFileMetaData().getKeyValueMetaData());
    }
    return 0;
}

Example 5 with TransParquetFileReader

Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in the Apache parquet-mr project.

From the convertCompression method of the CompressionConveterTest class:

private void convertCompression(Configuration conf, String inputFile, String outputFile, String codec) throws IOException {
    Path inPath = new Path(inputFile);
    Path outPath = new Path(outputFile);
    CompressionCodecName codecName = CompressionCodecName.valueOf(codec);
    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
    MessageType schema = metaData.getFileMetaData().getSchema();
    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
    writer.start();
    try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
        compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
    } finally {
        writer.end(metaData.getFileMetaData().getKeyValueMetaData());
    }
}
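A quick way to verify the conversion is to re-read the footer of the output file and check the codec recorded for each column chunk; a sketch along those lines (BlockMetaData from org.apache.parquet.hadoop.metadata and the JUnit assertion are assumed to be imported):

// Re-read the converted file's footer and confirm every column chunk reports the target codec.
ParquetMetadata outMeta = ParquetFileReader.readFooter(conf, outPath, NO_FILTER);
for (BlockMetaData block : outMeta.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
        assertEquals(codecName, column.getCodec());
    }
}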