
Example 6 with TransParquetFileReader

Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in project parquet-mr by Apache.

From the class CompressionConveterTest, method validColumnIndex.

private void validColumnIndex(String inputFile, String outFile) throws Exception {
    ParquetMetadata inMetaData = ParquetFileReader.readFooter(conf, new Path(inputFile), NO_FILTER);
    ParquetMetadata outMetaData = ParquetFileReader.readFooter(conf, new Path(outFile), NO_FILTER);
    Assert.assertEquals(inMetaData.getBlocks().size(), outMetaData.getBlocks().size());
    try (TransParquetFileReader inReader = new TransParquetFileReader(HadoopInputFile.fromPath(new Path(inputFile), conf), HadoopReadOptions.builder(conf).build());
        TransParquetFileReader outReader = new TransParquetFileReader(HadoopInputFile.fromPath(new Path(outFile), conf), HadoopReadOptions.builder(conf).build())) {
        for (int i = 0; i < inMetaData.getBlocks().size(); i++) {
            BlockMetaData inBlockMetaData = inMetaData.getBlocks().get(i);
            BlockMetaData outBlockMetaData = outMetaData.getBlocks().get(i);
            Assert.assertEquals(inBlockMetaData.getColumns().size(), outBlockMetaData.getColumns().size());
            for (int j = 0; j < inBlockMetaData.getColumns().size(); j++) {
                ColumnChunkMetaData inChunk = inBlockMetaData.getColumns().get(j);
                ColumnIndex inColumnIndex = inReader.readColumnIndex(inChunk);
                OffsetIndex inOffsetIndex = inReader.readOffsetIndex(inChunk);
                ColumnChunkMetaData outChunk = outBlockMetaData.getColumns().get(j);
                ColumnIndex outColumnIndex = outReader.readColumnIndex(outChunk);
                OffsetIndex outOffsetIndex = outReader.readOffsetIndex(outChunk);
                if (inColumnIndex != null) {
                    Assert.assertEquals(inColumnIndex.getBoundaryOrder(), outColumnIndex.getBoundaryOrder());
                    Assert.assertEquals(inColumnIndex.getMaxValues(), outColumnIndex.getMaxValues());
                    Assert.assertEquals(inColumnIndex.getMinValues(), outColumnIndex.getMinValues());
                    Assert.assertEquals(inColumnIndex.getNullCounts(), outColumnIndex.getNullCounts());
                }
                if (inOffsetIndex != null) {
                    List<Long> inOffsets = getOffsets(inReader, inChunk);
                    List<Long> outOffsets = getOffsets(outReader, outChunk);
                    Assert.assertEquals(inOffsets.size(), outOffsets.size());
                    Assert.assertEquals(inOffsets.size(), inOffsetIndex.getPageCount());
                    Assert.assertEquals(inOffsetIndex.getPageCount(), outOffsetIndex.getPageCount());
                    for (int k = 0; k < inOffsetIndex.getPageCount(); k++) {
                        Assert.assertEquals(inOffsetIndex.getFirstRowIndex(k), outOffsetIndex.getFirstRowIndex(k));
                        Assert.assertEquals(inOffsetIndex.getLastRowIndex(k, inChunk.getValueCount()), outOffsetIndex.getLastRowIndex(k, outChunk.getValueCount()));
                        Assert.assertEquals(inOffsetIndex.getOffset(k), (long) inOffsets.get(k));
                        Assert.assertEquals(outOffsetIndex.getOffset(k), (long) outOffsets.get(k));
                    }
                }
            }
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path) BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex) ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) TransParquetFileReader (org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader) OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex)
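
For comparison, here is a minimal standalone sketch (not part of the test) of the same index-walking pattern applied to a single file. It assumes a Hadoop Configuration named conf and a path string file, both placeholders here; the reader APIs (readColumnIndex, readOffsetIndex) are the ones used in the test above.

private void printColumnIndexes(String file) throws Exception {
    // Read the footer to get per-block, per-column-chunk metadata.
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(file), NO_FILTER);
    try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(new Path(file), conf), HadoopReadOptions.builder(conf).build())) {
        for (BlockMetaData block : footer.getBlocks()) {
            for (ColumnChunkMetaData chunk : block.getColumns()) {
                // Either index may be null if the writer did not produce it for this chunk.
                ColumnIndex columnIndex = reader.readColumnIndex(chunk);
                OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);
                if (columnIndex != null) {
                    System.out.println(chunk.getPath() + " boundary order: " + columnIndex.getBoundaryOrder());
                }
                if (offsetIndex != null) {
                    System.out.println(chunk.getPath() + " pages: " + offsetIndex.getPageCount());
                }
            }
        }
    }
}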

Example 7 with TransParquetFileReader

Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in project parquet-mr by Apache.

From the class ColumnEncryptorTest, method verifyOffsetIndexes.

private void verifyOffsetIndexes() throws IOException {
    ParquetReadOptions readOptions = HadoopReadOptions.builder(conf).withDecryption(EncDecProperties.getFileDecryptionProperties()).build();
    try (TransParquetFileReader inReader = createFileReader(inputFile.getFileName());
        TransParquetFileReader outReader = createFileReader(outputFile)) {
        ParquetMetadata inMetaData = getMetadata(readOptions, inputFile.getFileName(), inReader);
        ParquetMetadata outMetaData = getMetadata(readOptions, outputFile, outReader);
        compareOffsetIndexes(inReader, outReader, inMetaData, outMetaData);
    }
}
Also used: ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata) TransParquetFileReader (org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader) ParquetReadOptions (org.apache.parquet.ParquetReadOptions)
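
The createFileReader and getMetadata helpers are elided from this snippet. One plausible shape for them, assuming the same conf field and the EncDecProperties test fixture referenced above, is sketched below; this is hypothetical, not copied from ColumnEncryptorTest.

private TransParquetFileReader createFileReader(String path) throws IOException {
    // Build read options with decryption enabled, mirroring verifyOffsetIndexes above.
    ParquetReadOptions readOptions = HadoopReadOptions.builder(conf)
        .withDecryption(EncDecProperties.getFileDecryptionProperties())
        .build();
    return new TransParquetFileReader(HadoopInputFile.fromPath(new Path(path), conf), readOptions);
}

private ParquetMetadata getMetadata(ParquetReadOptions readOptions, String path, TransParquetFileReader reader) throws IOException {
    // In this sketch only the reader is needed: its footer was parsed when the reader was opened.
    return reader.getFooter();
}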

Example 8 with TransParquetFileReader

Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in project parquet-mr by Apache.

From the class TransCompressionCommand, method run.

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(input != null && output != null, "Both input and output parquet file paths are required.");
    Preconditions.checkArgument(codec != null, "The codec cannot be null");
    Path inPath = new Path(input);
    Path outPath = new Path(output);
    CompressionCodecName codecName = CompressionCodecName.valueOf(codec);
    ParquetMetadata metaData = ParquetFileReader.readFooter(getConf(), inPath, NO_FILTER);
    MessageType schema = metaData.getFileMetaData().getSchema();
    ParquetFileWriter writer = new ParquetFileWriter(getConf(), schema, outPath, ParquetFileWriter.Mode.CREATE);
    writer.start();
    try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, getConf()), HadoopReadOptions.builder(getConf()).build())) {
        compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
    } finally {
        writer.end(metaData.getFileMetaData().getKeyValueMetaData());
    }
    return 0;
}
Also used: Path (org.apache.hadoop.fs.Path) ParquetFileWriter (org.apache.parquet.hadoop.ParquetFileWriter) CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName) ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata) TransParquetFileReader (org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader) MessageType (org.apache.parquet.schema.MessageType)
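
The command is a thin wrapper around CompressionConverter, so the same conversion can be driven programmatically. The condensed sketch below is assumed, not the command's code: the paths and target codec are placeholders, and CompressionConverter is assumed to be directly instantiable, as the command's compressionConverter field suggests.

Configuration conf = new Configuration();
Path inPath = new Path("/tmp/input.parquet");    // placeholder paths
Path outPath = new Path("/tmp/output.parquet");
CompressionCodecName codecName = CompressionCodecName.ZSTD;

ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
MessageType schema = metaData.getFileMetaData().getSchema();
CompressionConverter converter = new CompressionConverter();

ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
writer.start();
try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    // Re-encodes every block of the input with the target codec while streaming it to the writer.
    converter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
} finally {
    // end() writes the new footer, carrying over the original key/value metadata.
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
}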

Example 9 with TransParquetFileReader

Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in project parquet-mr by Apache.

From the class ColumnEncryptor, method encryptColumns.

/**
 * Given the input file, encrypts the columns specified by paths and writes the result to the output file.
 * The encryption settings are supplied via the fileEncryptionProperties parameter.
 * @param inputFile input file path
 * @param outputFile output file path
 * @param paths columns to be encrypted
 * @param fileEncryptionProperties FileEncryptionProperties used for the output file
 * @throws IOException if reading the input or writing the output fails
 */
public void encryptColumns(String inputFile, String outputFile, List<String> paths, FileEncryptionProperties fileEncryptionProperties) throws IOException {
    Path inPath = new Path(inputFile);
    Path outPath = new Path(outputFile);
    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
    MessageType schema = metaData.getFileMetaData().getSchema();
    ParquetFileWriter writer = new ParquetFileWriter(HadoopOutputFile.fromPath(outPath, conf), schema,
        ParquetFileWriter.Mode.OVERWRITE, DEFAULT_BLOCK_SIZE, MAX_PADDING_SIZE_DEFAULT,
        DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, DEFAULT_STATISTICS_TRUNCATE_LENGTH,
        ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED, fileEncryptionProperties);
    writer.start();
    try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
        processBlocks(reader, writer, metaData, schema, paths);
    }
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
}
Also used: ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath) Path (org.apache.hadoop.fs.Path) ParquetFileWriter (org.apache.parquet.hadoop.ParquetFileWriter) ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata) TransParquetFileReader (org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader) MessageType (org.apache.parquet.schema.MessageType)
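
A hypothetical caller, not taken from the source, could look like the sketch below. It assumes ColumnEncryptor is constructed with the Hadoop Configuration it uses above; the footer key is hard-coded only for illustration and would come from a key-management system in practice.

Configuration conf = new Configuration();
ColumnEncryptor encryptor = new ColumnEncryptor(conf);

// 128-bit footer key; placeholder bytes for illustration only.
byte[] footerKey = new byte[16];
FileEncryptionProperties encryptionProps = FileEncryptionProperties.builder(footerKey).build();

// Encrypt two columns (dotted paths) of a plaintext file into a new encrypted file.
encryptor.encryptColumns(
    "/tmp/plain.parquet",        // placeholder input path
    "/tmp/encrypted.parquet",    // placeholder output path
    Arrays.asList("name", "address.zip"),
    encryptionProps);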

Aggregations

ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata) 9
TransParquetFileReader (org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader) 9
Path (org.apache.hadoop.fs.Path) 7
ParquetFileWriter (org.apache.parquet.hadoop.ParquetFileWriter) 7
MessageType (org.apache.parquet.schema.MessageType) 7
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) 3
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath) 3
OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex) 3
IOException (java.io.IOException) 2
HashSet (java.util.HashSet) 2
List (java.util.List) 2
Map (java.util.Map) 2
Set (java.util.Set) 2
Collectors (java.util.stream.Collectors) 2
Configuration (org.apache.hadoop.conf.Configuration) 2
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor) 2
ParquetProperties (org.apache.parquet.column.ParquetProperties) 2
PageReadStore (org.apache.parquet.column.page.PageReadStore) 2
CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName) 2
ArrayList (java.util.ArrayList) 1