Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in project parquet-mr by apache.
The class CompressionConveterTest, method validColumnIndex.
private void validColumnIndex(String inputFile, String outFile) throws Exception {
  ParquetMetadata inMetaData = ParquetFileReader.readFooter(conf, new Path(inputFile), NO_FILTER);
  ParquetMetadata outMetaData = ParquetFileReader.readFooter(conf, new Path(outFile), NO_FILTER);
  Assert.assertEquals(inMetaData.getBlocks().size(), outMetaData.getBlocks().size());
  try (TransParquetFileReader inReader = new TransParquetFileReader(
           HadoopInputFile.fromPath(new Path(inputFile), conf), HadoopReadOptions.builder(conf).build());
       TransParquetFileReader outReader = new TransParquetFileReader(
           HadoopInputFile.fromPath(new Path(outFile), conf), HadoopReadOptions.builder(conf).build())) {
    for (int i = 0; i < inMetaData.getBlocks().size(); i++) {
      BlockMetaData inBlockMetaData = inMetaData.getBlocks().get(i);
      BlockMetaData outBlockMetaData = outMetaData.getBlocks().get(i);
      Assert.assertEquals(inBlockMetaData.getColumns().size(), outBlockMetaData.getColumns().size());
      for (int j = 0; j < inBlockMetaData.getColumns().size(); j++) {
        ColumnChunkMetaData inChunk = inBlockMetaData.getColumns().get(j);
        ColumnIndex inColumnIndex = inReader.readColumnIndex(inChunk);
        OffsetIndex inOffsetIndex = inReader.readOffsetIndex(inChunk);
        ColumnChunkMetaData outChunk = outBlockMetaData.getColumns().get(j);
        ColumnIndex outColumnIndex = outReader.readColumnIndex(outChunk);
        OffsetIndex outOffsetIndex = outReader.readOffsetIndex(outChunk);
        if (inColumnIndex != null) {
          Assert.assertEquals(inColumnIndex.getBoundaryOrder(), outColumnIndex.getBoundaryOrder());
          Assert.assertEquals(inColumnIndex.getMaxValues(), outColumnIndex.getMaxValues());
          Assert.assertEquals(inColumnIndex.getMinValues(), outColumnIndex.getMinValues());
          Assert.assertEquals(inColumnIndex.getNullCounts(), outColumnIndex.getNullCounts());
        }
        if (inOffsetIndex != null) {
          List<Long> inOffsets = getOffsets(inReader, inChunk);
          List<Long> outOffsets = getOffsets(outReader, outChunk);
          Assert.assertEquals(inOffsets.size(), outOffsets.size());
          Assert.assertEquals(inOffsets.size(), inOffsetIndex.getPageCount());
          Assert.assertEquals(inOffsetIndex.getPageCount(), outOffsetIndex.getPageCount());
          for (int k = 0; k < inOffsetIndex.getPageCount(); k++) {
            Assert.assertEquals(inOffsetIndex.getFirstRowIndex(k), outOffsetIndex.getFirstRowIndex(k));
            Assert.assertEquals(inOffsetIndex.getLastRowIndex(k, inChunk.getValueCount()),
                outOffsetIndex.getLastRowIndex(k, outChunk.getValueCount()));
            Assert.assertEquals(inOffsetIndex.getOffset(k), (long) inOffsets.get(k));
            Assert.assertEquals(outOffsetIndex.getOffset(k), (long) outOffsets.get(k));
          }
        }
      }
    }
  }
}
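The test above relies on a getOffsets helper that is not shown on this page. Below is a minimal sketch of what such a helper could look like; it is an assumption, not the exact code from CompressionConveterTest. The idea is to walk the column chunk page by page and record the file offset of each data page header so it can be compared against OffsetIndex.getOffset.

// Hypothetical getOffsets sketch (assumed, not the exact test helper).
// Requires: org.apache.parquet.format.PageHeader, org.apache.parquet.format.PageType,
// java.util.ArrayList, java.util.List.
private List<Long> getOffsets(TransParquetFileReader reader, ColumnChunkMetaData chunk) throws IOException {
  List<Long> offsets = new ArrayList<>();
  reader.setStreamPosition(chunk.getStartingPos());
  long valuesRead = 0;
  while (valuesRead < chunk.getValueCount()) {
    long pageOffset = reader.getPos();
    PageHeader header = reader.readPageHeader();
    if (header.getType() == PageType.DATA_PAGE) {
      offsets.add(pageOffset);  // only data pages appear in the offset index
      valuesRead += header.getData_page_header().getNum_values();
    } else if (header.getType() == PageType.DATA_PAGE_V2) {
      offsets.add(pageOffset);
      valuesRead += header.getData_page_header_v2().getNum_values();
    }
    // skip the (compressed) page body to land on the next page header
    reader.setStreamPosition(reader.getPos() + header.getCompressed_page_size());
  }
  return offsets;
}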
Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in project parquet-mr by apache.
The class ColumnEncryptorTest, method verifyOffsetIndexes.
private void verifyOffsetIndexes() throws IOException {
  ParquetReadOptions readOptions = HadoopReadOptions.builder(conf)
      .withDecryption(EncDecProperties.getFileDecryptionProperties())
      .build();
  try (TransParquetFileReader inReader = createFileReader(inputFile.getFileName());
       TransParquetFileReader outReader = createFileReader(outputFile)) {
    ParquetMetadata inMetaData = getMetadata(readOptions, inputFile.getFileName(), inReader);
    ParquetMetadata outMetaData = getMetadata(readOptions, outputFile, outReader);
    compareOffsetIndexes(inReader, outReader, inMetaData, outMetaData);
  }
}
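The createFileReader and getMetadata helpers referenced above are not shown here. The sketches below are assumptions that merely match the call sites: the reader is opened with the test's decryption properties so the encrypted output can be read back, and the footer is taken from the already-opened reader.

// Assumed helper: open a TransParquetFileReader with decryption enabled.
private TransParquetFileReader createFileReader(String path) throws IOException {
  return new TransParquetFileReader(
      HadoopInputFile.fromPath(new Path(path), conf),
      HadoopReadOptions.builder(conf)
          .withDecryption(EncDecProperties.getFileDecryptionProperties())
          .build());
}

// Assumed helper: the footer parsed by the reader already honours the decryption
// properties in its read options, so the extra parameters are unused in this sketch.
private ParquetMetadata getMetadata(ParquetReadOptions readOptions, String path, TransParquetFileReader reader) throws IOException {
  return reader.getFooter();
}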
Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in project parquet-mr by apache.
The class TransCompressionCommand, method run.
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(input != null && output != null,
      "Both input and output parquet file paths are required.");
  Preconditions.checkArgument(codec != null, "The codec cannot be null");
  Path inPath = new Path(input);
  Path outPath = new Path(output);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);
  ParquetMetadata metaData = ParquetFileReader.readFooter(getConf(), inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(getConf(), schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();
  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, getConf()), HadoopReadOptions.builder(getConf()).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
  return 0;
}
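The same conversion can be driven directly from code, outside the command. A minimal sketch follows: the file paths and target codec are placeholders, and it assumes CompressionConverter's public no-argument constructor together with the processBlocks signature already used above.

// Hypothetical standalone conversion, mirroring the command body above.
Configuration conf = new Configuration();
Path inPath = new Path("/tmp/input.parquet");         // placeholder path
Path outPath = new Path("/tmp/output-zstd.parquet");  // placeholder path
CompressionCodecName codecName = CompressionCodecName.ZSTD;

ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
MessageType schema = metaData.getFileMetaData().getSchema();
ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
writer.start();
CompressionConverter compressionConverter = new CompressionConverter();
try (TransParquetFileReader reader = new TransParquetFileReader(
    HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
  compressionConverter.processBlocks(reader, writer, metaData, schema,
      metaData.getFileMetaData().getCreatedBy(), codecName);
} finally {
  writer.end(metaData.getFileMetaData().getKeyValueMetaData());
}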
Use of org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader in project parquet-mr by apache.
The class ColumnEncryptor, method encryptColumns.
/**
 * Given the input file, encrypts the columns specified by paths and writes the result to the output file.
 * The encryption settings are supplied via the fileEncryptionProperties parameter.
 * @param inputFile input file path
 * @param outputFile output file path
 * @param paths columns to be encrypted
 * @param fileEncryptionProperties FileEncryptionProperties of the file
 * @throws IOException if reading the input or writing the output fails
 */
public void encryptColumns(String inputFile, String outputFile, List<String> paths,
    FileEncryptionProperties fileEncryptionProperties) throws IOException {
  Path inPath = new Path(inputFile);
  Path outPath = new Path(outputFile);
  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(
      HadoopOutputFile.fromPath(outPath, conf),
      schema,
      ParquetFileWriter.Mode.OVERWRITE,
      DEFAULT_BLOCK_SIZE,
      MAX_PADDING_SIZE_DEFAULT,
      DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
      DEFAULT_STATISTICS_TRUNCATE_LENGTH,
      ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED,
      fileEncryptionProperties);
  writer.start();
  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    processBlocks(reader, writer, metaData, schema, paths);
  }
  writer.end(metaData.getFileMetaData().getKeyValueMetaData());
}