Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache.
Class TestParquetMetadataConverter, method testOffsetIndexConversion:
@Test
public void testOffsetIndexConversion() {
  OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder();
  builder.add(1000, 10000, 0);
  builder.add(22000, 12000, 100);
  OffsetIndex offsetIndex = ParquetMetadataConverter
      .fromParquetOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(builder.build(100000)));
  assertEquals(2, offsetIndex.getPageCount());
  assertEquals(101000, offsetIndex.getOffset(0));
  assertEquals(10000, offsetIndex.getCompressedPageSize(0));
  assertEquals(0, offsetIndex.getFirstRowIndex(0));
  assertEquals(122000, offsetIndex.getOffset(1));
  assertEquals(12000, offsetIndex.getCompressedPageSize(1));
  assertEquals(100, offsetIndex.getFirstRowIndex(1));
}
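The round trip above only exercises the OffsetIndex accessors (getPageCount, getOffset, getCompressedPageSize, getFirstRowIndex). A minimal sketch of walking the page locations described by an offset index, assuming an offsetIndex obtained as in the test; the printed layout is illustrative only:

// Minimal sketch: iterate the pages described by an OffsetIndex.
// Assumes 'offsetIndex' was obtained as in the test above.
for (int page = 0; page < offsetIndex.getPageCount(); page++) {
  long offset = offsetIndex.getOffset(page);           // file position of the page
  int size = offsetIndex.getCompressedPageSize(page);  // compressed page size in bytes
  long firstRow = offsetIndex.getFirstRowIndex(page);  // first row of the page within the row group
  System.out.println("page " + page + ": offset=" + offset + ", size=" + size + ", firstRow=" + firstRow);
}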
Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache.
Class CompressionConveterTest, method validColumnIndex:
private void validColumnIndex(String inputFile, String outFile) throws Exception {
  ParquetMetadata inMetaData = ParquetFileReader.readFooter(conf, new Path(inputFile), NO_FILTER);
  ParquetMetadata outMetaData = ParquetFileReader.readFooter(conf, new Path(outFile), NO_FILTER);
  Assert.assertEquals(inMetaData.getBlocks().size(), outMetaData.getBlocks().size());
  try (TransParquetFileReader inReader = new TransParquetFileReader(
          HadoopInputFile.fromPath(new Path(inputFile), conf), HadoopReadOptions.builder(conf).build());
      TransParquetFileReader outReader = new TransParquetFileReader(
          HadoopInputFile.fromPath(new Path(outFile), conf), HadoopReadOptions.builder(conf).build())) {
    for (int i = 0; i < inMetaData.getBlocks().size(); i++) {
      BlockMetaData inBlockMetaData = inMetaData.getBlocks().get(i);
      BlockMetaData outBlockMetaData = outMetaData.getBlocks().get(i);
      Assert.assertEquals(inBlockMetaData.getColumns().size(), outBlockMetaData.getColumns().size());
      for (int j = 0; j < inBlockMetaData.getColumns().size(); j++) {
        ColumnChunkMetaData inChunk = inBlockMetaData.getColumns().get(j);
        ColumnIndex inColumnIndex = inReader.readColumnIndex(inChunk);
        OffsetIndex inOffsetIndex = inReader.readOffsetIndex(inChunk);
        ColumnChunkMetaData outChunk = outBlockMetaData.getColumns().get(j);
        ColumnIndex outColumnIndex = outReader.readColumnIndex(outChunk);
        OffsetIndex outOffsetIndex = outReader.readOffsetIndex(outChunk);
        if (inColumnIndex != null) {
          Assert.assertEquals(inColumnIndex.getBoundaryOrder(), outColumnIndex.getBoundaryOrder());
          Assert.assertEquals(inColumnIndex.getMaxValues(), outColumnIndex.getMaxValues());
          Assert.assertEquals(inColumnIndex.getMinValues(), outColumnIndex.getMinValues());
          Assert.assertEquals(inColumnIndex.getNullCounts(), outColumnIndex.getNullCounts());
        }
        if (inOffsetIndex != null) {
          List<Long> inOffsets = getOffsets(inReader, inChunk);
          List<Long> outOffsets = getOffsets(outReader, outChunk);
          Assert.assertEquals(inOffsets.size(), outOffsets.size());
          Assert.assertEquals(inOffsets.size(), inOffsetIndex.getPageCount());
          Assert.assertEquals(inOffsetIndex.getPageCount(), outOffsetIndex.getPageCount());
          for (int k = 0; k < inOffsetIndex.getPageCount(); k++) {
            Assert.assertEquals(inOffsetIndex.getFirstRowIndex(k), outOffsetIndex.getFirstRowIndex(k));
            Assert.assertEquals(inOffsetIndex.getLastRowIndex(k, inChunk.getValueCount()),
                outOffsetIndex.getLastRowIndex(k, outChunk.getValueCount()));
            Assert.assertEquals(inOffsetIndex.getOffset(k), (long) inOffsets.get(k));
            Assert.assertEquals(outOffsetIndex.getOffset(k), (long) outOffsets.get(k));
          }
        }
      }
    }
  }
}
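The check above compares row ranges between the original and the converted file while letting the physical offsets and compressed sizes differ. A minimal sketch of that idea in isolation, using only the OffsetIndex methods already shown; sameRowRanges is a hypothetical helper name:

// Minimal sketch: check that two OffsetIndex instances split the same rows
// into the same pages. Offsets and compressed sizes may differ (e.g. after
// re-compression); only the row ranges must match. 'totalRows' is assumed
// to be the value count of the column chunk.
static boolean sameRowRanges(OffsetIndex a, OffsetIndex b, long totalRows) {
  if (a.getPageCount() != b.getPageCount()) {
    return false;
  }
  for (int page = 0; page < a.getPageCount(); page++) {
    if (a.getFirstRowIndex(page) != b.getFirstRowIndex(page)
        || a.getLastRowIndex(page, totalRows) != b.getLastRowIndex(page, totalRows)) {
      return false;
    }
  }
  return true;
}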
Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache.
Class TestParquetFileWriter, method testColumnIndexWriteRead:
@Test
public void testColumnIndexWriteRead() throws Exception {
  File testFile = temp.newFile();
  testFile.delete();
  Path path = new Path(testFile.toURI());
  Configuration configuration = new Configuration();
  ParquetFileWriter w = new ParquetFileWriter(configuration, SCHEMA, path);
  w.start();
  w.startBlock(4);
  w.startColumn(C1, 7, CODEC);
  w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(C2, 8, CODEC);
  w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.startBlock(4);
  w.startColumn(C1, 5, CODEC);
  long c1p1Starts = w.getPos();
  w.writeDataPage(2, 4, BytesInput.from(BYTES1), statsC1(null, Binary.fromString("aaa")), 1, BIT_PACKED, BIT_PACKED, PLAIN);
  long c1p2Starts = w.getPos();
  w.writeDataPage(3, 4, BytesInput.from(BYTES1), statsC1(Binary.fromString("bbb"), Binary.fromString("ccc")), 3, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  long c1Ends = w.getPos();
  w.startColumn(C2, 6, CODEC);
  long c2p1Starts = w.getPos();
  w.writeDataPage(2, 4, BytesInput.from(BYTES2), statsC2(117L, 100L), 1, BIT_PACKED, BIT_PACKED, PLAIN);
  long c2p2Starts = w.getPos();
  w.writeDataPage(3, 4, BytesInput.from(BYTES2), statsC2(null, null, null), 2, BIT_PACKED, BIT_PACKED, PLAIN);
  long c2p3Starts = w.getPos();
  w.writeDataPage(1, 4, BytesInput.from(BYTES2), statsC2(0L), 1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  long c2Ends = w.getPos();
  w.endBlock();
  w.startBlock(4);
  w.startColumn(C1, 7, CODEC);
  // Creating huge stats so the column index will reach the limit and won't be written
  w.writeDataPage(7, 4, BytesInput.from(BYTES3),
      statsC1(Binary.fromConstantByteArray(new byte[(int) MAX_STATS_SIZE]), Binary.fromConstantByteArray(new byte[1])),
      4, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(C2, 8, CODEC);
  w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.end(new HashMap<String, String>());
  try (ParquetFileReader reader = new ParquetFileReader(
      HadoopInputFile.fromPath(path, configuration), ParquetReadOptions.builder().build())) {
    ParquetMetadata footer = reader.getFooter();
    assertEquals(3, footer.getBlocks().size());
    BlockMetaData blockMeta = footer.getBlocks().get(1);
    assertEquals(2, blockMeta.getColumns().size());
    ColumnIndex columnIndex = reader.readColumnIndex(blockMeta.getColumns().get(0));
    assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
    assertTrue(Arrays.asList(1L, 0L).equals(columnIndex.getNullCounts()));
    assertTrue(Arrays.asList(false, false).equals(columnIndex.getNullPages()));
    List<ByteBuffer> minValues = columnIndex.getMinValues();
    assertEquals(2, minValues.size());
    List<ByteBuffer> maxValues = columnIndex.getMaxValues();
    assertEquals(2, maxValues.size());
    assertEquals("aaa", new String(minValues.get(0).array(), StandardCharsets.UTF_8));
    assertEquals("aaa", new String(maxValues.get(0).array(), StandardCharsets.UTF_8));
    assertEquals("bbb", new String(minValues.get(1).array(), StandardCharsets.UTF_8));
    assertEquals("ccc", new String(maxValues.get(1).array(), StandardCharsets.UTF_8));
    columnIndex = reader.readColumnIndex(blockMeta.getColumns().get(1));
    assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
    assertTrue(Arrays.asList(0L, 3L, 0L).equals(columnIndex.getNullCounts()));
    assertTrue(Arrays.asList(false, true, false).equals(columnIndex.getNullPages()));
    minValues = columnIndex.getMinValues();
    assertEquals(3, minValues.size());
    maxValues = columnIndex.getMaxValues();
    assertEquals(3, maxValues.size());
    assertEquals(100, BytesUtils.bytesToLong(minValues.get(0).array()));
    assertEquals(117, BytesUtils.bytesToLong(maxValues.get(0).array()));
    assertEquals(0, minValues.get(1).array().length);
    assertEquals(0, maxValues.get(1).array().length);
    assertEquals(0, BytesUtils.bytesToLong(minValues.get(2).array()));
    assertEquals(0, BytesUtils.bytesToLong(maxValues.get(2).array()));
    OffsetIndex offsetIndex = reader.readOffsetIndex(blockMeta.getColumns().get(0));
    assertEquals(2, offsetIndex.getPageCount());
    assertEquals(c1p1Starts, offsetIndex.getOffset(0));
    assertEquals(c1p2Starts, offsetIndex.getOffset(1));
    assertEquals(c1p2Starts - c1p1Starts, offsetIndex.getCompressedPageSize(0));
    assertEquals(c1Ends - c1p2Starts, offsetIndex.getCompressedPageSize(1));
    assertEquals(0, offsetIndex.getFirstRowIndex(0));
    assertEquals(1, offsetIndex.getFirstRowIndex(1));
    offsetIndex = reader.readOffsetIndex(blockMeta.getColumns().get(1));
    assertEquals(3, offsetIndex.getPageCount());
    assertEquals(c2p1Starts, offsetIndex.getOffset(0));
    assertEquals(c2p2Starts, offsetIndex.getOffset(1));
    assertEquals(c2p3Starts, offsetIndex.getOffset(2));
    assertEquals(c2p2Starts - c2p1Starts, offsetIndex.getCompressedPageSize(0));
    assertEquals(c2p3Starts - c2p2Starts, offsetIndex.getCompressedPageSize(1));
    assertEquals(c2Ends - c2p3Starts, offsetIndex.getCompressedPageSize(2));
    assertEquals(0, offsetIndex.getFirstRowIndex(0));
    assertEquals(1, offsetIndex.getFirstRowIndex(1));
    assertEquals(3, offsetIndex.getFirstRowIndex(2));
    assertNull(reader.readColumnIndex(footer.getBlocks().get(2).getColumns().get(0)));
  }
}
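The offset-index assertions encode a simple invariant: consecutive page offsets bound each page, so a page's compressed size is the gap to the next page's offset, or to the end of the column chunk for the last page. A minimal sketch of that relationship, assuming a chunkEnd position supplied by the caller; printPageRanges is a hypothetical helper name:

// Minimal sketch: derive the byte range of each page from an OffsetIndex,
// mirroring the relationships the assertions above verify.
static void printPageRanges(OffsetIndex offsetIndex, long chunkEnd) {
  for (int k = 0; k < offsetIndex.getPageCount(); k++) {
    long start = offsetIndex.getOffset(k);
    long end = k + 1 < offsetIndex.getPageCount() ? offsetIndex.getOffset(k + 1) : chunkEnd;
    // end - start equals getCompressedPageSize(k) when pages are contiguous
    System.out.println("page " + k + ": [" + start + ", " + end + ") size=" + offsetIndex.getCompressedPageSize(k));
  }
}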
Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache.
Class ShowColumnIndexCommand, method run:
@Override
public int run() throws IOException {
  Preconditions.checkArgument(files != null && files.size() >= 1, "A Parquet file is required.");
  Preconditions.checkArgument(files.size() == 1, "Cannot process multiple Parquet files.");
  InputFile in = HadoopInputFile.fromPath(qualifiedPath(files.get(0)), getConf());
  if (!showColumnIndex && !showOffsetIndex) {
    showColumnIndex = true;
    showOffsetIndex = true;
  }
  Set<String> rowGroupIndexSet = new HashSet<>();
  if (rowGroupIndexes != null) {
    rowGroupIndexSet.addAll(rowGroupIndexes);
  }
  try (ParquetFileReader reader = ParquetFileReader.open(in)) {
    boolean firstBlock = true;
    int rowGroupIndex = 0;
    for (BlockMetaData block : reader.getFooter().getBlocks()) {
      if (!rowGroupIndexSet.isEmpty() && !rowGroupIndexSet.contains(Integer.toString(rowGroupIndex))) {
        ++rowGroupIndex;
        continue;
      }
      if (!firstBlock) {
        console.info("");
      }
      firstBlock = false;
      console.info("row-group {}:", rowGroupIndex);
      for (ColumnChunkMetaData column : getColumns(block)) {
        String path = column.getPath().toDotString();
        if (showColumnIndex) {
          console.info("column index for column {}:", path);
          ColumnIndex columnIndex = reader.readColumnIndex(column);
          if (columnIndex == null) {
            console.info("NONE");
          } else {
            console.info(columnIndex.toString());
          }
        }
        if (showOffsetIndex) {
          console.info("offset index for column {}:", path);
          OffsetIndex offsetIndex = reader.readOffsetIndex(column);
          if (offsetIndex == null) {
            console.info("NONE");
          } else {
            console.info(offsetIndex.toString());
          }
        }
      }
      ++rowGroupIndex;
    }
  }
  return 0;
}
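Stripped of the CLI option handling, the core of the command is a loop over row groups and column chunks. A minimal standalone sketch under the assumption that conf is a Hadoop Configuration and file a Path; it uses only the reader calls shown in this section:

// Minimal sketch: print the offset index of every column chunk in a file.
try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(file, conf))) {
  int rowGroup = 0;
  for (BlockMetaData block : reader.getFooter().getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      OffsetIndex offsetIndex = reader.readOffsetIndex(column);
      System.out.println("row-group " + rowGroup + ", column " + column.getPath().toDotString() + ": "
          + (offsetIndex == null ? "NONE" : offsetIndex.toString()));
    }
    rowGroup++;
  }
}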
Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache.
Class ColumnEncryptor, method processPages:
private void processPages(TransParquetFileReader reader, ColumnChunkMetaData chunk, ParquetFileWriter writer,
    String createdBy, int blockId, int columnId, boolean encrypt) throws IOException {
  int pageOrdinal = 0;
  EncryptorRunTime encryptorRunTime = new EncryptorRunTime(writer.getEncryptor(), chunk, blockId, columnId);
  DictionaryPage dictionaryPage = null;
  long readValues = 0;
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);
  reader.setStreamPosition(chunk.getStartingPos());
  long totalChunkValues = chunk.getValueCount();
  while (readValues < totalChunkValues) {
    PageHeader pageHeader = reader.readPageHeader();
    byte[] pageLoad;
    switch (pageHeader.type) {
      case DICTIONARY_PAGE:
        if (dictionaryPage != null) {
          throw new IOException("has more than one dictionary page in column chunk");
        }
        // No quickUpdatePageAAD needed for dictionary page
        DictionaryPageHeader dictPageHeader = pageHeader.dictionary_page_header;
        pageLoad = processPayload(reader, pageHeader.getCompressed_page_size(),
            encryptorRunTime.getDataEncryptor(), encryptorRunTime.getDictPageAAD(), encrypt);
        writer.writeDictionaryPage(
            new DictionaryPage(BytesInput.from(pageLoad), pageHeader.getUncompressed_page_size(),
                dictPageHeader.getNum_values(), converter.getEncoding(dictPageHeader.getEncoding())),
            encryptorRunTime.getMetaDataEncryptor(), encryptorRunTime.getDictPageHeaderAAD());
        break;
      case DATA_PAGE:
        if (encrypt) {
          AesCipher.quickUpdatePageAAD(encryptorRunTime.getDataPageHeaderAAD(), pageOrdinal);
          AesCipher.quickUpdatePageAAD(encryptorRunTime.getDataPageAAD(), pageOrdinal);
        }
        DataPageHeader headerV1 = pageHeader.data_page_header;
        pageLoad = processPayload(reader, pageHeader.getCompressed_page_size(),
            encryptorRunTime.getDataEncryptor(), encryptorRunTime.getDataPageAAD(), encrypt);
        readValues += headerV1.getNum_values();
        if (offsetIndex != null) {
          long rowCount = 1 + offsetIndex.getLastRowIndex(pageOrdinal, totalChunkValues)
              - offsetIndex.getFirstRowIndex(pageOrdinal);
          writer.writeDataPage(Math.toIntExact(headerV1.getNum_values()), pageHeader.getUncompressed_page_size(),
              BytesInput.from(pageLoad),
              converter.fromParquetStatistics(createdBy, headerV1.getStatistics(), chunk.getPrimitiveType()),
              rowCount,
              converter.getEncoding(headerV1.getRepetition_level_encoding()),
              converter.getEncoding(headerV1.getDefinition_level_encoding()),
              converter.getEncoding(headerV1.getEncoding()),
              encryptorRunTime.getMetaDataEncryptor(), encryptorRunTime.getDataPageHeaderAAD());
        } else {
          writer.writeDataPage(Math.toIntExact(headerV1.getNum_values()), pageHeader.getUncompressed_page_size(),
              BytesInput.from(pageLoad),
              converter.fromParquetStatistics(createdBy, headerV1.getStatistics(), chunk.getPrimitiveType()),
              converter.getEncoding(headerV1.getRepetition_level_encoding()),
              converter.getEncoding(headerV1.getDefinition_level_encoding()),
              converter.getEncoding(headerV1.getEncoding()),
              encryptorRunTime.getMetaDataEncryptor(), encryptorRunTime.getDataPageHeaderAAD());
        }
        pageOrdinal++;
        break;
      case DATA_PAGE_V2:
        if (encrypt) {
          AesCipher.quickUpdatePageAAD(encryptorRunTime.getDataPageHeaderAAD(), pageOrdinal);
          AesCipher.quickUpdatePageAAD(encryptorRunTime.getDataPageAAD(), pageOrdinal);
        }
        DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
        int rlLength = headerV2.getRepetition_levels_byte_length();
        BytesInput rlLevels = readBlockAllocate(rlLength, reader);
        int dlLength = headerV2.getDefinition_levels_byte_length();
        BytesInput dlLevels = readBlockAllocate(dlLength, reader);
        int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
        int rawDataLength = pageHeader.getUncompressed_page_size() - rlLength - dlLength;
        pageLoad = processPayload(reader, payLoadLength, encryptorRunTime.getDataEncryptor(),
            encryptorRunTime.getDataPageAAD(), encrypt);
        readValues += headerV2.getNum_values();
        writer.writeDataPageV2(headerV2.getNum_rows(), headerV2.getNum_nulls(), headerV2.getNum_values(),
            rlLevels, dlLevels, converter.getEncoding(headerV2.getEncoding()), BytesInput.from(pageLoad),
            rawDataLength,
            converter.fromParquetStatistics(createdBy, headerV2.getStatistics(), chunk.getPrimitiveType()));
        pageOrdinal++;
        break;
      default:
        break;
    }
  }
}
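For V1 data pages the writer needs a row count that is not stored in the page header, so it is derived from the offset index: last row index (inclusive) minus first row index, plus one. A minimal sketch of that calculation, assuming an offsetIndex and total value count as above; rowCountsPerPage is a hypothetical helper name:

// Minimal sketch: per-page row counts from an OffsetIndex, as used above for
// V1 data pages. 'totalChunkValues' is the chunk's total value count.
static long[] rowCountsPerPage(OffsetIndex offsetIndex, long totalChunkValues) {
  long[] rowCounts = new long[offsetIndex.getPageCount()];
  for (int pageOrdinal = 0; pageOrdinal < rowCounts.length; pageOrdinal++) {
    // last row index is inclusive, hence the +1
    rowCounts[pageOrdinal] = 1 + offsetIndex.getLastRowIndex(pageOrdinal, totalChunkValues)
        - offsetIndex.getFirstRowIndex(pageOrdinal);
  }
  return rowCounts;
}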