Use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project parquet-mr by apache: class TestParquetFileWriter, method testColumnIndexWriteRead.
@Test
public void testColumnIndexWriteRead() throws Exception {
  File testFile = temp.newFile();
  testFile.delete();
  Path path = new Path(testFile.toURI());
  Configuration configuration = new Configuration();
  ParquetFileWriter w = new ParquetFileWriter(configuration, SCHEMA, path);
  w.start();

  // Row group 0: pages written without statistics
  w.startBlock(4);
  w.startColumn(C1, 7, CODEC);
  w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(C2, 8, CODEC);
  w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();

  // Row group 1: pages with statistics; page start offsets are recorded to validate the offset index below
  w.startBlock(4);
  w.startColumn(C1, 5, CODEC);
  long c1p1Starts = w.getPos();
  w.writeDataPage(2, 4, BytesInput.from(BYTES1), statsC1(null, Binary.fromString("aaa")), 1, BIT_PACKED, BIT_PACKED, PLAIN);
  long c1p2Starts = w.getPos();
  w.writeDataPage(3, 4, BytesInput.from(BYTES1), statsC1(Binary.fromString("bbb"), Binary.fromString("ccc")), 3, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  long c1Ends = w.getPos();
  w.startColumn(C2, 6, CODEC);
  long c2p1Starts = w.getPos();
  w.writeDataPage(2, 4, BytesInput.from(BYTES2), statsC2(117l, 100l), 1, BIT_PACKED, BIT_PACKED, PLAIN);
  long c2p2Starts = w.getPos();
  w.writeDataPage(3, 4, BytesInput.from(BYTES2), statsC2(null, null, null), 2, BIT_PACKED, BIT_PACKED, PLAIN);
  long c2p3Starts = w.getPos();
  w.writeDataPage(1, 4, BytesInput.from(BYTES2), statsC2(0l), 1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  long c2Ends = w.getPos();
  w.endBlock();

  // Row group 2: creating huge stats so the column index will reach the limit and won't be written
  w.startBlock(4);
  w.startColumn(C1, 7, CODEC);
  w.writeDataPage(7, 4, BytesInput.from(BYTES3),
      statsC1(Binary.fromConstantByteArray(new byte[(int) MAX_STATS_SIZE]), Binary.fromConstantByteArray(new byte[1])),
      4, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(C2, 8, CODEC);
  w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.end(new HashMap<String, String>());

  // Read the file back and verify the column and offset indexes of row group 1
  try (ParquetFileReader reader = new ParquetFileReader(HadoopInputFile.fromPath(path, configuration), ParquetReadOptions.builder().build())) {
    ParquetMetadata footer = reader.getFooter();
    assertEquals(3, footer.getBlocks().size());
    BlockMetaData blockMeta = footer.getBlocks().get(1);
    assertEquals(2, blockMeta.getColumns().size());

    ColumnIndex columnIndex = reader.readColumnIndex(blockMeta.getColumns().get(0));
    assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
    assertTrue(Arrays.asList(1l, 0l).equals(columnIndex.getNullCounts()));
    assertTrue(Arrays.asList(false, false).equals(columnIndex.getNullPages()));
    List<ByteBuffer> minValues = columnIndex.getMinValues();
    assertEquals(2, minValues.size());
    List<ByteBuffer> maxValues = columnIndex.getMaxValues();
    assertEquals(2, maxValues.size());
    assertEquals("aaa", new String(minValues.get(0).array(), StandardCharsets.UTF_8));
    assertEquals("aaa", new String(maxValues.get(0).array(), StandardCharsets.UTF_8));
    assertEquals("bbb", new String(minValues.get(1).array(), StandardCharsets.UTF_8));
    assertEquals("ccc", new String(maxValues.get(1).array(), StandardCharsets.UTF_8));

    columnIndex = reader.readColumnIndex(blockMeta.getColumns().get(1));
    assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
    assertTrue(Arrays.asList(0l, 3l, 0l).equals(columnIndex.getNullCounts()));
    assertTrue(Arrays.asList(false, true, false).equals(columnIndex.getNullPages()));
    minValues = columnIndex.getMinValues();
    assertEquals(3, minValues.size());
    maxValues = columnIndex.getMaxValues();
    assertEquals(3, maxValues.size());
    assertEquals(100, BytesUtils.bytesToLong(minValues.get(0).array()));
    assertEquals(117, BytesUtils.bytesToLong(maxValues.get(0).array()));
    assertEquals(0, minValues.get(1).array().length);
    assertEquals(0, maxValues.get(1).array().length);
    assertEquals(0, BytesUtils.bytesToLong(minValues.get(2).array()));
    assertEquals(0, BytesUtils.bytesToLong(maxValues.get(2).array()));

    OffsetIndex offsetIndex = reader.readOffsetIndex(blockMeta.getColumns().get(0));
    assertEquals(2, offsetIndex.getPageCount());
    assertEquals(c1p1Starts, offsetIndex.getOffset(0));
    assertEquals(c1p2Starts, offsetIndex.getOffset(1));
    assertEquals(c1p2Starts - c1p1Starts, offsetIndex.getCompressedPageSize(0));
    assertEquals(c1Ends - c1p2Starts, offsetIndex.getCompressedPageSize(1));
    assertEquals(0, offsetIndex.getFirstRowIndex(0));
    assertEquals(1, offsetIndex.getFirstRowIndex(1));

    offsetIndex = reader.readOffsetIndex(blockMeta.getColumns().get(1));
    assertEquals(3, offsetIndex.getPageCount());
    assertEquals(c2p1Starts, offsetIndex.getOffset(0));
    assertEquals(c2p2Starts, offsetIndex.getOffset(1));
    assertEquals(c2p3Starts, offsetIndex.getOffset(2));
    assertEquals(c2p2Starts - c2p1Starts, offsetIndex.getCompressedPageSize(0));
    assertEquals(c2p3Starts - c2p2Starts, offsetIndex.getCompressedPageSize(1));
    assertEquals(c2Ends - c2p3Starts, offsetIndex.getCompressedPageSize(2));
    assertEquals(0, offsetIndex.getFirstRowIndex(0));
    assertEquals(1, offsetIndex.getFirstRowIndex(1));
    assertEquals(3, offsetIndex.getFirstRowIndex(2));

    // No column index for the first column of row group 2: its stats exceeded the size limit
    assertNull(reader.readColumnIndex(footer.getBlocks().get(2).getColumns().get(0)));
  }
}
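For context, here is a minimal standalone sketch (not part of the parquet-mr sources) that uses the same reader calls exercised by the test above to dump page-level metadata from an arbitrary file. The class name DumpColumnIndexes and the fallback file name data.parquet are placeholders.

import java.nio.ByteBuffer;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;

public class DumpColumnIndexes {
  public static void main(String[] args) throws Exception {
    // Placeholder path; any Parquet file written with page indexes works.
    Path file = new Path(args.length > 0 ? args[0] : "data.parquet");
    try (ParquetFileReader reader =
        ParquetFileReader.open(HadoopInputFile.fromPath(file, new Configuration()))) {
      for (BlockMetaData block : reader.getFooter().getBlocks()) {
        for (ColumnChunkMetaData column : block.getColumns()) {
          ColumnIndex columnIndex = reader.readColumnIndex(column);
          OffsetIndex offsetIndex = reader.readOffsetIndex(column);
          if (columnIndex == null || offsetIndex == null) {
            continue; // indexes are optional, e.g. dropped when stats exceed the size limit
          }
          List<ByteBuffer> mins = columnIndex.getMinValues();
          List<ByteBuffer> maxs = columnIndex.getMaxValues();
          for (int page = 0; page < offsetIndex.getPageCount(); page++) {
            System.out.printf("column %s page %d: first row %d, offset %d, %d compressed bytes, "
                    + "%d min bytes, %d max bytes%n",
                column.getPath().toDotString(), page, offsetIndex.getFirstRowIndex(page),
                offsetIndex.getOffset(page), offsetIndex.getCompressedPageSize(page),
                mins.get(page).remaining(), maxs.get(page).remaining());
          }
        }
      }
    }
  }
}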
Use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project parquet-mr by apache: class ShowColumnIndexCommand, method run.
@Override
public int run() throws IOException {
  Preconditions.checkArgument(files != null && files.size() >= 1, "A Parquet file is required.");
  Preconditions.checkArgument(files.size() == 1, "Cannot process multiple Parquet files.");
  InputFile in = HadoopInputFile.fromPath(qualifiedPath(files.get(0)), getConf());
  if (!showColumnIndex && !showOffsetIndex) {
    showColumnIndex = true;
    showOffsetIndex = true;
  }
  Set<String> rowGroupIndexSet = new HashSet<>();
  if (rowGroupIndexes != null) {
    rowGroupIndexSet.addAll(rowGroupIndexes);
  }
  try (ParquetFileReader reader = ParquetFileReader.open(in)) {
    boolean firstBlock = true;
    int rowGroupIndex = 0;
    for (BlockMetaData block : reader.getFooter().getBlocks()) {
      if (!rowGroupIndexSet.isEmpty() && !rowGroupIndexSet.contains(Integer.toString(rowGroupIndex))) {
        ++rowGroupIndex;
        continue;
      }
      if (!firstBlock) {
        console.info("");
      }
      firstBlock = false;
      console.info("row-group {}:", rowGroupIndex);
      for (ColumnChunkMetaData column : getColumns(block)) {
        String path = column.getPath().toDotString();
        if (showColumnIndex) {
          console.info("column index for column {}:", path);
          ColumnIndex columnIndex = reader.readColumnIndex(column);
          if (columnIndex == null) {
            console.info("NONE");
          } else {
            console.info(columnIndex.toString());
          }
        }
        if (showOffsetIndex) {
          console.info("offset index for column {}:", path);
          OffsetIndex offsetIndex = reader.readOffsetIndex(column);
          if (offsetIndex == null) {
            console.info("NONE");
          } else {
            console.info(offsetIndex.toString());
          }
        }
      }
      ++rowGroupIndex;
    }
  }
  return 0;
}
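The getColumns(block) helper is defined elsewhere in the command. A plausible stand-in for it, assuming the requested column names arrive as dot-separated path strings, could look like the sketch below; the class and method names here are illustrative, not the actual parquet-cli code.

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;

final class ColumnChunkFilter {
  private ColumnChunkFilter() {}

  // Keep only the column chunks whose dot-separated path was requested; an empty
  // request set means "show every column of the row group".
  static List<ColumnChunkMetaData> filterColumns(BlockMetaData block, Set<String> requestedPaths) {
    if (requestedPaths.isEmpty()) {
      return block.getColumns();
    }
    List<ColumnChunkMetaData> selected = new ArrayList<>();
    for (ColumnChunkMetaData column : block.getColumns()) {
      if (requestedPaths.contains(column.getPath().toDotString())) {
        selected.add(column);
      }
    }
    return selected;
  }
}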
Use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project parquet-mr by apache: class TestParquetMetadataConverter, method testColumnIndexConversion.
@Test
public void testColumnIndexConversion() {
  PrimitiveType type = Types.required(PrimitiveTypeName.INT64).named("test_int64");
  ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
  Statistics<?> stats = Statistics.createStats(type);
  stats.incrementNumNulls(16);
  stats.updateStats(-100l);
  stats.updateStats(100l);
  builder.add(stats);
  stats = Statistics.createStats(type);
  stats.incrementNumNulls(111);
  builder.add(stats);
  stats = Statistics.createStats(type);
  stats.updateStats(200l);
  stats.updateStats(500l);
  builder.add(stats);
  org.apache.parquet.format.ColumnIndex parquetColumnIndex =
      ParquetMetadataConverter.toParquetColumnIndex(type, builder.build());
  ColumnIndex columnIndex = ParquetMetadataConverter.fromParquetColumnIndex(type, parquetColumnIndex);
  assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
  assertTrue(Arrays.asList(false, true, false).equals(columnIndex.getNullPages()));
  assertTrue(Arrays.asList(16l, 111l, 0l).equals(columnIndex.getNullCounts()));
  assertTrue(Arrays.asList(
      ByteBuffer.wrap(BytesUtils.longToBytes(-100l)),
      ByteBuffer.allocate(0),
      ByteBuffer.wrap(BytesUtils.longToBytes(200l))).equals(columnIndex.getMinValues()));
  assertTrue(Arrays.asList(
      ByteBuffer.wrap(BytesUtils.longToBytes(100l)),
      ByteBuffer.allocate(0),
      ByteBuffer.wrap(BytesUtils.longToBytes(500l))).equals(columnIndex.getMaxValues()));
  assertNull("Should handle null column index",
      ParquetMetadataConverter.toParquetColumnIndex(Types.required(PrimitiveTypeName.INT32).named("test_int32"), null));
  assertNull("Should ignore unsupported types",
      ParquetMetadataConverter.toParquetColumnIndex(Types.required(PrimitiveTypeName.INT96).named("test_int96"), columnIndex));
  assertNull("Should ignore unsupported types",
      ParquetMetadataConverter.fromParquetColumnIndex(
          Types.required(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(12).as(OriginalType.INTERVAL).named("test_interval"),
          parquetColumnIndex));
}
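The same builder pattern works for other primitive types. Below is a small illustrative sketch, not taken from the parquet-mr tests (the class name and the string values are made up), that builds a ColumnIndex for a UTF-8 BINARY column and prints the per-page minimum values; passing Integer.MAX_VALUE as the second argument of getBuilder matches the test above and effectively leaves min/max values untruncated.

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

public class BinaryColumnIndexSketch {
  public static void main(String[] args) {
    PrimitiveType type = Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("test_binary");
    // Second argument is the min/max truncation length; Integer.MAX_VALUE keeps the values as-is.
    ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);

    // Page 0: min "apple", max "banana"
    Statistics<?> stats = Statistics.createStats(type);
    stats.updateStats(Binary.fromString("apple"));
    stats.updateStats(Binary.fromString("banana"));
    builder.add(stats);

    // Page 1: single value "cherry"
    stats = Statistics.createStats(type);
    stats.updateStats(Binary.fromString("cherry"));
    builder.add(stats);

    ColumnIndex columnIndex = builder.build();
    for (ByteBuffer min : columnIndex.getMinValues()) {
      System.out.println(StandardCharsets.UTF_8.decode(min.duplicate()));
    }
  }
}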