Use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.
From the class TestParquetFileWriter, method testWriteReadStatistics:
@Test
public void testWriteReadStatistics() throws Exception {
  // this test assumes statistics will be read
  Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));
  File testFile = temp.newFile();
  testFile.delete();
  Path path = new Path(testFile.toURI());
  Configuration configuration = new Configuration();
  configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
  MessageType schema = MessageTypeParser.parseMessageType(
      "message m { required group a {required binary b (UTF8);} required group c { required int64 d; }}");
  String[] path1 = { "a", "b" };
  ColumnDescriptor c1 = schema.getColumnDescription(path1);
  String[] path2 = { "c", "d" };
  ColumnDescriptor c2 = schema.getColumnDescription(path2);
  byte[] bytes1 = { 0, 1, 2, 3 };
  byte[] bytes2 = { 1, 2, 3, 4 };
  byte[] bytes3 = { 2, 3, 4, 5 };
  byte[] bytes4 = { 3, 4, 5, 6 };
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  BinaryStatistics statsB1C1P1 = new BinaryStatistics();
  BinaryStatistics statsB1C1P2 = new BinaryStatistics();
  LongStatistics statsB1C2P1 = new LongStatistics();
  LongStatistics statsB1C2P2 = new LongStatistics();
  BinaryStatistics statsB2C1P1 = new BinaryStatistics();
  LongStatistics statsB2C2P1 = new LongStatistics();
  statsB1C1P1.setMinMax(Binary.fromString("s"), Binary.fromString("z"));
  statsB1C1P2.setMinMax(Binary.fromString("a"), Binary.fromString("b"));
  statsB1C2P1.setMinMax(2L, 10L);
  statsB1C2P2.setMinMax(-6L, 4L);
  statsB2C1P1.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
  statsB2C2P1.setMinMax(11L, 122L);
  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  w.startColumn(c1, 5, codec);
  w.writeDataPage(2, 4, BytesInput.from(bytes1), statsB1C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(bytes1), statsB1C1P2, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(c2, 6, codec);
  w.writeDataPage(3, 4, BytesInput.from(bytes2), statsB1C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(1, 4, BytesInput.from(bytes2), statsB1C2P2, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.startBlock(4);
  w.startColumn(c1, 7, codec);
  w.writeDataPage(7, 4, BytesInput.from(bytes3), statsB2C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(c2, 8, codec);
  w.writeDataPage(8, 4, BytesInput.from(bytes4), statsB2C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.end(new HashMap<String, String>());
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
  for (BlockMetaData block : readFooter.getBlocks()) {
    for (ColumnChunkMetaData col : block.getColumns()) {
      col.getPath();
    }
  }
  // expected statistics: the per-page statistics merged into one range per column chunk
  BinaryStatistics bs1 = new BinaryStatistics();
  bs1.setMinMax(Binary.fromString("a"), Binary.fromString("z"));
  LongStatistics ls1 = new LongStatistics();
  ls1.setMinMax(-6L, 10L);
  BinaryStatistics bs2 = new BinaryStatistics();
  bs2.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
  LongStatistics ls2 = new LongStatistics();
  ls2.setMinMax(11L, 122L);
  {
    // assert stats are correct for the first block
    BinaryStatistics bsout = (BinaryStatistics) readFooter.getBlocks().get(0).getColumns().get(0).getStatistics();
    String str = new String(bsout.getMaxBytes());
    String str2 = new String(bsout.getMinBytes());
    TestUtils.assertStatsValuesEqual(bs1, readFooter.getBlocks().get(0).getColumns().get(0).getStatistics());
    TestUtils.assertStatsValuesEqual(ls1, readFooter.getBlocks().get(0).getColumns().get(1).getStatistics());
  }
  {
    // assert stats are correct for the second block
    TestUtils.assertStatsValuesEqual(bs2, readFooter.getBlocks().get(1).getColumns().get(0).getStatistics());
    TestUtils.assertStatsValuesEqual(ls2, readFooter.getBlocks().get(1).getColumns().get(1).getStatistics());
  }
}
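The chunk-level values asserted above (bs1, ls1, and so on) are simply the per-page statistics merged together by the writer. Below is a minimal sketch of that merge, assuming the mergeStatistics method on org.apache.parquet.column.statistics.Statistics; the values mirror statsB1C1P1 and statsB1C1P2 from the test and the sketch is not part of it.

    // sketch only: how two page-level BinaryStatistics combine into one chunk-level range
    BinaryStatistics page1 = new BinaryStatistics();
    page1.setMinMax(Binary.fromString("s"), Binary.fromString("z"));
    BinaryStatistics page2 = new BinaryStatistics();
    page2.setMinMax(Binary.fromString("a"), Binary.fromString("b"));
    page1.mergeStatistics(page2);
    // page1 now holds min "a" and max "z", matching what bs1 expects for the first chunk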
Use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.
From the class TestInputFormat, method newBlock:
private BlockMetaData newBlock(long start, long compressedBlockSize) {
  BlockMetaData blockMetaData = new BlockMetaData();
  // assuming the compression ratio is 2
  long uncompressedSize = compressedBlockSize * 2;
  ColumnChunkMetaData column = ColumnChunkMetaData.get(
      ColumnPath.get("foo"), PrimitiveTypeName.BINARY, CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)), new BinaryStatistics(),
      start, 0L, 0L, compressedBlockSize, uncompressedSize);
  blockMetaData.addColumn(column);
  blockMetaData.setTotalByteSize(uncompressedSize);
  return blockMetaData;
}
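A hypothetical usage of this helper; the sizes and offsets below are made up for illustration and do not come from the test.

    // hypothetical: two adjacent row groups of 10 MB compressed each
    long tenMB = 10L * 1024 * 1024;
    BlockMetaData first = newBlock(4, tenMB);           // data typically starts after the 4-byte "PAR1" magic
    BlockMetaData second = newBlock(4 + tenMB, tenMB);  // the next block starts where the previous one ends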
Use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.
From the class TestInputFormat, method createParquetFile:
private void createParquetFile(File file) throws IOException {
  Path path = new Path(file.toURI());
  Configuration configuration = new Configuration();
  MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;}}");
  String[] columnPath = { "a", "b" };
  ColumnDescriptor c1 = schema.getColumnDescription(columnPath);
  byte[] bytes1 = { 0, 1, 2, 3 };
  byte[] bytes2 = { 2, 3, 4, 5 };
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  BinaryStatistics stats = new BinaryStatistics();
  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  w.startColumn(c1, 5, codec);
  w.writeDataPage(2, 4, BytesInput.from(bytes1), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(bytes1), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.startBlock(4);
  w.startColumn(c1, 7, codec);
  w.writeDataPage(7, 4, BytesInput.from(bytes2), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.end(new HashMap<String, String>());
}
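A minimal sketch of how the resulting file could be sanity-checked, reusing the footer APIs already shown in testWriteReadStatistics; this is an assumption about what a caller might do, not part of the helper.

    // sketch only: read the footer back with the same configuration and path
    ParquetMetadata footer = ParquetFileReader.readFooter(configuration, path);
    assert footer.getBlocks().size() == 2;               // two row groups were written
    assert footer.getBlocks().get(0).getRowCount() == 3; // startBlock(3)
    assert footer.getBlocks().get(1).getRowCount() == 4; // startBlock(4)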
Use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.
From the class TestColumnChunkPageWriteStore, method testColumnOrderV1:
@Test
public void testColumnOrderV1() throws IOException {
  ParquetFileWriter mockFileWriter = Mockito.mock(ParquetFileWriter.class);
  InOrder inOrder = inOrder(mockFileWriter);
  MessageType schema = Types.buildMessage()
      .required(BINARY).as(UTF8).named("a_string")
      .required(INT32).named("an_int")
      .required(INT64).named("a_long")
      .required(FLOAT).named("a_float")
      .required(DOUBLE).named("a_double")
      .named("order_test");
  BytesInput fakeData = BytesInput.fromInt(34);
  int fakeCount = 3;
  BinaryStatistics fakeStats = new BinaryStatistics();
  // TODO - look back at this, an allocator was being passed here in the ByteBuffer changes
  // see comment at this constructor
  ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(
      compressor(UNCOMPRESSED), schema, new HeapByteBufferAllocator(), Integer.MAX_VALUE);
  for (ColumnDescriptor col : schema.getColumns()) {
    PageWriter pageWriter = store.getPageWriter(col);
    pageWriter.writePage(fakeData, fakeCount, fakeStats, RLE, RLE, PLAIN);
  }
  // flush to the mock writer
  store.flushToFileWriter(mockFileWriter);
  for (ColumnDescriptor col : schema.getColumns()) {
    inOrder.verify(mockFileWriter).writeColumnChunk(
        eq(col), eq((long) fakeCount), eq(UNCOMPRESSED), isNull(DictionaryPage.class),
        any(), eq(fakeData.size()), eq(fakeData.size()), eq(fakeStats),
        // Deprecated writePage -> no column index
        same(ColumnIndexBuilder.getNoOpBuilder()),
        // Deprecated writePage -> no offset index
        same(OffsetIndexBuilder.getNoOpBuilder()),
        any(), any(), any(), any());
  }
}
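The order the InOrder verification expects is simply the order returned by schema.getColumns(). A small sketch that makes this visible (not part of the test):

    // sketch only: print the column order the verification relies on
    for (ColumnDescriptor col : schema.getColumns()) {
      System.out.println(col.getPath()[0]);
      // prints a_string, an_int, a_long, a_float, a_double in that order
    }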
Use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.
From the class TestParquetMetadataConverter, method createColumnChunkMetaData:
private ColumnChunkMetaData createColumnChunkMetaData() {
  Set<org.apache.parquet.column.Encoding> e = new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName t = PrimitiveTypeName.BINARY;
  ColumnPath p = ColumnPath.get("foo");
  CompressionCodecName c = CompressionCodecName.GZIP;
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md = ColumnChunkMetaData.get(p, t, c, e, s, 0, 0, 0, 0, 0);
  return md;
}
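A hedged usage sketch: the statistics passed in are a fresh BinaryStatistics with no recorded values, so the resulting metadata should report empty statistics (assuming the usual Statistics.isEmpty() semantics); this is an illustration, not part of the converter test.

    // sketch only
    ColumnChunkMetaData md = createColumnChunkMetaData();
    assert md.getCodec() == CompressionCodecName.GZIP;
    assert md.getStatistics().isEmpty(); // no values were ever recorded in the BinaryStatistics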