Search in sources :

Example 6 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

the class TestParquetFileWriter method testWriteReadStatistics.

/**
 * Writes a file with two blocks, each holding the columns {@code a.b} (binary, UTF8) and
 * {@code c.d} (int64), attaches explicit min/max statistics to every data page, then reads
 * the footer back and checks the per-column-chunk statistics.
 *
 * <p>The first block writes two pages per column, so the expected chunk statistics are the
 * union of the page ranges: {@code ["a", "z"]} for {@code a.b} and {@code [-6, 10]} for
 * {@code c.d}. The second block writes one page per column, so the expected chunk statistics
 * equal the page statistics.
 *
 * @throws Exception if creating the temp file, writing, or reading the footer fails
 */
@Test
public void testWriteReadStatistics() throws Exception {
    // this test assumes statistics will be read
    Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));
    File testFile = temp.newFile();
    // Only the path is needed; remove the empty file created by newFile() before writing.
    testFile.delete();
    Path path = new Path(testFile.toURI());
    Configuration configuration = new Configuration();
    configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
    MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b (UTF8);} required group c { required int64 d; }}");
    String[] path1 = { "a", "b" };
    ColumnDescriptor c1 = schema.getColumnDescription(path1);
    String[] path2 = { "c", "d" };
    ColumnDescriptor c2 = schema.getColumnDescription(path2);
    byte[] bytes1 = { 0, 1, 2, 3 };
    byte[] bytes2 = { 1, 2, 3, 4 };
    byte[] bytes3 = { 2, 3, 4, 5 };
    byte[] bytes4 = { 3, 4, 5, 6 };
    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    // Naming: stats B<block> C<column> P<page>.
    BinaryStatistics statsB1C1P1 = new BinaryStatistics();
    BinaryStatistics statsB1C1P2 = new BinaryStatistics();
    LongStatistics statsB1C2P1 = new LongStatistics();
    LongStatistics statsB1C2P2 = new LongStatistics();
    BinaryStatistics statsB2C1P1 = new BinaryStatistics();
    LongStatistics statsB2C2P1 = new LongStatistics();
    statsB1C1P1.setMinMax(Binary.fromString("s"), Binary.fromString("z"));
    statsB1C1P2.setMinMax(Binary.fromString("a"), Binary.fromString("b"));
    statsB1C2P1.setMinMax(2L, 10L);
    statsB1C2P2.setMinMax(-6L, 4L);
    statsB2C1P1.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
    statsB2C2P1.setMinMax(11L, 122L);
    ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
    w.start();
    // First block: two pages per column.
    w.startBlock(3);
    w.startColumn(c1, 5, codec);
    w.writeDataPage(2, 4, BytesInput.from(bytes1), statsB1C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(bytes1), statsB1C1P2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(c2, 6, codec);
    w.writeDataPage(3, 4, BytesInput.from(bytes2), statsB1C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(bytes2), statsB1C2P2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    // Second block: one page per column.
    w.startBlock(4);
    w.startColumn(c1, 7, codec);
    w.writeDataPage(7, 4, BytesInput.from(bytes3), statsB2C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(c2, 8, codec);
    w.writeDataPage(8, 4, BytesInput.from(bytes4), statsB2C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.end(new HashMap<String, String>());
    ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
    // Touch the path of every column chunk in the footer; the value itself is not checked.
    for (BlockMetaData block : readFooter.getBlocks()) {
        for (ColumnChunkMetaData col : block.getColumns()) {
            col.getPath();
        }
    }
    // correct statistics
    BinaryStatistics bs1 = new BinaryStatistics();
    bs1.setMinMax(Binary.fromString("a"), Binary.fromString("z"));
    LongStatistics ls1 = new LongStatistics();
    ls1.setMinMax(-6L, 10L);
    BinaryStatistics bs2 = new BinaryStatistics();
    bs2.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
    LongStatistics ls2 = new LongStatistics();
    ls2.setMinMax(11L, 122L);
    {
        // assert stats are correct for the first block
        // The cast additionally requires the binary column to come back as BinaryStatistics.
        BinaryStatistics bsout = (BinaryStatistics) readFooter.getBlocks().get(0).getColumns().get(0).getStatistics();
        TestUtils.assertStatsValuesEqual(bs1, bsout);
        TestUtils.assertStatsValuesEqual(ls1, readFooter.getBlocks().get(0).getColumns().get(1).getStatistics());
    }
    {
        // assert stats are correct for the second block
        TestUtils.assertStatsValuesEqual(bs2, readFooter.getBlocks().get(1).getColumns().get(0).getStatistics());
        TestUtils.assertStatsValuesEqual(ls2, readFooter.getBlocks().get(1).getColumns().get(1).getStatistics());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) File(java.io.File) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Example 7 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

the class TestInputFormat method newBlock.

/**
 * Builds a {@link BlockMetaData} containing a single GZIP-compressed, PLAIN-encoded binary
 * column named {@code foo} with empty statistics.
 *
 * @param start first-data-page offset handed to the column chunk metadata
 * @param compressedBlockSize compressed size of the column chunk; the uncompressed size is
 *     taken to be twice this value
 * @return the block, with its total byte size set to the uncompressed size
 */
private BlockMetaData newBlock(long start, long compressedBlockSize) {
    // The fixture pretends the codec achieves a 2:1 compression ratio.
    final long totalUncompressed = compressedBlockSize * 2;

    ColumnPath fooPath = ColumnPath.get("foo");
    HashSet<Encoding> plainOnly = new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN));
    BinaryStatistics emptyStats = new BinaryStatistics();
    ColumnChunkMetaData fooChunk = ColumnChunkMetaData.get(
        fooPath,
        PrimitiveTypeName.BINARY,
        CompressionCodecName.GZIP,
        plainOnly,
        emptyStats,
        start,
        0L,
        0L,
        compressedBlockSize,
        totalUncompressed);

    BlockMetaData block = new BlockMetaData();
    block.addColumn(fooChunk);
    block.setTotalByteSize(totalUncompressed);
    return block;
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) Encoding(org.apache.parquet.column.Encoding) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics)

Example 8 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

the class TestInputFormat method createParquetFile.

/**
 * Writes a small Parquet file at the location of {@code file}: schema
 * {@code message m { required group a {required binary b;}}}, two blocks, uncompressed,
 * with the same empty {@link BinaryStatistics} instance attached to every data page.
 *
 * @param file destination; its URI is used as the Hadoop path to write to
 * @throws IOException if the writer fails
 */
private void createParquetFile(File file) throws IOException {
    final Configuration conf = new Configuration();
    final Path target = new Path(file.toURI());
    final MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;}}");
    final ColumnDescriptor binaryColumn = schema.getColumnDescription(new String[] { "a", "b" });
    final CompressionCodecName noCompression = CompressionCodecName.UNCOMPRESSED;
    final BinaryStatistics emptyStats = new BinaryStatistics();
    final byte[] firstBlockPayload = { 0, 1, 2, 3 };
    final byte[] secondBlockPayload = { 2, 3, 4, 5 };

    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, target);
    writer.start();

    // Block 1: one column chunk made of two data pages carrying the same payload.
    writer.startBlock(3);
    writer.startColumn(binaryColumn, 5, noCompression);
    writer.writeDataPage(2, 4, BytesInput.from(firstBlockPayload), emptyStats, BIT_PACKED, BIT_PACKED, PLAIN);
    writer.writeDataPage(3, 4, BytesInput.from(firstBlockPayload), emptyStats, BIT_PACKED, BIT_PACKED, PLAIN);
    writer.endColumn();
    writer.endBlock();

    // Block 2: one column chunk made of a single data page.
    writer.startBlock(4);
    writer.startColumn(binaryColumn, 7, noCompression);
    writer.writeDataPage(7, 4, BytesInput.from(secondBlockPayload), emptyStats, BIT_PACKED, BIT_PACKED, PLAIN);
    writer.endColumn();
    writer.endBlock();

    // No extra key/value metadata goes into the footer.
    writer.end(new HashMap<String, String>());
}
Also used : Path(org.apache.hadoop.fs.Path) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Configuration(org.apache.hadoop.conf.Configuration) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) MessageType(org.apache.parquet.schema.MessageType)

Example 9 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

the class TestColumnChunkPageWriteStore method testColumnOrderV1.

/**
 * Writes one page per schema column through the deprecated {@code writePage} entry point,
 * flushes the store to a mocked {@link ParquetFileWriter}, and verifies that
 * {@code writeColumnChunk} is invoked once per column in schema order.
 *
 * @throws IOException if writing a page or flushing the store fails
 */
@Test
public void testColumnOrderV1() throws IOException {
    ParquetFileWriter writerMock = Mockito.mock(ParquetFileWriter.class);
    InOrder callOrder = inOrder(writerMock);

    MessageType schema = Types.buildMessage()
        .required(BINARY).as(UTF8).named("a_string")
        .required(INT32).named("an_int")
        .required(INT64).named("a_long")
        .required(FLOAT).named("a_float")
        .required(DOUBLE).named("a_double")
        .named("order_test");

    // The same placeholder payload, value count and statistics are reused for every column.
    int fakeCount = 3;
    BytesInput fakeData = BytesInput.fromInt(34);
    BinaryStatistics fakeStats = new BinaryStatistics();

    // TODO - look back at this, an allocator was being passed here in the ByteBuffer changes
    // see comment at this constructor
    ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(UNCOMPRESSED), schema, new HeapByteBufferAllocator(), Integer.MAX_VALUE);

    for (ColumnDescriptor column : schema.getColumns()) {
        store.getPageWriter(column).writePage(fakeData, fakeCount, fakeStats, RLE, RLE, PLAIN);
    }

    // flush to the mock writer
    store.flushToFileWriter(writerMock);

    for (ColumnDescriptor column : schema.getColumns()) {
        callOrder.verify(writerMock).writeColumnChunk(
            eq(column),
            eq((long) fakeCount),
            eq(UNCOMPRESSED),
            isNull(DictionaryPage.class),
            any(),
            eq(fakeData.size()),
            eq(fakeData.size()),
            eq(fakeStats),
            // Deprecated writePage -> no column index
            same(ColumnIndexBuilder.getNoOpBuilder()),
            // Deprecated writePage -> no offset index
            same(OffsetIndexBuilder.getNoOpBuilder()),
            any(),
            any(),
            any(),
            any());
    }
}
Also used : InOrder(org.mockito.InOrder) BytesInput(org.apache.parquet.bytes.BytesInput) HeapByteBufferAllocator(org.apache.parquet.bytes.HeapByteBufferAllocator) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) MessageType(org.apache.parquet.schema.MessageType) DictionaryPage(org.apache.parquet.column.page.DictionaryPage) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)

Example 10 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

the class TestParquetMetadataConverter method createColumnChunkMetaData.

/**
 * Creates a minimal column chunk metadata fixture: binary column {@code foo}, GZIP codec,
 * no encodings, empty statistics, and every numeric argument set to zero.
 *
 * @return the newly built {@link ColumnChunkMetaData}
 */
private ColumnChunkMetaData createColumnChunkMetaData() {
    Set<org.apache.parquet.column.Encoding> noEncodings = new HashSet<org.apache.parquet.column.Encoding>();
    PrimitiveTypeName binaryType = PrimitiveTypeName.BINARY;
    ColumnPath fooPath = ColumnPath.get("foo");
    CompressionCodecName gzip = CompressionCodecName.GZIP;
    BinaryStatistics emptyStats = new BinaryStatistics();
    return ColumnChunkMetaData.get(fooPath, binaryType, gzip, noEncodings, emptyStats, 0, 0, 0, 0, 0);
}
Also used : CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) Encoding(org.apache.parquet.column.Encoding) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) HashSet(java.util.HashSet) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName)

Aggregations

BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics): 20, LongStatistics (org.apache.parquet.column.statistics.LongStatistics): 9, ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 8, IntStatistics (org.apache.parquet.column.statistics.IntStatistics): 8, Statistics (org.apache.parquet.column.statistics.Statistics): 8, DoubleStatistics (org.apache.parquet.column.statistics.DoubleStatistics): 6, FloatStatistics (org.apache.parquet.column.statistics.FloatStatistics): 6, BooleanStatistics (org.apache.parquet.column.statistics.BooleanStatistics): 5, ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 5, MessageType (org.apache.parquet.schema.MessageType): 5, PrimitiveType (org.apache.parquet.schema.PrimitiveType): 5, Test (org.junit.Test): 5, Stopwatch (com.google.common.base.Stopwatch): 4, HashMap (java.util.HashMap): 4, SchemaPath (org.apache.drill.common.expression.SchemaPath): 4, TypeProtos (org.apache.drill.common.types.TypeProtos): 4, Configuration (org.apache.hadoop.conf.Configuration): 4, Encoding (org.apache.parquet.column.Encoding): 4, HashSet (java.util.HashSet): 3, Path (org.apache.hadoop.fs.Path): 3