Example 16 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

The class DictionaryFilter, method expandDictionary.

@SuppressWarnings("unchecked")
private <T extends Comparable<T>> Set<T> expandDictionary(ColumnChunkMetaData meta) throws IOException {
    ColumnDescriptor col = new ColumnDescriptor(meta.getPath().toArray(), meta.getPrimitiveType(), -1, -1);
    DictionaryPage page = dictionaries.readDictionaryPage(col);
    // the chunk may not be dictionary-encoded
    if (page == null) {
        return null;
    }
    Dictionary dict = page.getEncoding().initDictionary(col, page);
    // raw Set on purpose: the decoded values have different static types per
    // column type; the unchecked cast on return is covered by @SuppressWarnings
    Set dictSet = new HashSet<T>();
    for (int i = 0; i <= dict.getMaxId(); i++) {
        switch (meta.getType()) {
            case BINARY:
                dictSet.add(dict.decodeToBinary(i));
                break;
            case INT32:
                dictSet.add(dict.decodeToInt(i));
                break;
            case INT64:
                dictSet.add(dict.decodeToLong(i));
                break;
            case FLOAT:
                dictSet.add(dict.decodeToFloat(i));
                break;
            case DOUBLE:
                dictSet.add(dict.decodeToDouble(i));
                break;
            default:
                LOG.warn("Unknown dictionary type: {}", meta.getType());
        }
    }
    return (Set<T>) dictSet;
}
Also used : Dictionary(org.apache.parquet.column.Dictionary) Set(java.util.Set) HashSet(java.util.HashSet) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) DictionaryPage(org.apache.parquet.column.page.DictionaryPage)
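
A hedged caller sketch (illustrative only; canDropEq and its signature are not the actual DictionaryFilter API): a dictionary-based row-group filter can use expandDictionary to prove that an equality predicate matches no rows in a chunk.

// Hypothetical usage of expandDictionary: drop a row group for "value == x"
// when the chunk's complete dictionary does not contain x.
private <T extends Comparable<T>> boolean canDropEq(ColumnChunkMetaData meta, T value) throws IOException {
    Set<T> dictValues = expandDictionary(meta);
    if (dictValues == null) {
        // no dictionary page: the chunk may contain anything, so keep it
        return false;
    }
    // value absent from the full dictionary means no row in this chunk can match
    return !dictValues.contains(value);
}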

Example 17 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

The class TestParquetFileWriter, method testWriteReadStatistics.

@Test
public void testWriteReadStatistics() throws Exception {
    // this test assumes statistics will be read
    Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));
    File testFile = temp.newFile();
    testFile.delete();
    Path path = new Path(testFile.toURI());
    Configuration configuration = new Configuration();
    configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
    MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b (UTF8);} required group c { required int64 d; }}");
    String[] path1 = { "a", "b" };
    ColumnDescriptor c1 = schema.getColumnDescription(path1);
    String[] path2 = { "c", "d" };
    ColumnDescriptor c2 = schema.getColumnDescription(path2);
    byte[] bytes1 = { 0, 1, 2, 3 };
    byte[] bytes2 = { 1, 2, 3, 4 };
    byte[] bytes3 = { 2, 3, 4, 5 };
    byte[] bytes4 = { 3, 4, 5, 6 };
    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    BinaryStatistics statsB1C1P1 = new BinaryStatistics();
    BinaryStatistics statsB1C1P2 = new BinaryStatistics();
    LongStatistics statsB1C2P1 = new LongStatistics();
    LongStatistics statsB1C2P2 = new LongStatistics();
    BinaryStatistics statsB2C1P1 = new BinaryStatistics();
    LongStatistics statsB2C2P1 = new LongStatistics();
    statsB1C1P1.setMinMax(Binary.fromString("s"), Binary.fromString("z"));
    statsB1C1P2.setMinMax(Binary.fromString("a"), Binary.fromString("b"));
    statsB1C2P1.setMinMax(2L, 10L);
    statsB1C2P2.setMinMax(-6L, 4L);
    statsB2C1P1.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
    statsB2C2P1.setMinMax(11L, 122L);
    ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
    w.start();
    w.startBlock(3);
    w.startColumn(c1, 5, codec);
    w.writeDataPage(2, 4, BytesInput.from(bytes1), statsB1C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(bytes1), statsB1C1P2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(c2, 6, codec);
    w.writeDataPage(3, 4, BytesInput.from(bytes2), statsB1C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(bytes2), statsB1C2P2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.startBlock(4);
    w.startColumn(c1, 7, codec);
    w.writeDataPage(7, 4, BytesInput.from(bytes3), statsB2C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(c2, 8, codec);
    w.writeDataPage(8, 4, BytesInput.from(bytes4), statsB2C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.end(new HashMap<String, String>());
    ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
    for (BlockMetaData block : readFooter.getBlocks()) {
        for (ColumnChunkMetaData col : block.getColumns()) {
            col.getPath();
        }
    }
    // correct statistics
    BinaryStatistics bs1 = new BinaryStatistics();
    bs1.setMinMax(Binary.fromString("a"), Binary.fromString("z"));
    LongStatistics ls1 = new LongStatistics();
    ls1.setMinMax(-6L, 10L);
    BinaryStatistics bs2 = new BinaryStatistics();
    bs2.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
    LongStatistics ls2 = new LongStatistics();
    ls2.setMinMax(11L, 122L);
    {
        // assert stats are correct for the first block
        TestUtils.assertStatsValuesEqual(bs1, readFooter.getBlocks().get(0).getColumns().get(0).getStatistics());
        TestUtils.assertStatsValuesEqual(ls1, readFooter.getBlocks().get(0).getColumns().get(1).getStatistics());
    }
    {
        // assert stats are correct for the second block
        TestUtils.assertStatsValuesEqual(bs2, readFooter.getBlocks().get(1).getColumns().get(0).getStatistics());
        TestUtils.assertStatsValuesEqual(ls2, readFooter.getBlocks().get(1).getColumns().get(1).getStatistics());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) File(java.io.File) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)
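
TestUtils.assertStatsValuesEqual is a project-internal helper; a minimal stand-in, assuming it compares the serialized min/max values and the null count, might look like this:

// Assumed stand-in for TestUtils.assertStatsValuesEqual: compares the raw
// min/max byte representations and the null counts of two Statistics objects.
static void assertStatsValuesEqual(Statistics<?> expected, Statistics<?> actual) {
    org.junit.Assert.assertArrayEquals(expected.getMinBytes(), actual.getMinBytes());
    org.junit.Assert.assertArrayEquals(expected.getMaxBytes(), actual.getMaxBytes());
    org.junit.Assert.assertEquals(expected.getNumNulls(), actual.getNumNulls());
}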

Example 18 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

The class TestColumnChunkPageWriteStore, method testColumnOrderV1.

@Test
public void testColumnOrderV1() throws IOException {
    ParquetFileWriter mockFileWriter = Mockito.mock(ParquetFileWriter.class);
    InOrder inOrder = inOrder(mockFileWriter);
    MessageType schema = Types.buildMessage()
        .required(BINARY).as(UTF8).named("a_string")
        .required(INT32).named("an_int")
        .required(INT64).named("a_long")
        .required(FLOAT).named("a_float")
        .required(DOUBLE).named("a_double")
        .named("order_test");
    BytesInput fakeData = BytesInput.fromInt(34);
    int fakeCount = 3;
    BinaryStatistics fakeStats = new BinaryStatistics();
    // TODO - look back at this, an allocator was being passed here in the ByteBuffer changes
    // see comment at this constructor
    ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(UNCOMPRESSED), schema, new HeapByteBufferAllocator());
    for (ColumnDescriptor col : schema.getColumns()) {
        PageWriter pageWriter = store.getPageWriter(col);
        pageWriter.writePage(fakeData, fakeCount, fakeStats, RLE, RLE, PLAIN);
    }
    // flush to the mock writer
    store.flushToFileWriter(mockFileWriter);
    for (ColumnDescriptor col : schema.getColumns()) {
        inOrder.verify(mockFileWriter).startColumn(eq(col), eq((long) fakeCount), eq(UNCOMPRESSED));
    }
}
Also used : InOrder(org.mockito.InOrder) BytesInput(org.apache.parquet.bytes.BytesInput) HeapByteBufferAllocator(org.apache.parquet.bytes.HeapByteBufferAllocator) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) MessageType(org.apache.parquet.schema.MessageType) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)
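
The compressor(...) call is a test-local helper; a plausible shape for it, assuming it delegates to parquet-hadoop's CodecFactory, is:

// Assumed shape of the test's compressor(...) helper: obtain a page
// compressor for the given codec. The 64 KB page size is illustrative.
private CodecFactory.BytesCompressor compressor(CompressionCodecName codec) {
    return new CodecFactory(new Configuration(), 64 * 1024).getCompressor(codec);
}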

Example 19 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

The class TestColumnChunkPageWriteStore, method test.

@Test
public void test() throws Exception {
    Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
    Path root = file.getParent();
    FileSystem fs = file.getFileSystem(conf);
    if (fs.exists(root)) {
        fs.delete(root, true);
    }
    fs.mkdirs(root);
    MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
    ColumnDescriptor col = schema.getColumns().get(0);
    Encoding dataEncoding = PLAIN;
    int valueCount = 10;
    int d = 1;
    int r = 2;
    int v = 3;
    BytesInput definitionLevels = BytesInput.fromInt(d);
    BytesInput repetitionLevels = BytesInput.fromInt(r);
    Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
    BytesInput data = BytesInput.fromInt(v);
    int rowCount = 5;
    int nullCount = 1;
    {
        ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
        writer.start();
        writer.startBlock(rowCount);
        {
            ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator());
            PageWriter pageWriter = store.getPageWriter(col);
            pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics);
            store.flushToFileWriter(writer);
        }
        writer.endBlock();
        writer.end(new HashMap<String, String>());
    }
    {
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
        ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
        PageReadStore rowGroup = reader.readNextRowGroup();
        PageReader pageReader = rowGroup.getPageReader(col);
        DataPageV2 page = (DataPageV2) pageReader.readPage();
        assertEquals(rowCount, page.getRowCount());
        assertEquals(nullCount, page.getNullCount());
        assertEquals(valueCount, page.getValueCount());
        assertEquals(d, intValue(page.getDefinitionLevels()));
        assertEquals(r, intValue(page.getRepetitionLevels()));
        assertEquals(dataEncoding, page.getDataEncoding());
        assertEquals(v, intValue(page.getData()));
        assertEquals(statistics.toString(), page.getStatistics().toString());
        reader.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BytesInput(org.apache.parquet.bytes.BytesInput) HashMap(java.util.HashMap) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PageReader(org.apache.parquet.column.page.PageReader) Encoding(org.apache.parquet.column.Encoding) DataPageV2(org.apache.parquet.column.page.DataPageV2) HeapByteBufferAllocator(org.apache.parquet.bytes.HeapByteBufferAllocator) PageReadStore(org.apache.parquet.column.page.PageReadStore) FileSystem(org.apache.hadoop.fs.FileSystem) MessageType(org.apache.parquet.schema.MessageType) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)
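
intValue(...) is another test-local helper; a sketch that assumes it simply reverses BytesInput.fromInt (which writes a single little-endian int):

// Assumed stand-in for the test's intValue(...) helper: read back the
// little-endian int that BytesInput.fromInt wrote.
private int intValue(BytesInput in) throws IOException {
    return java.nio.ByteBuffer.wrap(in.toByteArray())
        .order(java.nio.ByteOrder.LITTLE_ENDIAN)
        .getInt();
}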

Example 20 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

The class TestInputFormat, method createParquetFile.

private void createParquetFile(File file) throws IOException {
    Path path = new Path(file.toURI());
    Configuration configuration = new Configuration();
    MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;}}");
    String[] columnPath = { "a", "b" };
    ColumnDescriptor c1 = schema.getColumnDescription(columnPath);
    byte[] bytes1 = { 0, 1, 2, 3 };
    byte[] bytes2 = { 2, 3, 4, 5 };
    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    BinaryStatistics stats = new BinaryStatistics();
    ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
    w.start();
    w.startBlock(3);
    w.startColumn(c1, 5, codec);
    w.writeDataPage(2, 4, BytesInput.from(bytes1), stats, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(bytes1), stats, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.startBlock(4);
    w.startColumn(c1, 7, codec);
    w.writeDataPage(7, 4, BytesInput.from(bytes2), stats, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.end(new HashMap<String, String>());
}
Also used : Path(org.apache.hadoop.fs.Path) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Configuration(org.apache.hadoop.conf.Configuration) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) MessageType(org.apache.parquet.schema.MessageType)
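
An illustrative follow-up (not part of the original test): the file written by createParquetFile can be checked by reading its footer back with the same APIs used in Example 17.

// Hypothetical verification of the file produced above: read the footer and
// print the size of each of the two row groups (3 and 4 records).
private void printRowGroups(File file) throws IOException {
    ParquetMetadata footer = ParquetFileReader.readFooter(new Configuration(), new Path(file.toURI()));
    for (BlockMetaData block : footer.getBlocks()) {
        System.out.println("rows=" + block.getRowCount() + ", columns=" + block.getColumns().size());
    }
}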

Aggregations

ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 88
MessageType (org.apache.parquet.schema.MessageType): 33
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 18
Test (org.testng.annotations.Test): 18
RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor): 16
ArrayList (java.util.ArrayList): 16
GroupType (org.apache.parquet.schema.GroupType): 14
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 12
Test (org.junit.Test): 12
Domain (com.facebook.presto.common.predicate.Domain): 11
TupleDomain (com.facebook.presto.common.predicate.TupleDomain): 11
Path (org.apache.hadoop.fs.Path): 11
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 11
List (java.util.List): 10
ImmutableList (com.google.common.collect.ImmutableList): 9
HashMap (java.util.HashMap): 9
Configuration (org.apache.hadoop.conf.Configuration): 9
Type (org.apache.parquet.schema.Type): 9
HiveColumnHandle (com.facebook.presto.hive.HiveColumnHandle): 8
IOException (java.io.IOException): 7