Search in sources :

Example 51 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

In the class DumpCommand, the method dump:

/**
 * Dumps a Parquet file to the given writer: row-group metadata, column data,
 * or both, optionally restricted to a subset of columns.
 *
 * @param out         destination pretty-printer
 * @param meta        footer metadata of the file being dumped
 * @param schema      message schema of the file
 * @param inpath      path of the Parquet file to read pages from
 * @param showmd      when true, print per-row-group column-chunk metadata and page data
 * @param showdt      when true, print the decoded values of each column
 * @param showColumns dotted column paths to include, or null for all columns
 * @throws IOException if reading the file fails
 */
public static void dump(PrettyPrintWriter out, ParquetMetadata meta, MessageType schema, Path inpath, boolean showmd, boolean showdt, Set<String> showColumns) throws IOException {
    Configuration conf = new Configuration();
    List<BlockMetaData> blocks = meta.getBlocks();
    List<ColumnDescriptor> columns = schema.getColumns();
    if (showColumns != null) {
        // Restrict the dump to the requested dotted column paths.
        columns = new ArrayList<ColumnDescriptor>();
        for (ColumnDescriptor column : schema.getColumns()) {
            String path = Joiner.on('.').skipNulls().join(column.getPath());
            if (showColumns.contains(path)) {
                columns.add(column);
            }
        }
    }
    if (showmd) {
        long group = 0;
        for (BlockMetaData block : blocks) {
            if (group != 0) {
                out.println();
            }
            out.format("row group %d%n", group++);
            out.rule('-');
            List<ColumnChunkMetaData> ccmds = block.getColumns();
            if (showColumns != null) {
                // Same path filter as above, applied to the chunk metadata.
                ccmds = new ArrayList<ColumnChunkMetaData>();
                for (ColumnChunkMetaData ccmd : block.getColumns()) {
                    String path = Joiner.on('.').skipNulls().join(ccmd.getPath().toArray());
                    if (showColumns.contains(path)) {
                        ccmds.add(ccmd);
                    }
                }
            }
            MetadataUtils.showDetails(out, ccmds);
            List<BlockMetaData> rblocks = Collections.singletonList(block);
            // BUGFIX: the original reused one reader variable across all row groups
            // and closed it only once, after the loop — leaking an open reader for
            // every row group except the last. Close each reader per iteration.
            ParquetFileReader freader = new ParquetFileReader(conf, meta.getFileMetaData(), inpath, rblocks, columns);
            try {
                PageReadStore store = freader.readNextRowGroup();
                while (store != null) {
                    out.incrementTabLevel();
                    for (ColumnDescriptor column : columns) {
                        out.println();
                        dump(out, store, column);
                    }
                    out.decrementTabLevel();
                    store = freader.readNextRowGroup();
                }
                out.flushColumns();
            } finally {
                freader.close();
            }
        }
    }
    if (showdt) {
        boolean first = true;
        for (ColumnDescriptor column : columns) {
            // Separate columns with a blank line (and separate from the metadata
            // section when both were requested).
            if (!first || showmd) {
                out.println();
            }
            first = false;
            out.format("%s %s%n", column.getType(), Joiner.on('.').skipNulls().join(column.getPath()));
            out.rule('-');
            ParquetFileReader freader = new ParquetFileReader(conf, meta.getFileMetaData(), inpath, blocks, Collections.singletonList(column));
            try {
                long page = 1;
                long total = blocks.size();
                long offset = 1;
                PageReadStore store = freader.readNextRowGroup();
                while (store != null) {
                    ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DumpGroupConverter(), schema, meta.getFileMetaData().getCreatedBy());
                    dump(out, crstore, column, page++, total, offset);
                    offset += store.getRowCount();
                    store = freader.readNextRowGroup();
                }
            } finally {
                // BUGFIX: the original also called flushColumns() at the end of the
                // try block, flushing twice per column on success; flush once here.
                out.flushColumns();
                freader.close();
            }
        }
    }
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Configuration(org.apache.hadoop.conf.Configuration) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) ColumnReadStoreImpl(org.apache.parquet.column.impl.ColumnReadStoreImpl) PageReadStore(org.apache.parquet.column.page.PageReadStore)

Example 52 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

In the class MetadataUtils, the method showDetails:

/**
 * Prints a one-line description of a primitive field: a depth-prefixed name,
 * its repetition and primitive type, the original (logical) type when present,
 * and — when the containing message type is supplied — the column's maximum
 * repetition and definition levels.
 */
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath) {
    Repetition repetition = type.getRepetition();
    PrimitiveTypeName primitive = type.getPrimitiveTypeName();
    OriginalType originalType = type.getOriginalType();
    String label = Strings.repeat(".", depth) + type.getName();
    out.format("%s: %s %s", label, repetition, primitive);
    if (originalType != null) {
        out.format(" O:%s", originalType);
    }
    if (container != null) {
        // Temporarily extend the shared path with this field's name so the
        // full column path can be resolved, then restore it.
        cpath.add(type.getName());
        String[] fullPath = cpath.toArray(new String[cpath.size()]);
        cpath.remove(cpath.size() - 1);
        ColumnDescriptor descriptor = container.getColumnDescription(fullPath);
        out.format(" R:%d D:%d", descriptor.getMaxRepetitionLevel(), descriptor.getMaxDefinitionLevel());
    }
    out.println();
}
Also used : OriginalType(org.apache.parquet.schema.OriginalType) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) Repetition(org.apache.parquet.schema.Type.Repetition) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName)

Example 53 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

In the class DefaultValuesWriterFactoryTest, the method doTestValueWriter:

/**
 * Asserts that a factory configured with the given writer version and
 * dictionary setting produces a values writer of the expected class for a
 * column of the given primitive type.
 */
private void doTestValueWriter(PrimitiveTypeName typeName, WriterVersion version, boolean enableDictionary, Class<? extends ValuesWriter> expectedValueWriterClass) {
    ColumnDescriptor column = getMockColumn(typeName);
    ValuesWriterFactory writerFactory = getDefaultFactory(version, enableDictionary);
    validateWriterType(writerFactory.newValuesWriter(column), expectedValueWriterClass);
}
Also used : ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ValuesWriter(org.apache.parquet.column.values.ValuesWriter) RunLengthBitPackingHybridValuesWriter(org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter) DictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter) BooleanPlainValuesWriter(org.apache.parquet.column.values.plain.BooleanPlainValuesWriter) PlainValuesWriter(org.apache.parquet.column.values.plain.PlainValuesWriter) DeltaBinaryPackingValuesWriter(org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesWriter) FallbackValuesWriter(org.apache.parquet.column.values.fallback.FallbackValuesWriter) FixedLenByteArrayPlainValuesWriter(org.apache.parquet.column.values.plain.FixedLenByteArrayPlainValuesWriter)

Example 54 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

In the class DefaultValuesWriterFactoryTest, the method getMockColumn:

/**
 * Builds a Mockito stub ColumnDescriptor whose getType() reports the given
 * primitive type; all other methods keep Mockito's defaults.
 */
private ColumnDescriptor getMockColumn(PrimitiveTypeName typeName) {
    final ColumnDescriptor column = mock(ColumnDescriptor.class);
    when(column.getType()).thenReturn(typeName);
    return column;
}
Also used : ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor)

Example 55 with ColumnDescriptor

use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

In the class TestColumnReaderImpl, the method testOptional:

/**
 * Writes {@code rows} null values through a v2 column writer — cutting a page
 * after every 1000th value — then verifies that the written pages account for
 * every row and value, and that reading them back yields level (0, 0) for each
 * position without ever invoking the value converter.
 */
@Test
public void testOptional() throws Exception {
    MessageType schema = MessageTypeParser.parseMessageType("message test { optional binary foo; }");
    ColumnDescriptor col = schema.getColumns().get(0);
    MemPageWriter pageWriter = new MemPageWriter();
    ParquetProperties props = ParquetProperties.builder()
            .withDictionaryPageSize(1024)
            .withWriterVersion(PARQUET_2_0)
            .withPageSize(2048)
            .build();
    ColumnWriterV2 writer = new ColumnWriterV2(col, pageWriter, props);
    for (int i = 0; i < rows; i++) {
        writer.writeNull(0, 0);
        // Flush a page after each block of 1000 values.
        if ((i + 1) % 1000 == 0) {
            writer.writePage(i);
        }
    }
    writer.writePage(rows);
    writer.finalizeColumnChunk();
    List<DataPage> pages = pageWriter.getPages();
    int totalValues = 0;
    int totalRows = 0;
    for (DataPage page : pages) {
        totalValues += page.getValueCount();
        totalRows += ((DataPageV2) page).getRowCount();
    }
    assertEquals(rows, totalRows);
    assertEquals(rows, totalValues);
    ValidatingConverter converter = new ValidatingConverter();
    MemPageReader pageReader = new MemPageReader((long) rows, pages.iterator(), pageWriter.getDictionaryPage());
    ColumnReader reader = new ColumnReaderImpl(col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
    for (int i = 0; i < rows; i++) {
        assertEquals(0, reader.getCurrentRepetitionLevel());
        assertEquals(0, reader.getCurrentDefinitionLevel());
        reader.consume();
    }
    // Every value was null, so the converter must never have been called.
    assertEquals(0, converter.count);
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) MemPageWriter(org.apache.parquet.column.page.mem.MemPageWriter) ColumnReader(org.apache.parquet.column.ColumnReader) MessageType(org.apache.parquet.schema.MessageType) MemPageReader(org.apache.parquet.column.page.mem.MemPageReader) Test(org.junit.Test)

Aggregations

ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)88 MessageType (org.apache.parquet.schema.MessageType)33 PrimitiveType (org.apache.parquet.schema.PrimitiveType)18 Test (org.testng.annotations.Test)18 RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor)16 ArrayList (java.util.ArrayList)16 GroupType (org.apache.parquet.schema.GroupType)14 BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData)12 Test (org.junit.Test)12 Domain (com.facebook.presto.common.predicate.Domain)11 TupleDomain (com.facebook.presto.common.predicate.TupleDomain)11 Path (org.apache.hadoop.fs.Path)11 ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData)11 List (java.util.List)10 ImmutableList (com.google.common.collect.ImmutableList)9 HashMap (java.util.HashMap)9 Configuration (org.apache.hadoop.conf.Configuration)9 Type (org.apache.parquet.schema.Type)9 HiveColumnHandle (com.facebook.presto.hive.HiveColumnHandle)8 IOException (java.io.IOException)7