
Example 1 with ColumnReadStoreImpl

Use of org.apache.parquet.column.impl.ColumnReadStoreImpl in the parquet-mr project by apache.

From the class DumpCommand, the dump method:

public static void dump(PrettyPrintWriter out, ParquetMetadata meta, MessageType schema, Path inpath, boolean showmd, boolean showdt, Set<String> showColumns) throws IOException {
    Configuration conf = new Configuration();
    List<BlockMetaData> blocks = meta.getBlocks();
    List<ColumnDescriptor> columns = schema.getColumns();
    if (showColumns != null) {
        columns = new ArrayList<ColumnDescriptor>();
        for (ColumnDescriptor column : schema.getColumns()) {
            String path = Joiner.on('.').skipNulls().join(column.getPath());
            if (showColumns.contains(path)) {
                columns.add(column);
            }
        }
    }
    ParquetFileReader freader = null;
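    // showmd: for each row group, print the column chunk metadata, then the page data of the selected columns.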
    if (showmd) {
        try {
            long group = 0;
            for (BlockMetaData block : blocks) {
                if (group != 0)
                    out.println();
                out.format("row group %d%n", group++);
                out.rule('-');
                List<ColumnChunkMetaData> ccmds = block.getColumns();
                if (showColumns != null) {
                    ccmds = new ArrayList<ColumnChunkMetaData>();
                    for (ColumnChunkMetaData ccmd : block.getColumns()) {
                        String path = Joiner.on('.').skipNulls().join(ccmd.getPath().toArray());
                        if (showColumns.contains(path)) {
                            ccmds.add(ccmd);
                        }
                    }
                }
                MetadataUtils.showDetails(out, ccmds);
                List<BlockMetaData> rblocks = Collections.singletonList(block);
                freader = new ParquetFileReader(conf, meta.getFileMetaData(), inpath, rblocks, columns);
                PageReadStore store = freader.readNextRowGroup();
                while (store != null) {
                    out.incrementTabLevel();
                    for (ColumnDescriptor column : columns) {
                        out.println();
                        dump(out, store, column);
                    }
                    out.decrementTabLevel();
                    store = freader.readNextRowGroup();
                }
                out.flushColumns();
            }
        } finally {
            if (freader != null) {
                freader.close();
            }
        }
    }
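    // showdt: dump the raw data of each selected column across all row groups.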
    if (showdt) {
        boolean first = true;
        for (ColumnDescriptor column : columns) {
            if (!first || showmd)
                out.println();
            first = false;
            out.format("%s %s%n", column.getType(), Joiner.on('.').skipNulls().join(column.getPath()));
            out.rule('-');
            try {
                long page = 1;
                long total = blocks.size();
                long offset = 1;
                freader = new ParquetFileReader(conf, meta.getFileMetaData(), inpath, blocks, Collections.singletonList(column));
                PageReadStore store = freader.readNextRowGroup();
                while (store != null) {
                    ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DumpGroupConverter(), schema, meta.getFileMetaData().getCreatedBy());
                    dump(out, crstore, column, page++, total, offset);
                    offset += store.getRowCount();
                    store = freader.readNextRowGroup();
                }
                out.flushColumns();
            } finally {
                out.flushColumns();
                if (freader != null) {
                    freader.close();
                }
            }
        }
    }
}
Also used:
- BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData)
- Configuration (org.apache.hadoop.conf.Configuration)
- ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData)
- ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)
- ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader)
- ColumnReadStoreImpl (org.apache.parquet.column.impl.ColumnReadStoreImpl)
- PageReadStore (org.apache.parquet.column.page.PageReadStore)
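The inner dump helpers called above pull values through the ColumnReader interface, one (repetition level, definition level, value) triplet at a time. A minimal sketch of that iteration pattern, assuming a ColumnReadStoreImpl already built for one row group; the helper name dumpColumn and the printf formatting are illustrative, not the actual DumpCommand code:

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.impl.ColumnReadStoreImpl;

// Illustrative helper: print every value of one column in a row group.
static void dumpColumn(ColumnReadStoreImpl crstore, ColumnDescriptor column) {
    ColumnReader creader = crstore.getColumnReader(column);
    int maxDefinitionLevel = column.getMaxDefinitionLevel();
    long total = creader.getTotalValueCount();
    for (long i = 0; i < total; i++) {
        int rlevel = creader.getCurrentRepetitionLevel();
        int dlevel = creader.getCurrentDefinitionLevel();
        if (dlevel == maxDefinitionLevel) {
            // A value is present; use the getter matching the column's primitive type.
            switch (column.getType()) {
                case INT32:
                    System.out.printf("r:%d d:%d v:%d%n", rlevel, dlevel, creader.getInteger());
                    break;
                case INT64:
                    System.out.printf("r:%d d:%d v:%d%n", rlevel, dlevel, creader.getLong());
                    break;
                case BINARY:
                    System.out.printf("r:%d d:%d v:%s%n", rlevel, dlevel, creader.getBinary().toStringUsingUTF8());
                    break;
                default:
                    System.out.printf("r:%d d:%d v:<unhandled type>%n", rlevel, dlevel);
                    break;
            }
        } else {
            // A definition level below the maximum means null at this position.
            System.out.printf("r:%d d:%d v:<null>%n", rlevel, dlevel);
        }
        // Advance to the next value triplet.
        creader.consume();
    }
}

The repetition and definition levels are what the Dremel-style encoding uses to reconstruct nesting and nulls, which is why the reader exposes them alongside each value.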

Example 2 with ColumnReadStoreImpl

Use of org.apache.parquet.column.impl.ColumnReadStoreImpl in the parquet-mr project by apache.

From the class MessageColumnIO, the getRecordReader method:

public <T> RecordReader<T> getRecordReader(final PageReadStore columns, final RecordMaterializer<T> recordMaterializer, final Filter filter) {
    checkNotNull(columns, "columns");
    checkNotNull(recordMaterializer, "recordMaterializer");
    checkNotNull(filter, "filter");
    if (leaves.isEmpty()) {
        return new EmptyRecordReader<T>(recordMaterializer);
    }
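    // Dispatch on the concrete filter type; each branch wires a ColumnReadStoreImpl into a record reader.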
    return filter.accept(new Visitor<RecordReader<T>>() {

        @Override
        public RecordReader<T> visit(FilterPredicateCompat filterPredicateCompat) {
            FilterPredicate predicate = filterPredicateCompat.getFilterPredicate();
            IncrementallyUpdatedFilterPredicateBuilder builder = new IncrementallyUpdatedFilterPredicateBuilder(leaves);
            IncrementallyUpdatedFilterPredicate streamingPredicate = builder.build(predicate);
            RecordMaterializer<T> filteringRecordMaterializer = new FilteringRecordMaterializer<T>(recordMaterializer, leaves, builder.getValueInspectorsByColumn(), streamingPredicate);
            return new RecordReaderImplementation<T>(MessageColumnIO.this, filteringRecordMaterializer, validating, new ColumnReadStoreImpl(columns, filteringRecordMaterializer.getRootConverter(), getType(), createdBy));
        }

        @Override
        public RecordReader<T> visit(UnboundRecordFilterCompat unboundRecordFilterCompat) {
            return new FilteredRecordReader<T>(MessageColumnIO.this, recordMaterializer, validating, new ColumnReadStoreImpl(columns, recordMaterializer.getRootConverter(), getType(), createdBy), unboundRecordFilterCompat.getUnboundRecordFilter(), columns.getRowCount());
        }

        @Override
        public RecordReader<T> visit(NoOpFilter noOpFilter) {
            return new RecordReaderImplementation<T>(MessageColumnIO.this, recordMaterializer, validating, new ColumnReadStoreImpl(columns, recordMaterializer.getRootConverter(), getType(), createdBy));
        }
    });
}
Also used:
- ColumnReadStoreImpl (org.apache.parquet.column.impl.ColumnReadStoreImpl)
- NoOpFilter (org.apache.parquet.filter2.compat.FilterCompat.NoOpFilter)
- FilteringRecordMaterializer (org.apache.parquet.filter2.recordlevel.FilteringRecordMaterializer)
- RecordMaterializer (org.apache.parquet.io.api.RecordMaterializer)
- FilterPredicateCompat (org.apache.parquet.filter2.compat.FilterCompat.FilterPredicateCompat)
- IncrementallyUpdatedFilterPredicateBuilder (org.apache.parquet.filter2.recordlevel.IncrementallyUpdatedFilterPredicateBuilder)
- FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate)
- IncrementallyUpdatedFilterPredicate (org.apache.parquet.filter2.recordlevel.IncrementallyUpdatedFilterPredicate)
- UnboundRecordFilterCompat (org.apache.parquet.filter2.compat.FilterCompat.UnboundRecordFilterCompat)
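For reference, a typical call site for getRecordReader. This is a hedged sketch: it assumes the parquet example module's GroupRecordConverter as the RecordMaterializer and passes FilterCompat.NOOP, which exercises the NoOpFilter branch above; the readRowGroups helper name is illustrative:

import java.io.IOException;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

// Illustrative helper: materialize every record of an open file, row group by row group.
static void readRowGroups(ParquetFileReader freader, MessageType schema) throws IOException {
    MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
    PageReadStore pages;
    while ((pages = freader.readNextRowGroup()) != null) {
        // FilterCompat.NOOP routes through the NoOpFilter visit method shown above.
        RecordReader<Group> recordReader =
            columnIO.getRecordReader(pages, new GroupRecordConverter(schema), FilterCompat.NOOP);
        for (long i = 0; i < pages.getRowCount(); i++) {
            Group record = recordReader.read();
            System.out.println(record);
        }
    }
}

With a real predicate, FilterCompat.get(predicate) would route through the FilterPredicateCompat branch instead, wrapping the materializer in a FilteringRecordMaterializer so records failing the predicate are skipped during assembly.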

Aggregations

- ColumnReadStoreImpl (org.apache.parquet.column.impl.ColumnReadStoreImpl): 2 uses
- Configuration (org.apache.hadoop.conf.Configuration): 1 use
- ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 1 use
- PageReadStore (org.apache.parquet.column.page.PageReadStore): 1 use
- FilterPredicateCompat (org.apache.parquet.filter2.compat.FilterCompat.FilterPredicateCompat): 1 use
- NoOpFilter (org.apache.parquet.filter2.compat.FilterCompat.NoOpFilter): 1 use
- UnboundRecordFilterCompat (org.apache.parquet.filter2.compat.FilterCompat.UnboundRecordFilterCompat): 1 use
- FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate): 1 use
- FilteringRecordMaterializer (org.apache.parquet.filter2.recordlevel.FilteringRecordMaterializer): 1 use
- IncrementallyUpdatedFilterPredicate (org.apache.parquet.filter2.recordlevel.IncrementallyUpdatedFilterPredicate): 1 use
- IncrementallyUpdatedFilterPredicateBuilder (org.apache.parquet.filter2.recordlevel.IncrementallyUpdatedFilterPredicateBuilder): 1 use
- ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 1 use
- BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 1 use
- ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 1 use
- RecordMaterializer (org.apache.parquet.io.api.RecordMaterializer): 1 use