
Example 11 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in the apache/parquet-mr project.

From the class TestDataPageV1Checksums, method testWriteOffVerifyOn.

/**
 * Do not write out page level crc checksums, but enable verification on the read path. Tests
 * that the read still succeeds and does not throw an exception.
 */
@Test
public void testWriteOffVerifyOn() throws IOException {
    Configuration conf = new Configuration();
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
    Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);
    try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
        PageReadStore pageReadStore = reader.readNextRowGroup();
        assertCorrectContent(readNextPage(colADesc, pageReadStore).getBytes().toByteArray(), colAPage1Bytes);
        assertCorrectContent(readNextPage(colADesc, pageReadStore).getBytes().toByteArray(), colAPage2Bytes);
        assertCorrectContent(readNextPage(colBDesc, pageReadStore).getBytes().toByteArray(), colBPage1Bytes);
        assertCorrectContent(readNextPage(colBDesc, pageReadStore).getBytes().toByteArray(), colBPage2Bytes);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), PageReadStore (org.apache.parquet.column.page.PageReadStore), Test (org.junit.Test)
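For reference, the same two flags can be toggled outside of the test harness (writeSimpleParquetFile and getParquetFileReader above are test helpers that are not shown here). The following is a minimal, hypothetical sketch that enables checksum verification on the read path and touches the first page of every column; the input path is a placeholder:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class VerifyChecksumsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Enable CRC verification on the read path, mirroring the test above.
        conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
        // Hypothetical input path; replace with a real file.
        Path path = new Path("/tmp/example.parquet");
        try (ParquetFileReader reader = ParquetFileReader.open(conf, path)) {
            PageReadStore rowGroup = reader.readNextRowGroup();
            if (rowGroup != null) {
                // Touch the first page of every column; a checksum mismatch would surface here.
                for (ColumnDescriptor column :
                        reader.getFooter().getFileMetaData().getSchema().getColumns()) {
                    rowGroup.getPageReader(column).readPage();
                }
            }
        }
    }
}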

Example 12 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in the apache/parquet-mr project.

From the class ShowPagesCommand, method run.

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
    String source = targets.get(0);
    try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
        MessageType schema = reader.getFileMetaData().getSchema();
        Map<ColumnDescriptor, PrimitiveType> columns = Maps.newLinkedHashMap();
        if (this.columns == null || this.columns.isEmpty()) {
            for (ColumnDescriptor descriptor : schema.getColumns()) {
                columns.put(descriptor, primitive(schema, descriptor.getPath()));
            }
        } else {
            for (String column : this.columns) {
                columns.put(descriptor(column, schema), primitive(column, schema));
            }
        }
        CompressionCodecName codec = reader.getRowGroups().get(0).getColumns().get(0).getCodec();
        // accumulate formatted lines to print by column
        Map<String, List<String>> formatted = Maps.newLinkedHashMap();
        PageFormatter formatter = new PageFormatter();
        PageReadStore pageStore;
        int rowGroupNum = 0;
        while ((pageStore = reader.readNextRowGroup()) != null) {
            for (ColumnDescriptor descriptor : columns.keySet()) {
                List<String> lines = formatted.get(columnName(descriptor));
                if (lines == null) {
                    lines = Lists.newArrayList();
                    formatted.put(columnName(descriptor), lines);
                }
                formatter.setContext(rowGroupNum, columns.get(descriptor), codec);
                PageReader pages = pageStore.getPageReader(descriptor);
                DictionaryPage dict = pages.readDictionaryPage();
                if (dict != null) {
                    lines.add(formatter.format(dict));
                }
                DataPage page;
                while ((page = pages.readPage()) != null) {
                    lines.add(formatter.format(page));
                }
            }
            rowGroupNum += 1;
        }
        // TODO: Show total column size and overall size per value in the column summary line
        for (String columnName : formatted.keySet()) {
            console.info(String.format("\nColumn: %s\n%s", columnName, new TextStringBuilder(80).appendPadding(80, '-')));
            console.info(formatter.getHeader());
            for (String line : formatted.get(columnName)) {
                console.info(line);
            }
            console.info("");
        }
    }
    return 0;
}
Also used: DataPage (org.apache.parquet.column.page.DataPage), ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), PageReader (org.apache.parquet.column.page.PageReader), Util.minMaxAsString (org.apache.parquet.cli.Util.minMaxAsString), Util.encodingAsString (org.apache.parquet.cli.Util.encodingAsString), TextStringBuilder (org.apache.commons.text.TextStringBuilder), CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName), PageReadStore (org.apache.parquet.column.page.PageReadStore), PrimitiveType (org.apache.parquet.schema.PrimitiveType), List (java.util.List), MessageType (org.apache.parquet.schema.MessageType), DictionaryPage (org.apache.parquet.column.page.DictionaryPage)
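Stripped of the CLI plumbing, the heart of the command is the per-column page walk. Below is a minimal helper-method sketch of that loop, using only types already imported by the command above; it assumes reader is an open ParquetFileReader and descriptor is one of the schema's ColumnDescriptors:

// Walks every page of one column across all row groups.
static void showPages(ParquetFileReader reader, ColumnDescriptor descriptor) throws IOException {
    PageReadStore rowGroup;
    while ((rowGroup = reader.readNextRowGroup()) != null) {
        PageReader pages = rowGroup.getPageReader(descriptor);
        // A column chunk may start with an optional dictionary page ...
        DictionaryPage dict = pages.readDictionaryPage();
        if (dict != null) {
            System.out.println("dictionary entries: " + dict.getDictionarySize());
        }
        // ... followed by one or more data pages.
        DataPage page;
        while ((page = pages.readPage()) != null) {
            System.out.println("data page values: " + page.getValueCount());
        }
    }
}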

Example 13 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in the apache/parquet-mr project.

From the class ColumnIndexValidator, method checkContractViolations.

public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
    List<ContractViolation> violations = new ArrayList<>();
    try (ParquetFileReader reader = ParquetFileReader.open(file)) {
        FileMetaData meta = reader.getFooter().getFileMetaData();
        MessageType schema = meta.getSchema();
        List<ColumnDescriptor> columns = schema.getColumns();
        List<BlockMetaData> blocks = reader.getFooter().getBlocks();
        int rowGroupNumber = 0;
        PageReadStore rowGroup = reader.readNextRowGroup();
        while (rowGroup != null) {
            ColumnReadStore columnReadStore = new ColumnReadStoreImpl(rowGroup, new DummyRecordConverter(schema).getRootConverter(), schema, null);
            List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
            assert (columnChunks.size() == columns.size());
            for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
                ColumnDescriptor column = columns.get(columnNumber);
                ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
                ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
                if (columnIndex == null) {
                    continue;
                }
                ColumnPath columnPath = columnChunk.getPath();
                OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
                List<ByteBuffer> minValues = columnIndex.getMinValues();
                List<ByteBuffer> maxValues = columnIndex.getMaxValues();
                BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
                List<Long> nullCounts = columnIndex.getNullCounts();
                List<Boolean> nullPages = columnIndex.getNullPages();
                long rowNumber = 0;
                ColumnReader columnReader = columnReadStore.getColumnReader(column);
                ByteBuffer prevMinValue = null;
                ByteBuffer prevMaxValue = null;
                for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
                    boolean isNullPage = nullPages.get(pageNumber);
                    ByteBuffer minValue = minValues.get(pageNumber);
                    ByteBuffer maxValue = maxValues.get(pageNumber);
                    PageValidator pageValidator = new PageValidator(column.getPrimitiveType(), rowGroupNumber, columnNumber, columnPath, pageNumber, violations, columnReader, minValue, maxValue, prevMinValue, prevMaxValue, boundaryOrder, nullCounts.get(pageNumber), isNullPage);
                    if (!isNullPage) {
                        prevMinValue = minValue;
                        prevMaxValue = maxValue;
                    }
                    long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
                    while (rowNumber <= lastRowNumberInPage) {
                        pageValidator.validateValuesBelongingToRow();
                        ++rowNumber;
                    }
                    pageValidator.finishPage();
                }
            }
            rowGroup = reader.readNextRowGroup();
            rowGroupNumber++;
        }
    }
    return violations;
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), ColumnReadStoreImpl (org.apache.parquet.column.impl.ColumnReadStoreImpl), ArrayList (java.util.ArrayList), ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex), PageReadStore (org.apache.parquet.column.page.PageReadStore), BoundaryOrder (org.apache.parquet.internal.column.columnindex.BoundaryOrder), FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData), MessageType (org.apache.parquet.schema.MessageType), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath), ByteBuffer (java.nio.ByteBuffer), ColumnReader (org.apache.parquet.column.ColumnReader), ColumnReadStore (org.apache.parquet.column.ColumnReadStore), DummyRecordConverter (org.apache.parquet.example.DummyRecordConverter)
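The validator reads per-page statistics from the column index and locates page boundaries through the offset index. A minimal sketch of just that lookup, assuming reader is an open ParquetFileReader; as in the code above, readColumnIndex may return null when the writer produced no index for a chunk:

for (BlockMetaData block : reader.getFooter().getBlocks()) {
    for (ColumnChunkMetaData chunk : block.getColumns()) {
        ColumnIndex columnIndex = reader.readColumnIndex(chunk);
        OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);
        if (columnIndex == null || offsetIndex == null) {
            continue; // this chunk was written without indexes
        }
        System.out.println(chunk.getPath() + ": " + offsetIndex.getPageCount()
            + " pages, boundary order " + columnIndex.getBoundaryOrder());
    }
}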

Example 14 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in the apache/parquet-mr project.

From the class DumpCommand, method dump.

public static void dump(PrettyPrintWriter out, ParquetMetadata meta, MessageType schema, Path inpath, boolean showmd, boolean showdt, Set<String> showColumns) throws IOException {
    Configuration conf = new Configuration();
    List<BlockMetaData> blocks = meta.getBlocks();
    List<ColumnDescriptor> columns = schema.getColumns();
    if (showColumns != null) {
        columns = new ArrayList<ColumnDescriptor>();
        for (ColumnDescriptor column : schema.getColumns()) {
            String path = Joiner.on('.').skipNulls().join(column.getPath());
            if (showColumns.contains(path)) {
                columns.add(column);
            }
        }
    }
    ParquetFileReader freader = null;
    if (showmd) {
        try {
            long group = 0;
            for (BlockMetaData block : blocks) {
                if (group != 0)
                    out.println();
                out.format("row group %d%n", group++);
                out.rule('-');
                List<ColumnChunkMetaData> ccmds = block.getColumns();
                if (showColumns != null) {
                    ccmds = new ArrayList<ColumnChunkMetaData>();
                    for (ColumnChunkMetaData ccmd : block.getColumns()) {
                        String path = Joiner.on('.').skipNulls().join(ccmd.getPath().toArray());
                        if (showColumns.contains(path)) {
                            ccmds.add(ccmd);
                        }
                    }
                }
                MetadataUtils.showDetails(out, ccmds);
                List<BlockMetaData> rblocks = Collections.singletonList(block);
                freader = new ParquetFileReader(conf, meta.getFileMetaData(), inpath, rblocks, columns);
                PageReadStore store = freader.readNextRowGroup();
                while (store != null) {
                    out.incrementTabLevel();
                    for (ColumnDescriptor column : columns) {
                        out.println();
                        dump(out, store, column);
                    }
                    out.decrementTabLevel();
                    store = freader.readNextRowGroup();
                }
                out.flushColumns();
            }
        } finally {
            if (freader != null) {
                freader.close();
            }
        }
    }
    if (showdt) {
        boolean first = true;
        for (ColumnDescriptor column : columns) {
            if (!first || showmd)
                out.println();
            first = false;
            out.format("%s %s%n", column.getType(), Joiner.on('.').skipNulls().join(column.getPath()));
            out.rule('-');
            try {
                long page = 1;
                long total = blocks.size();
                long offset = 1;
                freader = new ParquetFileReader(conf, meta.getFileMetaData(), inpath, blocks, Collections.singletonList(column));
                PageReadStore store = freader.readNextRowGroup();
                while (store != null) {
                    ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DumpGroupConverter(), schema, meta.getFileMetaData().getCreatedBy());
                    dump(out, crstore, column, page++, total, offset);
                    offset += store.getRowCount();
                    store = freader.readNextRowGroup();
                }
                out.flushColumns();
            } finally {
                out.flushColumns();
                if (freader != null) {
                    freader.close();
                }
            }
        }
    }
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), Configuration (org.apache.hadoop.conf.Configuration), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader), ColumnReadStoreImpl (org.apache.parquet.column.impl.ColumnReadStoreImpl), PageReadStore (org.apache.parquet.column.page.PageReadStore)
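The value-dumping half of the command ultimately drains each column through a ColumnReader. The sketch below shows that inner loop in isolation; it assumes store is a PageReadStore returned by readNextRowGroup, column is an INT64 column (the getLong accessor must match the column's primitive type), createdBy comes from meta.getFileMetaData().getCreatedBy(), and DummyRecordConverter (seen in Example 13) stands in for the command's internal DumpGroupConverter:

ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(
    store, new DummyRecordConverter(schema).getRootConverter(), schema, createdBy);
ColumnReader values = crstore.getColumnReader(column);
long count = values.getTotalValueCount();
for (long i = 0; i < count; i++) {
    // A value is present only when the definition level is at its maximum;
    // smaller levels encode a null at some nesting depth.
    if (values.getCurrentDefinitionLevel() == column.getMaxDefinitionLevel()) {
        System.out.println(values.getLong()); // assumes an INT64 column
    } else {
        System.out.println("<null>");
    }
    values.consume();
}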

Example 15 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in the apache/hive project.

From the class VectorizedParquetRecordReader, method checkEndOfRowGroup.

private void checkEndOfRowGroup() throws IOException {
    if (rowsReturned != totalCountLoadedSoFar) {
        return;
    }
    PageReadStore pages = reader.readNextRowGroup();
    if (pages == null) {
        throw new IOException("expecting more rows but reached last block. Read " + rowsReturned + " out of " + totalRowCount);
    }
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    List<Type> types = requestedSchema.getFields();
    columnReaders = new VectorizedColumnReader[columns.size()];
    if (!ColumnProjectionUtils.isReadAllColumns(jobConf)) {
        // Some queries project no columns at all; only when colsToInclude is non-empty
        // does each requested column need its own columnReader
        if (!colsToInclude.isEmpty()) {
            for (int i = 0; i < types.size(); ++i) {
                columnReaders[i] = buildVectorizedParquetReader(columnTypesList.get(colsToInclude.get(i)), types.get(i), pages, requestedSchema.getColumns(), skipTimestampConversion, writerTimezone, skipProlepticConversion, legacyConversionEnabled, 0);
            }
        }
    } else {
        for (int i = 0; i < types.size(); ++i) {
            columnReaders[i] = buildVectorizedParquetReader(columnTypesList.get(i), types.get(i), pages, requestedSchema.getColumns(), skipTimestampConversion, writerTimezone, skipProlepticConversion, legacyConversionEnabled, 0);
        }
    }
    totalCountLoadedSoFar += pages.getRowCount();
}
Also used: PrimitiveType (org.apache.parquet.schema.PrimitiveType), GroupType (org.apache.parquet.schema.GroupType), MessageType (org.apache.parquet.schema.MessageType), Type (org.apache.parquet.schema.Type), PageReadStore (org.apache.parquet.column.page.PageReadStore), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), IOException (java.io.IOException)
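The guard above relies on PageReadStore.getRowCount() to learn how many rows each row group contributes. As a standalone illustration of the same accounting, this minimal sketch drains a file row group by row group and checks the total against the footer's block metadata; it assumes reader is an open ParquetFileReader:

long totalRowCount = 0;
for (BlockMetaData block : reader.getFooter().getBlocks()) {
    totalRowCount += block.getRowCount();
}
long rowsLoaded = 0;
PageReadStore pages;
while ((pages = reader.readNextRowGroup()) != null) {
    rowsLoaded += pages.getRowCount(); // rows available in this row group
}
if (rowsLoaded != totalRowCount) {
    throw new IOException("expected " + totalRowCount + " rows but row groups held " + rowsLoaded);
}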

Aggregations

PageReadStore (org.apache.parquet.column.page.PageReadStore): 31
Configuration (org.apache.hadoop.conf.Configuration): 22
Path (org.apache.hadoop.fs.Path): 22
IOException (java.io.IOException): 14
MessageType (org.apache.parquet.schema.MessageType): 14
Test (org.junit.Test): 13
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 12
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 10
MessageColumnIO (org.apache.parquet.io.MessageColumnIO): 8
SimpleGroup (org.apache.parquet.example.data.simple.SimpleGroup): 7
GroupRecordConverter (org.apache.parquet.example.data.simple.convert.GroupRecordConverter): 7
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 7
ColumnIOFactory (org.apache.parquet.io.ColumnIOFactory): 7
RecordReader (org.apache.parquet.io.RecordReader): 7
DataPageV1 (org.apache.parquet.column.page.DataPageV1): 6
Encoding (org.apache.parquet.column.Encoding): 5
HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile): 5
File (java.io.File): 4
List (java.util.List): 4
Vector (org.apache.ignite.ml.math.primitives.vector.Vector): 4
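As the aggregation counts suggest, PageReadStore is most often handed to MessageColumnIO together with a record converter such as GroupRecordConverter. A minimal sketch of that record-level read loop, assuming reader is an open ParquetFileReader, schema is its MessageType, and Group is org.apache.parquet.example.data.Group:

MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
PageReadStore pages;
while ((pages = reader.readNextRowGroup()) != null) {
    RecordReader<Group> records =
        columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
    for (long i = 0, rows = pages.getRowCount(); i < rows; i++) {
        Group group = records.read(); // a SimpleGroup assembled from the pages
        System.out.println(group);
    }
}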