
Example 1 with BinaryStatistics

Use of org.apache.parquet.column.statistics.BinaryStatistics in project drill by apache.

From the class ParquetFooterStatCollector, the method collectColStat:

@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
    Stopwatch timer = Stopwatch.createStarted();
    ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, new ArrayList<>(fields), autoCorrectCorruptDates);
    // map from column name to ColumnDescriptor
    Map<SchemaPath, ColumnDescriptor> columnDescMap = new HashMap<>();
    // map from column name to ColumnChunkMetaData
    final Map<SchemaPath, ColumnChunkMetaData> columnChkMetaMap = new HashMap<>();
    // map from column name to MajorType
    final Map<SchemaPath, TypeProtos.MajorType> columnTypeMap = new HashMap<>();
    // map from column name to SchemaElement
    final Map<SchemaPath, SchemaElement> schemaElementMap = new HashMap<>();
    // map from column name to column statistics.
    final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
    final org.apache.parquet.format.FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
    for (final ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
        final SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getPath());
        if (fields.contains(schemaPath)) {
            columnDescMap.put(schemaPath, column);
        }
    }
    for (final SchemaElement se : fileMetaData.getSchema()) {
        final SchemaPath schemaPath = SchemaPath.getSimplePath(se.getName());
        if (fields.contains(schemaPath)) {
            schemaElementMap.put(schemaPath, se);
        }
    }
    for (final ColumnChunkMetaData colMetaData : footer.getBlocks().get(rowGroupIndex).getColumns()) {
        final SchemaPath schemaPath = SchemaPath.getCompoundPath(colMetaData.getPath().toArray());
        if (fields.contains(schemaPath)) {
            columnChkMetaMap.put(schemaPath, colMetaData);
        }
    }
    for (final SchemaPath path : fields) {
        if (columnDescMap.containsKey(path) && schemaElementMap.containsKey(path) && columnChkMetaMap.containsKey(path)) {
            ColumnDescriptor columnDesc = columnDescMap.get(path);
            SchemaElement se = schemaElementMap.get(path);
            ColumnChunkMetaData metaData = columnChkMetaMap.get(path);
            TypeProtos.MajorType type = ParquetToDrillTypeConverter.toMajorType(columnDesc.getType(), se.getType_length(), getDataMode(columnDesc), se, options);
            columnTypeMap.put(path, type);
            Statistics stat = metaData.getStatistics();
            if (type.getMinorType() == TypeProtos.MinorType.DATE) {
                stat = convertDateStatIfNecessary(metaData.getStatistics(), containsCorruptDates);
            }
            statMap.put(path, new ColumnStatistics(stat, type));
        } else {
            final String columnName = path.getRootSegment().getPath();
            if (implicitColValues.containsKey(columnName)) {
                TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
                Statistics stat = new BinaryStatistics();
                stat.setNumNulls(0);
                byte[] val = implicitColValues.get(columnName).getBytes();
                stat.setMinMaxFromBytes(val, val);
                statMap.put(path, new ColumnStatistics(stat, type));
            }
        }
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Took {} ms to column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
    }
    return statMap;
}
Also used: HashMap(java.util.HashMap) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) Stopwatch(com.google.common.base.Stopwatch) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) TypeProtos(org.apache.drill.common.types.TypeProtos) SchemaPath(org.apache.drill.common.expression.SchemaPath) SchemaElement(org.apache.parquet.format.SchemaElement) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) ParquetMetadataConverter(org.apache.parquet.format.converter.ParquetMetadataConverter)
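
The implicit-column branch above is the notable use of BinaryStatistics here: Drill fabricates statistics for implicit columns (file name, partition directories), where every row carries the same value. The pattern is small enough to isolate; a minimal sketch, assuming only parquet-column on the classpath (constantValueStats is a hypothetical helper name):

static BinaryStatistics constantValueStats(String value) {
    // An implicit column holds one constant value for the whole row group,
    // so its statistics reduce to min == max == that value, with no nulls.
    BinaryStatistics stat = new BinaryStatistics();
    stat.setNumNulls(0);
    byte[] val = value.getBytes();
    stat.setMinMaxFromBytes(val, val);
    return stat;
}

Because min and max coincide, a filter on the implicit column can be answered from the statistics alone, which is what makes row-group pruning on these columns possible.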

Example 2 with BinaryStatistics

Use of org.apache.parquet.column.statistics.BinaryStatistics in project drill by apache.

From the class ParquetMetaStatCollector, the method collectColStat:

@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
    Stopwatch timer = Stopwatch.createStarted();
    // map from column to ColumnMetadata
    final Map<SchemaPath, Metadata.ColumnMetadata> columnMetadataMap = new HashMap<>();
    // map from column name to column statistics.
    final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
    for (final Metadata.ColumnMetadata columnMetadata : columnMetadataList) {
        SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName());
        columnMetadataMap.put(schemaPath, columnMetadata);
    }
    for (final SchemaPath schemaPath : fields) {
        final PrimitiveType.PrimitiveTypeName primitiveType;
        final OriginalType originalType;
        final Metadata.ColumnMetadata columnMetadata = columnMetadataMap.get(schemaPath);
        if (columnMetadata != null) {
            final Object min = columnMetadata.getMinValue();
            final Object max = columnMetadata.getMaxValue();
            final Long numNull = columnMetadata.getNulls();
            primitiveType = this.parquetTableMetadata.getPrimitiveType(columnMetadata.getName());
            originalType = this.parquetTableMetadata.getOriginalType(columnMetadata.getName());
            final Integer repetitionLevel = this.parquetTableMetadata.getRepetitionLevel(columnMetadata.getName());
            statMap.put(schemaPath, getStat(min, max, numNull, primitiveType, originalType, repetitionLevel));
        } else {
            final String columnName = schemaPath.getRootSegment().getPath();
            if (implicitColValues.containsKey(columnName)) {
                TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
                Statistics stat = new BinaryStatistics();
                stat.setNumNulls(0);
                byte[] val = implicitColValues.get(columnName).getBytes();
                stat.setMinMaxFromBytes(val, val);
                statMap.put(schemaPath, new ColumnStatistics(stat, type));
            }
        }
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Took {} ms to column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
    }
    return statMap;
}
Also used: HashMap(java.util.HashMap) Stopwatch(com.google.common.base.Stopwatch) Metadata(org.apache.drill.exec.store.parquet.Metadata) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) TypeProtos(org.apache.drill.common.types.TypeProtos) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) OriginalType(org.apache.parquet.schema.OriginalType) SchemaPath(org.apache.drill.common.expression.SchemaPath) PrimitiveType(org.apache.parquet.schema.PrimitiveType)
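
The getStat helper called above is not shown in this excerpt; it presumably dispatches on the primitive type to choose the matching Statistics subclass and seeds it with the cached min/max and null count. A rough sketch of such a dispatch, assuming the static factory Statistics.getStatsBasedOnType from this generation of parquet-column (statsFor is a hypothetical name, and the per-type handling is illustrative rather than Drill's exact logic):

static Statistics statsFor(PrimitiveType.PrimitiveTypeName type, Object min, Object max, long numNulls) {
    // Pick the Statistics subclass matching the Parquet primitive type,
    // then seed it with the values recorded in the metadata cache.
    Statistics stat = Statistics.getStatsBasedOnType(type);
    stat.setNumNulls(numNulls);
    if (min != null && max != null) {
        switch (type) {
            case INT32:
                ((IntStatistics) stat).setMinMax((Integer) min, (Integer) max);
                break;
            case INT64:
                ((LongStatistics) stat).setMinMax((Long) min, (Long) max);
                break;
            case DOUBLE:
                ((DoubleStatistics) stat).setMinMax((Double) min, (Double) max);
                break;
            default:
                // FLOAT, BOOLEAN and BINARY follow the same cast-and-set pattern.
                break;
        }
    }
    return stat;
}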

Example 3 with BinaryStatistics

Use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

From the class TestParquetFileWriter, the method createFile:

private void createFile(Configuration configuration, Path path, MessageType schema) throws IOException {
    String[] path1 = { "a", "b" };
    ColumnDescriptor c1 = schema.getColumnDescription(path1);
    String[] path2 = { "c", "d" };
    ColumnDescriptor c2 = schema.getColumnDescription(path2);
    byte[] bytes1 = { 0, 1, 2, 3 };
    byte[] bytes2 = { 1, 2, 3, 4 };
    byte[] bytes3 = { 2, 3, 4, 5 };
    byte[] bytes4 = { 3, 4, 5, 6 };
    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    BinaryStatistics stats1 = new BinaryStatistics();
    BinaryStatistics stats2 = new BinaryStatistics();
    ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
    w.start();
    w.startBlock(3);
    w.startColumn(c1, 5, codec);
    w.writeDataPage(2, 4, BytesInput.from(bytes1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(bytes1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(c2, 6, codec);
    w.writeDataPage(2, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.startBlock(4);
    w.startColumn(c1, 7, codec);
    w.writeDataPage(7, 4, BytesInput.from(bytes3), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(c2, 8, codec);
    w.writeDataPage(8, 4, BytesInput.from(bytes4), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    final HashMap<String, String> extraMetaData = new HashMap<String, String>();
    extraMetaData.put("foo", "bar");
    extraMetaData.put(path.getName(), path.getName());
    w.end(extraMetaData);
}
Also used: ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics)
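
createFile only writes; a natural companion is reading the footer back to inspect what the writer recorded per column chunk. A short sketch using the same read-side calls that testWriteReadStatistics exercises below (dumpFooterStats is a hypothetical name):

private void dumpFooterStats(Configuration configuration, Path path) throws IOException {
    // Read the footer and print the statistics recorded for every column
    // chunk in every row group of the file created above.
    ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
    for (BlockMetaData block : readFooter.getBlocks()) {
        for (ColumnChunkMetaData col : block.getColumns()) {
            System.out.println(col.getPath() + " -> " + col.getStatistics());
        }
    }
}
Also used: ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData)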

Example 4 with BinaryStatistics

Use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

From the class TestCorruptDeltaByteArrays, the method testColumnReaderImplWithCorruptPage:

@Test
public void testColumnReaderImplWithCorruptPage() throws Exception {
    ColumnDescriptor column = new ColumnDescriptor(new String[] { "s" }, PrimitiveType.PrimitiveTypeName.BINARY, 0, 0);
    MemPageStore pages = new MemPageStore(0);
    PageWriter memWriter = pages.getPageWriter(column);
    ParquetProperties parquetProps = ParquetProperties.builder().withDictionaryEncoding(false).build();
    // get generic repetition and definition level bytes to use for pages
    ValuesWriter rdValues = parquetProps.newDefinitionLevelWriter(column);
    for (int i = 0; i < 10; i += 1) {
        rdValues.writeInteger(0);
    }
    // use a byte array backed BytesInput because it is reused
    BytesInput rd = BytesInput.from(rdValues.getBytes().toByteArray());
    DeltaByteArrayWriter writer = getDeltaByteArrayWriter();
    String lastValue = null;
    List<String> values = new ArrayList<String>();
    for (int i = 0; i < 10; i += 1) {
        lastValue = str(i);
        writer.writeBytes(Binary.fromString(lastValue));
        values.add(lastValue);
    }
    memWriter.writePage(BytesInput.concat(rd, rd, writer.getBytes()),
        10, /* number of values in the page */
        new BinaryStatistics(), rdValues.getEncoding(), rdValues.getEncoding(), writer.getEncoding());
    pages.addRowCount(10);
    // sets previous to new byte[0]
    writer.reset();
    corruptWriter(writer, lastValue);
    for (int i = 10; i < 20; i += 1) {
        String value = str(i);
        writer.writeBytes(Binary.fromString(value));
        values.add(value);
    }
    memWriter.writePage(BytesInput.concat(rd, rd, writer.getBytes()),
        10, /* number of values in the page */
        new BinaryStatistics(), rdValues.getEncoding(), rdValues.getEncoding(), writer.getEncoding());
    pages.addRowCount(10);
    final List<String> actualValues = new ArrayList<String>();
    PrimitiveConverter converter = new PrimitiveConverter() {

        @Override
        public void addBinary(Binary value) {
            actualValues.add(value.toStringUsingUTF8());
        }
    };
    ColumnReaderImpl columnReader = new ColumnReaderImpl(column, pages.getPageReader(column), converter, new ParsedVersion("parquet-mr", "1.6.0", "abcd"));
    while (actualValues.size() < columnReader.getTotalValueCount()) {
        columnReader.writeCurrentValueToConverter();
        columnReader.consume();
    }
    Assert.assertEquals(values, actualValues);
}
Also used: BytesInput(org.apache.parquet.bytes.BytesInput) DeltaByteArrayWriter(org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ArrayList(java.util.ArrayList) ParquetProperties(org.apache.parquet.column.ParquetProperties) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) PrimitiveConverter(org.apache.parquet.io.api.PrimitiveConverter) MemPageStore(org.apache.parquet.column.page.mem.MemPageStore) Binary(org.apache.parquet.io.api.Binary) ValuesWriter(org.apache.parquet.column.values.ValuesWriter) ParsedVersion(org.apache.parquet.VersionParser.ParsedVersion) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)
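
The ParsedVersion handed to ColumnReaderImpl is what arms the workaround: parquet-mr 1.6.0 predates the fix for the DeltaByteArrayWriter reset bug this test simulates, so its delta-byte-array pages may depend on the page before them. A sketch of the version check, assuming the CorruptDeltaByteArrays helper and its requiresSequentialReads signature in this generation of parquet-mr:

private static boolean writerNeedsWorkaround() {
    // The same writer-version triple the test passes to ColumnReaderImpl.
    ParsedVersion oldWriter = new ParsedVersion("parquet-mr", "1.6.0", "abcd");
    // For affected writer versions, delta-byte-array pages must be read
    // sequentially because a page may reference state from the previous one.
    return CorruptDeltaByteArrays.requiresSequentialReads(oldWriter, Encoding.DELTA_BYTE_ARRAY);
}
Also used: CorruptDeltaByteArrays(org.apache.parquet.CorruptDeltaByteArrays) Encoding(org.apache.parquet.column.Encoding)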

Example 5 with BinaryStatistics

Use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

From the class TestParquetFileWriter, the method testWriteReadStatistics:

@Test
public void testWriteReadStatistics() throws Exception {
    // this test assumes statistics will be read
    Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));
    File testFile = temp.newFile();
    testFile.delete();
    Path path = new Path(testFile.toURI());
    Configuration configuration = new Configuration();
    configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
    MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b (UTF8);} required group c { required int64 d; }}");
    String[] path1 = { "a", "b" };
    ColumnDescriptor c1 = schema.getColumnDescription(path1);
    String[] path2 = { "c", "d" };
    ColumnDescriptor c2 = schema.getColumnDescription(path2);
    byte[] bytes1 = { 0, 1, 2, 3 };
    byte[] bytes2 = { 1, 2, 3, 4 };
    byte[] bytes3 = { 2, 3, 4, 5 };
    byte[] bytes4 = { 3, 4, 5, 6 };
    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    BinaryStatistics statsB1C1P1 = new BinaryStatistics();
    BinaryStatistics statsB1C1P2 = new BinaryStatistics();
    LongStatistics statsB1C2P1 = new LongStatistics();
    LongStatistics statsB1C2P2 = new LongStatistics();
    BinaryStatistics statsB2C1P1 = new BinaryStatistics();
    LongStatistics statsB2C2P1 = new LongStatistics();
    statsB1C1P1.setMinMax(Binary.fromString("s"), Binary.fromString("z"));
    statsB1C1P2.setMinMax(Binary.fromString("a"), Binary.fromString("b"));
    statsB1C2P1.setMinMax(2L, 10L);
    statsB1C2P2.setMinMax(-6L, 4L);
    statsB2C1P1.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
    statsB2C2P1.setMinMax(11L, 122L);
    ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
    w.start();
    w.startBlock(3);
    w.startColumn(c1, 5, codec);
    w.writeDataPage(2, 4, BytesInput.from(bytes1), statsB1C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(bytes1), statsB1C1P2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(c2, 6, codec);
    w.writeDataPage(3, 4, BytesInput.from(bytes2), statsB1C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(bytes2), statsB1C2P2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.startBlock(4);
    w.startColumn(c1, 7, codec);
    w.writeDataPage(7, 4, BytesInput.from(bytes3), statsB2C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(c2, 8, codec);
    w.writeDataPage(8, 4, BytesInput.from(bytes4), statsB2C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.end(new HashMap<String, String>());
    ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
    for (BlockMetaData block : readFooter.getBlocks()) {
        for (ColumnChunkMetaData col : block.getColumns()) {
            col.getPath();
        }
    }
    // correct statistics
    BinaryStatistics bs1 = new BinaryStatistics();
    bs1.setMinMax(Binary.fromString("a"), Binary.fromString("z"));
    LongStatistics ls1 = new LongStatistics();
    ls1.setMinMax(-6L, 10L);
    BinaryStatistics bs2 = new BinaryStatistics();
    bs2.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
    LongStatistics ls2 = new LongStatistics();
    ls2.setMinMax(11L, 122L);
    {
        // assert stats are correct for the first block
        BinaryStatistics bsout = (BinaryStatistics) readFooter.getBlocks().get(0).getColumns().get(0).getStatistics();
        String str = new String(bsout.getMaxBytes());
        String str2 = new String(bsout.getMinBytes());
        TestUtils.assertStatsValuesEqual(bs1, readFooter.getBlocks().get(0).getColumns().get(0).getStatistics());
        TestUtils.assertStatsValuesEqual(ls1, readFooter.getBlocks().get(0).getColumns().get(1).getStatistics());
    }
    {
        // assert stats are correct for the second block
        TestUtils.assertStatsValuesEqual(bs2, readFooter.getBlocks().get(1).getColumns().get(0).getStatistics());
        TestUtils.assertStatsValuesEqual(ls2, readFooter.getBlocks().get(1).getColumns().get(1).getStatistics());
    }
}
Also used: Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) File(java.io.File) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)
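
The expected block-level values (bs1 spanning "a" to "z", ls1 spanning -6 to 10) are the envelopes of the per-page statistics, because the writer folds each page's statistics into its column chunk's. That folding can be reproduced directly; a small fragment that could sit at the end of the test, reusing statsB1C1P1 and statsB1C1P2 from above and assuming Statistics.mergeStatistics, which parquet-column uses internally to combine page and chunk statistics:

    BinaryStatistics chunk = new BinaryStatistics();
    // Fold in the two page-level statistics written for block 1, column {"a","b"}.
    chunk.mergeStatistics(statsB1C1P1);
    chunk.mergeStatistics(statsB1C1P2);
    // The chunk now carries min "a" and max "z", matching bs1 above.
    String mergedMin = new String(chunk.getMinBytes());
    String mergedMax = new String(chunk.getMaxBytes());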

Aggregations

BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics): 16 usages
LongStatistics (org.apache.parquet.column.statistics.LongStatistics): 9 usages
IntStatistics (org.apache.parquet.column.statistics.IntStatistics): 8 usages
Statistics (org.apache.parquet.column.statistics.Statistics): 8 usages
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 7 usages
DoubleStatistics (org.apache.parquet.column.statistics.DoubleStatistics): 6 usages
FloatStatistics (org.apache.parquet.column.statistics.FloatStatistics): 6 usages
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 6 usages
BooleanStatistics (org.apache.parquet.column.statistics.BooleanStatistics): 5 usages
Stopwatch (com.google.common.base.Stopwatch): 4 usages
HashMap (java.util.HashMap): 4 usages
SchemaPath (org.apache.drill.common.expression.SchemaPath): 4 usages
TypeProtos (org.apache.drill.common.types.TypeProtos): 4 usages
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 4 usages
Test (org.junit.Test): 4 usages
Configuration (org.apache.hadoop.conf.Configuration): 3 usages
MessageType (org.apache.parquet.schema.MessageType): 3 usages
HashSet (java.util.HashSet): 2 usages
Metadata (org.apache.drill.exec.store.parquet.Metadata): 2 usages
ParquetReaderUtility (org.apache.drill.exec.store.parquet.ParquetReaderUtility): 2 usages