Search in sources :

Example 11 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

the class TestParquetMetadataConverter method testBinaryStatsWithTruncation.

/**
 * Converts binary statistics with the given truncation length and checks that the
 * converted min/max still bound the original values.
 *
 * The exact values of minLen and maxLen shouldn't matter because the comparison is
 * controlled by the prefix ("a" for the minimum, "b" for the maximum).
 *
 * @param truncateLen value passed to the ParquetMetadataConverter constructor
 * @param minLen      length argument used when generating the minimum value
 * @param maxLen      length argument used when generating the maximum value
 */
private void testBinaryStatsWithTruncation(int truncateLen, int minLen, int maxLen) {
    // Encode and decode with the same explicit charset so the comparison never
    // depends on the platform default encoding.
    final Charset utf8 = Charset.forName("UTF-8");
    BinaryStatistics stats = new BinaryStatistics();
    byte[] min = generateRandomString("a", minLen).getBytes(utf8);
    byte[] max = generateRandomString("b", maxLen).getBytes(utf8);
    stats.updateStats(Binary.fromConstantByteArray(min));
    stats.updateStats(Binary.fromConstantByteArray(max));
    ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter(truncateLen);
    org.apache.parquet.format.Statistics formatStats = metadataConverter.toParquetStatistics(stats);
    if (minLen + maxLen >= ParquetMetadataConverter.MAX_STATS_SIZE) {
        // At or above the size limit no min/max values are expected at all.
        assertNull(formatStats.getMin_value());
        assertNull(formatStats.getMax_value());
    } else {
        // The converted minimum must sort at or below the original minimum...
        String minString = new String(min, utf8);
        String minStatString = new String(formatStats.getMin_value(), utf8);
        assertTrue(minStatString.compareTo(minString) <= 0);
        // ...and the converted maximum at or above the original maximum.
        String maxString = new String(max, utf8);
        String maxStatString = new String(formatStats.getMax_value(), utf8);
        assertTrue(maxStatString.compareTo(maxString) >= 0);
    }
}
Also used : BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics)

Example 12 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project drill by axbaretto.

the class ParquetFooterStatCollector method collectColStat.

/**
 * Collects column statistics for the requested fields from the Parquet footer,
 * using the column chunks of the row group at {@code rowGroupIndex}.
 *
 * A field that is present in the file schema, the converted file metadata and the
 * row group's column chunks gets the statistics stored in its column chunk (DATE
 * columns are first passed through {@code convertDateStatIfNecessary}). A field that
 * is missing from any of those, but whose root segment name is a key of
 * {@code implicitColValues}, gets synthetic VARCHAR statistics with zero nulls and
 * min == max == the implicit value. All other fields are left out of the result.
 *
 * @param fields the columns to collect statistics for
 * @return map from column path to its statistics; contains only the fields that
 *         could be resolved as described above
 */
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
    Stopwatch timer = Stopwatch.createStarted();
    ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, new ArrayList<>(fields), autoCorrectCorruptDates);
    // map from column name to ColumnDescriptor
    final Map<SchemaPath, ColumnDescriptor> columnDescMap = new HashMap<>();
    // map from column name to ColumnChunkMetaData
    final Map<SchemaPath, ColumnChunkMetaData> columnChkMetaMap = new HashMap<>();
    // map from column name to SchemaElement
    final Map<SchemaPath, SchemaElement> schemaElementMap = new HashMap<>();
    // map from column name to column statistics.
    final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
    final org.apache.parquet.format.FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
    for (final ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
        final SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getPath());
        if (fields.contains(schemaPath)) {
            columnDescMap.put(schemaPath, column);
        }
    }
    for (final SchemaElement se : fileMetaData.getSchema()) {
        final SchemaPath schemaPath = SchemaPath.getSimplePath(se.getName());
        if (fields.contains(schemaPath)) {
            schemaElementMap.put(schemaPath, se);
        }
    }
    for (final ColumnChunkMetaData colMetaData : footer.getBlocks().get(rowGroupIndex).getColumns()) {
        final SchemaPath schemaPath = SchemaPath.getCompoundPath(colMetaData.getPath().toArray());
        if (fields.contains(schemaPath)) {
            columnChkMetaMap.put(schemaPath, colMetaData);
        }
    }
    for (final SchemaPath path : fields) {
        // Only non-null values are ever stored in the three lookup maps above,
        // so a null result here means "not present".
        final ColumnDescriptor columnDesc = columnDescMap.get(path);
        final SchemaElement se = schemaElementMap.get(path);
        final ColumnChunkMetaData metaData = columnChkMetaMap.get(path);
        if (columnDesc != null && se != null && metaData != null) {
            TypeProtos.MajorType type = ParquetToDrillTypeConverter.toMajorType(columnDesc.getType(), se.getType_length(), getDataMode(columnDesc), se, options);
            Statistics stat = metaData.getStatistics();
            if (type.getMinorType() == TypeProtos.MinorType.DATE) {
                stat = convertDateStatIfNecessary(metaData.getStatistics(), containsCorruptDates);
            }
            statMap.put(path, new ColumnStatistics(stat, type));
        } else {
            final String columnName = path.getRootSegment().getPath();
            if (implicitColValues.containsKey(columnName)) {
                // Implicit column: a single constant value, so min and max are the same bytes.
                TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
                Statistics stat = new BinaryStatistics();
                stat.setNumNulls(0);
                // Encode explicitly as UTF-8 instead of relying on the platform default charset.
                byte[] val = implicitColValues.get(columnName).getBytes(java.nio.charset.StandardCharsets.UTF_8);
                stat.setMinMaxFromBytes(val, val);
                statMap.put(path, new ColumnStatistics(stat, type));
            }
        }
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Took {} ms to column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
    }
    return statMap;
}
Also used : HashMap(java.util.HashMap) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) Stopwatch(com.google.common.base.Stopwatch) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) TypeProtos(org.apache.drill.common.types.TypeProtos) SchemaPath(org.apache.drill.common.expression.SchemaPath) SchemaElement(org.apache.parquet.format.SchemaElement) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) ParquetMetadataConverter(org.apache.parquet.format.converter.ParquetMetadataConverter)

Example 13 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project drill by axbaretto.

the class ParquetMetaStatCollector method collectColStat.

/**
 * Collects column statistics for the requested fields from the cached Parquet
 * metadata held in {@code columnMetadataList} and {@code parquetTableMetadata}.
 *
 * A field whose un-indexed path matches an entry in {@code columnMetadataList} gets
 * statistics built by {@code getStat} from that entry's min, max and null count
 * (plus scale and precision when the table metadata is a
 * {@code ParquetTableMetadata_v3}). A field without such an entry, but whose root
 * segment name is a key of {@code implicitColValues}, gets synthetic VARCHAR
 * statistics with zero nulls and min == max == the implicit value. All other fields
 * are left out of the result.
 *
 * @param fields the columns to collect statistics for
 * @return map from column path to its statistics
 */
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
    Stopwatch timer = Stopwatch.createStarted();
    // map from column to ColumnMetadata
    final Map<SchemaPath, Metadata.ColumnMetadata> columnMetadataMap = new HashMap<>();
    // map from column name to column statistics.
    final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
    for (final Metadata.ColumnMetadata columnMetadata : columnMetadataList) {
        SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName());
        columnMetadataMap.put(schemaPath, columnMetadata);
    }
    for (final SchemaPath field : fields) {
        final Metadata.ColumnMetadata columnMetadata = columnMetadataMap.get(field.getUnIndexed());
        if (columnMetadata != null) {
            final Object min = columnMetadata.getMinValue();
            final Object max = columnMetadata.getMaxValue();
            final Long numNull = columnMetadata.getNulls();
            final PrimitiveType.PrimitiveTypeName primitiveType = this.parquetTableMetadata.getPrimitiveType(columnMetadata.getName());
            final OriginalType originalType = this.parquetTableMetadata.getOriginalType(columnMetadata.getName());
            int precision = 0;
            int scale = 0;
            // ColumnTypeMetadata_v3 stores information about scale and precision
            if (parquetTableMetadata instanceof Metadata.ParquetTableMetadata_v3) {
                Metadata.ColumnTypeMetadata_v3 columnTypeInfo = ((Metadata.ParquetTableMetadata_v3) parquetTableMetadata).getColumnTypeInfo(columnMetadata.getName());
                scale = columnTypeInfo.scale;
                precision = columnTypeInfo.precision;
            }
            statMap.put(field, getStat(min, max, numNull, primitiveType, originalType, scale, precision));
        } else {
            final String columnName = field.getRootSegment().getPath();
            if (implicitColValues.containsKey(columnName)) {
                // Implicit column: a single constant value, so min and max are the same bytes.
                TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
                Statistics stat = new BinaryStatistics();
                stat.setNumNulls(0);
                // Encode explicitly as UTF-8 instead of relying on the platform default charset.
                byte[] val = implicitColValues.get(columnName).getBytes(java.nio.charset.StandardCharsets.UTF_8);
                stat.setMinMaxFromBytes(val, val);
                statMap.put(field, new ColumnStatistics(stat, type));
            }
        }
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Took {} ms to column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
    }
    return statMap;
}
Also used : HashMap(java.util.HashMap) Stopwatch(com.google.common.base.Stopwatch) Metadata(org.apache.drill.exec.store.parquet.Metadata) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) TypeProtos(org.apache.drill.common.types.TypeProtos) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) OriginalType(org.apache.parquet.schema.OriginalType) SchemaPath(org.apache.drill.common.expression.SchemaPath) PrimitiveType(org.apache.parquet.schema.PrimitiveType)

Example 14 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

the class TestColumnChunkMetaData method newMD.

/**
 * Builds column chunk metadata for a GZIP-compressed BINARY column named "foo" with
 * an empty encoding set and fresh binary statistics.
 *
 * @param big passed as the first of the five trailing numeric arguments to
 *            {@code ColumnChunkMetaData.get}; the remaining four are zero
 * @return the created metadata
 */
private ColumnChunkMetaData newMD(long big) {
    ColumnPath path = ColumnPath.get("foo");
    PrimitiveTypeName type = BINARY;
    CompressionCodecName codec = CompressionCodecName.GZIP;
    Set<Encoding> encodings = new HashSet<Encoding>();
    BinaryStatistics statistics = new BinaryStatistics();
    return ColumnChunkMetaData.get(path, type, codec, encodings, statistics, big, 0, 0, 0, 0);
}
Also used : Encoding(org.apache.parquet.column.Encoding) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) HashSet(java.util.HashSet) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName)

Example 15 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

the class TestParquetMetadataConverter method createParquetMetaData.

/**
 * Creates Parquet metadata with a single block holding one uncompressed INT32 column
 * named "col", whose encoding stats record the given encodings.
 *
 * @param dicEncoding  dictionary encoding to record in the encoding stats, or
 *                     {@code null} to record none
 * @param dataEncoding data encoding to record in the encoding stats
 * @return metadata wrapping the schema and the single block
 */
private static ParquetMetadata createParquetMetaData(Encoding dicEncoding, Encoding dataEncoding) {
    MessageType schema = parseMessageType("message schema { optional int32 col (INT_32); }");
    org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
    List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
    BlockMetaData blockMetaData = new BlockMetaData();
    EncodingStats.Builder builder = new EncodingStats.Builder();
    if (dicEncoding != null) {
        // Only register the encoding here; the stats object is built once, below,
        // after the data encoding has been added as well.
        builder.addDictEncoding(dicEncoding);
    }
    builder.addDataEncoding(dataEncoding);
    EncodingStats es = builder.build();
    Set<org.apache.parquet.column.Encoding> e = new HashSet<org.apache.parquet.column.Encoding>();
    PrimitiveTypeName t = PrimitiveTypeName.INT32;
    ColumnPath p = ColumnPath.get("col");
    CompressionCodecName c = CompressionCodecName.UNCOMPRESSED;
    // NOTE(review): binary statistics are attached to an INT32 column; presumably the
    // statistics content is irrelevant to the callers of this helper - confirm.
    BinaryStatistics s = new BinaryStatistics();
    ColumnChunkMetaData md = ColumnChunkMetaData.get(p, t, c, es, e, s, 20, 30, 0, 0, 0);
    blockMetaData.addColumn(md);
    blockMetaDataList.add(blockMetaData);
    return new ParquetMetadata(fileMetaData, blockMetaDataList);
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) OffsetIndexBuilder(org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder) ColumnIndexBuilder(org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder) ArrayList(java.util.ArrayList) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) EncodingStats(org.apache.parquet.column.EncodingStats) MessageType(org.apache.parquet.schema.MessageType) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType) FileMetaData(org.apache.parquet.format.FileMetaData) HashSet(java.util.HashSet) Encoding(org.apache.parquet.column.Encoding) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName)

Aggregations

BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics)20 LongStatistics (org.apache.parquet.column.statistics.LongStatistics)9 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)8 IntStatistics (org.apache.parquet.column.statistics.IntStatistics)8 Statistics (org.apache.parquet.column.statistics.Statistics)8 DoubleStatistics (org.apache.parquet.column.statistics.DoubleStatistics)6 FloatStatistics (org.apache.parquet.column.statistics.FloatStatistics)6 BooleanStatistics (org.apache.parquet.column.statistics.BooleanStatistics)5 ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData)5 MessageType (org.apache.parquet.schema.MessageType)5 PrimitiveType (org.apache.parquet.schema.PrimitiveType)5 Test (org.junit.Test)5 Stopwatch (com.google.common.base.Stopwatch)4 HashMap (java.util.HashMap)4 SchemaPath (org.apache.drill.common.expression.SchemaPath)4 TypeProtos (org.apache.drill.common.types.TypeProtos)4 Configuration (org.apache.hadoop.conf.Configuration)4 Encoding (org.apache.parquet.column.Encoding)4 HashSet (java.util.HashSet)3 Path (org.apache.hadoop.fs.Path)3