Search in sources :

Example 11 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project parquet-mr by apache.

In class TestParquetMetadataConverter, method testMissingValuesFromStats:

/**
 * Converts Thrift-format statistics with some or all optional fields left
 * unset and checks what the resulting column statistics report.
 *
 * Three cases are exercised in turn: nothing set, only min/max set, and only
 * the null count set.
 */
@Test
public void testMissingValuesFromStats() {
    final ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();
    final PrimitiveType int32Type = Types.required(PrimitiveTypeName.INT32).named("test_int32");
    final org.apache.parquet.format.Statistics thriftStats = new org.apache.parquet.format.Statistics();

    // Case 1: no field is set on the Thrift object.
    Statistics<?> converted = metadataConverter.fromParquetStatistics(Version.FULL_VERSION, thriftStats, int32Type);
    assertFalse(converted.isNumNullsSet());
    assertFalse(converted.hasNonNullValue());
    assertTrue(converted.isEmpty());
    assertEquals(-1, converted.getNumNulls());

    // Case 2: min and max are set, the null count is not.
    thriftStats.clear();
    thriftStats.setMin(BytesUtils.intToBytes(-100));
    thriftStats.setMax(BytesUtils.intToBytes(100));
    converted = metadataConverter.fromParquetStatistics(Version.FULL_VERSION, thriftStats, int32Type);
    assertFalse(converted.isNumNullsSet());
    assertTrue(converted.hasNonNullValue());
    assertFalse(converted.isEmpty());
    assertEquals(-1, converted.getNumNulls());
    assertEquals(-100, converted.genericGetMin());
    assertEquals(100, converted.genericGetMax());

    // Case 3: only the null count is set.
    thriftStats.clear();
    thriftStats.setNull_count(2000);
    converted = metadataConverter.fromParquetStatistics(Version.FULL_VERSION, thriftStats, int32Type);
    assertTrue(converted.isNumNullsSet());
    assertFalse(converted.hasNonNullValue());
    assertFalse(converted.isEmpty());
    assertEquals(2000, converted.getNumNulls());
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) Test(org.junit.Test)

Example 12 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project parquet-mr by apache.

In class TestParquetMetadataConverter, method testStillUseStatsWithSignedSortOrderIfSingleValue:

/**
 * Round-trips binary statistics whose min and max are the same value and
 * checks that the converted statistics are kept rather than dropped.
 *
 * The statistics are built from the value "A" recorded twice, interleaved
 * with three nulls, for a required UTF8 binary column. They are written to
 * the Thrift format through {@code helper} and read back with a converter
 * that uses the default configuration.
 *
 * @param helper produces the Thrift-format statistics from the in-memory
 *               statistics, so each helper variant exercises its own layout
 */
private void testStillUseStatsWithSignedSortOrderIfSingleValue(StatsHelper helper) {
    ParquetMetadataConverter converter = new ParquetMetadataConverter();
    BinaryStatistics stats = new BinaryStatistics();
    stats.incrementNumNulls();
    stats.updateStats(Binary.fromString("A"));
    stats.incrementNumNulls();
    stats.updateStats(Binary.fromString("A"));
    stats.incrementNumNulls();
    PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("b");
    // Go through the helper (not the converter's static method directly) so the
    // parameter actually selects which Thrift statistics layout is tested.
    Statistics<?> convertedStats = converter.fromParquetStatistics(Version.FULL_VERSION, helper.toParquetStatistics(stats), binaryType);
    Assert.assertFalse("Stats should not be empty: " + convertedStats, convertedStats.isEmpty());
    Assert.assertArrayEquals("min == max: " + convertedStats, convertedStats.getMaxBytes(), convertedStats.getMinBytes());
}
Also used : BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) PrimitiveType(org.apache.parquet.schema.PrimitiveType) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics)

Example 13 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project parquet-mr by apache.

In class TestParquetMetadataConverter, method testUseStatsWithSignedSortOrder:

/**
 * Round-trips binary statistics through a converter configured with
 * "parquet.strings.signed-min-max.enabled" set to true and checks the result.
 *
 * The null count must always be preserved. For {@code StatsHelper.V1} the
 * converted statistics are expected to carry no min/max; for any other helper
 * the original min ("A") and max ("z") must come back.
 *
 * @param helper produces the Thrift-format statistics from the in-memory statistics
 */
private void testUseStatsWithSignedSortOrder(StatsHelper helper) {
    // Override the defaults so that stats accumulated using signed order are used.
    Configuration signedMinMaxConf = new Configuration();
    signedMinMaxConf.setBoolean("parquet.strings.signed-min-max.enabled", true);
    ParquetMetadataConverter signedConverter = new ParquetMetadataConverter(signedMinMaxConf);

    // Two values with three nulls interleaved around them.
    BinaryStatistics written = new BinaryStatistics();
    written.incrementNumNulls();
    written.updateStats(Binary.fromString("A"));
    written.incrementNumNulls();
    written.updateStats(Binary.fromString("z"));
    written.incrementNumNulls();

    PrimitiveType utf8Type = Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("b");
    org.apache.parquet.format.Statistics thriftStats = helper.toParquetStatistics(written);
    Statistics readBack = signedConverter.fromParquetStatistics(Version.FULL_VERSION, thriftStats, utf8Type);

    Assert.assertFalse("Stats should not be empty", readBack.isEmpty());
    Assert.assertTrue(readBack.isNumNullsSet());
    Assert.assertEquals("Should have 3 nulls", 3, readBack.getNumNulls());

    if (helper != StatsHelper.V1) {
        Assert.assertEquals("Should have correct min (unsigned sort)", Binary.fromString("A"), readBack.genericGetMin());
        Assert.assertEquals("Should have correct max (unsigned sort)", Binary.fromString("z"), readBack.genericGetMax());
    } else {
        Assert.assertFalse("Min-max should be null for V1 stats", readBack.hasNonNullValue());
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) PrimitiveType(org.apache.parquet.schema.PrimitiveType) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics)

Example 14 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project parquet-mr by apache.

In class TestParquetMetadataConverter, method testBinaryStats:

/**
 * Checks the size accounting of binary statistics and how they are written to
 * the Thrift format, first with a 904-byte min and a 2388-byte max and then
 * with the larger array used for both bounds.
 *
 * @param helper produces the Thrift-format statistics from the in-memory statistics
 */
private void testBinaryStats(StatsHelper helper) {
    // Build fake stats and verify the size check.
    final byte[] minBytes = new byte[904];
    final byte[] maxBytes = new byte[2388];
    final BinaryStatistics binaryStats = new BinaryStatistics();
    binaryStats.incrementNumNulls(3004);
    binaryStats.updateStats(Binary.fromConstantByteArray(minBytes));
    binaryStats.updateStats(Binary.fromConstantByteArray(maxBytes));

    final long combinedLength = minBytes.length + maxBytes.length;
    Assert.assertFalse("Should not be smaller than min + max size", binaryStats.isSmallerThan(combinedLength));
    Assert.assertTrue("Should be smaller than min + max size + 1", binaryStats.isSmallerThan(combinedLength + 1));

    // First conversion: the legacy min/max fields must stay unset; for V2 the
    // min_value/max_value fields must carry the original arrays.
    org.apache.parquet.format.Statistics thriftStats = helper.toParquetStatistics(binaryStats);
    Assert.assertFalse("Min should not be set", thriftStats.isSetMin());
    Assert.assertFalse("Max should not be set", thriftStats.isSetMax());
    if (helper == StatsHelper.V2) {
        Assert.assertArrayEquals("Min_value should match", minBytes, thriftStats.getMin_value());
        Assert.assertArrayEquals("Max_value should match", maxBytes, thriftStats.getMax_value());
    }
    Assert.assertEquals("Num nulls should match", 3004, thriftStats.getNull_count());

    // Second conversion: expected to yield empty stats because the values are too large.
    binaryStats.setMinMaxFromBytes(maxBytes, maxBytes);
    thriftStats = helper.toParquetStatistics(binaryStats);
    Assert.assertFalse("Min should not be set", thriftStats.isSetMin());
    Assert.assertFalse("Max should not be set", thriftStats.isSetMax());
    Assert.assertFalse("Min_value should not be set", thriftStats.isSetMin_value());
    Assert.assertFalse("Max_value should not be set", thriftStats.isSetMax_value());
    Assert.assertFalse("Num nulls should not be set", thriftStats.isSetNull_count());

    // Reading the emptied Thrift stats back must give empty statistics as well.
    final PrimitiveType optionalBinary = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "");
    Statistics restored = ParquetMetadataConverter.fromParquetStatisticsInternal(Version.FULL_VERSION, thriftStats, optionalBinary, ParquetMetadataConverter.SortOrder.SIGNED);
    Assert.assertTrue(restored.isEmpty());
}
Also used : BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) PrimitiveType(org.apache.parquet.schema.PrimitiveType) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics)

Example 15 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project drill by axbaretto.

In class ParquetFooterStatCollector, method collectColStat:

/**
 * Collects statistics for the requested columns from the Parquet footer,
 * using the column chunks of the row group at {@code rowGroupIndex}.
 *
 * A requested field gets footer-based statistics only when it is found in all
 * three places: the footer's column descriptors, the Thrift schema elements
 * and the row group's column chunks. DATE columns are passed through
 * {@code convertDateStatIfNecessary} first. A field not found that way gets
 * synthetic VARCHAR statistics (zero nulls, min == max == the implicit value)
 * when its root segment names an implicit column. Any other field is left out
 * of the result.
 *
 * @param fields the columns to collect statistics for
 * @return a map from column path to its statistics, containing only the
 *         fields for which statistics could be produced
 */
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
    Stopwatch timer = Stopwatch.createStarted();
    ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, new ArrayList<>(fields), autoCorrectCorruptDates);
    // map from column name to ColumnDescriptor
    Map<SchemaPath, ColumnDescriptor> columnDescMap = new HashMap<>();
    // map from column name to ColumnChunkMetaData
    final Map<SchemaPath, ColumnChunkMetaData> columnChkMetaMap = new HashMap<>();
    // map from column name to SchemaElement
    final Map<SchemaPath, SchemaElement> schemaElementMap = new HashMap<>();
    // map from column name to column statistics.
    final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
    final org.apache.parquet.format.FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
    for (final ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
        final SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getPath());
        if (fields.contains(schemaPath)) {
            columnDescMap.put(schemaPath, column);
        }
    }
    // Schema elements are keyed by their simple name only.
    // NOTE(review): nested columns presumably never match here - confirm.
    for (final SchemaElement se : fileMetaData.getSchema()) {
        final SchemaPath schemaPath = SchemaPath.getSimplePath(se.getName());
        if (fields.contains(schemaPath)) {
            schemaElementMap.put(schemaPath, se);
        }
    }
    for (final ColumnChunkMetaData colMetaData : footer.getBlocks().get(rowGroupIndex).getColumns()) {
        final SchemaPath schemaPath = SchemaPath.getCompoundPath(colMetaData.getPath().toArray());
        if (fields.contains(schemaPath)) {
            columnChkMetaMap.put(schemaPath, colMetaData);
        }
    }
    for (final SchemaPath path : fields) {
        if (columnDescMap.containsKey(path) && schemaElementMap.containsKey(path) && columnChkMetaMap.containsKey(path)) {
            ColumnDescriptor columnDesc = columnDescMap.get(path);
            SchemaElement se = schemaElementMap.get(path);
            ColumnChunkMetaData metaData = columnChkMetaMap.get(path);
            TypeProtos.MajorType type = ParquetToDrillTypeConverter.toMajorType(columnDesc.getType(), se.getType_length(), getDataMode(columnDesc), se, options);
            Statistics stat = metaData.getStatistics();
            if (type.getMinorType() == TypeProtos.MinorType.DATE) {
                stat = convertDateStatIfNecessary(metaData.getStatistics(), containsCorruptDates);
            }
            statMap.put(path, new ColumnStatistics(stat, type));
        } else {
            final String columnName = path.getRootSegment().getPath();
            if (implicitColValues.containsKey(columnName)) {
                // An implicit column has one constant value here, so min and max are
                // both that value and there are no nulls.
                TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
                Statistics stat = new BinaryStatistics();
                stat.setNumNulls(0);
                // Encode explicitly as UTF-8 so the bytes do not depend on the JVM's
                // default charset.
                byte[] val = implicitColValues.get(columnName).getBytes(java.nio.charset.StandardCharsets.UTF_8);
                stat.setMinMaxFromBytes(val, val);
                statMap.put(path, new ColumnStatistics(stat, type));
            }
        }
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Took {} ms to column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
    }
    return statMap;
}
Also used : HashMap(java.util.HashMap) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) Stopwatch(com.google.common.base.Stopwatch) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) TypeProtos(org.apache.drill.common.types.TypeProtos) SchemaPath(org.apache.drill.common.expression.SchemaPath) SchemaElement(org.apache.parquet.format.SchemaElement) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) ParquetMetadataConverter(org.apache.parquet.format.converter.ParquetMetadataConverter)

Aggregations

Statistics (org.apache.parquet.column.statistics.Statistics): 20; IntStatistics (org.apache.parquet.column.statistics.IntStatistics): 14; LongStatistics (org.apache.parquet.column.statistics.LongStatistics): 14; DoubleStatistics (org.apache.parquet.column.statistics.DoubleStatistics): 12; FloatStatistics (org.apache.parquet.column.statistics.FloatStatistics): 12; BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics): 11; PrimitiveType (org.apache.parquet.schema.PrimitiveType): 11; BooleanStatistics (org.apache.parquet.column.statistics.BooleanStatistics): 9; TypeProtos (org.apache.drill.common.types.TypeProtos): 6; HashMap (java.util.HashMap): 5; Stopwatch (com.google.common.base.Stopwatch): 4; SchemaPath (org.apache.drill.common.expression.SchemaPath): 4; ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 4; Slice (io.airlift.slice.Slice): 2; BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 2; ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 2; Binary (org.apache.parquet.io.api.Binary): 2; Test (org.junit.Test): 2; Domain (com.facebook.presto.common.predicate.Domain): 1; Range (com.facebook.presto.common.predicate.Range): 1