Search in sources :

Example 16 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

the class TestParquetMetadataConverter method testUseStatsWithSignedSortOrder.

/**
 * Checks statistics conversion when the converter is configured with
 * {@code parquet.strings.signed-min-max.enabled = true}.
 *
 * <p>Statistics holding the values "A" and "z" plus three nulls are serialized
 * through the given helper and read back for a UTF8 binary column. The null
 * count must always survive; min/max are expected to be absent for
 * {@code StatsHelper.V1} and present for any other helper.
 *
 * @param helper strategy used to turn the in-memory statistics into format statistics
 */
private void testUseStatsWithSignedSortOrder(StatsHelper helper) {
    // Build a converter that is told to accept min/max accumulated with signed ordering.
    Configuration signedMinMaxConf = new Configuration();
    signedMinMaxConf.setBoolean("parquet.strings.signed-min-max.enabled", true);
    ParquetMetadataConverter converter = new ParquetMetadataConverter(signedMinMaxConf);

    // Three nulls interleaved with two distinct values.
    BinaryStatistics written = new BinaryStatistics();
    written.incrementNumNulls();
    written.updateStats(Binary.fromString("A"));
    written.incrementNumNulls();
    written.updateStats(Binary.fromString("z"));
    written.incrementNumNulls();

    PrimitiveType utf8Column = Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("b");
    Statistics convertedStats = converter.fromParquetStatistics(
            Version.FULL_VERSION,
            helper.toParquetStatistics(written),
            utf8Column);

    // The null count is expected regardless of which helper produced the format stats.
    Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty());
    Assert.assertTrue(convertedStats.isNumNullsSet());
    Assert.assertEquals("Should have 3 nulls", 3, convertedStats.getNumNulls());

    if (helper != StatsHelper.V1) {
        Assert.assertEquals("Should have correct min (unsigned sort)", Binary.fromString("A"), convertedStats.genericGetMin());
        Assert.assertEquals("Should have correct max (unsigned sort)", Binary.fromString("z"), convertedStats.genericGetMax());
    } else {
        assertFalse("Min-max should be null for V1 stats", convertedStats.hasNonNullValue());
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) PrimitiveType(org.apache.parquet.schema.PrimitiveType) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics)

Example 17 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

the class TestParquetMetadataConverter method testBinaryStats.

/**
 * Exercises the size accounting of {@link BinaryStatistics} and the way large
 * binary bounds are handled when converting to format statistics.
 *
 * <p>First pass: a 904-byte min and a 2388-byte max are written; the legacy
 * min/max fields must stay unset, and for {@code StatsHelper.V2} the
 * min_value/max_value fields must carry the original arrays.
 * Second pass: both bounds are replaced by the 2388-byte array and the converted
 * statistics are expected to be completely empty, including after a round trip.
 *
 * @param helper strategy used to turn the in-memory statistics into format statistics
 */
private void testBinaryStats(StatsHelper helper) {
    // Fake bounds: only their lengths matter for the size check.
    byte[] smallBound = new byte[904];
    byte[] largeBound = new byte[2388];

    BinaryStatistics binaryStats = new BinaryStatistics();
    binaryStats.incrementNumNulls(3004);
    binaryStats.updateStats(Binary.fromConstantByteArray(smallBound));
    binaryStats.updateStats(Binary.fromConstantByteArray(largeBound));

    // isSmallerThan is strict: equal to the combined size is not "smaller".
    long combinedSize = smallBound.length + largeBound.length;
    Assert.assertFalse("Should not be smaller than min + max size", binaryStats.isSmallerThan(combinedSize));
    Assert.assertTrue("Should be smaller than min + max size + 1", binaryStats.isSmallerThan(combinedSize + 1));

    org.apache.parquet.format.Statistics firstPass = helper.toParquetStatistics(binaryStats);
    assertFalse("Min should not be set", firstPass.isSetMin());
    assertFalse("Max should not be set", firstPass.isSetMax());
    if (helper == StatsHelper.V2) {
        Assert.assertArrayEquals("Min_value should match", smallBound, firstPass.getMin_value());
        Assert.assertArrayEquals("Max_value should match", largeBound, firstPass.getMax_value());
    }
    Assert.assertEquals("Num nulls should match", 3004, firstPass.getNull_count());

    // Using the larger array for both bounds is expected to yield empty format stats
    // (presumably a size limit defined outside this snippet is exceeded).
    binaryStats.setMinMaxFromBytes(largeBound, largeBound);
    org.apache.parquet.format.Statistics secondPass = helper.toParquetStatistics(binaryStats);
    Assert.assertFalse("Min should not be set", secondPass.isSetMin());
    Assert.assertFalse("Max should not be set", secondPass.isSetMax());
    Assert.assertFalse("Min_value should not be set", secondPass.isSetMin_value());
    Assert.assertFalse("Max_value should not be set", secondPass.isSetMax_value());
    Assert.assertFalse("Num nulls should not be set", secondPass.isSetNull_count());

    // Reading the empty format stats back must also produce empty statistics.
    PrimitiveType unnamedBinary = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "");
    Statistics roundTripStats = ParquetMetadataConverter.fromParquetStatisticsInternal(
            Version.FULL_VERSION,
            secondPass,
            unnamedBinary,
            ParquetMetadataConverter.SortOrder.SIGNED);
    Assert.assertTrue(roundTripStats.isEmpty());
}
Also used : BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) PrimitiveType(org.apache.parquet.schema.PrimitiveType) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics)

Example 18 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

the class TestParquetMetadataConverter method testStillUseStatsWithSignedSortOrderIfSingleValue.

/**
 * Verifies that statistics whose min and max are the same value are still used
 * after conversion for a UTF8 binary column, using a converter built with its
 * default constructor.
 *
 * <p>The statistics hold the single value "A" (written twice) plus three nulls.
 * After serializing through {@code helper} and reading back, the result must not
 * be empty and its min bytes must equal its max bytes.
 *
 * @param helper strategy used to turn the in-memory statistics into format statistics
 */
private void testStillUseStatsWithSignedSortOrderIfSingleValue(StatsHelper helper) {
    ParquetMetadataConverter converter = new ParquetMetadataConverter();
    BinaryStatistics stats = new BinaryStatistics();
    stats.incrementNumNulls();
    stats.updateStats(Binary.fromString("A"));
    stats.incrementNumNulls();
    stats.updateStats(Binary.fromString("A"));
    stats.incrementNumNulls();
    PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("b");
    // Serialize through the supplied helper, as the sibling helper-parameterized tests do.
    // Calling ParquetMetadataConverter.toParquetStatistics directly left the parameter
    // unused, so every helper variant exercised exactly the same conversion.
    Statistics convertedStats = converter.fromParquetStatistics(Version.FULL_VERSION, helper.toParquetStatistics(stats), binaryType);
    Assert.assertFalse("Stats should not be empty: " + convertedStats, convertedStats.isEmpty());
    Assert.assertArrayEquals("min == max: " + convertedStats, convertedStats.getMaxBytes(), convertedStats.getMinBytes());
}
Also used : BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) PrimitiveType(org.apache.parquet.schema.PrimitiveType) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics)

Example 19 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

the class TestParquetMetadataConverter method testIgnoreStatsWithSignedSortOrder.

/**
 * With a default-constructed converter, V1-style format statistics for a UTF8
 * binary column holding two distinct values ("A" and "z") must come back without
 * min/max, while the null count (3) is still reported.
 */
@Test
public void testIgnoreStatsWithSignedSortOrder() {
    // Two distinct values interleaved with three nulls.
    BinaryStatistics written = new BinaryStatistics();
    written.incrementNumNulls();
    written.updateStats(Binary.fromString("A"));
    written.incrementNumNulls();
    written.updateStats(Binary.fromString("z"));
    written.incrementNumNulls();

    ParquetMetadataConverter defaultConverter = new ParquetMetadataConverter();
    PrimitiveType utf8Column = Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("b");
    Statistics convertedStats = defaultConverter.fromParquetStatistics(
            Version.FULL_VERSION,
            StatsHelper.V1.toParquetStatistics(written),
            utf8Column);

    // Min/max are dropped, the null count is kept.
    Assert.assertFalse("Stats should not include min/max: " + convertedStats, convertedStats.hasNonNullValue());
    Assert.assertTrue("Stats should have null count: " + convertedStats, convertedStats.isNumNullsSet());
    Assert.assertEquals("Stats should have 3 nulls: " + convertedStats, 3L, convertedStats.getNumNulls());
}
Also used : BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) PrimitiveType(org.apache.parquet.schema.PrimitiveType) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) Test(org.junit.Test)

Example 20 with BinaryStatistics

use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.

the class ParquetMetadataConverter method toParquetStatistics.

/**
 * Converts in-memory column statistics into a format-level {@code Statistics} object.
 *
 * <p>The result is left completely unset when {@code stats} is empty or when
 * {@code withinLimit(stats, truncateLength)} rejects it. Otherwise the null count
 * is always written, and min/max are written only if a non-null value was recorded:
 * <ul>
 *   <li>the legacy {@code min}/{@code max} fields are filled when the type's sort
 *       order is {@code SIGNED} or when min and max are byte-for-byte equal;</li>
 *   <li>the {@code min_value}/{@code max_value} fields are filled when
 *       {@code isMinMaxStatsSupported} accepts the type or when min and max are
 *       byte-for-byte equal.</li>
 * </ul>
 *
 * @param stats          the column statistics to convert
 * @param truncateLength length passed to the binary truncator; for
 *                       {@code BinaryStatistics}, {@code Integer.MAX_VALUE} means the
 *                       bounds are taken as-is without truncation
 * @return a new format statistics object, possibly with no fields set
 */
public static Statistics toParquetStatistics(org.apache.parquet.column.statistics.Statistics stats, int truncateLength) {
    Statistics converted = new Statistics();

    // Nothing is written for empty stats or for stats that fail the limit check.
    if (stats.isEmpty() || !withinLimit(stats, truncateLength)) {
        return converted;
    }

    converted.setNull_count(stats.getNumNulls());
    if (!stats.hasNonNullValue()) {
        // Only nulls were recorded: there are no bounds to write.
        return converted;
    }

    byte[] lowerBound;
    byte[] upperBound;
    boolean truncateBinary = stats instanceof BinaryStatistics && truncateLength != Integer.MAX_VALUE;
    if (truncateBinary) {
        BinaryTruncator truncator = BinaryTruncator.getTruncator(stats.type());
        lowerBound = tuncateMin(truncator, truncateLength, stats.getMinBytes());
        upperBound = tuncateMax(truncator, truncateLength, stats.getMaxBytes());
    } else {
        lowerBound = stats.getMinBytes();
        upperBound = stats.getMaxBytes();
    }

    // Legacy min/max: written for signed sort order, or when both bounds are identical.
    if (sortOrder(stats.type()) == SortOrder.SIGNED || Arrays.equals(lowerBound, upperBound)) {
        converted.setMin(lowerBound);
        converted.setMax(upperBound);
    }

    // min_value/max_value: written when supported for the type, or when both bounds are identical.
    if (isMinMaxStatsSupported(stats.type()) || Arrays.equals(lowerBound, upperBound)) {
        converted.setMin_value(lowerBound);
        converted.setMax_value(upperBound);
    }

    return converted;
}
Also used : BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) BinaryTruncator(org.apache.parquet.internal.column.columnindex.BinaryTruncator) Statistics(org.apache.parquet.format.Statistics) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) CorruptStatistics(org.apache.parquet.CorruptStatistics)

Aggregations

BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics)20 LongStatistics (org.apache.parquet.column.statistics.LongStatistics)9 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)8 IntStatistics (org.apache.parquet.column.statistics.IntStatistics)8 Statistics (org.apache.parquet.column.statistics.Statistics)8 DoubleStatistics (org.apache.parquet.column.statistics.DoubleStatistics)6 FloatStatistics (org.apache.parquet.column.statistics.FloatStatistics)6 BooleanStatistics (org.apache.parquet.column.statistics.BooleanStatistics)5 ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData)5 MessageType (org.apache.parquet.schema.MessageType)5 PrimitiveType (org.apache.parquet.schema.PrimitiveType)5 Test (org.junit.Test)5 Stopwatch (com.google.common.base.Stopwatch)4 HashMap (java.util.HashMap)4 SchemaPath (org.apache.drill.common.expression.SchemaPath)4 TypeProtos (org.apache.drill.common.types.TypeProtos)4 Configuration (org.apache.hadoop.conf.Configuration)4 Encoding (org.apache.parquet.column.Encoding)4 HashSet (java.util.HashSet)3 Path (org.apache.hadoop.fs.Path)3