Search in sources :

Example 1 with StringStatistics

use of io.prestosql.orc.metadata.statistics.StringStatistics in project hetu-core by openlookeng.

the class OrcMetadataReader method toStringStatistics.

/**
 * Converts protobuf-encoded ORC string statistics into a {@link StringStatistics}.
 *
 * @param hiveWriterVersion writer version of the file; also forwarded to the min/max truncation helpers
 * @param stringStatistics the protobuf statistics message to convert
 * @param isRowGroup whether the statistics were recorded at the row-group level
 * @return the converted statistics, or {@code null} when the writer version is
 *         {@code ORIGINAL} and the statistics are not row-group statistics
 */
static StringStatistics toStringStatistics(HiveWriterVersion hiveWriterVersion, OrcProto.StringStatistics stringStatistics, boolean isRowGroup) {
    // For the ORIGINAL writer only row-group level statistics are accepted.
    boolean originalWriter = hiveWriterVersion == ORIGINAL;
    if (originalWriter && !isRowGroup) {
        return null;
    }

    // An absent maximum stays null; a present one is truncated to the valid range.
    Slice maximum = null;
    if (stringStatistics.hasMaximum()) {
        Slice rawMaximum = byteStringToSlice(stringStatistics.getMaximumBytes());
        maximum = maxStringTruncateToValidRange(rawMaximum, hiveWriterVersion);
    }

    // Same treatment for the minimum.
    Slice minimum = null;
    if (stringStatistics.hasMinimum()) {
        Slice rawMinimum = byteStringToSlice(stringStatistics.getMinimumBytes());
        minimum = minStringTruncateToValidRange(rawMinimum, hiveWriterVersion);
    }

    // A missing sum is reported as zero.
    long sum = 0;
    if (stringStatistics.hasSum()) {
        sum = stringStatistics.getSum();
    }

    return new StringStatistics(minimum, maximum, sum);
}
Also used : StringStatistics(io.prestosql.orc.metadata.statistics.StringStatistics) Slice(io.airlift.slice.Slice)

Example 2 with StringStatistics

use of io.prestosql.orc.metadata.statistics.StringStatistics in project hetu-core by openlookeng.

the class TestTupleDomainOrcPredicate method stringColumnStats.

/**
 * Builds a {@link ColumnStatistics} fixture that carries only string statistics.
 *
 * @param numberOfValues value count to record in the column statistics
 * @param minimum minimum string, or {@code null} for no minimum
 * @param maximum maximum string, or {@code null} for no maximum
 * @return column statistics whose only populated typed statistics are the string statistics
 */
private static ColumnStatistics stringColumnStats(Long numberOfValues, String minimum, String maximum) {
    Slice minimumSlice = null;
    if (minimum != null) {
        minimumSlice = utf8Slice(minimum);
    }

    Slice maximumSlice = null;
    if (maximum != null) {
        maximumSlice = utf8Slice(maximum);
    }

    // The sum (100L) and minAverageValueSizeInBytes (10L) are not used by this test,
    // so the concrete numbers are arbitrary.
    StringStatistics stringStatistics = new StringStatistics(minimumSlice, maximumSlice, 100L);
    return new ColumnStatistics(numberOfValues, 10L, null, null, null, stringStatistics, null, null, null, null);
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) StringStatistics(io.prestosql.orc.metadata.statistics.StringStatistics) Slice(io.airlift.slice.Slice) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice)

Example 3 with StringStatistics

use of io.prestosql.orc.metadata.statistics.StringStatistics in project hetu-core by openlookeng.

the class OrcWriteValidation method validateColumnStatisticsEquivalent.

/**
 * Verifies that the statistics read back for a column are equivalent to the expected ones.
 *
 * <p>Most statistics kinds must be equal. Integer and string statistics are compared more
 * leniently: an integer sum is only compared when both sides have one, and an expected string
 * minimum or maximum is only compared when it is non-null.
 *
 * @param orcDataSourceId identifies the data source in any raised exception
 * @param name label inserted into the failure message
 * @param actualColumnStatistics statistics to validate
 * @param expectedColumnStatistics statistics to validate against
 * @throws OrcCorruptionException if any statistics kind is not equivalent
 * @throws NullPointerException if {@code name} or either statistics argument is null
 */
private static void validateColumnStatisticsEquivalent(OrcDataSourceId orcDataSourceId, String name, ColumnStatistics actualColumnStatistics, ColumnStatistics expectedColumnStatistics) throws OrcCorruptionException {
    requireNonNull(name, "name is null");
    requireNonNull(actualColumnStatistics, "actualColumnStatistics is null");
    requireNonNull(expectedColumnStatistics, "expectedColumnStatistics is null");

    if (actualColumnStatistics.getNumberOfValues() != expectedColumnStatistics.getNumberOfValues()) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected number of values in %s statistics", name);
    }

    boolean booleanStatisticsMatch = Objects.equals(actualColumnStatistics.getBooleanStatistics(), expectedColumnStatistics.getBooleanStatistics());
    if (!booleanStatisticsMatch) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected boolean counts in %s statistics", name);
    }

    IntegerStatistics actualIntegerStatistics = actualColumnStatistics.getIntegerStatistics();
    IntegerStatistics expectedIntegerStatistics = expectedColumnStatistics.getIntegerStatistics();
    if (!Objects.equals(actualIntegerStatistics, expectedIntegerStatistics)) {
        // Not strictly equal: still acceptable when both sides exist, min and max agree,
        // and the sums agree or at least one of the sums is null (sum validation is skipped then).
        boolean integerStatisticsAcceptable = actualIntegerStatistics != null
                && expectedIntegerStatistics != null
                && Objects.equals(actualIntegerStatistics.getMin(), expectedIntegerStatistics.getMin())
                && Objects.equals(actualIntegerStatistics.getMax(), expectedIntegerStatistics.getMax())
                && (actualIntegerStatistics.getSum() == null
                        || expectedIntegerStatistics.getSum() == null
                        || Objects.equals(actualIntegerStatistics.getSum(), expectedIntegerStatistics.getSum()));
        if (!integerStatisticsAcceptable) {
            throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected integer range in %s statistics", name);
        }
    }

    boolean doubleStatisticsMatch = Objects.equals(actualColumnStatistics.getDoubleStatistics(), expectedColumnStatistics.getDoubleStatistics());
    if (!doubleStatisticsMatch) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected double range in %s statistics", name);
    }

    // Expected string bounds are passed through the ORC_HIVE_8732 truncation helpers before comparing.
    StringStatistics expectedStringStatistics = expectedColumnStatistics.getStringStatistics();
    if (expectedStringStatistics != null) {
        expectedStringStatistics = new StringStatistics(
                minStringTruncateToValidRange(expectedStringStatistics.getMin(), HiveWriterVersion.ORC_HIVE_8732),
                maxStringTruncateToValidRange(expectedStringStatistics.getMax(), HiveWriterVersion.ORC_HIVE_8732),
                expectedStringStatistics.getSum());
    }
    StringStatistics actualStringStatistics = actualColumnStatistics.getStringStatistics();
    // Absent expected string statistics are never treated as a mismatch.
    if (expectedStringStatistics != null && !Objects.equals(actualStringStatistics, expectedStringStatistics)) {
        // Merging row group stats can produce nulls given we have string stats limit,
        // so a null expected min/max is not compared against the actual value.
        boolean stringStatisticsAcceptable = actualStringStatistics != null
                && actualStringStatistics.getSum() == expectedStringStatistics.getSum()
                && (expectedStringStatistics.getMax() == null || Objects.equals(actualStringStatistics.getMax(), expectedStringStatistics.getMax()))
                && (expectedStringStatistics.getMin() == null || Objects.equals(actualStringStatistics.getMin(), expectedStringStatistics.getMin()));
        if (!stringStatisticsAcceptable) {
            throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected string range in %s statistics", name);
        }
    }

    boolean dateStatisticsMatch = Objects.equals(actualColumnStatistics.getDateStatistics(), expectedColumnStatistics.getDateStatistics());
    if (!dateStatisticsMatch) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected date range in %s statistics", name);
    }

    boolean decimalStatisticsMatch = Objects.equals(actualColumnStatistics.getDecimalStatistics(), expectedColumnStatistics.getDecimalStatistics());
    if (!decimalStatisticsMatch) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected decimal range in %s statistics", name);
    }

    boolean bloomFiltersMatch = Objects.equals(actualColumnStatistics.getBloomFilter(), expectedColumnStatistics.getBloomFilter());
    if (!bloomFiltersMatch) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected bloom filter in %s statistics", name);
    }
}
Also used : StringStatistics(io.prestosql.orc.metadata.statistics.StringStatistics) IntegerStatistics(io.prestosql.orc.metadata.statistics.IntegerStatistics)

Example 4 with StringStatistics

use of io.prestosql.orc.metadata.statistics.StringStatistics in project hetu-core by openlookeng.

the class TestOrcMetadataReader method testToStringStatistics.

/**
 * Exercises {@code OrcMetadataReader.toStringStatistics} with missing, partial and complete
 * protobuf statistics, then runs the truncation check over combinations of UTF-8 sequences.
 */
@Test
public void testToStringStatistics() {
    // ORIGINAL version only produces stats at the row group level
    OrcProto.StringStatistics antToCatSum44 = OrcProto.StringStatistics.newBuilder().setMinimum("ant").setMaximum("cat").setSum(44).build();
    assertNull(OrcMetadataReader.toStringStatistics(ORIGINAL, antToCatSum44, false));

    // having only sum should work for current version
    OrcProto.StringStatistics sumOnly = OrcProto.StringStatistics.newBuilder().setSum(45).build();
    for (boolean isRowGroup : new boolean[] {true, false}) {
        assertEquals(OrcMetadataReader.toStringStatistics(ORC_HIVE_8732, sumOnly, isRowGroup), new StringStatistics(null, null, 45));
    }
    // and the ORIGINAL version row group stats (but not rolled up stats)
    assertEquals(OrcMetadataReader.toStringStatistics(ORIGINAL, sumOnly, true), new StringStatistics(null, null, 45));

    // having only a min or max should work
    OrcProto.StringStatistics minimumOnly = OrcProto.StringStatistics.newBuilder().setMinimum("ant").build();
    assertEquals(OrcMetadataReader.toStringStatistics(ORC_HIVE_8732, minimumOnly, true), new StringStatistics(utf8Slice("ant"), null, 0));
    OrcProto.StringStatistics maximumOnly = OrcProto.StringStatistics.newBuilder().setMaximum("cat").build();
    assertEquals(OrcMetadataReader.toStringStatistics(ORC_HIVE_8732, maximumOnly, true), new StringStatistics(null, utf8Slice("cat"), 0));

    // normal full stat
    OrcProto.StringStatistics antToCatSum79 = OrcProto.StringStatistics.newBuilder().setMinimum("ant").setMaximum("cat").setSum(79).build();
    assertEquals(OrcMetadataReader.toStringStatistics(ORC_HIVE_8732, antToCatSum79, true), new StringStatistics(utf8Slice("ant"), utf8Slice("cat"), 79));

    // truncation: every prefix / test code point / suffix combination, under both writer versions
    for (Slice prefix : ALL_UTF8_SEQUENCES) {
        for (int testCodePoint : TEST_CODE_POINTS) {
            Slice encodedCodePoint = codePointToUtf8(testCodePoint);
            for (Slice suffix : ALL_UTF8_SEQUENCES) {
                Slice candidate = concatSlice(prefix, encodedCodePoint, suffix);
                testStringStatisticsTruncation(candidate, ORIGINAL);
                testStringStatisticsTruncation(candidate, ORC_HIVE_8732);
            }
        }
    }
}
Also used : StringStatistics(io.prestosql.orc.metadata.statistics.StringStatistics) Slice(io.airlift.slice.Slice) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Test(org.testng.annotations.Test)

Aggregations

StringStatistics (io.prestosql.orc.metadata.statistics.StringStatistics): 4, Slice (io.airlift.slice.Slice): 3, Slices.utf8Slice (io.airlift.slice.Slices.utf8Slice): 2, ColumnStatistics (io.prestosql.orc.metadata.statistics.ColumnStatistics): 1, IntegerStatistics (io.prestosql.orc.metadata.statistics.IntegerStatistics): 1, Test (org.testng.annotations.Test): 1