Search in sources :

Example 6 with Statistics

Use of org.apache.parquet.column.statistics.Statistics in the parquet-mr project by Apache.

Class StatisticsFilter, method visit.

/**
 * Evaluates a user-defined predicate against the statistics of the column chunk it targets.
 *
 * <p>The checks run in this order:
 * <ol>
 *   <li>no column chunk for the column: probe the predicate with {@code null};</li>
 *   <li>statistics are empty: {@code BLOCK_MIGHT_MATCH};</li>
 *   <li>the chunk is all nulls: probe the predicate with {@code null};</li>
 *   <li>statistics carry no non-null value (no min/max): {@code BLOCK_MIGHT_MATCH};</li>
 *   <li>otherwise hand min, max and comparator to the predicate.</li>
 * </ol>
 *
 * @param ud the user-defined filter node supplying the column and the predicate
 * @param inverted whether the predicate is evaluated in its inverted form
 * @return for the null probe, {@code keep(null)} when inverted and its negation otherwise;
 *         for the statistics path, the result of {@code inverseCanDrop} when inverted and
 *         of {@code canDrop} otherwise
 */
private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean visit(UserDefined<T, U> ud, boolean inverted) {
    Column<T> targetColumn = ud.getColumn();
    ColumnChunkMetaData chunk = getColumnChunk(targetColumn.getColumnPath());
    U predicate = ud.getUserDefinedPredicate();

    if (chunk == null) {
        // No chunk for this column: let the predicate decide whether it keeps a null value.
        return inverted ? predicate.keep(null) : !predicate.keep(null);
    }

    Statistics<T> chunkStats = chunk.getStatistics();
    if (chunkStats.isEmpty()) {
        // No statistics available, so no chunk can be dropped.
        return BLOCK_MIGHT_MATCH;
    }

    if (isAllNulls(chunk)) {
        // Only nulls in the chunk: let the predicate decide whether it keeps a null value.
        return inverted ? predicate.keep(null) : !predicate.keep(null);
    }

    if (!chunkStats.hasNonNullValue()) {
        // Statistics hold no min/max values, so no chunk can be dropped.
        return BLOCK_MIGHT_MATCH;
    }

    org.apache.parquet.filter2.predicate.Statistics<T> minMax =
        new org.apache.parquet.filter2.predicate.Statistics<T>(
            chunkStats.genericGetMin(), chunkStats.genericGetMax(), chunkStats.comparator());
    return inverted ? predicate.inverseCanDrop(minMax) : predicate.canDrop(minMax);
}
Also used : ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) Statistics(org.apache.parquet.column.statistics.Statistics)

Example 7 with Statistics

Use of org.apache.parquet.column.statistics.Statistics in the parquet-mr project by Apache.

Class DumpCommand, method dump.

/**
 * Writes a textual summary of one column's pages to {@code out}.
 *
 * <p>A header line reports the dotted column path, total value count and maximum
 * repetition/definition levels, followed by dictionary size and encoding when a dictionary
 * page is present. After a rule, each data page gets one line with its encodings,
 * statistics (or {@code none}), uncompressed size and value count.
 *
 * @param out destination writer
 * @param store page store from which the column's page reader is obtained
 * @param column descriptor of the column to dump
 * @throws IOException declared by the original signature
 */
public static void dump(final PrettyPrintWriter out, PageReadStore store, ColumnDescriptor column) throws IOException {
    final PageReader pages = store.getPageReader(column);
    final long totalValues = pages.getTotalValueCount();
    final int maxRepetition = column.getMaxRepetitionLevel();
    final int maxDefinition = column.getMaxDefinitionLevel();
    final String dottedPath = Joiner.on('.').skipNulls().join(column.getPath());
    out.format("%s TV=%d RL=%d DL=%d", dottedPath, totalValues, maxRepetition, maxDefinition);

    final DictionaryPage dictionary = pages.readDictionaryPage();
    if (dictionary != null) {
        out.format(" DS:%d", dictionary.getDictionarySize());
        out.format(" DE:%s", dictionary.getEncoding());
    }
    out.println();
    out.rule('-');

    long pageIndex = 0;
    DataPage current;
    // Pages are pulled one at a time until the reader returns null.
    while ((current = pages.readPage()) != null) {
        out.format("page %d:", pageIndex);
        current.accept(new Visitor<Void>() {

            @Override
            public Void visit(DataPageV1 v1) {
                out.format(" DLE:%s", v1.getDlEncoding());
                out.format(" RLE:%s", v1.getRlEncoding());
                out.format(" VLE:%s", v1.getValueEncoding());
                Statistics<?> pageStats = v1.getStatistics();
                if (pageStats == null) {
                    out.format(" ST:[none]");
                } else {
                    out.format(" ST:[%s]", pageStats);
                }
                return null;
            }

            @Override
            public Void visit(DataPageV2 v2) {
                // For V2 pages both level encodings are printed as the fixed label RLE.
                out.format(" DLE:RLE");
                out.format(" RLE:RLE");
                out.format(" VLE:%s", v2.getDataEncoding());
                Statistics<?> pageStats = v2.getStatistics();
                if (pageStats == null) {
                    out.format(" ST:[none]");
                } else {
                    out.format(" ST:[%s]", pageStats);
                }
                return null;
            }
        });
        out.format(" SZ:%d", current.getUncompressedSize());
        out.format(" VC:%d", current.getValueCount());
        out.println();
        pageIndex++;
    }
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) PageReader(org.apache.parquet.column.page.PageReader) DataPageV2(org.apache.parquet.column.page.DataPageV2) DataPageV1(org.apache.parquet.column.page.DataPageV1) Statistics(org.apache.parquet.column.statistics.Statistics) DictionaryPage(org.apache.parquet.column.page.DictionaryPage)

Example 8 with Statistics

Use of org.apache.parquet.column.statistics.Statistics in the drill project by Apache.

Class RangeExprEvaluator, method visitFunctionHolderExpression.

/**
 * Derives statistics for a function call expression, supporting only cast functions.
 *
 * <p>Returns {@code null} unless all of the following hold: the holder is a
 * {@code DrillSimpleFuncHolder}, its first registered name is a cast function, and visiting
 * the first argument yields non-null, non-empty statistics. In that case the result of
 * {@code evalCastFunc} on those statistics is returned.
 *
 * @param holderExpr the function call expression to evaluate
 * @param value unused visitor argument
 * @return statistics after applying the cast, or {@code null} when none can be derived
 */
@Override
public Statistics visitFunctionHolderExpression(FunctionHolderExpression holderExpr, Void value) throws RuntimeException {
    final FuncHolder holder = holderExpr.getHolder();
    if (!(holder instanceof DrillSimpleFuncHolder)) {
        // Only Drill functions are allowed.
        return null;
    }

    final String registeredName = ((DrillSimpleFuncHolder) holder).getRegisteredNames()[0];
    if (!CastFunctions.isCastFunction(registeredName)) {
        return null;
    }

    final Statistics argStats = holderExpr.args.get(0).accept(this, null);
    if (argStats == null || argStats.isEmpty()) {
        return null;
    }
    return evalCastFunc(holderExpr, argStats);
}
Also used : DrillSimpleFuncHolder(org.apache.drill.exec.expr.fn.DrillSimpleFuncHolder) FuncHolder(org.apache.drill.common.expression.fn.FuncHolder) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) ColumnStatistics(org.apache.drill.exec.store.parquet.stat.ColumnStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) DrillSimpleFuncHolder(org.apache.drill.exec.expr.fn.DrillSimpleFuncHolder)

Example 9 with Statistics

Use of org.apache.parquet.column.statistics.Statistics in the parquet-mr project by Apache.

Class ParquetMetadataCommand, method printColumnChunk.

/**
 * Logs a single formatted line describing one column chunk.
 *
 * <p>The line contains the dotted column name (left-aligned to {@code width}), the
 * primitive type (or {@code FIXED[n]} with the type length for fixed-length byte arrays),
 * codec, encoding summary, value count, size per value, null count (blank when statistics
 * are absent or carry no null count) and the min/max summary.
 *
 * @param console logger that receives the line at info level
 * @param width column width used for the name field
 * @param column metadata of the column chunk to describe
 * @param schema file schema used to resolve the column's primitive type and descriptor
 */
private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
    String[] pathParts = column.getPath().toArray();
    PrimitiveType primitiveType = primitive(schema, pathParts);
    Preconditions.checkNotNull(primitiveType);
    ColumnDescriptor descriptor = schema.getColumnDescription(pathParts);

    long totalSize = column.getTotalSize();
    long valueCount = column.getValueCount();
    float bytesPerValue = ((float) totalSize) / valueCount;
    CompressionCodecName codec = column.getCodec();

    Set<Encoding> encodings = column.getEncodings();
    EncodingStats encodingStats = column.getEncodingStats();
    // Prefer the encoding stats summary; fall back to the plain encoding set when it is missing.
    String encodingSummary;
    if (encodingStats != null) {
        encodingSummary = encodingStatsAsString(encodingStats);
    } else {
        encodingSummary = encodingsAsString(encodings, descriptor);
    }

    Statistics stats = column.getStatistics();
    String name = column.getPath().toDotString();
    PrimitiveType.PrimitiveTypeName typeName = primitiveType.getPrimitiveTypeName();

    // The two layouts differ only in the type cell and in the width of the per-value field.
    final boolean fixedLength = typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
    final String layout = fixedLength
        ? "%-" + width + "s  FIXED[%d] %s %-7s %-9d %-8s %-7s %s"
        : "%-" + width + "s  %-9s %s %-7s %-9d %-10s %-7s %s";
    final Object typeCell = fixedLength ? (Object) primitiveType.getTypeLength() : typeName;

    console.info(String.format(layout,
        name,
        typeCell,
        shortCodec(codec),
        encodingSummary,
        valueCount,
        humanReadable(bytesPerValue),
        (stats != null && stats.isNumNullsSet()) ? String.valueOf(stats.getNumNulls()) : "",
        minMaxAsString(stats, primitiveType.getOriginalType())));
}
Also used : EncodingStats(org.apache.parquet.column.EncodingStats) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PrimitiveType(org.apache.parquet.schema.PrimitiveType) Encoding(org.apache.parquet.column.Encoding) Util.minMaxAsString(org.apache.parquet.cli.Util.minMaxAsString) Util.encodingsAsString(org.apache.parquet.cli.Util.encodingsAsString) Util.encodingStatsAsString(org.apache.parquet.cli.Util.encodingStatsAsString) Statistics(org.apache.parquet.column.statistics.Statistics)

Example 10 with Statistics

Use of org.apache.parquet.column.statistics.Statistics in the parquet-mr project by Apache.

Class TestParquetMetadataConverter, method testIgnoreStatsWithSignedSortOrder.

/**
 * Checks that binary statistics for a UTF8-annotated column come back empty after a round
 * trip through {@code StatsHelper.V1.toParquetStatistics} and
 * {@code ParquetMetadataConverter.fromParquetStatistics}.
 *
 * <p>The source statistics hold the values {@code "A"} and {@code "z"} interleaved with
 * three null increments.
 */
@Test
public void testIgnoreStatsWithSignedSortOrder() {
    final PrimitiveType utf8Binary = Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("b");

    // Nulls and values are interleaved: null, "A", null, "z", null.
    final BinaryStatistics written = new BinaryStatistics();
    written.incrementNumNulls();
    written.updateStats(Binary.fromString("A"));
    written.incrementNumNulls();
    written.updateStats(Binary.fromString("z"));
    written.incrementNumNulls();

    final ParquetMetadataConverter converter = new ParquetMetadataConverter();
    Statistics readBack = converter.fromParquetStatistics(
        Version.FULL_VERSION, StatsHelper.V1.toParquetStatistics(written), utf8Binary);

    Assert.assertTrue("Stats should be empty: " + readBack, readBack.isEmpty());
}
Also used : BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) PrimitiveType(org.apache.parquet.schema.PrimitiveType) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) Test(org.junit.Test)

Aggregations

Statistics (org.apache.parquet.column.statistics.Statistics) 20, IntStatistics (org.apache.parquet.column.statistics.IntStatistics) 14, LongStatistics (org.apache.parquet.column.statistics.LongStatistics) 14, DoubleStatistics (org.apache.parquet.column.statistics.DoubleStatistics) 12, FloatStatistics (org.apache.parquet.column.statistics.FloatStatistics) 12, BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics) 11, PrimitiveType (org.apache.parquet.schema.PrimitiveType) 11, BooleanStatistics (org.apache.parquet.column.statistics.BooleanStatistics) 9, TypeProtos (org.apache.drill.common.types.TypeProtos) 6, HashMap (java.util.HashMap) 5, Stopwatch (com.google.common.base.Stopwatch) 4, SchemaPath (org.apache.drill.common.expression.SchemaPath) 4, ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor) 4, Slice (io.airlift.slice.Slice) 2, BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData) 2, ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) 2, Binary (org.apache.parquet.io.api.Binary) 2, Test (org.junit.Test) 2, Domain (com.facebook.presto.common.predicate.Domain) 1, Range (com.facebook.presto.common.predicate.Range) 1