Search in sources :

Example 31 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by apache.

the class DataWritableReadSupport method init.

/**
 * Builds the Parquet {@code ReadContext} during the init phase, projecting the table schema onto
 * the columns requested through the configuration.
 *
 * <p>When {@code IOConstants.COLUMNS} is absent from the configuration, the file schema is used
 * unchanged, both as the requested schema and as the value stored under {@code HIVE_SCHEMA_KEY}.
 *
 * @param configuration configuration holding the column list and the wanted column ids
 * @param keyValueMetaData file key/value metadata; not read by this method
 * @param fileSchema parquet file schema
 * @return the parquet ReadContext carrying the requested schema and the schema metadata
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration, final Map<String, String> keyValueMetaData, final MessageType fileSchema) {
    final Map<String, String> metadata = new HashMap<String, String>();
    final String columnNames = configuration.get(IOConstants.COLUMNS);

    // No column list configured: hand the file schema back untouched.
    if (columnNames == null) {
        metadata.put(HIVE_SCHEMA_KEY, fileSchema.toString());
        return new ReadContext(fileSchema, metadata);
    }

    // Build the table schema: one type per listed column, in listing order.
    final List<String> tableColumns = getColumns(columnNames);
    final List<Type> tableTypes = new ArrayList<Type>();
    for (final String columnName : tableColumns) {
        // The column list also contains partition columns, which are metadata only.
        final Type columnType;
        if (fileSchema.containsField(columnName)) {
            columnType = fileSchema.getType(columnName);
        } else {
            // A column missing from the file gets an optional binary placeholder (schema evolution).
            columnType = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, columnName);
        }
        tableTypes.add(columnType);
    }
    final MessageType tableSchema = new MessageType(TABLE_SCHEMA, tableTypes);
    metadata.put(HIVE_SCHEMA_KEY, tableSchema.toString());

    // Narrow the table schema down to the column ids the reader actually wants.
    final List<Type> wantedTypes = new ArrayList<Type>();
    for (final Integer wantedIndex : ColumnProjectionUtils.getReadColumnIDs(configuration)) {
        final String wantedName = tableColumns.get(wantedIndex);
        wantedTypes.add(tableSchema.getType(wantedName));
    }
    final MessageType projection = new MessageType(fileSchema.getName(), wantedTypes);
    final MessageType requestedSchema = resolveSchemaAccess(projection, fileSchema, configuration);
    return new ReadContext(requestedSchema, metadata);
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType)

Example 32 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project drill by apache.

the class DrillParquetGroupConverter method getVarDecimalConverter.

/**
 * Creates the converter for a decimal column, using the precision and scale taken from the
 * type's decimal metadata.
 *
 * @param name field name passed to {@code getWriter}
 * @param type parquet primitive type carrying the decimal metadata
 * @return a {@code DrillVarDecimalConverter} bound to the writer obtained for the field
 */
private PrimitiveConverter getVarDecimalConverter(String name, PrimitiveType type) {
    int scale = type.getDecimalMetadata().getScale();
    int precision = type.getDecimalMetadata().getPrecision();

    VarDecimalWriter decimalWriter;
    if (type.isRepetition(Repetition.REPEATED)) {
        // Repeated fields are written through a list writer.
        decimalWriter = getWriter(name, (m, f) -> m.list(f).varDecimal(precision, scale), l -> l.list().varDecimal(precision, scale));
    } else {
        decimalWriter = getWriter(name, (m, f) -> m.varDecimal(f, precision, scale), l -> l.varDecimal(precision, scale));
    }
    return new DrillVarDecimalConverter(decimalWriter, precision, scale, mutator.getManagedBuffer());
}
Also used : IntervalHolder(org.apache.drill.exec.expr.holders.IntervalHolder) PrimitiveType(org.apache.parquet.schema.PrimitiveType) VarDecimalHolder(org.apache.drill.exec.expr.holders.VarDecimalHolder) SingleMapWriter(org.apache.drill.exec.vector.complex.impl.SingleMapWriter) BiFunction(java.util.function.BiFunction) VarCharHolder(org.apache.drill.exec.expr.holders.VarCharHolder) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) OutputMutator(org.apache.drill.exec.physical.impl.OutputMutator) VarCharWriter(org.apache.drill.exec.vector.complex.writer.VarCharWriter) PathSegment(org.apache.drill.common.expression.PathSegment) TimeStampWriter(org.apache.drill.exec.vector.complex.writer.TimeStampWriter) PrimitiveConverter(org.apache.parquet.io.api.PrimitiveConverter) DateHolder(org.apache.drill.exec.expr.holders.DateHolder) DrillBuf(io.netty.buffer.DrillBuf) BigIntHolder(org.apache.drill.exec.expr.holders.BigIntHolder) VarBinaryWriter(org.apache.drill.exec.vector.complex.writer.VarBinaryWriter) BigIntWriter(org.apache.drill.exec.vector.complex.writer.BigIntWriter) AbstractRepeatedMapWriter(org.apache.drill.exec.vector.complex.impl.AbstractRepeatedMapWriter) IntWriter(org.apache.drill.exec.vector.complex.writer.IntWriter) GroupType(org.apache.parquet.schema.GroupType) GroupConverter(org.apache.parquet.io.api.GroupConverter) VarDecimalWriter(org.apache.drill.exec.vector.complex.writer.VarDecimalWriter) Collection(java.util.Collection) SchemaPath(org.apache.drill.common.expression.SchemaPath) Float4Writer(org.apache.drill.exec.vector.complex.writer.Float4Writer) TimeWriter(org.apache.drill.exec.vector.complex.writer.TimeWriter) BaseWriter(org.apache.drill.exec.vector.complex.writer.BaseWriter) MapWriter(org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter) Binary(org.apache.parquet.io.api.Binary) Longs(org.apache.drill.shaded.guava.com.google.common.primitives.Longs) List(java.util.List) 
DYNAMIC_STAR(org.apache.drill.common.expression.SchemaPath.DYNAMIC_STAR) Optional(java.util.Optional) Type(org.apache.parquet.schema.Type) ExecConstants(org.apache.drill.exec.ExecConstants) DateWriter(org.apache.drill.exec.vector.complex.writer.DateWriter) BitHolder(org.apache.drill.exec.expr.holders.BitHolder) OptionManager(org.apache.drill.exec.server.options.OptionManager) Ints(org.apache.drill.shaded.guava.com.google.common.primitives.Ints) BitWriter(org.apache.drill.exec.vector.complex.writer.BitWriter) Float8Writer(org.apache.drill.exec.vector.complex.writer.Float8Writer) Converter(org.apache.parquet.io.api.Converter) ListWriter(org.apache.drill.exec.vector.complex.writer.BaseWriter.ListWriter) Repetition(org.apache.parquet.schema.Type.Repetition) DrillRuntimeException(org.apache.drill.common.exceptions.DrillRuntimeException) LogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation) Function(java.util.function.Function) Supplier(java.util.function.Supplier) Float8Holder(org.apache.drill.exec.expr.holders.Float8Holder) VarBinaryHolder(org.apache.drill.exec.expr.holders.VarBinaryHolder) ArrayList(java.util.ArrayList) Float4Holder(org.apache.drill.exec.expr.holders.Float4Holder) NanoTimeUtils.getDateTimeValueFromBinary(org.apache.drill.exec.store.parquet.ParquetReaderUtility.NanoTimeUtils.getDateTimeValueFromBinary) TimeHolder(org.apache.drill.exec.expr.holders.TimeHolder) IntHolder(org.apache.drill.exec.expr.holders.IntHolder) DictWriter(org.apache.drill.exec.vector.complex.writer.BaseWriter.DictWriter) Iterator(java.util.Iterator) IntervalWriter(org.apache.drill.exec.vector.complex.writer.IntervalWriter) ParquetColumnMetadata(org.apache.drill.exec.store.parquet.columnreaders.ParquetColumnMetadata) Collections(java.util.Collections) TimeStampHolder(org.apache.drill.exec.expr.holders.TimeStampHolder) DateTimeConstants(org.joda.time.DateTimeConstants) VarDecimalWriter(org.apache.drill.exec.vector.complex.writer.VarDecimalWriter)

Example 33 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project drill by apache.

the class DrillParquetGroupConverter method getConverterForType.

/**
 * Selects the {@link PrimitiveConverter} for a parquet primitive column, based on its physical
 * type and, where present, its original/logical type annotation.
 *
 * <p>For repeated columns the writer is obtained through a list writer; otherwise it is obtained
 * directly for the named field.
 *
 * @param name field name passed to the writer factories
 * @param type parquet primitive type of the column
 * @return the converter matching the column's physical type and annotation
 * @throws UnsupportedOperationException if the physical type, or its combination with the
 *         annotation, is not handled here
 * @throws DrillRuntimeException if the date corruption status is not one of the recognized values
 */
protected PrimitiveConverter getConverterForType(String name, PrimitiveType type) {
    switch(type.getPrimitiveTypeName()) {
        case INT32:
            {
                // Unannotated INT32 is a plain int.
                if (type.getOriginalType() == null) {
                    return getIntConverter(name, type);
                }
                switch(type.getOriginalType()) {
                    case UINT_8:
                    case UINT_16:
                    case UINT_32:
                    case INT_8:
                    case INT_16:
                    case INT_32:
                        {
                            return getIntConverter(name, type);
                        }
                    case DECIMAL:
                        {
                            ParquetReaderUtility.checkDecimalTypeEnabled(options);
                            return getVarDecimalConverter(name, type);
                        }
                    case DATE:
                        {
                            DateWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).date(), l -> l.list().date()) : getWriter(name, (m, f) -> m.date(f), l -> l.date());
                            // The date converter depends on the detected date corruption status.
                            switch(containsCorruptedDates) {
                                case META_SHOWS_CORRUPTION:
                                    return new DrillCorruptedDateConverter(writer);
                                case META_SHOWS_NO_CORRUPTION:
                                    return new DrillDateConverter(writer);
                                case META_UNCLEAR_TEST_VALUES:
                                    return new CorruptionDetectingDateConverter(writer);
                                default:
                                    throw new DrillRuntimeException(String.format("Issue setting up parquet reader for date type, " + "unrecognized date corruption status %s. See DRILL-4203 for more info.", containsCorruptedDates));
                            }
                        }
                    case TIME_MILLIS:
                        {
                            TimeWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).time(), l -> l.list().time()) : getWriter(name, (m, f) -> m.time(f), l -> l.time());
                            return new DrillTimeConverter(writer);
                        }
                    default:
                        {
                            throw new UnsupportedOperationException("Unsupported type: " + type.getOriginalType());
                        }
                }
            }
        case INT64:
            {
                // Unannotated INT64 is a plain bigint.
                if (type.getOriginalType() == null) {
                    return getBigIntConverter(name, type);
                }
                switch(type.getOriginalType()) {
                    case UINT_64:
                    case INT_64:
                        return getBigIntConverter(name, type);
                    case TIMESTAMP_MICROS:
                        {
                            TimeStampWriter writer = getTimeStampWriter(name, type);
                            return new DrillTimeStampMicrosConverter(writer);
                        }
                    case TIME_MICROS:
                        {
                            TimeWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).time(), l -> l.list().time()) : getWriter(name, MapWriter::time, ListWriter::time);
                            return new DrillTimeMicrosConverter(writer);
                        }
                    case DECIMAL:
                        {
                            ParquetReaderUtility.checkDecimalTypeEnabled(options);
                            return getVarDecimalConverter(name, type);
                        }
                    case TIMESTAMP_MILLIS:
                        {
                            TimeStampWriter writer = getTimeStampWriter(name, type);
                            return new DrillTimeStampConverter(writer);
                        }
                    default:
                        {
                            throw new UnsupportedOperationException("Unsupported type " + type.getOriginalType());
                        }
                }
            }
        case INT96:
            {
                // TODO: replace null with TIMESTAMP_NANOS once parquet support such type annotation.
                if (type.getOriginalType() == null) {
                    // The option decides whether INT96 is surfaced as a timestamp or as raw bytes.
                    if (options.getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP).bool_val) {
                        TimeStampWriter writer = getTimeStampWriter(name, type);
                        return new DrillFixedBinaryToTimeStampConverter(writer);
                    } else {
                        VarBinaryWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).varBinary(), l -> l.list().varBinary()) : getWriter(name, (m, f) -> m.varBinary(f), listWriter -> listWriter.varBinary());
                        return new DrillFixedBinaryToVarbinaryConverter(writer, ParquetColumnMetadata.getTypeLengthInBits(type.getPrimitiveTypeName()) / 8, mutator.getManagedBuffer());
                    }
                }
                // An annotated INT96 is not handled; fail explicitly instead of falling through
                // into the FLOAT case and building a float4 converter for it.
                throw new UnsupportedOperationException("Unsupported type: " + type.getOriginalType());
            }
        case FLOAT:
            {
                Float4Writer writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).float4(), l -> l.list().float4()) : getWriter(name, (m, f) -> m.float4(f), l -> l.float4());
                return new DrillFloat4Converter(writer);
            }
        case DOUBLE:
            {
                Float8Writer writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).float8(), l -> l.list().float8()) : getWriter(name, (m, f) -> m.float8(f), l -> l.float8());
                return new DrillFloat8Converter(writer);
            }
        case BOOLEAN:
            {
                BitWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).bit(), l -> l.list().bit()) : getWriter(name, (m, f) -> m.bit(f), l -> l.bit());
                return new DrillBoolConverter(writer);
            }
        case BINARY:
            {
                // Decimal, string and enum annotations get dedicated converters.
                LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<PrimitiveConverter> typeAnnotationVisitor = new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<PrimitiveConverter>() {

                    @Override
                    public Optional<PrimitiveConverter> visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
                        ParquetReaderUtility.checkDecimalTypeEnabled(options);
                        return Optional.of(getVarDecimalConverter(name, type));
                    }

                    @Override
                    public Optional<PrimitiveConverter> visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
                        return Optional.of(getVarCharConverter(name, type));
                    }

                    @Override
                    public Optional<PrimitiveConverter> visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation stringLogicalType) {
                        return Optional.of(getVarCharConverter(name, type));
                    }
                };
                // Fallback used when there is no annotation or the visitor yields no converter.
                Supplier<PrimitiveConverter> converterSupplier = () -> {
                    VarBinaryWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).varBinary(), l -> l.list().varBinary()) : getWriter(name, MapWriter::varBinary, ListWriter::varBinary);
                    return new DrillVarBinaryConverter(writer, mutator.getManagedBuffer());
                };
                return Optional.ofNullable(type.getLogicalTypeAnnotation()).map(typeAnnotation -> typeAnnotation.accept(typeAnnotationVisitor)).flatMap(Function.identity()).orElseGet(converterSupplier);
            }
        case FIXED_LEN_BYTE_ARRAY:
            // Decimal and interval annotations get dedicated converters.
            LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<PrimitiveConverter> typeAnnotationVisitor = new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<PrimitiveConverter>() {

                @Override
                public Optional<PrimitiveConverter> visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
                    ParquetReaderUtility.checkDecimalTypeEnabled(options);
                    return Optional.of(getVarDecimalConverter(name, type));
                }

                @Override
                public Optional<PrimitiveConverter> visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) {
                    IntervalWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).interval(), l -> l.list().interval()) : getWriter(name, MapWriter::interval, ListWriter::interval);
                    return Optional.of(new DrillFixedLengthByteArrayToInterval(writer));
                }
            };
            // Fallback used when there is no annotation or the visitor yields no converter.
            Supplier<PrimitiveConverter> converterSupplier = () -> {
                VarBinaryWriter writer = type.isRepetition(Repetition.REPEATED) ? getWriter(name, (m, f) -> m.list(f).varBinary(), l -> l.list().varBinary()) : getWriter(name, MapWriter::varBinary, ListWriter::varBinary);
                return new DrillFixedBinaryToVarbinaryConverter(writer, type.getTypeLength(), mutator.getManagedBuffer());
            };
            return Optional.ofNullable(type.getLogicalTypeAnnotation()).map(typeAnnotation -> typeAnnotation.accept(typeAnnotationVisitor)).flatMap(Function.identity()).orElseGet(converterSupplier);
        default:
            throw new UnsupportedOperationException("Unsupported type: " + type.getPrimitiveTypeName());
    }
}
Also used : IntervalHolder(org.apache.drill.exec.expr.holders.IntervalHolder) PrimitiveType(org.apache.parquet.schema.PrimitiveType) VarDecimalHolder(org.apache.drill.exec.expr.holders.VarDecimalHolder) SingleMapWriter(org.apache.drill.exec.vector.complex.impl.SingleMapWriter) BiFunction(java.util.function.BiFunction) VarCharHolder(org.apache.drill.exec.expr.holders.VarCharHolder) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) OutputMutator(org.apache.drill.exec.physical.impl.OutputMutator) VarCharWriter(org.apache.drill.exec.vector.complex.writer.VarCharWriter) PathSegment(org.apache.drill.common.expression.PathSegment) TimeStampWriter(org.apache.drill.exec.vector.complex.writer.TimeStampWriter) PrimitiveConverter(org.apache.parquet.io.api.PrimitiveConverter) DateHolder(org.apache.drill.exec.expr.holders.DateHolder) DrillBuf(io.netty.buffer.DrillBuf) BigIntHolder(org.apache.drill.exec.expr.holders.BigIntHolder) VarBinaryWriter(org.apache.drill.exec.vector.complex.writer.VarBinaryWriter) BigIntWriter(org.apache.drill.exec.vector.complex.writer.BigIntWriter) AbstractRepeatedMapWriter(org.apache.drill.exec.vector.complex.impl.AbstractRepeatedMapWriter) IntWriter(org.apache.drill.exec.vector.complex.writer.IntWriter) GroupType(org.apache.parquet.schema.GroupType) GroupConverter(org.apache.parquet.io.api.GroupConverter) VarDecimalWriter(org.apache.drill.exec.vector.complex.writer.VarDecimalWriter) Collection(java.util.Collection) SchemaPath(org.apache.drill.common.expression.SchemaPath) Float4Writer(org.apache.drill.exec.vector.complex.writer.Float4Writer) TimeWriter(org.apache.drill.exec.vector.complex.writer.TimeWriter) BaseWriter(org.apache.drill.exec.vector.complex.writer.BaseWriter) MapWriter(org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter) Binary(org.apache.parquet.io.api.Binary) Longs(org.apache.drill.shaded.guava.com.google.common.primitives.Longs) List(java.util.List) 
DYNAMIC_STAR(org.apache.drill.common.expression.SchemaPath.DYNAMIC_STAR) Optional(java.util.Optional) Type(org.apache.parquet.schema.Type) ExecConstants(org.apache.drill.exec.ExecConstants) DateWriter(org.apache.drill.exec.vector.complex.writer.DateWriter) BitHolder(org.apache.drill.exec.expr.holders.BitHolder) OptionManager(org.apache.drill.exec.server.options.OptionManager) Ints(org.apache.drill.shaded.guava.com.google.common.primitives.Ints) BitWriter(org.apache.drill.exec.vector.complex.writer.BitWriter) Float8Writer(org.apache.drill.exec.vector.complex.writer.Float8Writer) Converter(org.apache.parquet.io.api.Converter) ListWriter(org.apache.drill.exec.vector.complex.writer.BaseWriter.ListWriter) Repetition(org.apache.parquet.schema.Type.Repetition) DrillRuntimeException(org.apache.drill.common.exceptions.DrillRuntimeException) LogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation) Function(java.util.function.Function) Supplier(java.util.function.Supplier) Float8Holder(org.apache.drill.exec.expr.holders.Float8Holder) VarBinaryHolder(org.apache.drill.exec.expr.holders.VarBinaryHolder) ArrayList(java.util.ArrayList) Float4Holder(org.apache.drill.exec.expr.holders.Float4Holder) NanoTimeUtils.getDateTimeValueFromBinary(org.apache.drill.exec.store.parquet.ParquetReaderUtility.NanoTimeUtils.getDateTimeValueFromBinary) TimeHolder(org.apache.drill.exec.expr.holders.TimeHolder) IntHolder(org.apache.drill.exec.expr.holders.IntHolder) DictWriter(org.apache.drill.exec.vector.complex.writer.BaseWriter.DictWriter) Iterator(java.util.Iterator) IntervalWriter(org.apache.drill.exec.vector.complex.writer.IntervalWriter) ParquetColumnMetadata(org.apache.drill.exec.store.parquet.columnreaders.ParquetColumnMetadata) Collections(java.util.Collections) TimeStampHolder(org.apache.drill.exec.expr.holders.TimeStampHolder) DateTimeConstants(org.joda.time.DateTimeConstants) BitWriter(org.apache.drill.exec.vector.complex.writer.BitWriter) 
SingleMapWriter(org.apache.drill.exec.vector.complex.impl.SingleMapWriter) AbstractRepeatedMapWriter(org.apache.drill.exec.vector.complex.impl.AbstractRepeatedMapWriter) MapWriter(org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter) Float4Writer(org.apache.drill.exec.vector.complex.writer.Float4Writer) TimeWriter(org.apache.drill.exec.vector.complex.writer.TimeWriter) VarBinaryWriter(org.apache.drill.exec.vector.complex.writer.VarBinaryWriter) TimeStampWriter(org.apache.drill.exec.vector.complex.writer.TimeStampWriter) LogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation) ListWriter(org.apache.drill.exec.vector.complex.writer.BaseWriter.ListWriter) DateWriter(org.apache.drill.exec.vector.complex.writer.DateWriter) Supplier(java.util.function.Supplier) DrillRuntimeException(org.apache.drill.common.exceptions.DrillRuntimeException) IntervalWriter(org.apache.drill.exec.vector.complex.writer.IntervalWriter) Optional(java.util.Optional) Float8Writer(org.apache.drill.exec.vector.complex.writer.Float8Writer) PrimitiveConverter(org.apache.parquet.io.api.PrimitiveConverter)

Example 34 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project drill by apache.

the class ParquetSchemaMerge method main.

/**
 * Builds two small schemas that both nest a group {@code b} inside a group {@code a} under
 * {@code root}, unions them and logs the resulting schema.
 *
 * @param args command-line arguments; not read
 */
public static void main(String[] args) {
    // First schema: root -> optional group "a" -> required group "b".
    // NOTE(review): the INT32 leaf "c" is constructed but never attached to group "b";
    // confirm whether that is intended.
    PrimitiveType leafC = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT32, "c");
    GroupType firstB = new GroupType(Repetition.REQUIRED, "b");
    GroupType firstA = new GroupType(Repetition.OPTIONAL, "a", firstB);
    MessageType message1 = new MessageType("root", firstA);

    // Second schema: root -> optional group "a" -> optional group "b" -> optional INT32 "d".
    PrimitiveType leafD = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT32, "d");
    GroupType secondB = new GroupType(Repetition.OPTIONAL, "b", leafD);
    GroupType secondA = new GroupType(Repetition.OPTIONAL, "a", secondB);
    MessageType message2 = new MessageType("root", secondA);

    // Union the two schemas and log the result.
    MessageType merged = message1.union(message2);
    StringBuilder rendered = new StringBuilder();
    merged.writeToStringBuilder(rendered, "");
    logger.info(rendered.toString());
}
Also used : GroupType(org.apache.parquet.schema.GroupType) PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType)

Example 35 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project presto by prestodb.

the class TestColumnIndexBuilder method testBuildBinaryUtf8.

/**
 * Exercises {@code ColumnIndexBuilder} for a required BINARY column annotated as UTF8.
 *
 * <p>Three sequences of eight pages are fed to a fresh builder each time; the built column index
 * is expected to report UNORDERED, ASCENDING and DESCENDING boundary order respectively. For each
 * sequence the test checks the page count, min/max size, null counts, null pages, min/max values
 * and the pages selected by a set of filter predicates.
 */
@Test
public void testBuildBinaryUtf8() {
    PrimitiveType type = Types.required(BINARY).as(UTF8).named("test_binary_utf8");
    ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    // assertThat(builder, instanceOf(BinaryColumnIndexBuilder.class));
    // With no pages added yet, build() is expected to return null.
    assertNull(builder.build());
    Operators.BinaryColumn col = binaryColumn("test_col");

    // --- Scenario 1: pages whose boundaries are expected to be UNORDERED ---
    StatsBuilder sb = new StatsBuilder();
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, stringBinary("Jeltz"), stringBinary("Slartibartfast"), null, null));
    builder.add(sb.stats(type, null, null, null, null, null));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, stringBinary("Beeblebrox"), stringBinary("Prefect")));
    builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Trilian"), null));
    builder.add(sb.stats(type, stringBinary("Beeblebrox")));
    builder.add(sb.stats(type, null, null));
    assertEquals(8, builder.getPageCount());
    assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
    ColumnIndex columnIndex = builder.build();
    assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder());
    assertCorrectNullCounts(columnIndex, 2, 2, 5, 2, 0, 1, 0, 2);
    assertCorrectNullPages(columnIndex, true, false, true, true, false, false, false, true);
    assertCorrectValues(columnIndex.getMaxValues(), null, stringBinary("Slartibartfast"), null, null, stringBinary("Prefect"), stringBinary("Trilian"), stringBinary("Beeblebrox"), null);
    assertCorrectValues(columnIndex.getMinValues(), null, stringBinary("Jeltz"), null, null, stringBinary("Beeblebrox"), stringBinary("Dent"), stringBinary("Beeblebrox"), null);
    // Trailing integers are the page indexes each predicate is expected to select.
    assertCorrectFiltering(columnIndex, eq(col, stringBinary("Marvin")), 1, 4, 5);
    assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 5, 7);
    assertCorrectFiltering(columnIndex, notEq(col, stringBinary("Beeblebrox")), 0, 1, 2, 3, 4, 5, 7);
    assertCorrectFiltering(columnIndex, notEq(col, null), 1, 4, 5, 6);
    assertCorrectFiltering(columnIndex, gt(col, stringBinary("Prefect")), 1, 5);
    assertCorrectFiltering(columnIndex, gtEq(col, stringBinary("Prefect")), 1, 4, 5);
    assertCorrectFiltering(columnIndex, lt(col, stringBinary("Dent")), 4, 6);
    assertCorrectFiltering(columnIndex, ltEq(col, stringBinary("Dent")), 4, 5, 6);
    assertCorrectFiltering(columnIndex, userDefined(col, BinaryUtf8StartsWithB.class), 4, 6);
    assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryUtf8StartsWithB.class)), 0, 1, 2, 3, 4, 5, 7);

    // --- Scenario 2: fresh builder, pages whose boundaries are expected to be ASCENDING ---
    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    sb = new StatsBuilder();
    builder.add(sb.stats(type, stringBinary("Beeblebrox"), stringBinary("Dent"), null, null));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, null, null, null, null, null));
    builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Jeltz")));
    builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Prefect"), null));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, stringBinary("Slartibartfast")));
    builder.add(sb.stats(type, null, null));
    assertEquals(8, builder.getPageCount());
    assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
    columnIndex = builder.build();
    assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
    assertCorrectNullCounts(columnIndex, 2, 2, 5, 0, 1, 2, 0, 2);
    assertCorrectNullPages(columnIndex, false, true, true, false, false, true, false, true);
    assertCorrectValues(columnIndex.getMaxValues(), stringBinary("Dent"), null, null, stringBinary("Jeltz"), stringBinary("Prefect"), null, stringBinary("Slartibartfast"), null);
    assertCorrectValues(columnIndex.getMinValues(), stringBinary("Beeblebrox"), null, null, stringBinary("Dent"), stringBinary("Dent"), null, stringBinary("Slartibartfast"), null);
    assertCorrectFiltering(columnIndex, eq(col, stringBinary("Jeltz")), 3, 4);
    assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 4, 5, 7);
    assertCorrectFiltering(columnIndex, notEq(col, stringBinary("Slartibartfast")), 0, 1, 2, 3, 4, 5, 7);
    assertCorrectFiltering(columnIndex, notEq(col, null), 0, 3, 4, 6);
    assertCorrectFiltering(columnIndex, gt(col, stringBinary("Marvin")), 4, 6);
    assertCorrectFiltering(columnIndex, gtEq(col, stringBinary("Marvin")), 4, 6);
    assertCorrectFiltering(columnIndex, lt(col, stringBinary("Dent")), 0);
    assertCorrectFiltering(columnIndex, ltEq(col, stringBinary("Dent")), 0, 3, 4);
    assertCorrectFiltering(columnIndex, userDefined(col, BinaryUtf8StartsWithB.class), 0);
    assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryUtf8StartsWithB.class)), 0, 1, 2, 3, 4, 5, 6, 7);

    // --- Scenario 3: fresh builder, pages whose boundaries are expected to be DESCENDING ---
    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    sb = new StatsBuilder();
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, stringBinary("Slartibartfast")));
    builder.add(sb.stats(type, null, null, null, null, null));
    builder.add(sb.stats(type, stringBinary("Prefect"), stringBinary("Jeltz"), null));
    builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Dent")));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Beeblebrox"), null, null));
    assertEquals(8, builder.getPageCount());
    assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
    columnIndex = builder.build();
    assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
    assertCorrectNullCounts(columnIndex, 2, 0, 5, 1, 0, 2, 2, 2);
    assertCorrectNullPages(columnIndex, true, false, true, false, false, true, true, false);
    assertCorrectValues(columnIndex.getMaxValues(), null, stringBinary("Slartibartfast"), null, stringBinary("Prefect"), stringBinary("Dent"), null, null, stringBinary("Dent"));
    assertCorrectValues(columnIndex.getMinValues(), null, stringBinary("Slartibartfast"), null, stringBinary("Jeltz"), stringBinary("Dent"), null, null, stringBinary("Beeblebrox"));
    assertCorrectFiltering(columnIndex, eq(col, stringBinary("Marvin")), 3);
    assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 5, 6, 7);
    assertCorrectFiltering(columnIndex, notEq(col, stringBinary("Dent")), 0, 1, 2, 3, 5, 6, 7);
    assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 4, 7);
    assertCorrectFiltering(columnIndex, gt(col, stringBinary("Prefect")), 1);
    assertCorrectFiltering(columnIndex, gtEq(col, stringBinary("Prefect")), 1, 3);
    assertCorrectFiltering(columnIndex, lt(col, stringBinary("Marvin")), 3, 4, 7);
    assertCorrectFiltering(columnIndex, ltEq(col, stringBinary("Marvin")), 3, 4, 7);
    assertCorrectFiltering(columnIndex, userDefined(col, BinaryUtf8StartsWithB.class), 7);
    assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryUtf8StartsWithB.class)), 0, 1, 2, 3, 4, 5, 6, 7);
}
Also used : Operators(org.apache.parquet.filter2.predicate.Operators) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) ColumnIndexBuilder(org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder) PrimitiveType(org.apache.parquet.schema.PrimitiveType) Test(org.testng.annotations.Test)

Aggregations

PrimitiveType (org.apache.parquet.schema.PrimitiveType)123 Test (org.junit.Test)66 MessageType (org.apache.parquet.schema.MessageType)41 HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable)28 BooleanWritable (org.apache.hadoop.io.BooleanWritable)28 BytesWritable (org.apache.hadoop.io.BytesWritable)28 DoubleWritable (org.apache.hadoop.io.DoubleWritable)28 FloatWritable (org.apache.hadoop.io.FloatWritable)28 IntWritable (org.apache.hadoop.io.IntWritable)28 LongWritable (org.apache.hadoop.io.LongWritable)28 Writable (org.apache.hadoop.io.Writable)28 GroupType (org.apache.parquet.schema.GroupType)25 Test (org.testng.annotations.Test)20 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)14 OriginalType (org.apache.parquet.schema.OriginalType)14 Type (org.apache.parquet.schema.Type)12 List (java.util.List)11 ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex)11 ColumnIndexBuilder (org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder)11 ArrayList (java.util.ArrayList)10