Example 46 with Type

use of org.apache.parquet.schema.Type in project hive by apache.

the class DataWritableReadSupport method projectLeafTypes.

private static List<Type> projectLeafTypes(List<Type> types, List<FieldNode> nodes) {
    List<Type> res = new ArrayList<>();
    if (nodes.isEmpty()) {
        return res;
    }
    Map<String, FieldNode> fieldMap = new HashMap<>();
    for (FieldNode n : nodes) {
        fieldMap.put(n.getFieldName().toLowerCase(), n);
    }
    for (Type type : types) {
        String tn = type.getName().toLowerCase();
        if (fieldMap.containsKey(tn)) {
            FieldNode f = fieldMap.get(tn);
            if (f.getNodes().isEmpty()) {
                // no child, no need for pruning
                res.add(type);
            } else {
                if (type instanceof GroupType) {
                    GroupType groupType = type.asGroupType();
                    List<Type> ts = projectLeafTypes(groupType.getFields(), f.getNodes());
                    GroupType g = buildProjectedGroupType(groupType, ts);
                    if (g != null) {
                        res.add(g);
                    }
                } else {
                    throw new RuntimeException("Primitive type " + f.getFieldName() + "should not " + "doesn't match type" + f.toString());
                }
            }
        }
    }
    return res;
}
Also used: OriginalType (org.apache.parquet.schema.OriginalType), GroupType (org.apache.parquet.schema.GroupType), MessageType (org.apache.parquet.schema.MessageType), Type (org.apache.parquet.schema.Type), FieldNode (org.apache.hadoop.hive.ql.optimizer.FieldNode), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList)
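
To make the pruning concrete, here is a minimal, self-contained sketch of the same leaf-projection idea. Hive's FieldNode is internal, so a hypothetical Field stand-in is used in its place; the Types builder and GroupType.withNewFields below are the real Parquet schema API.

import java.util.ArrayList;
import java.util.List;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

public class LeafProjectionSketch {

    // Hypothetical stand-in for Hive's FieldNode: a field name plus child fields.
    static class Field {
        final String name;
        final List<Field> children = new ArrayList<>();
        Field(String name, Field... kids) {
            this.name = name;
            for (Field k : kids) {
                children.add(k);
            }
        }
    }

    // Keep only the requested fields, recursing into group types.
    static List<Type> project(List<Type> types, List<Field> wanted) {
        List<Type> res = new ArrayList<>();
        for (Type t : types) {
            for (Field f : wanted) {
                if (!f.name.equalsIgnoreCase(t.getName())) {
                    continue;
                }
                if (f.children.isEmpty()) {
                    // No children requested: keep the whole subtree.
                    res.add(t);
                } else {
                    // Rebuild the group with only the surviving children.
                    GroupType g = t.asGroupType();
                    List<Type> kept = project(g.getFields(), f.children);
                    if (!kept.isEmpty()) {
                        res.add(g.withNewFields(kept));
                    }
                }
            }
        }
        return res;
    }

    public static void main(String[] args) {
        MessageType schema = Types.buildMessage()
            .required(PrimitiveTypeName.INT64).named("id")
            .requiredGroup()
                .required(PrimitiveTypeName.BINARY).named("city")
                .required(PrimitiveTypeName.BINARY).named("zip")
                .named("address")
            .named("doc");
        // Requesting only address.city prunes id and address.zip.
        List<Field> wanted = List.of(new Field("address", new Field("city")));
        System.out.println(project(schema.getFields(), wanted));
    }
}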

Example 47 with Type

use of org.apache.parquet.schema.Type in project hive by apache.

the class VectorizedParquetRecordReader method checkEndOfRowGroup.

private void checkEndOfRowGroup() throws IOException {
    if (rowsReturned != totalCountLoadedSoFar) {
        return;
    }
    PageReadStore pages = reader.readNextRowGroup();
    if (pages == null) {
        throw new IOException("expecting more rows but reached last block. Read " + rowsReturned + " out of " + totalRowCount);
    }
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    List<Type> types = requestedSchema.getFields();
    columnReaders = new VectorizedColumnReader[columns.size()];
    if (!ColumnProjectionUtils.isReadAllColumns(jobConf)) {
        // Some queries (e.g. SELECT COUNT(*)) need no columns at all; however,
        // if colsToInclude is not empty we should initialize each columnReader
        if (!colsToInclude.isEmpty()) {
            for (int i = 0; i < types.size(); ++i) {
                columnReaders[i] = buildVectorizedParquetReader(columnTypesList.get(colsToInclude.get(i)), types.get(i), pages, requestedSchema.getColumns(), skipTimestampConversion, 0);
            }
        }
    } else {
        for (int i = 0; i < types.size(); ++i) {
            columnReaders[i] = buildVectorizedParquetReader(columnTypesList.get(i), types.get(i), pages, requestedSchema.getColumns(), skipTimestampConversion, 0);
        }
    }
    totalCountLoadedSoFar += pages.getRowCount();
}
Also used: PrimitiveType (org.apache.parquet.schema.PrimitiveType), GroupType (org.apache.parquet.schema.GroupType), MessageType (org.apache.parquet.schema.MessageType), Type (org.apache.parquet.schema.Type), PageReadStore (org.apache.parquet.column.page.PageReadStore), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), IOException (java.io.IOException)
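
The rowsReturned check above is the standard row-group boundary pattern: only when all loaded rows have been returned is the next row group fetched. A minimal sketch of that loop, assuming a hypothetical file path; ParquetFileReader.readNextRowGroup() returning null after the last block is the real contract the IOException above guards against.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class RowGroupScan {
    public static void main(String[] args) throws Exception {
        Path path = new Path("/tmp/example.parquet"); // hypothetical input file
        try (ParquetFileReader reader =
                 ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
            long rowsSeen = 0;
            PageReadStore pages;
            // readNextRowGroup() returns null once the last block is consumed;
            // checkEndOfRowGroup() treats that as an error while rows are still expected.
            while ((pages = reader.readNextRowGroup()) != null) {
                rowsSeen += pages.getRowCount();
            }
            System.out.println("rows = " + rowsSeen);
        }
    }
}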

Example 48 with Type

use of org.apache.parquet.schema.Type in project hive by apache.

the class VectorizedParquetRecordReader method buildVectorizedParquetReader.

// Build VectorizedParquetColumnReader via Hive typeInfo and Parquet schema
private VectorizedColumnReader buildVectorizedParquetReader(TypeInfo typeInfo, Type type, PageReadStore pages, List<ColumnDescriptor> columnDescriptors, boolean skipTimestampConversion, int depth) throws IOException {
    List<ColumnDescriptor> descriptors = getAllColumnDescriptorByType(depth, type, columnDescriptors);
    switch(typeInfo.getCategory()) {
        case PRIMITIVE:
            if (columnDescriptors == null || columnDescriptors.isEmpty()) {
                throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
            }
            if (fileSchema.getColumns().contains(descriptors.get(0))) {
                return new VectorizedPrimitiveColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion, type, typeInfo);
            } else {
                // Support for schema evolution
                return new VectorizedDummyColumnReader();
            }
        case STRUCT:
            StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
            List<VectorizedColumnReader> fieldReaders = new ArrayList<>();
            List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos();
            List<Type> types = type.asGroupType().getFields();
            for (int i = 0; i < fieldTypes.size(); i++) {
                VectorizedColumnReader r = buildVectorizedParquetReader(fieldTypes.get(i), types.get(i), pages, descriptors, skipTimestampConversion, depth + 1);
                if (r != null) {
                    fieldReaders.add(r);
                } else {
                    throw new RuntimeException("Fail to build Parquet vectorized reader based on Hive type " + fieldTypes.get(i).getTypeName() + " and Parquet type" + types.get(i).toString());
                }
            }
            return new VectorizedStructColumnReader(fieldReaders);
        case LIST:
            checkListColumnSupport(((ListTypeInfo) typeInfo).getListElementTypeInfo());
            if (columnDescriptors == null || columnDescriptors.isEmpty()) {
                throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
            }
            return new VectorizedListColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion, getElementType(type), typeInfo);
        case MAP:
            if (columnDescriptors == null || columnDescriptors.isEmpty()) {
                throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
            }
            // Handle the different Map definitions in Parquet, e.g.:
            // definition with 1 group:
            //   repeated group map (MAP_KEY_VALUE) {
            //     required binary key (UTF8); optional binary value (UTF8);
            //   }
            // definition with 2 groups:
            //   optional group m1 (MAP) {
            //     repeated group map (MAP_KEY_VALUE) {
            //       required binary key (UTF8); optional binary value (UTF8);
            //     }
            //   }
            int nestGroup = 0;
            GroupType groupType = type.asGroupType();
            // If the group already has two fields, they are the key/value pair;
            // otherwise, continue to get the group type until MAP_DEFINITION_LEVEL_MAX.
            while (groupType.getFieldCount() < 2) {
                if (nestGroup > MAP_DEFINITION_LEVEL_MAX) {
                    throw new RuntimeException("More than " + MAP_DEFINITION_LEVEL_MAX + " level is found in Map definition, " + "Failed to get the field types for Map with type " + type);
                }
                groupType = groupType.getFields().get(0).asGroupType();
                nestGroup++;
            }
            List<Type> kvTypes = groupType.getFields();
            VectorizedListColumnReader keyListColumnReader = new VectorizedListColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion, kvTypes.get(0), typeInfo);
            VectorizedListColumnReader valueListColumnReader = new VectorizedListColumnReader(descriptors.get(1), pages.getPageReader(descriptors.get(1)), skipTimestampConversion, kvTypes.get(1), typeInfo);
            return new VectorizedMapColumnReader(keyListColumnReader, valueListColumnReader);
        case UNION:
        default:
            throw new RuntimeException("Unsupported category " + typeInfo.getCategory().name());
    }
}
Also used: ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ArrayList (java.util.ArrayList), StructTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo), PrimitiveTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo), ListTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo), TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo), PrimitiveType (org.apache.parquet.schema.PrimitiveType), GroupType (org.apache.parquet.schema.GroupType), MessageType (org.apache.parquet.schema.MessageType), Type (org.apache.parquet.schema.Type), ParquetRuntimeException (org.apache.parquet.ParquetRuntimeException)
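
The MAP branch's unwrap loop is the subtle part: it descends through single-field wrapper groups until it reaches the two-field key/value group. A minimal sketch of just that step, with a hypothetical MAX_NESTING standing in for MAP_DEFINITION_LEVEL_MAX; the Types builder reproduces the two-group definition from the comment above.

import java.util.List;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

public class MapUnwrapSketch {

    private static final int MAX_NESTING = 3; // hypothetical cap, like MAP_DEFINITION_LEVEL_MAX

    // Descend through single-field wrapper groups until the
    // two-field key/value group is found.
    static List<Type> keyValueTypes(Type mapType) {
        GroupType g = mapType.asGroupType();
        int depth = 0;
        while (g.getFieldCount() < 2) {
            if (depth++ > MAX_NESTING) {
                throw new IllegalStateException("Map nested deeper than " + MAX_NESTING + ": " + mapType);
            }
            g = g.getFields().get(0).asGroupType();
        }
        return g.getFields();
    }

    public static void main(String[] args) {
        // The two-group definition: optional group m1 (MAP) { repeated group map (MAP_KEY_VALUE) {...} }
        MessageType schema = Types.buildMessage()
            .optionalGroup().as(OriginalType.MAP)
                .repeatedGroup().as(OriginalType.MAP_KEY_VALUE)
                    .required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("key")
                    .optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("value")
                    .named("map")
                .named("m1")
            .named("doc");
        System.out.println(keyValueTypes(schema.getType("m1")));
    }
}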

Example 49 with Type

use of org.apache.parquet.schema.Type in project parquet-mr by apache.

the class DataWritableWriter method writeData.

private void writeData(final ArrayWritable arr, final GroupType type) {
    if (arr == null) {
        return;
    }
    final int fieldCount = type.getFieldCount();
    Writable[] values = arr.get();
    for (int field = 0; field < fieldCount; ++field) {
        final Type fieldType = type.getType(field);
        final String fieldName = fieldType.getName();
        final Writable value = values[field];
        if (value == null) {
            continue;
        }
        recordConsumer.startField(fieldName, field);
        if (fieldType.isPrimitive()) {
            writePrimitive(value);
        } else {
            recordConsumer.startGroup();
            if (value instanceof ArrayWritable) {
                if (fieldType.asGroupType().getRepetition().equals(Type.Repetition.REPEATED)) {
                    writeArray((ArrayWritable) value, fieldType.asGroupType());
                } else {
                    writeData((ArrayWritable) value, fieldType.asGroupType());
                }
            } else if (value != null) {
                throw new ParquetEncodingException("This should be an ArrayWritable or MapWritable: " + value);
            }
            recordConsumer.endGroup();
        }
        recordConsumer.endField(fieldName, field);
    }
}
Also used: GroupType (org.apache.parquet.schema.GroupType), Type (org.apache.parquet.schema.Type), ArrayWritable (org.apache.hadoop.io.ArrayWritable), ParquetEncodingException (org.apache.parquet.io.ParquetEncodingException), ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable), BigDecimalWritable (org.apache.hadoop.hive.ql.io.parquet.writable.BigDecimalWritable), BinaryWritable (org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable), Writable (org.apache.hadoop.io.Writable), LongWritable (org.apache.hadoop.io.LongWritable), BooleanWritable (org.apache.hadoop.io.BooleanWritable), DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable), ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable), FloatWritable (org.apache.hadoop.io.FloatWritable), IntWritable (org.apache.hadoop.io.IntWritable)
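
The startField/startGroup call order that writeData produces is easiest to see with a tracing consumer. A minimal sketch subclassing the real org.apache.parquet.io.api.RecordConsumer, overriding all of its abstract callbacks to print the nesting:

import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.RecordConsumer;

// Prints the event sequence a writer such as writeData() emits.
public class TracingRecordConsumer extends RecordConsumer {
    private int indent = 0;

    private void log(String s) {
        System.out.println("  ".repeat(indent) + s);
    }

    @Override public void startMessage()              { log("startMessage"); indent++; }
    @Override public void endMessage()                { indent--; log("endMessage"); }
    @Override public void startField(String f, int i) { log("startField " + f + " (" + i + ")"); indent++; }
    @Override public void endField(String f, int i)   { indent--; log("endField " + f); }
    @Override public void startGroup()                { log("startGroup"); indent++; }
    @Override public void endGroup()                  { indent--; log("endGroup"); }
    @Override public void addInteger(int v)           { log("addInteger " + v); }
    @Override public void addLong(long v)             { log("addLong " + v); }
    @Override public void addBoolean(boolean v)       { log("addBoolean " + v); }
    @Override public void addBinary(Binary v)         { log("addBinary " + v.toStringUsingUTF8()); }
    @Override public void addFloat(float v)           { log("addFloat " + v); }
    @Override public void addDouble(double v)         { log("addDouble " + v); }
}

For a nested field, writeData yields startField(name) / startGroup / ...child fields... / endGroup / endField(name), which is exactly the bracketing the recursion above maintains.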

Example 50 with Type

use of org.apache.parquet.schema.Type in project parquet-mr by apache.

the class AvroSchemaConverter method convertFields.

private Schema convertFields(String name, List<Type> parquetFields) {
    List<Schema.Field> fields = new ArrayList<Schema.Field>();
    for (Type parquetType : parquetFields) {
        Schema fieldSchema = convertField(parquetType);
        if (parquetType.isRepetition(REPEATED)) {
            throw new UnsupportedOperationException("REPEATED not supported outside LIST or MAP. Type: " + parquetType);
        } else if (parquetType.isRepetition(Type.Repetition.OPTIONAL)) {
            fields.add(new Schema.Field(parquetType.getName(), optional(fieldSchema), null, NULL_VALUE));
        } else {
            // REQUIRED
            fields.add(new Schema.Field(parquetType.getName(), fieldSchema, null, (Object) null));
        }
    }
    Schema schema = Schema.createRecord(name, null, null, false);
    schema.setFields(fields);
    return schema;
}
Also used: PrimitiveType (org.apache.parquet.schema.PrimitiveType), GroupType (org.apache.parquet.schema.GroupType), LogicalType (org.apache.avro.LogicalType), MessageType (org.apache.parquet.schema.MessageType), Type (org.apache.parquet.schema.Type), OriginalType (org.apache.parquet.schema.OriginalType), Schema (org.apache.avro.Schema), ArrayList (java.util.ArrayList)
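
The optional(fieldSchema) helper used above is private to AvroSchemaConverter; a minimal stand-in built on the public Avro API shows the same mapping of Parquet OPTIONAL to a nullable Avro union:

import java.util.Arrays;
import org.apache.avro.Schema;

public class NullableUnionSketch {

    // Parquet OPTIONAL becomes a [null, T] union in Avro, which is how
    // convertFields() encodes nullability.
    static Schema optional(Schema schema) {
        return Schema.createUnion(Arrays.asList(Schema.create(Schema.Type.NULL), schema));
    }

    public static void main(String[] args) {
        System.out.println(optional(Schema.create(Schema.Type.STRING)));
        // prints ["null","string"]
    }
}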

Aggregations

Type (org.apache.parquet.schema.Type): 78
GroupType (org.apache.parquet.schema.GroupType): 67
MessageType (org.apache.parquet.schema.MessageType): 62
OriginalType (org.apache.parquet.schema.OriginalType): 39
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 34
ArrayList (java.util.ArrayList): 24
SchemaPath (org.apache.drill.common.expression.SchemaPath): 10
HashMap (java.util.HashMap): 9
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 9
PathSegment (org.apache.drill.common.expression.PathSegment): 8
Converter (org.apache.parquet.io.api.Converter): 6
GroupConverter (org.apache.parquet.io.api.GroupConverter): 6
MinorType (org.apache.drill.common.types.TypeProtos.MinorType): 5
MaterializedField (org.apache.drill.exec.record.MaterializedField): 5
Collection (java.util.Collection): 4
List (java.util.List): 4
Function (java.util.function.Function): 4
LogicalType (org.apache.avro.LogicalType): 4
DrillRuntimeException (org.apache.drill.common.exceptions.DrillRuntimeException): 4
ExecConstants (org.apache.drill.exec.ExecConstants): 4