
Example 71 with Type

use of org.apache.parquet.schema.Type in project presto by prestodb.

the class TestDataWritableWriter method writeMap.

/**
 * Writes a map type and its key-value pairs to the Parquet RecordConsumer.
 * This is called when the original type (MAP) is detected by writeValue().
 * This function assumes the following schema:
 *
 *   optional group mapCol (MAP) {
 *     repeated group map (MAP_KEY_VALUE) {
 *       required TYPE key;
 *       optional TYPE value;
 *     }
 *   }
 *
 * @param value The object that contains the map key-value pairs.
 * @param inspector The object inspector used to get the correct value type.
 * @param type Type that contains information about the group (MAP) schema.
 */
private void writeMap(final Object value, final MapObjectInspector inspector, final GroupType type) {
    // Get the internal map structure (MAP_KEY_VALUE)
    GroupType repeatedType = type.getType(0).asGroupType();
    recordConsumer.startGroup();
    Map<?, ?> mapValues = inspector.getMap(value);
    if (mapValues != null && !mapValues.isEmpty()) {
        recordConsumer.startField(repeatedType.getName(), 0);
        Type keyType = repeatedType.getType(0);
        String keyName = keyType.getName();
        ObjectInspector keyInspector = inspector.getMapKeyObjectInspector();
        Type valueType = repeatedType.getType(1);
        String valueName = valueType.getName();
        ObjectInspector valueInspector = inspector.getMapValueObjectInspector();
        for (Map.Entry<?, ?> keyValue : mapValues.entrySet()) {
            // Each entry becomes one repeated (key, value) group
            recordConsumer.startGroup();
            if (keyValue != null) {
                // write key element
                Object keyElement = keyValue.getKey();
                recordConsumer.startField(keyName, 0);
                writeValue(keyElement, keyInspector, keyType);
                recordConsumer.endField(keyName, 0);
                // write value element; null values are simply omitted (the field is optional)
                Object valueElement = keyValue.getValue();
                if (valueElement != null) {
                    recordConsumer.startField(valueName, 1);
                    writeValue(valueElement, valueInspector, valueType);
                    recordConsumer.endField(valueName, 1);
                }
            }
            recordConsumer.endGroup();
        }
        recordConsumer.endField(repeatedType.getName(), 0);
    }
    recordConsumer.endGroup();
}
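For reference, the MAP schema this writer assumes can be built with parquet-mr's Types builder. The following is a minimal sketch, not part of the test: the column name mapCol and the string-to-int key/value types are assumptions chosen for illustration.

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

// Builds the shape from the Javadoc above:
// optional group mapCol (MAP) { repeated group map (MAP_KEY_VALUE) {
//     required binary key (UTF8); optional int32 value; } }
MessageType schema = Types.buildMessage()
        .addField(Types.optionalGroup().as(OriginalType.MAP)
                .addField(Types.repeatedGroup().as(OriginalType.MAP_KEY_VALUE)
                        .addField(Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("key"))
                        .addField(Types.optional(PrimitiveTypeName.INT32).named("value"))
                        .named("map"))
                .named("mapCol"))
        .named("example");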

Example 72 with Type

use of org.apache.parquet.schema.Type in project flink by apache.

the class ParquetVectorizedInputFormat method clipParquetSchema.

/**
 * Clips `parquetSchema` so that it contains only the projected fields.
 */
private MessageType clipParquetSchema(GroupType parquetSchema) {
    Type[] types = new Type[projectedFields.length];
    if (isCaseSensitive) {
        for (int i = 0; i < projectedFields.length; ++i) {
            String fieldName = projectedFields[i];
            if (!parquetSchema.containsField(fieldName)) {
                LOG.warn("{} does not exist in {}, will fill the field with null.", fieldName, parquetSchema);
                types[i] = ParquetSchemaConverter.convertToParquetType(fieldName, projectedTypes[i]);
                unknownFieldsIndices.add(i);
            } else {
                types[i] = parquetSchema.getType(fieldName);
            }
        }
    } else {
        Map<String, Type> caseInsensitiveFieldMap = new HashMap<>();
        for (Type type : parquetSchema.getFields()) {
            caseInsensitiveFieldMap.compute(type.getName().toLowerCase(Locale.ROOT), (key, previousType) -> {
                if (previousType != null) {
                    throw new FlinkRuntimeException("Parquet with case insensitive mode should have no duplicate key: " + key);
                }
                return type;
            });
        }
        for (int i = 0; i < projectedFields.length; ++i) {
            Type type = caseInsensitiveFieldMap.get(projectedFields[i].toLowerCase(Locale.ROOT));
            if (type == null) {
                LOG.warn("{} does not exist in {}, will fill the field with null.", projectedFields[i], parquetSchema);
                type = ParquetSchemaConverter.convertToParquetType(projectedFields[i].toLowerCase(Locale.ROOT), projectedTypes[i]);
                unknownFieldsIndices.add(i);
            }
            // TODO: clip for array, map, and row types.
            types[i] = type;
        }
    }
    return Types.buildMessage().addFields(types).named("flink-parquet");
}
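The case-insensitive branch leans on Map.compute to fail fast when two field names collide after lower-casing. Below is a standalone sketch of that lookup, reusing parquet-mr's Types builder from the earlier sketch; the field names (UserId, Name) are invented for illustration.

MessageType fileSchema = Types.buildMessage()
        .addField(Types.optional(PrimitiveTypeName.INT64).named("UserId"))
        .addField(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("Name"))
        .named("file");

Map<String, Type> byLowerCaseName = new HashMap<>();
for (Type field : fileSchema.getFields()) {
    // compute() receives the previous mapping, so duplicates are detected in one pass
    byLowerCaseName.compute(field.getName().toLowerCase(Locale.ROOT), (key, previous) -> {
        if (previous != null) {
            throw new IllegalStateException("Duplicate key after lower-casing: " + key);
        }
        return field;
    });
}
Type resolved = byLowerCaseName.get("userid"); // resolves the file's "UserId" field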

Example 73 with Type

use of org.apache.parquet.schema.Type in project flink by apache.

the class ParquetColumnarRowSplitReader method checkSchema.

private void checkSchema() throws IOException, UnsupportedOperationException {
    if (selectedTypes.length != requestedSchema.getFieldCount()) {
        throw new RuntimeException("The number of selected field types does not match the requested schema!");
    }
    /*
     * Check that the requested schema is supported.
     */
    for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
        Type t = requestedSchema.getFields().get(i);
        if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
            throw new UnsupportedOperationException("Complex types not supported.");
        }
        String[] colPath = requestedSchema.getPaths().get(i);
        if (fileSchema.containsPath(colPath)) {
            ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
            if (!fd.equals(requestedSchema.getColumns().get(i))) {
                throw new UnsupportedOperationException("Schema evolution not supported.");
            }
        } else {
            if (requestedSchema.getColumns().get(i).getMaxDefinitionLevel() == 0) {
                // A required column (max definition level 0) cannot be filled with nulls.
                throw new IOException("Required column is missing in data file. Col: " + Arrays.toString(colPath));
            }
        }
    }
}
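To make the two rejection conditions concrete, here is a hedged illustration (field names invented) of what isPrimitive() and isRepetition() return for the shapes this reader refuses:

PrimitiveType scalar = Types.optional(PrimitiveTypeName.INT32).named("id");
PrimitiveType repeatedField = Types.repeated(PrimitiveTypeName.INT32).named("values");
GroupType struct = Types.optionalGroup()
        .addField(Types.optional(PrimitiveTypeName.DOUBLE).named("x"))
        .named("point");

// supported: primitive and not repeated
boolean supported = scalar.isPrimitive() && !scalar.isRepetition(Type.Repetition.REPEATED); // true
// rejected: repeated primitives and group types are both "complex" to this reader
boolean rejectedRepeated = repeatedField.isRepetition(Type.Repetition.REPEATED); // true
boolean rejectedGroup = !struct.isPrimitive(); // true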

Example 74 with Type

use of org.apache.parquet.schema.Type in project flink by apache.

the class ParquetColumnarRowSplitReader method clipParquetSchema.

/**
 * Clips `parquetSchema` according to `fieldNames`.
 */
private static MessageType clipParquetSchema(GroupType parquetSchema, String[] fieldNames, boolean caseSensitive) {
    Type[] types = new Type[fieldNames.length];
    if (caseSensitive) {
        for (int i = 0; i < fieldNames.length; ++i) {
            String fieldName = fieldNames[i];
            if (parquetSchema.getFieldIndex(fieldName) < 0) {
                throw new IllegalArgumentException(fieldName + " does not exist");
            }
            types[i] = parquetSchema.getType(fieldName);
        }
    } else {
        Map<String, Type> caseInsensitiveFieldMap = new HashMap<>();
        for (Type type : parquetSchema.getFields()) {
            caseInsensitiveFieldMap.compute(type.getName().toLowerCase(Locale.ROOT), (key, previousType) -> {
                if (previousType != null) {
                    throw new FlinkRuntimeException("Parquet with case insensitive mode should have no duplicate key: " + key);
                }
                return type;
            });
        }
        for (int i = 0; i < fieldNames.length; ++i) {
            Type type = caseInsensitiveFieldMap.get(fieldNames[i].toLowerCase(Locale.ROOT));
            if (type == null) {
                throw new IllegalArgumentException(fieldNames[i] + " does not exist");
            }
            // TODO: clip for array, map, and row types.
            types[i] = type;
        }
    }
    return Types.buildMessage().addFields(types).named("flink-parquet");
}
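A hypothetical call site for this static variant, showing the case-insensitive resolution; the schema and field names below are invented for illustration.

MessageType fileSchema = Types.buildMessage()
        .addField(Types.optional(PrimitiveTypeName.INT64).named("id"))
        .addField(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("Name"))
        .addField(Types.optional(PrimitiveTypeName.DOUBLE).named("score"))
        .named("file");

// "name" resolves to the file's "Name" field; "score" is simply dropped
MessageType projected = clipParquetSchema(fileSchema, new String[] { "id", "name" }, false);
// projected is a two-field message named "flink-parquet"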

Example 75 with Type

use of org.apache.parquet.schema.Type in project druid by druid-io.

the class DruidParquetReadSupport method getPartialReadSchema.

/**
 * Selects the columns from the Parquet schema that are used in the schema of the ingestion job.
 *
 * @param context The context of the file to be read
 *
 * @return the partial schema containing only the columns used by the ingestion schema
 */
private MessageType getPartialReadSchema(InitContext context) {
    MessageType fullSchema = context.getFileSchema();
    String name = fullSchema.getName();
    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
    ParseSpec parseSpec = config.getParser().getParseSpec();
    // If a flattenSpec is present it may reference nested or auto-discovered fields,
    // so keep the full schema.
    if (parseSpec instanceof ParquetParseSpec) {
        if (((ParquetParseSpec) parseSpec).getFlattenSpec() != null) {
            return fullSchema;
        }
    }
    String tsField = parseSpec.getTimestampSpec().getTimestampColumn();
    List<DimensionSchema> dimensionSchema = parseSpec.getDimensionsSpec().getDimensions();
    Set<String> dimensions = new HashSet<>();
    for (DimensionSchema dim : dimensionSchema) {
        dimensions.add(dim.getName());
    }
    Set<String> metricsFields = new HashSet<>();
    for (AggregatorFactory agg : config.getSchema().getDataSchema().getAggregators()) {
        metricsFields.addAll(agg.requiredFields());
    }
    List<Type> partialFields = new ArrayList<>();
    for (Type type : fullSchema.getFields()) {
        // Keep the timestamp column, any metric inputs, and the requested dimensions;
        // when no dimensions are specified, keep every column.
        if (tsField.equals(type.getName())
                || metricsFields.contains(type.getName())
                || dimensions.isEmpty()
                || dimensions.contains(type.getName())) {
            partialFields.add(type);
        }
    }
    return new MessageType(name, partialFields);
}
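Stripped of the Druid spec plumbing, the pruning step reduces to keeping a whitelist of top-level fields. A minimal sketch, assuming a fullSchema built as in the earlier sketches and invented column names (timestamp, page, count):

Set<String> keep = new HashSet<>(Arrays.asList("timestamp", "page", "count"));
List<Type> partialFields = new ArrayList<>();
for (Type field : fullSchema.getFields()) {
    if (keep.contains(field.getName())) {
        partialFields.add(field);
    }
}
// MessageType(name, fields) reassembles the pruned read schema
MessageType partial = new MessageType(fullSchema.getName(), partialFields);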

Aggregations

Type (org.apache.parquet.schema.Type): 78 usages
GroupType (org.apache.parquet.schema.GroupType): 67 usages
MessageType (org.apache.parquet.schema.MessageType): 62 usages
OriginalType (org.apache.parquet.schema.OriginalType): 39 usages
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 34 usages
ArrayList (java.util.ArrayList): 24 usages
SchemaPath (org.apache.drill.common.expression.SchemaPath): 10 usages
HashMap (java.util.HashMap): 9 usages
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 9 usages
PathSegment (org.apache.drill.common.expression.PathSegment): 8 usages
Converter (org.apache.parquet.io.api.Converter): 6 usages
GroupConverter (org.apache.parquet.io.api.GroupConverter): 6 usages
MinorType (org.apache.drill.common.types.TypeProtos.MinorType): 5 usages
MaterializedField (org.apache.drill.exec.record.MaterializedField): 5 usages
Collection (java.util.Collection): 4 usages
List (java.util.List): 4 usages
Function (java.util.function.Function): 4 usages
LogicalType (org.apache.avro.LogicalType): 4 usages
DrillRuntimeException (org.apache.drill.common.exceptions.DrillRuntimeException): 4 usages
ExecConstants (org.apache.drill.exec.ExecConstants): 4 usages