Search in sources :

Example 21 with Type

use of org.apache.parquet.schema.Type in project parquet-mr by apache.

the class MetadataUtils method showDetails.

/**
 * Prints a one-line summary of a group type and then descends into each of its fields.
 *
 * <p>The summary line is the group name prefixed by {@code depth} dots, followed by the
 * group's repetition and its field count. While the children are being visited, the
 * group's name is kept as the last element of {@code cpath}; it is removed again before
 * this method returns normally.
 *
 * @param out       writer that receives the formatted summary line
 * @param type      group whose details are printed
 * @param depth     nesting level, used as the number of leading dots in the printed name
 * @param container enclosing message type; only forwarded to the recursive calls here
 * @param cpath     mutable path of names from the root down to the current type
 */
private static void showDetails(PrettyPrintWriter out, GroupType type, int depth, MessageType container, List<String> cpath) {
    out.format("%s: %s F:%d%n", Strings.repeat(".", depth) + type.getName(), type.getRepetition(), type.getFieldCount());

    // Push this group onto the path for the duration of the child traversal.
    cpath.add(type.getName());
    final List<Type> children = type.getFields();
    for (int i = 0; i < children.size(); i++) {
        // The argument is statically a Type, so this resolves to the showDetails
        // overload accepting Type (declared outside this snippet).
        final Type child = children.get(i);
        showDetails(out, child, depth + 1, container, cpath);
    }
    // Pop the entry pushed above.
    cpath.remove(cpath.size() - 1);
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) Repetition(org.apache.parquet.schema.Type.Repetition)

Example 22 with Type

use of org.apache.parquet.schema.Type in project parquet-mr by apache.

the class TupleWriteSupport method write.

/**
 * Writes one tuple entry as a message, emitting a field for every root-schema column
 * that has a non-null value in the record.
 *
 * <p>A {@code null} record, or a {@code null} value for a given column, results in that
 * column being skipped; the message start/end markers are still emitted.
 *
 * @param record the tuple entry to write; may be {@code null}
 * @throws UnsupportedOperationException if a column with a value is not a primitive type
 */
@Override
public void write(TupleEntry record) {
    recordConsumer.startMessage();
    final List<Type> schemaFields = rootSchema.getFields();
    for (int index = 0; index < schemaFields.size(); index++) {
        final Type column = schemaFields.get(index);
        final boolean hasValue = record != null && record.getObject(column.getName()) != null;
        if (hasValue) {
            recordConsumer.startField(column.getName(), index);
            if (!column.isPrimitive()) {
                throw new UnsupportedOperationException("Complex type not implemented");
            }
            writePrimitive(record, column.asPrimitiveType());
            recordConsumer.endField(column.getName(), index);
        }
    }
    recordConsumer.endMessage();
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type)

Example 23 with Type

use of org.apache.parquet.schema.Type in project parquet-mr by apache.

the class DataWritableReadSupport method init.

/**
 * Builds the Parquet-side ReadContext carrying the requested schema during the init phase.
 *
 * <p>When the configuration has no {@code IOConstants.COLUMNS} entry, the file schema is
 * used unchanged. Otherwise a table schema is assembled from the configured column names
 * (columns missing from the file are added as optional binary fields), and the columns
 * selected through {@code ColumnProjectionUtils.getReadColumnIDs} are resolved against
 * the file schema via {@code resolveSchemaAccess}.
 *
 * @param configuration needed to get the wanted columns
 * @param keyValueMetaData not read by this method
 * @param fileSchema parquet file schema
 * @return the parquet ReadContext
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration, final Map<String, String> keyValueMetaData, final MessageType fileSchema) {
    final String columns = configuration.get(IOConstants.COLUMNS);
    final Map<String, String> contextMetadata = new HashMap<String, String>();

    // No column list configured: expose the file schema as-is.
    if (columns == null) {
        contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString());
        return new ReadContext(fileSchema, contextMetadata);
    }

    final List<String> columnNames = getColumns(columns);
    final List<Type> tableTypes = new ArrayList<Type>();
    for (final String columnName : columnNames) {
        // The configured list also contains partition columns, which are metadata only;
        // substituting an optional binary field for anything absent from the file
        // allows schema evolution.
        final Type columnType = fileSchema.containsField(columnName)
                ? fileSchema.getType(columnName)
                : new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, columnName);
        tableTypes.add(columnType);
    }
    final MessageType tableSchema = new MessageType(TABLE_SCHEMA, tableTypes);
    contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString());

    // Project the table schema down to the column indexes requested for this read.
    final List<Integer> wantedIndexes = ColumnProjectionUtils.getReadColumnIDs(configuration);
    final List<Type> wantedTypes = new ArrayList<Type>();
    for (final Integer wantedIndex : wantedIndexes) {
        wantedTypes.add(tableSchema.getType(columnNames.get(wantedIndex)));
    }
    final MessageType projection = new MessageType(fileSchema.getName(), wantedTypes);
    final MessageType requestedSchemaByUser = resolveSchemaAccess(projection, fileSchema, configuration);
    return new ReadContext(requestedSchemaByUser, contextMetadata);
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType)

Example 24 with Type

use of org.apache.parquet.schema.Type in project parquet-mr by apache.

the class HiveSchemaConverter method convertMapType.

// Converts a Hive map type into an optional group containing a repeated anonymous
// group "map", which in turn contains 2 elements: "key" and "value".
// The key is converted with REQUIRED repetition; the value uses the two-argument
// convertType overload, so its repetition is whatever that overload applies.
private static GroupType convertMapType(final String name, final MapTypeInfo typeInfo) {
    return ConversionPatterns.mapType(
            Repetition.OPTIONAL,
            name,
            convertType(ParquetHiveSerDe.MAP_KEY.toString(), typeInfo.getMapKeyTypeInfo(), Repetition.REQUIRED),
            convertType(ParquetHiveSerDe.MAP_VALUE.toString(), typeInfo.getMapValueTypeInfo()));
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType)

Example 25 with Type

use of org.apache.parquet.schema.Type in project drill by apache.

the class ParquetReaderUtility method containsComplexColumn.

/**
 * Checks whether any of the columns in the given list is either nested or repetitive.
 *
 * <p>For a star query every field of the file schema is inspected; otherwise only the
 * explicitly listed schema paths are examined.
 *
 * @param footer  Parquet file metadata from which the schema is read
 * @param columns list of query SchemaPath objects
 * @return {@code true} if at least one relevant column is a group type or has a
 *         maximum repetition level above zero, {@code false} otherwise
 */
public static boolean containsComplexColumn(ParquetMetadata footer, List<SchemaPath> columns) {
    final MessageType fileSchema = footer.getFileMetaData().getSchema();

    if (!Utilities.isStarQuery(columns)) {
        final Map<String, ColumnDescriptor> descriptorsByName = ParquetReaderUtility.getColNameToColumnDescriptorMapping(footer);
        final Map<String, SchemaElement> elementsByName = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
        for (SchemaPath path : columns) {
            // A non-leaf schema path always denotes a complex column.
            if (!path.isLeaf()) {
                logger.trace("rowGroupScan contains complex column: {}", path.getUnIndexed().toString());
                return true;
            }
            final ColumnDescriptor descriptor = descriptorsByName.get(path.getUnIndexed().toString().toLowerCase());
            if (descriptor != null) {
                // Known primitive column: complex only if it is repeated.
                if (descriptor.getMaxRepetitionLevel() > 0) {
                    logger.trace("rowGroupScan contains repetitive column: {}", path.getUnIndexed().toString());
                    return true;
                }
                continue;
            }
            // No column descriptor was found. Two cases, decided by the SchemaElement lookup:
            // 1. found: the queried column is complex, i.e. a GroupType
            // 2. not found: the queried column is not in the schema and thus non-complex
            if (elementsByName.get(path.getUnIndexed().toString().toLowerCase()) != null) {
                return true;
            }
        }
        return false;
    }

    // Star query: any non-primitive top-level field makes the result complex ...
    for (Type field : fileSchema.getFields()) {
        if (!field.isPrimitive()) {
            return true;
        }
    }
    // ... as does any column with a repetition level above zero.
    for (ColumnDescriptor descriptor : fileSchema.getColumns()) {
        if (descriptor.getMaxRepetitionLevel() > 0) {
            return true;
        }
    }
    return false;
}
Also used : ConvertedType(org.apache.parquet.format.ConvertedType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) SchemaElement(org.apache.parquet.format.SchemaElement) MessageType(org.apache.parquet.schema.MessageType)

Aggregations

Type (org.apache.parquet.schema.Type): 88, MessageType (org.apache.parquet.schema.MessageType): 72, GroupType (org.apache.parquet.schema.GroupType): 69, OriginalType (org.apache.parquet.schema.OriginalType): 35, PrimitiveType (org.apache.parquet.schema.PrimitiveType): 35, ArrayList (java.util.ArrayList): 25, HashMap (java.util.HashMap): 10, SchemaPath (org.apache.drill.common.expression.SchemaPath): 10, TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo): 10, ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 10, PathSegment (org.apache.drill.common.expression.PathSegment): 8, Converter (org.apache.parquet.io.api.Converter): 6, GroupConverter (org.apache.parquet.io.api.GroupConverter): 6, MinorType (org.apache.drill.common.types.TypeProtos.MinorType): 5, MaterializedField (org.apache.drill.exec.record.MaterializedField): 5, LogicalTypeAnnotation (org.apache.parquet.schema.LogicalTypeAnnotation): 5, Collection (java.util.Collection): 4, List (java.util.List): 4, Function (java.util.function.Function): 4, LogicalType (org.apache.avro.LogicalType): 4