Search in sources :

Example 41 with Type

use of org.apache.parquet.schema.Type in project hive by apache.

the class DataWritableReadSupport method getProjectedGroupFields.

/**
 * Searches a Parquet group schema for the given column names and returns the corresponding
 * Parquet schema types, one entry per requested column, in the order of {@code colNames}.
 *
 * <p>Column names are matched against the schema's field names case-insensitively. Both sides
 * are lower-cased with {@code Locale.ROOT} so the match does not depend on the JVM default
 * locale (under a Turkish locale, for instance, {@code "ID".toLowerCase()} produces a dotless i
 * and would no longer equal {@code "id"}). If several schema fields differ only by case, the
 * last one listed in the schema is the one that is matched.
 *
 * <p>A requested column with no matching schema field is represented by an optional BINARY
 * type carrying the requested column name.
 *
 * @param schema Group schema where to search for column names.
 * @param colNames List of column names.
 * @param colTypes List of column types; entry {@code i} is used for {@code colNames.get(i)}.
 * @return List of Type objects of the projected columns.
 */
private static List<Type> getProjectedGroupFields(GroupType schema, List<String> colNames, List<TypeInfo> colTypes) {
    List<Type> schemaTypes = new ArrayList<Type>();
    ListIterator<String> columnIterator = colNames.listIterator();
    // Index the schema fields by locale-independent lower-cased name for case-insensitive lookup.
    Map<String, Type> schemaTypeMap = new HashMap<>();
    schema.getFields().forEach(t -> schemaTypeMap.put(t.getName().toLowerCase(java.util.Locale.ROOT), t));
    while (columnIterator.hasNext()) {
        // nextIndex() must be read before next() so the type lines up with the column name.
        TypeInfo colType = colTypes.get(columnIterator.nextIndex());
        String colName = columnIterator.next();
        Type fieldType = schemaTypeMap.get(colName.toLowerCase(java.util.Locale.ROOT));
        if (fieldType == null) {
            // Column is absent from the schema: emit an optional BINARY placeholder with its name.
            schemaTypes.add(Types.optional(PrimitiveTypeName.BINARY).named(colName));
        } else {
            schemaTypes.add(getProjectedType(colType, fieldType));
        }
    }
    return schemaTypes;
}
Also used : GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ListTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)

Example 42 with Type

use of org.apache.parquet.schema.Type in project hive by apache.

the class HiveParquetSchemaTestUtils method testConversion.

/**
 * Converts the given Hive column names and types to a Parquet schema and asserts that the
 * result equals the schema parsed from {@code actualSchema}, then compares the logical type
 * annotation of every top-level field.
 *
 * @param columnNamesStr Hive column names, in the form accepted by createHiveColumnsFrom.
 * @param columnsTypeStr Hive column types, in the form accepted by createHiveTypeInfoFrom.
 * @param actualSchema textual Parquet message type the conversion is expected to produce.
 * @param conf configuration handed to the schema converter.
 * @throws Exception if the conversion or the schema parsing fails.
 */
public static void testConversion(final String columnNamesStr, final String columnsTypeStr, final String actualSchema, final Configuration conf) throws Exception {
    final List<String> hiveNames = createHiveColumnsFrom(columnNamesStr);
    final List<TypeInfo> hiveTypes = createHiveTypeInfoFrom(columnsTypeStr);
    final MessageType converted = HiveSchemaConverter.convert(hiveNames, hiveTypes, conf);
    final MessageType parsed = MessageTypeParser.parseMessageType(actualSchema);

    final String failureMessage = "converting " + columnNamesStr + ": " + columnsTypeStr + " to " + actualSchema;
    assertEquals(failureMessage, parsed, converted);

    // The annotations are compared field by field because, per the original author's note,
    // PrimitiveType.equals does not take them into account.
    final List<Type> parsedFields = parsed.getFields();
    final List<Type> convertedFields = converted.getFields();
    int position = 0;
    for (final Type parsedField : parsedFields) {
        final LogicalTypeAnnotation wanted = parsedField.getLogicalTypeAnnotation();
        final LogicalTypeAnnotation found = convertedFields.get(position).getLogicalTypeAnnotation();
        assertEquals("Logical type annotations of the field do not match", wanted, found);
        position++;
    }
}
Also used : MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) LogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)

Example 43 with Type

use of org.apache.parquet.schema.Type in project hive by apache.

the class HiveParquetSchemaTestUtils method testLogicalTypeAnnotations.

/**
 * Converts the given Hive columns to a Parquet schema and checks the logical type annotation
 * of every top-level field against {@code expectedLogicalTypes}.
 *
 * <p>A field whose name maps to a non-null entry must carry an equal annotation; a field with
 * no entry (or a null entry) must carry no annotation at all.
 *
 * @param hiveColumnNames Hive column names, in the form accepted by createHiveColumnsFrom.
 * @param hiveColumnTypes Hive column types, in the form accepted by createHiveTypeInfoFrom.
 * @param expectedLogicalTypes expected annotation per Parquet field name.
 * @param conf configuration handed to the schema converter.
 * @throws Exception if the conversion fails.
 */
public static void testLogicalTypeAnnotations(final String hiveColumnNames, final String hiveColumnTypes, final Map<String, LogicalTypeAnnotation> expectedLogicalTypes, Configuration conf) throws Exception {
    final List<String> names = createHiveColumnsFrom(hiveColumnNames);
    final List<TypeInfo> types = createHiveTypeInfoFrom(hiveColumnTypes);
    final MessageType converted = HiveSchemaConverter.convert(names, types, conf);

    for (Type field : converted.getFields()) {
        LogicalTypeAnnotation wanted = expectedLogicalTypes.get(field.getName());
        LogicalTypeAnnotation found = field.getLogicalTypeAnnotation();
        if (wanted == null) {
            // Nothing expected for this field, so it must not be annotated.
            assertNull("The logical type annotation must be null.", found);
            continue;
        }
        assertNotNull("The logical type annotation cannot be null.", found);
        assertEquals("Logical type annotations of the field do not match", wanted, found);
    }
}
Also used : MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) LogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)

Example 44 with Type

use of org.apache.parquet.schema.Type in project hive by apache.

the class TestHiveSchemaConverter method testListOriginalType.

/**
 * Checks the three-level Parquet structure generated for a Hive {@code array<tinyint>} column:
 * an optional list-annotated group, a repeated "bag" group without annotation, and an optional
 * "array_element" annotated as a signed 8-bit integer.
 */
@Test
public void testListOriginalType() throws Exception {
    final MessageType schema = createSchema("array<tinyint>", "arrayCol");
    final int singleField = 1;

    // Level 1: the column itself, annotated as a list.
    assertEquals(singleField, schema.getFieldCount());
    Type listField = schema.getFields().get(0);
    checkField(listField, "arrayCol", Repetition.OPTIONAL, LogicalTypeAnnotation.listType());

    // Level 2: the repeated wrapper group, which carries no annotation.
    assertEquals(singleField, listField.asGroupType().getFieldCount());
    Type bagField = listField.asGroupType().getFields().get(0);
    checkField(bagField, "bag", Repetition.REPEATED, null);

    // Level 3: the element, annotated as a signed 8-bit integer.
    assertEquals(singleField, bagField.asGroupType().getFieldCount());
    Type elementField = bagField.asGroupType().getFields().get(0);
    checkField(elementField, "array_element", Repetition.OPTIONAL, LogicalTypeAnnotation.intType(8, true));
}
Also used : MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) Test(org.junit.Test)

Example 45 with Type

use of org.apache.parquet.schema.Type in project Gaffer by gchq.

the class GafferElementConverter method buildFieldToConverter.

/**
 * Builds the lookup from a top-level field's position in {@code schema} to the converter that
 * handles it: a PrimitiveConverter for primitive fields and a BypassGroupConverter for group
 * fields. Each converter is given a single-element path holding the field's name.
 *
 * @param schema message type whose top-level fields are mapped.
 * @return map from field position (0-based, in schema order) to its converter.
 */
private Map<Integer, Converter> buildFieldToConverter(final MessageType schema) {
    final Map<Integer, Converter> convertersByPosition = new HashMap<>(fieldCount);
    int position = 0;
    for (final Type field : schema.getFields()) {
        final String[] path = new String[] { field.getName() };
        final Converter converter;
        if (!field.isPrimitive()) {
            converter = new BypassGroupConverter(parquetColumnToObject, field.asGroupType(), path);
        } else {
            // The primitive converter is told the simple name of the Java type backing the column.
            final String javaTypeName = field.asPrimitiveType().getPrimitiveTypeName().javaType.getSimpleName();
            converter = new PrimitiveConverter(parquetColumnToObject, javaTypeName, path, field.getOriginalType());
        }
        convertersByPosition.put(position, converter);
        position++;
    }
    return convertersByPosition;
}
Also used : MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) HashMap(java.util.HashMap) Converter(org.apache.parquet.io.api.Converter) GroupConverter(org.apache.parquet.io.api.GroupConverter) GafferGroupObjectConverter(uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter)

Aggregations

Type (org.apache.parquet.schema.Type)88 MessageType (org.apache.parquet.schema.MessageType)72 GroupType (org.apache.parquet.schema.GroupType)69 OriginalType (org.apache.parquet.schema.OriginalType)35 PrimitiveType (org.apache.parquet.schema.PrimitiveType)35 ArrayList (java.util.ArrayList)25 HashMap (java.util.HashMap)10 SchemaPath (org.apache.drill.common.expression.SchemaPath)10 TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)10 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)10 PathSegment (org.apache.drill.common.expression.PathSegment)8 Converter (org.apache.parquet.io.api.Converter)6 GroupConverter (org.apache.parquet.io.api.GroupConverter)6 MinorType (org.apache.drill.common.types.TypeProtos.MinorType)5 MaterializedField (org.apache.drill.exec.record.MaterializedField)5 LogicalTypeAnnotation (org.apache.parquet.schema.LogicalTypeAnnotation)5 Collection (java.util.Collection)4 List (java.util.List)4 Function (java.util.function.Function)4 LogicalType (org.apache.avro.LogicalType)4