Search in sources :

Example 21 with DataType

use of org.apache.nifi.serialization.record.DataType in project nifi by apache.

the class AvroTypeUtil method createSchema.

/**
 * Builds a {@link RecordSchema} whose fields mirror those of the supplied Avro schema.
 * <p>
 * The returned schema is registered in the known-record map under
 * {@code namespace + "." + name} before any field is converted, and that map is
 * handed to {@code determineDataType} for every field.
 *
 * @param avroSchema the Avro Schema to convert; must not be {@code null}
 * @param schemaText the textual representation of the schema
 * @param schemaId the identifier of the schema
 * @return the corresponding Record Schema
 * @throws IllegalArgumentException if {@code avroSchema} is {@code null}
 */
public static RecordSchema createSchema(final Schema avroSchema, final String schemaText, final SchemaIdentifier schemaId) {
    if (avroSchema == null) {
        throw new IllegalArgumentException("Avro Schema cannot be null");
    }

    final String fullName = avroSchema.getNamespace() + "." + avroSchema.getName();
    final SimpleRecordSchema result = new SimpleRecordSchema(schemaText, AVRO_SCHEMA_FORMAT, schemaId);

    // Seed the lookup table with the top-level record so that determineDataType
    // can find it by its full name while the fields are being converted.
    final Map<String, DataType> seenRecords = new HashMap<>();
    seenRecords.put(fullName, RecordFieldType.RECORD.getRecordDataType(result));

    final List<Field> avroFields = avroSchema.getFields();
    final List<RecordField> converted = new ArrayList<>(avroFields.size());
    for (final Field avroField : avroFields) {
        final Schema avroFieldSchema = avroField.schema();
        final DataType convertedType = AvroTypeUtil.determineDataType(avroFieldSchema, seenRecords);
        addFieldToList(converted, avroField, avroField.name(), avroFieldSchema, convertedType, isNullable(avroFieldSchema));
    }

    // Fields are attached only after all of them have been converted.
    result.setFields(converted);
    return result;
}
Also used : SimpleRecordSchema(org.apache.nifi.serialization.SimpleRecordSchema) Field(org.apache.avro.Schema.Field) RecordField(org.apache.nifi.serialization.record.RecordField) RecordField(org.apache.nifi.serialization.record.RecordField) HashMap(java.util.HashMap) RecordSchema(org.apache.nifi.serialization.record.RecordSchema) Schema(org.apache.avro.Schema) SimpleRecordSchema(org.apache.nifi.serialization.SimpleRecordSchema) ArrayList(java.util.ArrayList) DataType(org.apache.nifi.serialization.record.DataType) ChoiceDataType(org.apache.nifi.serialization.record.type.ChoiceDataType) MapDataType(org.apache.nifi.serialization.record.type.MapDataType) ArrayDataType(org.apache.nifi.serialization.record.type.ArrayDataType) RecordDataType(org.apache.nifi.serialization.record.type.RecordDataType)

Example 22 with DataType

use of org.apache.nifi.serialization.record.DataType in project nifi by apache.

the class AvroTypeUtil method convertAvroRecordToMap.

/**
 * Copies the values of an Avro record into a map keyed by the field names of the
 * given record schema, coercing each value to the type the record schema declares.
 * <p>
 * For every field of {@code recordSchema} the value is read from {@code avroRecord}
 * by the field's name; if that yields {@code null}, each alias is tried in turn until
 * a non-null value is found. The Avro field used for normalization is the one whose
 * name actually produced the value, so a value found through an alias is converted
 * rather than discarded. If the Avro schema has no field under the matched name,
 * {@code null} is stored for that field.
 *
 * @param avroRecord the Avro record to read values from
 * @param recordSchema the schema that determines which fields are extracted and their target types
 * @return a map from record-schema field name to the converted value (possibly {@code null})
 */
public static Map<String, Object> convertAvroRecordToMap(final GenericRecord avroRecord, final RecordSchema recordSchema) {
    final Map<String, Object> values = new HashMap<>(recordSchema.getFieldCount());
    for (final RecordField recordField : recordSchema.getFields()) {
        final String fieldName = recordField.getFieldName();

        // Track which Avro-side name supplied the value so that the matching Avro
        // field (and its schema) is used below, not always the primary name.
        String matchedName = fieldName;
        Object value = avroRecord.get(fieldName);
        if (value == null) {
            for (final String alias : recordField.getAliases()) {
                value = avroRecord.get(alias);
                if (value != null) {
                    matchedName = alias;
                    break;
                }
            }
        }

        try {
            final Field avroField = avroRecord.getSchema().getField(matchedName);
            if (avroField == null) {
                // Neither the name nor any alias resolved to an Avro field.
                values.put(fieldName, null);
                continue;
            }
            final Schema fieldSchema = avroField.schema();
            final Object rawValue = normalizeValue(value, fieldSchema, fieldName);
            final DataType desiredType = recordField.getDataType();
            final Object coercedValue = DataTypeUtils.convertType(rawValue, desiredType, fieldName);
            values.put(fieldName, coercedValue);
        } catch (Exception ex) {
            // Log which field failed, then let the original exception propagate.
            logger.debug("fail to convert field " + fieldName, ex);
            throw ex;
        }
    }
    return values;
}
Also used : Field(org.apache.avro.Schema.Field) RecordField(org.apache.nifi.serialization.record.RecordField) RecordField(org.apache.nifi.serialization.record.RecordField) HashMap(java.util.HashMap) RecordSchema(org.apache.nifi.serialization.record.RecordSchema) Schema(org.apache.avro.Schema) SimpleRecordSchema(org.apache.nifi.serialization.SimpleRecordSchema) DataType(org.apache.nifi.serialization.record.DataType) ChoiceDataType(org.apache.nifi.serialization.record.type.ChoiceDataType) MapDataType(org.apache.nifi.serialization.record.type.MapDataType) ArrayDataType(org.apache.nifi.serialization.record.type.ArrayDataType) RecordDataType(org.apache.nifi.serialization.record.type.RecordDataType) IllegalTypeConversionException(org.apache.nifi.serialization.record.util.IllegalTypeConversionException) IOException(java.io.IOException)

Example 23 with DataType

use of org.apache.nifi.serialization.record.DataType in project nifi by apache.

the class AvroTypeUtil method buildAvroSchema.

/**
 * Produces the Avro {@link Schema} that corresponds to a record {@link DataType}.
 * <p>
 * Container types (array, map, choice) recurse into their element/value/option types
 * with {@code nullable == false}; record types build one Avro field per child field.
 *
 * @param dataType the record data type to translate
 * @param fieldName the name of the field being translated; used to name generated record schemas
 * @param nullable whether the resulting schema should be passed through {@code nullable(...)}
 * @return the Avro schema, or {@code null} if the field type is not handled by this method
 */
private static Schema buildAvroSchema(final DataType dataType, final String fieldName, final boolean nullable) {
    final Schema result;
    switch (dataType.getFieldType()) {
        case ARRAY: {
            final DataType elementDataType = ((ArrayDataType) dataType).getElementType();
            if (RecordFieldType.BYTE.equals(elementDataType.getFieldType())) {
                // An array of bytes is represented with Avro's BYTES type.
                result = Schema.create(Type.BYTES);
            } else {
                result = Schema.createArray(buildAvroSchema(elementDataType, fieldName, false));
            }
            break;
        }
        case BIGINT:
        case CHAR:
        case STRING:
            result = Schema.create(Type.STRING);
            break;
        case BOOLEAN:
            result = Schema.create(Type.BOOLEAN);
            break;
        case BYTE:
        case SHORT:
        case INT:
            result = Schema.create(Type.INT);
            break;
        case CHOICE: {
            final List<DataType> subTypes = ((ChoiceDataType) dataType).getPossibleSubTypes();
            // A union cannot hold two members with the same Avro type, even when their
            // logical types differ (e.g. a plain int and a date, which is also backed by int):
            // creating such a union throws. Remember which Avro types are already present
            // and keep only the first schema seen for each.
            final List<Schema> members = new ArrayList<>(subTypes.size());
            final Set<Type> seenTypes = new HashSet<>();
            for (final DataType subType : subTypes) {
                final Schema candidate = buildAvroSchema(subType, fieldName, false);
                if (seenTypes.add(candidate.getType())) {
                    members.add(candidate);
                }
            }
            result = Schema.createUnion(members);
            break;
        }
        case DATE:
            result = Schema.create(Type.INT);
            LogicalTypes.date().addToSchema(result);
            break;
        case DOUBLE:
            result = Schema.create(Type.DOUBLE);
            break;
        case FLOAT:
            result = Schema.create(Type.FLOAT);
            break;
        case LONG:
            result = Schema.create(Type.LONG);
            break;
        case MAP: {
            final DataType valueDataType = ((MapDataType) dataType).getValueType();
            result = Schema.createMap(buildAvroSchema(valueDataType, fieldName, false));
            break;
        }
        case RECORD: {
            final RecordSchema nestedSchema = ((RecordDataType) dataType).getChildSchema();
            final List<Field> avroFields = new ArrayList<>(nestedSchema.getFieldCount());
            for (final RecordField nestedField : nestedSchema.getFields()) {
                avroFields.add(buildAvroField(nestedField));
            }
            // The generated record is named after the field that holds it.
            result = Schema.createRecord(fieldName + "Type", null, "org.apache.nifi", false, avroFields);
            break;
        }
        case TIME:
            result = Schema.create(Type.INT);
            LogicalTypes.timeMillis().addToSchema(result);
            break;
        case TIMESTAMP:
            result = Schema.create(Type.LONG);
            LogicalTypes.timestampMillis().addToSchema(result);
            break;
        default:
            return null;
    }

    return nullable ? nullable(result) : result;
}
Also used : RecordField(org.apache.nifi.serialization.record.RecordField) RecordSchema(org.apache.nifi.serialization.record.RecordSchema) Schema(org.apache.avro.Schema) SimpleRecordSchema(org.apache.nifi.serialization.SimpleRecordSchema) ArrayList(java.util.ArrayList) RecordDataType(org.apache.nifi.serialization.record.type.RecordDataType) Field(org.apache.avro.Schema.Field) RecordField(org.apache.nifi.serialization.record.RecordField) DataType(org.apache.nifi.serialization.record.DataType) ChoiceDataType(org.apache.nifi.serialization.record.type.ChoiceDataType) MapDataType(org.apache.nifi.serialization.record.type.MapDataType) RecordFieldType(org.apache.nifi.serialization.record.RecordFieldType) ArrayDataType(org.apache.nifi.serialization.record.type.ArrayDataType) Type(org.apache.avro.Schema.Type) LogicalType(org.apache.avro.LogicalType) RecordDataType(org.apache.nifi.serialization.record.type.RecordDataType) DataType(org.apache.nifi.serialization.record.DataType) ChoiceDataType(org.apache.nifi.serialization.record.type.ChoiceDataType) MapDataType(org.apache.nifi.serialization.record.type.MapDataType) ArrayDataType(org.apache.nifi.serialization.record.type.ArrayDataType) RecordDataType(org.apache.nifi.serialization.record.type.RecordDataType) ChoiceDataType(org.apache.nifi.serialization.record.type.ChoiceDataType) ArrayDataType(org.apache.nifi.serialization.record.type.ArrayDataType) RecordSchema(org.apache.nifi.serialization.record.RecordSchema) SimpleRecordSchema(org.apache.nifi.serialization.SimpleRecordSchema) HashSet(java.util.HashSet)

Example 24 with DataType

use of org.apache.nifi.serialization.record.DataType in project nifi by apache.

the class TestAvroTypeUtil method testCreateAvroSchemaPrimitiveTypes.

/**
 * Round-trips a record schema through {@code AvroTypeUtil.extractAvroSchema} and
 * {@code AvroTypeUtil.createSchema} and checks the resulting field types, plus the
 * default value and aliases of the "string" field.
 */
@Test
public void testCreateAvroSchemaPrimitiveTypes() throws SchemaNotFoundException {
    // Simple scalar fields.
    final List<RecordField> topLevelFields = new ArrayList<>();
    topLevelFields.add(new RecordField("int", RecordFieldType.INT.getDataType()));
    topLevelFields.add(new RecordField("long", RecordFieldType.LONG.getDataType()));
    topLevelFields.add(new RecordField("string", RecordFieldType.STRING.getDataType(), "hola", Collections.singleton("greeting")));
    topLevelFields.add(new RecordField("byte", RecordFieldType.BYTE.getDataType()));
    topLevelFields.add(new RecordField("char", RecordFieldType.CHAR.getDataType()));
    topLevelFields.add(new RecordField("short", RecordFieldType.SHORT.getDataType()));
    topLevelFields.add(new RecordField("double", RecordFieldType.DOUBLE.getDataType()));
    topLevelFields.add(new RecordField("float", RecordFieldType.FLOAT.getDataType()));
    topLevelFields.add(new RecordField("time", RecordFieldType.TIME.getDataType()));
    topLevelFields.add(new RecordField("date", RecordFieldType.DATE.getDataType()));
    topLevelFields.add(new RecordField("timestamp", RecordFieldType.TIMESTAMP.getDataType()));

    // Container fields: array of strings and map of longs.
    final DataType stringArrayType = RecordFieldType.ARRAY.getArrayDataType(RecordFieldType.STRING.getDataType());
    topLevelFields.add(new RecordField("strings", stringArrayType));
    final DataType longMapType = RecordFieldType.MAP.getMapDataType(RecordFieldType.LONG.getDataType());
    topLevelFields.add(new RecordField("map", longMapType));

    // Nested record field.
    final List<RecordField> nestedFields = new ArrayList<>();
    nestedFields.add(new RecordField("name", RecordFieldType.STRING.getDataType()));
    nestedFields.add(new RecordField("dob", RecordFieldType.DATE.getDataType()));
    final RecordSchema nestedSchema = new SimpleRecordSchema(nestedFields);
    final DataType nestedRecordType = RecordFieldType.RECORD.getRecordDataType(nestedSchema);
    topLevelFields.add(new RecordField("person", nestedRecordType));

    final RecordSchema original = new SimpleRecordSchema(topLevelFields);
    final Schema generatedAvro = AvroTypeUtil.extractAvroSchema(original);

    // Each generated field is expected to be a union that includes NULL.
    for (final Field avroField : generatedAvro.getFields()) {
        final Schema avroFieldSchema = avroField.schema();
        assertEquals(Type.UNION, avroFieldSchema.getType());
        assertTrue("Field " + avroField.name() + " does not contain NULL type", avroFieldSchema.getTypes().contains(Schema.create(Type.NULL)));
    }

    // Convert back; byte and short are expected as INT, char as STRING.
    final RecordSchema roundTripped = AvroTypeUtil.createSchema(generatedAvro);
    assertEquals(RecordFieldType.INT.getDataType(), roundTripped.getDataType("int").get());
    assertEquals(RecordFieldType.LONG.getDataType(), roundTripped.getDataType("long").get());
    assertEquals(RecordFieldType.STRING.getDataType(), roundTripped.getDataType("string").get());
    assertEquals(RecordFieldType.INT.getDataType(), roundTripped.getDataType("byte").get());
    assertEquals(RecordFieldType.STRING.getDataType(), roundTripped.getDataType("char").get());
    assertEquals(RecordFieldType.INT.getDataType(), roundTripped.getDataType("short").get());
    assertEquals(RecordFieldType.DOUBLE.getDataType(), roundTripped.getDataType("double").get());
    assertEquals(RecordFieldType.FLOAT.getDataType(), roundTripped.getDataType("float").get());
    assertEquals(RecordFieldType.TIME.getDataType(), roundTripped.getDataType("time").get());
    assertEquals(RecordFieldType.DATE.getDataType(), roundTripped.getDataType("date").get());
    assertEquals(RecordFieldType.TIMESTAMP.getDataType(), roundTripped.getDataType("timestamp").get());
    assertEquals(stringArrayType, roundTripped.getDataType("strings").get());
    assertEquals(longMapType, roundTripped.getDataType("map").get());
    assertEquals(nestedRecordType, roundTripped.getDataType("person").get());

    // Default value and aliases must survive the round trip.
    final RecordField roundTrippedString = roundTripped.getField("string").get();
    assertEquals("hola", roundTrippedString.getDefaultValue());
    assertEquals(Collections.singleton("greeting"), roundTrippedString.getAliases());
}
Also used : SimpleRecordSchema(org.apache.nifi.serialization.SimpleRecordSchema) Field(org.apache.avro.Schema.Field) RecordField(org.apache.nifi.serialization.record.RecordField) RecordField(org.apache.nifi.serialization.record.RecordField) RecordSchema(org.apache.nifi.serialization.record.RecordSchema) Schema(org.apache.avro.Schema) SimpleRecordSchema(org.apache.nifi.serialization.SimpleRecordSchema) ArrayList(java.util.ArrayList) DataType(org.apache.nifi.serialization.record.DataType) RecordDataType(org.apache.nifi.serialization.record.type.RecordDataType) RecordSchema(org.apache.nifi.serialization.record.RecordSchema) SimpleRecordSchema(org.apache.nifi.serialization.SimpleRecordSchema) Test(org.junit.Test)

Example 25 with DataType

use of org.apache.nifi.serialization.record.DataType in project nifi by apache.

the class AvroTypeUtil method determineDataType.

/**
 * Maps an Avro {@link Schema} to the record {@link DataType} used to represent it.
 * <p>
 * Recognized logical types are resolved first; otherwise the mapping is driven by the
 * Avro type. Record types are looked up in, and added to, {@code knownRecordTypes}
 * under the key {@code namespace + "." + name}; a new record type is added to the map
 * before its fields are converted, and the map is passed down on every recursive call.
 *
 * @param avroSchema the Avro schema to translate
 * @param knownRecordTypes record types already resolved, keyed by full name; must not be {@code null}
 *                         and may be modified by this method
 * @return the matching data type, or {@code null} if the Avro type is not handled
 * @throws IllegalArgumentException if {@code knownRecordTypes} is {@code null}
 */
public static DataType determineDataType(final Schema avroSchema, Map<String, DataType> knownRecordTypes) {
    if (knownRecordTypes == null) {
        throw new IllegalArgumentException("'knownRecordTypes' cannot be null.");
    }

    final Type avroType = avroSchema.getType();
    final LogicalType logical = avroSchema.getLogicalType();
    if (logical != null) {
        // Unrecognized logical type names fall through to the plain Avro type below.
        switch (logical.getName()) {
            case LOGICAL_TYPE_DATE:
                return RecordFieldType.DATE.getDataType();
            case LOGICAL_TYPE_TIME_MILLIS:
            case LOGICAL_TYPE_TIME_MICROS:
                return RecordFieldType.TIME.getDataType();
            case LOGICAL_TYPE_TIMESTAMP_MILLIS:
            case LOGICAL_TYPE_TIMESTAMP_MICROS:
                return RecordFieldType.TIMESTAMP.getDataType();
            case LOGICAL_TYPE_DECIMAL:
                // A String would also be possible, but a numeric type is generally preferred by users.
                return RecordFieldType.DOUBLE.getDataType();
        }
    }

    switch (avroType) {
        case ARRAY: {
            final DataType elementType = determineDataType(avroSchema.getElementType(), knownRecordTypes);
            return RecordFieldType.ARRAY.getArrayDataType(elementType);
        }
        case BYTES:
        case FIXED:
            return RecordFieldType.ARRAY.getArrayDataType(RecordFieldType.BYTE.getDataType());
        case BOOLEAN:
            return RecordFieldType.BOOLEAN.getDataType();
        case DOUBLE:
            return RecordFieldType.DOUBLE.getDataType();
        case ENUM:
        case STRING:
        case NULL:
            return RecordFieldType.STRING.getDataType();
        case FLOAT:
            return RecordFieldType.FLOAT.getDataType();
        case INT:
            return RecordFieldType.INT.getDataType();
        case LONG:
            return RecordFieldType.LONG.getDataType();
        case RECORD: {
            final String fullName = avroSchema.getNamespace() + "." + avroSchema.getName();
            if (knownRecordTypes.containsKey(fullName)) {
                return knownRecordTypes.get(fullName);
            }

            final SimpleRecordSchema nestedSchema = new SimpleRecordSchema(avroSchema.toString(), AVRO_SCHEMA_FORMAT, SchemaIdentifier.EMPTY);
            final DataType nestedType = RecordFieldType.RECORD.getRecordDataType(nestedSchema);
            // Register before descending so that a field referring back to this record
            // is answered from the map instead of being expanded again.
            knownRecordTypes.put(fullName, nestedType);

            final List<Field> sourceFields = avroSchema.getFields();
            final List<RecordField> converted = new ArrayList<>(sourceFields.size());
            for (final Field sourceField : sourceFields) {
                final Schema sourceFieldSchema = sourceField.schema();
                final DataType convertedType = determineDataType(sourceFieldSchema, knownRecordTypes);
                addFieldToList(converted, sourceField, sourceField.name(), sourceFieldSchema, convertedType, isNullable(sourceFieldSchema));
            }
            nestedSchema.setFields(converted);
            return nestedType;
        }
        case MAP: {
            final DataType mapValueType = determineDataType(avroSchema.getValueType(), knownRecordTypes);
            return RecordFieldType.MAP.getMapDataType(mapValueType);
        }
        case UNION: {
            final List<Schema> candidates = getNonNullSubSchemas(avroSchema);
            if (candidates.size() == 1) {
                // A union with a single non-null member is treated as that member's type.
                return determineDataType(candidates.get(0), knownRecordTypes);
            }

            final List<DataType> choices = new ArrayList<>(candidates.size());
            for (final Schema candidate : candidates) {
                choices.add(determineDataType(candidate, knownRecordTypes));
            }
            return RecordFieldType.CHOICE.getChoiceDataType(choices);
        }
    }

    return null;
}
Also used : SimpleRecordSchema(org.apache.nifi.serialization.SimpleRecordSchema) Field(org.apache.avro.Schema.Field) RecordField(org.apache.nifi.serialization.record.RecordField) DataType(org.apache.nifi.serialization.record.DataType) ChoiceDataType(org.apache.nifi.serialization.record.type.ChoiceDataType) MapDataType(org.apache.nifi.serialization.record.type.MapDataType) RecordFieldType(org.apache.nifi.serialization.record.RecordFieldType) ArrayDataType(org.apache.nifi.serialization.record.type.ArrayDataType) Type(org.apache.avro.Schema.Type) LogicalType(org.apache.avro.LogicalType) RecordDataType(org.apache.nifi.serialization.record.type.RecordDataType) RecordSchema(org.apache.nifi.serialization.record.RecordSchema) Schema(org.apache.avro.Schema) SimpleRecordSchema(org.apache.nifi.serialization.SimpleRecordSchema) LogicalType(org.apache.avro.LogicalType) DataType(org.apache.nifi.serialization.record.DataType) ChoiceDataType(org.apache.nifi.serialization.record.type.ChoiceDataType) MapDataType(org.apache.nifi.serialization.record.type.MapDataType) ArrayDataType(org.apache.nifi.serialization.record.type.ArrayDataType) RecordDataType(org.apache.nifi.serialization.record.type.RecordDataType) List(java.util.List) ArrayList(java.util.ArrayList)

Aggregations

DataType (org.apache.nifi.serialization.record.DataType)45 RecordField (org.apache.nifi.serialization.record.RecordField)36 RecordSchema (org.apache.nifi.serialization.record.RecordSchema)27 ArrayDataType (org.apache.nifi.serialization.record.type.ArrayDataType)24 SimpleRecordSchema (org.apache.nifi.serialization.SimpleRecordSchema)22 RecordDataType (org.apache.nifi.serialization.record.type.RecordDataType)22 ChoiceDataType (org.apache.nifi.serialization.record.type.ChoiceDataType)21 MapDataType (org.apache.nifi.serialization.record.type.MapDataType)20 ArrayList (java.util.ArrayList)17 RecordFieldType (org.apache.nifi.serialization.record.RecordFieldType)17 HashMap (java.util.HashMap)15 Record (org.apache.nifi.serialization.record.Record)14 Map (java.util.Map)13 MapRecord (org.apache.nifi.serialization.record.MapRecord)13 Test (org.junit.Test)13 LinkedHashMap (java.util.LinkedHashMap)11 List (java.util.List)11 ComponentLog (org.apache.nifi.logging.ComponentLog)10 File (java.io.File)9 IOException (java.io.IOException)9