
Example 1 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project cdap by caskdata.

The class SchemaTypeAdapter, method getFields.

/**
 * Get the list of {@link Schema.Field} associated with the current RECORD.
 * @param recordName the name of the RECORD whose fields are to be returned
 * @param reader the reader used to read the record
 * @param knownRecords record names already encountered during reading
 * @return the list of fields associated with the current record
 * @throws IOException if an error occurs while reading the JSON
 */
private List<Schema.Field> getFields(String recordName, JsonReader reader, Map<String, Schema> knownRecords) throws IOException {
    knownRecords.put(recordName, null);
    List<Schema.Field> fieldBuilder = new ArrayList<>();
    reader.beginArray();
    while (reader.peek() != JsonToken.END_ARRAY) {
        reader.beginObject();
        String fieldName = null;
        Schema innerSchema = null;
        while (reader.hasNext()) {
            String name = reader.nextName();
            switch(name) {
                case NAME:
                    fieldName = reader.nextString();
                    break;
                case TYPE:
                    innerSchema = read(reader, knownRecords);
                    break;
                default:
                    reader.skipValue();
            }
        }
        fieldBuilder.add(Schema.Field.of(fieldName, innerSchema));
        reader.endObject();
    }
    reader.endArray();
    return fieldBuilder;
}
Also used: Schema (io.cdap.cdap.api.data.schema.Schema), ArrayList (java.util.ArrayList)
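
A minimal usage sketch (assuming Schema.parseJson as the public entry point that drives SchemaTypeAdapter): the "fields" array below is exactly what getFields consumes, one NAME/TYPE pair per field.

import io.cdap.cdap.api.data.schema.Schema;
import java.io.IOException;

public class GetFieldsSketch {
    public static void main(String[] args) throws IOException {
        // Schema.parseJson is the assumed entry point; each object in the
        // "fields" array yields one Schema.Field via its NAME and TYPE keys.
        Schema schema = Schema.parseJson(
            "{\"type\":\"record\",\"name\":\"user\",\"fields\":["
            + "{\"name\":\"id\",\"type\":\"long\"},"
            + "{\"name\":\"email\",\"type\":[\"string\",\"null\"]}]}");
        // The union with "null" surfaces as a nullable field schema.
        System.out.println(schema.getField("email").getSchema().isNullable()); // true
    }
}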

Example 2 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project cdap by caskdata.

The class SchemaTypeAdapter, method readObject.

/**
 * Read a JSON object and return the Schema corresponding to it.
 * @param reader JsonReader used to read the JSON object
 * @param knownRecords map of record names already encountered during reading
 * @return Schema reflecting the JSON
 * @throws IOException if an error occurs while reading the JSON
 */
private Schema readObject(JsonReader reader, Map<String, Schema> knownRecords) throws IOException {
    reader.beginObject();
    // Type of the schema
    Schema.Type schemaType = null;
    // Logical Type of the schema
    Schema.LogicalType logicalType = null;
    // Name of the element
    String elementName = null;
    // Store enum values for ENUM type
    List<String> enumValues = new ArrayList<>();
    // Store schema for key and value for MAP type
    Schema keys = null;
    Schema values = null;
    // List of fields for RECORD type
    List<Schema.Field> fields = null;
    // List of items for ARRAY type
    Schema items = null;
    int precision = 0;
    int scale = 0;
    // For RECORD type, fields will be populated
    while (reader.hasNext()) {
        String name = reader.nextName();
        switch(name) {
            case LOGICAL_TYPE:
                logicalType = Schema.LogicalType.fromToken(reader.nextString());
                break;
            case PRECISION:
                precision = Integer.parseInt(reader.nextString());
                break;
            case SCALE:
                scale = Integer.parseInt(reader.nextString());
                break;
            case TYPE:
                schemaType = Schema.Type.valueOf(reader.nextString().toUpperCase());
                break;
            case NAME:
                elementName = reader.nextString();
                if (schemaType == Schema.Type.RECORD) {
                    /*
              Put a null schema in the map for the recursive references.
              For example, if we are looking at the outer 'node' reference in the example below, we
              add the record name in the knownRecords map, so that when we get to the inner 'node'
              reference, we know that it's a record type and not a Schema.Type.
              {
                "type": "record",
                "name": "node",
                "fields": [{
                  "name": "children",
                  "type": [{
                    "type": "array",
                    "items": ["node", "null"]
                  }, "null"]
                },
                {
                  "name": "data",
                  "type": "int"
                }]
              }
              Full schema corresponding to this RECORD will be put in knownRecords once the fields in the
              RECORD are explored.
            */
                    knownRecords.put(elementName, null);
                }
                break;
            case SYMBOLS:
                enumValues = readEnum(reader);
                break;
            case ITEMS:
                items = read(reader, knownRecords);
                break;
            case KEYS:
                keys = read(reader, knownRecords);
                break;
            case VALUES:
                values = read(reader, knownRecords);
                break;
            case FIELDS:
                fields = getFields(name, reader, knownRecords);
                knownRecords.put(elementName, Schema.recordOf(elementName, fields));
                break;
            default:
                reader.skipValue();
                break;
        }
    }
    reader.endObject();
    if (schemaType == null) {
        throw new IllegalStateException("Schema type cannot be null.");
    }
    if (logicalType != null) {
        if (logicalType == Schema.LogicalType.DECIMAL) {
            try {
                return Schema.decimalOf(precision, scale);
            } catch (IllegalArgumentException e) {
                throw new IOException("Decimal type must contain a positive precision value.");
            }
        }
        return Schema.of(logicalType);
    }
    Schema schema;
    switch(schemaType) {
        case ARRAY:
            schema = Schema.arrayOf(items);
            break;
        case ENUM:
            schema = Schema.enumWith(enumValues);
            break;
        case MAP:
            schema = Schema.mapOf(keys, values);
            break;
        case RECORD:
            schema = Schema.recordOf(elementName, fields);
            break;
        default:
            schema = Schema.of(schemaType);
            break;
    }
    return schema;
}
Also used: Schema (io.cdap.cdap.api.data.schema.Schema), ArrayList (java.util.ArrayList), IOException (java.io.IOException)
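
A short sketch of the recursion handling described in the comment above (again assuming Schema.parseJson delegates to readObject): the inner "node" reference resolves against the name registered in knownRecords before the record's fields are explored.

import io.cdap.cdap.api.data.schema.Schema;
import java.io.IOException;

public class RecursiveRecordSketch {
    public static void main(String[] args) throws IOException {
        // The recursive "node" schema from the comment: "children" is a
        // nullable array whose items refer back to "node" by name.
        Schema node = Schema.parseJson(
            "{\"type\":\"record\",\"name\":\"node\",\"fields\":["
            + "{\"name\":\"children\",\"type\":[{\"type\":\"array\","
            + "\"items\":[\"node\",\"null\"]},\"null\"]},"
            + "{\"name\":\"data\",\"type\":\"int\"}]}");
        System.out.println(node.getRecordName()); // node
    }
}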

Example 3 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project cdap by caskdata.

The class SQLSchemaParser, method parseMap.

// <type,type>
private Schema parseMap() throws IOException {
    expectChar('<', "map must be followed by a '<'");
    skipWhitespace();
    Schema keyType = parseType();
    // key and value must be separated by a comma
    advancePastComma("Expected a comma separating map key and value types");
    Schema valueType = parseType();
    skipWhitespace();
    expectChar('>', "map must end with a '>'");
    return Schema.mapOf(keyType, valueType);
}
Also used: Schema (io.cdap.cdap.api.data.schema.Schema)
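
A minimal sketch, assuming Schema.parseSQL is the public entry point backed by SQLSchemaParser: parseMap handles the <type,type> portion after the map keyword.

import io.cdap.cdap.api.data.schema.Schema;
import java.io.IOException;

public class ParseMapSketch {
    public static void main(String[] args) throws IOException {
        // Schema.parseSQL is the assumed entry point. "map<string, int>"
        // exercises parseMap: '<', key type, comma, value type, '>'.
        Schema record = Schema.parseSQL("attrs map<string, int>");
        Schema mapSchema = record.getField("attrs").getSchema();
        System.out.println(mapSchema.getMapSchema().getKey()); // the string key schema
    }
}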

Example 4 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project cdap by caskdata.

The class SQLSchemaParser, method parseArray.

// <type>
private Schema parseArray() throws IOException {
    expectChar('<', "array must be followed by a '<'");
    skipWhitespace();
    Schema componentType = parseType();
    skipWhitespace();
    expectChar('>', "array must end with a '>'");
    return Schema.arrayOf(componentType);
}
Also used: Schema (io.cdap.cdap.api.data.schema.Schema)
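
The analogous sketch for parseArray, under the same Schema.parseSQL assumption:

import io.cdap.cdap.api.data.schema.Schema;
import java.io.IOException;

public class ParseArraySketch {
    public static void main(String[] args) throws IOException {
        // "array<string>" exercises parseArray: '<', component type, '>'.
        Schema record = Schema.parseSQL("tags array<string>");
        Schema arraySchema = record.getField("tags").getSchema();
        System.out.println(arraySchema.getComponentSchema()); // the string component schema
    }
}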

Example 5 with Schema

Use of io.cdap.cdap.api.data.schema.Schema in project cdap by caskdata.

The class DataFrames, method dataTypeToSchema.

/**
 * Converts a Spark {@link DataType} to a {@link Schema} object.
 *
 * @param dataType the data type to convert from
 * @param recordCounter tracks the number of record schemas created; used only for record name generation
 * @return a new {@link Schema}.
 */
private static Schema dataTypeToSchema(DataType dataType, int[] recordCounter) {
    if (dataType.equals(DataTypes.NullType)) {
        return Schema.of(Schema.Type.NULL);
    }
    if (dataType.equals(DataTypes.BooleanType)) {
        return Schema.of(Schema.Type.BOOLEAN);
    }
    if (dataType.equals(DataTypes.ByteType)) {
        return Schema.of(Schema.Type.INT);
    }
    if (dataType.equals(DataTypes.ShortType)) {
        return Schema.of(Schema.Type.INT);
    }
    if (dataType.equals(DataTypes.IntegerType)) {
        return Schema.of(Schema.Type.INT);
    }
    if (dataType.equals(DataTypes.LongType)) {
        return Schema.of(Schema.Type.LONG);
    }
    if (dataType.equals(DataTypes.FloatType)) {
        return Schema.of(Schema.Type.FLOAT);
    }
    if (dataType.equals(DataTypes.DoubleType)) {
        return Schema.of(Schema.Type.DOUBLE);
    }
    if (dataType.equals(DataTypes.BinaryType)) {
        return Schema.of(Schema.Type.BYTES);
    }
    if (dataType.equals(DataTypes.StringType)) {
        return Schema.of(Schema.Type.STRING);
    }
    if (dataType instanceof ArrayType) {
        ArrayType arrayType = (ArrayType) dataType;
        // Special case for byte array
        if (arrayType.elementType() == DataTypes.ByteType) {
            return Schema.of(Schema.Type.BYTES);
        }
        Schema componentSchema = dataTypeToSchema(arrayType.elementType(), recordCounter);
        return Schema.arrayOf(arrayType.containsNull() ? Schema.nullableOf(componentSchema) : componentSchema);
    }
    if (dataType instanceof MapType) {
        MapType mapType = (MapType) dataType;
        Schema valueSchema = dataTypeToSchema(mapType.valueType(), recordCounter);
        return Schema.mapOf(dataTypeToSchema(mapType.keyType(), recordCounter), mapType.valueContainsNull() ? Schema.nullableOf(valueSchema) : valueSchema);
    }
    if (dataType instanceof StructType) {
        List<Schema.Field> fields = new ArrayList<>();
        for (StructField structField : ((StructType) dataType).fields()) {
            Schema fieldSchema = dataTypeToSchema(structField.dataType(), recordCounter);
            fields.add(Schema.Field.of(structField.name(), structField.nullable() ? Schema.nullableOf(fieldSchema) : fieldSchema));
        }
        return Schema.recordOf("Record" + recordCounter[0]++, fields);
    }
    // Some special types in Spark SQL
    if (dataType.equals(DataTypes.TimestampType)) {
        return Schema.of(Schema.Type.LONG);
    }
    if (dataType.equals(DataTypes.DateType)) {
        return Schema.of(Schema.Type.LONG);
    }
    // The CalendarInterval type is not supported for now, as there is no equivalent in Schema
    throw new IllegalArgumentException("Unsupported data type: " + dataType.typeName());
}
Also used: ArrayType (org.apache.spark.sql.types.ArrayType), StructField (org.apache.spark.sql.types.StructField), StructType (org.apache.spark.sql.types.StructType), Schema (io.cdap.cdap.api.data.schema.Schema), ArrayList (java.util.ArrayList), MapType (org.apache.spark.sql.types.MapType)
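
A hedged sketch, assuming DataFrames.toSchema is the public wrapper around the private dataTypeToSchema shown above: a nullable Spark field maps to a nullable CDAP field.

import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.spark.sql.DataFrames;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class DataFramesSketch {
    public static void main(String[] args) {
        StructType struct = DataTypes.createStructType(new StructField[] {
            DataTypes.createStructField("id", DataTypes.LongType, false),
            DataTypes.createStructField("name", DataTypes.StringType, true)
        });
        // DataFrames.toSchema is the assumed public wrapper.
        Schema schema = DataFrames.toSchema(struct);
        System.out.println(schema.getField("name").getSchema().isNullable()); // true
    }
}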

Aggregations

Schema (io.cdap.cdap.api.data.schema.Schema): 362
Test (org.junit.Test): 208
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 165
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 75
Table (io.cdap.cdap.api.dataset.table.Table): 70
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 66
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 62
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 60
ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig): 53
HashSet (java.util.HashSet): 53
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 46
ArrayList (java.util.ArrayList): 46
KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 38
HashMap (java.util.HashMap): 37
Map (java.util.Map): 25
ReflectionSchemaGenerator (io.cdap.cdap.internal.io.ReflectionSchemaGenerator): 23
IOException (java.io.IOException): 23
FormatSpecification (io.cdap.cdap.api.data.format.FormatSpecification): 18
DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig): 14
List (java.util.List): 14