Search in sources:

Example 1 with MapType

Use of org.apache.spark.sql.types.MapType in project cdap by caskdata.

From the class DataFrames, method dataTypeToSchema.

/**
 * Converts a Spark {@link DataType} to a {@link Schema} object.
 *
 * @param dataType the data type to convert from
 * @param recordCounter tracks the number of record schemas created; used only for record name generation
 * @return a new {@link Schema}.
 */
private static Schema dataTypeToSchema(DataType dataType, int[] recordCounter) {
    if (dataType.equals(DataTypes.NullType)) {
        return Schema.of(Schema.Type.NULL);
    }
    if (dataType.equals(DataTypes.BooleanType)) {
        return Schema.of(Schema.Type.BOOLEAN);
    }
    if (dataType.equals(DataTypes.ByteType)) {
        return Schema.of(Schema.Type.INT);
    }
    if (dataType.equals(DataTypes.ShortType)) {
        return Schema.of(Schema.Type.INT);
    }
    if (dataType.equals(DataTypes.IntegerType)) {
        return Schema.of(Schema.Type.INT);
    }
    if (dataType.equals(DataTypes.LongType)) {
        return Schema.of(Schema.Type.LONG);
    }
    if (dataType.equals(DataTypes.FloatType)) {
        return Schema.of(Schema.Type.FLOAT);
    }
    if (dataType.equals(DataTypes.DoubleType)) {
        return Schema.of(Schema.Type.DOUBLE);
    }
    if (dataType.equals(DataTypes.BinaryType)) {
        return Schema.of(Schema.Type.BYTES);
    }
    if (dataType.equals(DataTypes.StringType)) {
        return Schema.of(Schema.Type.STRING);
    }
    if (dataType instanceof ArrayType) {
        ArrayType arrayType = (ArrayType) dataType;
        // Special case for byte array
        if (arrayType.elementType() == DataTypes.ByteType) {
            return Schema.of(Schema.Type.BYTES);
        }
        Schema componentSchema = dataTypeToSchema(arrayType.elementType(), recordCounter);
        return Schema.arrayOf(arrayType.containsNull() ? Schema.nullableOf(componentSchema) : componentSchema);
    }
    if (dataType instanceof MapType) {
        MapType mapType = (MapType) dataType;
        Schema valueSchema = dataTypeToSchema(mapType.valueType(), recordCounter);
        return Schema.mapOf(dataTypeToSchema(mapType.keyType(), recordCounter), mapType.valueContainsNull() ? Schema.nullableOf(valueSchema) : valueSchema);
    }
    if (dataType instanceof StructType) {
        List<Schema.Field> fields = new ArrayList<>();
        for (StructField structField : ((StructType) dataType).fields()) {
            Schema fieldSchema = dataTypeToSchema(structField.dataType(), recordCounter);
            fields.add(Schema.Field.of(structField.name(), structField.nullable() ? Schema.nullableOf(fieldSchema) : fieldSchema));
        }
        return Schema.recordOf("Record" + recordCounter[0]++, fields);
    }
    // Some special types in Spark SQL
    if (dataType.equals(DataTypes.TimestampType)) {
        return Schema.of(Schema.Type.LONG);
    }
    if (dataType.equals(DataTypes.DateType)) {
        return Schema.of(Schema.Type.LONG);
    }
    // The CalendarInterval type is not supported for now, as there is no equivalent in Schema
    throw new IllegalArgumentException("Unsupported data type: " + dataType.typeName());
}
Also used: ArrayType(org.apache.spark.sql.types.ArrayType) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) Schema(co.cask.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) MapType(org.apache.spark.sql.types.MapType)
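
For orientation, a minimal sketch of this conversion at its two interesting boundaries: nullable fields and the byte-array special case. It goes through the public DataFrames.toSchema(DataType) entry point used by the tests below, since dataTypeToSchema itself is private; passing a non-struct DataType straight to toSchema is an assumption of this sketch.

import co.cask.cdap.api.data.schema.Schema;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class DataTypeToSchemaSketch {
    public static void main(String[] args) {
        // A Spark struct with a non-nullable string and a nullable long.
        StructType struct = new StructType(new StructField[] {
            new StructField("id", DataTypes.StringType, false, Metadata.empty()),
            new StructField("ts", DataTypes.LongType, true, Metadata.empty())
        });
        Schema schema = DataFrames.toSchema(struct);
        // Nullable Spark fields surface as nullable CDAP schemas.
        System.out.println(schema.getField("ts").getSchema().isNullable()); // true
        // ArrayType(ByteType) is special-cased to BYTES rather than an array of INT.
        Schema bytes = DataFrames.toSchema(DataTypes.createArrayType(DataTypes.ByteType));
        System.out.println(bytes.getType()); // BYTES
    }
}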

Example 2 with MapType

Use of org.apache.spark.sql.types.MapType in project cdap by caskdata.

From the class DataFrames, method toRowValue.

/**
 * Converts an object value to a value type acceptable by {@link Row}.
 *
 * @param value the value to convert
 * @param dataType the target {@link DataType} of the value
 * @param path the current field path from the top; used only for error messages
 * @return an object that is compatible with Spark {@link Row}.
 */
private static Object toRowValue(@Nullable Object value, DataType dataType, String path) {
    if (value == null) {
        return null;
    }
    if (dataType.equals(DataTypes.NullType)) {
        return null;
    }
    if (dataType.equals(DataTypes.BooleanType)) {
        return value;
    }
    if (dataType.equals(DataTypes.ByteType)) {
        return value;
    }
    if (dataType.equals(DataTypes.ShortType)) {
        return value;
    }
    if (dataType.equals(DataTypes.IntegerType)) {
        return value;
    }
    if (dataType.equals(DataTypes.LongType)) {
        return value;
    }
    if (dataType.equals(DataTypes.FloatType)) {
        return value;
    }
    if (dataType.equals(DataTypes.DoubleType)) {
        return value;
    }
    if (dataType.equals(DataTypes.BinaryType)) {
        if (value instanceof ByteBuffer) {
            return Bytes.toBytes((ByteBuffer) value);
        }
        return value;
    }
    if (dataType.equals(DataTypes.StringType)) {
        return value;
    }
    if (dataType instanceof ArrayType) {
    @SuppressWarnings("unchecked") Collection<Object> collection;
        if (value instanceof Collection) {
            collection = (Collection<Object>) value;
        } else if (value.getClass().isArray()) {
            collection = Arrays.asList((Object[]) value);
        } else {
            throw new IllegalArgumentException("Value type " + value.getClass() + " is not supported as array type value. It must either be a Collection or an array");
        }
        List<Object> result = new ArrayList<>(collection.size());
        String elementPath = path + "[]";
        ArrayType arrayType = (ArrayType) dataType;
        for (Object obj : collection) {
            Object elementValue = toRowValue(obj, arrayType.elementType(), elementPath);
            if (elementValue == null && !arrayType.containsNull()) {
                throw new IllegalArgumentException("Null value is not allowed for array element at " + elementPath);
            }
            result.add(elementValue);
        }
        return JavaConversions.asScalaBuffer(result).toSeq();
    }
    if (dataType instanceof MapType) {
        @SuppressWarnings("unchecked") Map<Object, Object> map = (Map<Object, Object>) value;
        Map<Object, Object> result = new LinkedHashMap<>(map.size());
        String mapPath = path + "<>";
        MapType mapType = (MapType) dataType;
        for (Map.Entry<?, ?> entry : map.entrySet()) {
            Object mapKey = toRowValue(entry.getKey(), mapType.keyType(), mapPath);
            if (mapKey == null) {
                throw new IllegalArgumentException("Null key is not allowed for map at " + mapPath);
            }
            Object mapValue = toRowValue(entry.getValue(), mapType.valueType(), mapPath);
            if (mapValue == null && !mapType.valueContainsNull()) {
                throw new IllegalArgumentException("Null value is not allowed for map at " + mapPath);
            }
            result.put(mapKey, mapValue);
        }
        return JavaConversions.mapAsScalaMap(result);
    }
    if (dataType instanceof StructType) {
        StructuredRecord record = (StructuredRecord) value;
        StructField[] fields = ((StructType) dataType).fields();
        Object[] fieldValues = new Object[fields.length];
        for (int i = 0; i < fields.length; i++) {
            String fieldName = fields[i].name();
            String fieldPath = path + "/" + fieldName;
            Object fieldValue = toRowValue(record.get(fieldName), fields[i].dataType(), fieldPath);
            if (fieldValue == null && !fields[i].nullable()) {
                throw new IllegalArgumentException("Null value is not allowed for row field at " + fieldPath);
            }
            fieldValues[i] = fieldValue;
        }
        return RowFactory.create(fieldValues);
    }
    // Some special types in Spark SQL
    if (dataType.equals(DataTypes.TimestampType)) {
        return new Timestamp((long) value);
    }
    if (dataType.equals(DataTypes.DateType)) {
        return new Date((long) value);
    }
    // The CalendarInterval type is not supported for now, as there is no equivalent in Schema
    throw new IllegalArgumentException("Unsupported data type: " + dataType.typeName());
}
Also used: StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) ByteBuffer(java.nio.ByteBuffer) Timestamp(java.sql.Timestamp) MapType(org.apache.spark.sql.types.MapType) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) Date(java.sql.Date) LinkedHashMap(java.util.LinkedHashMap) ArrayType(org.apache.spark.sql.types.ArrayType) StructField(org.apache.spark.sql.types.StructField) Collection(java.util.Collection) Map(java.util.Map)
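
toRowValue is likewise private; the sketch below assumes a public DataFrames.toRow(StructuredRecord, StructType) wrapper that applies it to every field (the wrapper name is an assumption, only toRowValue appears above). toDataType is the public converter exercised by the tests below.

import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructType;

public class ToRowSketch {
    public static void main(String[] args) {
        Schema schema = Schema.recordOf("user",
            Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
            Schema.Field.of("age", Schema.nullableOf(Schema.of(Schema.Type.INT))));
        StructuredRecord record = StructuredRecord.builder(schema)
            .set("name", "alice")
            .set("age", 42)
            .build();
        StructType structType = DataFrames.toDataType(schema);
        // Hypothetical public wrapper around the private toRowValue.
        Row row = DataFrames.toRow(record, structType);
        System.out.println(row.get(0)); // alice
        // A null "name" would instead throw IllegalArgumentException, since the
        // field is non-nullable and toRowValue rejects nulls in such positions.
    }
}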

Example 3 with MapType

Use of org.apache.spark.sql.types.MapType in project cdap by caskdata.

From the class DataFramesTest, method testStructType.

@Test
public void testStructType() {
    Schema schema = Schema.recordOf("Record0", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("age", Schema.of(Schema.Type.INT)), Schema.Field.of("profile", Schema.nullableOf(Schema.mapOf(Schema.of(Schema.Type.STRING), Schema.of(Schema.Type.STRING)))));
    StructType dataType = DataFrames.toDataType(schema);
    Assert.assertEquals(DataTypes.StringType, dataType.apply("name").dataType());
    Assert.assertFalse(dataType.apply("name").nullable());
    Assert.assertEquals(DataTypes.IntegerType, dataType.apply("age").dataType());
    Assert.assertFalse(dataType.apply("age").nullable());
    Assert.assertTrue(dataType.apply("profile").dataType() instanceof MapType);
    Assert.assertTrue(dataType.apply("profile").nullable());
    Assert.assertEquals(schema, DataFrames.toSchema(dataType));
}
Also used: StructType(org.apache.spark.sql.types.StructType) Schema(co.cask.cdap.api.data.schema.Schema) MapType(org.apache.spark.sql.types.MapType) Test(org.junit.Test)
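
Written out by hand, the StructType that toDataType is expected to produce for that schema looks like this sketch (reconstructed from the assertions above; Metadata.empty() on every field is an assumption):

private static StructType expectedStructType() {
    // The Spark-side shape asserted by testStructType, spelled out explicitly.
    return new StructType(new StructField[] {
        new StructField("name", DataTypes.StringType, false, Metadata.empty()),
        new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
        // nullableOf(mapOf(...)) becomes a nullable field; the MapType itself keeps
        // valueContainsNull = false because the map's value schema is non-nullable.
        new StructField("profile",
            DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType, false),
            true, Metadata.empty())
    });
}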

Example 4 with MapType

Use of org.apache.spark.sql.types.MapType in project cdap by caskdata.

From the class DataFramesTest, method testMapType.

@Test
public void testMapType() {
    // Simple Map
    Schema schema = Schema.mapOf(Schema.of(Schema.Type.STRING), Schema.of(Schema.Type.BOOLEAN));
    MapType dataType = DataFrames.toDataType(schema);
    Assert.assertFalse(dataType.valueContainsNull());
    Assert.assertEquals(DataTypes.StringType, dataType.keyType());
    Assert.assertEquals(DataTypes.BooleanType, dataType.valueType());
    Assert.assertEquals(schema, DataFrames.toSchema(dataType));
    // Map with nullable value
    schema = Schema.mapOf(Schema.of(Schema.Type.INT), Schema.nullableOf(Schema.of(Schema.Type.STRING)));
    dataType = DataFrames.toDataType(schema);
    Assert.assertTrue(dataType.valueContainsNull());
    Assert.assertEquals(DataTypes.IntegerType, dataType.keyType());
    Assert.assertEquals(DataTypes.StringType, dataType.valueType());
    Assert.assertEquals(schema, DataFrames.toSchema(dataType));
}
Also used: Schema(co.cask.cdap.api.data.schema.Schema) MapType(org.apache.spark.sql.types.MapType) Test(org.junit.Test)
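
The same round trip can also be started from the Spark side. Below is a sketch (not part of the project's test suite) using the public toSchema/toDataType pair from the examples above; getMapSchema() is CDAP's Schema accessor for the key/value schema pair:

@Test
public void testMapTypeRoundTripFromSpark() {
    // Start on the Spark side: map<string, long> with nullable values.
    MapType mapType = DataTypes.createMapType(DataTypes.StringType, DataTypes.LongType, true);
    Schema schema = DataFrames.toSchema(mapType);
    // valueContainsNull = true surfaces as a nullable value schema.
    Assert.assertTrue(schema.getMapSchema().getValue().isNullable());
    // Converting back recovers an equal MapType.
    MapType back = DataFrames.toDataType(schema);
    Assert.assertEquals(mapType, back);
}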

Aggregations

MapType (org.apache.spark.sql.types.MapType): 4 uses
Schema (co.cask.cdap.api.data.schema.Schema): 3 uses
StructType (org.apache.spark.sql.types.StructType): 3 uses
ArrayList (java.util.ArrayList): 2 uses
ArrayType (org.apache.spark.sql.types.ArrayType): 2 uses
StructField (org.apache.spark.sql.types.StructField): 2 uses
Test (org.junit.Test): 2 uses
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 1 use
ByteBuffer (java.nio.ByteBuffer): 1 use
Date (java.sql.Date): 1 use
Timestamp (java.sql.Timestamp): 1 use
Collection (java.util.Collection): 1 use
LinkedHashMap (java.util.LinkedHashMap): 1 use
Map (java.util.Map): 1 use