Search in sources:

Example 1 with BinaryType

Use of org.apache.spark.sql.types.BinaryType in project spark-bigquery-connector by GoogleCloudDataproc.

In class AvroSchemaConverter, method createConverterFor:

static Converter createConverterFor(DataType sparkType, Schema avroType) {
    if (sparkType instanceof NullType && avroType.getType() == Schema.Type.NULL) {
        return (getter, ordinal) -> null;
    }
    if (sparkType instanceof BooleanType && avroType.getType() == Schema.Type.BOOLEAN) {
        return (getter, ordinal) -> getter.getBoolean(ordinal);
    }
    if (sparkType instanceof ByteType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> Long.valueOf(getter.getByte(ordinal));
    }
    if (sparkType instanceof ShortType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> Long.valueOf(getter.getShort(ordinal));
    }
    if (sparkType instanceof IntegerType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> Long.valueOf(getter.getInt(ordinal));
    }
    if (sparkType instanceof LongType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> getter.getLong(ordinal);
    }
    if (sparkType instanceof FloatType && avroType.getType() == Schema.Type.DOUBLE) {
        return (getter, ordinal) -> Double.valueOf(getter.getFloat(ordinal));
    }
    if (sparkType instanceof DoubleType && avroType.getType() == Schema.Type.DOUBLE) {
        return (getter, ordinal) -> getter.getDouble(ordinal);
    }
    if (sparkType instanceof DecimalType && avroType.getType() == Schema.Type.BYTES) {
        DecimalType decimalType = (DecimalType) sparkType;
        return (getter, ordinal) -> {
            Decimal decimal = getter.getDecimal(ordinal, decimalType.precision(), decimalType.scale());
            return DECIMAL_CONVERSIONS.toBytes(decimal.toJavaBigDecimal(), avroType, LogicalTypes.decimal(decimalType.precision(), decimalType.scale()));
        };
    }
    if (sparkType instanceof StringType && avroType.getType() == Schema.Type.STRING) {
        return (getter, ordinal) -> new Utf8(getter.getUTF8String(ordinal).getBytes());
    }
    if (sparkType instanceof BinaryType && avroType.getType() == Schema.Type.FIXED) {
        int size = avroType.getFixedSize();
        return (getter, ordinal) -> {
            byte[] data = getter.getBinary(ordinal);
            if (data.length != size) {
                throw new IllegalArgumentException(String.format("Cannot write %s bytes of binary data into FIXED Type with size of %s bytes", data.length, size));
            }
            return new GenericData.Fixed(avroType, data);
        };
    }
    if (sparkType instanceof BinaryType && avroType.getType() == Schema.Type.BYTES) {
        return (getter, ordinal) -> ByteBuffer.wrap(getter.getBinary(ordinal));
    }
    if (sparkType instanceof DateType && avroType.getType() == Schema.Type.INT) {
        return (getter, ordinal) -> getter.getInt(ordinal);
    }
    if (sparkType instanceof TimestampType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> getter.getLong(ordinal);
    }
    if (sparkType instanceof ArrayType && avroType.getType() == Schema.Type.ARRAY) {
        DataType et = ((ArrayType) sparkType).elementType();
        boolean containsNull = ((ArrayType) sparkType).containsNull();
        Converter elementConverter = createConverterFor(et, resolveNullableType(avroType.getElementType(), containsNull));
        return (getter, ordinal) -> {
            ArrayData arrayData = getter.getArray(ordinal);
            int len = arrayData.numElements();
            Object[] result = new Object[len];
            for (int i = 0; i < len; i++) {
                if (containsNull && arrayData.isNullAt(i)) {
                    result[i] = null;
                } else {
                    result[i] = elementConverter.convert(arrayData, i);
                }
            }
            // The Avro writer expects a Java Collection; Arrays.asList returns a
            // fixed-size List view backed by the array, without copying the data.
            return java.util.Arrays.asList(result);
        };
    }
    if (sparkType instanceof StructType && avroType.getType() == Schema.Type.RECORD) {
        StructType sparkStruct = (StructType) sparkType;
        StructConverter structConverter = new StructConverter(sparkStruct, avroType);
        int numFields = sparkStruct.length();
        return (getter, ordinal) -> structConverter.convert(getter.getStruct(ordinal, numFields));
    }
    if (sparkType instanceof UserDefinedType) {
        UserDefinedType userDefinedType = (UserDefinedType) sparkType;
        return createConverterFor(userDefinedType.sqlType(), avroType);
    }
    throw new IllegalArgumentException(String.format("Cannot convert Catalyst type %s to Avro type %s", sparkType, avroType));
}
Also used: BinaryType(org.apache.spark.sql.types.BinaryType) DataType(org.apache.spark.sql.types.DataType) Decimal(org.apache.spark.sql.types.Decimal) InternalRow(org.apache.spark.sql.catalyst.InternalRow) FloatType(org.apache.spark.sql.types.FloatType) DecimalType(org.apache.spark.sql.types.DecimalType) ByteBuffer(java.nio.ByteBuffer) GenericData(org.apache.avro.generic.GenericData) ArrayType(org.apache.spark.sql.types.ArrayType) ByteType(org.apache.spark.sql.types.ByteType) LogicalTypes(org.apache.avro.LogicalTypes) ArrayData(org.apache.spark.sql.catalyst.util.ArrayData) SpecializedGetters(org.apache.spark.sql.catalyst.expressions.SpecializedGetters) DoubleType(org.apache.spark.sql.types.DoubleType) Conversions(org.apache.avro.Conversions) NullType(org.apache.spark.sql.types.NullType) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) Utf8(org.apache.avro.util.Utf8) Schema(org.apache.avro.Schema) UserDefinedType(org.apache.spark.sql.types.UserDefinedType) IntegerType(org.apache.spark.sql.types.IntegerType) StringType(org.apache.spark.sql.types.StringType) LongType(org.apache.spark.sql.types.LongType) TimestampType(org.apache.spark.sql.types.TimestampType) ShortType(org.apache.spark.sql.types.ShortType) SchemaBuilder(org.apache.avro.SchemaBuilder) List(java.util.List) Optional(java.util.Optional) Preconditions(com.google.common.base.Preconditions) BooleanType(org.apache.spark.sql.types.BooleanType) DateType(org.apache.spark.sql.types.DateType) MapType(org.apache.spark.sql.types.MapType)
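
Below is a minimal usage sketch, not part of the connector source. It assumes package-level access to createConverterFor and to the Converter functional interface used above (its convert(SpecializedGetters, int) shape is visible in the array branch), and wires a single BinaryType column to an Avro BYTES schema; the row contents and class name are illustrative only.

import java.nio.ByteBuffer;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.types.DataTypes;

public class BinaryConverterSketch {
    public static void main(String[] args) {
        // Raw Avro BYTES schema, matching the BinaryType -> BYTES branch above.
        Schema bytesSchema = SchemaBuilder.builder().bytesType();
        // GenericInternalRow implements SpecializedGetters, so the converter can
        // read from it directly; column 0 holds the binary payload.
        InternalRow row = new GenericInternalRow(new Object[] {new byte[] {1, 2, 3}});
        Object avroValue = AvroSchemaConverter
                .createConverterFor(DataTypes.BinaryType, bytesSchema)
                .convert(row, 0);
        // The BYTES branch wraps the byte[] in a ByteBuffer without copying.
        System.out.println(((ByteBuffer) avroValue).remaining()); // prints 3
    }
}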

Example 2 with BinaryType

Use of org.apache.spark.sql.types.BinaryType in project spark-bigquery-connector by GoogleCloudDataproc.

In class AvroSchemaConverter, method sparkTypeToRawAvroType:

static Schema sparkTypeToRawAvroType(DataType dataType, String recordName, SchemaBuilder.TypeBuilder<Schema> builder) {
    if (dataType instanceof BinaryType) {
        return builder.bytesType();
    }
    if (dataType instanceof ByteType || dataType instanceof ShortType || dataType instanceof IntegerType || dataType instanceof LongType) {
        return builder.longType();
    }
    if (dataType instanceof BooleanType) {
        return builder.booleanType();
    }
    if (dataType instanceof FloatType || dataType instanceof DoubleType) {
        return builder.doubleType();
    }
    if (dataType instanceof DecimalType) {
        DecimalType decimalType = (DecimalType) dataType;
        if (decimalType.precision() <= SchemaConverters.BQ_NUMERIC_PRECISION && decimalType.scale() <= SchemaConverters.BQ_NUMERIC_SCALE) {
            return LogicalTypes.decimal(decimalType.precision(), decimalType.scale()).addToSchema(builder.bytesType());
        } else {
            throw new IllegalArgumentException("Decimal type is too wide to fit in BigQuery Numeric format");
        }
    }
    if (dataType instanceof StringType) {
        return builder.stringType();
    }
    if (dataType instanceof TimestampType) {
        // BigQuery timestamps carry microsecond precision, so emit the
        // timestamp-micros logical type on top of a long.
        return LogicalTypes.timestampMicros().addToSchema(builder.longType());
    }
    if (dataType instanceof DateType) {
        return LogicalTypes.date().addToSchema(builder.intType());
    }
    if (dataType instanceof ArrayType) {
        return builder.array().items(sparkTypeToRawAvroType(((ArrayType) dataType).elementType(), ((ArrayType) dataType).containsNull(), recordName));
    }
    if (dataType instanceof StructType) {
        SchemaBuilder.FieldAssembler<Schema> fieldsAssembler = builder.record(recordName).fields();
        for (StructField field : ((StructType) dataType).fields()) {
            Schema avroType = sparkTypeToRawAvroType(field.dataType(), field.nullable(), field.name());
            fieldsAssembler.name(field.name()).type(avroType).noDefault();
        }
        return fieldsAssembler.endRecord();
    }
    if (dataType instanceof UserDefinedType) {
        DataType userDefinedType = ((UserDefinedType) dataType).sqlType();
        return sparkTypeToRawAvroType(userDefinedType, recordName, builder);
    }
    if (dataType instanceof MapType) {
        throw new IllegalArgumentException(SchemaConverters.MAPTYPE_ERROR_MESSAGE);
    } else {
        throw new IllegalArgumentException("Data type not supported: " + dataType.simpleString());
    }
}
Also used : BinaryType(org.apache.spark.sql.types.BinaryType) LongType(org.apache.spark.sql.types.LongType) StructType(org.apache.spark.sql.types.StructType) StringType(org.apache.spark.sql.types.StringType) ShortType(org.apache.spark.sql.types.ShortType) Schema(org.apache.avro.Schema) BooleanType(org.apache.spark.sql.types.BooleanType) UserDefinedType(org.apache.spark.sql.types.UserDefinedType) ByteType(org.apache.spark.sql.types.ByteType) MapType(org.apache.spark.sql.types.MapType) FloatType(org.apache.spark.sql.types.FloatType) IntegerType(org.apache.spark.sql.types.IntegerType) ArrayType(org.apache.spark.sql.types.ArrayType) StructField(org.apache.spark.sql.types.StructField) DoubleType(org.apache.spark.sql.types.DoubleType) SchemaBuilder(org.apache.avro.SchemaBuilder) DecimalType(org.apache.spark.sql.types.DecimalType) TimestampType(org.apache.spark.sql.types.TimestampType) DataType(org.apache.spark.sql.types.DataType) DateType(org.apache.spark.sql.types.DateType)
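
A companion sketch, again assuming package-level access to sparkTypeToRawAvroType: it converts a small, hypothetical Spark struct to its raw Avro schema by handing the method a fresh SchemaBuilder.TypeBuilder. The field names and record name are illustrative only.

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class SparkToAvroSchemaSketch {
    public static void main(String[] args) {
        StructType sparkStruct = new StructType(new StructField[] {
            new StructField("id", DataTypes.LongType, false, Metadata.empty()),
            new StructField("payload", DataTypes.BinaryType, false, Metadata.empty())
        });
        // SchemaBuilder.builder() supplies the TypeBuilder<Schema> the method expects.
        Schema avroSchema = AvroSchemaConverter.sparkTypeToRawAvroType(
                sparkStruct, "topLevelRecord", SchemaBuilder.builder());
        // Per the branches above: LongType -> "long", BinaryType -> "bytes";
        // per-field nullability is handled by the overload taking field.nullable().
        System.out.println(avroSchema.toString(true));
    }
}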

Aggregations

Schema (org.apache.avro.Schema)2 SchemaBuilder (org.apache.avro.SchemaBuilder)2 ArrayType (org.apache.spark.sql.types.ArrayType)2 BinaryType (org.apache.spark.sql.types.BinaryType)2 BooleanType (org.apache.spark.sql.types.BooleanType)2 ByteType (org.apache.spark.sql.types.ByteType)2 DataType (org.apache.spark.sql.types.DataType)2 DateType (org.apache.spark.sql.types.DateType)2 DecimalType (org.apache.spark.sql.types.DecimalType)2 DoubleType (org.apache.spark.sql.types.DoubleType)2 FloatType (org.apache.spark.sql.types.FloatType)2 IntegerType (org.apache.spark.sql.types.IntegerType)2 LongType (org.apache.spark.sql.types.LongType)2 MapType (org.apache.spark.sql.types.MapType)2 ShortType (org.apache.spark.sql.types.ShortType)2 StringType (org.apache.spark.sql.types.StringType)2 StructField (org.apache.spark.sql.types.StructField)2 StructType (org.apache.spark.sql.types.StructType)2 TimestampType (org.apache.spark.sql.types.TimestampType)2 UserDefinedType (org.apache.spark.sql.types.UserDefinedType)2