
Example 1 with IntegerType$

Use of org.apache.spark.sql.types.IntegerType$ in project iceberg by apache.

From the class Spark3Util, the findWidth method:

@SuppressWarnings("unchecked")
private static int findWidth(Transform transform) {
    for (Expression expr : transform.arguments()) {
        if (expr instanceof Literal) {
            if (((Literal) expr).dataType() instanceof IntegerType) {
                Literal<Integer> lit = (Literal<Integer>) expr;
                Preconditions.checkArgument(lit.value() > 0, "Unsupported width for transform: %s", transform.describe());
                return lit.value();
            } else if (((Literal) expr).dataType() instanceof LongType) {
                Literal<Long> lit = (Literal<Long>) expr;
                Preconditions.checkArgument(lit.value() > 0 && lit.value() < Integer.MAX_VALUE, "Unsupported width for transform: %s", transform.describe());
                if (lit.value() > Integer.MAX_VALUE) {
                    throw new IllegalArgumentException();
                }
                return lit.value().intValue();
            }
        }
    }
    throw new IllegalArgumentException("Cannot find width for transform: " + transform.describe());
}
Also used : IntegerType(org.apache.spark.sql.types.IntegerType) LongType(org.apache.spark.sql.types.LongType) Expression(org.apache.spark.sql.connector.expressions.Expression) Literal(org.apache.spark.sql.connector.expressions.Literal)
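
For context, here is a minimal, self-contained sketch (not part of the Iceberg source; class and column names are illustrative) of the kind of Transform that findWidth walks: a bucket transform built with Spark's public Expressions factory carries its width as an integer literal whose dataType() is the IntegerType singleton.

import org.apache.spark.sql.connector.expressions.Expression;
import org.apache.spark.sql.connector.expressions.Expressions;
import org.apache.spark.sql.connector.expressions.Literal;
import org.apache.spark.sql.connector.expressions.Transform;
import org.apache.spark.sql.types.IntegerType;

public class FindWidthSketch {
    public static void main(String[] args) {
        // bucket(16, "id") yields a Transform whose arguments() contain the
        // integer literal 16 next to the column reference "id".
        Transform bucket = Expressions.bucket(16, "id");
        for (Expression expr : bucket.arguments()) {
            if (expr instanceof Literal && ((Literal<?>) expr).dataType() instanceof IntegerType) {
                // prints: width = 16
                System.out.println("width = " + ((Literal<?>) expr).value());
            }
        }
    }
}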

Example 2 with IntegerType$

Use of org.apache.spark.sql.types.IntegerType$ in project OpenLineage by OpenLineage.

From the class SparkReadWriteIntegTest, the testWithLogicalRdd method:

@Test
public void testWithLogicalRdd(@TempDir Path tmpDir, SparkSession spark) throws InterruptedException, TimeoutException {
    StructType schema = new StructType(new StructField[] { new StructField("anInt", IntegerType$.MODULE$, false, new Metadata(new HashMap<>())), new StructField("aString", StringType$.MODULE$, false, new Metadata(new HashMap<>())) });
    String csvPath = tmpDir.toAbsolutePath() + "/csv_data";
    String csvUri = "file://" + csvPath;
    spark.createDataFrame(Arrays.asList(new GenericRow(new Object[] { 1, "seven" }), new GenericRow(new Object[] { 6, "one" }), new GenericRow(new Object[] { 72, "fourteen" }), new GenericRow(new Object[] { 99, "sixteen" })), schema).write().csv(csvUri);
    StaticExecutionContextFactory.waitForExecutionEnd();
    // reset to start counting now
    reset(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT);
    when(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT.getJobNamespace()).thenReturn("theNamespace");
    when(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT.getParentJobName()).thenReturn("theParentJob");
    when(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT.getParentRunId()).thenReturn(Optional.of(UUID.randomUUID()));
    JobConf conf = new JobConf();
    FileInputFormat.addInputPath(conf, new org.apache.hadoop.fs.Path(csvUri));
    JavaRDD<Tuple2<LongWritable, Text>> csvRdd = spark.sparkContext().hadoopRDD(conf, TextInputFormat.class, LongWritable.class, Text.class, 1).toJavaRDD();
    JavaRDD<Row> splitDf = csvRdd.map(t -> new String(t._2.getBytes()).split(",")).map(arr -> new GenericRow(new Object[] { Integer.parseInt(arr[0]), arr[1] }));
    Dataset<Row> df = spark.createDataFrame(splitDf, schema);
    String outputPath = tmpDir.toAbsolutePath() + "/output_data";
    String jsonPath = "file://" + outputPath;
    df.write().json(jsonPath);
    // wait for event processing to complete
    StaticExecutionContextFactory.waitForExecutionEnd();
    ArgumentCaptor<OpenLineage.RunEvent> lineageEvent = ArgumentCaptor.forClass(OpenLineage.RunEvent.class);
    Mockito.verify(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT, times(4)).emit(lineageEvent.capture());
    OpenLineage.RunEvent completeEvent = lineageEvent.getAllValues().get(2);
    assertThat(completeEvent).hasFieldOrPropertyWithValue("eventType", RunEvent.EventType.COMPLETE);
    assertThat(completeEvent.getInputs()).singleElement().hasFieldOrPropertyWithValue("name", csvPath).hasFieldOrPropertyWithValue("namespace", "file");
    assertThat(completeEvent.getOutputs()).singleElement().hasFieldOrPropertyWithValue("name", outputPath).hasFieldOrPropertyWithValue("namespace", "file");
}
Also used : OpenLineageClient(io.openlineage.spark.agent.client.OpenLineageClient) Provides(com.google.cloud.spark.bigquery.repackaged.com.google.inject.Provides) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) DockerImageName(org.testcontainers.utility.DockerImageName) ArgumentMatchers.eq(org.mockito.ArgumentMatchers.eq) InstanceOfAssertFactories(org.assertj.core.api.InstanceOfAssertFactories) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) InputDataset(io.openlineage.client.OpenLineage.InputDataset) TimeoutException(java.util.concurrent.TimeoutException) Text(org.apache.hadoop.io.Text) Random(java.util.Random) StandardTableDefinition(com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.StandardTableDefinition) LongWritable(org.apache.hadoop.io.LongWritable) ExtendWith(org.junit.jupiter.api.extension.ExtendWith) OutputDataset(io.openlineage.client.OpenLineage.OutputDataset) JsonGenerator(org.codehaus.jackson.JsonGenerator) Field(com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.Field) StringSerializer(org.apache.kafka.common.serialization.StringSerializer) Tag(org.junit.jupiter.api.Tag) Module(com.google.cloud.spark.bigquery.repackaged.com.google.inject.Module) Path(java.nio.file.Path) RunEvent(io.openlineage.client.OpenLineage.RunEvent) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) KafkaContainer(org.testcontainers.containers.KafkaContainer) IntegerType$(org.apache.spark.sql.types.IntegerType$) ImmutableMap(com.google.common.collect.ImmutableMap) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) UUID(java.util.UUID) Tuple2(scala.Tuple2) StandardCharsets(java.nio.charset.StandardCharsets) PlanUtils(io.openlineage.spark.agent.util.PlanUtils) Test(org.junit.jupiter.api.Test) Schema(com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.Schema) List(java.util.List) VerificationModeFactory.times(org.mockito.internal.verification.VerificationModeFactory.times) TempDir(org.junit.jupiter.api.io.TempDir) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) ObjectAssert(org.assertj.core.api.ObjectAssert) Optional(java.util.Optional) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) ProducerRecord(org.apache.kafka.clients.producer.ProducerRecord) Dataset(org.apache.spark.sql.Dataset) TableId(com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.TableId) Binder(com.google.cloud.spark.bigquery.repackaged.com.google.inject.Binder) CompletableFuture(java.util.concurrent.CompletableFuture) LongType$(org.apache.spark.sql.types.LongType$) SparkAgentTestExtension(io.openlineage.spark.agent.SparkAgentTestExtension) KafkaProducer(org.apache.kafka.clients.producer.KafkaProducer) ArgumentCaptor(org.mockito.ArgumentCaptor) BigQueryUtil(com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.connector.common.BigQueryUtil) DefaultRunFacet(io.openlineage.client.OpenLineage.DefaultRunFacet) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) TestOpenLineageEventHandlerFactory(io.openlineage.spark.agent.util.TestOpenLineageEventHandlerFactory) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession) Metadata(org.apache.spark.sql.types.Metadata) StringType$(org.apache.spark.sql.types.StringType$) BigQuery(com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQuery) 
Properties(java.util.Properties) GenericRow(org.apache.spark.sql.catalyst.expressions.GenericRow) StandardSQLTypeName(com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.StandardSQLTypeName) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) Mockito.when(org.mockito.Mockito.when) Row(org.apache.spark.sql.Row) ExecutionException(java.util.concurrent.ExecutionException) TimeUnit(java.util.concurrent.TimeUnit) JobConf(org.apache.hadoop.mapred.JobConf) Mockito(org.mockito.Mockito) AfterEach(org.junit.jupiter.api.AfterEach) GenericRowWithSchema(org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema) BinaryType$(org.apache.spark.sql.types.BinaryType$) MockBigQueryRelationProvider(com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.MockBigQueryRelationProvider) HashMap(scala.collection.immutable.HashMap) Mockito.reset(org.mockito.Mockito.reset) OpenLineage(io.openlineage.client.OpenLineage) ObjectMapper(org.codehaus.jackson.map.ObjectMapper)
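
IntegerType$ is the Scala companion-object class behind Spark SQL's IntegerType singleton, so Java code reaches the instance as IntegerType$.MODULE$; DataTypes.IntegerType exposes the same singleton through a Java-friendly constant. A minimal sketch (not part of the OpenLineage tests) of the schema used above, built with the DataTypes helpers:

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class SchemaSketch {
    public static void main(String[] args) {
        // DataTypes.IntegerType and DataTypes.StringType are the same objects
        // as IntegerType$.MODULE$ and StringType$.MODULE$.
        StructType schema = new StructType(new StructField[] {
            new StructField("anInt", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("aString", DataTypes.StringType, false, Metadata.empty())
        });
        System.out.println(schema.treeString());
    }
}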

Example 3 with IntegerType$

Use of org.apache.spark.sql.types.IntegerType$ in project OpenLineage by OpenLineage.

From the class LogicalRDDVisitorTest, the testApply method:

@Test
public void testApply(@TempDir Path tmpDir) {
    SparkSession session = SparkSession.builder().master("local").getOrCreate();
    LogicalRDDVisitor visitor = new LogicalRDDVisitor(SparkAgentTestExtension.newContext(session), DatasetFactory.output(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI)));
    StructType schema = new StructType(new StructField[] { new StructField("anInt", IntegerType$.MODULE$, false, new Metadata(new HashMap<>())), new StructField("aString", StringType$.MODULE$, false, new Metadata(new HashMap<>())) });
    jobConf = new JobConf();
    FileInputFormat.addInputPath(jobConf, new org.apache.hadoop.fs.Path("file://" + tmpDir));
    RDD<InternalRow> hadoopRdd = new HadoopRDD<>(session.sparkContext(), jobConf, TextInputFormat.class, LongWritable.class, Text.class, 1).toJavaRDD().map(t -> (InternalRow) new GenericInternalRow(new Object[] { t._2.toString() })).rdd();
    LogicalRDD logicalRDD = new LogicalRDD(ScalaConversionUtils.fromSeq(schema.toAttributes()).stream().map(AttributeReference::toAttribute).collect(ScalaConversionUtils.toSeq()), hadoopRdd, SinglePartition$.MODULE$, Seq$.MODULE$.<SortOrder>empty(), false, session);
    assertThat(visitor.isDefinedAt(logicalRDD)).isTrue();
    List<OpenLineage.Dataset> datasets = visitor.apply(logicalRDD);
    assertThat(datasets).singleElement().hasFieldOrPropertyWithValue("name", tmpDir.toString()).hasFieldOrPropertyWithValue("namespace", "file");
}
Also used : OpenLineageClient(io.openlineage.spark.agent.client.OpenLineageClient) Seq$(scala.collection.Seq$) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) InternalRow(org.apache.spark.sql.catalyst.InternalRow) SinglePartition$(org.apache.spark.sql.catalyst.plans.physical.SinglePartition$) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) AttributeReference(org.apache.spark.sql.catalyst.expressions.AttributeReference) SparkAgentTestExtension(io.openlineage.spark.agent.SparkAgentTestExtension) ExtendWith(org.junit.jupiter.api.extension.ExtendWith) HadoopRDD(org.apache.spark.rdd.HadoopRDD) Path(java.nio.file.Path) SparkSession(org.apache.spark.sql.SparkSession) Metadata(org.apache.spark.sql.types.Metadata) StringType$(org.apache.spark.sql.types.StringType$) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) IntegerType$(org.apache.spark.sql.types.IntegerType$) SparkSession$(org.apache.spark.sql.SparkSession$) DatasetFactory(io.openlineage.spark.api.DatasetFactory) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) ScalaConversionUtils(io.openlineage.spark.agent.util.ScalaConversionUtils) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.jupiter.api.Test) List(java.util.List) AfterEach(org.junit.jupiter.api.AfterEach) SortOrder(org.apache.spark.sql.catalyst.expressions.SortOrder) TempDir(org.junit.jupiter.api.io.TempDir) LogicalRDD(org.apache.spark.sql.execution.LogicalRDD) HashMap(scala.collection.immutable.HashMap) OpenLineage(io.openlineage.client.OpenLineage) RDD(org.apache.spark.rdd.RDD)
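
The test above assembles a LogicalRDD node by hand; in ordinary use the same node appears whenever a DataFrame is created from an existing RDD. A minimal sketch (illustrative only, not from the OpenLineage code) that produces such a plan and prints it:

import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class LogicalRddSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local").appName("sketch").getOrCreate();
        StructType schema = new StructType(new StructField[] {
            new StructField("anInt", DataTypes.IntegerType, false, Metadata.empty())
        });
        JavaRDD<Row> rdd = new JavaSparkContext(spark.sparkContext())
            .parallelize(Arrays.asList(RowFactory.create(1), RowFactory.create(2)));
        Dataset<Row> df = spark.createDataFrame(rdd, schema);
        // the analyzed plan is rooted at a LogicalRDD node, which is what
        // LogicalRDDVisitor.isDefinedAt matches on
        System.out.println(df.queryExecution().analyzed());
        spark.stop();
    }
}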

Example 4 with IntegerType$

Use of org.apache.spark.sql.types.IntegerType$ in project spark-bigquery-connector by GoogleCloudDataproc.

From the class AvroSchemaConverter, the createConverterFor method:

static Converter createConverterFor(DataType sparkType, Schema avroType) {
    if (sparkType instanceof NullType && avroType.getType() == Schema.Type.NULL) {
        return (getter, ordinal) -> null;
    }
    if (sparkType instanceof BooleanType && avroType.getType() == Schema.Type.BOOLEAN) {
        return (getter, ordinal) -> getter.getBoolean(ordinal);
    }
    if (sparkType instanceof ByteType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> Long.valueOf(getter.getByte(ordinal));
    }
    if (sparkType instanceof ShortType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> Long.valueOf(getter.getShort(ordinal));
    }
    if (sparkType instanceof IntegerType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> Long.valueOf(getter.getInt(ordinal));
    }
    if (sparkType instanceof LongType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> getter.getLong(ordinal);
    }
    if (sparkType instanceof FloatType && avroType.getType() == Schema.Type.DOUBLE) {
        return (getter, ordinal) -> Double.valueOf(getter.getFloat(ordinal));
    }
    if (sparkType instanceof DoubleType && avroType.getType() == Schema.Type.DOUBLE) {
        return (getter, ordinal) -> getter.getDouble(ordinal);
    }
    if (sparkType instanceof DecimalType && avroType.getType() == Schema.Type.BYTES) {
        DecimalType decimalType = (DecimalType) sparkType;
        return (getter, ordinal) -> {
            Decimal decimal = getter.getDecimal(ordinal, decimalType.precision(), decimalType.scale());
            return DECIMAL_CONVERSIONS.toBytes(decimal.toJavaBigDecimal(), avroType, LogicalTypes.decimal(decimalType.precision(), decimalType.scale()));
        };
    }
    if (sparkType instanceof StringType && avroType.getType() == Schema.Type.STRING) {
        return (getter, ordinal) -> new Utf8(getter.getUTF8String(ordinal).getBytes());
    }
    if (sparkType instanceof BinaryType && avroType.getType() == Schema.Type.FIXED) {
        int size = avroType.getFixedSize();
        return (getter, ordinal) -> {
            byte[] data = getter.getBinary(ordinal);
            if (data.length != size) {
                throw new IllegalArgumentException(String.format("Cannot write %s bytes of binary data into FIXED Type with size of %s bytes", data.length, size));
            }
            return new GenericData.Fixed(avroType, data);
        };
    }
    if (sparkType instanceof BinaryType && avroType.getType() == Schema.Type.BYTES) {
        return (getter, ordinal) -> ByteBuffer.wrap(getter.getBinary(ordinal));
    }
    if (sparkType instanceof DateType && avroType.getType() == Schema.Type.INT) {
        return (getter, ordinal) -> getter.getInt(ordinal);
    }
    if (sparkType instanceof TimestampType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> getter.getLong(ordinal);
    }
    if (sparkType instanceof ArrayType && avroType.getType() == Schema.Type.ARRAY) {
        DataType et = ((ArrayType) sparkType).elementType();
        boolean containsNull = ((ArrayType) sparkType).containsNull();
        Converter elementConverter = createConverterFor(et, resolveNullableType(avroType.getElementType(), containsNull));
        return (getter, ordinal) -> {
            ArrayData arrayData = getter.getArray(ordinal);
            int len = arrayData.numElements();
            Object[] result = new Object[len];
            for (int i = 0; i < len; i++) {
                if (containsNull && arrayData.isNullAt(i)) {
                    result[i] = null;
                } else {
                    result[i] = elementConverter.convert(arrayData, i);
                }
            }
            // the Avro writer expects a Java Collection, so return a list view backed by the array without copying the data
            return java.util.Arrays.asList(result);
        };
    }
    if (sparkType instanceof StructType && avroType.getType() == Schema.Type.RECORD) {
        StructType sparkStruct = (StructType) sparkType;
        StructConverter structConverter = new StructConverter(sparkStruct, avroType);
        int numFields = sparkStruct.length();
        return (getter, ordinal) -> structConverter.convert(getter.getStruct(ordinal, numFields));
    }
    if (sparkType instanceof UserDefinedType) {
        UserDefinedType userDefinedType = (UserDefinedType) sparkType;
        return createConverterFor(userDefinedType.sqlType(), avroType);
    }
    throw new IllegalArgumentException(String.format("Cannot convert Catalyst type %s to Avro type %s", sparkType, avroType));
}
Also used : BinaryType(org.apache.spark.sql.types.BinaryType) DataType(org.apache.spark.sql.types.DataType) Decimal(org.apache.spark.sql.types.Decimal) InternalRow(org.apache.spark.sql.catalyst.InternalRow) FloatType(org.apache.spark.sql.types.FloatType) DecimalType(org.apache.spark.sql.types.DecimalType) ByteBuffer(java.nio.ByteBuffer) GenericData(org.apache.avro.generic.GenericData) ArrayType(org.apache.spark.sql.types.ArrayType) ByteType(org.apache.spark.sql.types.ByteType) LogicalTypes(org.apache.avro.LogicalTypes) ArrayData(org.apache.spark.sql.catalyst.util.ArrayData) SpecializedGetters(org.apache.spark.sql.catalyst.expressions.SpecializedGetters) DoubleType(org.apache.spark.sql.types.DoubleType) Conversions(org.apache.avro.Conversions) NullType(org.apache.spark.sql.types.NullType) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) Utf8(org.apache.avro.util.Utf8) Schema(org.apache.avro.Schema) UserDefinedType(org.apache.spark.sql.types.UserDefinedType) IntegerType(org.apache.spark.sql.types.IntegerType) StringType(org.apache.spark.sql.types.StringType) LongType(org.apache.spark.sql.types.LongType) TimestampType(org.apache.spark.sql.types.TimestampType) ShortType(org.apache.spark.sql.types.ShortType) SchemaBuilder(org.apache.avro.SchemaBuilder) List(java.util.List) Optional(java.util.Optional) Preconditions(com.google.common.base.Preconditions) BooleanType(org.apache.spark.sql.types.BooleanType) DateType(org.apache.spark.sql.types.DateType) MapType(org.apache.spark.sql.types.MapType)
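
Note that every integral Spark type, IntegerType included, is widened to an Avro long here because BigQuery stores integers as 64-bit INT64. A minimal standalone sketch of that mapping (not connector code; record and field names are illustrative) using Avro's SchemaBuilder:

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

public class WideningSketch {
    public static void main(String[] args) {
        // Spark ByteType, ShortType, IntegerType and LongType all become a
        // single Avro long field, matching BigQuery's INT64 column type.
        Schema record = SchemaBuilder.record("row").fields()
            .name("anInt").type().longType().noDefault()
            .name("aLong").type().longType().noDefault()
            .endRecord();
        System.out.println(record.toString(true));
    }
}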

Example 5 with IntegerType$

Use of org.apache.spark.sql.types.IntegerType$ in project spark-bigquery-connector by GoogleCloudDataproc.

From the class AvroSchemaConverter, the sparkTypeToRawAvroType method:

static Schema sparkTypeToRawAvroType(DataType dataType, String recordName, SchemaBuilder.TypeBuilder<Schema> builder) {
    if (dataType instanceof BinaryType) {
        return builder.bytesType();
    }
    if (dataType instanceof ByteType || dataType instanceof ShortType || dataType instanceof IntegerType || dataType instanceof LongType) {
        return builder.longType();
    }
    if (dataType instanceof BooleanType) {
        return builder.booleanType();
    }
    if (dataType instanceof FloatType || dataType instanceof DoubleType) {
        return builder.doubleType();
    }
    if (dataType instanceof DecimalType) {
        DecimalType decimalType = (DecimalType) dataType;
        if (decimalType.precision() <= SchemaConverters.BQ_NUMERIC_PRECISION && decimalType.scale() <= SchemaConverters.BQ_NUMERIC_SCALE) {
            return LogicalTypes.decimal(decimalType.precision(), decimalType.scale()).addToSchema(builder.bytesType());
        } else {
            throw new IllegalArgumentException("Decimal type is too wide to fit in BigQuery Numeric format");
        }
    }
    if (dataType instanceof StringType) {
        return builder.stringType();
    }
    if (dataType instanceof TimestampType) {
        // Spark timestamps carry microsecond precision, mapped to Avro timestamp-micros
        return LogicalTypes.timestampMicros().addToSchema(builder.longType());
    }
    if (dataType instanceof DateType) {
        return LogicalTypes.date().addToSchema(builder.intType());
    }
    if (dataType instanceof ArrayType) {
        return builder.array().items(sparkTypeToRawAvroType(((ArrayType) dataType).elementType(), ((ArrayType) dataType).containsNull(), recordName));
    }
    if (dataType instanceof StructType) {
        SchemaBuilder.FieldAssembler<Schema> fieldsAssembler = builder.record(recordName).fields();
        for (StructField field : ((StructType) dataType).fields()) {
            Schema avroType = sparkTypeToRawAvroType(field.dataType(), field.nullable(), field.name());
            fieldsAssembler.name(field.name()).type(avroType).noDefault();
        }
        return fieldsAssembler.endRecord();
    }
    if (dataType instanceof UserDefinedType) {
        DataType userDefinedType = ((UserDefinedType) dataType).sqlType();
        return sparkTypeToRawAvroType(userDefinedType, recordName, builder);
    }
    if (dataType instanceof MapType) {
        throw new IllegalArgumentException(SchemaConverters.MAPTYPE_ERROR_MESSAGE);
    } else {
        throw new IllegalArgumentException("Data type not supported: " + dataType.simpleString());
    }
}
Also used : BinaryType(org.apache.spark.sql.types.BinaryType) LongType(org.apache.spark.sql.types.LongType) StructType(org.apache.spark.sql.types.StructType) StringType(org.apache.spark.sql.types.StringType) ShortType(org.apache.spark.sql.types.ShortType) Schema(org.apache.avro.Schema) BooleanType(org.apache.spark.sql.types.BooleanType) UserDefinedType(org.apache.spark.sql.types.UserDefinedType) ByteType(org.apache.spark.sql.types.ByteType) MapType(org.apache.spark.sql.types.MapType) FloatType(org.apache.spark.sql.types.FloatType) IntegerType(org.apache.spark.sql.types.IntegerType) ArrayType(org.apache.spark.sql.types.ArrayType) StructField(org.apache.spark.sql.types.StructField) DoubleType(org.apache.spark.sql.types.DoubleType) SchemaBuilder(org.apache.avro.SchemaBuilder) DecimalType(org.apache.spark.sql.types.DecimalType) TimestampType(org.apache.spark.sql.types.TimestampType) DataType(org.apache.spark.sql.types.DataType) DateType(org.apache.spark.sql.types.DateType)
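
The DecimalType branch only accepts decimals that fit BigQuery's NUMERIC type (precision 38, scale 9). A minimal sketch of that branch in isolation; the constants below are local stand-ins for SchemaConverters.BQ_NUMERIC_PRECISION and SchemaConverters.BQ_NUMERIC_SCALE, and the method name is illustrative:

import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

public class DecimalSketch {
    // assumed bounds of BigQuery NUMERIC; in the connector these come from SchemaConverters
    private static final int BQ_NUMERIC_PRECISION = 38;
    private static final int BQ_NUMERIC_SCALE = 9;

    static Schema decimalAvroType(int precision, int scale) {
        if (precision <= BQ_NUMERIC_PRECISION && scale <= BQ_NUMERIC_SCALE) {
            // Avro bytes carrying a decimal logical type with the given precision and scale
            return LogicalTypes.decimal(precision, scale).addToSchema(SchemaBuilder.builder().bytesType());
        }
        throw new IllegalArgumentException("Decimal type is too wide to fit in BigQuery Numeric format");
    }

    public static void main(String[] args) {
        System.out.println(decimalAvroType(10, 2));
    }
}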

Aggregations

StructType (org.apache.spark.sql.types.StructType) 5
IntegerType (org.apache.spark.sql.types.IntegerType) 4
LongType (org.apache.spark.sql.types.LongType) 4
StructField (org.apache.spark.sql.types.StructField) 4
List (java.util.List) 3
ArrayType (org.apache.spark.sql.types.ArrayType) 3
BooleanType (org.apache.spark.sql.types.BooleanType) 3
DateType (org.apache.spark.sql.types.DateType) 3
DecimalType (org.apache.spark.sql.types.DecimalType) 3
DoubleType (org.apache.spark.sql.types.DoubleType) 3
FloatType (org.apache.spark.sql.types.FloatType) 3
ShortType (org.apache.spark.sql.types.ShortType) 3
StringType (org.apache.spark.sql.types.StringType) 3
TimestampType (org.apache.spark.sql.types.TimestampType) 3
OpenLineage (io.openlineage.client.OpenLineage) 2
SparkAgentTestExtension (io.openlineage.spark.agent.SparkAgentTestExtension) 2
OpenLineageClient (io.openlineage.spark.agent.client.OpenLineageClient) 2
Path (java.nio.file.Path) 2
Optional (java.util.Optional) 2
Schema (org.apache.avro.Schema) 2