
Example 1 with StringType$

Use of org.apache.spark.sql.types.StringType$ in project OpenLineage by OpenLineage.

From the class SparkReadWriteIntegTest, method testWithLogicalRdd.

@Test
public void testWithLogicalRdd(@TempDir Path tmpDir, SparkSession spark) throws InterruptedException, TimeoutException {
    StructType schema =
        new StructType(
            new StructField[] {
                new StructField("anInt", IntegerType$.MODULE$, false, new Metadata(new HashMap<>())),
                new StructField("aString", StringType$.MODULE$, false, new Metadata(new HashMap<>()))
            });
    String csvPath = tmpDir.toAbsolutePath() + "/csv_data";
    String csvUri = "file://" + csvPath;
    spark
        .createDataFrame(
            Arrays.asList(
                new GenericRow(new Object[] { 1, "seven" }),
                new GenericRow(new Object[] { 6, "one" }),
                new GenericRow(new Object[] { 72, "fourteen" }),
                new GenericRow(new Object[] { 99, "sixteen" })),
            schema)
        .write()
        .csv(csvUri);
    StaticExecutionContextFactory.waitForExecutionEnd();
    // reset to start counting now
    reset(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT);
    when(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT.getJobNamespace()).thenReturn("theNamespace");
    when(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT.getParentJobName()).thenReturn("theParentJob");
    when(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT.getParentRunId()).thenReturn(Optional.of(UUID.randomUUID()));
    JobConf conf = new JobConf();
    FileInputFormat.addInputPath(conf, new org.apache.hadoop.fs.Path(csvUri));
    JavaRDD<Tuple2<LongWritable, Text>> csvRdd =
        spark
            .sparkContext()
            .hadoopRDD(conf, TextInputFormat.class, LongWritable.class, Text.class, 1)
            .toJavaRDD();
    // Text.getBytes() exposes the backing buffer, which may be longer than the
    // valid contents, so use toString() to respect the record length.
    JavaRDD<Row> splitDf =
        csvRdd
            .map(t -> t._2.toString().split(","))
            .map(arr -> new GenericRow(new Object[] { Integer.parseInt(arr[0]), arr[1] }));
    Dataset<Row> df = spark.createDataFrame(splitDf, schema);
    String outputPath = tmpDir.toAbsolutePath() + "/output_data";
    String jsonPath = "file://" + outputPath;
    df.write().json(jsonPath);
    // wait for event processing to complete
    StaticExecutionContextFactory.waitForExecutionEnd();
    ArgumentCaptor<OpenLineage.RunEvent> lineageEvent = ArgumentCaptor.forClass(OpenLineage.RunEvent.class);
    Mockito.verify(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT, times(4)).emit(lineageEvent.capture());
    OpenLineage.RunEvent completeEvent = lineageEvent.getAllValues().get(2);
    assertThat(completeEvent).hasFieldOrPropertyWithValue("eventType", RunEvent.EventType.COMPLETE);
    assertThat(completeEvent.getInputs())
        .singleElement()
        .hasFieldOrPropertyWithValue("name", csvPath)
        .hasFieldOrPropertyWithValue("namespace", "file");
    assertThat(completeEvent.getOutputs())
        .singleElement()
        .hasFieldOrPropertyWithValue("name", outputPath)
        .hasFieldOrPropertyWithValue("namespace", "file");
}
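
StringType$.MODULE$ and IntegerType$.MODULE$ reach the Scala singleton objects behind Spark's type system from Java. The public Java-facing equivalents live on org.apache.spark.sql.types.DataTypes; a minimal sketch of the same two-column schema using that API (the class name is illustrative):

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class SchemaSketch {
    public static void main(String[] args) {
        // DataTypes.IntegerType and DataTypes.StringType are the same singletons
        // that IntegerType$.MODULE$ and StringType$.MODULE$ resolve to.
        StructType schema = new StructType(new StructField[] {
            new StructField("anInt", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("aString", DataTypes.StringType, false, Metadata.empty())
        });
        System.out.println(schema.treeString());
    }
}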

Example 2 with StringType$

Use of org.apache.spark.sql.types.StringType$ in project OpenLineage by OpenLineage.

From the class LogicalRDDVisitorTest, method testApply.

@Test
public void testApply(@TempDir Path tmpDir) {
    SparkSession session = SparkSession.builder().master("local").getOrCreate();
    LogicalRDDVisitor visitor =
        new LogicalRDDVisitor(
            SparkAgentTestExtension.newContext(session),
            DatasetFactory.output(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI)));
    StructType schema =
        new StructType(
            new StructField[] {
                new StructField("anInt", IntegerType$.MODULE$, false, new Metadata(new HashMap<>())),
                new StructField("aString", StringType$.MODULE$, false, new Metadata(new HashMap<>()))
            });
    jobConf = new JobConf();
    FileInputFormat.addInputPath(jobConf, new org.apache.hadoop.fs.Path("file://" + tmpDir));
    RDD<InternalRow> hadoopRdd =
        new HadoopRDD<>(
                session.sparkContext(), jobConf, TextInputFormat.class, LongWritable.class, Text.class, 1)
            .toJavaRDD()
            .map(t -> (InternalRow) new GenericInternalRow(new Object[] { t._2.toString() }))
            .rdd();
    LogicalRDD logicalRDD =
        new LogicalRDD(
            ScalaConversionUtils.fromSeq(schema.toAttributes()).stream()
                .map(AttributeReference::toAttribute)
                .collect(ScalaConversionUtils.toSeq()),
            hadoopRdd,
            SinglePartition$.MODULE$,
            Seq$.MODULE$.<SortOrder>empty(),
            false,
            session);
    assertThat(visitor.isDefinedAt(logicalRDD)).isTrue();
    List<OpenLineage.Dataset> datasets = visitor.apply(logicalRDD);
    assertThat(datasets)
        .singleElement()
        .hasFieldOrPropertyWithValue("name", tmpDir.toString())
        .hasFieldOrPropertyWithValue("namespace", "file");
}
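
Seq$.MODULE$.<SortOrder>empty() and ScalaConversionUtils.toSeq() are Scala interop shims; from plain Java, the Seq that Catalyst constructors such as LogicalRDD expect can also be built with JavaConverters. A minimal sketch, assuming Scala 2.12 on the classpath (the class name is illustrative):

import java.util.Arrays;
import java.util.List;
import scala.collection.JavaConverters;
import scala.collection.Seq;

public class SeqInteropSketch {
    public static void main(String[] args) {
        // Wraps a java.util.List as a Scala Buffer, then converts it to the
        // scala.collection.Seq that Catalyst APIs expect.
        List<String> names = Arrays.asList("anInt", "aString");
        Seq<String> seq = JavaConverters.asScalaBufferConverter(names).asScala().toSeq();
        System.out.println(seq);
    }
}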

Example 3 with StringType$

Use of org.apache.spark.sql.types.StringType$ in project OpenLineage by OpenLineage.

From the class DeltaDataSourceTest, method testInsertIntoDeltaSource.

@Test
public void testInsertIntoDeltaSource(@TempDir Path tempDir, SparkSession spark) throws IOException, InterruptedException, TimeoutException {
    StructType tableSchema =
        new StructType(
            new StructField[] {
                new StructField("name", StringType$.MODULE$, false, Metadata.empty()),
                new StructField("age", LongType$.MODULE$, false, Metadata.empty())
            });
    Dataset<Row> df =
        spark.createDataFrame(
            Arrays.asList(
                new GenericRowWithSchema(new Object[] { "john", 25L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "sam", 22L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "alicia", 35L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "bob", 47L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "jordan", 52L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "liz", 19L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "marcia", 83L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "maria", 40L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "luis", 8L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "gabriel", 30L }, tableSchema)),
            tableSchema);
    String deltaDir = tempDir.resolve("deltaData").toAbsolutePath().toString();
    df.write().format("delta").option("path", deltaDir).mode(SaveMode.Overwrite).save();
    // wait for event processing to complete
    StaticExecutionContextFactory.waitForExecutionEnd();
    ArgumentCaptor<RunEvent> lineageEvent = ArgumentCaptor.forClass(OpenLineage.RunEvent.class);
    Mockito.verify(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT, Mockito.atLeast(2)).emit(lineageEvent.capture());
    List<RunEvent> events = lineageEvent.getAllValues();
    Optional<RunEvent> completionEvent =
        events.stream()
            .filter(e -> e.getEventType().equals(EventType.COMPLETE) && !e.getOutputs().isEmpty())
            .findFirst();
    assertTrue(completionEvent.isPresent());
    OpenLineage.RunEvent event = completionEvent.get();
    List<OpenLineage.OutputDataset> outputs = event.getOutputs();
    assertEquals(1, outputs.size());
    assertEquals("file", outputs.get(0).getNamespace());
    assertEquals(deltaDir, outputs.get(0).getName());
}
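
In DataFrameWriter, setting the target via .option("path", ...) and then calling save() is equivalent to passing the path to save(String) directly. Continuing from the test above, a one-line alternative (requires delta-core on the classpath):

    df.write().format("delta").mode(SaveMode.Overwrite).save(deltaDir);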

Example 4 with StringType$

Use of org.apache.spark.sql.types.StringType$ in project spark-bigquery-connector by GoogleCloudDataproc.

From the class AvroSchemaConverter, method createConverterFor.

static Converter createConverterFor(DataType sparkType, Schema avroType) {
    if (sparkType instanceof NullType && avroType.getType() == Schema.Type.NULL) {
        return (getter, ordinal) -> null;
    }
    if (sparkType instanceof BooleanType && avroType.getType() == Schema.Type.BOOLEAN) {
        return (getter, ordinal) -> getter.getBoolean(ordinal);
    }
    if (sparkType instanceof ByteType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> Long.valueOf(getter.getByte(ordinal));
    }
    if (sparkType instanceof ShortType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> Long.valueOf(getter.getShort(ordinal));
    }
    if (sparkType instanceof IntegerType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> Long.valueOf(getter.getInt(ordinal));
    }
    if (sparkType instanceof LongType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> getter.getLong(ordinal);
    }
    if (sparkType instanceof FloatType && avroType.getType() == Schema.Type.DOUBLE) {
        return (getter, ordinal) -> Double.valueOf(getter.getFloat(ordinal));
    }
    if (sparkType instanceof DoubleType && avroType.getType() == Schema.Type.DOUBLE) {
        return (getter, ordinal) -> getter.getDouble(ordinal);
    }
    if (sparkType instanceof DecimalType && avroType.getType() == Schema.Type.BYTES) {
        DecimalType decimalType = (DecimalType) sparkType;
        return (getter, ordinal) -> {
            Decimal decimal = getter.getDecimal(ordinal, decimalType.precision(), decimalType.scale());
            return DECIMAL_CONVERSIONS.toBytes(decimal.toJavaBigDecimal(), avroType, LogicalTypes.decimal(decimalType.precision(), decimalType.scale()));
        };
    }
    if (sparkType instanceof StringType && avroType.getType() == Schema.Type.STRING) {
        return (getter, ordinal) -> new Utf8(getter.getUTF8String(ordinal).getBytes());
    }
    if (sparkType instanceof BinaryType && avroType.getType() == Schema.Type.FIXED) {
        int size = avroType.getFixedSize();
        return (getter, ordinal) -> {
            byte[] data = getter.getBinary(ordinal);
            if (data.length != size) {
                throw new IllegalArgumentException(String.format("Cannot write %s bytes of binary data into FIXED Type with size of %s bytes", data.length, size));
            }
            return new GenericData.Fixed(avroType, data);
        };
    }
    if (sparkType instanceof BinaryType && avroType.getType() == Schema.Type.BYTES) {
        return (getter, ordinal) -> ByteBuffer.wrap(getter.getBinary(ordinal));
    }
    if (sparkType instanceof DateType && avroType.getType() == Schema.Type.INT) {
        return (getter, ordinal) -> getter.getInt(ordinal);
    }
    if (sparkType instanceof TimestampType && avroType.getType() == Schema.Type.LONG) {
        return (getter, ordinal) -> getter.getLong(ordinal);
    }
    if (sparkType instanceof ArrayType && avroType.getType() == Schema.Type.ARRAY) {
        DataType et = ((ArrayType) sparkType).elementType();
        boolean containsNull = ((ArrayType) sparkType).containsNull();
        Converter elementConverter = createConverterFor(et, resolveNullableType(avroType.getElementType(), containsNull));
        return (getter, ordinal) -> {
            ArrayData arrayData = getter.getArray(ordinal);
            int len = arrayData.numElements();
            Object[] result = new Object[len];
            for (int i = 0; i < len; i++) {
                if (containsNull && arrayData.isNullAt(i)) {
                    result[i] = null;
                } else {
                    result[i] = elementConverter.convert(arrayData, i);
                }
            }
            // The Avro writer expects a java.util.Collection; Arrays.asList wraps the
            // array in a fixed-size List without copying the data.
            return java.util.Arrays.asList(result);
        };
    }
    if (sparkType instanceof StructType && avroType.getType() == Schema.Type.RECORD) {
        StructType sparkStruct = (StructType) sparkType;
        StructConverter structConverter = new StructConverter(sparkStruct, avroType);
        int numFields = sparkStruct.length();
        return (getter, ordinal) -> structConverter.convert(getter.getStruct(ordinal, numFields));
    }
    if (sparkType instanceof UserDefinedType) {
        UserDefinedType userDefinedType = (UserDefinedType) sparkType;
        return createConverterFor(userDefinedType.sqlType(), avroType);
    }
    throw new IllegalArgumentException(String.format("Cannot convert Catalyst type %s to Avro type %s", sparkType, avroType));
}
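
Every branch above returns a lambda that closes over the resolved types and, at write time, pulls field ordinal out of a row through SpecializedGetters (which InternalRow implements). A minimal sketch of the string branch in isolation, assuming spark-catalyst and avro on the classpath (class name and interface placement are illustrative):

import org.apache.avro.util.Utf8;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.catalyst.expressions.SpecializedGetters;
import org.apache.spark.unsafe.types.UTF8String;

public class ConverterSketch {
    // Mirrors the functional shape used by createConverterFor: a row accessor
    // plus an ordinal in, an Avro-compatible value out.
    interface Converter {
        Object convert(SpecializedGetters getter, int ordinal);
    }

    public static void main(String[] args) {
        Converter stringConverter =
            (getter, ordinal) -> new Utf8(getter.getUTF8String(ordinal).getBytes());
        InternalRow row = new GenericInternalRow(new Object[] { UTF8String.fromString("hello") });
        System.out.println(stringConverter.convert(row, 0)); // prints: hello
    }
}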

Example 5 with StringType$

Use of org.apache.spark.sql.types.StringType$ in project spark-bigquery-connector by GoogleCloudDataproc.

From the class AvroSchemaConverter, method sparkTypeToRawAvroType.

static Schema sparkTypeToRawAvroType(DataType dataType, String recordName, SchemaBuilder.TypeBuilder<Schema> builder) {
    if (dataType instanceof BinaryType) {
        return builder.bytesType();
    }
    if (dataType instanceof ByteType || dataType instanceof ShortType || dataType instanceof IntegerType || dataType instanceof LongType) {
        return builder.longType();
    }
    if (dataType instanceof BooleanType) {
        return builder.booleanType();
    }
    if (dataType instanceof FloatType || dataType instanceof DoubleType) {
        return builder.doubleType();
    }
    if (dataType instanceof DecimalType) {
        DecimalType decimalType = (DecimalType) dataType;
        if (decimalType.precision() <= SchemaConverters.BQ_NUMERIC_PRECISION && decimalType.scale() <= SchemaConverters.BQ_NUMERIC_SCALE) {
            return LogicalTypes.decimal(decimalType.precision(), decimalType.scale()).addToSchema(builder.bytesType());
        } else {
            throw new IllegalArgumentException("Decimal type is too wide to fit in BigQuery Numeric format");
        }
    }
    if (dataType instanceof StringType) {
        return builder.stringType();
    }
    if (dataType instanceof TimestampType) {
        // Spark timestamps carry microsecond precision; encode them as Avro
        // timestamp-micros over a long, which BigQuery accepts for TIMESTAMP.
        return LogicalTypes.timestampMicros().addToSchema(builder.longType());
    }
    }
    if (dataType instanceof DateType) {
        return LogicalTypes.date().addToSchema(builder.intType());
    }
    if (dataType instanceof ArrayType) {
        return builder.array().items(sparkTypeToRawAvroType(((ArrayType) dataType).elementType(), ((ArrayType) dataType).containsNull(), recordName));
    }
    if (dataType instanceof StructType) {
        SchemaBuilder.FieldAssembler<Schema> fieldsAssembler = builder.record(recordName).fields();
        for (StructField field : ((StructType) dataType).fields()) {
            Schema avroType = sparkTypeToRawAvroType(field.dataType(), field.nullable(), field.name());
            fieldsAssembler.name(field.name()).type(avroType).noDefault();
        }
        return fieldsAssembler.endRecord();
    }
    if (dataType instanceof UserDefinedType) {
        DataType userDefinedType = ((UserDefinedType) dataType).sqlType();
        return sparkTypeToRawAvroType(userDefinedType, recordName, builder);
    }
    if (dataType instanceof MapType) {
        throw new IllegalArgumentException(SchemaConverters.MAPTYPE_ERROR_MESSAGE);
    } else {
        throw new IllegalArgumentException("Data type not supported: " + dataType.simpleString());
    }
}
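
A hedged sketch of driving sparkTypeToRawAvroType from a test class in the same package (the method is package-private above; SchemaBuilder.builder() supplies the TypeBuilder<Schema> it expects):

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.spark.sql.types.DataTypes;

public class RawAvroTypeSketch {
    public static void main(String[] args) {
        // StringType maps to the raw Avro "string" type via the branch above.
        Schema s = AvroSchemaConverter.sparkTypeToRawAvroType(
            DataTypes.StringType, "record0", SchemaBuilder.builder());
        System.out.println(s); // "string"
    }
}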

Aggregations

StructType (org.apache.spark.sql.types.StructType): 10
StringType (org.apache.spark.sql.types.StringType): 9
BooleanType (org.apache.spark.sql.types.BooleanType): 7
StructField (org.apache.spark.sql.types.StructField): 7
DataType (org.apache.spark.sql.types.DataType): 6
ArrayType (org.apache.spark.sql.types.ArrayType): 5
DoubleType (org.apache.spark.sql.types.DoubleType): 5
List (java.util.List): 4
OpenLineage (io.openlineage.client.OpenLineage): 3
SparkAgentTestExtension (io.openlineage.spark.agent.SparkAgentTestExtension): 3
Path (java.nio.file.Path): 3
Optional (java.util.Optional): 3
SparkSession (org.apache.spark.sql.SparkSession): 3
DateType (org.apache.spark.sql.types.DateType): 3
DecimalType (org.apache.spark.sql.types.DecimalType): 3
FloatType (org.apache.spark.sql.types.FloatType): 3
IntegerType (org.apache.spark.sql.types.IntegerType): 3
LongType (org.apache.spark.sql.types.LongType): 3
Metadata (org.apache.spark.sql.types.Metadata): 3
ShortType (org.apache.spark.sql.types.ShortType): 3