Use of org.apache.spark.sql.types.StringType$ in project OpenLineage by OpenLineage.
From the class SparkReadWriteIntegTest, method testWithLogicalRdd:
@Test
public void testWithLogicalRdd(@TempDir Path tmpDir, SparkSession spark)
    throws InterruptedException, TimeoutException {
  StructType schema =
      new StructType(
          new StructField[] {
            new StructField("anInt", IntegerType$.MODULE$, false, new Metadata(new HashMap<>())),
            new StructField("aString", StringType$.MODULE$, false, new Metadata(new HashMap<>()))
          });
  String csvPath = tmpDir.toAbsolutePath() + "/csv_data";
  String csvUri = "file://" + csvPath;
  spark
      .createDataFrame(
          Arrays.asList(
              new GenericRow(new Object[] {1, "seven"}),
              new GenericRow(new Object[] {6, "one"}),
              new GenericRow(new Object[] {72, "fourteen"}),
              new GenericRow(new Object[] {99, "sixteen"})),
          schema)
      .write()
      .csv(csvUri);
  StaticExecutionContextFactory.waitForExecutionEnd();

  // reset to start counting now
  reset(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT);
  when(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT.getJobNamespace())
      .thenReturn("theNamespace");
  when(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT.getParentJobName())
      .thenReturn("theParentJob");
  when(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT.getParentRunId())
      .thenReturn(Optional.of(UUID.randomUUID()));

  JobConf conf = new JobConf();
  FileInputFormat.addInputPath(conf, new org.apache.hadoop.fs.Path(csvUri));
  JavaRDD<Tuple2<LongWritable, Text>> csvRdd =
      spark
          .sparkContext()
          .hadoopRDD(conf, TextInputFormat.class, LongWritable.class, Text.class, 1)
          .toJavaRDD();
  JavaRDD<Row> splitDf =
      csvRdd
          .map(t -> new String(t._2.getBytes()).split(","))
          .map(arr -> new GenericRow(new Object[] {Integer.parseInt(arr[0]), arr[1]}));
  Dataset<Row> df = spark.createDataFrame(splitDf, schema);

  String outputPath = tmpDir.toAbsolutePath() + "/output_data";
  String jsonPath = "file://" + outputPath;
  df.write().json(jsonPath);

  // wait for event processing to complete
  StaticExecutionContextFactory.waitForExecutionEnd();

  ArgumentCaptor<OpenLineage.RunEvent> lineageEvent =
      ArgumentCaptor.forClass(OpenLineage.RunEvent.class);
  Mockito.verify(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT, times(4))
      .emit(lineageEvent.capture());
  OpenLineage.RunEvent completeEvent = lineageEvent.getAllValues().get(2);
  assertThat(completeEvent).hasFieldOrPropertyWithValue("eventType", RunEvent.EventType.COMPLETE);
  assertThat(completeEvent.getInputs())
      .singleElement()
      .hasFieldOrPropertyWithValue("name", csvPath)
      .hasFieldOrPropertyWithValue("namespace", "file");
  assertThat(completeEvent.getOutputs())
      .singleElement()
      .hasFieldOrPropertyWithValue("name", outputPath)
      .hasFieldOrPropertyWithValue("namespace", "file");
}
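Note on the schema construction above: StringType$.MODULE$ and IntegerType$.MODULE$ are the Java spellings of Spark's Scala singleton type objects. The same schema can be written more compactly with the Java-friendly org.apache.spark.sql.types.DataTypes constants, which refer to those same singletons. A minimal sketch, not part of the test itself:

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// DataTypes.StringType is the same singleton instance as StringType$.MODULE$;
// add(name, type, nullable) defaults the column metadata to Metadata.empty().
StructType schema =
    new StructType()
        .add("anInt", DataTypes.IntegerType, false)
        .add("aString", DataTypes.StringType, false);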
Use of org.apache.spark.sql.types.StringType$ in project OpenLineage by OpenLineage.
From the class LogicalRDDVisitorTest, method testApply:
@Test
public void testApply(@TempDir Path tmpDir) {
  SparkSession session = SparkSession.builder().master("local").getOrCreate();
  LogicalRDDVisitor visitor =
      new LogicalRDDVisitor(
          SparkAgentTestExtension.newContext(session),
          DatasetFactory.output(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI)));
  StructType schema =
      new StructType(
          new StructField[] {
            new StructField("anInt", IntegerType$.MODULE$, false, new Metadata(new HashMap<>())),
            new StructField("aString", StringType$.MODULE$, false, new Metadata(new HashMap<>()))
          });
  jobConf = new JobConf();
  FileInputFormat.addInputPath(jobConf, new org.apache.hadoop.fs.Path("file://" + tmpDir));
  RDD<InternalRow> hadoopRdd =
      new HadoopRDD<>(
              session.sparkContext(),
              jobConf,
              TextInputFormat.class,
              LongWritable.class,
              Text.class,
              1)
          .toJavaRDD()
          .map(t -> (InternalRow) new GenericInternalRow(new Object[] {t._2.toString()}))
          .rdd();
  LogicalRDD logicalRDD =
      new LogicalRDD(
          ScalaConversionUtils.fromSeq(schema.toAttributes()).stream()
              .map(AttributeReference::toAttribute)
              .collect(ScalaConversionUtils.toSeq()),
          hadoopRdd,
          SinglePartition$.MODULE$,
          Seq$.MODULE$.<SortOrder>empty(),
          false,
          session);
  assertThat(visitor.isDefinedAt(logicalRDD)).isTrue();
  List<OpenLineage.Dataset> datasets = visitor.apply(logicalRDD);
  assertThat(datasets)
      .singleElement()
      .hasFieldOrPropertyWithValue("name", tmpDir.toString())
      .hasFieldOrPropertyWithValue("namespace", "file");
}
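The visitor behaves like a partial function over logical plan nodes: isDefinedAt guards apply, which is exactly what the assertions above exercise. A hedged sketch of how a caller might drive it; the plan variable is illustrative, and the real agent walks plan nodes internally:

// Only apply the visitor to nodes it declares support for.
LogicalPlan plan = logicalRDD;
if (visitor.isDefinedAt(plan)) {
  List<OpenLineage.Dataset> found = visitor.apply(plan);
  // each Dataset carries a name (here the HadoopRDD input path) and a namespace such as "file"
}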
Use of org.apache.spark.sql.types.StringType$ in project OpenLineage by OpenLineage.
From the class DeltaDataSourceTest, method testInsertIntoDeltaSource:
@Test
public void testInsertIntoDeltaSource(@TempDir Path tempDir, SparkSession spark)
    throws IOException, InterruptedException, TimeoutException {
  StructType tableSchema =
      new StructType(
          new StructField[] {
            new StructField("name", StringType$.MODULE$, false, Metadata.empty()),
            new StructField("age", LongType$.MODULE$, false, Metadata.empty())
          });
  Dataset<Row> df =
      spark.createDataFrame(
          Arrays.asList(
              new GenericRowWithSchema(new Object[] {"john", 25L}, tableSchema),
              new GenericRowWithSchema(new Object[] {"sam", 22L}, tableSchema),
              new GenericRowWithSchema(new Object[] {"alicia", 35L}, tableSchema),
              new GenericRowWithSchema(new Object[] {"bob", 47L}, tableSchema),
              new GenericRowWithSchema(new Object[] {"jordan", 52L}, tableSchema),
              new GenericRowWithSchema(new Object[] {"liz", 19L}, tableSchema),
              new GenericRowWithSchema(new Object[] {"marcia", 83L}, tableSchema),
              new GenericRowWithSchema(new Object[] {"maria", 40L}, tableSchema),
              new GenericRowWithSchema(new Object[] {"luis", 8L}, tableSchema),
              new GenericRowWithSchema(new Object[] {"gabriel", 30L}, tableSchema)),
          tableSchema);
  String deltaDir = tempDir.resolve("deltaData").toAbsolutePath().toString();
  df.write().format("delta").option("path", deltaDir).mode(SaveMode.Overwrite).save();

  // wait for event processing to complete
  StaticExecutionContextFactory.waitForExecutionEnd();

  ArgumentCaptor<RunEvent> lineageEvent = ArgumentCaptor.forClass(OpenLineage.RunEvent.class);
  Mockito.verify(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT, Mockito.atLeast(2))
      .emit(lineageEvent.capture());
  List<RunEvent> events = lineageEvent.getAllValues();
  Optional<RunEvent> completionEvent =
      events.stream()
          .filter(e -> e.getEventType().equals(EventType.COMPLETE) && !e.getOutputs().isEmpty())
          .findFirst();
  assertTrue(completionEvent.isPresent());
  OpenLineage.RunEvent event = completionEvent.get();
  List<OpenLineage.OutputDataset> outputs = event.getOutputs();
  assertEquals(1, outputs.size());
  assertEquals("file", outputs.get(0).getNamespace());
  assertEquals(deltaDir, outputs.get(0).getName());
}
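Building every row as a GenericRowWithSchema works but is verbose. A hedged alternative sketch uses Spark's RowFactory and supplies the schema once to createDataFrame; the row values here are illustrative, not taken from the test:

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

// RowFactory.create builds untyped Rows; tableSchema is applied by createDataFrame.
List<Row> rows =
    Arrays.asList(RowFactory.create("john", 25L), RowFactory.create("sam", 22L));
Dataset<Row> people = spark.createDataFrame(rows, tableSchema);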
Use of org.apache.spark.sql.types.StringType$ in project spark-bigquery-connector by GoogleCloudDataproc.
From the class AvroSchemaConverter, method createConverterFor:
static Converter createConverterFor(DataType sparkType, Schema avroType) {
  if (sparkType instanceof NullType && avroType.getType() == Schema.Type.NULL) {
    return (getter, ordinal) -> null;
  }
  if (sparkType instanceof BooleanType && avroType.getType() == Schema.Type.BOOLEAN) {
    return (getter, ordinal) -> getter.getBoolean(ordinal);
  }
  if (sparkType instanceof ByteType && avroType.getType() == Schema.Type.LONG) {
    return (getter, ordinal) -> Long.valueOf(getter.getByte(ordinal));
  }
  if (sparkType instanceof ShortType && avroType.getType() == Schema.Type.LONG) {
    return (getter, ordinal) -> Long.valueOf(getter.getShort(ordinal));
  }
  if (sparkType instanceof IntegerType && avroType.getType() == Schema.Type.LONG) {
    return (getter, ordinal) -> Long.valueOf(getter.getInt(ordinal));
  }
  if (sparkType instanceof LongType && avroType.getType() == Schema.Type.LONG) {
    return (getter, ordinal) -> getter.getLong(ordinal);
  }
  if (sparkType instanceof FloatType && avroType.getType() == Schema.Type.DOUBLE) {
    return (getter, ordinal) -> Double.valueOf(getter.getFloat(ordinal));
  }
  if (sparkType instanceof DoubleType && avroType.getType() == Schema.Type.DOUBLE) {
    return (getter, ordinal) -> getter.getDouble(ordinal);
  }
  if (sparkType instanceof DecimalType && avroType.getType() == Schema.Type.BYTES) {
    DecimalType decimalType = (DecimalType) sparkType;
    return (getter, ordinal) -> {
      Decimal decimal = getter.getDecimal(ordinal, decimalType.precision(), decimalType.scale());
      return DECIMAL_CONVERSIONS.toBytes(
          decimal.toJavaBigDecimal(),
          avroType,
          LogicalTypes.decimal(decimalType.precision(), decimalType.scale()));
    };
  }
  if (sparkType instanceof StringType && avroType.getType() == Schema.Type.STRING) {
    return (getter, ordinal) -> new Utf8(getter.getUTF8String(ordinal).getBytes());
  }
  if (sparkType instanceof BinaryType && avroType.getType() == Schema.Type.FIXED) {
    int size = avroType.getFixedSize();
    return (getter, ordinal) -> {
      byte[] data = getter.getBinary(ordinal);
      if (data.length != size) {
        throw new IllegalArgumentException(
            String.format(
                "Cannot write %s bytes of binary data into FIXED Type with size of %s bytes",
                data.length, size));
      }
      return new GenericData.Fixed(avroType, data);
    };
  }
  if (sparkType instanceof BinaryType && avroType.getType() == Schema.Type.BYTES) {
    return (getter, ordinal) -> ByteBuffer.wrap(getter.getBinary(ordinal));
  }
  if (sparkType instanceof DateType && avroType.getType() == Schema.Type.INT) {
    return (getter, ordinal) -> getter.getInt(ordinal);
  }
  if (sparkType instanceof TimestampType && avroType.getType() == Schema.Type.LONG) {
    return (getter, ordinal) -> getter.getLong(ordinal);
  }
  if (sparkType instanceof ArrayType && avroType.getType() == Schema.Type.ARRAY) {
    DataType et = ((ArrayType) sparkType).elementType();
    boolean containsNull = ((ArrayType) sparkType).containsNull();
    Converter elementConverter =
        createConverterFor(et, resolveNullableType(avroType.getElementType(), containsNull));
    return (getter, ordinal) -> {
      ArrayData arrayData = getter.getArray(ordinal);
      int len = arrayData.numElements();
      Object[] result = new Object[len];
      for (int i = 0; i < len; i++) {
        if (containsNull && arrayData.isNullAt(i)) {
          result[i] = null;
        } else {
          result[i] = elementConverter.convert(arrayData, i);
        }
      }
      // The Avro writer expects a Java Collection, so return an `ArrayList` backed by the
      // result array without copying the data.
      return java.util.Arrays.asList(result);
    };
  }
  if (sparkType instanceof StructType && avroType.getType() == Schema.Type.RECORD) {
    StructType sparkStruct = (StructType) sparkType;
    StructConverter structConverter = new StructConverter(sparkStruct, avroType);
    int numFields = sparkStruct.length();
    return (getter, ordinal) -> structConverter.convert(getter.getStruct(ordinal, numFields));
  }
  if (sparkType instanceof UserDefinedType) {
    UserDefinedType userDefinedType = (UserDefinedType) sparkType;
    return createConverterFor(userDefinedType.sqlType(), avroType);
  }
  throw new IllegalArgumentException(
      String.format("Cannot convert Catalyst type %s to Avro type %s", sparkType, avroType));
}
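A hedged usage sketch for createConverterFor, assuming the caller sits in the same package (the method is package-private) and that Converter is the connector's converter functional interface referenced above, whose convert(getter, ordinal) accepts any SpecializedGetters implementation such as InternalRow:

// Sketch: convert the first column of an InternalRow to an Avro Utf8.
Schema avroString = SchemaBuilder.builder().stringType();
Converter toAvro = AvroSchemaConverter.createConverterFor(DataTypes.StringType, avroString);

InternalRow row = new GenericInternalRow(new Object[] {UTF8String.fromString("hello")});
Object value = toAvro.convert(row, 0); // expected to be a Utf8 wrapping "hello"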
Use of org.apache.spark.sql.types.StringType$ in project spark-bigquery-connector by GoogleCloudDataproc.
From the class AvroSchemaConverter, method sparkTypeToRawAvroType:
static Schema sparkTypeToRawAvroType(
    DataType dataType, String recordName, SchemaBuilder.TypeBuilder<Schema> builder) {
  if (dataType instanceof BinaryType) {
    return builder.bytesType();
  }
  if (dataType instanceof ByteType
      || dataType instanceof ShortType
      || dataType instanceof IntegerType
      || dataType instanceof LongType) {
    return builder.longType();
  }
  if (dataType instanceof BooleanType) {
    return builder.booleanType();
  }
  if (dataType instanceof FloatType || dataType instanceof DoubleType) {
    return builder.doubleType();
  }
  if (dataType instanceof DecimalType) {
    DecimalType decimalType = (DecimalType) dataType;
    if (decimalType.precision() <= SchemaConverters.BQ_NUMERIC_PRECISION
        && decimalType.scale() <= SchemaConverters.BQ_NUMERIC_SCALE) {
      return LogicalTypes.decimal(decimalType.precision(), decimalType.scale())
          .addToSchema(builder.bytesType());
    } else {
      throw new IllegalArgumentException(
          "Decimal type is too wide to fit in BigQuery Numeric format");
    }
  }
  if (dataType instanceof StringType) {
    return builder.stringType();
  }
  if (dataType instanceof TimestampType) {
    // Spark represents timestamps as microseconds since the epoch, so map them to Avro's
    // timestamp-micros logical type.
    return LogicalTypes.timestampMicros().addToSchema(builder.longType());
  }
  if (dataType instanceof DateType) {
    return LogicalTypes.date().addToSchema(builder.intType());
  }
  if (dataType instanceof ArrayType) {
    return builder
        .array()
        .items(
            sparkTypeToRawAvroType(
                ((ArrayType) dataType).elementType(),
                ((ArrayType) dataType).containsNull(),
                recordName));
  }
  if (dataType instanceof StructType) {
    SchemaBuilder.FieldAssembler<Schema> fieldsAssembler = builder.record(recordName).fields();
    for (StructField field : ((StructType) dataType).fields()) {
      Schema avroType = sparkTypeToRawAvroType(field.dataType(), field.nullable(), field.name());
      fieldsAssembler.name(field.name()).type(avroType).noDefault();
    }
    return fieldsAssembler.endRecord();
  }
  if (dataType instanceof UserDefinedType) {
    DataType userDefinedType = ((UserDefinedType) dataType).sqlType();
    return sparkTypeToRawAvroType(userDefinedType, recordName, builder);
  }
  if (dataType instanceof MapType) {
    throw new IllegalArgumentException(SchemaConverters.MAPTYPE_ERROR_MESSAGE);
  } else {
    throw new IllegalArgumentException("Data type not supported: " + dataType.simpleString());
  }
}
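A hedged sketch of driving sparkTypeToRawAvroType for a simple struct, again assuming same-package access to the package-private method; the record name "root" is arbitrary:

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

StructType sparkSchema =
    new StructType()
        .add("name", DataTypes.StringType, false)
        .add("age", DataTypes.LongType, false);

// Expected result: an Avro record "root" with a string field "name" and a long field "age".
// Field nullability is handled by the nullable-aware overload used in the StructType branch.
Schema avroSchema =
    AvroSchemaConverter.sparkTypeToRawAvroType(sparkSchema, "root", SchemaBuilder.builder());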