Search in sources :

Example 1 with SortOrder

Example usage of org.apache.spark.sql.catalyst.expressions.SortOrder in the OpenLineage project.

From the class LogicalRDDVisitorTest, method testApply:

@Test
public void testApply(@TempDir Path tmpDir) {
    // Obtain (or reuse) a local Spark session for this test.
    SparkSession spark = SparkSession.builder().master("local").getOrCreate();

    // Visitor under test, wired with a test agent context and an output-dataset factory.
    LogicalRDDVisitor visitor =
        new LogicalRDDVisitor(
            SparkAgentTestExtension.newContext(spark),
            DatasetFactory.output(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI)));

    // Two-column schema: a non-nullable int and a non-nullable string, both with empty metadata.
    StructField intField =
        new StructField("anInt", IntegerType$.MODULE$, false, new Metadata(new HashMap<>()));
    StructField stringField =
        new StructField("aString", StringType$.MODULE$, false, new Metadata(new HashMap<>()));
    StructType schema = new StructType(new StructField[] {intField, stringField});

    // Point a Hadoop job at the temp directory so the RDD carries a file-based input path.
    // NOTE: jobConf is a field (presumably cleaned up in an @AfterEach) — not a local.
    jobConf = new JobConf();
    FileInputFormat.addInputPath(jobConf, new org.apache.hadoop.fs.Path("file://" + tmpDir));

    // Build an InternalRow RDD backed by the HadoopRDD reading text lines from tmpDir.
    RDD<InternalRow> hadoopRdd =
        new HadoopRDD<>(
                spark.sparkContext(),
                jobConf,
                TextInputFormat.class,
                LongWritable.class,
                Text.class,
                1)
            .toJavaRDD()
            .map(t -> (InternalRow) new GenericInternalRow(new Object[] {t._2.toString()}))
            .rdd();

    // Wrap the RDD in a LogicalRDD node with the schema's attributes and no sort order.
    LogicalRDD logicalRDD =
        new LogicalRDD(
            ScalaConversionUtils.fromSeq(schema.toAttributes()).stream()
                .map(AttributeReference::toAttribute)
                .collect(ScalaConversionUtils.toSeq()),
            hadoopRdd,
            SinglePartition$.MODULE$,
            Seq$.MODULE$.<SortOrder>empty(),
            false,
            spark);

    // The visitor should recognize the LogicalRDD and emit exactly one dataset
    // named after the temp directory, under the "file" namespace.
    assertThat(visitor.isDefinedAt(logicalRDD)).isTrue();
    List<OpenLineage.Dataset> datasets = visitor.apply(logicalRDD);
    assertThat(datasets)
        .singleElement()
        .hasFieldOrPropertyWithValue("name", tmpDir.toString())
        .hasFieldOrPropertyWithValue("namespace", "file");
}
Also used : OpenLineageClient(io.openlineage.spark.agent.client.OpenLineageClient) Seq$(scala.collection.Seq$) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) InternalRow(org.apache.spark.sql.catalyst.InternalRow) SinglePartition$(org.apache.spark.sql.catalyst.plans.physical.SinglePartition$) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) AttributeReference(org.apache.spark.sql.catalyst.expressions.AttributeReference) SparkAgentTestExtension(io.openlineage.spark.agent.SparkAgentTestExtension) ExtendWith(org.junit.jupiter.api.extension.ExtendWith) HadoopRDD(org.apache.spark.rdd.HadoopRDD) Path(java.nio.file.Path) SparkSession(org.apache.spark.sql.SparkSession) Metadata(org.apache.spark.sql.types.Metadata) StringType$(org.apache.spark.sql.types.StringType$) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) IntegerType$(org.apache.spark.sql.types.IntegerType$) SparkSession$(org.apache.spark.sql.SparkSession$) DatasetFactory(io.openlineage.spark.api.DatasetFactory) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) ScalaConversionUtils(io.openlineage.spark.agent.util.ScalaConversionUtils) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.jupiter.api.Test) List(java.util.List) AfterEach(org.junit.jupiter.api.AfterEach) SortOrder(org.apache.spark.sql.catalyst.expressions.SortOrder) TempDir(org.junit.jupiter.api.io.TempDir) LogicalRDD(org.apache.spark.sql.execution.LogicalRDD) HashMap(scala.collection.immutable.HashMap) OpenLineage(io.openlineage.client.OpenLineage) RDD(org.apache.spark.rdd.RDD) SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) AttributeReference(org.apache.spark.sql.catalyst.expressions.AttributeReference) Metadata(org.apache.spark.sql.types.Metadata) 
Text(org.apache.hadoop.io.Text) StructField(org.apache.spark.sql.types.StructField) LogicalRDD(org.apache.spark.sql.execution.LogicalRDD) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) OpenLineage(io.openlineage.client.OpenLineage) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InternalRow(org.apache.spark.sql.catalyst.InternalRow) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) Test(org.junit.jupiter.api.Test)

Aggregations

OpenLineage (io.openlineage.client.OpenLineage)1 SparkAgentTestExtension (io.openlineage.spark.agent.SparkAgentTestExtension)1 OpenLineageClient (io.openlineage.spark.agent.client.OpenLineageClient)1 ScalaConversionUtils (io.openlineage.spark.agent.util.ScalaConversionUtils)1 DatasetFactory (io.openlineage.spark.api.DatasetFactory)1 Path (java.nio.file.Path)1 List (java.util.List)1 LongWritable (org.apache.hadoop.io.LongWritable)1 Text (org.apache.hadoop.io.Text)1 FileInputFormat (org.apache.hadoop.mapred.FileInputFormat)1 JobConf (org.apache.hadoop.mapred.JobConf)1 TextInputFormat (org.apache.hadoop.mapred.TextInputFormat)1 HadoopRDD (org.apache.spark.rdd.HadoopRDD)1 RDD (org.apache.spark.rdd.RDD)1 SparkSession (org.apache.spark.sql.SparkSession)1 SparkSession$ (org.apache.spark.sql.SparkSession$)1 InternalRow (org.apache.spark.sql.catalyst.InternalRow)1 AttributeReference (org.apache.spark.sql.catalyst.expressions.AttributeReference)1 GenericInternalRow (org.apache.spark.sql.catalyst.expressions.GenericInternalRow)1 SortOrder (org.apache.spark.sql.catalyst.expressions.SortOrder)1