Example 6 with LogicalRelation

Use of org.apache.spark.sql.execution.datasources.LogicalRelation in project OpenLineage by OpenLineage.

From class OptimizedCreateHiveTableAsSelectCommandVisitorTest, method testOptimizedCreateHiveTableAsSelectCommand.

@Test
void testOptimizedCreateHiveTableAsSelectCommand() {
    OptimizedCreateHiveTableAsSelectCommandVisitor visitor = new OptimizedCreateHiveTableAsSelectCommandVisitor(SparkAgentTestExtension.newContext(session));
    // Target: an external Hive table whose storage location is s3://bucket/directory.
    OptimizedCreateHiveTableAsSelectCommand command = new OptimizedCreateHiveTableAsSelectCommand(
        SparkUtils.catalogTable(
            TableIdentifier$.MODULE$.apply("tablename", Option.apply("db")),
            CatalogTableType.EXTERNAL(),
            CatalogStorageFormat$.MODULE$.apply(
                Option.apply(URI.create("s3://bucket/directory")), null, null, null, false, Map$.MODULE$.empty()),
            new StructType(new StructField[] {
                new StructField("key", IntegerType$.MODULE$, false, new Metadata(new HashMap<>())),
                new StructField("value", StringType$.MODULE$, false, new Metadata(new HashMap<>()))
            })),
        // Source: a JDBC-backed LogicalRelation with key/value columns.
        new LogicalRelation(
            new JDBCRelation(
                new StructType(new StructField[] {
                    new StructField("key", IntegerType$.MODULE$, false, null),
                    new StructField("value", StringType$.MODULE$, false, null)
                }),
                new Partition[] {},
                new JDBCOptions("", "temp",
                    scala.collection.immutable.Map$.MODULE$.newBuilder()
                        .$plus$eq(Tuple2.apply("driver", Driver.class.getName()))
                        .result()),
                session),
            Seq$.MODULE$.<AttributeReference>newBuilder()
                .$plus$eq(new AttributeReference("key", IntegerType$.MODULE$, false, null, ExprId.apply(1L), Seq$.MODULE$.<String>empty()))
                .$plus$eq(new AttributeReference("value", StringType$.MODULE$, false, null, ExprId.apply(2L), Seq$.MODULE$.<String>empty()))
                .result(),
            Option.empty(),
            false),
        ScalaConversionUtils.fromList(Arrays.asList("key", "value")),
        SaveMode.Overwrite);
    assertThat(visitor.isDefinedAt(command)).isTrue();
    List<OpenLineage.OutputDataset> datasets = visitor.apply(command);
    assertEquals(1, datasets.size());
    OpenLineage.OutputDataset outputDataset = datasets.get(0);
    assertEquals(OpenLineage.LifecycleStateChangeDatasetFacet.LifecycleStateChange.OVERWRITE, outputDataset.getFacets().getLifecycleStateChange().getLifecycleStateChange());
    assertEquals("directory", outputDataset.getName());
    assertEquals("s3://bucket", outputDataset.getNamespace());
}
Also used: StructType (org.apache.spark.sql.types.StructType), OptimizedCreateHiveTableAsSelectCommandVisitor (io.openlineage.spark.agent.lifecycle.plan.OptimizedCreateHiveTableAsSelectCommandVisitor), AttributeReference (org.apache.spark.sql.catalyst.expressions.AttributeReference), Metadata (org.apache.spark.sql.types.Metadata), JDBCRelation (org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation), Driver (org.postgresql.Driver), LogicalRelation (org.apache.spark.sql.execution.datasources.LogicalRelation), StructField (org.apache.spark.sql.types.StructField), JDBCOptions (org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions), OptimizedCreateHiveTableAsSelectCommand (org.apache.spark.sql.hive.execution.OptimizedCreateHiveTableAsSelectCommand), OpenLineage (io.openlineage.client.OpenLineage), Test (org.junit.jupiter.api.Test)

Example 7 with LogicalRelation

Use of org.apache.spark.sql.execution.datasources.LogicalRelation in project OpenLineage by OpenLineage.

From class LogicalPlanSerializerTest, method testSerializeInsertIntoHadoopPlan.

@Test
public void testSerializeInsertIntoHadoopPlan() throws IOException, InvocationTargetException, IllegalAccessException {
    SparkSession session = SparkSession.builder().master("local").getOrCreate();
    // A HadoopFsRelation over a catalog-backed file index, reading text files with a single "name" column.
    HadoopFsRelation hadoopFsRelation = new HadoopFsRelation(
        new CatalogFileIndex(
            session,
            CatalogTableTestUtils.getCatalogTable(new TableIdentifier("test", Option.apply("db"))),
            100L),
        new StructType(new StructField[] { new StructField("name", StringType$.MODULE$, false, Metadata.empty()) }),
        new StructType(new StructField[] { new StructField("name", StringType$.MODULE$, false, Metadata.empty()) }),
        Option.empty(),
        new TextFileFormat(),
        new HashMap<>(),
        session);
    LogicalRelation logicalRelation = new LogicalRelation(
        hadoopFsRelation,
        Seq$.MODULE$.<AttributeReference>newBuilder()
            .$plus$eq(new AttributeReference("name", StringType$.MODULE$, false, Metadata.empty(), ExprId.apply(1L), Seq$.MODULE$.<String>empty()))
            .result(),
        Option.empty(),
        false);
    // The command under test: an overwrite insert into /tmp from the logical relation above.
    InsertIntoHadoopFsRelationCommand command = new InsertIntoHadoopFsRelationCommand(
        new org.apache.hadoop.fs.Path("/tmp"),
        new HashMap<>(),
        false,
        Seq$.MODULE$.<Attribute>newBuilder()
            .$plus$eq(new AttributeReference("name", StringType$.MODULE$, false, Metadata.empty(), ExprId.apply(1L), Seq$.MODULE$.<String>empty()))
            .result(),
        Option.empty(),
        new TextFileFormat(),
        new HashMap<>(),
        logicalRelation,
        SaveMode.Overwrite,
        Option.empty(),
        Option.empty(),
        Seq$.MODULE$.<String>newBuilder().$plus$eq("name").result());
    Map<String, Object> commandActualNode = objectMapper.readValue(logicalPlanSerializer.serialize(command), mapTypeReference);
    Map<String, Object> hadoopFSActualNode = objectMapper.readValue(logicalPlanSerializer.serialize(logicalRelation), mapTypeReference);
    Path expectedCommandNodePath = Paths.get("src", "test", "resources", "test_data", "serde", "insertintofs-node.json");
    Path expectedHadoopFSNodePath = Paths.get("src", "test", "resources", "test_data", "serde", "hadoopfsrelation-node.json");
    Map<String, Object> expectedCommandNode = objectMapper.readValue(expectedCommandNodePath.toFile(), mapTypeReference);
    Map<String, Object> expectedHadoopFSNode = objectMapper.readValue(expectedHadoopFSNodePath.toFile(), mapTypeReference);
    assertThat(commandActualNode).satisfies(new MatchesMapRecursively(expectedCommandNode, Collections.singleton("exprId")));
    assertThat(hadoopFSActualNode).satisfies(new MatchesMapRecursively(expectedHadoopFSNode, Collections.singleton("exprId")));
}
Also used: TableIdentifier (org.apache.spark.sql.catalyst.TableIdentifier), Path (java.nio.file.Path), SparkSession (org.apache.spark.sql.SparkSession), StructType (org.apache.spark.sql.types.StructType), AttributeReference (org.apache.spark.sql.catalyst.expressions.AttributeReference), InsertIntoHadoopFsRelationCommand (org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand), LogicalRelation (org.apache.spark.sql.execution.datasources.LogicalRelation), StructField (org.apache.spark.sql.types.StructField), HadoopFsRelation (org.apache.spark.sql.execution.datasources.HadoopFsRelation), CatalogFileIndex (org.apache.spark.sql.execution.datasources.CatalogFileIndex), TextFileFormat (org.apache.spark.sql.execution.datasources.text.TextFileFormat), Test (org.junit.jupiter.api.Test)
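MatchesMapRecursively is a project test helper that is not shown in this listing. As a rough, hypothetical sketch of the idea, assuming it walks nested maps and skips ignored keys such as "exprId" (which vary between runs), a recursive comparison could look like the following; the real OpenLineage helper may differ:

import java.util.Map;
import java.util.Objects;
import java.util.Set;

final class RecursiveMapMatcherSketch {

    // Checks that every (non-ignored) entry of "expected" is present in "actual",
    // descending into nested maps; keys in ignoredKeys (e.g. "exprId") are skipped.
    @SuppressWarnings("unchecked")
    static boolean matches(Map<String, Object> expected, Map<String, Object> actual, Set<String> ignoredKeys) {
        for (Map.Entry<String, Object> entry : expected.entrySet()) {
            if (ignoredKeys.contains(entry.getKey())) {
                continue;
            }
            Object actualValue = actual.get(entry.getKey());
            if (entry.getValue() instanceof Map && actualValue instanceof Map) {
                if (!matches((Map<String, Object>) entry.getValue(), (Map<String, Object>) actualValue, ignoredKeys)) {
                    return false;
                }
            } else if (!Objects.equals(entry.getValue(), actualValue)) {
                return false;
            }
        }
        return true;
    }
}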

Example 8 with LogicalRelation

Use of org.apache.spark.sql.execution.datasources.LogicalRelation in project OpenLineage by OpenLineage.

From class LogicalPlanSerializerTest, method testSerializeLogicalPlan.

@Test
public void testSerializeLogicalPlan() throws IOException {
    String jdbcUrl = "jdbc:postgresql://postgreshost:5432/sparkdata";
    String sparkTableName = "my_spark_table";
    scala.collection.immutable.Map<String, String> map =
        (scala.collection.immutable.Map<String, String>) Map$.MODULE$.<String, String>newBuilder()
            .$plus$eq(Tuple2.apply("driver", Driver.class.getName()))
            .result();
    // A JDBC relation over the Postgres table, wrapped in a LogicalRelation with a single "name" attribute.
    JDBCRelation relation = new JDBCRelation(
        new StructType(new StructField[] { new StructField("name", StringType$.MODULE$, false, Metadata.empty()) }),
        new Partition[] {},
        new JDBCOptions(jdbcUrl, sparkTableName, map),
        mock(SparkSession.class));
    LogicalRelation logicalRelation = new LogicalRelation(
        relation,
        Seq$.MODULE$.<AttributeReference>newBuilder()
            .$plus$eq(new AttributeReference("name", StringType$.MODULE$, false, Metadata.empty(), ExprId.apply(1L), Seq$.MODULE$.<String>empty()))
            .result(),
        Option.empty(),
        false);
    Aggregate aggregate = new Aggregate(Seq$.MODULE$.<Expression>empty(), Seq$.MODULE$.<NamedExpression>empty(), logicalRelation);
    Map<String, Object> aggregateActualNode = objectMapper.readValue(logicalPlanSerializer.serialize(aggregate), mapTypeReference);
    Map<String, Object> logicalRelationActualNode = objectMapper.readValue(logicalPlanSerializer.serialize(logicalRelation), mapTypeReference);
    Path expectedAggregateNodePath = Paths.get("src", "test", "resources", "test_data", "serde", "aggregate-node.json");
    Path logicalRelationNodePath = Paths.get("src", "test", "resources", "test_data", "serde", "logicalrelation-node.json");
    Map<String, Object> expectedAggregateNode = objectMapper.readValue(expectedAggregateNodePath.toFile(), mapTypeReference);
    Map<String, Object> expectedLogicalRelationNode = objectMapper.readValue(logicalRelationNodePath.toFile(), mapTypeReference);
    assertThat(aggregateActualNode).satisfies(new MatchesMapRecursively(expectedAggregateNode));
    assertThat(logicalRelationActualNode).satisfies(new MatchesMapRecursively(expectedLogicalRelationNode));
}
Also used: Path (java.nio.file.Path), SparkSession (org.apache.spark.sql.SparkSession), StructType (org.apache.spark.sql.types.StructType), AttributeReference (org.apache.spark.sql.catalyst.expressions.AttributeReference), JDBCRelation (org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation), LogicalRelation (org.apache.spark.sql.execution.datasources.LogicalRelation), StructField (org.apache.spark.sql.types.StructField), JDBCOptions (org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions), Aggregate (org.apache.spark.sql.catalyst.plans.logical.Aggregate), Map (java.util.Map), ImmutableMap (com.google.cloud.spark.bigquery.repackaged.com.google.common.collect.ImmutableMap), HashMap (scala.collection.immutable.HashMap), Test (org.junit.jupiter.api.Test)

Example 9 with LogicalRelation

Use of org.apache.spark.sql.execution.datasources.LogicalRelation in project OpenLineage by OpenLineage.

From class SqlDWDatabricksVisitor, method apply.

@Override
public List<D> apply(LogicalPlan x) {
    BaseRelation relation = ((LogicalRelation) x).relation();
    List<D> output;
    Optional<String> name = getName(relation);
    Optional<String> namespace = getNameSpace(relation);
    if (name.isPresent() && namespace.isPresent()) {
        output = Collections.singletonList(factory.getDataset(name.get(), namespace.get(), relation.schema()));
    } else {
        output = Collections.emptyList();
    }
    return output;
}
Also used: LogicalRelation (org.apache.spark.sql.execution.datasources.LogicalRelation), BaseRelation (org.apache.spark.sql.sources.BaseRelation)
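These visitors behave like partial functions over the logical plan, so callers check isDefinedAt before calling apply, as the test in Example 6 does with its command visitor. A minimal, hypothetical caller sketch (the visitor interface below is a simplified stand-in, not the actual OpenLineage type):

import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan;

class VisitorCallerSketch {

    // Simplified stand-in for the visitor contract used in these examples.
    interface PlanVisitor<D> {
        boolean isDefinedAt(LogicalPlan plan);
        List<D> apply(LogicalPlan plan);
    }

    // Only apply the visitor to nodes it is defined at; otherwise report no datasets.
    static <D> List<D> collectDatasets(PlanVisitor<D> visitor, LogicalPlan plan) {
        return visitor.isDefinedAt(plan) ? visitor.apply(plan) : Collections.emptyList();
    }
}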

Example 10 with LogicalRelation

Use of org.apache.spark.sql.execution.datasources.LogicalRelation in project OpenLineage by OpenLineage.

From class LogicalPlanSerializerTest, method testSerializeBigQueryPlan.

@Test
public void testSerializeBigQueryPlan() throws IOException {
    String query = "SELECT date FROM bigquery-public-data.google_analytics_sample.test";
    System.setProperty("GOOGLE_CLOUD_PROJECT", "test_serialization");
    SparkBigQueryConfig config = SparkBigQueryConfig.from(
        ImmutableMap.of(
            "query", query,
            "dataset", "test-dataset",
            "maxparallelism", "2",
            "partitionexpirationms", "2"),
        ImmutableMap.of(),
        new Configuration(),
        10,
        SQLConf.get(),
        "",
        Optional.empty());
    // A BigQuery relation wrapped in a LogicalRelation with a single "name" attribute.
    BigQueryRelation bigQueryRelation = new BigQueryRelation(
        config,
        TableInfo.newBuilder(TableId.of("dataset", "test"), new TestTableDefinition()).build(),
        mock(SQLContext.class));
    LogicalRelation logicalRelation = new LogicalRelation(
        bigQueryRelation,
        Seq$.MODULE$.<AttributeReference>newBuilder()
            .$plus$eq(new AttributeReference("name", StringType$.MODULE$, false, Metadata.empty(), ExprId.apply(1L), Seq$.MODULE$.<String>empty()))
            .result(),
        Option.empty(),
        false);
    InsertIntoDataSourceCommand command = new InsertIntoDataSourceCommand(logicalRelation, logicalRelation, false);
    Map<String, Object> commandActualNode = objectMapper.readValue(logicalPlanSerializer.serialize(command), mapTypeReference);
    Map<String, Object> bigqueryActualNode = objectMapper.readValue(logicalPlanSerializer.serialize(logicalRelation), mapTypeReference);
    Path expectedCommandNodePath = Paths.get("src", "test", "resources", "test_data", "serde", "insertintods-node.json");
    Path expectedBigQueryRelationNodePath = Paths.get("src", "test", "resources", "test_data", "serde", "bigqueryrelation-node.json");
    Map<String, Object> expectedCommandNode = objectMapper.readValue(expectedCommandNodePath.toFile(), mapTypeReference);
    Map<String, Object> expectedBigQueryRelationNode = objectMapper.readValue(expectedBigQueryRelationNodePath.toFile(), mapTypeReference);
    assertThat(commandActualNode).satisfies(new MatchesMapRecursively(expectedCommandNode, Collections.singleton("exprId")));
    assertThat(bigqueryActualNode).satisfies(new MatchesMapRecursively(expectedBigQueryRelationNode, Collections.singleton("exprId")));
}
Also used: Path (java.nio.file.Path), SparkBigQueryConfig (com.google.cloud.spark.bigquery.SparkBigQueryConfig), Configuration (org.apache.hadoop.conf.Configuration), AttributeReference (org.apache.spark.sql.catalyst.expressions.AttributeReference), InsertIntoDataSourceCommand (org.apache.spark.sql.execution.datasources.InsertIntoDataSourceCommand), LogicalRelation (org.apache.spark.sql.execution.datasources.LogicalRelation), BigQueryRelation (com.google.cloud.spark.bigquery.BigQueryRelation), SQLContext (org.apache.spark.sql.SQLContext), Test (org.junit.jupiter.api.Test)

Aggregations

LogicalRelation (org.apache.spark.sql.execution.datasources.LogicalRelation): 14
Test (org.junit.jupiter.api.Test): 10
OpenLineage (io.openlineage.client.OpenLineage): 9
AttributeReference (org.apache.spark.sql.catalyst.expressions.AttributeReference): 9
StructField (org.apache.spark.sql.types.StructField): 5
JDBCOptions (org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions): 4
JDBCRelation (org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation): 4
StructType (org.apache.spark.sql.types.StructType): 4
OutputDataset (io.openlineage.client.OpenLineage.OutputDataset): 3
Path (java.nio.file.Path): 3
Configuration (org.apache.hadoop.conf.Configuration): 3
SQLContext (org.apache.spark.sql.SQLContext): 3
HadoopFsRelation (org.apache.spark.sql.execution.datasources.HadoopFsRelation): 3
BigQueryRelation (com.google.cloud.spark.bigquery.BigQueryRelation): 2
OpenLineageContext (io.openlineage.spark.api.OpenLineageContext): 2
Path (org.apache.hadoop.fs.Path): 2
SparkContext (org.apache.spark.SparkContext): 2
SparkSession (org.apache.spark.sql.SparkSession): 2
FileIndex (org.apache.spark.sql.execution.datasources.FileIndex): 2
SaveIntoDataSourceCommand (org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand): 2
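Across these examples the LogicalRelation nodes are all built the same way: the underlying BaseRelation (JDBC, Hadoop FS, or BigQuery), the relation's output attributes, an optional catalog table, and an isStreaming flag. A minimal sketch of that shared construction pattern, with a single placeholder "name" column (in practice the attribute list mirrors the wrapped relation's schema):

import org.apache.spark.sql.catalyst.expressions.AttributeReference;
import org.apache.spark.sql.catalyst.expressions.ExprId;
import org.apache.spark.sql.execution.datasources.LogicalRelation;
import org.apache.spark.sql.sources.BaseRelation;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StringType$;

import scala.Option;
import scala.collection.Seq$;

class LogicalRelationSketch {

    // Wrap any BaseRelation together with its output attributes, no catalog table,
    // and isStreaming = false, matching the pattern used throughout the tests above.
    static LogicalRelation wrap(BaseRelation relation) {
        return new LogicalRelation(
            relation,
            Seq$.MODULE$.<AttributeReference>newBuilder()
                .$plus$eq(new AttributeReference(
                    "name", StringType$.MODULE$, false, Metadata.empty(),
                    ExprId.apply(1L), Seq$.MODULE$.<String>empty()))
                .result(),
            Option.empty(),
            false);
    }
}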