Example 1 with InsertIntoHadoopFsRelationCommand

Use of org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand in the project OpenLineage by OpenLineage.

From the class InsertIntoHadoopFsRelationVisitor, the apply method:

@Override
public List<OpenLineage.OutputDataset> apply(LogicalPlan x) {
    InsertIntoHadoopFsRelationCommand command = (InsertIntoHadoopFsRelationCommand) x;
    // Derive the dataset identifier from the command's output path.
    DatasetIdentifier di = PathUtils.fromURI(command.outputPath().toUri(), "file");
    OpenLineage.OutputDataset outputDataset;
    if (SaveMode.Overwrite == command.mode()) {
        // Overwrite writes additionally carry an OVERWRITE lifecycle-state-change facet.
        outputDataset = outputDataset().getDataset(
            di,
            command.query().schema(),
            OpenLineage.LifecycleStateChangeDatasetFacet.LifecycleStateChange.OVERWRITE);
    } else {
        outputDataset = outputDataset().getDataset(di, command.query().schema());
    }
    return Collections.singletonList(outputDataset);
}
Also used: DatasetIdentifier (io.openlineage.spark.agent.util.DatasetIdentifier), OpenLineage (io.openlineage.client.OpenLineage), InsertIntoHadoopFsRelationCommand (org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand)
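
The visitor above is a partial function over logical plans, so it can also be driven directly. A minimal sketch, not OpenLineage code: it assumes an OpenLineageContext named olContext and a LogicalPlan named plan whose root is an InsertIntoHadoopFsRelationCommand (as constructed in Example 2 below), and that the visitor exposes the standard PartialFunction isDefinedAt check.

// Hedged sketch: olContext and plan are assumed to exist.
InsertIntoHadoopFsRelationVisitor visitor = new InsertIntoHadoopFsRelationVisitor(olContext);
if (visitor.isDefinedAt(plan)) {
    // With SaveMode.Overwrite, the single dataset returned carries the
    // OVERWRITE lifecycle-state-change facet built in the branch above.
    List<OpenLineage.OutputDataset> outputs = visitor.apply(plan);
}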

Example 2 with InsertIntoHadoopFsRelationCommand

Use of org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand in the project OpenLineage by OpenLineage.

From the class OpenLineageSparkListenerTest, the method testSqlEventWithJobEventEmitsOnce:

@Test
public void testSqlEventWithJobEventEmitsOnce() {
    SparkSession sparkSession = mock(SparkSession.class);
    SparkContext sparkContext = mock(SparkContext.class);
    EventEmitter emitter = mock(EventEmitter.class);
    QueryExecution qe = mock(QueryExecution.class);
    LogicalPlan query = UnresolvedRelation$.MODULE$.apply(TableIdentifier.apply("tableName"));
    SparkPlan plan = mock(SparkPlan.class);
    when(sparkSession.sparkContext()).thenReturn(sparkContext);
    when(sparkContext.appName()).thenReturn("appName");
    // Stub the optimized plan with an overwrite into file:///tmp/dir.
    when(qe.optimizedPlan()).thenReturn(new InsertIntoHadoopFsRelationCommand(
        new Path("file:///tmp/dir"), null, false, Seq$.MODULE$.empty(), Option.empty(),
        null, Map$.MODULE$.empty(), query, SaveMode.Overwrite,
        Option.empty(), Option.empty(), Seq$.MODULE$.<String>empty()));
    when(qe.executedPlan()).thenReturn(plan);
    when(plan.sparkContext()).thenReturn(sparkContext);
    when(plan.nodeName()).thenReturn("execute");
    OpenLineageContext olContext = OpenLineageContext.builder()
        .sparkSession(Optional.of(sparkSession))
        .sparkContext(sparkSession.sparkContext())
        .openLineage(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI))
        .queryExecution(qe)
        .build();
    olContext.getOutputDatasetQueryPlanVisitors().add(new InsertIntoHadoopFsRelationVisitor(olContext));
    ExecutionContext executionContext =
        new StaticExecutionContextFactory(emitter).createSparkSQLExecutionContext(1L, emitter, qe, olContext);
    // Fire both the SQL-execution start and the job start for the same execution.
    executionContext.start(new SparkListenerSQLExecutionStart(
        1L, "", "", "",
        new SparkPlanInfo("name", "string", Seq$.MODULE$.empty(), Map$.MODULE$.empty(), Seq$.MODULE$.empty()),
        1L));
    executionContext.start(new SparkListenerJobStart(0, 2L, Seq$.MODULE$.<StageInfo>empty(), new Properties()));
    // Two start signals, so the emitter should have been invoked exactly twice.
    ArgumentCaptor<OpenLineage.RunEvent> lineageEvent = ArgumentCaptor.forClass(OpenLineage.RunEvent.class);
    verify(emitter, times(2)).emit(lineageEvent.capture());
}
Also used: Path (org.apache.hadoop.fs.Path), SparkSession (org.apache.spark.sql.SparkSession), SparkPlan (org.apache.spark.sql.execution.SparkPlan), SparkListenerJobStart (org.apache.spark.scheduler.SparkListenerJobStart), StageInfo (org.apache.spark.scheduler.StageInfo), StaticExecutionContextFactory (io.openlineage.spark.agent.lifecycle.StaticExecutionContextFactory), InsertIntoHadoopFsRelationCommand (org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand), Properties (java.util.Properties), QueryExecution (org.apache.spark.sql.execution.QueryExecution), SparkListenerSQLExecutionStart (org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart), SparkContext (org.apache.spark.SparkContext), ExecutionContext (io.openlineage.spark.agent.lifecycle.ExecutionContext), SparkPlanInfo (org.apache.spark.sql.execution.SparkPlanInfo), OpenLineage (io.openlineage.client.OpenLineage), LogicalPlan (org.apache.spark.sql.catalyst.plans.logical.LogicalPlan), OpenLineageContext (io.openlineage.spark.api.OpenLineageContext), InsertIntoHadoopFsRelationVisitor (io.openlineage.spark.agent.lifecycle.plan.InsertIntoHadoopFsRelationVisitor), Test (org.junit.jupiter.api.Test)
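
Outside of a mocked test, the same listener path fires whenever a DataFrame is written to a Hadoop-compatible location. A minimal sketch: the local master, app name, and output path are illustrative, and the spark.openlineage.* transport settings the listener needs to deliver events are omitted.

// Hedged sketch: register the OpenLineage listener, then perform a write
// that Catalyst plans as an InsertIntoHadoopFsRelationCommand.
SparkSession spark = SparkSession.builder()
    .master("local[*]")
    .appName("lineage-demo")
    .config("spark.extraListeners", "io.openlineage.spark.agent.OpenLineageSparkListener")
    .getOrCreate();
spark.range(10).write().mode(SaveMode.Overwrite).parquet("/tmp/lineage_demo");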

Example 3 with InsertIntoHadoopFsRelationCommand

Use of org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand in the project OpenLineage by OpenLineage.

From the class LogicalPlanSerializerTest, the method testSerializeInsertIntoHadoopPlan:

@Test
public void testSerializeInsertIntoHadoopPlan()
        throws IOException, InvocationTargetException, IllegalAccessException {
    SparkSession session = SparkSession.builder().master("local").getOrCreate();
    // Relation over a catalog-backed file index; partition and data schemas are both a single "name" column.
    HadoopFsRelation hadoopFsRelation = new HadoopFsRelation(
        new CatalogFileIndex(session, CatalogTableTestUtils.getCatalogTable(new TableIdentifier("test", Option.apply("db"))), 100L),
        new StructType(new StructField[] { new StructField("name", StringType$.MODULE$, false, Metadata.empty()) }),
        new StructType(new StructField[] { new StructField("name", StringType$.MODULE$, false, Metadata.empty()) }),
        Option.empty(), new TextFileFormat(), new HashMap<>(), session);
    LogicalRelation logicalRelation = new LogicalRelation(
        hadoopFsRelation,
        Seq$.MODULE$.<AttributeReference>newBuilder().$plus$eq(new AttributeReference("name", StringType$.MODULE$, false, Metadata.empty(), ExprId.apply(1L), Seq$.MODULE$.<String>empty())).result(),
        Option.empty(), false);
    InsertIntoHadoopFsRelationCommand command = new InsertIntoHadoopFsRelationCommand(
        new org.apache.hadoop.fs.Path("/tmp"), new HashMap<>(), false,
        Seq$.MODULE$.<Attribute>newBuilder().$plus$eq(new AttributeReference("name", StringType$.MODULE$, false, Metadata.empty(), ExprId.apply(1L), Seq$.MODULE$.<String>empty())).result(),
        Option.empty(), new TextFileFormat(), new HashMap<>(), logicalRelation, SaveMode.Overwrite,
        Option.empty(), Option.empty(), Seq$.MODULE$.<String>newBuilder().$plus$eq("name").result());
    Map<String, Object> commandActualNode = objectMapper.readValue(logicalPlanSerializer.serialize(command), mapTypeReference);
    Map<String, Object> hadoopFSActualNode = objectMapper.readValue(logicalPlanSerializer.serialize(logicalRelation), mapTypeReference);
    Path expectedCommandNodePath = Paths.get("src", "test", "resources", "test_data", "serde", "insertintofs-node.json");
    Path expectedHadoopFSNodePath = Paths.get("src", "test", "resources", "test_data", "serde", "hadoopfsrelation-node.json");
    Map<String, Object> expectedCommandNode = objectMapper.readValue(expectedCommandNodePath.toFile(), mapTypeReference);
    Map<String, Object> expectedHadoopFSNode = objectMapper.readValue(expectedHadoopFSNodePath.toFile(), mapTypeReference);
    // Compare against stored JSON fixtures, ignoring the non-deterministic exprId field.
    assertThat(commandActualNode).satisfies(new MatchesMapRecursively(expectedCommandNode, Collections.singleton("exprId")));
    assertThat(hadoopFSActualNode).satisfies(new MatchesMapRecursively(expectedHadoopFSNode, Collections.singleton("exprId")));
}
Also used: TableIdentifier (org.apache.spark.sql.catalyst.TableIdentifier), Path (java.nio.file.Path), SparkSession (org.apache.spark.sql.SparkSession), StructType (org.apache.spark.sql.types.StructType), AttributeReference (org.apache.spark.sql.catalyst.expressions.AttributeReference), InsertIntoHadoopFsRelationCommand (org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand), LogicalRelation (org.apache.spark.sql.execution.datasources.LogicalRelation), StructField (org.apache.spark.sql.types.StructField), HadoopFsRelation (org.apache.spark.sql.execution.datasources.HadoopFsRelation), CatalogFileIndex (org.apache.spark.sql.execution.datasources.CatalogFileIndex), TextFileFormat (org.apache.spark.sql.execution.datasources.text.TextFileFormat), Test (org.junit.jupiter.api.Test)
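
The fixture comparison above tolerates the non-deterministic exprId field. When a fixture drifts, the serializer output can also simply be inspected; a minimal sketch that reuses command, logicalPlanSerializer, objectMapper, and mapTypeReference from the test above:

// Hedged sketch: pretty-print the serialized plan instead of asserting on it.
String json = logicalPlanSerializer.serialize(command);
Map<String, Object> node = objectMapper.readValue(json, mapTypeReference);
System.out.println(objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(node));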

Aggregations

InsertIntoHadoopFsRelationCommand (org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand): 3
OpenLineage (io.openlineage.client.OpenLineage): 2
SparkSession (org.apache.spark.sql.SparkSession): 2
Test (org.junit.jupiter.api.Test): 2
ExecutionContext (io.openlineage.spark.agent.lifecycle.ExecutionContext): 1
StaticExecutionContextFactory (io.openlineage.spark.agent.lifecycle.StaticExecutionContextFactory): 1
InsertIntoHadoopFsRelationVisitor (io.openlineage.spark.agent.lifecycle.plan.InsertIntoHadoopFsRelationVisitor): 1
DatasetIdentifier (io.openlineage.spark.agent.util.DatasetIdentifier): 1
OpenLineageContext (io.openlineage.spark.api.OpenLineageContext): 1
Path (java.nio.file.Path): 1
Properties (java.util.Properties): 1
Path (org.apache.hadoop.fs.Path): 1
SparkContext (org.apache.spark.SparkContext): 1
SparkListenerJobStart (org.apache.spark.scheduler.SparkListenerJobStart): 1
StageInfo (org.apache.spark.scheduler.StageInfo): 1
TableIdentifier (org.apache.spark.sql.catalyst.TableIdentifier): 1
AttributeReference (org.apache.spark.sql.catalyst.expressions.AttributeReference): 1
LogicalPlan (org.apache.spark.sql.catalyst.plans.logical.LogicalPlan): 1
QueryExecution (org.apache.spark.sql.execution.QueryExecution): 1
SparkPlan (org.apache.spark.sql.execution.SparkPlan): 1