Use of io.openlineage.spark.agent.lifecycle.ExecutionContext in project OpenLineage by OpenLineage.
From the class OpenLineageSparkListener, method sparkSQLExecStart.
/**
 * Called by the SparkListener when a Spark SQL (Dataset API) execution starts.
 */
private static void sparkSQLExecStart(SparkListenerSQLExecutionStart startEvent) {
  ExecutionContext context = getSparkSQLExecutionContext(startEvent.executionId());
  context.start(startEvent);
}
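This handler is not invoked by Spark directly; SQL execution events arrive through SparkListener.onOtherEvent, which the listener has to dispatch itself. A minimal sketch of that dispatch, assuming a companion sparkSQLExecEnd handler exists alongside sparkSQLExecStart:

@Override
public void onOtherEvent(SparkListenerEvent event) {
  // Spark wraps SQL execution lifecycle notifications in generic "other" events,
  // so they are routed to the dedicated handlers here.
  if (event instanceof SparkListenerSQLExecutionStart) {
    sparkSQLExecStart((SparkListenerSQLExecutionStart) event);
  } else if (event instanceof SparkListenerSQLExecutionEnd) {
    sparkSQLExecEnd((SparkListenerSQLExecutionEnd) event); // assumed companion handler
  }
}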
Use of io.openlineage.spark.agent.lifecycle.ExecutionContext in project OpenLineage by OpenLineage.
From the class OpenLineageSparkListener, method onJobEnd.
/**
 * Called by the SparkListener when a job ends.
 */
@Override
public void onJobEnd(SparkListenerJobEnd jobEnd) {
  ExecutionContext context = rddExecutionRegistry.remove(jobEnd.jobId());
  if (context != null) {
    context.end(jobEnd);
  }
  jobMetrics.cleanUp(jobEnd.jobId());
}
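The job-lifecycle methods rely on registry fields that these snippets do not declare. A plausible shape for them, as a sketch only: static maps keyed by job id and SQL execution id; the concrete map types and the JobMetricsHolder accessor are assumptions, not copied from the project.

// Assumed declarations; the project may use different map implementations or scoping.
private static final Map<Integer, ExecutionContext> rddExecutionRegistry =
    Collections.synchronizedMap(new HashMap<>());
private static final Map<Long, ExecutionContext> sparkSqlExecutionRegistry =
    Collections.synchronizedMap(new HashMap<>());
private static final JobMetricsHolder jobMetrics = JobMetricsHolder.getInstance();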
Use of io.openlineage.spark.agent.lifecycle.ExecutionContext in project OpenLineage by OpenLineage.
From the class OpenLineageSparkListener, method getExecutionContext.
public static ExecutionContext getExecutionContext(int jobId, long executionId) {
  ExecutionContext executionContext = getSparkSQLExecutionContext(executionId);
  rddExecutionRegistry.put(jobId, executionContext);
  return executionContext;
}
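The getSparkSQLExecutionContext call used here (and in sparkSQLExecStart above) is not listed. A minimal sketch of the lookup-or-create behavior it needs, assuming a contextFactory field and a single-argument factory method, both of which are assumptions for illustration:

public static ExecutionContext getSparkSQLExecutionContext(long executionId) {
  // Reuse the context already registered for this SQL execution id, or create one lazily.
  return sparkSqlExecutionRegistry.computeIfAbsent(
      executionId, id -> contextFactory.createSparkSQLExecutionContext(id));
}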
Use of io.openlineage.spark.agent.lifecycle.ExecutionContext in project OpenLineage by OpenLineage.
From the class OpenLineageSparkListenerTest, method testSqlEventWithJobEventEmitsOnce.
@Test
public void testSqlEventWithJobEventEmitsOnce() {
  SparkSession sparkSession = mock(SparkSession.class);
  SparkContext sparkContext = mock(SparkContext.class);
  EventEmitter emitter = mock(EventEmitter.class);
  QueryExecution qe = mock(QueryExecution.class);
  LogicalPlan query = UnresolvedRelation$.MODULE$.apply(TableIdentifier.apply("tableName"));
  SparkPlan plan = mock(SparkPlan.class);

  when(sparkSession.sparkContext()).thenReturn(sparkContext);
  when(sparkContext.appName()).thenReturn("appName");
  when(qe.optimizedPlan())
      .thenReturn(new InsertIntoHadoopFsRelationCommand(
          new Path("file:///tmp/dir"), null, false, Seq$.MODULE$.empty(), Option.empty(), null,
          Map$.MODULE$.empty(), query, SaveMode.Overwrite, Option.empty(), Option.empty(),
          Seq$.MODULE$.<String>empty()));
  when(qe.executedPlan()).thenReturn(plan);
  when(plan.sparkContext()).thenReturn(sparkContext);
  when(plan.nodeName()).thenReturn("execute");

  OpenLineageContext olContext = OpenLineageContext.builder()
      .sparkSession(Optional.of(sparkSession))
      .sparkContext(sparkSession.sparkContext())
      .openLineage(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI))
      .queryExecution(qe)
      .build();
  olContext.getOutputDatasetQueryPlanVisitors().add(new InsertIntoHadoopFsRelationVisitor(olContext));

  ExecutionContext executionContext = new StaticExecutionContextFactory(emitter)
      .createSparkSQLExecutionContext(1L, emitter, qe, olContext);

  executionContext.start(new SparkListenerSQLExecutionStart(
      1L, "", "", "",
      new SparkPlanInfo("name", "string", Seq$.MODULE$.empty(), Map$.MODULE$.empty(), Seq$.MODULE$.empty()),
      1L));
  executionContext.start(new SparkListenerJobStart(0, 2L, Seq$.MODULE$.<StageInfo>empty(), new Properties()));

  ArgumentCaptor<OpenLineage.RunEvent> lineageEvent = ArgumentCaptor.forClass(OpenLineage.RunEvent.class);
  verify(emitter, times(2)).emit(lineageEvent.capture());
}
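The test only verifies how many times emit was called; the captured payloads remain available for further checks. A small optional extension using plain Mockito and JUnit (not part of the original test):

// Optional follow-up assertion on the captured events; getAllValues() is standard ArgumentCaptor API.
List<OpenLineage.RunEvent> emitted = lineageEvent.getAllValues();
assertEquals(2, emitted.size());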
Use of io.openlineage.spark.agent.lifecycle.ExecutionContext in project OpenLineage by OpenLineage.
From the class OpenLineageSparkListener, method onJobStart.
/**
 * Called by the SparkListener when a job starts.
 */
@Override
public void onJobStart(SparkListenerJobStart jobStart) {
  Optional<ActiveJob> activeJob =
      asJavaOptional(
              SparkSession.getDefaultSession().map(sparkContextFromSession).orElse(activeSparkContext))
          .flatMap(ctx -> Optional.ofNullable(ctx.dagScheduler())
              .map(ds -> ds.jobIdToActiveJob().get(jobStart.jobId())))
          .flatMap(ScalaConversionUtils::asJavaOptional);
  Set<Integer> stages =
      ScalaConversionUtils.fromSeq(jobStart.stageIds()).stream()
          .map(Integer.class::cast)
          .collect(Collectors.toSet());
  jobMetrics.addJobStages(jobStart.jobId(), stages);
  ExecutionContext context =
      Optional.ofNullable(getSqlExecutionId(jobStart.properties()))
          .map(Optional::of)
          .orElseGet(() ->
              asJavaOptional(
                      SparkSession.getDefaultSession().map(sparkContextFromSession).orElse(activeSparkContext))
                  .flatMap(ctx -> Optional.ofNullable(ctx.dagScheduler())
                      .map(ds -> ds.jobIdToActiveJob().get(jobStart.jobId()))
                      .flatMap(ScalaConversionUtils::asJavaOptional))
                  .map(job -> getSqlExecutionId(job.properties())))
          .map(id -> {
            long executionId = Long.parseLong(id);
            return getExecutionContext(jobStart.jobId(), executionId);
          })
          .orElseGet(() -> getExecutionContext(jobStart.jobId()));
  // set it in the rddExecutionRegistry so jobEnd is called
  rddExecutionRegistry.put(jobStart.jobId(), context);
  activeJob.ifPresent(context::setActiveJob);
  context.start(jobStart);
}
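The getSqlExecutionId helper referenced above is not shown in these snippets. Given how it is used (returning null when a job carries no SQL execution), a minimal sketch, assuming it simply reads Spark's standard execution-id property from the job properties:

// Sketch only: "spark.sql.execution.id" is the property Spark sets for jobs spawned by a SQL execution;
// returning null signals an RDD-only job with no SQL execution attached.
private static String getSqlExecutionId(Properties properties) {
  return properties.getProperty("spark.sql.execution.id");
}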