
Example 1 with ExecutionContext

Use of io.openlineage.spark.agent.lifecycle.ExecutionContext in the OpenLineage project.

From the class OpenLineageSparkListener, method sparkSQLExecStart:

/**
 * Called by the SparkListener when a Spark SQL (Dataset API) execution starts.
 */
private static void sparkSQLExecStart(SparkListenerSQLExecutionStart startEvent) {
    ExecutionContext context = getSparkSQLExecutionContext(startEvent.executionId());
    context.start(startEvent);
}
Also used: ExecutionContext (io.openlineage.spark.agent.lifecycle.ExecutionContext)
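The getSparkSQLExecutionContext helper called above is not shown on this page. As a rough sketch of how such a lookup could be backed, assuming a synchronized map registry and a ContextFactory field named contextFactory (both are assumptions, not confirmed by these snippets):

// Hypothetical sketch: a map-backed registry that reuses one ExecutionContext
// per SQL execution id; field and factory names are assumptions.
private static final Map<Long, ExecutionContext> sparkSqlExecutionRegistry =
    Collections.synchronizedMap(new HashMap<>());

private static ExecutionContext getSparkSQLExecutionContext(long executionId) {
    // computeIfAbsent guarantees a single context per execution id, even when
    // start and end events arrive on different listener threads
    return sparkSqlExecutionRegistry.computeIfAbsent(
        executionId, id -> contextFactory.createSparkSQLExecutionContext(id));
}

Keying by execution id ensures the same ExecutionContext handles both the start and end events of one SQL execution.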

Example 2 with ExecutionContext

Use of io.openlineage.spark.agent.lifecycle.ExecutionContext in the OpenLineage project.

From the class OpenLineageSparkListener, method onJobEnd:

/**
 * Called by the SparkListener when a job ends.
 */
@Override
public void onJobEnd(SparkListenerJobEnd jobEnd) {
    ExecutionContext context = rddExecutionRegistry.remove(jobEnd.jobId());
    if (context != null) {
        context.end(jobEnd);
    }
    jobMetrics.cleanUp(jobEnd.jobId());
}
Also used: ExecutionContext (io.openlineage.spark.agent.lifecycle.ExecutionContext)
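onJobEnd drains rddExecutionRegistry with remove(), so a plausible declaration for that field, shown purely as an illustration (the initialization is an assumption), would be:

// Assumed declaration: jobId -> ExecutionContext, populated in onJobStart /
// getExecutionContext and drained in onJobEnd via remove()
private static final Map<Integer, ExecutionContext> rddExecutionRegistry =
    Collections.synchronizedMap(new HashMap<>());

Using remove() rather than get() fetches and evicts the entry in one step, so finished jobs do not leak registry entries.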

Example 3 with ExecutionContext

Use of io.openlineage.spark.agent.lifecycle.ExecutionContext in the OpenLineage project.

From the class OpenLineageSparkListener, method getExecutionContext:

public static ExecutionContext getExecutionContext(int jobId, long executionId) {
    ExecutionContext executionContext = getSparkSQLExecutionContext(executionId);
    rddExecutionRegistry.put(jobId, executionContext);
    return executionContext;
}
Also used: ExecutionContext (io.openlineage.spark.agent.lifecycle.ExecutionContext)
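Example 5 below also calls a single-argument getExecutionContext(jobStart.jobId()) for jobs that carry no SQL execution id. A hedged sketch of what that overload might look like, assuming the factory exposes a createRddExecutionContext method (an assumption, not shown on this page):

// Hypothetical overload for plain RDD jobs that have no SQL execution id;
// the factory method name is an assumption
public static ExecutionContext getExecutionContext(int jobId) {
    ExecutionContext executionContext = contextFactory.createRddExecutionContext(jobId);
    rddExecutionRegistry.put(jobId, executionContext);
    return executionContext;
}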

Example 4 with ExecutionContext

Use of io.openlineage.spark.agent.lifecycle.ExecutionContext in the OpenLineage project.

From the class OpenLineageSparkListenerTest, method testSqlEventWithJobEventEmitsOnce:

@Test
public void testSqlEventWithJobEventEmitsOnce() {
    SparkSession sparkSession = mock(SparkSession.class);
    SparkContext sparkContext = mock(SparkContext.class);
    EventEmitter emitter = mock(EventEmitter.class);
    QueryExecution qe = mock(QueryExecution.class);
    LogicalPlan query = UnresolvedRelation$.MODULE$.apply(TableIdentifier.apply("tableName"));
    SparkPlan plan = mock(SparkPlan.class);
    when(sparkSession.sparkContext()).thenReturn(sparkContext);
    when(sparkContext.appName()).thenReturn("appName");
    // Stub the optimized plan with a write command so the visitor registered
    // below produces an output dataset for the emitted events
    when(qe.optimizedPlan())
        .thenReturn(new InsertIntoHadoopFsRelationCommand(
            new Path("file:///tmp/dir"), null, false,
            Seq$.MODULE$.empty(), Option.empty(), null, Map$.MODULE$.empty(),
            query, SaveMode.Overwrite, Option.empty(), Option.empty(),
            Seq$.MODULE$.<String>empty()));
    when(qe.executedPlan()).thenReturn(plan);
    when(plan.sparkContext()).thenReturn(sparkContext);
    when(plan.nodeName()).thenReturn("execute");
    OpenLineageContext olContext = OpenLineageContext.builder()
        .sparkSession(Optional.of(sparkSession))
        .sparkContext(sparkSession.sparkContext())
        .openLineage(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI))
        .queryExecution(qe)
        .build();
    olContext.getOutputDatasetQueryPlanVisitors().add(new InsertIntoHadoopFsRelationVisitor(olContext));
    ExecutionContext executionContext =
        new StaticExecutionContextFactory(emitter)
            .createSparkSQLExecutionContext(1L, emitter, qe, olContext);
    // A SQL execution start followed by a job start for the same execution
    // should emit one event per start callback, with no duplicate emission
    executionContext.start(new SparkListenerSQLExecutionStart(
        1L, "", "", "",
        new SparkPlanInfo("name", "string", Seq$.MODULE$.empty(), Map$.MODULE$.empty(), Seq$.MODULE$.empty()),
        1L));
    executionContext.start(new SparkListenerJobStart(0, 2L, Seq$.MODULE$.<StageInfo>empty(), new Properties()));
    ArgumentCaptor<OpenLineage.RunEvent> lineageEvent = ArgumentCaptor.forClass(OpenLineage.RunEvent.class);
    verify(emitter, times(2)).emit(lineageEvent.capture());
}
Also used: Path (org.apache.hadoop.fs.Path), SparkSession (org.apache.spark.sql.SparkSession), SparkPlan (org.apache.spark.sql.execution.SparkPlan), SparkListenerJobStart (org.apache.spark.scheduler.SparkListenerJobStart), StageInfo (org.apache.spark.scheduler.StageInfo), StaticExecutionContextFactory (io.openlineage.spark.agent.lifecycle.StaticExecutionContextFactory), InsertIntoHadoopFsRelationCommand (org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand), Properties (java.util.Properties), QueryExecution (org.apache.spark.sql.execution.QueryExecution), SparkListenerSQLExecutionStart (org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart), SparkContext (org.apache.spark.SparkContext), ExecutionContext (io.openlineage.spark.agent.lifecycle.ExecutionContext), SparkPlanInfo (org.apache.spark.sql.execution.SparkPlanInfo), OpenLineage (io.openlineage.client.OpenLineage), LogicalPlan (org.apache.spark.sql.catalyst.plans.logical.LogicalPlan), OpenLineageContext (io.openlineage.spark.api.OpenLineageContext), InsertIntoHadoopFsRelationVisitor (io.openlineage.spark.agent.lifecycle.plan.InsertIntoHadoopFsRelationVisitor), Test (org.junit.jupiter.api.Test)
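The verify(emitter, times(2)) call only checks the emission count. If the test also needed to inspect the captured payloads, a continuation could look like the following sketch (AssertJ and the shared-run-id expectation are assumptions, not part of the original test):

// Hypothetical continuation (assumes AssertJ on the classpath:
// import static org.assertj.core.api.Assertions.assertThat; import java.util.List;)
List<OpenLineage.RunEvent> events = lineageEvent.getAllValues();
assertThat(events).hasSize(2);
// Both events were emitted by the same SQL execution context, so one would
// expect them to share a run id; treat this expectation as illustrative
assertThat(events.get(0).getRun().getRunId())
    .isEqualTo(events.get(1).getRun().getRunId());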

Example 5 with ExecutionContext

Use of io.openlineage.spark.agent.lifecycle.ExecutionContext in the OpenLineage project.

From the class OpenLineageSparkListener, method onJobStart:

/**
 * Called by the SparkListener when a job starts.
 */
@Override
public void onJobStart(SparkListenerJobStart jobStart) {
    // Resolve the ActiveJob for this job id from the DAG scheduler, if available
    Optional<ActiveJob> activeJob =
        asJavaOptional(SparkSession.getDefaultSession().map(sparkContextFromSession).orElse(activeSparkContext))
            .flatMap(ctx -> Optional.ofNullable(ctx.dagScheduler()).map(ds -> ds.jobIdToActiveJob().get(jobStart.jobId())))
            .flatMap(ScalaConversionUtils::asJavaOptional);
    Set<Integer> stages =
        ScalaConversionUtils.fromSeq(jobStart.stageIds()).stream()
            .map(Integer.class::cast)
            .collect(Collectors.toSet());
    jobMetrics.addJobStages(jobStart.jobId(), stages);
    // Prefer the SQL execution id from the job's own properties, then from the
    // active job's properties; if neither is set, treat this as a plain RDD job
    ExecutionContext context =
        Optional.ofNullable(getSqlExecutionId(jobStart.properties()))
            .map(Optional::of)
            .orElseGet(() ->
                asJavaOptional(SparkSession.getDefaultSession().map(sparkContextFromSession).orElse(activeSparkContext))
                    .flatMap(ctx -> Optional.ofNullable(ctx.dagScheduler()).map(ds -> ds.jobIdToActiveJob().get(jobStart.jobId()))
                        .flatMap(ScalaConversionUtils::asJavaOptional))
                    .map(job -> getSqlExecutionId(job.properties())))
            .map(id -> {
                long executionId = Long.parseLong(id);
                return getExecutionContext(jobStart.jobId(), executionId);
            })
            .orElseGet(() -> getExecutionContext(jobStart.jobId()));
    // Set it in the rddExecutionRegistry so onJobEnd can close it
    rddExecutionRegistry.put(jobStart.jobId(), context);
    activeJob.ifPresent(context::setActiveJob);
    context.start(jobStart);
}
Also used: OpenLineageClient (io.openlineage.spark.agent.client.OpenLineageClient), SparkListenerApplicationStart (org.apache.spark.scheduler.SparkListenerApplicationStart), DEFAULTS (io.openlineage.spark.agent.ArgumentParser.DEFAULTS), URISyntaxException (java.net.URISyntaxException), ZonedDateTime (java.time.ZonedDateTime), Function0 (scala.Function0), Function1 (scala.Function1), SparkConfUtils.findSparkConfigKey (io.openlineage.spark.agent.util.SparkConfUtils.findSparkConfigKey), HashMap (java.util.HashMap), ScalaConversionUtils.asJavaOptional (io.openlineage.spark.agent.util.ScalaConversionUtils.asJavaOptional), Map (java.util.Map), Configuration (org.apache.hadoop.conf.Configuration), SparkListenerTaskEnd (org.apache.spark.scheduler.SparkListenerTaskEnd), SparkListenerSQLExecutionStart (org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart), SparkContext$ (org.apache.spark.SparkContext$), ContextFactory (io.openlineage.spark.agent.lifecycle.ContextFactory), SparkListenerApplicationEnd (org.apache.spark.scheduler.SparkListenerApplicationEnd), SparkEnv (org.apache.spark.SparkEnv), WeakHashMap (java.util.WeakHashMap), SparkListenerJobEnd (org.apache.spark.scheduler.SparkListenerJobEnd), SparkSession (org.apache.spark.sql.SparkSession), SparkListenerSQLExecutionEnd (org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd), PrintWriter (java.io.PrintWriter), Properties (java.util.Properties), ActiveJob (org.apache.spark.scheduler.ActiveJob), SparkListenerJobStart (org.apache.spark.scheduler.SparkListenerJobStart), ByteArrayOutputStream (org.apache.commons.io.output.ByteArrayOutputStream), SparkConf (org.apache.spark.SparkConf), SparkContext (org.apache.spark.SparkContext), Set (java.util.Set), ScalaConversionUtils (io.openlineage.spark.agent.util.ScalaConversionUtils), Field (java.lang.reflect.Field), Option (scala.Option), Collectors (java.util.stream.Collectors), SparkListenerEvent (org.apache.spark.scheduler.SparkListenerEvent), Slf4j (lombok.extern.slf4j.Slf4j), SparkConfUtils.findSparkUrlParams (io.openlineage.spark.agent.util.SparkConfUtils.findSparkUrlParams), Optional (java.util.Optional), PairRDDFunctions (org.apache.spark.rdd.PairRDDFunctions), ExecutionContext (io.openlineage.spark.agent.lifecycle.ExecutionContext), PairRDDFunctionsTransformer (io.openlineage.spark.agent.transformers.PairRDDFunctionsTransformer), OpenLineage (io.openlineage.client.OpenLineage), Collections (java.util.Collections), RDD (org.apache.spark.rdd.RDD), SparkEnv$ (org.apache.spark.SparkEnv$)
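The branching in onJobStart hinges on getSqlExecutionId, which is not shown on this page. Spark tags jobs spawned by a SQL query with the spark.sql.execution.id property (SQLExecution.EXECUTION_ID_KEY), so a minimal sketch could be (the method body is an assumption; the property key is standard Spark behavior):

// Sketch: read the SQL execution id that Spark attaches to jobs belonging
// to a SQL execution; returns null for plain RDD jobs
private static String getSqlExecutionId(Properties properties) {
    return properties.getProperty("spark.sql.execution.id");
}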

Aggregations

ExecutionContext (io.openlineage.spark.agent.lifecycle.ExecutionContext): 5
OpenLineage (io.openlineage.client.OpenLineage): 2
Properties (java.util.Properties): 2
SparkContext (org.apache.spark.SparkContext): 2
SparkListenerJobStart (org.apache.spark.scheduler.SparkListenerJobStart): 2
SparkSession (org.apache.spark.sql.SparkSession): 2
SparkListenerSQLExecutionStart (org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart): 2
DEFAULTS (io.openlineage.spark.agent.ArgumentParser.DEFAULTS): 1
OpenLineageClient (io.openlineage.spark.agent.client.OpenLineageClient): 1
ContextFactory (io.openlineage.spark.agent.lifecycle.ContextFactory): 1
StaticExecutionContextFactory (io.openlineage.spark.agent.lifecycle.StaticExecutionContextFactory): 1
InsertIntoHadoopFsRelationVisitor (io.openlineage.spark.agent.lifecycle.plan.InsertIntoHadoopFsRelationVisitor): 1
PairRDDFunctionsTransformer (io.openlineage.spark.agent.transformers.PairRDDFunctionsTransformer): 1
ScalaConversionUtils (io.openlineage.spark.agent.util.ScalaConversionUtils): 1
ScalaConversionUtils.asJavaOptional (io.openlineage.spark.agent.util.ScalaConversionUtils.asJavaOptional): 1
SparkConfUtils.findSparkConfigKey (io.openlineage.spark.agent.util.SparkConfUtils.findSparkConfigKey): 1
SparkConfUtils.findSparkUrlParams (io.openlineage.spark.agent.util.SparkConfUtils.findSparkUrlParams): 1
OpenLineageContext (io.openlineage.spark.api.OpenLineageContext): 1
PrintWriter (java.io.PrintWriter): 1
Field (java.lang.reflect.Field): 1