Use of io.openlineage.spark.agent.util.ScalaConversionUtils in project OpenLineage by OpenLineage.
From the class IcebergHandler, method getDatasetIdentifier: ScalaConversionUtils.fromMap converts the Scala map returned by session.conf().getAll() into a java.util.Map so the catalog configuration can be filtered with the Java Stream API.
@Override
public DatasetIdentifier getDatasetIdentifier(
    SparkSession session,
    TableCatalog tableCatalog,
    Identifier identifier,
    Map<String, String> properties) {
  SparkCatalog sparkCatalog = (SparkCatalog) tableCatalog;
  String catalogName = sparkCatalog.name();
  String prefix = String.format("spark.sql.catalog.%s", catalogName);
  Map<String, String> conf =
      ScalaConversionUtils.<String, String>fromMap(session.conf().getAll());
  log.info(conf.toString());
  Map<String, String> catalogConf =
      conf.entrySet().stream()
          .filter(x -> x.getKey().startsWith(prefix))
          .filter(x -> x.getKey().length() > prefix.length())
          .collect(
              Collectors.toMap(
                  x -> x.getKey().substring(prefix.length() + 1), // handle dot after prefix
                  Map.Entry::getValue));
  log.info(catalogConf.toString());
  if (catalogConf.isEmpty() || !catalogConf.containsKey("type")) {
    throw new UnsupportedCatalogException(catalogName);
  }
  log.info(catalogConf.get("type"));
  switch (catalogConf.get("type")) {
    case "hadoop":
      return getHadoopIdentifier(catalogConf, identifier.toString());
    case "hive":
      return getHiveIdentifier(
          session, catalogConf.get(CatalogProperties.URI), identifier.toString());
    default:
      throw new UnsupportedCatalogException(catalogConf.get("type"));
  }
}
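For context, here is a minimal standalone sketch (not part of the OpenLineage sources) of the ScalaConversionUtils.fromMap call used above. The class name CatalogConfDemo, the appName, and the local Spark session are assumptions made only for illustration.

import io.openlineage.spark.agent.util.ScalaConversionUtils;
import java.util.Map;
import org.apache.spark.sql.SparkSession;

public class CatalogConfDemo {
  public static void main(String[] args) {
    // Assumes a local Spark runtime and the OpenLineage Spark integration on the classpath.
    SparkSession session =
        SparkSession.builder().appName("catalog-conf-demo").master("local[*]").getOrCreate();

    // session.conf().getAll() returns a scala.collection.immutable.Map;
    // fromMap bridges it to a java.util.Map so it can be filtered with the Stream API.
    Map<String, String> conf =
        ScalaConversionUtils.<String, String>fromMap(session.conf().getAll());
    conf.entrySet().stream()
        .filter(e -> e.getKey().startsWith("spark.sql.catalog."))
        .forEach(e -> System.out.println(e.getKey() + " = " + e.getValue()));

    session.stop();
  }
}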
Use of io.openlineage.spark.agent.util.ScalaConversionUtils in project OpenLineage by OpenLineage.
From the class OpenLineageSparkListener, method onJobStart: ScalaConversionUtils.fromSeq converts the Scala Seq of stage ids into a Java collection, and asJavaOptional bridges scala.Option values (such as the active job looked up in the DAG scheduler) to java.util.Optional.
/**
 * Called by the SparkListener when a job starts.
 */
@Override
public void onJobStart(SparkListenerJobStart jobStart) {
  Optional<ActiveJob> activeJob =
      asJavaOptional(
              SparkSession.getDefaultSession()
                  .map(sparkContextFromSession)
                  .orElse(activeSparkContext))
          .flatMap(
              ctx ->
                  Optional.ofNullable(ctx.dagScheduler())
                      .map(ds -> ds.jobIdToActiveJob().get(jobStart.jobId())))
          .flatMap(ScalaConversionUtils::asJavaOptional);
  Set<Integer> stages =
      ScalaConversionUtils.fromSeq(jobStart.stageIds()).stream()
          .map(Integer.class::cast)
          .collect(Collectors.toSet());
  jobMetrics.addJobStages(jobStart.jobId(), stages);
  ExecutionContext context =
      Optional.ofNullable(getSqlExecutionId(jobStart.properties()))
          .map(Optional::of)
          .orElseGet(
              () ->
                  asJavaOptional(
                          SparkSession.getDefaultSession()
                              .map(sparkContextFromSession)
                              .orElse(activeSparkContext))
                      .flatMap(
                          ctx ->
                              Optional.ofNullable(ctx.dagScheduler())
                                  .map(ds -> ds.jobIdToActiveJob().get(jobStart.jobId()))
                                  .flatMap(ScalaConversionUtils::asJavaOptional))
                      .map(job -> getSqlExecutionId(job.properties())))
          .map(
              id -> {
                long executionId = Long.parseLong(id);
                return getExecutionContext(jobStart.jobId(), executionId);
              })
          .orElseGet(() -> getExecutionContext(jobStart.jobId()));
  // set it in the rddExecutionRegistry so jobEnd is called
  rddExecutionRegistry.put(jobStart.jobId(), context);
  activeJob.ifPresent(context::setActiveJob);
  context.start(jobStart);
}
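For reference, a minimal sketch that isolates the two conversions used above. The class ScalaConversionDemo and its methods are hypothetical helpers written only for illustration, assuming the caller supplies a SparkListenerJobStart and that ScalaConversionUtils exposes the fromSeq and asJavaOptional helpers exactly as invoked in the listener code.

import static io.openlineage.spark.agent.util.ScalaConversionUtils.asJavaOptional;

import io.openlineage.spark.agent.util.ScalaConversionUtils;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.spark.scheduler.SparkListenerJobStart;
import org.apache.spark.sql.SparkSession;

// Hypothetical class written only to illustrate the ScalaConversionUtils calls.
class ScalaConversionDemo {

  // fromSeq turns the Scala Seq of stage ids into a java.util.List so the
  // Java Stream API can collect it into a Set<Integer>.
  static Set<Integer> stageIds(SparkListenerJobStart jobStart) {
    return ScalaConversionUtils.fromSeq(jobStart.stageIds()).stream()
        .map(Integer.class::cast)
        .collect(Collectors.toSet());
  }

  // asJavaOptional bridges the scala.Option returned by SparkSession.getDefaultSession()
  // to a java.util.Optional.
  static Optional<SparkSession> defaultSession() {
    return asJavaOptional(SparkSession.getDefaultSession());
  }
}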