Use of io.openlineage.spark.api.DatasetFactory in project OpenLineage by OpenLineage.
From the class DataSourceV2RelationDatasetBuilderTest, the method provideBuildersWithSparkListeners:
private static Stream<Arguments> provideBuildersWithSparkListeners() {
  OpenLineageContext context = mock(OpenLineageContext.class);
  DatasetFactory factory = mock(DatasetFactory.class);
  return Stream.of(
      // the input builder is defined only at start events
      Arguments.of(new DataSourceV2RelationInputDatasetBuilder(context, factory),
          mock(SparkListenerJobStart.class), true),
      Arguments.of(new DataSourceV2RelationInputDatasetBuilder(context, factory),
          mock(SparkListenerSQLExecutionStart.class), true),
      Arguments.of(new DataSourceV2RelationInputDatasetBuilder(context, factory),
          mock(SparkListenerJobEnd.class), false),
      Arguments.of(new DataSourceV2RelationInputDatasetBuilder(context, factory),
          mock(SparkListenerSQLExecutionEnd.class), false),
      // the output builder is defined only at end events
      Arguments.of(new DataSourceV2RelationOutputDatasetBuilder(context, factory),
          mock(SparkListenerJobStart.class), false),
      Arguments.of(new DataSourceV2RelationOutputDatasetBuilder(context, factory),
          mock(SparkListenerSQLExecutionStart.class), false),
      Arguments.of(new DataSourceV2RelationOutputDatasetBuilder(context, factory),
          mock(SparkListenerJobEnd.class), true),
      Arguments.of(new DataSourceV2RelationOutputDatasetBuilder(context, factory),
          mock(SparkListenerSQLExecutionEnd.class), true));
}
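For context, a provider like this is typically consumed through a JUnit 5 @ParameterizedTest. A minimal sketch of such a consumer follows; the test method name and the builders' shared supertype (AbstractQueryPlanDatasetBuilder) are assumptions rather than taken from the snippet above, and the usual JUnit 5 / Mockito static imports are assumed.

@ParameterizedTest
@MethodSource("provideBuildersWithSparkListeners")
void testIsDefinedAt(
    AbstractQueryPlanDatasetBuilder builder, // assumed common supertype of both builders
    SparkListenerEvent event,
    boolean expected) {
  // per the provider data: input builders accept start events, output builders end events
  assertEquals(expected, builder.isDefinedAt(event));
}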
Use of io.openlineage.spark.api.DatasetFactory in project OpenLineage by OpenLineage.
From the class DataSourceV2RelationDatasetBuilderTest, the method provideBuilders:
private static Stream<Arguments> provideBuilders() {
  OpenLineageContext context = mock(OpenLineageContext.class);
  DatasetFactory factory = mock(DatasetFactory.class);
  OpenLineage openLineage = mock(OpenLineage.class);
  return Stream.of(
      Arguments.of(new DataSourceV2RelationInputDatasetBuilder(context, factory),
          mock(DataSourceV2Relation.class), context, factory, openLineage),
      Arguments.of(new DataSourceV2RelationOutputDatasetBuilder(context, factory),
          mock(DataSourceV2Relation.class), context, factory, openLineage));
}
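Again, a hedged sketch of a consumer for this provider; the method name is hypothetical and the shared supertype is assumed as above. The point of the provider's shape is that each invocation receives the builder together with the very mocks it was constructed from, so individual tests can stub context or factory behavior as needed.

@ParameterizedTest
@MethodSource("provideBuilders")
void testBuilderWiring(
    AbstractQueryPlanDatasetBuilder builder,
    DataSourceV2Relation relation,
    OpenLineageContext context,
    DatasetFactory factory,
    OpenLineage openLineage) {
  // smoke-check the wiring; real tests would stub context/factory and inspect
  // the datasets the builder produces for the relation
  assertNotNull(builder);
  assertNotNull(relation);
}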
Use of io.openlineage.spark.api.DatasetFactory in project OpenLineage by OpenLineage.
From the class KafkaRelationVisitor, the method createDatasetsFromOptions:
private static <D extends OpenLineage.Dataset> List<D> createDatasetsFromOptions(
    DatasetFactory<D> datasetFactory, Map<String, String> sourceOptions, StructType schema) {
  // sourceOptions is a Scala Map, so get(...) returns a scala.Option
  Optional<String> servers = asJavaOptional(sourceOptions.get("kafka.bootstrap.servers"));
  // "subscribePattern" is deliberately unsupported, as it would report dataset
  // nodes that don't exist
  List<String> topics =
      Stream.concat(
              // "subscribe" and "topic" cover single-topic reads and writes
              Stream.of("subscribe", "topic")
                  .map(it -> sourceOptions.get(it))
                  .filter(it -> it.nonEmpty())
                  .map(it -> it.get())
                  .map(String.class::cast),
              // "assign" holds a JSON map of topic to partitions; the topic names
              // are its field names. See
              // https://spark.apache.org/docs/3.1.2/structured-streaming-kafka-integration.html
              ScalaConversionUtils.asJavaOptional(sourceOptions.get("assign"))
                  .map(
                      (String str) -> {
                        try {
                          JsonNode jsonNode = new ObjectMapper().readTree(str);
                          long fieldCount = jsonNode.size();
                          // spliterator characteristics are bit flags, combined with bitwise OR
                          return StreamSupport.stream(
                              Spliterators.spliterator(
                                  jsonNode.fieldNames(),
                                  fieldCount,
                                  Spliterator.SIZED | Spliterator.IMMUTABLE),
                              false);
                        } catch (IOException e) {
                          log.warn(
                              "Unable to find topics from Kafka source configuration {}", str, e);
                        }
                        return Stream.<String>empty();
                      })
                  .orElse(Stream.empty()))
          .collect(Collectors.toList());
  // the namespace is derived from the first bootstrap server, defaulting the
  // scheme to PLAINTEXT:// when none is given
  String server =
      servers
          .map(str -> str.matches("\\w+://.*") ? str : "PLAINTEXT://" + str)
          .map(str -> URI.create(str.split(",")[0]))
          .map(uri -> uri.getHost() + ":" + uri.getPort())
          .orElse("");
  String namespace = "kafka://" + server;
  return topics.stream()
      .map(topic -> datasetFactory.getDataset(topic, namespace, schema))
      .collect(Collectors.toList());
}
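To see the two extraction steps in isolation, here is a self-contained sketch of the same logic; the sample "assign" JSON and broker list are assumptions modeled on Spark's Kafka integration docs, not values from the OpenLineage source.

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.net.URI;
import java.util.List;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

public class KafkaOptionsExample {
  public static void main(String[] args) throws Exception {
    // "assign" maps each topic to the partitions to read; the topic names are
    // the JSON field names
    String assign = "{\"topicA\":[0,1],\"topicB\":[2,4]}";
    JsonNode jsonNode = new ObjectMapper().readTree(assign);
    List<String> topics =
        StreamSupport.stream(
                Spliterators.spliterator(
                    jsonNode.fieldNames(),
                    jsonNode.size(),
                    Spliterator.SIZED | Spliterator.IMMUTABLE),
                false)
            .collect(Collectors.toList());
    System.out.println(topics); // [topicA, topicB]

    // bootstrap servers may lack a scheme; the visitor prepends PLAINTEXT://
    // and keeps only the first broker to build the namespace
    String servers = "broker1:9092,broker2:9093";
    String first =
        (servers.matches("\\w+://.*") ? servers : "PLAINTEXT://" + servers).split(",")[0];
    URI uri = URI.create(first);
    System.out.println("kafka://" + uri.getHost() + ":" + uri.getPort()); // kafka://broker1:9092
  }
}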