Use of org.apache.spark.sql.execution.LogicalRDD in project OpenLineage by OpenLineage.
The apply method of the LogicalRDDVisitor class.
@Override
public List<D> apply(LogicalPlan x) {
  LogicalRDD logicalRdd = (LogicalRDD) x;
  List<HadoopRDD> hadoopRdds = findHadoopRdds(logicalRdd);
  return hadoopRdds.stream()
      .flatMap(rdd -> {
        // Resolve every input path registered on the Hadoop job and
        // normalize each one to its containing directory.
        Path[] inputPaths = FileInputFormat.getInputPaths(rdd.getJobConf());
        Configuration hadoopConf = rdd.getConf();
        return Arrays.stream(inputPaths).map(p -> PlanUtils.getDirectoryPath(p, hadoopConf));
      })
      .distinct()
      // static partitions in the relation
      .map(p -> datasetFactory.getDataset(p.toUri(), logicalRdd.schema()))
      .collect(Collectors.toList());
}
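The pipeline above reduces a job's input files to a distinct set of containing directories before any dataset is built. Below is a minimal, self-contained sketch of that step using only the Hadoop API; Path.getParent() stands in for OpenLineage's PlanUtils.getDirectoryPath, which, as an assumption for this sketch, additionally consults the filesystem configuration.

  import java.util.Arrays;
  import java.util.List;
  import java.util.stream.Collectors;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.mapred.FileInputFormat;
  import org.apache.hadoop.mapred.JobConf;

  public class InputPathDemo {
    public static void main(String[] args) {
      // Register two files that live in the same directory, as a Hadoop job would.
      JobConf jobConf = new JobConf();
      FileInputFormat.setInputPaths(jobConf,
          new Path("file:///data/warehouse/part-0000"),
          new Path("file:///data/warehouse/part-0001"));

      // Read the paths back and collapse them to distinct parent directories.
      Path[] inputPaths = FileInputFormat.getInputPaths(jobConf);
      List<String> directories = Arrays.stream(inputPaths)
          .map(Path::getParent)
          .distinct()
          .map(p -> p.toUri().toString())
          .collect(Collectors.toList());

      // Prints a single parent directory URI for both part files.
      System.out.println(directories);
    }
  }

Deduplicating at the directory level means a relation backed by many part files in one directory yields a single lineage dataset rather than one per file.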
Use of org.apache.spark.sql.execution.LogicalRDD in project OpenLineage by OpenLineage.
The testApply method of the LogicalRDDVisitorTest class.
@Test
public void testApply(@TempDir Path tmpDir) {
  SparkSession session = SparkSession.builder().master("local").getOrCreate();
  LogicalRDDVisitor visitor = new LogicalRDDVisitor(
      SparkAgentTestExtension.newContext(session),
      DatasetFactory.output(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI)));
  StructType schema = new StructType(new StructField[] {
      new StructField("anInt", IntegerType$.MODULE$, false, new Metadata(new HashMap<>())),
      new StructField("aString", StringType$.MODULE$, false, new Metadata(new HashMap<>()))
  });
  JobConf jobConf = new JobConf();
  FileInputFormat.addInputPath(jobConf, new org.apache.hadoop.fs.Path("file://" + tmpDir));
  RDD<InternalRow> hadoopRdd = new HadoopRDD<>(
          session.sparkContext(), jobConf, TextInputFormat.class, LongWritable.class, Text.class, 1)
      .toJavaRDD()
      .map(t -> (InternalRow) new GenericInternalRow(new Object[] {t._2.toString()}))
      .rdd();
  LogicalRDD logicalRDD = new LogicalRDD(
      ScalaConversionUtils.fromSeq(schema.toAttributes()).stream()
          .map(AttributeReference::toAttribute)
          .collect(ScalaConversionUtils.toSeq()),
      hadoopRdd, SinglePartition$.MODULE$, Seq$.MODULE$.<SortOrder>empty(), false, session);
  assertThat(visitor.isDefinedAt(logicalRDD)).isTrue();
  List<OpenLineage.Dataset> datasets = visitor.apply(logicalRDD);
  assertThat(datasets)
      .singleElement()
      .hasFieldOrPropertyWithValue("name", tmpDir.toString())
      .hasFieldOrPropertyWithValue("namespace", "file");
}
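The final assertion pins down how the produced dataset is named: the namespace is the URI scheme ("file") and the name is the filesystem path of the temporary directory. A small sketch of that split follows; the namespaceAndName helper is hypothetical, and the real derivation presumably happens inside DatasetFactory.getDataset, which receives the directory URI from the apply method shown earlier.

  import java.net.URI;

  public class DatasetNaming {
    // Hypothetical helper mirroring the test's expectations:
    // "file:///tmp/junit123" -> namespace "file", name "/tmp/junit123".
    static String[] namespaceAndName(URI uri) {
      return new String[] {uri.getScheme(), uri.getPath()};
    }

    public static void main(String[] args) {
      String[] parts = namespaceAndName(URI.create("file:///tmp/junit123"));
      System.out.println(parts[0] + " / " + parts[1]); // file / /tmp/junit123
    }
  }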
Use of org.apache.spark.sql.execution.LogicalRDD in project OpenLineage by OpenLineage.
The findHadoopRdds method of the LogicalRDDVisitor class.
private List<HadoopRDD> findHadoopRdds(LogicalRDD rdd) {
  RDD root = rdd.rdd();
  List<HadoopRDD> ret = new ArrayList<>();
  Stack<RDD> deps = new Stack<>();
  deps.add(root);
  // Depth-first traversal of the RDD dependency graph, collecting every HadoopRDD.
  while (!deps.isEmpty()) {
    RDD cur = deps.pop();
    Seq<Dependency> dependencies = cur.getDependencies();
    deps.addAll(
        ScalaConversionUtils.fromSeq(dependencies).stream()
            .map(Dependency::rdd)
            .collect(Collectors.toList()));
    if (cur instanceof HadoopRDD) {
      ret.add((HadoopRDD) cur);
    }
  }
  return ret;
}
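findHadoopRdds walks the RDD lineage iteratively: it seeds a stack with the root RDD, pops one node at a time, pushes all of its dependencies, and records every HadoopRDD it encounters. The same pattern with Spark's types stripped away is sketched below; Node, SimpleNode, and findMatching are hypothetical names introduced only for illustration.

  import java.util.ArrayDeque;
  import java.util.ArrayList;
  import java.util.Arrays;
  import java.util.Deque;
  import java.util.List;
  import java.util.function.Predicate;

  public class GraphSearch {
    // Hypothetical node type standing in for RDD; children() plays the
    // role of getDependencies().
    interface Node {
      List<Node> children();
    }

    // A tiny concrete node for demonstration.
    static class SimpleNode implements Node {
      final String label;
      final List<Node> kids;
      SimpleNode(String label, List<Node> kids) { this.label = label; this.kids = kids; }
      public List<Node> children() { return kids; }
    }

    // Collect every node matching the predicate, visiting the graph with
    // an explicit stack instead of recursion, as findHadoopRdds does.
    static List<Node> findMatching(Node root, Predicate<Node> matches) {
      List<Node> found = new ArrayList<>();
      Deque<Node> stack = new ArrayDeque<>();
      stack.push(root);
      while (!stack.isEmpty()) {
        Node cur = stack.pop();
        cur.children().forEach(stack::push);
        if (matches.test(cur)) {
          found.add(cur);
        }
      }
      return found;
    }

    public static void main(String[] args) {
      Node leaf = new SimpleNode("hadoop", new ArrayList<>());
      Node root = new SimpleNode("mapped", Arrays.asList(leaf));
      // Finds the single "hadoop" node below the root.
      System.out.println(findMatching(root, n -> ((SimpleNode) n).label.equals("hadoop")).size()); // 1
    }
  }

Like the original, this keeps no visited set, so a node reachable along two paths would be collected twice; in the visitor that is harmless because the directory paths derived from the HadoopRDDs are de-duplicated downstream with distinct().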