Use of org.apache.spark.rdd.HadoopRDD in project OpenLineage by OpenLineage.
The class RddExecutionContext, method nameRDD.
static String nameRDD(RDD<?> rdd) {
  String rddName = (String) rdd.name();
  if (rddName == null
      // directory name: the name of the underlying HadoopRDD
      || (rdd instanceof HadoopRDD
          && Arrays.stream(FileInputFormat.getInputPaths(((HadoopRDD) rdd).getJobConf()))
              .anyMatch(p -> p.toString().contains(rdd.name())))
      || (rdd instanceof MapPartitionsRDD
          && rdd.name().equals(((MapPartitionsRDD) rdd).prev().name()))) {
    rddName =
        rdd.getClass()
            .getSimpleName()
            // remove the trailing RDD from the class name
            .replaceAll("RDD\\d*$", "")
            // camel case to snake case
            .replaceAll(CAMEL_TO_SNAKE_CASE, "_$1")
            .toLowerCase(Locale.ROOT);
  }
  Seq<Dependency<?>> deps = (Seq<Dependency<?>>) rdd.dependencies();
  List<Dependency<?>> dependencies = ScalaConversionUtils.fromSeq(deps);
  if (dependencies.isEmpty()) {
    return rddName;
  }
  List<String> dependencyNames = new ArrayList<>();
  for (Dependency d : dependencies) {
    dependencyNames.add(nameRDD(d.rdd()));
  }
  String dependencyName = Strings.join(dependencyNames, "_");
  if (!dependencyName.startsWith(rddName)) {
    return rddName + "_" + dependencyName;
  } else {
    return dependencyName;
  }
}
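A minimal sketch of how this recursive naming behaves on a simple lineage. The local-mode setup below and the direct call to nameRDD (which is package-private, so same-package access is assumed) are illustrative assumptions, not OpenLineage code.
// Illustrative only: an unnamed MapPartitionsRDD over a ParallelCollectionRDD.
// Both class names are stripped of the trailing "RDD", converted to snake case
// (roughly "map_partitions" and "parallel_collection"), and joined with "_".
SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("nameRDD-sketch");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
RDD<Integer> mapped = jsc.parallelize(Arrays.asList(1, 2, 3)).map(i -> i + 1).rdd();
String jobSuffix = RddExecutionContext.nameRDD(mapped); // roughly "map_partitions_parallel_collection"
jsc.close();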
Use of org.apache.spark.rdd.HadoopRDD in project OpenLineage by OpenLineage.
The class LogicalRDDVisitor, method apply.
@Override
public List<D> apply(LogicalPlan x) {
  LogicalRDD logicalRdd = (LogicalRDD) x;
  List<HadoopRDD> hadoopRdds = findHadoopRdds(logicalRdd);
  return hadoopRdds.stream()
      .flatMap(
          rdd -> {
            Path[] inputPaths = FileInputFormat.getInputPaths(rdd.getJobConf());
            Configuration hadoopConf = rdd.getConf();
            return Arrays.stream(inputPaths).map(p -> PlanUtils.getDirectoryPath(p, hadoopConf));
          })
      .distinct()
      .map(
          p -> {
            // static partitions in the relation
            return datasetFactory.getDataset(p.toUri(), logicalRdd.schema());
          })
      .collect(Collectors.toList());
}
Use of org.apache.spark.rdd.HadoopRDD in project OpenLineage by OpenLineage.
The class LogicalRDDVisitorTest, method testApply.
@Test
public void testApply(@TempDir Path tmpDir) {
  SparkSession session = SparkSession.builder().master("local").getOrCreate();
  LogicalRDDVisitor visitor =
      new LogicalRDDVisitor(
          SparkAgentTestExtension.newContext(session),
          DatasetFactory.output(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI)));
  StructType schema =
      new StructType(
          new StructField[] {
            new StructField("anInt", IntegerType$.MODULE$, false, new Metadata(new HashMap<>())),
            new StructField("aString", StringType$.MODULE$, false, new Metadata(new HashMap<>()))
          });
  jobConf = new JobConf();
  FileInputFormat.addInputPath(jobConf, new org.apache.hadoop.fs.Path("file://" + tmpDir));
  RDD<InternalRow> hadoopRdd =
      new HadoopRDD<>(session.sparkContext(), jobConf, TextInputFormat.class,
              LongWritable.class, Text.class, 1)
          .toJavaRDD()
          .map(t -> (InternalRow) new GenericInternalRow(new Object[] {t._2.toString()}))
          .rdd();
  LogicalRDD logicalRDD =
      new LogicalRDD(
          ScalaConversionUtils.fromSeq(schema.toAttributes()).stream()
              .map(AttributeReference::toAttribute)
              .collect(ScalaConversionUtils.toSeq()),
          hadoopRdd, SinglePartition$.MODULE$, Seq$.MODULE$.<SortOrder>empty(), false, session);
  assertThat(visitor.isDefinedAt(logicalRDD)).isTrue();
  List<OpenLineage.Dataset> datasets = visitor.apply(logicalRDD);
  assertThat(datasets)
      .singleElement()
      .hasFieldOrPropertyWithValue("name", tmpDir.toString())
      .hasFieldOrPropertyWithValue("namespace", "file");
}
Use of org.apache.spark.rdd.HadoopRDD in project hive by apache.
The class TestSparkPlan, method testSetRDDCallSite.
@Test
public void testSetRDDCallSite() throws Exception {
  String confDir = "../data/conf/spark/local/hive-site.xml";
  HiveConf.setHiveSiteLocation(new File(confDir).toURI().toURL());
  HiveConf conf = new HiveConf();
  // Set to false because we don't launch a job using LocalHiveSparkClient so the
  // hive-kryo-registrator jar is never added to the classpath
  conf.setBoolVar(HiveConf.ConfVars.SPARK_OPTIMIZE_SHUFFLE_SERDE, false);
  conf.set("spark.local.dir",
      Paths.get(System.getProperty("test.tmp.dir"), "TestSparkPlan-local-dir").toString());
  FileSystem fs = FileSystem.getLocal(conf);
  Path tmpDir = new Path("TestSparkPlan-tmp");
  SessionState.start(conf);
  IDriver driver = null;
  JavaSparkContext sc = null;
  try {
    driver = DriverFactory.newDriver(conf);
    driver.run("create table test (col int)");
    ((ReExecDriver) driver).compile("select * from test order by col", true);
    List<SparkTask> sparkTasks = Utilities.getSparkTasks(driver.getPlan().getRootTasks());
    Assert.assertEquals(1, sparkTasks.size());
    SparkTask sparkTask = sparkTasks.get(0);
    JobConf jobConf = new JobConf(conf);
    SparkConf sparkConf = new SparkConf();
    sparkConf.setMaster("local");
    sparkConf.setAppName("TestSparkPlan-app");
    sc = new JavaSparkContext(sparkConf);
    SparkPlanGenerator sparkPlanGenerator = new SparkPlanGenerator(sc, null, jobConf, tmpDir, null);
    SparkPlan sparkPlan = sparkPlanGenerator.generate(sparkTask.getWork());
    RDD<Tuple2<HiveKey, BytesWritable>> reducerRdd = sparkPlan.generateGraph().rdd();
    Assert.assertTrue(reducerRdd.name().contains("Reducer 2"));
    Assert.assertTrue(reducerRdd instanceof MapPartitionsRDD);
    Assert.assertTrue(reducerRdd.creationSite().shortForm().contains("Reducer 2"));
    Assert.assertTrue(reducerRdd.creationSite().longForm().contains("Explain Plan"));
    Assert.assertTrue(reducerRdd.creationSite().longForm().contains("Reducer 2"));
    List<Dependency<?>> rdds = JavaConversions.seqAsJavaList(reducerRdd.dependencies());
    Assert.assertEquals(1, rdds.size());
    RDD shuffledRdd = rdds.get(0).rdd();
    Assert.assertTrue(shuffledRdd.name().contains("Reducer 2"));
    Assert.assertTrue(shuffledRdd.name().contains("SORT"));
    Assert.assertTrue(shuffledRdd instanceof ShuffledRDD);
    Assert.assertTrue(shuffledRdd.creationSite().shortForm().contains("Reducer 2"));
    Assert.assertTrue(shuffledRdd.creationSite().longForm().contains("Explain Plan"));
    Assert.assertTrue(shuffledRdd.creationSite().longForm().contains("Reducer 2"));
    rdds = JavaConversions.seqAsJavaList(shuffledRdd.dependencies());
    Assert.assertEquals(1, rdds.size());
    RDD mapRdd = rdds.get(0).rdd();
    Assert.assertTrue(mapRdd.name().contains("Map 1"));
    Assert.assertTrue(mapRdd instanceof MapPartitionsRDD);
    Assert.assertTrue(mapRdd.creationSite().shortForm().contains("Map 1"));
    Assert.assertTrue(mapRdd.creationSite().longForm().contains("Explain Plan"));
    Assert.assertTrue(mapRdd.creationSite().longForm().contains("Map 1"));
    rdds = JavaConversions.seqAsJavaList(mapRdd.dependencies());
    Assert.assertEquals(1, rdds.size());
    RDD hadoopRdd = rdds.get(0).rdd();
    Assert.assertTrue(hadoopRdd.name().contains("Map 1"));
    Assert.assertTrue(hadoopRdd.name().contains("test"));
    Assert.assertTrue(hadoopRdd instanceof HadoopRDD);
    Assert.assertTrue(hadoopRdd.creationSite().shortForm().contains("Map 1"));
  } finally {
    if (driver != null) {
      driver.run("drop table if exists test");
      driver.destroy();
    }
    if (sc != null) {
      sc.close();
    }
    if (fs.exists(tmpDir)) {
      fs.delete(tmpDir, true);
    }
  }
}
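The assertions above hinge on Spark recording a call site for each RDD at creation time. Below is a minimal sketch of that mechanism, independent of Hive; the master, app name, and call-site string are made up for illustration, and creationSite() is the same Spark-internal accessor the test itself uses.
// Illustrative only: Spark captures the thread-local call site when an RDD is created,
// which is the mechanism the Hive test above exercises via SparkPlanGenerator.
SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("callsite-sketch");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
jsc.setCallSite("Reducer 2");
JavaRDD<Integer> rdd = jsc.parallelize(Arrays.asList(1, 2, 3));
Assert.assertTrue(rdd.rdd().creationSite().shortForm().contains("Reducer 2"));
jsc.close();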
Use of org.apache.spark.rdd.HadoopRDD in project OpenLineage by OpenLineage.
The class LogicalRDDVisitor, method findHadoopRdds.
private List<HadoopRDD> findHadoopRdds(LogicalRDD rdd) {
  RDD root = rdd.rdd();
  List<HadoopRDD> ret = new ArrayList<>();
  Stack<RDD> deps = new Stack<>();
  deps.add(root);
  while (!deps.isEmpty()) {
    RDD cur = deps.pop();
    Seq<Dependency> dependencies = cur.getDependencies();
    deps.addAll(
        ScalaConversionUtils.fromSeq(dependencies).stream()
            .map(Dependency::rdd)
            .collect(Collectors.toList()));
    if (cur instanceof HadoopRDD) {
      ret.add((HadoopRDD) cur);
    }
  }
  return ret;
}
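For comparison, the same search over the RDD dependency DAG could be written recursively; the stack-based loop above avoids deep recursion on long lineages. The method below is only an illustrative sketch (the name findHadoopRddsRecursive is made up), reusing the same ScalaConversionUtils helper and the public dependencies() accessor.
// Illustrative recursive variant of the traversal above; not OpenLineage code.
private List<HadoopRDD> findHadoopRddsRecursive(RDD<?> rdd) {
  List<HadoopRDD> ret = new ArrayList<>();
  if (rdd instanceof HadoopRDD) {
    ret.add((HadoopRDD) rdd);
  }
  for (Dependency<?> dep : ScalaConversionUtils.fromSeq(rdd.dependencies())) {
    ret.addAll(findHadoopRddsRecursive(dep.rdd()));
  }
  return ret;
}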