
Example 1 with HadoopRDD

Use of org.apache.spark.rdd.HadoopRDD in the OpenLineage project.

The nameRDD method of the RddExecutionContext class:

static String nameRDD(RDD<?> rdd) {
    String rddName = (String) rdd.name();
    if (rddName == null
        || (rdd instanceof HadoopRDD // directory name
            && Arrays.stream(FileInputFormat.getInputPaths(((HadoopRDD) rdd).getJobConf()))
                .anyMatch(p -> p.toString().contains(rdd.name())))
        || (rdd instanceof MapPartitionsRDD // the name of the underlying HadoopRDD
            && rdd.name().equals(((MapPartitionsRDD) rdd).prev().name()))) {
        rddName = rdd.getClass().getSimpleName()
            .replaceAll("RDD\\d*$", "") // remove the trailing RDD from the class name
            .replaceAll(CAMEL_TO_SNAKE_CASE, "_$1") // camel case to snake case
            .toLowerCase(Locale.ROOT);
    }
    Seq<Dependency<?>> deps = (Seq<Dependency<?>>) rdd.dependencies();
    List<Dependency<?>> dependencies = ScalaConversionUtils.fromSeq(deps);
    if (dependencies.isEmpty()) {
        return rddName;
    }
    List<String> dependencyNames = new ArrayList<>();
    for (Dependency d : dependencies) {
        dependencyNames.add(nameRDD(d.rdd()));
    }
    String dependencyName = Strings.join(dependencyNames, "_");
    if (!dependencyName.startsWith(rddName)) {
        return rddName + "_" + dependencyName;
    } else {
        return dependencyName;
    }
}
Also used : HadoopRDD(org.apache.spark.rdd.HadoopRDD) NewHadoopRDD(org.apache.spark.rdd.NewHadoopRDD) ArrayList(java.util.ArrayList) Dependency(org.apache.spark.Dependency) MapPartitionsRDD(org.apache.spark.rdd.MapPartitionsRDD) Seq(scala.collection.Seq)
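
The CAMEL_TO_SNAKE_CASE constant is referenced but not shown above. A minimal, self-contained sketch of the fallback naming branch, assuming a simple lookbehind regex for that constant (the actual pattern in RddExecutionContext may be more elaborate), shows how a class name such as MapPartitionsRDD collapses to map_partitions:

import java.util.Locale;

public class RddNameFallbackSketch {

    // Hypothetical stand-in for the CAMEL_TO_SNAKE_CASE constant used by RddExecutionContext;
    // group 1 captures the uppercase letter that starts each camel-case word.
    private static final String CAMEL_TO_SNAKE_CASE = "(?<=[a-z])([A-Z])";

    // Mirrors the fallback branch of nameRDD: strip the trailing "RDD" (plus any digits)
    // from the simple class name, then convert camel case to snake case.
    static String fallbackName(String simpleClassName) {
        return simpleClassName
            .replaceAll("RDD\\d*$", "")
            .replaceAll(CAMEL_TO_SNAKE_CASE, "_$1")
            .toLowerCase(Locale.ROOT);
    }

    public static void main(String[] args) {
        System.out.println(fallbackName("MapPartitionsRDD")); // prints "map_partitions"
        System.out.println(fallbackName("HadoopRDD"));        // prints "hadoop"
    }
}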

Example 2 with HadoopRDD

Use of org.apache.spark.rdd.HadoopRDD in the OpenLineage project.

The apply method of the LogicalRDDVisitor class:

@Override
public List<D> apply(LogicalPlan x) {
    LogicalRDD logicalRdd = (LogicalRDD) x;
    List<HadoopRDD> hadoopRdds = findHadoopRdds(logicalRdd);
    return hadoopRdds.stream().flatMap(rdd -> {
        Path[] inputPaths = FileInputFormat.getInputPaths(rdd.getJobConf());
        Configuration hadoopConf = rdd.getConf();
        return Arrays.stream(inputPaths).map(p -> PlanUtils.getDirectoryPath(p, hadoopConf));
    }).distinct().map(p -> {
        // static partitions in the relation
        return datasetFactory.getDataset(p.toUri(), logicalRdd.schema());
    }).collect(Collectors.toList());
}
Also used : Arrays(java.util.Arrays) LogicalPlan(org.apache.spark.sql.catalyst.plans.logical.LogicalPlan) QueryPlanVisitor(io.openlineage.spark.api.QueryPlanVisitor) DatasetFactory(io.openlineage.spark.api.DatasetFactory) OpenLineageContext(io.openlineage.spark.api.OpenLineageContext) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) Seq(scala.collection.Seq) ScalaConversionUtils(io.openlineage.spark.agent.util.ScalaConversionUtils) Collectors(java.util.stream.Collectors) Stack(java.util.Stack) ArrayList(java.util.ArrayList) PlanUtils(io.openlineage.spark.agent.util.PlanUtils) List(java.util.List) Dependency(org.apache.spark.Dependency) HadoopRDD(org.apache.spark.rdd.HadoopRDD) Configuration(org.apache.hadoop.conf.Configuration) LogicalRDD(org.apache.spark.sql.execution.LogicalRDD) Path(org.apache.hadoop.fs.Path) OpenLineage(io.openlineage.client.OpenLineage) RDD(org.apache.spark.rdd.RDD)
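
PlanUtils.getDirectoryPath is not shown in this example. A rough sketch of what that step does, assuming it simply resolves a file path to its containing directory through the Hadoop FileSystem API (the real helper may do more), could look like this:

import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.spark.rdd.HadoopRDD;

class InputDirectorySketch {

    // Hypothetical approximation of PlanUtils.getDirectoryPath: if the input path points
    // at a single file, report its parent directory so that many files under one
    // directory collapse into a single dataset.
    static Path toDirectory(Path p, Configuration conf) {
        try {
            FileSystem fs = p.getFileSystem(conf);
            return fs.getFileStatus(p).isFile() ? p.getParent() : p;
        } catch (IOException e) {
            return p; // fall back to the original path if it cannot be inspected
        }
    }

    // Collect the distinct input directories referenced by a HadoopRDD's JobConf.
    static List<URI> inputDirectories(HadoopRDD<?, ?> rdd) {
        Configuration hadoopConf = rdd.getConf();
        return Arrays.stream(FileInputFormat.getInputPaths(rdd.getJobConf()))
            .map(p -> toDirectory(p, hadoopConf))
            .distinct()
            .map(Path::toUri)
            .collect(Collectors.toList());
    }
}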

Example 3 with HadoopRDD

Use of org.apache.spark.rdd.HadoopRDD in the OpenLineage project.

The testApply method of the LogicalRDDVisitorTest class:

@Test
public void testApply(@TempDir Path tmpDir) {
    SparkSession session = SparkSession.builder().master("local").getOrCreate();
    LogicalRDDVisitor visitor = new LogicalRDDVisitor(
        SparkAgentTestExtension.newContext(session),
        DatasetFactory.output(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI)));
    StructType schema = new StructType(new StructField[] {
        new StructField("anInt", IntegerType$.MODULE$, false, new Metadata(new HashMap<>())),
        new StructField("aString", StringType$.MODULE$, false, new Metadata(new HashMap<>())) });
    // jobConf is a field of the test class, which is why it has no local declaration here
    jobConf = new JobConf();
    FileInputFormat.addInputPath(jobConf, new org.apache.hadoop.fs.Path("file://" + tmpDir));
    RDD<InternalRow> hadoopRdd = new HadoopRDD<>(
            session.sparkContext(), jobConf, TextInputFormat.class, LongWritable.class, Text.class, 1)
        .toJavaRDD()
        .map(t -> (InternalRow) new GenericInternalRow(new Object[] { t._2.toString() }))
        .rdd();
    LogicalRDD logicalRDD = new LogicalRDD(
        ScalaConversionUtils.fromSeq(schema.toAttributes()).stream()
            .map(AttributeReference::toAttribute)
            .collect(ScalaConversionUtils.toSeq()),
        hadoopRdd, SinglePartition$.MODULE$, Seq$.MODULE$.<SortOrder>empty(), false, session);
    assertThat(visitor.isDefinedAt(logicalRDD)).isTrue();
    List<OpenLineage.Dataset> datasets = visitor.apply(logicalRDD);
    assertThat(datasets)
        .singleElement()
        .hasFieldOrPropertyWithValue("name", tmpDir.toString())
        .hasFieldOrPropertyWithValue("namespace", "file");
}
Also used : OpenLineageClient(io.openlineage.spark.agent.client.OpenLineageClient) Seq$(scala.collection.Seq$) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) InternalRow(org.apache.spark.sql.catalyst.InternalRow) SinglePartition$(org.apache.spark.sql.catalyst.plans.physical.SinglePartition$) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) AttributeReference(org.apache.spark.sql.catalyst.expressions.AttributeReference) SparkAgentTestExtension(io.openlineage.spark.agent.SparkAgentTestExtension) ExtendWith(org.junit.jupiter.api.extension.ExtendWith) HadoopRDD(org.apache.spark.rdd.HadoopRDD) Path(java.nio.file.Path) SparkSession(org.apache.spark.sql.SparkSession) Metadata(org.apache.spark.sql.types.Metadata) StringType$(org.apache.spark.sql.types.StringType$) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) IntegerType$(org.apache.spark.sql.types.IntegerType$) SparkSession$(org.apache.spark.sql.SparkSession$) DatasetFactory(io.openlineage.spark.api.DatasetFactory) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) ScalaConversionUtils(io.openlineage.spark.agent.util.ScalaConversionUtils) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.jupiter.api.Test) List(java.util.List) AfterEach(org.junit.jupiter.api.AfterEach) SortOrder(org.apache.spark.sql.catalyst.expressions.SortOrder) TempDir(org.junit.jupiter.api.io.TempDir) LogicalRDD(org.apache.spark.sql.execution.LogicalRDD) HashMap(scala.collection.immutable.HashMap) OpenLineage(io.openlineage.client.OpenLineage) RDD(org.apache.spark.rdd.RDD)

Example 4 with HadoopRDD

Use of org.apache.spark.rdd.HadoopRDD in the Apache Hive project.

The testSetRDDCallSite method of the TestSparkPlan class:

@Test
public void testSetRDDCallSite() throws Exception {
    String confDir = "../data/conf/spark/local/hive-site.xml";
    HiveConf.setHiveSiteLocation(new File(confDir).toURI().toURL());
    HiveConf conf = new HiveConf();
    // Set to false because we don't launch a job using LocalHiveSparkClient so the
    // hive-kryo-registrator jar is never added to the classpath
    conf.setBoolVar(HiveConf.ConfVars.SPARK_OPTIMIZE_SHUFFLE_SERDE, false);
    conf.set("spark.local.dir", Paths.get(System.getProperty("test.tmp.dir"), "TestSparkPlan-local-dir").toString());
    FileSystem fs = FileSystem.getLocal(conf);
    Path tmpDir = new Path("TestSparkPlan-tmp");
    SessionState.start(conf);
    IDriver driver = null;
    JavaSparkContext sc = null;
    try {
        driver = DriverFactory.newDriver(conf);
        driver.run("create table test (col int)");
        ((ReExecDriver) driver).compile("select * from test order by col", true);
        List<SparkTask> sparkTasks = Utilities.getSparkTasks(driver.getPlan().getRootTasks());
        Assert.assertEquals(1, sparkTasks.size());
        SparkTask sparkTask = sparkTasks.get(0);
        JobConf jobConf = new JobConf(conf);
        SparkConf sparkConf = new SparkConf();
        sparkConf.setMaster("local");
        sparkConf.setAppName("TestSparkPlan-app");
        sc = new JavaSparkContext(sparkConf);
        SparkPlanGenerator sparkPlanGenerator = new SparkPlanGenerator(sc, null, jobConf, tmpDir, null);
        SparkPlan sparkPlan = sparkPlanGenerator.generate(sparkTask.getWork());
        RDD<Tuple2<HiveKey, BytesWritable>> reducerRdd = sparkPlan.generateGraph().rdd();
        Assert.assertTrue(reducerRdd.name().contains("Reducer 2"));
        Assert.assertTrue(reducerRdd instanceof MapPartitionsRDD);
        Assert.assertTrue(reducerRdd.creationSite().shortForm().contains("Reducer 2"));
        Assert.assertTrue(reducerRdd.creationSite().longForm().contains("Explain Plan"));
        Assert.assertTrue(reducerRdd.creationSite().longForm().contains("Reducer 2"));
        List<Dependency<?>> rdds = JavaConversions.seqAsJavaList(reducerRdd.dependencies());
        Assert.assertEquals(1, rdds.size());
        RDD shuffledRdd = rdds.get(0).rdd();
        Assert.assertTrue(shuffledRdd.name().contains("Reducer 2"));
        Assert.assertTrue(shuffledRdd.name().contains("SORT"));
        Assert.assertTrue(shuffledRdd instanceof ShuffledRDD);
        Assert.assertTrue(shuffledRdd.creationSite().shortForm().contains("Reducer 2"));
        Assert.assertTrue(shuffledRdd.creationSite().longForm().contains("Explain Plan"));
        Assert.assertTrue(shuffledRdd.creationSite().longForm().contains("Reducer 2"));
        rdds = JavaConversions.seqAsJavaList(shuffledRdd.dependencies());
        Assert.assertEquals(1, rdds.size());
        RDD mapRdd = rdds.get(0).rdd();
        Assert.assertTrue(mapRdd.name().contains("Map 1"));
        Assert.assertTrue(mapRdd instanceof MapPartitionsRDD);
        Assert.assertTrue(mapRdd.creationSite().shortForm().contains("Map 1"));
        Assert.assertTrue(mapRdd.creationSite().longForm().contains("Explain Plan"));
        Assert.assertTrue(mapRdd.creationSite().longForm().contains("Map 1"));
        rdds = JavaConversions.seqAsJavaList(mapRdd.dependencies());
        Assert.assertEquals(1, rdds.size());
        RDD hadoopRdd = rdds.get(0).rdd();
        Assert.assertTrue(hadoopRdd.name().contains("Map 1"));
        Assert.assertTrue(hadoopRdd.name().contains("test"));
        Assert.assertTrue(hadoopRdd instanceof HadoopRDD);
        Assert.assertTrue(hadoopRdd.creationSite().shortForm().contains("Map 1"));
    } finally {
        if (driver != null) {
            driver.run("drop table if exists test");
            driver.destroy();
        }
        if (sc != null) {
            sc.close();
        }
        if (fs.exists(tmpDir)) {
            fs.delete(tmpDir, true);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Dependency(org.apache.spark.Dependency) MapPartitionsRDD(org.apache.spark.rdd.MapPartitionsRDD) ReExecDriver(org.apache.hadoop.hive.ql.reexec.ReExecDriver) HadoopRDD(org.apache.spark.rdd.HadoopRDD) ShuffledRDD(org.apache.spark.rdd.ShuffledRDD) RDD(org.apache.spark.rdd.RDD) Tuple2(scala.Tuple2) FileSystem(org.apache.hadoop.fs.FileSystem) IDriver(org.apache.hadoop.hive.ql.IDriver) HiveConf(org.apache.hadoop.hive.conf.HiveConf) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) File(java.io.File) JobConf(org.apache.hadoop.mapred.JobConf) SparkConf(org.apache.spark.SparkConf) Test(org.junit.Test)
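
The RDD names and call sites asserted above are attached by Hive's SparkPlanGenerator while it builds the RDD graph. The sketch below is not taken from the Hive sources; it uses only Spark's public setCallSite/clearCallSite API to show how an RDD created while a call site is active remembers that label:

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CallSiteSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("call-site-sketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // An RDD captures the call site that is active at creation time; Hive's
            // SparkPlanGenerator relies on this to label RDDs with work names like "Reducer 2".
            sc.setCallSite("Reducer 2 (example label)");
            JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3));
            sc.clearCallSite();
            // The short form of the creation site now contains the label set above.
            System.out.println(rdd.rdd().creationSite().shortForm());
        }
    }
}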

Example 5 with HadoopRDD

Use of org.apache.spark.rdd.HadoopRDD in the OpenLineage project.

The findHadoopRdds method of the LogicalRDDVisitor class:

private List<HadoopRDD> findHadoopRdds(LogicalRDD rdd) {
    RDD root = rdd.rdd();
    List<HadoopRDD> ret = new ArrayList<>();
    Stack<RDD> deps = new Stack<>();
    deps.add(root);
    while (!deps.isEmpty()) {
        RDD cur = deps.pop();
        Seq<Dependency> dependencies = cur.getDependencies();
        deps.addAll(ScalaConversionUtils.fromSeq(dependencies).stream().map(Dependency::rdd).collect(Collectors.toList()));
        if (cur instanceof HadoopRDD) {
            ret.add((HadoopRDD) cur);
        }
    }
    return ret;
}
Also used : HadoopRDD(org.apache.spark.rdd.HadoopRDD) LogicalRDD(org.apache.spark.sql.execution.LogicalRDD) RDD(org.apache.spark.rdd.RDD) ArrayList(java.util.ArrayList) Dependency(org.apache.spark.Dependency) Stack(java.util.Stack)
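
To see why this traversal reliably bottoms out in a HadoopRDD for file-based inputs, the standalone sketch below (not part of the OpenLineage sources; the input path is hypothetical) walks the dependency chain of a plain sc.textFile lineage down to its leaf RDD:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.HadoopRDD;
import org.apache.spark.rdd.RDD;
import scala.collection.JavaConversions;

public class HadoopRddLeafSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("hadoop-rdd-leaf-sketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // textFile builds a MapPartitionsRDD on top of a HadoopRDD; the lineage can be
            // inspected without ever reading the (hypothetical) file.
            RDD<?> cur = sc.textFile("/tmp/example.txt").rdd();
            // Follow the single-parent dependency chain down to the leaf RDD.
            while (!cur.dependencies().isEmpty()) {
                cur = JavaConversions.seqAsJavaList(cur.dependencies()).get(0).rdd();
            }
            System.out.println(cur instanceof HadoopRDD); // prints "true"
        }
    }
}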

Aggregations

HadoopRDD (org.apache.spark.rdd.HadoopRDD): 5
Dependency (org.apache.spark.Dependency): 4
RDD (org.apache.spark.rdd.RDD): 4
ArrayList (java.util.ArrayList): 3
LogicalRDD (org.apache.spark.sql.execution.LogicalRDD): 3
OpenLineage (io.openlineage.client.OpenLineage): 2
ScalaConversionUtils (io.openlineage.spark.agent.util.ScalaConversionUtils): 2
DatasetFactory (io.openlineage.spark.api.DatasetFactory): 2
List (java.util.List): 2
Stack (java.util.Stack): 2
Path (org.apache.hadoop.fs.Path): 2
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 2
JobConf (org.apache.hadoop.mapred.JobConf): 2
MapPartitionsRDD (org.apache.spark.rdd.MapPartitionsRDD): 2
Seq (scala.collection.Seq): 2
SparkAgentTestExtension (io.openlineage.spark.agent.SparkAgentTestExtension): 1
OpenLineageClient (io.openlineage.spark.agent.client.OpenLineageClient): 1
PlanUtils (io.openlineage.spark.agent.util.PlanUtils): 1
OpenLineageContext (io.openlineage.spark.api.OpenLineageContext): 1
QueryPlanVisitor (io.openlineage.spark.api.QueryPlanVisitor): 1