
Example 6 with SparkContext

Use of org.apache.spark.SparkContext in project Gaffer by gchq.

The class ImportRDDOfElementsHandlerTest, method checkImportRDDOfElements:

@Test
public void checkImportRDDOfElements() throws OperationException, IOException {
    final Graph graph1 = new Graph.Builder()
            .addSchema(getClass().getResourceAsStream("/schema/dataSchema.json"))
            .addSchema(getClass().getResourceAsStream("/schema/dataTypes.json"))
            .addSchema(getClass().getResourceAsStream("/schema/storeTypes.json"))
            .storeProperties(getClass().getResourceAsStream("/store.properties"))
            .build();
    final ArrayBuffer<Element> elements = new ArrayBuffer<>();
    for (int i = 0; i < 10; i++) {
        final Entity entity = new Entity(TestGroups.ENTITY);
        entity.setVertex("" + i);
        final Edge edge1 = new Edge(TestGroups.EDGE);
        edge1.setSource("" + i);
        edge1.setDestination("B");
        edge1.setDirected(false);
        edge1.putProperty(TestPropertyNames.COUNT, 2);
        final Edge edge2 = new Edge(TestGroups.EDGE);
        edge2.setSource("" + i);
        edge2.setDestination("C");
        edge2.setDirected(false);
        edge2.putProperty(TestPropertyNames.COUNT, 4);
        elements.$plus$eq(edge1);
        elements.$plus$eq(edge2);
        elements.$plus$eq(entity);
    }
    final User user = new User();
    final SparkConf sparkConf = new SparkConf()
            .setMaster("local")
            .setAppName("tests")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .set("spark.kryo.registrator", "uk.gov.gchq.gaffer.spark.serialisation.kryo.Registrator")
            .set("spark.driver.allowMultipleContexts", "true");
    final SparkContext sparkContext = new SparkContext(sparkConf);
    // Create Hadoop configuration and serialise to a string
    final Configuration configuration = new Configuration();
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    configuration.write(new DataOutputStream(baos));
    final String configurationString = new String(baos.toByteArray(), CommonConstants.UTF_8);
    final String outputPath = this.getClass().getResource("/").getPath() + "load";
    final String failurePath = this.getClass().getResource("/").getPath() + "failure";
    final File file = new File(outputPath);
    if (file.exists()) {
        FileUtils.forceDelete(file);
    }
    final RDD<Element> elementRDD = sparkContext.parallelize(elements, 8, ELEMENT_CLASS_TAG);
    final ImportRDDOfElements addRdd = new ImportRDDOfElements.Builder()
            .sparkContext(sparkContext)
            .input(elementRDD)
            .option("outputPath", outputPath)
            .option("failurePath", failurePath)
            .build();
    graph1.execute(addRdd, user);
    FileUtils.forceDelete(file);
    // Check all elements were added
    final GetRDDOfAllElements rddQuery = new GetRDDOfAllElements.Builder()
            .sparkContext(sparkContext)
            .option(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, configurationString)
            .build();
    final RDD<Element> rdd = graph1.execute(rddQuery, user);
    if (rdd == null) {
        fail("No RDD returned");
    }
    final Set<Element> results = new HashSet<>();
    final Element[] returnedElements = (Element[]) rdd.collect();
    for (final Element element : returnedElements) {
        results.add(element);
    }
    assertEquals(elements.size(), results.size());
    sparkContext.stop();
}
Also used : Entity(uk.gov.gchq.gaffer.data.element.Entity) User(uk.gov.gchq.gaffer.user.User) Configuration(org.apache.hadoop.conf.Configuration) DataOutputStream(java.io.DataOutputStream) Element(uk.gov.gchq.gaffer.data.element.Element) ByteArrayOutputStream(org.apache.commons.io.output.ByteArrayOutputStream) Graph(uk.gov.gchq.gaffer.graph.Graph) SparkContext(org.apache.spark.SparkContext) ImportRDDOfElements(uk.gov.gchq.gaffer.spark.operation.scalardd.ImportRDDOfElements) GetRDDOfAllElements(uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfAllElements) ArrayBuffer(scala.collection.mutable.ArrayBuffer) Edge(uk.gov.gchq.gaffer.data.element.Edge) SparkConf(org.apache.spark.SparkConf) File(java.io.File) HashSet(java.util.HashSet) Test(org.junit.Test)
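
The test above passes an ELEMENT_CLASS_TAG that is never defined in the snippet. Scala's RDD API requires a ClassTag argument when called from Java; a minimal sketch of how such a constant can be built (mirroring the ClassTagConstants class the handler examples below refer to) looks like this:

import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;

import uk.gov.gchq.gaffer.data.element.Element;

public final class ClassTagConstants {

    // ClassTag that Scala's RDD methods need when they are invoked from Java
    public static final ClassTag<Element> ELEMENT_CLASS_TAG = ClassTag$.MODULE$.apply(Element.class);

    private ClassTagConstants() {
        // constants holder, not instantiable
    }
}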

Example 7 with SparkContext

Use of org.apache.spark.SparkContext in project Gaffer by gchq.

The class GetRDDOfAllElementsHandler, method doOperation:

private RDD<Element> doOperation(final GetRDDOfAllElements operation, final Context context, final AccumuloStore accumuloStore) throws OperationException {
    final SparkContext sparkContext = operation.getSparkContext();
    final Configuration conf = getConfiguration(operation);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    // Read (Element, NullWritable) pairs out of Accumulo via the Hadoop input format
    final RDD<Tuple2<Element, NullWritable>> pairRDD = sparkContext.newAPIHadoopRDD(conf, ElementInputFormat.class, Element.class, NullWritable.class);
    // Keep only the Element from each pair
    return pairRDD.map(new FirstElement(), ClassTagConstants.ELEMENT_CLASS_TAG);
}
Also used : SparkContext(org.apache.spark.SparkContext) Configuration(org.apache.hadoop.conf.Configuration) Tuple2(scala.Tuple2)
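
The FirstElement mapper is not shown in the snippet. It maps each (Element, NullWritable) pair to its first component; a minimal sketch, assuming it extends Scala's AbstractFunction1 so the Scala RDD can invoke it, and implements Serializable so Spark can ship it to executors:

import java.io.Serializable;

import org.apache.hadoop.io.NullWritable;

import scala.Tuple2;
import scala.runtime.AbstractFunction1;

import uk.gov.gchq.gaffer.data.element.Element;

public class FirstElement extends AbstractFunction1<Tuple2<Element, NullWritable>, Element> implements Serializable {

    @Override
    public Element apply(final Tuple2<Element, NullWritable> pair) {
        // keep the Element, drop the NullWritable placeholder value
        return pair._1();
    }
}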

Example 8 with SparkContext

Use of org.apache.spark.SparkContext in project Gaffer by gchq.

The class GetRDDOfElementsHandler, method doOperation:

private RDD<Element> doOperation(final GetRDDOfElements operation, final Context context, final AccumuloStore accumuloStore) throws OperationException {
    final SparkContext sparkContext = operation.getSparkContext();
    final Configuration conf = getConfiguration(operation);
    // Use batch scan option when performing seeded operation
    InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    // Restrict the scan to the ranges derived from the operation's seeds
    addRanges(accumuloStore, conf, operation);
    final RDD<Tuple2<Element, NullWritable>> pairRDD = sparkContext.newAPIHadoopRDD(conf, ElementInputFormat.class, Element.class, NullWritable.class);
    return pairRDD.map(new FirstElement(), ClassTagConstants.ELEMENT_CLASS_TAG);
}
Also used : SparkContext(org.apache.spark.SparkContext) Configuration(org.apache.hadoop.conf.Configuration) Tuple2(scala.Tuple2)
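
The addRanges call is what distinguishes this seeded handler from GetRDDOfAllElements: it restricts the Accumulo scan to ranges derived from the operation's seeds. Below is a hedged sketch of the underlying mechanism using Accumulo's InputConfigurator directly; the seed-to-range conversion is illustrative rather than Gaffer's actual logic, and the InputConfigurator package path varies between Accumulo versions:

import java.util.ArrayList;
import java.util.List;

import org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat;
import org.apache.accumulo.core.client.mapreduce.lib.impl.InputConfigurator;
import org.apache.accumulo.core.data.Range;
import org.apache.hadoop.conf.Configuration;

public final class SeedRangeExample {

    // Attach one exact-row range per seed to the scan configuration (illustrative only)
    public static void setSeedRanges(final Configuration conf, final Iterable<String> seeds) {
        final List<Range> ranges = new ArrayList<>();
        for (final String seed : seeds) {
            ranges.add(Range.exact(seed));
        }
        InputConfigurator.setRanges(AccumuloInputFormat.class, conf, ranges);
    }
}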

Example 9 with SparkContext

Use of org.apache.spark.SparkContext in project zeppelin by apache.

The class SparkInterpreter, method createSparkContext_1:

public SparkContext createSparkContext_1() {
    logger.info("------ Create new SparkContext {} -------", getProperty("master"));
    String execUri = System.getenv("SPARK_EXECUTOR_URI");
    String[] jars = null;
    if (Utils.isScala2_10()) {
        jars = (String[]) Utils.invokeStaticMethod(SparkILoop.class, "getAddedJars");
    } else {
        jars = (String[]) Utils.invokeStaticMethod(Utils.findClass("org.apache.spark.repl.Main"), "getAddedJars");
    }
    String classServerUri = null;
    String replClassOutputDirectory = null;
    try {
        // for Spark 1.1.x and 1.2.x
        Method classServer = intp.getClass().getMethod("classServer");
        Object httpServer = classServer.invoke(intp);
        classServerUri = (String) Utils.invokeMethod(httpServer, "uri");
    } catch (NoSuchMethodException | SecurityException | IllegalAccessException | IllegalArgumentException | InvocationTargetException e) {
    // continue
    }
    if (classServerUri == null) {
        try {
            // for Spark 1.3.x
            Method classServer = intp.getClass().getMethod("classServerUri");
            classServerUri = (String) classServer.invoke(intp);
        } catch (NoSuchMethodException | SecurityException | IllegalAccessException | IllegalArgumentException | InvocationTargetException e) {
            // Newer Spark versions (such as the patched CDH 5.7.0 build) don't have this
            // method, so continue instead of throwing an InterpreterException
            logger.warn(String.format("Spark method classServerUri not available due to: [%s]", e.getMessage()));
        }
    }
    if (classServerUri == null) {
        try {
            // for RpcEnv
            Method getClassOutputDirectory = intp.getClass().getMethod("getClassOutputDirectory");
            File classOutputDirectory = (File) getClassOutputDirectory.invoke(intp);
            replClassOutputDirectory = classOutputDirectory.getAbsolutePath();
        } catch (NoSuchMethodException | SecurityException | IllegalAccessException | IllegalArgumentException | InvocationTargetException e) {
        // continue
        }
    }
    if (Utils.isScala2_11()) {
        classServer = createHttpServer(outputDir);
        Utils.invokeMethod(classServer, "start");
        classServerUri = (String) Utils.invokeMethod(classServer, "uri");
    }
    conf.setMaster(getProperty("master")).setAppName(getProperty("spark.app.name"));
    if (classServerUri != null) {
        conf.set("spark.repl.class.uri", classServerUri);
    }
    if (replClassOutputDirectory != null) {
        conf.set("spark.repl.class.outputDir", replClassOutputDirectory);
    }
    if (jars.length > 0) {
        conf.setJars(jars);
    }
    if (execUri != null) {
        conf.set("spark.executor.uri", execUri);
    }
    if (System.getenv("SPARK_HOME") != null) {
        conf.setSparkHome(System.getenv("SPARK_HOME"));
    }
    conf.set("spark.scheduler.mode", "FAIR");
    Properties intpProperty = getProperty();
    for (Object k : intpProperty.keySet()) {
        String key = (String) k;
        String val = toString(intpProperty.get(key));
        if (key.startsWith("spark.") && !val.trim().isEmpty()) {
            logger.debug(String.format("SparkConf: key = [%s], value = [%s]", key, val));
            conf.set(key, val);
        }
    }
    setupConfForPySpark(conf);
    setupConfForSparkR(conf);
    return new SparkContext(conf);
}
Also used : Method(java.lang.reflect.Method) InvocationTargetException(java.lang.reflect.InvocationTargetException) SparkContext(org.apache.spark.SparkContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) AbstractFile(scala.reflect.io.AbstractFile) File(java.io.File)
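
Most of the method above is a version probe: it tries one reflective accessor after another and falls through when a given Spark build lacks the method. Distilled into a standalone helper (a sketch; Zeppelin's own Utils class is not shown here):

import java.lang.reflect.Method;

public final class ReflectionProbe {

    // Try each zero-argument method name in turn and return the first result that
    // works, or null when no candidate exists on the Spark version at hand.
    public static Object firstAvailable(final Object target, final String... methodNames) {
        for (final String name : methodNames) {
            try {
                final Method method = target.getClass().getMethod(name);
                return method.invoke(target);
            } catch (final ReflectiveOperationException e) {
                // this build doesn't expose the method; try the next candidate
            }
        }
        return null;
    }
}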

Example 10 with SparkContext

Use of org.apache.spark.SparkContext in project zeppelin by apache.

The class SparkSqlInterpreter, method interpret:

@Override
public InterpreterResult interpret(String st, InterpreterContext context) {
    SparkInterpreter sparkInterpreter = getSparkInterpreter();
    if (sparkInterpreter.getSparkVersion().isUnsupportedVersion()) {
        return new InterpreterResult(Code.ERROR, "Spark " + sparkInterpreter.getSparkVersion().toString() + " is not supported");
    }
    sparkInterpreter.populateSparkWebUrl(context);
    SQLContext sqlc = sparkInterpreter.getSQLContext();
    SparkContext sc = sqlc.sparkContext();
    if (concurrentSQL()) {
        sc.setLocalProperty("spark.scheduler.pool", "fair");
    } else {
        sc.setLocalProperty("spark.scheduler.pool", null);
    }
    sc.setJobGroup(Utils.buildJobGroupId(context), "Zeppelin", false);
    Object rdd = null;
    try {
        // The signature of sqlc.sql() changed from
        //   def sql(sqlText: String): SchemaRDD   (1.2 and prior)
        // to
        //   def sql(sqlText: String): DataFrame   (1.3 and later),
        // so reflection is used to stay binary-compatible across Spark versions.
        Method sqlMethod = sqlc.getClass().getMethod("sql", String.class);
        rdd = sqlMethod.invoke(sqlc, st);
    } catch (InvocationTargetException ite) {
        if (Boolean.parseBoolean(getProperty("zeppelin.spark.sql.stacktrace"))) {
            throw new InterpreterException(ite);
        }
        logger.error("Invocation target exception", ite);
        String msg = ite.getTargetException().getMessage() + "\nset zeppelin.spark.sql.stacktrace = true to see full stacktrace";
        return new InterpreterResult(Code.ERROR, msg);
    } catch (NoSuchMethodException | SecurityException | IllegalAccessException | IllegalArgumentException e) {
        throw new InterpreterException(e);
    }
    String msg = ZeppelinContext.showDF(sc, context, rdd, maxResult);
    sc.clearJobGroup();
    return new InterpreterResult(Code.SUCCESS, msg);
}
Also used : InterpreterException(org.apache.zeppelin.interpreter.InterpreterException) InterpreterResult(org.apache.zeppelin.interpreter.InterpreterResult) Method(java.lang.reflect.Method) InvocationTargetException(java.lang.reflect.InvocationTargetException) SparkContext(org.apache.spark.SparkContext) SQLContext(org.apache.spark.sql.SQLContext)
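
The reflective sql() lookup is the core of the version shim: resolved at runtime, the same bytecode runs whether the method returns a SchemaRDD (Spark 1.2 and prior) or a DataFrame (Spark 1.3 and later). A minimal standalone sketch of the trick:

import java.lang.reflect.Method;

public final class SqlShim {

    // Resolve sql(String) at runtime so the caller never links against the
    // concrete return type, which differs across Spark versions.
    public static Object runSql(final Object sqlContext, final String statement) throws ReflectiveOperationException {
        final Method sqlMethod = sqlContext.getClass().getMethod("sql", String.class);
        return sqlMethod.invoke(sqlContext, statement);
    }
}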

Aggregations

SparkContext (org.apache.spark.SparkContext): 15 uses
Configuration (org.apache.hadoop.conf.Configuration): 6 uses
SparkConf (org.apache.spark.SparkConf): 5 uses
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 5 uses
Test (org.junit.Test): 5 uses
Graph (uk.gov.gchq.gaffer.graph.Graph): 5 uses
DataOutputStream (java.io.DataOutputStream): 4 uses
HashSet (java.util.HashSet): 4 uses
ByteArrayOutputStream (org.apache.commons.io.output.ByteArrayOutputStream): 4 uses
Edge (uk.gov.gchq.gaffer.data.element.Edge): 4 uses
Element (uk.gov.gchq.gaffer.data.element.Element): 4 uses
Entity (uk.gov.gchq.gaffer.data.element.Entity): 4 uses
User (uk.gov.gchq.gaffer.user.User): 4 uses
File (java.io.File): 3 uses
SQLContext (org.apache.spark.sql.SQLContext): 3 uses
Tuple2 (scala.Tuple2): 3 uses
InvocationTargetException (java.lang.reflect.InvocationTargetException): 2 uses
Method (java.lang.reflect.Method): 2 uses
ArrayList (java.util.ArrayList): 2 uses
ArrayBuffer (scala.collection.mutable.ArrayBuffer): 2 uses