Search in sources:

Example 66 with SparkConf

use of org.apache.spark.SparkConf in project Gaffer by gchq.

the class GetJavaRDDOfElementsHandlerTest method checkGetCorrectElementsInJavaRDDForEntitySeed.

@Test
public void checkGetCorrectElementsInJavaRDDForEntitySeed() throws OperationException, IOException {
    final Graph graph1 = new Graph.Builder()
            .addSchema(getClass().getResourceAsStream("/schema/dataSchema.json"))
            .addSchema(getClass().getResourceAsStream("/schema/dataTypes.json"))
            .addSchema(getClass().getResourceAsStream("/schema/storeTypes.json"))
            .storeProperties(getClass().getResourceAsStream("/store.properties"))
            .build();
    final List<Element> elements = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
        final Entity entity = new Entity(ENTITY_GROUP);
        entity.setVertex("" + i);
        final Edge edge1 = new Edge(EDGE_GROUP);
        edge1.setSource("" + i);
        edge1.setDestination("B");
        edge1.setDirected(false);
        edge1.putProperty("count", 2);
        final Edge edge2 = new Edge(EDGE_GROUP);
        edge2.setSource("" + i);
        edge2.setDestination("C");
        edge2.setDirected(false);
        edge2.putProperty("count", 4);
        elements.add(edge1);
        elements.add(edge2);
        elements.add(entity);
    }
    final User user = new User();
    graph1.execute(new AddElements(elements), user);
    final SparkConf sparkConf = new SparkConf()
            .setMaster("local")
            .setAppName("testCheckGetCorrectElementsInJavaRDDForEntitySeed")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .set("spark.kryo.registrator", "uk.gov.gchq.gaffer.spark.serialisation.kryo.Registrator")
            .set("spark.driver.allowMultipleContexts", "true");
    final JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    // Create Hadoop configuration and serialise to a string
    final Configuration configuration = new Configuration();
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    configuration.write(new DataOutputStream(baos));
    final String configurationString = new String(baos.toByteArray(), CommonConstants.UTF_8);
    // Check get correct edges for "1"
    GetJavaRDDOfElements<EntitySeed> rddQuery = new GetJavaRDDOfElements.Builder<EntitySeed>()
            .javaSparkContext(sparkContext)
            .seeds(Collections.singleton(new EntitySeed("1")))
            .build();
    rddQuery.addOption(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, configurationString);
    JavaRDD<Element> rdd = graph1.execute(rddQuery, user);
    if (rdd == null) {
        fail("No RDD returned");
    }
    final Set<Element> results = new HashSet<>(rdd.collect());
    final Set<Element> expectedElements = new HashSet<>();
    final Entity entity1 = new Entity(ENTITY_GROUP);
    entity1.setVertex("1");
    final Edge edge1B = new Edge(EDGE_GROUP);
    edge1B.setSource("1");
    edge1B.setDestination("B");
    edge1B.setDirected(false);
    edge1B.putProperty("count", 2);
    final Edge edge1C = new Edge(EDGE_GROUP);
    edge1C.setSource("1");
    edge1C.setDestination("C");
    edge1C.setDirected(false);
    edge1C.putProperty("count", 4);
    expectedElements.add(entity1);
    expectedElements.add(edge1B);
    expectedElements.add(edge1C);
    assertEquals(expectedElements, results);
    // Check get correct edges for "1" when specify entities only
    rddQuery = new GetJavaRDDOfElements.Builder<EntitySeed>().javaSparkContext(sparkContext).seeds(Collections.singleton(new EntitySeed("1"))).view(new View.Builder().entity(ENTITY_GROUP).build()).build();
    rddQuery.addOption(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, configurationString);
    rdd = graph1.execute(rddQuery, user);
    if (rdd == null) {
        fail("No RDD returned");
    }
    results.clear();
    results.addAll(rdd.collect());
    expectedElements.clear();
    expectedElements.add(entity1);
    assertEquals(expectedElements, results);
    // Check get correct edges for "1" when specify edges only
    rddQuery = new GetJavaRDDOfElements.Builder<EntitySeed>().javaSparkContext(sparkContext).seeds(Collections.singleton(new EntitySeed("1"))).view(new View.Builder().edge(EDGE_GROUP).build()).build();
    rddQuery.addOption(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, configurationString);
    rdd = graph1.execute(rddQuery, user);
    if (rdd == null) {
        fail("No RDD returned");
    }
    results.clear();
    results.addAll(rdd.collect());
    expectedElements.clear();
    expectedElements.add(edge1B);
    expectedElements.add(edge1C);
    assertEquals(expectedElements, results);
    // Check get correct edges for "1" and "5"
    Set<EntitySeed> seeds = new HashSet<>();
    seeds.add(new EntitySeed("1"));
    seeds.add(new EntitySeed("5"));
    rddQuery = new GetJavaRDDOfElements.Builder<EntitySeed>()
            .javaSparkContext(sparkContext)
            .seeds(seeds)
            .build();
    rddQuery.addOption(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, configurationString);
    rdd = graph1.execute(rddQuery, user);
    if (rdd == null) {
        fail("No RDD returned");
    }
    results.clear();
    results.addAll(rdd.collect());
    final Entity entity5 = new Entity(ENTITY_GROUP);
    entity5.setVertex("5");
    final Edge edge5B = new Edge(EDGE_GROUP);
    edge5B.setSource("5");
    edge5B.setDestination("B");
    edge5B.setDirected(false);
    edge5B.putProperty("count", 2);
    final Edge edge5C = new Edge(EDGE_GROUP);
    edge5C.setSource("5");
    edge5C.setDestination("C");
    edge5C.setDirected(false);
    edge5C.putProperty("count", 4);
    expectedElements.clear();
    expectedElements.add(entity1);
    expectedElements.add(edge1B);
    expectedElements.add(edge1C);
    expectedElements.add(entity5);
    expectedElements.add(edge5B);
    expectedElements.add(edge5C);
    assertEquals(expectedElements, results);
    sparkContext.stop();
}
Also used : AddElements(uk.gov.gchq.gaffer.operation.impl.add.AddElements) Entity(uk.gov.gchq.gaffer.data.element.Entity) User(uk.gov.gchq.gaffer.user.User) Configuration(org.apache.hadoop.conf.Configuration) DataOutputStream(java.io.DataOutputStream) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) GetJavaRDDOfElements(uk.gov.gchq.gaffer.spark.operation.javardd.GetJavaRDDOfElements) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) HashSet(java.util.HashSet) ByteArrayOutputStream(org.apache.commons.io.output.ByteArrayOutputStream) View(uk.gov.gchq.gaffer.data.elementdefinition.view.View) Graph(uk.gov.gchq.gaffer.graph.Graph) EntitySeed(uk.gov.gchq.gaffer.operation.data.EntitySeed) Edge(uk.gov.gchq.gaffer.data.element.Edge) SparkConf(org.apache.spark.SparkConf) Test(org.junit.Test)
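
The test above serialises the Hadoop Configuration to a UTF-8 string so that it can be passed to the operation handler as an option. A minimal sketch of that round trip, independent of Gaffer (the class and method names here are illustrative, not part of the project):

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.hadoop.conf.Configuration;

// Illustrative helper: serialise a Configuration to a string and back,
// mirroring what the test does before setting HADOOP_CONFIGURATION_KEY.
public final class ConfigurationStrings {

    static String toString(final Configuration conf) throws IOException {
        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
        conf.write(new DataOutputStream(baos));
        return new String(baos.toByteArray(), StandardCharsets.UTF_8);
    }

    static Configuration fromString(final String serialised) throws IOException {
        final Configuration conf = new Configuration();
        conf.readFields(new DataInputStream(new ByteArrayInputStream(serialised.getBytes(StandardCharsets.UTF_8))));
        return conf;
    }
}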

Example 67 with SparkConf

use of org.apache.spark.SparkConf in project mongo-hadoop by mongodb.

the class DataframeExample method run.

public void run() {
    JavaSparkContext sc = new JavaSparkContext(new SparkConf());
    // Set configuration options for the MongoDB Hadoop Connector.
    Configuration mongodbConfig = new Configuration();
    // MongoInputFormat allows us to read from a live MongoDB instance.
    // We could also use BSONFileInputFormat to read BSON snapshots.
    mongodbConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat");
    // MongoDB connection string naming a collection to use.
    // If using BSON, use "mapred.input.dir" to configure the directory
    // where BSON files are located instead.
    mongodbConfig.set("mongo.input.uri", "mongodb://localhost:27017/enron_mail.messages");
    // Create an RDD backed by the MongoDB collection.
    JavaPairRDD<Object, BSONObject> documents = sc.newAPIHadoopRDD(
            mongodbConfig,           // Configuration
            MongoInputFormat.class,  // InputFormat: read from a live cluster
            Object.class,            // Key class
            BSONObject.class);       // Value class
    JavaRDD<Message> messages = documents.map(new Function<Tuple2<Object, BSONObject>, Message>() {

        public Message call(final Tuple2<Object, BSONObject> tuple) {
            Message m = new Message();
            BSONObject header = (BSONObject) tuple._2().get("headers");
            m.setTo((String) header.get("To"));
            m.setxFrom((String) header.get("From"));
            m.setMessageID((String) header.get("Message-ID"));
            m.setBody((String) tuple._2().get("body"));
            return m;
        }
    });
    SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
    DataFrame messagesSchema = sqlContext.createDataFrame(messages, Message.class);
    messagesSchema.registerTempTable("messages");
    DataFrame ericsMessages = sqlContext.sql("SELECT to, body FROM messages WHERE to = \"eric.bass@enron.com\"");
    ericsMessages.show();
    messagesSchema.printSchema();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) BSONObject(org.bson.BSONObject) DataFrame(org.apache.spark.sql.DataFrame) Tuple2(scala.Tuple2) BSONObject(org.bson.BSONObject) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf) SQLContext(org.apache.spark.sql.SQLContext)
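
The SQL string in the example can also be written with the DataFrame API. A minimal sketch, assuming the messagesSchema DataFrame built above (Spark 1.x API, as used in this project):

// Same filter as the SQL query above, expressed through the DataFrame API.
DataFrame ericsMessagesDf = messagesSchema
        .filter(messagesSchema.col("to").equalTo("eric.bass@enron.com"))
        .select("to", "body");
ericsMessagesDf.show();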

Example 68 with SparkConf

use of org.apache.spark.SparkConf in project azure-tools-for-java by Microsoft.

the class JavaSparkPi method main.

public static void main(String[] args) throws Exception {
    // use this line if you want to run your application in the cluster
    // SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi");
    SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi").setMaster("local[2]");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    int slices = (args.length == 1) ? Integer.parseInt(args[0]) : 2;
    int n = 100000 * slices;
    List<Integer> l = new ArrayList<Integer>(n);
    for (int i = 0; i < n; i++) {
        l.add(i);
    }
    JavaRDD<Integer> dataSet = jsc.parallelize(l, slices);
    int count = dataSet.map(new Function<Integer, Integer>() {

        @Override
        public Integer call(Integer integer) {
            double x = Math.random() * 2 - 1;
            double y = Math.random() * 2 - 1;
            return (x * x + y * y < 1) ? 1 : 0;
        }
    }).reduce(new Function2<Integer, Integer, Integer>() {

        @Override
        public Integer call(Integer integer, Integer integer2) {
            return integer + integer2;
        }
    });
    System.out.println("Pi is roughly " + 4.0 * count / n);
    jsc.stop();
}
Also used : Function(org.apache.spark.api.java.function.Function) ArrayList(java.util.ArrayList) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf)
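
On Java 8 the two anonymous Function classes above collapse into lambdas. A minimal sketch of the same estimate, reusing the jsc, l, slices and n variables from the example:

// Same Monte Carlo estimate with Java 8 lambdas instead of anonymous classes.
JavaRDD<Integer> dataSet = jsc.parallelize(l, slices);
int count = dataSet.map(i -> {
    double x = Math.random() * 2 - 1;
    double y = Math.random() * 2 - 1;
    return (x * x + y * y < 1) ? 1 : 0;
}).reduce((a, b) -> a + b);
System.out.println("Pi is roughly " + 4.0 * count / n);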

Example 69 with SparkConf

use of org.apache.spark.SparkConf in project beam by apache.

the class SparkContextFactory method createSparkContext.

private static JavaSparkContext createSparkContext(SparkContextOptions contextOptions) {
    if (usesProvidedSparkContext) {
        LOG.info("Using a provided Spark Context");
        JavaSparkContext jsc = contextOptions.getProvidedSparkContext();
        if (jsc == null || jsc.sc().isStopped()) {
            LOG.error("The provided Spark context " + jsc + " was not created or was stopped");
            throw new RuntimeException("The provided Spark context was not created or was stopped");
        }
        return jsc;
    } else {
        LOG.info("Creating a brand new Spark Context.");
        SparkConf conf = new SparkConf();
        if (!conf.contains("spark.master")) {
            // set master if not set.
            conf.setMaster(contextOptions.getSparkMaster());
        }
        conf.setAppName(contextOptions.getAppName());
        // register immutable collections serializers because the SDK uses them.
        conf.set("spark.kryo.registrator", BeamSparkRunnerRegistrator.class.getName());
        conf.set("spark.serializer", KryoSerializer.class.getName());
        return new JavaSparkContext(conf);
    }
}
Also used : BeamSparkRunnerRegistrator(org.apache.beam.runners.spark.coders.BeamSparkRunnerRegistrator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf) KryoSerializer(org.apache.spark.serializer.KryoSerializer)
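
The factory only reuses an existing context when one has been supplied through the pipeline options. A minimal sketch of the caller side, assuming SparkContextOptions exposes setters mirroring the getProvidedSparkContext() getter used above (the setter names are an assumption, not confirmed by this snippet):

// Hand a pre-built JavaSparkContext to the Spark runner instead of letting it create one.
SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("provided-context-example");
JavaSparkContext providedJsc = new JavaSparkContext(conf);
SparkContextOptions options = PipelineOptionsFactory.as(SparkContextOptions.class);
// Assumed setters corresponding to usesProvidedSparkContext / getProvidedSparkContext():
options.setUsesProvidedSparkContext(true);
options.setProvidedSparkContext(providedJsc);
// A pipeline run with these options takes the "provided" branch of createSparkContext.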

Example 70 with SparkConf

use of org.apache.spark.SparkConf in project gora by apache.

the class LogAnalyticsSpark method run.

public int run(String[] args) throws Exception {
    DataStore<Long, Pageview> inStore;
    DataStore<String, MetricDatum> outStore;
    Configuration hadoopConf = new Configuration();
    if (args.length > 0) {
        String dataStoreClass = args[0];
        inStore = DataStoreFactory.getDataStore(dataStoreClass, Long.class, Pageview.class, hadoopConf);
        if (args.length > 1) {
            dataStoreClass = args[1];
        }
        outStore = DataStoreFactory.getDataStore(dataStoreClass, String.class, MetricDatum.class, hadoopConf);
    } else {
        inStore = DataStoreFactory.getDataStore(Long.class, Pageview.class, hadoopConf);
        outStore = DataStoreFactory.getDataStore(String.class, MetricDatum.class, hadoopConf);
    }
    // Spark engine initialization
    GoraSparkEngine<Long, Pageview> goraSparkEngine = new GoraSparkEngine<>(Long.class, Pageview.class);
    SparkConf sparkConf = new SparkConf().setAppName("Gora Spark Integration Application").setMaster("local");
    Class[] c = new Class[1];
    c[0] = inStore.getPersistentClass();
    sparkConf.registerKryoClasses(c);
    //
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    JavaPairRDD<Long, Pageview> goraRDD = goraSparkEngine.initialize(sc, inStore);
    long count = goraRDD.count();
    log.info("Total Log Count: {}", count);
    JavaRDD<Tuple2<Tuple2<String, Long>, Long>> mappedGoraRdd = goraRDD.values().map(mapFunc);
    JavaPairRDD<String, MetricDatum> reducedGoraRdd = JavaPairRDD.fromJavaRDD(mappedGoraRdd).reduceByKey(redFunc).mapToPair(metricFunc);
    log.info("MetricDatum count: {}", reducedGoraRdd.count());
    // Print output for debugging purposes
    /*
    Map<String, MetricDatum> metricDatumMap = reducedGoraRdd.collectAsMap();
    for (String key : metricDatumMap.keySet()) {
      System.out.println(key);
    }
    */
    // Write output to the datastore
    Configuration sparkHadoopConf = goraSparkEngine.generateOutputConf(outStore);
    reducedGoraRdd.saveAsNewAPIHadoopDataset(sparkHadoopConf);
    inStore.close();
    outStore.close();
    log.info("Log completed with success");
    return 1;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) GoraSparkEngine(org.apache.gora.spark.GoraSparkEngine) MetricDatum(org.apache.gora.tutorial.log.generated.MetricDatum) Pageview(org.apache.gora.tutorial.log.generated.Pageview) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf)
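
The snippet refers to mapFunc, redFunc and metricFunc without showing them. An illustrative sketch of functions with compatible types, assuming the tutorial's Pageview bean exposes getUrl() and getTimestamp() (these are not the project's actual implementations; metricFunc, which turns each reduced pair into a MetricDatum keyed by metric name, is omitted):

// Illustrative only: shapes compatible with the map/reduceByKey calls above.
static Function<Pageview, Tuple2<Tuple2<String, Long>, Long>> mapFunc =
        new Function<Pageview, Tuple2<Tuple2<String, Long>, Long>>() {
            @Override
            public Tuple2<Tuple2<String, Long>, Long> call(final Pageview pageview) {
                // Key each page view by (url, timestamp) and count it once.
                final String url = pageview.getUrl().toString();
                final Long timestamp = pageview.getTimestamp();
                return new Tuple2<>(new Tuple2<>(url, timestamp), 1L);
            }
        };

static Function2<Long, Long, Long> redFunc = new Function2<Long, Long, Long>() {
    @Override
    public Long call(final Long a, final Long b) {
        return a + b;
    }
};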

Aggregations

SparkConf (org.apache.spark.SparkConf)83 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)46 Test (org.junit.Test)21 ArrayList (java.util.ArrayList)20 Configuration (org.apache.hadoop.conf.Configuration)20 Tuple2 (scala.Tuple2)15 Graph (uk.gov.gchq.gaffer.graph.Graph)13 DataOutputStream (java.io.DataOutputStream)11 File (java.io.File)10 HashSet (java.util.HashSet)10 ByteArrayOutputStream (org.apache.commons.io.output.ByteArrayOutputStream)10 Edge (uk.gov.gchq.gaffer.data.element.Edge)10 Element (uk.gov.gchq.gaffer.data.element.Element)10 Entity (uk.gov.gchq.gaffer.data.element.Entity)10 User (uk.gov.gchq.gaffer.user.User)10 Ignore (org.junit.Ignore)6 HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration)5 JavaHBaseContext (org.apache.hadoop.hbase.spark.JavaHBaseContext)5 Test (org.testng.annotations.Test)5 AddElements (uk.gov.gchq.gaffer.operation.impl.add.AddElements)5