Use of org.apache.spark.SparkConf in project Gaffer by gchq: class GetJavaRDDOfElementsHandlerTest, method checkGetCorrectElementsInJavaRDDForEntitySeed.
@Test
public void checkGetCorrectElementsInJavaRDDForEntitySeed() throws OperationException, IOException {
    final Graph graph1 = new Graph.Builder()
            .addSchema(getClass().getResourceAsStream("/schema/dataSchema.json"))
            .addSchema(getClass().getResourceAsStream("/schema/dataTypes.json"))
            .addSchema(getClass().getResourceAsStream("/schema/storeTypes.json"))
            .storeProperties(getClass().getResourceAsStream("/store.properties"))
            .build();
    final List<Element> elements = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
        final Entity entity = new Entity(ENTITY_GROUP);
        entity.setVertex("" + i);
        final Edge edge1 = new Edge(EDGE_GROUP);
        edge1.setSource("" + i);
        edge1.setDestination("B");
        edge1.setDirected(false);
        edge1.putProperty("count", 2);
        final Edge edge2 = new Edge(EDGE_GROUP);
        edge2.setSource("" + i);
        edge2.setDestination("C");
        edge2.setDirected(false);
        edge2.putProperty("count", 4);
        elements.add(edge1);
        elements.add(edge2);
        elements.add(entity);
    }
    final User user = new User();
    graph1.execute(new AddElements(elements), user);
    final SparkConf sparkConf = new SparkConf()
            .setMaster("local")
            .setAppName("testCheckGetCorrectElementsInJavaRDDForEntitySeed")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .set("spark.kryo.registrator", "uk.gov.gchq.gaffer.spark.serialisation.kryo.Registrator")
            .set("spark.driver.allowMultipleContexts", "true");
    final JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    // Create a Hadoop configuration and serialise it to a string
    final Configuration configuration = new Configuration();
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    configuration.write(new DataOutputStream(baos));
    final String configurationString = new String(baos.toByteArray(), CommonConstants.UTF_8);
    // Check that the correct elements are returned for seed "1"
    GetJavaRDDOfElements<EntitySeed> rddQuery = new GetJavaRDDOfElements.Builder<EntitySeed>()
            .javaSparkContext(sparkContext)
            .seeds(Collections.singleton(new EntitySeed("1")))
            .build();
    rddQuery.addOption(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, configurationString);
    JavaRDD<Element> rdd = graph1.execute(rddQuery, user);
    if (rdd == null) {
        fail("No RDD returned");
    }
    final Set<Element> results = new HashSet<>(rdd.collect());
    final Set<Element> expectedElements = new HashSet<>();
    final Entity entity1 = new Entity(ENTITY_GROUP);
    entity1.setVertex("1");
    final Edge edge1B = new Edge(EDGE_GROUP);
    edge1B.setSource("1");
    edge1B.setDestination("B");
    edge1B.setDirected(false);
    edge1B.putProperty("count", 2);
    final Edge edge1C = new Edge(EDGE_GROUP);
    edge1C.setSource("1");
    edge1C.setDestination("C");
    edge1C.setDirected(false);
    edge1C.putProperty("count", 4);
    expectedElements.add(entity1);
    expectedElements.add(edge1B);
    expectedElements.add(edge1C);
    assertEquals(expectedElements, results);
    // Check that the correct elements are returned for seed "1" when the view contains entities only
    rddQuery = new GetJavaRDDOfElements.Builder<EntitySeed>()
            .javaSparkContext(sparkContext)
            .seeds(Collections.singleton(new EntitySeed("1")))
            .view(new View.Builder().entity(ENTITY_GROUP).build())
            .build();
    rddQuery.addOption(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, configurationString);
    rdd = graph1.execute(rddQuery, user);
    if (rdd == null) {
        fail("No RDD returned");
    }
    results.clear();
    results.addAll(rdd.collect());
    expectedElements.clear();
    expectedElements.add(entity1);
    assertEquals(expectedElements, results);
    // Check that the correct elements are returned for seed "1" when the view contains edges only
    rddQuery = new GetJavaRDDOfElements.Builder<EntitySeed>()
            .javaSparkContext(sparkContext)
            .seeds(Collections.singleton(new EntitySeed("1")))
            .view(new View.Builder().edge(EDGE_GROUP).build())
            .build();
    rddQuery.addOption(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, configurationString);
    rdd = graph1.execute(rddQuery, user);
    if (rdd == null) {
        fail("No RDD returned");
    }
    results.clear();
    results.addAll(rdd.collect());
    expectedElements.clear();
    expectedElements.add(edge1B);
    expectedElements.add(edge1C);
    assertEquals(expectedElements, results);
    // Check that the correct elements are returned for seeds "1" and "5"
    Set<EntitySeed> seeds = new HashSet<>();
    seeds.add(new EntitySeed("1"));
    seeds.add(new EntitySeed("5"));
    rddQuery = new GetJavaRDDOfElements.Builder<EntitySeed>()
            .javaSparkContext(sparkContext)
            .seeds(seeds)
            .build();
    rddQuery.addOption(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, configurationString);
    rdd = graph1.execute(rddQuery, user);
    if (rdd == null) {
        fail("No RDD returned");
    }
    results.clear();
    results.addAll(rdd.collect());
    final Entity entity5 = new Entity(ENTITY_GROUP);
    entity5.setVertex("5");
    final Edge edge5B = new Edge(EDGE_GROUP);
    edge5B.setSource("5");
    edge5B.setDestination("B");
    edge5B.setDirected(false);
    edge5B.putProperty("count", 2);
    final Edge edge5C = new Edge(EDGE_GROUP);
    edge5C.setSource("5");
    edge5C.setDestination("C");
    edge5C.setDirected(false);
    edge5C.putProperty("count", 4);
    expectedElements.clear();
    expectedElements.add(entity1);
    expectedElements.add(edge1B);
    expectedElements.add(edge1C);
    expectedElements.add(entity5);
    expectedElements.add(edge5B);
    expectedElements.add(edge5C);
    assertEquals(expectedElements, results);
    sparkContext.stop();
}
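Note: as far as SparkConf is concerned, the heart of the test above is the Kryo serialiser, Gaffer's Kryo registrator, and the allowMultipleContexts flag. The following is a minimal standalone sketch of just that wiring; the class name and app name are made up for illustration, while the property keys and values are copied from the test.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// Minimal sketch (not part of the Gaffer test) of the SparkConf settings the test relies on.
public class LocalGafferSparkContextSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setMaster("local")                      // single-JVM master, as in the test
                .setAppName("gafferSparkConfSketch")     // hypothetical app name
                // Kryo serialisation with Gaffer's registrator, copied from the test above
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                .set("spark.kryo.registrator", "uk.gov.gchq.gaffer.spark.serialisation.kryo.Registrator")
                // Allows more than one SparkContext in the same JVM, as test suites sometimes need
                .set("spark.driver.allowMultipleContexts", "true");
        JavaSparkContext sparkContext = new JavaSparkContext(conf);
        try {
            // ... run GetJavaRDDOfElements operations against a Graph here ...
        } finally {
            sparkContext.stop();                         // always release the context
        }
    }
}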
Use of org.apache.spark.SparkConf in project mongo-hadoop by mongodb: class DataframeExample, method run.
public void run() {
    JavaSparkContext sc = new JavaSparkContext(new SparkConf());
    // Set configuration options for the MongoDB Hadoop Connector.
    Configuration mongodbConfig = new Configuration();
    // MongoInputFormat allows us to read from a live MongoDB instance.
    // We could also use BSONFileInputFormat to read BSON snapshots.
    mongodbConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat");
    // MongoDB connection string naming a collection to use.
    // If using BSON, use "mapred.input.dir" to configure the directory
    // where BSON files are located instead.
    mongodbConfig.set("mongo.input.uri", "mongodb://localhost:27017/enron_mail.messages");
    // Create an RDD backed by the MongoDB collection.
    JavaPairRDD<Object, BSONObject> documents = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class);        // Value class
    JavaRDD<Message> messages = documents.map(new Function<Tuple2<Object, BSONObject>, Message>() {
        public Message call(final Tuple2<Object, BSONObject> tuple) {
            Message m = new Message();
            BSONObject header = (BSONObject) tuple._2().get("headers");
            m.setTo((String) header.get("To"));
            m.setxFrom((String) header.get("From"));
            m.setMessageID((String) header.get("Message-ID"));
            m.setBody((String) tuple._2().get("body"));
            return m;
        }
    });
    SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
    DataFrame messagesSchema = sqlContext.createDataFrame(messages, Message.class);
    messagesSchema.registerTempTable("messages");
    DataFrame ericsMessages = sqlContext.sql("SELECT to, body FROM messages WHERE to = \"eric.bass@enron.com\"");
    ericsMessages.show();
    messagesSchema.printSchema();
}
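The SQLContext, DataFrame and registerTempTable calls above are the Spark 1.x SQL API. Purely as an illustration (not part of mongo-hadoop), the same query section could look roughly like this on Spark 2.x and later, where SparkSession and Dataset<Row> take their place; the Message bean and the messages RDD are assumed to be built exactly as in the example above.

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// Sketch only: a Spark 2.x+ take on the SQLContext/DataFrame section of run().
// `messages` is the JavaRDD<Message> produced by the map(...) call in the example above.
static void queryEricsMessages(SparkSession spark, JavaRDD<Message> messages) {
    Dataset<Row> messagesSchema = spark.createDataFrame(messages, Message.class);
    messagesSchema.createOrReplaceTempView("messages");   // replaces registerTempTable
    Dataset<Row> ericsMessages =
            spark.sql("SELECT to, body FROM messages WHERE to = \"eric.bass@enron.com\"");
    ericsMessages.show();
    messagesSchema.printSchema();
}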
Use of org.apache.spark.SparkConf in project azure-tools-for-java by Microsoft: class JavaSparkPi, method main.
public static void main(String[] args) throws Exception {
    // Use this line instead if you want to run your application in the cluster:
    // SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi");
    SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi").setMaster("local[2]");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    int slices = (args.length == 1) ? Integer.parseInt(args[0]) : 2;
    int n = 100000 * slices;
    List<Integer> l = new ArrayList<Integer>(n);
    for (int i = 0; i < n; i++) {
        l.add(i);
    }
    JavaRDD<Integer> dataSet = jsc.parallelize(l, slices);
    int count = dataSet.map(new Function<Integer, Integer>() {

        @Override
        public Integer call(Integer integer) {
            double x = Math.random() * 2 - 1;
            double y = Math.random() * 2 - 1;
            return (x * x + y * y < 1) ? 1 : 0;
        }
    }).reduce(new Function2<Integer, Integer, Integer>() {

        @Override
        public Integer call(Integer integer, Integer integer2) {
            return integer + integer2;
        }
    });
    System.out.println("Pi is roughly " + 4.0 * count / n);
    jsc.stop();
}
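For context, the estimate works because each of the n sampled points is uniform on the square [-1, 1] x [-1, 1] and lands inside the unit circle with probability pi/4, so pi is approximately 4 * count / n. On Java 8 and later the same map/reduce can be written with lambdas, which Spark's Java API accepts; the sketch below is a lambda rewrite of the method above, with a hypothetical class name.

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

// Sketch: the same Monte Carlo estimate written with Java 8 lambdas.
public class JavaSparkPiLambda {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPiLambda").setMaster("local[2]");
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
        int slices = (args.length == 1) ? Integer.parseInt(args[0]) : 2;
        int n = 100000 * slices;
        List<Integer> l = new ArrayList<>(n);
        for (int i = 0; i < n; i++) {
            l.add(i);
        }
        JavaRDD<Integer> dataSet = jsc.parallelize(l, slices);
        int count = dataSet.map(i -> {
            double x = Math.random() * 2 - 1;    // random point in [-1, 1] x [-1, 1]
            double y = Math.random() * 2 - 1;
            return (x * x + y * y < 1) ? 1 : 0;  // 1 if the point falls inside the unit circle
        }).reduce((a, b) -> a + b);              // total number of hits
        System.out.println("Pi is roughly " + 4.0 * count / n);
        jsc.stop();
    }
}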
Use of org.apache.spark.SparkConf in project beam by apache: class SparkContextFactory, method createSparkContext.
private static JavaSparkContext createSparkContext(SparkContextOptions contextOptions) {
    if (usesProvidedSparkContext) {
        LOG.info("Using a provided Spark Context");
        JavaSparkContext jsc = contextOptions.getProvidedSparkContext();
        if (jsc == null || jsc.sc().isStopped()) {
            LOG.error("The provided Spark context " + jsc + " was not created or was stopped");
            throw new RuntimeException("The provided Spark context was not created or was stopped");
        }
        return jsc;
    } else {
        LOG.info("Creating a brand new Spark Context.");
        SparkConf conf = new SparkConf();
        if (!conf.contains("spark.master")) {
            // Set master if not set.
            conf.setMaster(contextOptions.getSparkMaster());
        }
        conf.setAppName(contextOptions.getAppName());
        // Register immutable collections serializers because the SDK uses them.
        conf.set("spark.kryo.registrator", BeamSparkRunnerRegistrator.class.getName());
        conf.set("spark.serializer", KryoSerializer.class.getName());
        return new JavaSparkContext(conf);
    }
}
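The pattern worth noting here is that new SparkConf() picks up any spark.* properties already supplied (for example by spark-submit), so the factory only falls back to contextOptions.getSparkMaster() when spark.master is absent. Below is a small, Beam-independent sketch of the same defaulting pattern; the class name, method name, and argument values are illustrative only.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// Sketch (not Beam code): default the master only when spark-submit has not already set it.
public class DefaultingSparkConfSketch {
    static JavaSparkContext createContext(String fallbackMaster, String appName) {
        SparkConf conf = new SparkConf();       // reads spark.* system properties, e.g. from spark-submit
        if (!conf.contains("spark.master")) {
            conf.setMaster(fallbackMaster);     // e.g. "local[4]" for local runs
        }
        conf.setAppName(appName);
        return new JavaSparkContext(conf);
    }

    public static void main(String[] args) {
        JavaSparkContext jsc = createContext("local[4]", "defaulting-conf-sketch"); // hypothetical values
        System.out.println("Running against master: " + jsc.sc().master());
        jsc.stop();
    }
}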
Use of org.apache.spark.SparkConf in project gora by apache: class LogAnalyticsSpark, method run.
public int run(String[] args) throws Exception {
    DataStore<Long, Pageview> inStore;
    DataStore<String, MetricDatum> outStore;
    Configuration hadoopConf = new Configuration();
    if (args.length > 0) {
        String dataStoreClass = args[0];
        inStore = DataStoreFactory.getDataStore(dataStoreClass, Long.class, Pageview.class, hadoopConf);
        if (args.length > 1) {
            dataStoreClass = args[1];
        }
        outStore = DataStoreFactory.getDataStore(dataStoreClass, String.class, MetricDatum.class, hadoopConf);
    } else {
        inStore = DataStoreFactory.getDataStore(Long.class, Pageview.class, hadoopConf);
        outStore = DataStoreFactory.getDataStore(String.class, MetricDatum.class, hadoopConf);
    }
    // Spark engine initialization
    GoraSparkEngine<Long, Pageview> goraSparkEngine = new GoraSparkEngine<>(Long.class, Pageview.class);
    SparkConf sparkConf = new SparkConf()
            .setAppName("Gora Spark Integration Application")
            .setMaster("local");
    Class[] c = new Class[1];
    c[0] = inStore.getPersistentClass();
    sparkConf.registerKryoClasses(c);
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    JavaPairRDD<Long, Pageview> goraRDD = goraSparkEngine.initialize(sc, inStore);
    long count = goraRDD.count();
    log.info("Total Log Count: {}", count);
    JavaRDD<Tuple2<Tuple2<String, Long>, Long>> mappedGoraRdd = goraRDD.values().map(mapFunc);
    JavaPairRDD<String, MetricDatum> reducedGoraRdd = JavaPairRDD.fromJavaRDD(mappedGoraRdd)
            .reduceByKey(redFunc)
            .mapToPair(metricFunc);
    log.info("MetricDatum count: {}", reducedGoraRdd.count());
    // Print output for debugging purposes
    /*
    Map<String, MetricDatum> metricDatumMap = reducedGoraRdd.collectAsMap();
    for (String key : metricDatumMap.keySet()) {
        System.out.println(key);
    }
    */
    // Write output to the datastore
    Configuration sparkHadoopConf = goraSparkEngine.generateOutputConf(outStore);
    reducedGoraRdd.saveAsNewAPIHadoopDataset(sparkHadoopConf);
    inStore.close();
    outStore.close();
    log.info("Log completed with success");
    return 1;
}
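A note on the registerKryoClasses call above: it both switches spark.serializer to the Kryo serializer and appends the given classes to spark.kryo.classesToRegister, so the Gora persistent class is serialised efficiently across the cluster. The following standalone sketch shows the same registration pattern; MyRecord is a made-up bean standing in for inStore.getPersistentClass() (the Gora Pageview class), and the app name is hypothetical.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// Sketch: registering record classes with Kryo, as the Gora example does for its persistent class.
public class KryoRegistrationSketch {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf()
                .setAppName("kryo-registration-sketch")   // hypothetical app name
                .setMaster("local");
        // registerKryoClasses enables Kryo serialization and registers the given classes with it.
        sparkConf.registerKryoClasses(new Class<?>[] { MyRecord.class });
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        // ... build RDDs whose records are MyRecord instances ...
        sc.stop();
    }

    // Hypothetical record type standing in for the Gora Pageview bean.
    public static class MyRecord implements java.io.Serializable {
        public long timestamp;
        public String url;
    }
}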