Example 51 with JavaSparkContext

use of org.apache.spark.api.java.JavaSparkContext in project geode by apache.

the class RDDSaveJavaDemo method main.

public static void main(String[] argv) {
    if (argv.length != 1) {
        System.err.printf("Usage: RDDSaveJavaDemo <locators>\n");
        return;
    }
    SparkConf conf = new SparkConf().setAppName("RDDSaveJavaDemo");
    conf.set(GeodeLocatorPropKey, argv[0]);
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<String> data = new ArrayList<String>();
    data.add("abcdefg");
    data.add("abcdefgh");
    data.add("abcdefghi");
    JavaRDD<String> rdd = sc.parallelize(data);
    GeodeConnectionConf connConf = GeodeConnectionConf.apply(conf);
    PairFunction<String, String, Integer> func = new PairFunction<String, String, Integer>() {

        @Override
        public Tuple2<String, Integer> call(String s) throws Exception {
            return new Tuple2<String, Integer>(s, s.length());
        }
    };
    javaFunctions(rdd).saveToGeode("str_int_region", func, connConf);
    sc.stop();
}
Also used : GeodeConnectionConf(org.apache.geode.spark.connector.GeodeConnectionConf) Tuple2(scala.Tuple2) ArrayList(java.util.ArrayList) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) PairFunction(org.apache.spark.api.java.function.PairFunction) SparkConf(org.apache.spark.SparkConf)
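On Java 8 or newer, the anonymous PairFunction above collapses to a lambda. A minimal sketch (not from the original source), reusing the same rdd, region name, and connConf from the example:

// Same pairing logic as above, expressed as a lambda; toPair is a local name used only in this sketch.
PairFunction<String, String, Integer> toPair = s -> new Tuple2<>(s, s.length());
javaFunctions(rdd).saveToGeode("str_int_region", toPair, connConf);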

Example 52 with JavaSparkContext

use of org.apache.spark.api.java.JavaSparkContext in project gora by apache.

the class SparkWordCount method wordCount.

public int wordCount(DataStore<String, WebPage> inStore, DataStore<String, TokenDatum> outStore) throws IOException {
    // Spark engine initialization
    GoraSparkEngine<String, WebPage> goraSparkEngine = new GoraSparkEngine<>(String.class, WebPage.class);
    SparkConf sparkConf = new SparkConf().setAppName("Gora Spark Word Count Application").setMaster("local");
    Class[] c = new Class[1];
    c[0] = inStore.getPersistentClass();
    sparkConf.registerKryoClasses(c);
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    JavaPairRDD<String, WebPage> goraRDD = goraSparkEngine.initialize(sc, inStore);
    long count = goraRDD.count();
    log.info("Total Web page count: {}", count);
    JavaRDD<Tuple2<String, Long>> mappedGoraRdd = goraRDD.values().map(mapFunc);
    JavaPairRDD<String, Long> reducedGoraRdd = JavaPairRDD.fromJavaRDD(mappedGoraRdd).reduceByKey(redFunc);
    // Print the reduced output for debugging
    log.info("SparkWordCount debug purpose TokenDatum print starts:");
    Map<String, Long> tokenDatumMap = reducedGoraRdd.collectAsMap();
    for (Map.Entry<String, Long> entry : tokenDatumMap.entrySet()) {
        log.info(entry.getKey());
        log.info(entry.getValue().toString());
    }
    log.info("SparkWordCount debug purpose TokenDatum print ends:");
    // Write the word counts back to the output datastore
    Configuration sparkHadoopConf = goraSparkEngine.generateOutputConf(outStore);
    reducedGoraRdd.saveAsNewAPIHadoopDataset(sparkHadoopConf);
    return 1;
}
Also used : WebPage(org.apache.gora.examples.generated.WebPage) Configuration(org.apache.hadoop.conf.Configuration) GoraSparkEngine(org.apache.gora.spark.GoraSparkEngine) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf)
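The mapFunc and redFunc referenced above are defined elsewhere in SparkWordCount and are not shown in this snippet. A hypothetical sketch, matching only the types required by values().map(mapFunc) and reduceByKey(redFunc) (the WebPage accessor and the key chosen here are assumptions, not the real extraction logic):

// Hypothetical definitions; requires org.apache.spark.api.java.function.Function and Function2.
// Maps each WebPage to a (key, 1L) pair and sums counts per key.
// getUrl() is used only for illustration; the real code derives its tokens differently.
Function<WebPage, Tuple2<String, Long>> mapFunc = page ->
        new Tuple2<>(String.valueOf(page.getUrl()), 1L);
Function2<Long, Long, Long> redFunc = (a, b) -> a + b;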

Example 53 with JavaSparkContext

use of org.apache.spark.api.java.JavaSparkContext in project geode by apache.

the class JavaApiIntegrationTest method setUpBeforeClass.

@BeforeClass
public static void setUpBeforeClass() throws Exception {
    // start the Geode cluster and the Spark context
    Properties settings = new Properties();
    settings.setProperty(ConfigurationProperties.CACHE_XML_FILE, "src/it/resources/test-retrieve-regions.xml");
    settings.setProperty("num-of-servers", Integer.toString(numServers));
    int locatorPort = GeodeCluster$.MODULE$.start(settings);
    // start spark context in local mode
    Properties props = new Properties();
    props.put("log4j.logger.org.apache.spark", "INFO");
    props.put("log4j.logger.org.apache.geode.spark.connector", "DEBUG");
    IOUtils.configTestLog4j("ERROR", props);
    SparkConf conf = new SparkConf().setAppName("RetrieveRegionIntegrationTest").setMaster("local[2]").set(package$.MODULE$.GeodeLocatorPropKey(), "localhost:" + locatorPort);
    // sc = new SparkContext(conf);
    jsc = new JavaSparkContext(conf);
    connConf = GeodeConnectionConf.apply(jsc.getConf());
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) ConfigurationProperties(org.apache.geode.distributed.ConfigurationProperties) SparkConf(org.apache.spark.SparkConf) BeforeClass(org.junit.BeforeClass)
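The matching tear-down is not part of this snippet; a minimal sketch of what it typically looks like (the GeodeCluster stop call is an assumption, mirroring the start call above):

@AfterClass
public static void tearDownAfterClass() throws Exception {
    // Stop the Spark context first, then shut down the Geode cluster.
    // GeodeCluster$.MODULE$.stop() is assumed here to mirror start() above.
    if (jsc != null) {
        jsc.stop();
        jsc = null;
    }
    GeodeCluster$.MODULE$.stop();
}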

Example 54 with JavaSparkContext

use of org.apache.spark.api.java.JavaSparkContext in project beam by apache.

the class TrackStreamingSourcesTest method testTrackSingle.

@Test
public void testTrackSingle() {
    options.setRunner(SparkRunner.class);
    JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
    JavaStreamingContext jssc = new JavaStreamingContext(jsc, new org.apache.spark.streaming.Duration(options.getBatchIntervalMillis()));
    Pipeline p = Pipeline.create(options);
    CreateStream<Integer> emptyStream = CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis())).emptyBatch();
    p.apply(emptyStream).apply(ParDo.of(new PassthroughFn<>()));
    p.traverseTopologically(new StreamingSourceTracker(jssc, p, ParDo.MultiOutput.class, 0));
    assertThat(StreamingSourceTracker.numAssertions, equalTo(1));
}
Also used : JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)
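PassthroughFn is referenced but not shown in this snippet. A hypothetical DoFn consistent with its use in ParDo.of(new PassthroughFn<>()) could look like this (the class body is an assumption, not the Beam test's actual code):

// Hypothetical sketch: a DoFn that re-emits every element unchanged.
private static class PassthroughFn<T> extends DoFn<T, T> {
    @ProcessElement
    public void process(ProcessContext c) {
        c.output(c.element());
    }
}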

Example 55 with JavaSparkContext

use of org.apache.spark.api.java.JavaSparkContext in project beam by apache.

the class SparkRunnerStreamingContextFactory method call.

@Override
public JavaStreamingContext call() throws Exception {
    LOG.info("Creating a new Spark Streaming Context");
    // validate unbounded read properties.
    checkArgument(options.getMinReadTimeMillis() < options.getBatchIntervalMillis(), "Minimum read time has to be less than batch time.");
    checkArgument(options.getReadTimePercentage() > 0 && options.getReadTimePercentage() < 1, "Read time percentage is bound to (0, 1).");
    SparkPipelineTranslator translator = new StreamingTransformTranslator.Translator(new TransformTranslator.Translator());
    Duration batchDuration = new Duration(options.getBatchIntervalMillis());
    LOG.info("Setting Spark streaming batchDuration to {} msec", batchDuration.milliseconds());
    JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
    JavaStreamingContext jssc = new JavaStreamingContext(jsc, batchDuration);
    // We must first init accumulators since translators expect them to be instantiated.
    SparkRunner.initAccumulators(options, jsc);
    EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options, jssc);
    // update cache candidates
    SparkRunner.updateCacheCandidates(pipeline, translator, ctxt);
    pipeline.traverseTopologically(new SparkRunner.Evaluator(translator, ctxt));
    ctxt.computeOutputs();
    checkpoint(jssc, checkpointDir);
    return jssc;
}
Also used : JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) TransformTranslator(org.apache.beam.runners.spark.translation.TransformTranslator) SparkPipelineTranslator(org.apache.beam.runners.spark.translation.SparkPipelineTranslator) SparkRunner(org.apache.beam.runners.spark.SparkRunner) Duration(org.apache.spark.streaming.Duration) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext)
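This call() is the creation function that Spark's checkpoint-recovery API invokes when no checkpoint exists. A minimal sketch of how such a factory is typically wired up (the constructor arguments shown here are assumptions, not the exact Beam code):

// Minimal sketch: recover a streaming context from checkpointDir if one exists,
// otherwise invoke the factory's call() above to build a fresh one.
JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(
        checkpointDir,
        new SparkRunnerStreamingContextFactory(pipeline, options, checkpointDir));
jssc.start();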

Aggregations

JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 260
Test (org.testng.annotations.Test) 65
BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest) 64
SparkConf (org.apache.spark.SparkConf) 49
Tuple2 (scala.Tuple2) 48
ArrayList (java.util.ArrayList) 45
Test (org.junit.Test) 43
GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead) 32
List (java.util.List) 28
Configuration (org.apache.hadoop.conf.Configuration) 24
JavaRDD (org.apache.spark.api.java.JavaRDD) 24
File (java.io.File) 23
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval) 20
Collectors (java.util.stream.Collectors) 16
TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline) 15
DataSet (org.nd4j.linalg.dataset.DataSet) 15
IOException (java.io.IOException) 14
SAMFileHeader (htsjdk.samtools.SAMFileHeader) 12
HashSet (java.util.HashSet) 12
RealMatrix (org.apache.commons.math3.linear.RealMatrix) 12