
Example 71 with SparkConf

use of org.apache.spark.SparkConf in project geode by apache.

the class PairRDDSaveJavaDemo method main.

public static void main(String[] argv) {
    if (argv.length != 1) {
        System.err.printf("Usage: PairRDDSaveJavaDemo <locators>\n");
        return;
    }
    SparkConf conf = new SparkConf().setAppName("PairRDDSaveJavaDemo");
    conf.set(GeodeLocatorPropKey, argv[0]);
    JavaSparkContext sc = new JavaSparkContext(conf);
    GeodeConnectionConf connConf = GeodeConnectionConf.apply(conf);
    List<Tuple2<String, String>> data = new ArrayList<>();
    data.add(new Tuple2<>("7", "seven"));
    data.add(new Tuple2<>("8", "eight"));
    data.add(new Tuple2<>("9", "nine"));
    List<Tuple2<String, String>> data2 = new ArrayList<>();
    data2.add(new Tuple2<>("11", "eleven"));
    data2.add(new Tuple2<>("12", "twelve"));
    data2.add(new Tuple2<>("13", "thirteen"));
    // method 1: generate JavaPairRDD directly
    JavaPairRDD<String, String> rdd1 = sc.parallelizePairs(data);
    javaFunctions(rdd1).saveToGeode("str_str_region", connConf);
    // method 2: convert JavaRDD<Tuple2<K,V>> to JavaPairRDD<K, V>
    JavaRDD<Tuple2<String, String>> rdd2 = sc.parallelize(data2);
    javaFunctions(toJavaPairRDD(rdd2)).saveToGeode("str_str_region", connConf);
    sc.stop();
}
Also used : GeodeConnectionConf(org.apache.geode.spark.connector.GeodeConnectionConf) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf)
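
For orientation, here is a minimal, self-contained sketch of the SparkConf setup pattern these Geode demos share: build a configuration, attach a connector-specific property, and create the context. It uses only the SparkConf and JavaSparkContext classes listed above; the property key "spark.geode.locators" and the locator address are illustrative placeholders, not necessarily the connector's actual constant (the demos use GeodeLocatorPropKey).

public static void main(String[] args) {
    // App name plus a local master so the sketch runs standalone;
    // the real demos leave the master to spark-submit.
    SparkConf conf = new SparkConf()
            .setAppName("SparkConfSetupSketch")
            .setMaster("local[*]")
            // Placeholder key and value; the Geode demos set GeodeLocatorPropKey instead.
            .set("spark.geode.locators", "localhost[10334]");
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
        // Settings placed on the SparkConf are visible through the context.
        System.out.println("spark.app.name = " + sc.getConf().get("spark.app.name"));
    } finally {
        sc.stop();
    }
}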

Example 72 with SparkConf

use of org.apache.spark.SparkConf in project geode by apache.

the class RegionToRDDJavaDemo method main.

public static void main(String[] argv) {
    if (argv.length != 1) {
        System.err.printf("Usage: RegionToRDDJavaDemo <locators>\n");
        return;
    }
    SparkConf conf = new SparkConf().setAppName("RegionToRDDJavaDemo");
    conf.set(GeodeLocatorPropKey, argv[0]);
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaPairRDD<String, String> rdd = javaFunctions(sc).geodeRegion("str_str_region");
    System.out.println("=== geodeRegion =======\n" + rdd.collect() + "\n=========================");
    sc.stop();
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf)
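
Once the region is exposed as a JavaPairRDD, ordinary Spark transformations apply. The hypothetical helper below is a sketch, assuming the rdd returned by javaFunctions(sc).geodeRegion("str_str_region") above; it counts the entries and upper-cases the values with mapValues.

static void inspectRegionRdd(JavaPairRDD<String, String> rdd) {
    // Count the entries pulled from the Geode region.
    System.out.println("entries: " + rdd.count());
    // mapValues keeps the keys and transforms only the values.
    JavaPairRDD<String, String> upperCased = rdd.mapValues(v -> v.toUpperCase());
    // collect() brings the (small) result back to the driver for printing.
    upperCased.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2()));
}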

Example 73 with SparkConf

use of org.apache.spark.SparkConf in project ignite by apache.

the class SharedRDDExample method main.

/**
 * Executes the example.
 *
 * @param args Command line arguments, none required.
 */
public static void main(String[] args) {
    // Spark Configuration.
    SparkConf sparkConf = new SparkConf()
            .setAppName("JavaIgniteRDDExample")
            .setMaster("local")
            .set("spark.executor.instances", "2");
    // Spark context.
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    // Adjust the loggers to exclude logs that are not of interest.
    Logger.getRootLogger().setLevel(Level.ERROR);
    Logger.getLogger("org.apache.ignite").setLevel(Level.INFO);
    // Create an Ignite context with the given configuration and run Ignite in embedded mode.
    JavaIgniteContext<Integer, Integer> igniteContext = new JavaIgniteContext<Integer, Integer>(
        sparkContext, "examples/config/spark/example-shared-rdd.xml", false);
    // Create a Java Ignite RDD of (Integer, Integer) pairs.
    JavaIgniteRDD<Integer, Integer> sharedRDD = igniteContext.<Integer, Integer>fromCache("sharedRDD");
    // Define data to be stored in the Ignite RDD (cache).
    List<Integer> data = new ArrayList<>(20);
    for (int i = 0; i < 20; i++) {
        data.add(i);
    }
    // Prepare a Java RDD.
    JavaRDD<Integer> javaRDD = sparkContext.<Integer>parallelize(data);
    // Fill the Ignite RDD with Integer pairs; pairs are represented as Scala Tuple2.
    sharedRDD.savePairs(javaRDD.<Integer, Integer>mapToPair(new PairFunction<Integer, Integer, Integer>() {

        @Override
        public Tuple2<Integer, Integer> call(Integer val) throws Exception {
            return new Tuple2<Integer, Integer>(val, val);
        }
    }));
    System.out.println(">>> Iterating over Ignite Shared RDD...");
    // Iterate over the Ignite RDD.
    sharedRDD.foreach(new VoidFunction<Tuple2<Integer, Integer>>() {

        @Override
        public void call(Tuple2<Integer, Integer> tuple) throws Exception {
            System.out.println("(" + tuple._1 + "," + tuple._2 + ")");
        }
    });
    System.out.println(">>> Transforming values stored in Ignite Shared RDD...");
    // Keep only the pairs with even values as a transformed RDD.
    JavaPairRDD<Integer, Integer> transformedValues = sharedRDD.filter(new Function<Tuple2<Integer, Integer>, Boolean>() {

        @Override
        public Boolean call(Tuple2<Integer, Integer> tuple) throws Exception {
            return tuple._2() % 2 == 0;
        }
    });
    // Print out the transformed values.
    transformedValues.foreach(new VoidFunction<Tuple2<Integer, Integer>>() {

        @Override
        public void call(Tuple2<Integer, Integer> tuple) throws Exception {
            System.out.println("(" + tuple._1 + "," + tuple._2 + ")");
        }
    });
    System.out.println(">>> Executing SQL query over Ignite Shared RDD...");
    // Execute SQL query over the Ignite RDD.
    Dataset df = sharedRDD.sql("select _val from Integer where _key < 9");
    // Show the result of the execution.
    df.show();
    // Close IgniteContext on all the workers.
    igniteContext.close(true);
}
Also used : Dataset(org.apache.spark.sql.Dataset) ArrayList(java.util.ArrayList) JavaIgniteContext(org.apache.ignite.spark.JavaIgniteContext) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) PairFunction(org.apache.spark.api.java.function.PairFunction) SparkConf(org.apache.spark.SparkConf)
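
On Java 8 and later, the anonymous PairFunction, VoidFunction, and Function classes in this example can be written as lambdas. A hedged sketch of the equivalent calls, assuming the same igniteContext, sparkContext, and data list as above; behavior is unchanged.

    // Obtain the same shared cache as above; types are inferred from the target.
    JavaIgniteRDD<Integer, Integer> sharedRDD = igniteContext.fromCache("sharedRDD");
    // Store each value as a (val, val) pair.
    JavaPairRDD<Integer, Integer> pairs =
            sparkContext.parallelize(data).mapToPair(val -> new Tuple2<>(val, val));
    sharedRDD.savePairs(pairs);
    // Print every pair.
    sharedRDD.foreach(tuple -> System.out.println("(" + tuple._1() + "," + tuple._2() + ")"));
    // Keep only the pairs with even values.
    JavaPairRDD<Integer, Integer> evens = sharedRDD.filter(tuple -> tuple._2() % 2 == 0);
    evens.foreach(tuple -> System.out.println("(" + tuple._1() + "," + tuple._2() + ")"));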

Example 74 with SparkConf

use of org.apache.spark.SparkConf in project cdap by caskdata.

the class ETLSpark method initialize.

@Override
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    cleanupFiles = new ArrayList<>();
    CompositeFinisher.Builder finishers = CompositeFinisher.builder();
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.speculation", "false");
    context.setSparkConf(sparkConf);
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    MacroEvaluator evaluator = new DefaultMacroEvaluator(context.getWorkflowToken(), context.getRuntimeArguments(), context.getLogicalStartTime(), context, context.getNamespace());
    SparkBatchSourceFactory sourceFactory = new SparkBatchSourceFactory();
    SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
    Map<String, Integer> stagePartitions = new HashMap<>();
    PluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    for (StageInfo stageInfo : phaseSpec.getPhase()) {
        String stageName = stageInfo.getName();
        String pluginType = stageInfo.getPluginType();
        if (BatchSource.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<BatchSourceContext> batchSource = pluginContext.newPluginInstance(stageName, evaluator);
            BatchSourceContext sourceContext = new SparkBatchSourceContext(sourceFactory, context, stageInfo);
            batchSource.prepareRun(sourceContext);
            finishers.add(batchSource, sourceContext);
        } else if (BatchSink.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<BatchSinkContext> batchSink = pluginContext.newPluginInstance(stageName, evaluator);
            BatchSinkContext sinkContext = new SparkBatchSinkContext(sinkFactory, context, null, stageInfo);
            batchSink.prepareRun(sinkContext);
            finishers.add(batchSink, sinkContext);
        } else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<SparkPluginContext> sparkSink = pluginContext.newPluginInstance(stageName, evaluator);
            SparkPluginContext sparkPluginContext = new BasicSparkPluginContext(context, stageInfo);
            sparkSink.prepareRun(sparkPluginContext);
            finishers.add(sparkSink, sparkPluginContext);
        } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
            BatchAggregator aggregator = pluginContext.newPluginInstance(stageName, evaluator);
            DefaultAggregatorContext aggregatorContext = new DefaultAggregatorContext(context, stageInfo);
            aggregator.prepareRun(aggregatorContext);
            finishers.add(aggregator, aggregatorContext);
            stagePartitions.put(stageName, aggregatorContext.getNumPartitions());
        } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
            BatchJoiner joiner = pluginContext.newPluginInstance(stageName, evaluator);
            DefaultJoinerContext sparkJoinerContext = new DefaultJoinerContext(context, stageInfo);
            joiner.prepareRun(sparkJoinerContext);
            finishers.add(joiner, sparkJoinerContext);
            stagePartitions.put(stageName, sparkJoinerContext.getNumPartitions());
        }
    }
    File configFile = File.createTempFile("HydratorSpark", ".config");
    cleanupFiles.add(configFile);
    try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
        writer.write(GSON.toJson(sourceSinkInfo));
    }
    finisher = finishers.build();
    context.localize("HydratorSpark.config", configFile.toURI());
}
Also used : DefaultAggregatorContext(co.cask.cdap.etl.batch.DefaultAggregatorContext) MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) HashMap(java.util.HashMap) StageInfo(co.cask.cdap.etl.planner.StageInfo) SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) CompositeFinisher(co.cask.cdap.etl.common.CompositeFinisher) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) DefaultJoinerContext(co.cask.cdap.etl.batch.DefaultJoinerContext) BatchAggregator(co.cask.cdap.etl.api.batch.BatchAggregator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) PluginContext(co.cask.cdap.api.plugin.PluginContext) SparkPluginContext(co.cask.cdap.etl.api.batch.SparkPluginContext) BatchSourceContext(co.cask.cdap.etl.api.batch.BatchSourceContext) BatchSinkContext(co.cask.cdap.etl.api.batch.BatchSinkContext) BatchJoiner(co.cask.cdap.etl.api.batch.BatchJoiner) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) SparkPluginContext(co.cask.cdap.etl.api.batch.SparkPluginContext) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map) BatchConfigurable(co.cask.cdap.etl.api.batch.BatchConfigurable) File(java.io.File) Writer(java.io.Writer)
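
The hand-off at the end of initialize() is a plain Gson round-trip: serialize a small info object to a temp file, localize that file, and read it back later. Below is a minimal sketch of the same round-trip with a hypothetical ConfigInfo class standing in for SparkBatchSourceSinkFactoryInfo; the names are illustrative, not CDAP's.

import com.google.gson.Gson;
import java.io.File;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;

// Hypothetical stand-in for SparkBatchSourceSinkFactoryInfo.
class ConfigInfo {
    String pipelineId;
    int numPartitions;
}

class ConfigRoundTrip {
    private static final Gson GSON = new Gson();

    static File write(ConfigInfo info) throws Exception {
        // Serialize the config to a temp file, as initialize() does.
        File configFile = File.createTempFile("HydratorSpark", ".config");
        try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
            writer.write(GSON.toJson(info));
        }
        return configFile;
    }

    static ConfigInfo read(File configFile) throws Exception {
        // The Spark program can read it back after context.localize(...).
        try (Reader reader = Files.newBufferedReader(configFile.toPath(), StandardCharsets.UTF_8)) {
            return GSON.fromJson(reader, ConfigInfo.class);
        }
    }
}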

Example 75 with SparkConf

use of org.apache.spark.SparkConf in project cdap by caskdata.

the class ExternalSparkProgram method initialize.

@Override
protected void initialize() throws Exception {
    SparkClientContext context = getContext();
    String stageName = context.getSpecification().getProperty(STAGE_NAME);
    Map<String, String> pluginProperties = context.getPluginProperties(stageName).getProperties();
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
    for (Map.Entry<String, String> pluginProperty : pluginProperties.entrySet()) {
        String key = pluginProperty.getKey();
        String val = pluginProperty.getValue();
        if (!key.equals(PROGRAM_ARGS)) {
            sparkConf.set(key, val);
        }
    }
    context.setSparkConf(sparkConf);
}
Also used : SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map)
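
The loop above is the whole mechanism: every plugin property except the packed program arguments becomes a Spark setting. The same copy can be expressed with streams; a sketch assuming the same pluginProperties map, sparkConf, and PROGRAM_ARGS constant.

    // Copy every plugin property except the program arguments into the SparkConf.
    pluginProperties.entrySet().stream()
            .filter(entry -> !PROGRAM_ARGS.equals(entry.getKey()))
            .forEach(entry -> sparkConf.set(entry.getKey(), entry.getValue()));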

Aggregations

SparkConf (org.apache.spark.SparkConf): 83
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 46
Test (org.junit.Test): 21
ArrayList (java.util.ArrayList): 20
Configuration (org.apache.hadoop.conf.Configuration): 20
Tuple2 (scala.Tuple2): 15
Graph (uk.gov.gchq.gaffer.graph.Graph): 13
DataOutputStream (java.io.DataOutputStream): 11
File (java.io.File): 10
HashSet (java.util.HashSet): 10
ByteArrayOutputStream (org.apache.commons.io.output.ByteArrayOutputStream): 10
Edge (uk.gov.gchq.gaffer.data.element.Edge): 10
Element (uk.gov.gchq.gaffer.data.element.Element): 10
Entity (uk.gov.gchq.gaffer.data.element.Entity): 10
User (uk.gov.gchq.gaffer.user.User): 10
Ignore (org.junit.Ignore): 6
HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration): 5
JavaHBaseContext (org.apache.hadoop.hbase.spark.JavaHBaseContext): 5
Test (org.testng.annotations.Test): 5
AddElements (uk.gov.gchq.gaffer.operation.impl.add.AddElements): 5