
Example 31 with SparkConf

use of org.apache.spark.SparkConf in project gora by apache.

the class SparkWordCount method wordCount.

public int wordCount(DataStore<String, WebPage> inStore, DataStore<String, TokenDatum> outStore) throws IOException {
    //Spark engine initialization
    GoraSparkEngine<String, WebPage> goraSparkEngine = new GoraSparkEngine<>(String.class, WebPage.class);
    SparkConf sparkConf = new SparkConf().setAppName("Gora Spark Word Count Application").setMaster("local");
    // register the persistent value class with Kryo for efficient serialization
    sparkConf.registerKryoClasses(new Class[] { inStore.getPersistentClass() });
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    JavaPairRDD<String, WebPage> goraRDD = goraSparkEngine.initialize(sc, inStore);
    long count = goraRDD.count();
    log.info("Total Web page count: {}", count);
    JavaRDD<Tuple2<String, Long>> mappedGoraRdd = goraRDD.values().map(mapFunc);
    JavaPairRDD<String, Long> reducedGoraRdd = JavaPairRDD.fromJavaRDD(mappedGoraRdd).reduceByKey(redFunc);
    // print the reduced counts for debugging
    log.info("SparkWordCount debug purpose TokenDatum print starts:");
    Map<String, Long> tokenDatumMap = reducedGoraRdd.collectAsMap();
    for (Map.Entry<String, Long> entry : tokenDatumMap.entrySet()) {
        log.info(entry.getKey());
        log.info(entry.getValue().toString());
    }
    log.info("SparkWordCount debug purpose TokenDatum print ends.");
    // write the reduced counts back to the output datastore
    Configuration sparkHadoopConf = goraSparkEngine.generateOutputConf(outStore);
    reducedGoraRdd.saveAsNewAPIHadoopDataset(sparkHadoopConf);
    return 1;
}
Also used : WebPage(org.apache.gora.examples.generated.WebPage) Configuration(org.apache.hadoop.conf.Configuration) GoraSparkEngine(org.apache.gora.spark.GoraSparkEngine) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf)
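
The snippet references mapFunc and redFunc without showing them. Their types are pinned down by the surrounding RDD signatures (WebPage -> Tuple2<String, Long> for the map, (Long, Long) -> Long for the reduce); the following is a minimal sketch of what they might look like, where the choice of the page URL as the key is an assumption rather than the actual Gora implementation:

// Hypothetical reconstruction of the two functions used above; the types are
// dictated by goraRDD.values().map(mapFunc) and reduceByKey(redFunc).
// Requires org.apache.spark.api.java.function.Function and Function2.
static Function<WebPage, Tuple2<String, Long>> mapFunc =
    page -> new Tuple2<>(page.getUrl().toString(), 1L); // assumed key: the page URL

static Function2<Long, Long, Long> redFunc = (a, b) -> a + b; // sum the per-key counts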

Example 32 with SparkConf

use of org.apache.spark.SparkConf in project geode by apache.

the class JavaApiIntegrationTest method setUpBeforeClass.

@BeforeClass
public static void setUpBeforeClass() throws Exception {
    // start the Geode cluster and a Spark context
    Properties settings = new Properties();
    settings.setProperty(ConfigurationProperties.CACHE_XML_FILE, "src/it/resources/test-retrieve-regions.xml");
    settings.setProperty("num-of-servers", Integer.toString(numServers));
    int locatorPort = GeodeCluster$.MODULE$.start(settings);
    // start spark context in local mode
    Properties props = new Properties();
    props.put("log4j.logger.org.apache.spark", "INFO");
    props.put("log4j.logger.org.apache.geode.spark.connector", "DEBUG");
    IOUtils.configTestLog4j("ERROR", props);
    SparkConf conf = new SparkConf()
        .setAppName("RetrieveRegionIntegrationTest")
        .setMaster("local[2]")
        .set(package$.MODULE$.GeodeLocatorPropKey(), "localhost:" + locatorPort);
    jsc = new JavaSparkContext(conf);
    connConf = GeodeConnectionConf.apply(jsc.getConf());
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) ConfigurationProperties(org.apache.geode.distributed.ConfigurationProperties) SparkConf(org.apache.spark.SparkConf) BeforeClass(org.junit.BeforeClass)
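
The setup allocates both a Geode cluster and a Spark context, but the matching teardown is not part of the snippet. A minimal sketch of what it would typically look like; the GeodeCluster$.MODULE$.stop() call is an assumption mirroring start(settings), not confirmed by the source:

@AfterClass
public static void tearDownAfterClass() throws Exception {
    // stop the Spark context first so executors release their Geode connections
    if (jsc != null) {
        jsc.stop();
        jsc = null;
    }
    GeodeCluster$.MODULE$.stop(); // assumed harness counterpart to start(settings)
}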

Example 33 with SparkConf

use of org.apache.spark.SparkConf in project incubator-systemml by apache.

the class SparkExecutionContext method initSparkContext.

private static synchronized void initSparkContext() {
    //check for redundant spark context init
    if (_spctx != null)
        return;
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    //create a default spark context (master, appname, etc. refer to system
    //properties as given in the spark configuration or during spark-submit)
    MLContext mlCtxObj = MLContextProxy.getActiveMLContext();
    if (mlCtxObj != null) {
        // this path is taken when DML is invoked through the Spark shell
        // TODO clean up the passing of static variables later (kept for now to minimize changes to DMLScript)
        _spctx = MLContextUtil.getJavaSparkContext(mlCtxObj);
    } else {
        if (DMLScript.USE_LOCAL_SPARK_CONFIG) {
            // for integration testing, run a local master with all available cores
            SparkConf conf = createSystemMLSparkConf().setMaster("local[*]").setAppName("My local integration test app");
            // discouraged in Spark; only needed for test cases that cannot stop the context properly:
            // conf.set("spark.driver.allowMultipleContexts", "true");
            conf.set("spark.ui.enabled", "false");
            _spctx = new JavaSparkContext(conf);
        } else {
            // default cluster setup:
            // set up the systemml-preferred spark configuration (without user choice)
            SparkConf conf = createSystemMLSparkConf();
            _spctx = new JavaSparkContext(conf);
        }
        _parRDDs.clear();
    }
    // warn if spark.driver.maxResultSize is not set; it must be set before the Spark context starts for CP collect
    String strDriverMaxResSize = _spctx.getConf().get("spark.driver.maxResultSize", "1g");
    long driverMaxResSize = UtilFunctions.parseMemorySize(strDriverMaxResSize);
    if (driverMaxResSize != 0 && driverMaxResSize < OptimizerUtils.getLocalMemBudget() && !DMLScript.USE_LOCAL_SPARK_CONFIG)
        LOG.warn("Configuration parameter spark.driver.maxResultSize set to " + UtilFunctions.formatMemorySize(driverMaxResSize) + "." + " You can set it through Spark default configuration setting either to 0 (unlimited) or to available memory budget of size " + UtilFunctions.formatMemorySize((long) OptimizerUtils.getLocalMemBudget()) + ".");
    //TODO if spark context passed in from outside (mlcontext), we need to clean this up at the end
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(_spctx.hadoopConfiguration());
    //statistics maintenance
    if (DMLScript.STATISTICS) {
        Statistics.setSparkCtxCreateTime(System.nanoTime() - t0);
    }
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf) MLContext(org.apache.sysml.api.mlcontext.MLContext)
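
The warning above can only be addressed before the context is created, since spark.driver.maxResultSize is read at startup. A short sketch of the two usual ways to set it; the values are illustrative:

// via spark-submit or spark-defaults.conf (picked up by new SparkConf()):
//   spark-submit --conf spark.driver.maxResultSize=0 ...

// or via a JVM system property set before the context is created;
// new SparkConf() reads spark.* system properties automatically:
System.setProperty("spark.driver.maxResultSize", "0"); // 0 = unlimited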

Example 34 with SparkConf

use of org.apache.spark.SparkConf in project incubator-systemml by apache.

the class SparkExecutionContext method createSystemMLSparkConf.

/**
	 * Sets up a SystemML-preferred Spark configuration based on the implicit
	 * default configuration (as passed via configurations from outside).
	 *
	 * @return spark configuration
	 */
public static SparkConf createSystemMLSparkConf() {
    SparkConf conf = new SparkConf();
    //always set unlimited result size (required for cp collect)
    conf.set("spark.driver.maxResultSize", "0");
    //fair scheduler mode (round-robin assignment mitigates the problem of 'sticky slots')
    if (FAIR_SCHEDULER_MODE) {
        conf.set("spark.scheduler.mode", "FAIR");
    }
    //increase scheduler delay (usually more robust due to better data locality)
    if (!conf.contains("spark.locality.wait")) {
        //default 3s
        conf.set("spark.locality.wait", "5s");
    }
    return conf;
}
Also used : SparkConf(org.apache.spark.SparkConf)
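
A hedged usage sketch of how this helper composes with caller-supplied settings; the master and app name below are illustrative, not taken from the SystemML source:

SparkConf conf = createSystemMLSparkConf()
    .setMaster("local[4]")       // illustrative caller override
    .setAppName("systemml-job"); // illustrative caller override

// the SystemML preferences are now in place:
assert "0".equals(conf.get("spark.driver.maxResultSize"));
assert "5s".equals(conf.get("spark.locality.wait")); // unless already set externally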

Example 35 with SparkConf

use of org.apache.spark.SparkConf in project ignite by apache.

the class JavaEmbeddedIgniteRDDSelfTest method createContext.

/**
     * Creates a default Spark context.
     *
     * @return Context.
     */
private JavaSparkContext createContext() {
    SparkConf conf = new SparkConf();
    conf.set("spark.executor.instances", String.valueOf(GRID_CNT));
    return new JavaSparkContext("local[" + GRID_CNT + "]", "test", conf);
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf)
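
A minimal usage sketch for this helper; the job body is illustrative:

JavaSparkContext sc = createContext();
try {
    // any trivial job confirms the context came up
    long n = sc.parallelize(java.util.Arrays.asList(1, 2, 3)).count();
    assert n == 3;
} finally {
    sc.stop(); // always stop test contexts to avoid leaking them across tests
}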

Aggregations

SparkConf (org.apache.spark.SparkConf): 83
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 46
Test (org.junit.Test): 21
ArrayList (java.util.ArrayList): 20
Configuration (org.apache.hadoop.conf.Configuration): 20
Tuple2 (scala.Tuple2): 15
Graph (uk.gov.gchq.gaffer.graph.Graph): 13
DataOutputStream (java.io.DataOutputStream): 11
File (java.io.File): 10
HashSet (java.util.HashSet): 10
ByteArrayOutputStream (org.apache.commons.io.output.ByteArrayOutputStream): 10
Edge (uk.gov.gchq.gaffer.data.element.Edge): 10
Element (uk.gov.gchq.gaffer.data.element.Element): 10
Entity (uk.gov.gchq.gaffer.data.element.Entity): 10
User (uk.gov.gchq.gaffer.user.User): 10
Ignore (org.junit.Ignore): 6
HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration): 5
JavaHBaseContext (org.apache.hadoop.hbase.spark.JavaHBaseContext): 5
Test (org.testng.annotations.Test): 5
AddElements (uk.gov.gchq.gaffer.operation.impl.add.AddElements): 5