use of org.apache.spark.SparkConf in project gora by apache.
the class SparkWordCount method wordCount.
public int wordCount(DataStore<String, WebPage> inStore, DataStore<String, TokenDatum> outStore) throws IOException {
  // Spark engine initialization
  GoraSparkEngine<String, WebPage> goraSparkEngine = new GoraSparkEngine<>(String.class, WebPage.class);
  SparkConf sparkConf = new SparkConf().setAppName("Gora Spark Word Count Application").setMaster("local");
  Class[] c = new Class[1];
  c[0] = inStore.getPersistentClass();
  sparkConf.registerKryoClasses(c);
  //
  JavaSparkContext sc = new JavaSparkContext(sparkConf);
  JavaPairRDD<String, WebPage> goraRDD = goraSparkEngine.initialize(sc, inStore);
  long count = goraRDD.count();
  log.info("Total Web page count: {}", count);
  JavaRDD<Tuple2<String, Long>> mappedGoraRdd = goraRDD.values().map(mapFunc);
  JavaPairRDD<String, Long> reducedGoraRdd = JavaPairRDD.fromJavaRDD(mappedGoraRdd).reduceByKey(redFunc);
  // Print output for debug purposes
  log.info("SparkWordCount debug purpose TokenDatum print starts:");
  Map<String, Long> tokenDatumMap = reducedGoraRdd.collectAsMap();
  for (String key : tokenDatumMap.keySet()) {
    log.info(key);
    log.info(tokenDatumMap.get(key).toString());
  }
  log.info("SparkWordCount debug purpose TokenDatum print ends:");
  //
  // write output to the datastore
  Configuration sparkHadoopConf = goraSparkEngine.generateOutputConf(outStore);
  reducedGoraRdd.saveAsNewAPIHadoopDataset(sparkHadoopConf);
  return 1;
}
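The functions mapFunc and redFunc used above are defined elsewhere in SparkWordCount and are not shown here. Below is a minimal sketch of shapes that would type-check with the calls above, assuming the job keys on the page URL; the actual field and tokenization in the original may differ:

  // Assumed imports: org.apache.spark.api.java.function.Function,
  // org.apache.spark.api.java.function.Function2, scala.Tuple2
  // mapFunc turns each WebPage into a (key, 1L) pair; redFunc sums the counts per key.
  private static Function<WebPage, Tuple2<String, Long>> mapFunc =
      webPage -> new Tuple2<>(String.valueOf(webPage.getUrl()), 1L);

  private static Function2<Long, Long, Long> redFunc = (c1, c2) -> c1 + c2;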
use of org.apache.spark.SparkConf in project geode by apache.
the class JavaApiIntegrationTest method setUpBeforeClass.
@BeforeClass
public static void setUpBeforeClass() throws Exception {
  // start geode cluster and spark context
  Properties settings = new Properties();
  settings.setProperty(ConfigurationProperties.CACHE_XML_FILE, "src/it/resources/test-retrieve-regions.xml");
  settings.setProperty("num-of-servers", Integer.toString(numServers));
  int locatorPort = GeodeCluster$.MODULE$.start(settings);
  // start spark context in local mode
  Properties props = new Properties();
  props.put("log4j.logger.org.apache.spark", "INFO");
  props.put("log4j.logger.org.apache.geode.spark.connector", "DEBUG");
  IOUtils.configTestLog4j("ERROR", props);
  SparkConf conf = new SparkConf()
      .setAppName("RetrieveRegionIntegrationTest")
      .setMaster("local[2]")
      .set(package$.MODULE$.GeodeLocatorPropKey(), "localhost:" + locatorPort);
  // sc = new SparkContext(conf);
  jsc = new JavaSparkContext(conf);
  connConf = GeodeConnectionConf.apply(jsc.getConf());
}
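A matching teardown would normally stop the Spark context and then the embedded Geode cluster. A sketch, assuming GeodeCluster exposes a stop() counterpart to start(settings):

  @AfterClass
  public static void tearDownAfterClass() throws Exception {
    // stop Spark first so the connector releases its Geode connections,
    // then shut down the embedded cluster started in setUpBeforeClass()
    jsc.stop();
    GeodeCluster$.MODULE$.stop(); // assumed counterpart to start(settings)
  }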
use of org.apache.spark.SparkConf in project incubator-systemml by apache.
the class SparkExecutionContext method initSparkContext.
private static synchronized void initSparkContext() {
  // check for redundant spark context init
  if (_spctx != null)
    return;
  long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
  // create a default spark context (master, appname, etc. refer to system properties
  // as given in the spark configuration or during spark-submit)
  MLContext mlCtxObj = MLContextProxy.getActiveMLContext();
  if (mlCtxObj != null) {
    // This is the case when DML is called through the Spark shell.
    // Will clean up the passing of static variables later, as this involves minimal change to DMLScript.
    _spctx = MLContextUtil.getJavaSparkContext(mlCtxObj);
  } else {
    if (DMLScript.USE_LOCAL_SPARK_CONFIG) {
      // For now set 4 cores for integration testing :)
      SparkConf conf = createSystemMLSparkConf().setMaster("local[*]").setAppName("My local integration test app");
      // This is discouraged in Spark but was added only for those test cases that cannot stop the context properly
      // conf.set("spark.driver.allowMultipleContexts", "true");
      conf.set("spark.ui.enabled", "false");
      _spctx = new JavaSparkContext(conf);
    } else {
      // default cluster setup:
      // systemml-preferred spark configuration (w/o user choice)
      SparkConf conf = createSystemMLSparkConf();
      _spctx = new JavaSparkContext(conf);
    }
    _parRDDs.clear();
  }
  // Warn if spark.driver.maxResultSize is not set; it needs to be set before starting the Spark context for CP collect
  String strDriverMaxResSize = _spctx.getConf().get("spark.driver.maxResultSize", "1g");
  long driverMaxResSize = UtilFunctions.parseMemorySize(strDriverMaxResSize);
  if (driverMaxResSize != 0 && driverMaxResSize < OptimizerUtils.getLocalMemBudget() && !DMLScript.USE_LOCAL_SPARK_CONFIG)
    LOG.warn("Configuration parameter spark.driver.maxResultSize set to "
        + UtilFunctions.formatMemorySize(driverMaxResSize) + "."
        + " You can set it through Spark default configuration setting either to 0 (unlimited) or to available memory budget of size "
        + UtilFunctions.formatMemorySize((long) OptimizerUtils.getLocalMemBudget()) + ".");
  // TODO if spark context passed in from outside (mlcontext), we need to clean this up at the end
  if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
    MRJobConfiguration.addBinaryBlockSerializationFramework(_spctx.hadoopConfiguration());
  // statistics maintenance
  if (DMLScript.STATISTICS) {
    Statistics.setSparkCtxCreateTime(System.nanoTime() - t0);
  }
}
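Callers typically reach this method through a lazy accessor rather than invoking it directly. A sketch of such a getter follows (the name and guard are assumptions, not the verbatim SystemML code); the synchronized modifier on initSparkContext() keeps the lazy creation thread-safe:

  // Sketch of a lazy accessor around initSparkContext() (assumed, for illustration)
  public static JavaSparkContext getSparkContext() {
    // create the context on first use only
    if (_spctx == null) {
      initSparkContext();
    }
    return _spctx;
  }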
use of org.apache.spark.SparkConf in project incubator-systemml by apache.
the class SparkExecutionContext method createSystemMLSparkConf.
/**
 * Sets up a SystemML-preferred Spark configuration based on the implicit
 * default configuration (as passed via configurations from outside).
 *
 * @return spark configuration
 */
public static SparkConf createSystemMLSparkConf() {
  SparkConf conf = new SparkConf();
  // always set unlimited result size (required for cp collect)
  conf.set("spark.driver.maxResultSize", "0");
  // FAIR scheduling: round robin assignment mitigates the problem of 'sticky slots'
  if (FAIR_SCHEDULER_MODE) {
    conf.set("spark.scheduler.mode", "FAIR");
  }
  // increase scheduler delay (usually more robust due to better data locality)
  if (!conf.contains("spark.locality.wait")) {
    // default 3s
    conf.set("spark.locality.wait", "5s");
  }
  return conf;
}
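Usage sketch: spark.driver.maxResultSize is forced to unlimited in all cases, while spark.locality.wait is only filled in when no external value (spark-defaults.conf or spark-submit) was provided; master and app name are still supplied by the caller, as in initSparkContext() above:

  SparkConf conf = createSystemMLSparkConf();
  conf.get("spark.driver.maxResultSize"); // "0" in all cases
  conf.get("spark.locality.wait");        // "5s" unless configured externally
  JavaSparkContext sc = new JavaSparkContext(conf.setMaster("local[*]").setAppName("example"));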
use of org.apache.spark.SparkConf in project ignite by apache.
the class JavaEmbeddedIgniteRDDSelfTest method createContext.
/**
 * Creates a default Spark context.
 *
 * @return Context.
 */
private JavaSparkContext createContext() {
  SparkConf conf = new SparkConf();
  conf.set("spark.executor.instances", String.valueOf(GRID_CNT));
  return new JavaSparkContext("local[" + GRID_CNT + "]", "test", conf);
}
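In the Ignite tests such a context is then wrapped in a JavaIgniteContext from the ignite-spark module to expose caches as RDDs. A brief sketch; the Spring configuration path and cache name below are placeholders:

  JavaSparkContext sc = createContext();
  // JavaIgniteContext starts (or connects to) Ignite nodes from the given config;
  // fromCache exposes an Ignite cache as a JavaIgniteRDD
  JavaIgniteContext<Integer, String> ic = new JavaIgniteContext<>(sc, "example-ignite-config.xml");
  JavaIgniteRDD<Integer, String> rdd = ic.fromCache("testCache");
  long entries = rdd.count();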