Use of org.apache.spark.SparkConf in project gatk by broadinstitute.
The class SparkContextFactory, method setupSparkConf.
/**
 * Set up a SparkConf with the given app name, master, and settings.
 *
 * @param appName human-readable application name
 * @param master spark master to use
 * @param suggestedProperties properties to set only if no value is already set for them
 * @param overridingProperties properties to force to the given value, ignoring any value already set
 */
@VisibleForTesting
static SparkConf setupSparkConf(final String appName, final String master, final Map<String, String> suggestedProperties, final Map<String, String> overridingProperties) {
    final SparkConf sparkConf = new SparkConf().setAppName(appName).setMaster(master);
    suggestedProperties.forEach(sparkConf::setIfMissing);
    MANDATORY_PROPERTIES.forEach(sparkConf::set);
    overridingProperties.forEach(sparkConf::set);
    return sparkConf;
}
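The distinction between suggested and overriding properties comes down to SparkConf.setIfMissing versus SparkConf.set. A minimal sketch, using only Spark's public SparkConf API rather than the GATK class itself, of how the two behave:
SparkConf conf = new SparkConf().setAppName("example").setMaster("local[2]");
conf.setIfMissing("spark.ui.enabled", "false"); // applied: no value was set yet
conf.set("spark.ui.enabled", "true");           // overwrites unconditionally, like MANDATORY_PROPERTIES and overridingProperties above
conf.setIfMissing("spark.ui.enabled", "false"); // ignored: a value already exists
// conf.get("spark.ui.enabled") is now "true"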
Use of org.apache.spark.SparkConf in project cdap by caskdata.
The class WordCount, method main.
public static void main(String[] args) throws Exception {
    String inputFile = args[0];
    String outputFile = args[1];
    // Create a Java Spark Context.
    SparkConf conf = new SparkConf().setAppName("wordCount");
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Load the input data, assuming each line is one word.
    JavaRDD<String> words = sc.textFile(inputFile);
    // Map each word to a (word, 1) pair, sum the counts per word, and format each result as "word count".
    JavaRDD<String> counts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String x) {
            return new Tuple2<>(x, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer x, Integer y) {
            return x + y;
        }
    }).map(new Function<Tuple2<String, Integer>, String>() {
        @Override
        public String call(Tuple2<String, Integer> input) throws Exception {
            return input._1() + " " + input._2();
        }
    });
    // Save the word counts back out to a text file, forcing evaluation.
    counts.saveAsTextFile(outputFile);
}
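As a side note, the same transformation chain can be written more compactly with Java 8 lambdas; a minimal sketch, not taken from the CDAP example:
JavaRDD<String> counts = words
        .mapToPair(word -> new Tuple2<>(word, 1))
        .reduceByKey((x, y) -> x + y)
        .map(pair -> pair._1() + " " + pair._2());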
Use of org.apache.spark.SparkConf in project cdap by caskdata.
The class DataStreamsSparkLauncher, method initialize.
@Override
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
    WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}", context.getApplicationSpecification().getName(), UserGroupInformation.getCurrentUser().getShortUserName(), arguments);
    DataStreamsPipelineSpec spec = GSON.fromJson(context.getSpecification().getProperty(Constants.PIPELINEID), DataStreamsPipelineSpec.class);
    PipelinePluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), true, true);
    int numSources = 0;
    for (StageSpec stageSpec : spec.getStages()) {
        if (StreamingSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
            StreamingSource<Object> streamingSource = pluginContext.newPluginInstance(stageSpec.getName());
            numSources = numSources + streamingSource.getRequiredExecutors();
        }
    }
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.streaming.backpressure.enabled", "true");
    for (Map.Entry<String, String> property : spec.getProperties().entrySet()) {
        sparkConf.set(property.getKey(), property.getValue());
    }
    // Spark requires the number of threads (local mode) or cores (distributed mode) to be at least the
    // number of receivers (streaming sources), since each receiver permanently occupies one of them.
    // That is why the local master below is sized from numSources, with the exact value depending on the
    // isUnitTest setting in the config.
    String extraOpts = spec.getExtraJavaOpts();
    if (extraOpts != null && !extraOpts.isEmpty()) {
        sparkConf.set("spark.driver.extraJavaOptions", extraOpts);
        sparkConf.set("spark.executor.extraJavaOptions", extraOpts);
    }
    // Without this, stopping will hang on machines with few cores.
    sparkConf.set("spark.rpc.netty.dispatcher.numThreads", String.valueOf(numSources + 2));
    sparkConf.set("spark.executor.instances", String.valueOf(numSources + 2));
    sparkConf.setMaster(String.format("local[%d]", numSources + 2));
    if (spec.isUnitTest()) {
        sparkConf.setMaster(String.format("local[%d]", numSources + 1));
    }
    context.setSparkConf(sparkConf);
    if (!spec.isCheckpointsDisabled()) {
        // Each pipeline has its own checkpoint directory within the checkpoint fileset.
        // Ideally, when a pipeline is deleted, we would be able to delete that checkpoint directory,
        // because we don't want another pipeline created with the same name to pick up the old checkpoint.
        // Since CDAP has no way to run application logic on deletion, we instead generate a unique pipeline id
        // and use it as the name of a checkpoint subdirectory inside the pipeline name directory.
        // On start, we check for any other pipeline ids for that pipeline name, and delete them if they exist.
        FileSet checkpointFileSet = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
        String pipelineName = context.getApplicationSpecification().getName();
        String checkpointDir = spec.getCheckpointDirectory();
        Location pipelineCheckpointBase = checkpointFileSet.getBaseLocation().append(pipelineName);
        Location pipelineCheckpointDir = pipelineCheckpointBase.append(checkpointDir);
        if (!ensureDirExists(pipelineCheckpointBase)) {
            throw new IOException(String.format("Unable to create checkpoint base directory '%s' for the pipeline.", pipelineCheckpointBase));
        }
        try {
            for (Location child : pipelineCheckpointBase.list()) {
                if (!child.equals(pipelineCheckpointDir) && !child.delete(true)) {
                    LOG.warn("Unable to delete checkpoint directory {} from an old pipeline.", child);
                }
            }
        } catch (Exception e) {
            LOG.warn("Unable to clean up old checkpoint directories from old pipelines.", e);
        }
        if (!ensureDirExists(pipelineCheckpointDir)) {
            throw new IOException(String.format("Unable to create checkpoint directory '%s' for the pipeline.", pipelineCheckpointDir));
        }
    }
    WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}
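The initialize method only prepares the checkpoint directory; the streaming context is wired up elsewhere in CDAP. As a hedged sketch of how a path like pipelineCheckpointDir is typically handed to Spark Streaming (the batch interval and variable names here are assumptions for illustration, not CDAP code):
// Hypothetical illustration: restore from the checkpoint if present,
// otherwise create a fresh streaming context and register the checkpoint directory.
String checkpointPath = pipelineCheckpointDir.toURI().toString();
JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(checkpointPath, () -> {
    JavaStreamingContext created = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    created.checkpoint(checkpointPath);
    return created;
});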
Use of org.apache.spark.SparkConf in project gatk by broadinstitute.
The class SAMRecordToGATKReadAdapterSerializerUnitTest, method testSerializerRoundTripHeaderlessRead.
@Test
public void testSerializerRoundTripHeaderlessRead() {
SparkConf conf = new SparkConf().set("spark.kryo.registrator", "org.broadinstitute.hellbender.engine.spark.SAMRecordToGATKReadAdapterSerializerUnitTest$TestGATKRegistrator");
// check round trip with no header
GATKRead read = ArtificialReadUtils.createHeaderlessSamBackedRead("read1", "1", 100, 50);
final GATKRead roundTrippedRead = SparkTestUtils.roundTripInKryo(read, GATKRead.class, conf);
Assert.assertEquals(roundTrippedRead, read);
}
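The TestGATKRegistrator referenced by "spark.kryo.registrator" is not shown here; a hedged sketch of the general shape such a registrator takes (the class and serializer names are assumptions, and the actual GATK registrations may differ):
import com.esotericsoftware.kryo.Kryo;
import org.apache.spark.serializer.KryoRegistrator;

// Hypothetical illustration of a "spark.kryo.registrator" implementation.
public class ExampleRegistrator implements KryoRegistrator {
    @Override
    public void registerClasses(Kryo kryo) {
        // Bind the read adapter to a custom Kryo serializer (names assumed for illustration).
        kryo.register(SAMRecordToGATKReadAdapter.class, new SAMRecordToGATKReadAdapterSerializer());
    }
}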
Use of org.apache.spark.SparkConf in project spark-dataflow by cloudera.
The class SparkContextFactory, method createSparkContext.
private static JavaSparkContext createSparkContext(String master, String appName) {
    SparkConf conf = new SparkConf();
    conf.setMaster(master);
    conf.setAppName(appName);
    conf.set("spark.serializer", KryoSerializer.class.getCanonicalName());
    return new JavaSparkContext(conf);
}
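createSparkContext is private, so callers reach it through SparkContextFactory's public entry points, which are not shown here. A minimal hedged sketch of the typical create/use/stop lifecycle for a context configured this way:
SparkConf conf = new SparkConf()
        .setMaster("local[2]")
        .setAppName("example")
        .set("spark.serializer", KryoSerializer.class.getCanonicalName());
JavaSparkContext jsc = new JavaSparkContext(conf);
try {
    // Run jobs against jsc here.
} finally {
    jsc.stop();
}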