Example 76 with SparkConf

use of org.apache.spark.SparkConf in project gatk by broadinstitute.

the class SparkContextFactory method setupSparkConf.

/**
     * Set up a SparkConf with the given app name, master, and settings.
     *
     * @param appName human readable name
     * @param master spark master to use
     * @param suggestedProperties properties to set if no values are set for them already
     * @param overridingProperties properties to force to the given value ignoring values already set
     */
@VisibleForTesting
static SparkConf setupSparkConf(final String appName, final String master, final Map<String, String> suggestedProperties, final Map<String, String> overridingProperties) {
    final SparkConf sparkConf = new SparkConf().setAppName(appName).setMaster(master);
    suggestedProperties.forEach(sparkConf::setIfMissing);
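    // Unlike setIfMissing above, set() overwrites existing values; overridingProperties is applied last, so it wins over everything.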
    MANDATORY_PROPERTIES.forEach(sparkConf::set);
    overridingProperties.forEach(sparkConf::set);
    return sparkConf;
}
Also used : SparkConf(org.apache.spark.SparkConf) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
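
The split between suggested and overriding properties comes straight from SparkConf's API: setIfMissing only applies when a key is unset, while set always overwrites. A minimal sketch of that behavior, with illustrative property values (not GATK's actual defaults):

import org.apache.spark.SparkConf;

public class SparkConfPrecedenceDemo {
    public static void main(String[] args) {
        // false = don't load values from system properties, so the demo starts from a clean slate
        SparkConf conf = new SparkConf(false).setAppName("demo").setMaster("local[2]");
        // "Suggested" property: takes effect only because nothing has set this key yet.
        conf.setIfMissing("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        // "Overriding" property: replaces whatever value the key currently has.
        conf.set("spark.ui.enabled", "false");
        // A later setIfMissing on an already-set key is a no-op.
        conf.setIfMissing("spark.ui.enabled", "true");
        System.out.println(conf.get("spark.ui.enabled")); // prints "false"
    }
}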

Example 77 with SparkConf

use of org.apache.spark.SparkConf in project cdap by caskdata.

the class WordCount method main.

public static void main(String[] args) throws Exception {
    String inputFile = args[0];
    String outputFile = args[1];
    // Create a Java Spark Context.
    SparkConf conf = new SparkConf().setAppName("wordCount");
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Load our input data, assuming each line is one word
    JavaRDD<String> words = sc.textFile(inputFile);
    // Transform into (word, 1) pairs, sum the counts per word, then format each pair as "word count".
    JavaRDD<String> counts = words.mapToPair(new PairFunction<String, String, Integer>() {

        @Override
        public Tuple2<String, Integer> call(String x) {
            return new Tuple2<>(x, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {

        @Override
        public Integer call(Integer x, Integer y) {
            return x + y;
        }
    }).map(new Function<Tuple2<String, Integer>, String>() {

        @Override
        public String call(Tuple2<String, Integer> input) throws Exception {
            return input._1() + " " + input._2();
        }
    });
    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile(outputFile);
}
Also used : Function2(org.apache.spark.api.java.function.Function2) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf)
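
Because PairFunction, Function2, and Function are single-method interfaces, the same pipeline can be written far more compactly with Java 8 lambdas. A sketch under the same assumption (one word per line of input); the class name is illustrative:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class WordCountLambda {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("wordCount");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> words = sc.textFile(args[0]);
            JavaRDD<String> counts = words
                    .mapToPair(word -> new Tuple2<>(word, 1))   // (word, 1)
                    .reduceByKey((x, y) -> x + y)               // sum counts per word
                    .map(pair -> pair._1() + " " + pair._2());  // "word count" lines
            counts.saveAsTextFile(args[1]);
        }
    }
}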

Example 78 with SparkConf

use of org.apache.spark.SparkConf in project cdap by caskdata.

the class DataStreamsSparkLauncher method initialize.

@Override
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
    WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}", context.getApplicationSpecification().getName(), UserGroupInformation.getCurrentUser().getShortUserName(), arguments);
    DataStreamsPipelineSpec spec = GSON.fromJson(context.getSpecification().getProperty(Constants.PIPELINEID), DataStreamsPipelineSpec.class);
    PipelinePluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), true, true);
    int numSources = 0;
    for (StageSpec stageSpec : spec.getStages()) {
        if (StreamingSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
            StreamingSource<Object> streamingSource = pluginContext.newPluginInstance(stageSpec.getName());
            numSources = numSources + streamingSource.getRequiredExecutors();
        }
    }
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.streaming.backpressure.enabled", "true");
    for (Map.Entry<String, String> property : spec.getProperties().entrySet()) {
        sparkConf.set(property.getKey(), property.getValue());
    }
    // Spark Streaming needs at least one thread (one core in distributed mode) per receiver, i.e. per
    // streaming source, plus capacity left over for processing the received data, so the local master
    // below has to be sized from the number of receivers; the exact value also depends on the
    // isUnitTest setting in the config.
    String extraOpts = spec.getExtraJavaOpts();
    if (extraOpts != null && !extraOpts.isEmpty()) {
        sparkConf.set("spark.driver.extraJavaOptions", extraOpts);
        sparkConf.set("spark.executor.extraJavaOptions", extraOpts);
    }
    // without this, stopping will hang on machines with few cores.
    sparkConf.set("spark.rpc.netty.dispatcher.numThreads", String.valueOf(numSources + 2));
    sparkConf.set("spark.executor.instances", String.valueOf(numSources + 2));
    sparkConf.setMaster(String.format("local[%d]", numSources + 2));
    if (spec.isUnitTest()) {
        sparkConf.setMaster(String.format("local[%d]", numSources + 1));
    }
    context.setSparkConf(sparkConf);
    if (!spec.isCheckpointsDisabled()) {
        // Each pipeline has its own checkpoint directory within the checkpoint fileset.
        // Ideally, when a pipeline is deleted, we would be able to delete that checkpoint directory.
        // This is because we don't want another pipeline created with the same name to pick up the old checkpoint.
        // Since CDAP has no way to run application logic on deletion, we instead generate a unique pipeline id
        // and use that as the checkpoint directory as a subdirectory inside the pipeline name directory.
        // On start, we check for any other pipeline ids for that pipeline name, and delete them if they exist.
        FileSet checkpointFileSet = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
        String pipelineName = context.getApplicationSpecification().getName();
        String checkpointDir = spec.getCheckpointDirectory();
        Location pipelineCheckpointBase = checkpointFileSet.getBaseLocation().append(pipelineName);
        Location pipelineCheckpointDir = pipelineCheckpointBase.append(checkpointDir);
        if (!ensureDirExists(pipelineCheckpointBase)) {
            throw new IOException(String.format("Unable to create checkpoint base directory '%s' for the pipeline.", pipelineCheckpointBase));
        }
        try {
            for (Location child : pipelineCheckpointBase.list()) {
                if (!child.equals(pipelineCheckpointDir) && !child.delete(true)) {
                    LOG.warn("Unable to delete checkpoint directory {} from an old pipeline.", child);
                }
            }
        } catch (Exception e) {
            LOG.warn("Unable to clean up old checkpoint directories from old pipelines.", e);
        }
        if (!ensureDirExists(pipelineCheckpointDir)) {
            throw new IOException(String.format("Unable to create checkpoint directory '%s' for the pipeline.", pipelineCheckpointDir));
        }
    }
    WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}
Also used : FileSet(co.cask.cdap.api.dataset.lib.FileSet) SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) IOException(java.io.IOException) IOException(java.io.IOException) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) StageSpec(co.cask.cdap.etl.spec.StageSpec) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) Location(org.apache.twill.filesystem.Location)
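
The local[...] sizing above follows a general Spark Streaming rule: each receiver permanently occupies one task slot, so a local master needs strictly more threads than receivers or no batches get processed. A standalone sketch of that sizing logic; the helper name and the +2 headroom are illustrative, not CDAP's exact policy:

import org.apache.spark.SparkConf;

public class StreamingConfSizing {
    /** Build a local-mode SparkConf sized for the given number of receivers. */
    public static SparkConf localStreamingConf(String appName, int numReceivers) {
        // One thread per receiver, plus headroom for the tasks that process the received batches.
        int threads = numReceivers + 2;
        return new SparkConf()
                .setAppName(appName)
                .setMaster(String.format("local[%d]", threads))
                // Let Spark throttle receivers when batch processing starts falling behind.
                .set("spark.streaming.backpressure.enabled", "true");
    }
}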

Example 79 with SparkConf

use of org.apache.spark.SparkConf in project gatk by broadinstitute.

the class SAMRecordToGATKReadAdapterSerializerUnitTest method testSerializerRoundTripHeaderlessRead.

@Test
public void testSerializerRoundTripHeaderlessRead() {
    SparkConf conf = new SparkConf().set("spark.kryo.registrator", "org.broadinstitute.hellbender.engine.spark.SAMRecordToGATKReadAdapterSerializerUnitTest$TestGATKRegistrator");
    // check round trip with no header
    GATKRead read = ArtificialReadUtils.createHeaderlessSamBackedRead("read1", "1", 100, 50);
    final GATKRead roundTrippedRead = SparkTestUtils.roundTripInKryo(read, GATKRead.class, conf);
    Assert.assertEquals(roundTrippedRead, read);
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) SparkConf(org.apache.spark.SparkConf) Test(org.testng.annotations.Test)
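
The class named in spark.kryo.registrator is not shown here; in general it is any implementation of org.apache.spark.serializer.KryoRegistrator. A hypothetical sketch of such a registrator (the class and the record type are illustrative, not GATK's actual TestGATKRegistrator):

import com.esotericsoftware.kryo.Kryo;
import org.apache.spark.serializer.KryoRegistrator;

public class MyKryoRegistrator implements KryoRegistrator {

    @Override
    public void registerClasses(Kryo kryo) {
        // Register application classes so Kryo can serialize them efficiently;
        // a custom Serializer can be passed as a second argument if needed.
        kryo.register(MyRecord.class);
    }

    /** Hypothetical domain class, used only for illustration. */
    public static class MyRecord implements java.io.Serializable {
        public String name;
        public int value;
    }
}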

Example 80 with SparkConf

use of org.apache.spark.SparkConf in project spark-dataflow by cloudera.

the class SparkContextFactory method createSparkContext.

private static JavaSparkContext createSparkContext(String master, String appName) {
    SparkConf conf = new SparkConf();
    conf.setMaster(master);
    conf.setAppName(appName);
    conf.set("spark.serializer", KryoSerializer.class.getCanonicalName());
    return new JavaSparkContext(conf);
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf) KryoSerializer(org.apache.spark.serializer.KryoSerializer)
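
Once the Kryo serializer is selected this way, SparkConf can also register classes up front via registerKryoClasses, which lets Kryo write compact class identifiers instead of full class names. A minimal sketch; the registered classes are illustrative:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.serializer.KryoSerializer;

public class KryoContextDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setMaster("local[2]")
                .setAppName("kryoDemo")
                .set("spark.serializer", KryoSerializer.class.getCanonicalName());
        // Pre-register classes that will cross the wire frequently.
        conf.registerKryoClasses(new Class<?>[] { java.util.ArrayList.class, String[].class });
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            System.out.println(jsc.getConf().get("spark.serializer"));
        }
    }
}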

Aggregations

SparkConf (org.apache.spark.SparkConf) 83
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 46
Test (org.junit.Test) 21
ArrayList (java.util.ArrayList) 20
Configuration (org.apache.hadoop.conf.Configuration) 20
Tuple2 (scala.Tuple2) 15
Graph (uk.gov.gchq.gaffer.graph.Graph) 13
DataOutputStream (java.io.DataOutputStream) 11
File (java.io.File) 10
HashSet (java.util.HashSet) 10
ByteArrayOutputStream (org.apache.commons.io.output.ByteArrayOutputStream) 10
Edge (uk.gov.gchq.gaffer.data.element.Edge) 10
Element (uk.gov.gchq.gaffer.data.element.Element) 10
Entity (uk.gov.gchq.gaffer.data.element.Entity) 10
User (uk.gov.gchq.gaffer.user.User) 10
Ignore (org.junit.Ignore) 6
HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration) 5
JavaHBaseContext (org.apache.hadoop.hbase.spark.JavaHBaseContext) 5
Test (org.testng.annotations.Test) 5
AddElements (uk.gov.gchq.gaffer.operation.impl.add.AddElements) 5