Example 51 with JavaRDD

use of org.apache.spark.api.java.JavaRDD in project spark-dataflow by cloudera.

the class StreamingTransformTranslator method window.

private static <T, W extends BoundedWindow> TransformEvaluator<Window.Bound<T>> window() {
    return new TransformEvaluator<Window.Bound<T>>() {

        @Override
        public void evaluate(Window.Bound<T> transform, EvaluationContext context) {
            StreamingEvaluationContext sec = (StreamingEvaluationContext) context;
            //--- first we apply windowing to the stream
            WindowFn<? super T, W> windowFn = WINDOW_FG.get("windowFn", transform);
            @SuppressWarnings("unchecked") JavaDStream<WindowedValue<T>> dStream = (JavaDStream<WindowedValue<T>>) sec.getStream(transform);
            if (windowFn instanceof FixedWindows) {
                Duration windowDuration = Durations.milliseconds(((FixedWindows) windowFn).getSize().getMillis());
                sec.setStream(transform, dStream.window(windowDuration));
            } else if (windowFn instanceof SlidingWindows) {
                Duration windowDuration = Durations.milliseconds(((SlidingWindows) windowFn).getSize().getMillis());
                Duration slideDuration = Durations.milliseconds(((SlidingWindows) windowFn).getPeriod().getMillis());
                sec.setStream(transform, dStream.window(windowDuration, slideDuration));
            }
            //--- then we apply windowing to the elements
            DoFn<T, T> addWindowsDoFn = new AssignWindowsDoFn<>(windowFn);
            DoFnFunction<T, T> dofn = new DoFnFunction<>(addWindowsDoFn, sec.getRuntimeContext(), null);
            @SuppressWarnings("unchecked") JavaDStreamLike<WindowedValue<T>, ?, JavaRDD<WindowedValue<T>>> dstream = (JavaDStreamLike<WindowedValue<T>, ?, JavaRDD<WindowedValue<T>>>) sec.getStream(transform);
            sec.setStream(transform, dstream.mapPartitions(dofn));
        }
    };
}
Also used : BoundedWindow(com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow) Window(com.google.cloud.dataflow.sdk.transforms.windowing.Window) FixedWindows(com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows) Duration(org.apache.spark.streaming.Duration) AssignWindowsDoFn(com.google.cloud.dataflow.sdk.util.AssignWindowsDoFn) JavaDStream(org.apache.spark.streaming.api.java.JavaDStream) TransformEvaluator(com.cloudera.dataflow.spark.TransformEvaluator) JavaRDD(org.apache.spark.api.java.JavaRDD) DoFnFunction(com.cloudera.dataflow.spark.DoFnFunction) WindowedValue(com.google.cloud.dataflow.sdk.util.WindowedValue) JavaDStreamLike(org.apache.spark.streaming.api.java.JavaDStreamLike) EvaluationContext(com.cloudera.dataflow.spark.EvaluationContext) SlidingWindows(com.google.cloud.dataflow.sdk.transforms.windowing.SlidingWindows)
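
For reference, here is a minimal, self-contained sketch of the two Spark Streaming calls the translator above dispatches to: the FixedWindows branch uses the one-argument dStream.window(length), while the SlidingWindows branch uses window(length, slide). The socket source, host/port, and durations below are illustrative assumptions, not part of spark-dataflow.

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class WindowSketch {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("WindowSketch");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));
        JavaDStream<String> lines = jssc.socketTextStream("localhost", 9999);
        // FixedWindows branch: one-arg window(length); the slide defaults to the batch interval.
        JavaDStream<String> fixed = lines.window(Durations.seconds(10));
        // SlidingWindows branch: window(length, slide), here a 10s window recomputed every 5s.
        JavaDStream<String> sliding = lines.window(Durations.seconds(10), Durations.seconds(5));
        fixed.print();
        sliding.print();
        jssc.start();
        jssc.awaitTermination();
    }
}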

Example 52 with JavaRDD

use of org.apache.spark.api.java.JavaRDD in project gatk by broadinstitute.

the class SparkSharderUnitTest method testContigBoundary.

@Test
public void testContigBoundary() throws IOException {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    // Consider the following reads (all in a single partition), and intervals.
    // This test counts the number of reads that overlap each interval.
    //                      1                   2
    //    1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7
    // ---------------------------------------------------------
    // Reads in partition 0
    //   [-----] chr 1
    //           [-----] chr 1
    //               [-----] chr 1
    //   [-----] chr 2
    //     [-----] chr 2
    // ---------------------------------------------------------
    // Per-partition read extents
    //   [-----------------] chr 1
    //   [-------] chr 2
    // ---------------------------------------------------------
    // Intervals
    //     [-----] chr 1
    //                 [---------] chr 1
    //   [-----------------------] chr 2
    // ---------------------------------------------------------
    //    1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7
    JavaRDD<TestRead> reads = ctx.parallelize(ImmutableList.of(
            new TestRead("1", 1, 3), new TestRead("1", 5, 7), new TestRead("1", 7, 9),
            new TestRead("2", 1, 3), new TestRead("2", 2, 4)), 1);
    List<SimpleInterval> intervals = ImmutableList.of(new SimpleInterval("1", 2, 4), new SimpleInterval("1", 8, 12), new SimpleInterval("2", 1, 12));
    List<ShardBoundary> shardBoundaries = intervals.stream().map(si -> new ShardBoundary(si, si)).collect(Collectors.toList());
    ImmutableMap<SimpleInterval, Integer> expectedReadsPerInterval = ImmutableMap.of(intervals.get(0), 1, intervals.get(1), 1, intervals.get(2), 2);
    JavaPairRDD<Locatable, Integer> readsPerInterval =
            SparkSharder.shard(ctx, reads, TestRead.class, sequenceDictionary, shardBoundaries, STANDARD_READ_LENGTH, false)
                    .flatMapToPair(new CountOverlappingReadsFunction());
    assertEquals(readsPerInterval.collectAsMap(), expectedReadsPerInterval);
    JavaPairRDD<Locatable, Integer> readsPerIntervalShuffle =
            SparkSharder.shard(ctx, reads, TestRead.class, sequenceDictionary, shardBoundaries, STANDARD_READ_LENGTH, true)
                    .flatMapToPair(new CountOverlappingReadsFunction());
    assertEquals(readsPerIntervalShuffle.collectAsMap(), expectedReadsPerInterval);
}
Also used : Locatable(htsjdk.samtools.util.Locatable) OverlapDetector(htsjdk.samtools.util.OverlapDetector) java.util(java.util) PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Assert.assertEquals(org.testng.Assert.assertEquals) Test(org.testng.annotations.Test) IOException(java.io.IOException) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) Shard(org.broadinstitute.hellbender.engine.Shard) Serializable(java.io.Serializable) UserException(org.broadinstitute.hellbender.exceptions.UserException) ShardBoundary(org.broadinstitute.hellbender.engine.ShardBoundary) Assert.assertTrue(org.testng.Assert.assertTrue) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) com.google.common.collect(com.google.common.collect) Assert.assertFalse(org.testng.Assert.assertFalse) JavaRDD(org.apache.spark.api.java.JavaRDD) ShardBoundary(org.broadinstitute.hellbender.engine.ShardBoundary) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Locatable(htsjdk.samtools.util.Locatable) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)
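
SparkSharder is project-specific machinery; as a rough baseline for what the test verifies, here is a naive, self-contained overlap count over the same coordinates using only core JavaRDD operations. The Span class and local-mode setup are illustrative assumptions, and the cross-product approach is fine for a test but not for genome-scale data.

import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class NaiveOverlapCount {
    // Hypothetical stand-in for a read or interval: contig plus 1-based inclusive coordinates.
    static final class Span implements java.io.Serializable {
        final String contig; final int start; final int end;
        Span(String contig, int start, int end) { this.contig = contig; this.start = start; this.end = end; }
        boolean overlaps(Span other) {
            return contig.equals(other.contig) && start <= other.end && other.start <= end;
        }
        @Override public String toString() { return contig + ":" + start + "-" + end; }
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[4]").setAppName("NaiveOverlapCount");
        try (JavaSparkContext ctx = new JavaSparkContext(conf)) {
            JavaRDD<Span> reads = ctx.parallelize(Arrays.asList(
                    new Span("1", 1, 3), new Span("1", 5, 7), new Span("1", 7, 9),
                    new Span("2", 1, 3), new Span("2", 2, 4)));
            List<Span> intervals = Arrays.asList(
                    new Span("1", 2, 4), new Span("1", 8, 12), new Span("2", 1, 12));
            // Check every read against every interval, then sum the hits per interval.
            JavaPairRDD<String, Integer> counts = reads
                    .flatMapToPair(read -> intervals.stream()
                            .filter(read::overlaps)
                            .map(i -> new Tuple2<>(i.toString(), 1))
                            .iterator())
                    .reduceByKey(Integer::sum);
            // Matches the expected counts above: {1:2-4=1, 1:8-12=1, 2:1-12=2}
            System.out.println(counts.collectAsMap());
        }
    }
}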

Example 53 with JavaRDD

use of org.apache.spark.api.java.JavaRDD in project gatk by broadinstitute.

the class JoinReadsWithVariantsSparkUnitTest method pairReadsAndVariantsTest.

@Test(dataProvider = "pairedReadsAndVariants", groups = "spark")
public void pairReadsAndVariantsTest(List<GATKRead> reads, List<GATKVariant> variantList, List<KV<GATKRead, Iterable<GATKVariant>>> kvReadiVariant, JoinStrategy joinStrategy) {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);
    JavaRDD<GATKVariant> rddVariants = ctx.parallelize(variantList);
    JavaPairRDD<GATKRead, Iterable<GATKVariant>> actual = joinStrategy == JoinStrategy.SHUFFLE
            ? ShuffleJoinReadsWithVariants.join(rddReads, rddVariants)
            : BroadcastJoinReadsWithVariants.join(rddReads, rddVariants);
    Map<GATKRead, Iterable<GATKVariant>> gatkReadIterableMap = actual.collectAsMap();
    Assert.assertEquals(gatkReadIterableMap.size(), kvReadiVariant.size());
    for (KV<GATKRead, Iterable<GATKVariant>> kv : kvReadiVariant) {
        List<GATKVariant> variants = Lists.newArrayList(gatkReadIterableMap.get(kv.getKey()));
        Assert.assertTrue(variants.stream().noneMatch(v -> v == null));
        HashSet<GATKVariant> hashVariants = new LinkedHashSet<>(variants);
        final Iterable<GATKVariant> iVariants = kv.getValue();
        HashSet<GATKVariant> expectedHashVariants = Sets.newLinkedHashSet(iVariants);
        Assert.assertEquals(hashVariants, expectedHashVariants);
    }
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) java.util(java.util) DataProvider(org.testng.annotations.DataProvider) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) KV(com.google.cloud.dataflow.sdk.values.KV) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Test(org.testng.annotations.Test) Read(com.google.api.services.genomics.model.Read) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) Sets(com.google.common.collect.Sets) SAMRecord(htsjdk.samtools.SAMRecord) Lists(com.google.common.collect.Lists) Assert(org.testng.Assert) JavaRDD(org.apache.spark.api.java.JavaRDD) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)
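
For contrast with the two strategies exercised above, here is a compact, self-contained sketch of a shuffle join versus a hand-rolled broadcast join on plain string keys; all names and values are illustrative assumptions, not GATK code. A shuffle join repartitions both sides by key across the network, while a broadcast join ships the small side to every executor and joins map-side.

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import scala.Tuple2;

public class JoinStrategiesSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("JoinStrategiesSketch");
        try (JavaSparkContext ctx = new JavaSparkContext(conf)) {
            JavaPairRDD<String, String> reads = ctx.parallelizePairs(Arrays.asList(
                    new Tuple2<>("chr1:100", "read1"), new Tuple2<>("chr1:200", "read2")));
            List<Tuple2<String, String>> variants = Arrays.asList(
                    new Tuple2<>("chr1:100", "snpA"), new Tuple2<>("chr1:300", "snpB"));

            // SHUFFLE strategy: both sides are repartitioned by key.
            JavaPairRDD<String, Tuple2<String, String>> shuffled =
                    reads.join(ctx.parallelizePairs(variants));

            // BROADCAST strategy: ship the small side everywhere, join map-side with no shuffle.
            Broadcast<Map<String, String>> small = ctx.broadcast(
                    variants.stream().collect(Collectors.toMap(t -> t._1(), t -> t._2())));
            JavaPairRDD<String, Tuple2<String, String>> broadcastJoined = reads
                    .filter(t -> small.value().containsKey(t._1()))
                    .mapToPair(t -> new Tuple2<>(t._1(),
                            new Tuple2<>(t._2(), small.value().get(t._1()))));

            System.out.println(shuffled.collect());        // both print [(chr1:100,(read1,snpA))]
            System.out.println(broadcastJoined.collect());
        }
    }
}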

Example 54 with JavaRDD

use of org.apache.spark.api.java.JavaRDD in project gatk by broadinstitute.

the class FindBreakpointEvidenceSpark method handleAssemblies.

/**
     * Transform all the reads for a supplied set of template names in each interval into FASTQ records
     * for each interval, and do something with the list of FASTQ records for each interval (like write it to a file).
     */
@VisibleForTesting
static List<AlignedAssemblyOrExcuse> handleAssemblies(final JavaSparkContext ctx,
        final HopscotchUniqueMultiMap<String, Integer, QNameAndInterval> qNamesMultiMap,
        final JavaRDD<GATKRead> reads, final int nIntervals,
        final boolean includeMappingLocation, final boolean dumpFASTQs,
        final LocalAssemblyHandler localAssemblyHandler) {
    final Broadcast<HopscotchUniqueMultiMap<String, Integer, QNameAndInterval>> broadcastQNamesMultiMap = ctx.broadcast(qNamesMultiMap);
    final List<AlignedAssemblyOrExcuse> intervalDispositions = reads
            // convert the reads in each partition into per-interval FASTQ-record lists
            .mapPartitionsToPair(readItr ->
                    new ReadsForQNamesFinder(broadcastQNamesMultiMap.value(), nIntervals,
                            includeMappingLocation, dumpFASTQs).call(readItr).iterator(), false)
            // merge the per-partition lists for each interval, one hash bucket per interval
            .combineByKey(x -> x,
                    FindBreakpointEvidenceSpark::combineLists,
                    FindBreakpointEvidenceSpark::combineLists,
                    new HashPartitioner(nIntervals), false, null)
            // hand each interval's complete FASTQ list to the local assembler
            .map(localAssemblyHandler::apply)
            .collect();
    broadcastQNamesMultiMap.destroy();
    BwaMemIndexSingleton.closeAllDistributedInstances(ctx);
    return intervalDispositions;
}
Also used : BwaMemIndexSingleton(org.broadinstitute.hellbender.utils.bwa.BwaMemIndexSingleton) IntStream(java.util.stream.IntStream) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) java.util(java.util) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) ArgumentCollection(org.broadinstitute.barclay.argparser.ArgumentCollection) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) BwaMemAlignment(org.broadinstitute.hellbender.utils.bwa.BwaMemAlignment) Function(java.util.function.Function) BwaMemAligner(org.broadinstitute.hellbender.utils.bwa.BwaMemAligner) HopscotchUniqueMultiMap(org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMap) FermiLiteAssembly(org.broadinstitute.hellbender.utils.fermi.FermiLiteAssembly) FermiLiteAssembler(org.broadinstitute.hellbender.utils.fermi.FermiLiteAssembler) BucketUtils(org.broadinstitute.hellbender.utils.gcs.BucketUtils) JavaRDD(org.apache.spark.api.java.JavaRDD) Broadcast(org.apache.spark.broadcast.Broadcast) HashPartitioner(org.apache.spark.HashPartitioner) GATKSparkTool(org.broadinstitute.hellbender.engine.spark.GATKSparkTool) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) StructuralVariationSparkProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariationSparkProgramGroup) FindBreakpointEvidenceSparkArgumentCollection(org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.FindBreakpointEvidenceSparkArgumentCollection) Logger(org.apache.logging.log4j.Logger) UserException(org.broadinstitute.hellbender.exceptions.UserException) java.io(java.io) Utils(org.broadinstitute.hellbender.utils.Utils) VisibleForTesting(com.google.common.annotations.VisibleForTesting) htsjdk.samtools(htsjdk.samtools) MapPartitioner(org.broadinstitute.hellbender.tools.spark.utils.MapPartitioner) LogManager(org.apache.logging.log4j.LogManager) HashPartitioner(org.apache.spark.HashPartitioner) HopscotchUniqueMultiMap(org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMap) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
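
The pipeline above hinges on combineByKey with a HashPartitioner sized to the number of intervals, so each interval's pieces end up merged in one bucket. A minimal, runnable sketch of that pattern with toy data follows; the integer keys stand in for interval ids, and all names are illustrative assumptions.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.HashPartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class CombineByKeySketch {
    static List<String> combineLists(List<String> a, List<String> b) {
        List<String> merged = new ArrayList<>(a);
        merged.addAll(b);
        return merged;
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("CombineByKeySketch");
        try (JavaSparkContext ctx = new JavaSparkContext(conf)) {
            // Keys play the role of interval ids; values are per-partition record lists.
            JavaPairRDD<Integer, List<String>> pieces = ctx.parallelizePairs(Arrays.asList(
                    new Tuple2<>(0, Arrays.asList("a", "b")),
                    new Tuple2<>(1, Arrays.asList("c")),
                    new Tuple2<>(0, Arrays.asList("d"))), 2);
            JavaPairRDD<Integer, List<String>> combined = pieces.combineByKey(
                    ArrayList::new,                       // createCombiner: copy the first list
                    CombineByKeySketch::combineLists,     // mergeValue within a partition
                    CombineByKeySketch::combineLists,     // mergeCombiners across partitions
                    new HashPartitioner(2));              // one hash bucket per "interval"
            System.out.println(combined.collectAsMap()); // e.g. {0=[a, b, d], 1=[c]}
        }
    }
}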

Example 55 with JavaRDD

use of org.apache.spark.api.java.JavaRDD in project gatk by broadinstitute.

the class SortReadFileSpark method runTool.

@Override
protected void runTool(final JavaSparkContext ctx) {
    JavaRDD<GATKRead> reads = getReads();
    int numReducers = getRecommendedNumReducers();
    logger.info("Using " + numReducers + " reducers");
    final SAMFileHeader readsHeader = getHeaderForReads();
    ReadCoordinateComparator comparator = new ReadCoordinateComparator(readsHeader);
    JavaRDD<GATKRead> sortedReads;
    if (shardedOutput) {
        sortedReads = reads.mapToPair(read -> new Tuple2<>(read, null)).sortByKey(comparator, true, numReducers).keys();
    } else {
        // sorting is done by writeReads below
        sortedReads = reads;
    }
    readsHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    writeReads(ctx, outputFile, sortedReads);
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) SparkProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.SparkProgramGroup) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) GATKSparkTool(org.broadinstitute.hellbender.engine.spark.GATKSparkTool) ReadCoordinateComparator(org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Tuple2(scala.Tuple2) SAMFileHeader(htsjdk.samtools.SAMFileHeader) List(java.util.List) Collections(java.util.Collections) JavaRDD(org.apache.spark.api.java.JavaRDD) ReadFilterLibrary(org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary) Tuple2(scala.Tuple2) ReadCoordinateComparator(org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator) SAMFileHeader(htsjdk.samtools.SAMFileHeader)
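
A minimal, self-contained sketch of the same sortByKey pattern with a custom comparator; the comparator and data here are illustrative assumptions, not ReadCoordinateComparator. The comparator has to be Serializable so Spark can ship it to the executors that perform the range-partitioned sort.

import java.io.Serializable;
import java.util.Arrays;
import java.util.Comparator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class SortByComparatorSketch {
    // Serializable comparator: shortest key first, ties broken alphabetically.
    static final class LengthThenAlpha implements Comparator<String>, Serializable {
        @Override public int compare(String a, String b) {
            int byLength = Integer.compare(a.length(), b.length());
            return byLength != 0 ? byLength : a.compareTo(b);
        }
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("SortByComparatorSketch");
        try (JavaSparkContext ctx = new JavaSparkContext(conf)) {
            JavaPairRDD<String, Integer> pairs = ctx.parallelizePairs(Arrays.asList(
                    new Tuple2<>("ccc", 3), new Tuple2<>("a", 1), new Tuple2<>("bb", 2)));
            // Ascending sort across 2 reducers, then drop the values, as in runTool above.
            System.out.println(pairs.sortByKey(new LengthThenAlpha(), true, 2).keys().collect());
            // prints [a, bb, ccc]
        }
    }
}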

Aggregations

JavaRDD (org.apache.spark.api.java.JavaRDD): 63
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 33
List (java.util.List): 24
GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead): 24
Collectors (java.util.stream.Collectors): 20
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 20
Tuple2 (scala.Tuple2): 20
Argument (org.broadinstitute.barclay.argparser.Argument): 17
Broadcast (org.apache.spark.broadcast.Broadcast): 15
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 15
SAMFileHeader (htsjdk.samtools.SAMFileHeader): 14
SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary): 14
IOException (java.io.IOException): 14
UserException (org.broadinstitute.hellbender.exceptions.UserException): 14
CommandLineProgramProperties (org.broadinstitute.barclay.argparser.CommandLineProgramProperties): 13
GATKSparkTool (org.broadinstitute.hellbender.engine.spark.GATKSparkTool): 13
Serializable (java.io.Serializable): 12
IntervalUtils (org.broadinstitute.hellbender.utils.IntervalUtils): 12
java.util (java.util): 11
ArrayList (java.util.ArrayList): 11