Search in sources :

Example 71 with Tuple2

use of scala.Tuple2 in project gatk by broadinstitute.

the class ReadsForQNamesFinder method call.

/**
 * Groups the incoming reads by interval id, keeping only reads whose name is present in
 * {@code qNamesMultiMap}.
 *
 * A read whose name maps to several intervals is converted once and the same
 * {@code FastqRead} instance is added to each of those intervals' lists.
 *
 * @param readsItr the reads to examine
 * @return one (interval id, reads) pair for every interval that received at least one read,
 *         in ascending interval-id order; empty if no read matched
 */
public Iterable<Tuple2<Integer, List<SVFastqUtils.FastqRead>>> call(final Iterator<GATKRead> readsItr) {
    @SuppressWarnings({ "unchecked", "rawtypes" }) final List<SVFastqUtils.FastqRead>[] readsByInterval = new List[nIntervals];
    int populatedCount = 0;
    while (readsItr.hasNext()) {
        final GATKRead read = readsItr.next();
        // Created lazily on the first matching interval, then shared by all further matches.
        SVFastqUtils.FastqRead converted = null;
        for (final Iterator<QNameAndInterval> hits = qNamesMultiMap.findEach(read.getName()); hits.hasNext(); ) {
            final int intervalId = hits.next().getIntervalId();
            List<SVFastqUtils.FastqRead> bucket = readsByInterval[intervalId];
            if (bucket == null) {
                bucket = new ArrayList<>(nReadsPerInterval);
                readsByInterval[intervalId] = bucket;
                populatedCount++;
            }
            if (converted == null) {
                // The sequence id is only computed when dumpFASTQs is set; otherwise the name is null.
                final String readName;
                if (dumpFASTQs) {
                    readName = SVFastqUtils.readToFastqSeqId(read, includeMappingLocation);
                } else {
                    readName = null;
                }
                converted = new SVFastqUtils.FastqRead(readName, read.getBases(), read.getBaseQualities());
            }
            bucket.add(converted);
        }
    }
    final List<Tuple2<Integer, List<SVFastqUtils.FastqRead>>> result = new ArrayList<>(populatedCount);
    if (populatedCount == 0) {
        return result;
    }
    for (int intervalId = 0; intervalId < nIntervals; intervalId++) {
        final List<SVFastqUtils.FastqRead> bucket = readsByInterval[intervalId];
        if (bucket != null) {
            result.add(new Tuple2<>(intervalId, bucket));
        }
    }
    return result;
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ArrayList(java.util.ArrayList) Tuple2(scala.Tuple2) List(java.util.List) ArrayList(java.util.ArrayList)

Example 72 with Tuple2

use of scala.Tuple2 in project gatk by broadinstitute.

the class RunSGAViaProcessBuilderOnSpark method writeToLocal.

/**
 * Utility function that unloads the FASTQ contents for a breakpoint to a local file for later consumption by SGA.
 *
 * The file is written into a freshly created temporary directory and keeps the file name found in the input path.
 * The contents are written as UTF-8 so the bytes on disk do not depend on the platform default charset.
 *
 * @param oneBreakPoint        input for one breakpoint, where the first is the path to the FASTQ file and the second is the FASTQ file's content
 * @param subStringToStripout  text removed from the base name of the FASTQ file; what is left must parse as the {@code long} breakpoint ID
 * @return                     the breakpoint ID and with the FASTQ file contents dumped to a local File
 * @throws IOException            if fails to create the temporary directory or fails to write to local file
 * @throws NumberFormatException  if the base name, after removing {@code subStringToStripout}, is not a valid {@code long}
 */
@VisibleForTesting
static Tuple2<Long, File> writeToLocal(final Tuple2<String, String> oneBreakPoint, final String subStringToStripout) throws IOException {
    final String fastqPath = oneBreakPoint._1();
    final String fastqFilename = FilenameUtils.getName(fastqPath);
    final File localTempWorkingDir = Files.createTempDirectory(fastqFilename + "_").toAbsolutePath().toFile();
    // NOTE(review): File deletion only succeeds on an empty directory, so this cleanup has no effect
    // while the FASTQ file (or anything else) is still inside at JVM exit.
    localTempWorkingDir.deleteOnExit();
    final File localFASTQFile = new File(localTempWorkingDir, fastqFilename);
    // Explicit charset: the two-argument overload is deprecated and uses the platform default encoding.
    FileUtils.writeStringToFile(localFASTQFile, oneBreakPoint._2(), java.nio.charset.StandardCharsets.UTF_8);
    final Long breakpointID = Long.valueOf(FilenameUtils.getBaseName(fastqPath).replace(subStringToStripout, ""));
    return new Tuple2<>(breakpointID, localFASTQFile);
}
Also used : Tuple2(scala.Tuple2) File(java.io.File) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 73 with Tuple2

use of scala.Tuple2 in project gatk by broadinstitute.

the class ShuffleJoinReadsWithRefBases method addBases.

/**
 * Pairs every read key of a {@code JavaPairRDD<GATKRead, T>} with the reference bases covering that read's
 * reference window, keeping the original value alongside.
 *
 * Reads are first keyed by the {@code ReferenceShard} of their window and grouped; for each group the
 * reference is fetched once over the interval spanning all windows in the group, and each read then
 * receives the subset matching its own window.
 *
 * @param referenceDataflowSource The source of the reference sequence information
 * @param keyedByRead The read-keyed RDD for which to extract reference sequence information
 * @return The JavaPairRDD that contains each read along with the corresponding ReferenceBases object and the value
 */
public static <T> JavaPairRDD<GATKRead, Tuple2<T, ReferenceBases>> addBases(final ReferenceMultiSource referenceDataflowSource, final JavaPairRDD<GATKRead, T> keyedByRead) {
    final SerializableFunction<GATKRead, SimpleInterval> toWindow = referenceDataflowSource.getReferenceWindowFunction();

    // Key each (read, value) pair by the reference shard of the read's window.
    final JavaPairRDD<ReferenceShard, Tuple2<GATKRead, T>> keyedByShard = keyedByRead.mapToPair(
            readAndValue -> new Tuple2<>(ReferenceShard.getShardNumberFromInterval(toWindow.apply(readAndValue._1())), readAndValue));

    final JavaPairRDD<ReferenceShard, Iterable<Tuple2<GATKRead, T>>> groupedByShard = keyedByShard.groupByKey();

    return groupedByShard.flatMapToPair(shardAndReads -> {
        final Iterable<Tuple2<GATKRead, T>> shardReads = shardAndReads._2();

        // One reference lookup per group, over the interval spanning every read window in it.
        final List<SimpleInterval> windows = Utils.stream(shardReads)
                .map(readAndValue -> toWindow.apply(readAndValue._1()))
                .collect(Collectors.toList());
        final SimpleInterval spanning = IntervalUtils.getSpanningInterval(windows);
        final ReferenceBases spanningBases = referenceDataflowSource.getReferenceBases(null, spanning);

        final List<Tuple2<GATKRead, Tuple2<T, ReferenceBases>>> joined = Lists.newArrayList();
        for (final Tuple2<GATKRead, T> readAndValue : shardReads) {
            final GATKRead read = readAndValue._1();
            final ReferenceBases readBases = spanningBases.getSubset(toWindow.apply(read));
            joined.add(new Tuple2<>(read, new Tuple2<>(readAndValue._2(), readBases)));
        }
        return joined.iterator();
    });
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) List(java.util.List) Lists(com.google.common.collect.Lists) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) Utils(org.broadinstitute.hellbender.utils.Utils) StreamSupport(java.util.stream.StreamSupport) SerializableFunction(org.broadinstitute.hellbender.utils.SerializableFunction) ReferenceShard(org.broadinstitute.hellbender.engine.ReferenceShard) JavaRDD(org.apache.spark.api.java.JavaRDD) ReferenceShard(org.broadinstitute.hellbender.engine.ReferenceShard) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) Tuple2(scala.Tuple2) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 74 with Tuple2

use of scala.Tuple2 in project gatk by broadinstitute.

the class ShuffleJoinReadsWithVariants method pairReadsWithVariantShards.

/**
 * Emits one (shard, read) pair for every {@code VariantShard} returned by
 * {@code VariantShard.getVariantShardsFromInterval} for the read, so a read appears once per shard.
 *
 * @param reads the reads to key by variant shard
 * @return the reads keyed by each of their variant shards
 */
private static JavaPairRDD<VariantShard, GATKRead> pairReadsWithVariantShards(final JavaRDD<GATKRead> reads) {
    return reads.flatMapToPair(gatkRead -> {
        final List<VariantShard> readShards = VariantShard.getVariantShardsFromInterval(gatkRead);
        return readShards.stream()
                .map(readShard -> new Tuple2<>(readShard, gatkRead))
                .iterator();
    });
}
Also used : VariantShard(org.broadinstitute.hellbender.engine.VariantShard) Tuple2(scala.Tuple2)

Example 75 with Tuple2

use of scala.Tuple2 in project gatk by broadinstitute.

the class BroadcastJoinReadsWithRefBases method addBases.

/**
 * Pairs each read of a {@code JavaRDD<GATKRead>} with the reference bases for that read's reference window.
 *
 * The reference source is broadcast through a {@code JavaSparkContext} built from the RDD's context, and
 * each read looks up its own window against the broadcast value.
 *
 * @param referenceDataflowSource The source of the reference sequence information
 * @param reads The reads for which to extract reference sequence information
 * @return The JavaPairRDD that contains each read along with the corresponding ReferenceBases object
 */
public static JavaPairRDD<GATKRead, ReferenceBases> addBases(final ReferenceMultiSource referenceDataflowSource, final JavaRDD<GATKRead> reads) {
    final JavaSparkContext sparkContext = new JavaSparkContext(reads.context());
    final Broadcast<ReferenceMultiSource> referenceBroadcast = sparkContext.broadcast(referenceDataflowSource);
    return reads.mapToPair(read -> {
        final ReferenceMultiSource source = referenceBroadcast.getValue();
        final SimpleInterval window = source.getReferenceWindowFunction().apply(read);
        return new Tuple2<>(read, source.getReferenceBases(null, window));
    });
}
Also used : ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) Tuple2(scala.Tuple2) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)

Aggregations

Tuple2 (scala.Tuple2)181 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)57 ArrayList (java.util.ArrayList)43 IOException (java.io.IOException)32 Test (org.junit.Test)32 INDArray (org.nd4j.linalg.api.ndarray.INDArray)28 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)23 List (java.util.List)22 Function (org.apache.spark.api.java.function.Function)19 File (java.io.File)18 Collectors (java.util.stream.Collectors)18 GATKException (org.broadinstitute.hellbender.exceptions.GATKException)18 Configuration (org.apache.hadoop.conf.Configuration)17 UserException (org.broadinstitute.hellbender.exceptions.UserException)17 Broadcast (org.apache.spark.broadcast.Broadcast)16 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)16 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)16 SparkConf (org.apache.spark.SparkConf)15 JavaRDD (org.apache.spark.api.java.JavaRDD)15 VisibleForTesting (com.google.common.annotations.VisibleForTesting)14