Use of scala.Tuple2 in project gatk by broadinstitute.
Class ReadsForQNamesFinder, method call.
/**
 * Collects, per interval, a FASTQ record for every read whose name has entries in {@code qNamesMultiMap}.
 *
 * @param readsItr the reads to examine
 * @return one (interval id, reads) pair for each interval that received at least one read, in ascending
 *         interval-id order; an empty list when no read matched any interval
 */
public Iterable<Tuple2<Integer, List<SVFastqUtils.FastqRead>>> call(final Iterator<GATKRead> readsItr) {
    // Generic arrays cannot be created directly, hence the raw-typed allocation.
    @SuppressWarnings({ "unchecked", "rawtypes" })
    final List<SVFastqUtils.FastqRead>[] readsByInterval = new List[nIntervals];
    int populatedCount = 0;

    while (readsItr.hasNext()) {
        final GATKRead read = readsItr.next();
        // Built lazily on the first hit, then shared by every interval that claims this read.
        SVFastqUtils.FastqRead converted = null;
        for (final Iterator<QNameAndInterval> hits = qNamesMultiMap.findEach(read.getName()); hits.hasNext(); ) {
            final int intervalId = hits.next().getIntervalId();
            List<SVFastqUtils.FastqRead> bucket = readsByInterval[intervalId];
            if (bucket == null) {
                bucket = new ArrayList<>(nReadsPerInterval);
                readsByInterval[intervalId] = bucket;
                populatedCount++;
            }
            if (converted == null) {
                final String seqId;
                if (dumpFASTQs) {
                    seqId = SVFastqUtils.readToFastqSeqId(read, includeMappingLocation);
                } else {
                    seqId = null;
                }
                converted = new SVFastqUtils.FastqRead(seqId, read.getBases(), read.getBaseQualities());
            }
            bucket.add(converted);
        }
    }

    final List<Tuple2<Integer, List<SVFastqUtils.FastqRead>>> populated = new ArrayList<>(populatedCount);
    if (populatedCount == 0) {
        return populated;
    }
    for (int intervalId = 0; intervalId < nIntervals; intervalId++) {
        final List<SVFastqUtils.FastqRead> bucket = readsByInterval[intervalId];
        if (bucket != null) {
            populated.add(new Tuple2<>(intervalId, bucket));
        }
    }
    return populated;
}
Use of scala.Tuple2 in project gatk by broadinstitute.
Class RunSGAViaProcessBuilderOnSpark, method writeToLocal.
/**
 * Utility function that unloads the FASTQ contents for a breakpoint to a local file for later consumption by SGA.
 * The file is written, UTF-8 encoded, into a freshly created temporary directory; both the file and the
 * directory are registered for deletion when the JVM exits.
 *
 * @param oneBreakPoint input for one breakpoint, where the first is the path to the FASTQ file and the second is the FASTQ file's content
 * @param subStringToStripout text removed from the FASTQ file's base name; what remains is parsed as the breakpoint ID
 * @return the breakpoint ID and with the FASTQ file contents dumped to a local File
 * @throws IOException if fails to create the temporary directory or fails to write to local file
 * @throws NumberFormatException if the base name, after stripping {@code subStringToStripout}, is not a valid long
 */
@VisibleForTesting
static Tuple2<Long, File> writeToLocal(final Tuple2<String, String> oneBreakPoint, final String subStringToStripout) throws IOException {
    final String fastqPath = oneBreakPoint._1();

    // Parse the ID before touching the file system so a malformed name does not leave temp files behind.
    final Long breakpointID = Long.valueOf(FilenameUtils.getBaseName(fastqPath).replace(subStringToStripout, ""));

    final String fastqFilename = FilenameUtils.getName(fastqPath);
    final File localTempWorkingDir = Files.createTempDirectory(fastqFilename + "_").toAbsolutePath().toFile();
    localTempWorkingDir.deleteOnExit();

    final File localFASTQFile = new File(localTempWorkingDir, fastqFilename);
    // Exit-time deletion runs in reverse registration order and cannot remove a non-empty directory,
    // so the file must be registered (after the directory) for the directory cleanup to succeed.
    localFASTQFile.deleteOnExit();

    // Explicit charset: the two-argument overload is deprecated and depends on the platform default encoding.
    FileUtils.writeStringToFile(localFASTQFile, oneBreakPoint._2(), java.nio.charset.StandardCharsets.UTF_8);

    return new Tuple2<>(breakpointID, localFASTQFile);
}
Use of scala.Tuple2 in project gatk by broadinstitute.
Class ShuffleJoinReadsWithRefBases, method addBases.
/**
 * Pairs every read of a read-keyed RDD with the reference bases covering that read's reference window,
 * carrying the read's original value along.
 * <p>
 * Reads are grouped by the {@link ReferenceShard} of their window. For each group the reference is queried
 * once, for the interval spanning all of the group's windows, and the result is then subset per read.
 *
 * @param referenceDataflowSource The source of the reference sequence information
 * @param keyedByRead The read-keyed RDD for which to extract reference sequence information
 * @return The JavaPairRDD that contains each read along with its value and the corresponding ReferenceBases object
 */
public static <T> JavaPairRDD<GATKRead, Tuple2<T, ReferenceBases>> addBases(final ReferenceMultiSource referenceDataflowSource, final JavaPairRDD<GATKRead, T> keyedByRead) {
    final SerializableFunction<GATKRead, SimpleInterval> toWindow = referenceDataflowSource.getReferenceWindowFunction();

    // Key each (read, value) pair by the shard of the read's reference window.
    final JavaPairRDD<ReferenceShard, Tuple2<GATKRead, T>> keyedByShard = keyedByRead.mapToPair(
            readAndValue -> new Tuple2<>(
                    ReferenceShard.getShardNumberFromInterval(toWindow.apply(readAndValue._1())),
                    readAndValue));
    final JavaPairRDD<ReferenceShard, Iterable<Tuple2<GATKRead, T>>> groupedByShard = keyedByShard.groupByKey();

    return groupedByShard.flatMapToPair(shardAndMembers -> {
        final Iterable<Tuple2<GATKRead, T>> members = shardAndMembers._2();

        // Gather every member's window so a single reference query can cover the whole group.
        final List<SimpleInterval> windows = Lists.newArrayList();
        for (final Tuple2<GATKRead, T> member : members) {
            windows.add(toWindow.apply(member._1()));
        }
        final ReferenceBases spanningBases =
                referenceDataflowSource.getReferenceBases(null, IntervalUtils.getSpanningInterval(windows));

        final List<Tuple2<GATKRead, Tuple2<T, ReferenceBases>>> joined = Lists.newArrayList();
        for (final Tuple2<GATKRead, T> member : members) {
            final GATKRead read = member._1();
            final ReferenceBases readBases = spanningBases.getSubset(toWindow.apply(read));
            joined.add(new Tuple2<>(read, new Tuple2<>(member._2(), readBases)));
        }
        return joined.iterator();
    });
}
Use of scala.Tuple2 in project gatk by broadinstitute.
Class ShuffleJoinReadsWithVariants, method pairReadsWithVariantShards.
/**
 * Keys each read by every variant shard that {@code VariantShard.getVariantShardsFromInterval} reports for it.
 * A read for which several shards are reported appears once per shard in the result.
 *
 * @param reads the reads to key
 * @return an RDD of (shard, read) pairs
 */
private static JavaPairRDD<VariantShard, GATKRead> pairReadsWithVariantShards(final JavaRDD<GATKRead> reads) {
    return reads.flatMapToPair(read -> {
        final List<Tuple2<VariantShard, GATKRead>> keyedByShard = Lists.newArrayList();
        VariantShard.getVariantShardsFromInterval(read)
                .forEach(shard -> keyedByShard.add(new Tuple2<>(shard, read)));
        return keyedByShard.iterator();
    });
}
Use of scala.Tuple2 in project gatk by broadinstitute.
Class BroadcastJoinReadsWithRefBases, method addBases.
/**
 * Pairs each read of an RDD<GATKRead> with the reference bases for that read's reference window.
 * The reference source is broadcast, and each read queries the broadcast copy for its own window.
 *
 * @param referenceDataflowSource The source of the reference sequence information
 * @param reads The reads for which to extract reference sequence information
 * @return The JavaPairRDD that contains each read along with the corresponding ReferenceBases object
 */
public static JavaPairRDD<GATKRead, ReferenceBases> addBases(final ReferenceMultiSource referenceDataflowSource, final JavaRDD<GATKRead> reads) {
    final Broadcast<ReferenceMultiSource> broadcastSource =
            new JavaSparkContext(reads.context()).broadcast(referenceDataflowSource);

    return reads.mapToPair(read -> {
        final ReferenceMultiSource source = broadcastSource.getValue();
        final SimpleInterval window = source.getReferenceWindowFunction().apply(read);
        return new Tuple2<>(read, source.getReferenceBases(null, window));
    });
}
Aggregations